diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 58a7bb97..63e0d3c6 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -6,11 +6,15 @@ class TableFormerMode(str, Enum): + """Modes for the TableFormer model.""" + FAST = "fast" ACCURATE = "accurate" class TableStructureOptions(BaseModel): + """Options for the table structure.""" + do_cell_matching: bool = ( True # True: Matches predictions back to PDF cells. Can break table output if PDF cells @@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel): class OcrOptions(BaseModel): + """OCR options.""" + kind: str lang: List[str] force_full_page_ocr: bool = False # If enabled a full page OCR is always applied @@ -30,6 +36,8 @@ class OcrOptions(BaseModel): class RapidOcrOptions(OcrOptions): + """Options for the RapidOCR engine.""" + kind: Literal["rapidocr"] = "rapidocr" # English and chinese are the most commly used models and have been tested with RapidOCR. 
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions): class EasyOcrOptions(OcrOptions): + """Options for the EasyOCR engine.""" + kind: Literal["easyocr"] = "easyocr" lang: List[str] = ["fr", "de", "es", "en"] use_gpu: bool = True # same default as easyocr.Reader @@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions): class TesseractCliOcrOptions(OcrOptions): + """Options for the TesseractCli engine.""" + kind: Literal["tesseract"] = "tesseract" lang: List[str] = ["fra", "deu", "spa", "eng"] tesseract_cmd: str = "tesseract" @@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions): class TesseractOcrOptions(OcrOptions): + """Options for the Tesseract engine.""" + kind: Literal["tesserocr"] = "tesserocr" lang: List[str] = ["fra", "deu", "spa", "eng"] path: Optional[str] = None @@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions): class OcrMacOptions(OcrOptions): + """Options for the Mac OCR engine.""" + kind: Literal["ocrmac"] = "ocrmac" lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"] recognition: str = "accurate" @@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions): class PipelineOptions(BaseModel): + """Base pipeline options.""" + create_legacy_output: bool = ( True # This defautl will be set to False on a future version of docling ) class PdfPipelineOptions(PipelineOptions): + """Options for the PDF pipeline.""" + artifacts_path: Optional[Union[Path, str]] = None do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text diff --git a/docs/api_reference/docling_document.md b/docs/api_reference/docling_document.md new file mode 100644 index 00000000..d8327894 --- /dev/null +++ b/docs/api_reference/docling_document.md @@ -0,0 +1,52 @@ +# Docling Document + +This is an automatically generated API reference of the DoclingDocument type. 
+ +::: docling_core.types.doc + handler: python + options: + members: + - DoclingDocument + - DocumentOrigin + - DocItem + - DocItemLabel + - ProvenanceItem + - GroupItem + - GroupLabel + - NodeItem + - PageItem + - FloatingItem + - TextItem + - TableItem + - TableCell + - TableData + - TableCellLabel + - KeyValueItem + - SectionHeaderItem + - PictureItem + - ImageRef + - PictureClassificationClass + - PictureClassificationData + - RefItem + - BoundingBox + - CoordOrigin + - ImageRefMode + - Size + show_if_no_docstring: true + show_submodules: true + docstring_section_style: list + filters: ["!^_"] + heading_level: 2 + show_root_toc_entry: true + inherited_members: true + merge_init_into_class: true + separate_signature: true + show_root_heading: true + show_root_full_path: false + show_signature_annotations: true + show_source: false + show_symbol_type_heading: true + show_symbol_type_toc: true + show_labels: false + signature_crossrefs: true + summary: true diff --git a/docs/api_reference/document_converter.md b/docs/api_reference/document_converter.md new file mode 100644 index 00000000..8dc85a3f --- /dev/null +++ b/docs/api_reference/document_converter.md @@ -0,0 +1,38 @@ +# Document converter + +This is an automatically generated API reference of the main components of Docling. 
+ +::: docling.document_converter + handler: python + options: + members: + - DocumentConverter + - ConversionResult + - ConversionStatus + - FormatOption + - InputFormat + - PdfFormatOption + - ImageFormatOption + - StandardPdfPipeline + - WordFormatOption + - PowerpointFormatOption + - MarkdownFormatOption + - AsciiDocFormatOption + - HTMLFormatOption + - SimplePipeline + show_if_no_docstring: true + show_submodules: true + docstring_section_style: list + filters: ["!^_"] + heading_level: 2 + inherited_members: true + merge_init_into_class: true + separate_signature: true + show_root_heading: true + show_root_full_path: false + show_signature_annotations: true + show_source: false + show_symbol_type_heading: true + show_symbol_type_toc: true + signature_crossrefs: true + summary: true diff --git a/docs/api_reference/pipeline_options.md b/docs/api_reference/pipeline_options.md new file mode 100644 index 00000000..dc3924d7 --- /dev/null +++ b/docs/api_reference/pipeline_options.md @@ -0,0 +1,36 @@ +# Pipeline options + +Pipeline options allow customizing the execution of the models during the conversion pipeline. +This includes options for the OCR engines, the table model as well as enrichment options which +can be enabled with `do_xyz = True`. + + +This is an automatically generated API reference of all the pipeline options available in Docling. 
+ + +::: docling.datamodel.pipeline_options + handler: python + options: + show_if_no_docstring: true + show_submodules: true + docstring_section_style: list + filters: ["!^_"] + heading_level: 2 + inherited_members: true + merge_init_into_class: true + separate_signature: true + show_root_heading: true + show_root_full_path: false + show_signature_annotations: true + show_source: false + show_symbol_type_heading: true + show_symbol_type_toc: true + signature_crossrefs: true + summary: true + + + diff --git a/mkdocs.yml b/mkdocs.yml index 03424b66..731e2ee6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,8 +95,10 @@ nav: - "Prodigy": integrations/prodigy.md - "spaCy": integrations/spacy.md # - "LangChain 🦜🔗": integrations/langchain.md - # - API reference: - # - API reference: api_reference/index.md + - API reference: + - Document Converter: api_reference/document_converter.md + - Pipeline options: api_reference/pipeline_options.md + - Docling Document: api_reference/docling_document.md markdown_extensions: - pymdownx.superfences @@ -112,12 +114,15 @@ markdown_extensions: plugins: - search - mkdocs-jupyter - # - mkdocstrings: - # default_handler: python - # options: - # preload_modules: - # - docling - # - docling_core + - mkdocstrings: + default_handler: python + options: + extensions: + - griffe_pydantic: + schema: true + preload_modules: + - docling + - docling_core extra_css: - stylesheets/extra.css diff --git a/poetry.lock b/poetry.lock index 9221eb2d..9e057ed0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1365,6 +1365,34 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", 
"pytest-mock", "pytest-sugar", "typing-extensions"] +[[package]] +name = "griffe" +version = "1.5.1" +description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." +optional = false +python-versions = ">=3.9" +files = [ + {file = "griffe-1.5.1-py3-none-any.whl", hash = "sha256:ad6a7980f8c424c9102160aafa3bcdf799df0e75f7829d75af9ee5aef656f860"}, + {file = "griffe-1.5.1.tar.gz", hash = "sha256:72964f93e08c553257706d6cd2c42d1c172213feb48b2be386f243380b405d4b"}, +] + +[package.dependencies] +colorama = ">=0.4" + +[[package]] +name = "griffe-pydantic" +version = "1.1.0" +description = "Griffe extension for Pydantic." +optional = false +python-versions = ">=3.9" +files = [ + {file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"}, + {file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"}, +] + +[package.dependencies] +griffe = ">=0.49" + [[package]] name = "grpcio" version = "1.67.1" @@ -2640,6 +2668,22 @@ watchdog = ">=2.0" i18n = ["babel (>=2.9.0)"] min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"] +[[package]] +name = "mkdocs-autorefs" +version = "1.2.0" +description = "Automatically link across pages in MkDocs." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"}, + {file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"}, +] + +[package.dependencies] +Markdown = ">=3.3" +markupsafe = ">=2.0.1" +mkdocs = ">=1.1" + [[package]] name = "mkdocs-click" version = "0.8.1" @@ -2731,6 +2775,51 @@ files = [ {file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"}, ] +[[package]] +name = "mkdocstrings" +version = "0.27.0" +description = "Automatic documentation from sources, for MkDocs." +optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocstrings-0.27.0-py3-none-any.whl", hash = "sha256:6ceaa7ea830770959b55a16203ac63da24badd71325b96af950e59fd37366332"}, + {file = "mkdocstrings-0.27.0.tar.gz", hash = "sha256:16adca6d6b0a1f9e0c07ff0b02ced8e16f228a9d65a37c063ec4c14d7b76a657"}, +] + +[package.dependencies] +click = ">=7.0" +importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""} +Jinja2 = ">=2.11.1" +Markdown = ">=3.6" +MarkupSafe = ">=1.1" +mkdocs = ">=1.4" +mkdocs-autorefs = ">=1.2" +mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} +platformdirs = ">=2.2" +pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} + +[package.extras] +crystal = ["mkdocstrings-crystal (>=0.3.4)"] +python = ["mkdocstrings-python (>=0.5.2)"] +python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] + +[[package]] +name = "mkdocstrings-python" +version = "1.12.2" +description = "A Python handler for mkdocstrings." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocstrings_python-1.12.2-py3-none-any.whl", hash = "sha256:7f7d40d6db3cb1f5d19dbcd80e3efe4d0ba32b073272c0c0de9de2e604eda62a"}, + {file = "mkdocstrings_python-1.12.2.tar.gz", hash = "sha256:7a1760941c0b52a2cd87b960a9e21112ffe52e7df9d0b9583d04d47ed2e186f3"}, +] + +[package.dependencies] +griffe = ">=0.49" +mkdocs-autorefs = ">=1.2" +mkdocstrings = ">=0.26" + [[package]] name = "more-itertools" version = "10.5.0" @@ -3676,9 +3765,9 @@ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] [[package]] @@ -3702,9 +3791,9 @@ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and 
platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] [[package]] @@ -7557,4 +7646,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "3be886856c0f11033cfb7cb8bc30e5d59d7bb9804df9da9572b3cfbc2f6c3c56" +content-hash = "2e7c27ffe32d556a66cc1008a7147a90c17f63b01d2a6cde3e7b941ba7e268d7" diff --git a/pyproject.toml b/pyproject.toml index 81cffb10..4d4ceac9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114" mkdocs-material = "^9.5.40" mkdocs-jupyter = "^0.25.0" mkdocs-click = "^0.8.1" +mkdocstrings = {extras = ["python"], version = "^0.27.0"} +griffe-pydantic = "^1.1.0" [tool.poetry.group.examples.dependencies] datasets = "^2.21.0"