Skip to content

Commit

Permalink
docs: add automatic api reference (#475)
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Dec 2, 2024
1 parent 8ccb3c6 commit d487210
Show file tree
Hide file tree
Showing 7 changed files with 253 additions and 11 deletions.
20 changes: 20 additions & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@


class TableFormerMode(str, Enum):
"""Modes for the TableFormer model."""

FAST = "fast"
ACCURATE = "accurate"


class TableStructureOptions(BaseModel):
"""Options for the table structure."""

do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
Expand All @@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):


class OcrOptions(BaseModel):
"""OCR options."""

kind: str
lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
Expand All @@ -30,6 +36,8 @@ class OcrOptions(BaseModel):


class RapidOcrOptions(OcrOptions):
"""Options for the RapidOCR engine."""

kind: Literal["rapidocr"] = "rapidocr"

# English and Chinese are the most commonly used models and have been tested with RapidOCR.
Expand Down Expand Up @@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):


class EasyOcrOptions(OcrOptions):
"""Options for the EasyOCR engine."""

kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader
Expand All @@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):


class TesseractCliOcrOptions(OcrOptions):
"""Options for the TesseractCli engine."""

kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
Expand All @@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):


class TesseractOcrOptions(OcrOptions):
"""Options for the Tesseract engine."""

kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
Expand All @@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):


class OcrMacOptions(OcrOptions):
"""Options for the Mac OCR engine."""

kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate"
Expand All @@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):


class PipelineOptions(BaseModel):
"""Base pipeline options."""

create_legacy_output: bool = (
True # This default will be set to False in a future version of docling
)


class PdfPipelineOptions(PipelineOptions):
"""Options for the PDF pipeline."""

artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
Expand Down
52 changes: 52 additions & 0 deletions docs/api_reference/docling_document.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Docling Document

This is an automatically generated API reference of the DoclingDocument type.

::: docling_core.types.doc
handler: python
options:
members:
- DoclingDocument
- DocumentOrigin
- DocItem
- DocItemLabel
- ProvenanceItem
- GroupItem
- GroupLabel
- NodeItem
- PageItem
- FloatingItem
- TextItem
- TableItem
- TableCell
- TableData
- TableCellLabel
- KeyValueItem
- SectionHeaderItem
- PictureItem
- ImageRef
- PictureClassificationClass
- PictureClassificationData
- RefItem
- BoundingBox
- CoordOrigin
- ImageRefMode
- Size
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
show_root_toc_entry: true
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
show_labels: false
signature_crossrefs: true
summary: true
38 changes: 38 additions & 0 deletions docs/api_reference/document_converter.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Document converter

This is an automatically generated API reference of the main components of Docling.

::: docling.document_converter
handler: python
options:
members:
- DocumentConverter
- ConversionResult
- ConversionStatus
- FormatOption
- InputFormat
- PdfFormatOption
- ImageFormatOption
- StandardPdfPipeline
- WordFormatOption
- PowerpointFormatOption
- MarkdownFormatOption
- AsciiDocFormatOption
- HTMLFormatOption
- SimplePipeline
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
signature_crossrefs: true
summary: true
36 changes: 36 additions & 0 deletions docs/api_reference/pipeline_options.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Pipeline options

Pipeline options allow you to customize the execution of the models during the conversion pipeline.
This includes options for the OCR engines, the table model as well as enrichment options which
can be enabled with `do_xyz = True`.


This is an automatically generated API reference of all the pipeline options available in Docling.


::: docling.datamodel.pipeline_options
handler: python
options:
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
signature_crossrefs: true
summary: true

<!-- ::: docling.document_converter.DocumentConverter
handler: python
options:
show_if_no_docstring: true
show_submodules: true -->
21 changes: 13 additions & 8 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,10 @@ nav:
- "Prodigy": integrations/prodigy.md
- "spaCy": integrations/spacy.md
# - "LangChain 🦜🔗": integrations/langchain.md
# - API reference:
# - API reference: api_reference/index.md
- API reference:
- Document Converter: api_reference/document_converter.md
- Pipeline options: api_reference/pipeline_options.md
- Docling Document: api_reference/docling_document.md

markdown_extensions:
- pymdownx.superfences
Expand All @@ -112,12 +114,15 @@ markdown_extensions:
plugins:
- search
- mkdocs-jupyter
# - mkdocstrings:
# default_handler: python
# options:
# preload_modules:
# - docling
# - docling_core
- mkdocstrings:
default_handler: python
options:
extensions:
- griffe_pydantic:
schema: true
preload_modules:
- docling
- docling_core

extra_css:
- stylesheets/extra.css
95 changes: 92 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
griffe-pydantic = "^1.1.0"

[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
Expand Down

0 comments on commit d487210

Please sign in to comment.