diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..ea3a8cfb --- /dev/null +++ b/404.html @@ -0,0 +1,1538 @@ + + + +
+ + + + + + + + + + + + + + +This is an automatically generated API reference of the DoclingDocument type.
+ + +
doc
+
+
+Package for models defined by the Document type.
+ + + + + + + +Classes:
+DoclingDocument
+ –
+ DoclingDocument.
+DocumentOrigin
+ –
+ FileSource.
+DocItem
+ –
+ DocItem.
+DocItemLabel
+ –
+ DocItemLabel.
+ProvenanceItem
+ –
+ ProvenanceItem.
+GroupItem
+ –
+ GroupItem.
+GroupLabel
+ –
+ GroupLabel.
+NodeItem
+ –
+ NodeItem.
+PageItem
+ –
+ PageItem.
+FloatingItem
+ –
+ FloatingItem.
+TextItem
+ –
+ TextItem.
+TableItem
+ –
+ TableItem.
+TableCell
+ –
+ TableCell.
+TableData
+ –
+ BaseTableData.
+TableCellLabel
+ –
+ TableCellLabel.
+KeyValueItem
+ –
+ KeyValueItem.
+SectionHeaderItem
+ –
+ SectionItem.
+PictureItem
+ –
+ PictureItem.
+ImageRef
+ –
+ ImageRef.
+PictureClassificationClass
+ –
+ PictureClassificationClass.
+PictureClassificationData
+ –
+ PictureClassificationData.
+RefItem
+ –
+ RefItem.
+BoundingBox
+ –
+ BoundingBox.
+CoordOrigin
+ –
+ CoordOrigin.
+ImageRefMode
+ –
+ ImageRefMode.
+Size
+ –
+ Size.
+
DoclingDocument
+
+
+
+ Bases: BaseModel
DoclingDocument.
+ + + + + + + + + +Methods:
+add_group
+ –
+ add_group.
+add_heading
+ –
+ add_heading.
+add_list_item
+ –
+ add_list_item.
+add_page
+ –
+ add_page.
+add_picture
+ –
+ add_picture.
+add_table
+ –
+ add_table.
+add_text
+ –
+ add_text.
+add_title
+ –
+ add_title.
+check_version_is_compatible
+ –
+ Check if this document version is compatible with current version.
+export_to_dict
+ –
+ Export to dict.
+export_to_document_tokens
+ –
+ Exports the document content to a DocumentToken format.
+export_to_element_tree
+ –
+ Export_to_element_tree.
+export_to_html
+ –
+ Serialize to HTML.
+export_to_markdown
+ –
+ Serialize to Markdown.
+export_to_text
+ –
+ export_to_text.
+iterate_items
+ –
+ iterate_elements.
+load_from_json
+ –
+ load_from_json.
+num_pages
+ –
+ num_pages.
+print_element_tree
+ –
+ Print_element_tree.
+save_as_document_tokens
+ –
+ Save the document content to a DocumentToken format.
+save_as_html
+ –
+ Save to HTML.
+save_as_json
+ –
+ Save as json.
+save_as_markdown
+ –
+ Save to markdown.
+save_as_yaml
+ –
+ Save as yaml.
+validate_document
+ –
+ validate_document.
+validate_tree
+ –
+ validate_tree.
+Attributes:
+body
+ (GroupItem
)
+ –
+ furniture
+ (GroupItem
)
+ –
+ groups
+ (List[GroupItem]
)
+ –
+ key_value_items
+ (List[KeyValueItem]
)
+ –
+ name
+ (str
)
+ –
+ origin
+ (Optional[DocumentOrigin]
)
+ –
+ pages
+ (Dict[int, PageItem]
)
+ –
+ pictures
+ (List[PictureItem]
)
+ –
+ schema_name
+ (Literal['DoclingDocument']
)
+ –
+ tables
+ (List[TableItem]
)
+ –
+ texts
+ (List[Union[SectionHeaderItem, ListItem, TextItem]]
)
+ –
+ version
+ (Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)]
)
+ –
+
body
+
+
+
furniture
+
+
+
name
+
+
+name: str
+
schema_name
+
+
+schema_name: Literal['DoclingDocument'] = 'DoclingDocument'
+
texts
+
+
+texts: List[
+ Union[SectionHeaderItem, ListItem, TextItem]
+] = []
+
version
+
+
+version: Annotated[
+ str,
+ StringConstraints(pattern=VERSION_PATTERN, strict=True),
+] = CURRENT_VERSION
+
add_group
+
+
+add_group(
+ label: Optional[GroupLabel] = None,
+ name: Optional[str] = None,
+ parent: Optional[GroupItem] = None,
+) -> GroupItem
+
add_group.
+:param label: Optional[GroupLabel]: (Default value = None) +:param name: Optional[str]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
add_heading
+
+
+add_heading(
+ text: str,
+ orig: Optional[str] = None,
+ level: LevelNumber = 1,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[GroupItem] = None,
+)
+
add_heading.
+:param text: str: +:param orig: Optional[str]: (Default value = None) +:param level: LevelNumber: (Default value = 1) +:param prov: Optional[ProvenanceItem]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
add_list_item
+
+
+add_list_item(
+ text: str,
+ enumerated: bool = False,
+ marker: Optional[str] = None,
+ orig: Optional[str] = None,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[GroupItem] = None,
+)
+
add_list_item.
+:param text: str: +:param enumerated: bool: (Default value = False) +:param marker: Optional[str]: (Default value = None) +:param orig: Optional[str]: (Default value = None) +:param prov: Optional[ProvenanceItem]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
add_page
+
+
+add_page.
+:param page_no: int: +:param size: Size:
+ +
add_picture
+
+
+add_picture(
+ annotations: List[PictureDataType] = [],
+ image: Optional[ImageRef] = None,
+ caption: Optional[Union[TextItem, RefItem]] = None,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[GroupItem] = None,
+)
+
add_picture.
+:param annotations: List[PictureDataType]: (Default value = []) +:param image: Optional[ImageRef]: (Default value = None) +:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) +:param prov: Optional[ProvenanceItem]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
add_table
+
+
+add_table(
+ data: TableData,
+ caption: Optional[Union[TextItem, RefItem]] = None,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[GroupItem] = None,
+)
+
add_table.
+:param data: TableData: +:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None) Marked in the source as "not cool yet". +:param prov: Optional[ProvenanceItem]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
add_text
+
+
+add_text(
+ label: DocItemLabel,
+ text: str,
+ orig: Optional[str] = None,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[GroupItem] = None,
+)
+
add_text.
+:param label: str: +:param text: str: +:param orig: Optional[str]: (Default value = None) +:param prov: Optional[ProvenanceItem]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
add_title
+
+
+add_title(
+ text: str,
+ orig: Optional[str] = None,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[GroupItem] = None,
+)
+
add_title.
+:param text: str: +:param orig: Optional[str]: (Default value = None) +:param prov: Optional[ProvenanceItem]: (Default value = None) +:param parent: Optional[GroupItem]: (Default value = None)
+ +
check_version_is_compatible
+
+
+check_version_is_compatible(v: str) -> str
+
Check if this document version is compatible with current version.
+ +
export_to_dict
+
+
+export_to_dict(
+ mode: str = "json",
+ by_alias: bool = True,
+ exclude_none: bool = True,
+) -> Dict
+
Export to dict.
+ +
export_to_document_tokens
+
+
+export_to_document_tokens(
+ delim: str = "\n",
+ from_element: int = 0,
+ to_element: int = maxsize,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_content: bool = True,
+ add_page_index: bool = True,
+ add_table_cell_location: bool = False,
+ add_table_cell_label: bool = True,
+ add_table_cell_text: bool = True,
+ page_no: Optional[int] = None,
+ with_groups: bool = True,
+ newline: bool = True,
+) -> str
+
Exports the document content to a DocumentToken format.
+Operates on a slice of the document's body as defined through arguments +from_element and to_element; defaulting to the whole document.
+:param delim: str: (Default value = "\n") +:param from_element: int: (Default value = 0) +:param to_element: int: (Default value = sys.maxsize) +:param labels: set[DocItemLabel]: (Default value = DEFAULT_EXPORT_LABELS) +:param xsize: int: (Default value = 100) +:param ysize: int: (Default value = 100) +:param add_location: bool: (Default value = True) +:param add_content: bool: (Default value = True) +:param add_page_index: bool: (Default value = True) +:param add_table_cell_location: bool: Table-specific flag. (Default value = False) +:param add_table_cell_label: bool: Table-specific flag. (Default value = True) +:param add_table_cell_text: bool: Table-specific flag. (Default value = True) +:returns: The content of the document formatted as a DocTags string. +:rtype: str
+ +
export_to_element_tree
+
+
+export_to_element_tree() -> str
+
Export_to_element_tree.
+ +
export_to_html
+
+
+export_to_html(
+ from_element: int = 0,
+ to_element: int = maxsize,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+ image_mode: ImageRefMode = PLACEHOLDER,
+ page_no: Optional[int] = None,
+ html_lang: str = "en",
+ html_head: str = _HTML_DEFAULT_HEAD,
+) -> str
+
Serialize to HTML.
+ +
export_to_markdown
+
+
+export_to_markdown(
+ delim: str = "\n",
+ from_element: int = 0,
+ to_element: int = maxsize,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+ strict_text: bool = False,
+ image_placeholder: str = "<!-- image -->",
+ image_mode: ImageRefMode = PLACEHOLDER,
+ indent: int = 4,
+ text_width: int = -1,
+ page_no: Optional[int] = None,
+) -> str
+
Serialize to Markdown.
+Operates on a slice of the document's body as defined through arguments +from_element and to_element; defaulting to the whole document.
+:param delim: Delimiter to use when concatenating the various + Markdown parts. (Default value = "\n"). +:type delim: str = "\n" +:param from_element: Body slicing start index (inclusive). + (Default value = 0). +:type from_element: int = 0 +:param to_element: Body slicing stop index + (exclusive). (Default value = sys.maxsize). +:type to_element: int = sys.maxsize +:param labels: The set of document labels to include in the export. +:type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS +:param strict_text: Whether to only include the text content + of the document. (Default value = False). +:type strict_text: bool = False +:param image_placeholder: The placeholder to include to position + images in the markdown. (Default value = "<!-- image -->"). +:type image_placeholder: str = "<!-- image -->" +:param image_mode: The mode to use for including images in the + markdown. (Default value = ImageRefMode.PLACEHOLDER). +:type image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER +:param indent: The indent in spaces of the nested lists. + (Default value = 4). +:type indent: int = 4 +:returns: The exported Markdown representation. +:rtype: str
+ +
export_to_text
+
+
+export_to_text(
+ delim: str = "\n\n",
+ from_element: int = 0,
+ to_element: int = 1000000,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+) -> str
+
export_to_text.
+ +
iterate_items
+
+
+iterate_items(
+ root: Optional[NodeItem] = None,
+ with_groups: bool = False,
+ traverse_pictures: bool = True,
+ page_no: Optional[int] = None,
+ _level: int = 0,
+) -> Iterable[Tuple[NodeItem, int]]
+
iterate_elements.
+:param root: Optional[NodeItem]: (Default value = None) +:param with_groups: bool: (Default value = False) +:param traverse_pictures: bool: (Default value = True) +:param page_no: Optional[int]: (Default value = None) +:param _level: int: Fixed parameter; carries through the node nesting level. (Default value = 0)
+ +
load_from_json
+
+
+load_from_json(filename: Path) -> DoclingDocument
+
load_from_json.
+:param filename: The filename to load a saved DoclingDocument from a .json. +:type filename: Path
+:returns: The loaded DoclingDocument. +:rtype: DoclingDocument
+ +
num_pages
+
+
+num_pages()
+
num_pages.
+ +
print_element_tree
+
+
+print_element_tree()
+
Print_element_tree.
+ +
save_as_document_tokens
+
+
+save_as_document_tokens(
+ filename: Path,
+ delim: str = "\n\n",
+ from_element: int = 0,
+ to_element: int = maxsize,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_content: bool = True,
+ add_page_index: bool = True,
+ add_table_cell_location: bool = False,
+ add_table_cell_label: bool = True,
+ add_table_cell_text: bool = True,
+ page_no: Optional[int] = None,
+ with_groups: bool = True,
+)
+
Save the document content to a DocumentToken format.
+ +
save_as_html
+
+
+save_as_html(
+ filename: Path,
+ artifacts_dir: Optional[Path] = None,
+ from_element: int = 0,
+ to_element: int = maxsize,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+ image_mode: ImageRefMode = PLACEHOLDER,
+ page_no: Optional[int] = None,
+ html_lang: str = "en",
+ html_head: str = _HTML_DEFAULT_HEAD,
+)
+
Save to HTML.
+ +
save_as_json
+
+
+save_as_json(
+ filename: Path,
+ artifacts_dir: Optional[Path] = None,
+ image_mode: ImageRefMode = EMBEDDED,
+ indent: int = 2,
+)
+
Save as json.
+ +
save_as_markdown
+
+
+save_as_markdown(
+ filename: Path,
+ artifacts_dir: Optional[Path] = None,
+ delim: str = "\n",
+ from_element: int = 0,
+ to_element: int = maxsize,
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+ strict_text: bool = False,
+ image_placeholder: str = "<!-- image -->",
+ image_mode: ImageRefMode = PLACEHOLDER,
+ indent: int = 4,
+ text_width: int = -1,
+ page_no: Optional[int] = None,
+)
+
Save to markdown.
+ +
save_as_yaml
+
+
+save_as_yaml(
+ filename: Path,
+ artifacts_dir: Optional[Path] = None,
+ image_mode: ImageRefMode = EMBEDDED,
+ default_flow_style: bool = False,
+)
+
Save as yaml.
+ +
validate_document
+
+
+validate_document(d: DoclingDocument)
+
validate_document.
+ +
validate_tree
+
+
+validate_tree(root) -> bool
+
validate_tree.
+ +
DocumentOrigin
+
+
+
+ Bases: BaseModel
FileSource.
+ + + + + + + + + +Methods:
+parse_hex_string
+ –
+ parse_hex_string.
+validate_mimetype
+ –
+ validate_mimetype.
+Attributes:
+binary_hash
+ (Uint64
)
+ –
+ filename
+ (str
)
+ –
+ mimetype
+ (str
)
+ –
+ uri
+ (Optional[AnyUrl]
)
+ –
+
binary_hash
+
+
+binary_hash: Uint64
+
filename
+
+
+filename: str
+
mimetype
+
+
+mimetype: str
+
uri
+
+
+uri: Optional[AnyUrl] = None
+
parse_hex_string
+
+
+parse_hex_string(value)
+
parse_hex_string.
+ +
validate_mimetype
+
+
+validate_mimetype(v)
+
validate_mimetype.
+ +
DocItem
+
+
+
+ Bases: NodeItem
DocItem.
+ + + + + + + + + +Methods:
+get_image
+ –
+ Returns the image of this DocItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+children
+ (List[RefItem]
)
+ –
+ label
+ (DocItemLabel
)
+ –
+ model_config
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image of this DocItem.
+The function returns None if this DocItem has no valid provenance or +if a valid image of the page containing this DocItem is not available +in doc.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
DocItemLabel
+
+
+
+ Bases: str
, Enum
DocItemLabel.
+ + + + + + + + + + + +Attributes:
+CAPTION
+ –
+ CHECKBOX_SELECTED
+ –
+ CHECKBOX_UNSELECTED
+ –
+ CODE
+ –
+ DOCUMENT_INDEX
+ –
+ FOOTNOTE
+ –
+ FORM
+ –
+ FORMULA
+ –
+ KEY_VALUE_REGION
+ –
+ LIST_ITEM
+ –
+ PAGE_FOOTER
+ –
+ PAGE_HEADER
+ –
+ PARAGRAPH
+ –
+ PICTURE
+ –
+ REFERENCE
+ –
+ SECTION_HEADER
+ –
+ TABLE
+ –
+ TEXT
+ –
+ TITLE
+ –
+
CAPTION
+
+
+CAPTION = 'caption'
+
CHECKBOX_SELECTED
+
+
+CHECKBOX_SELECTED = 'checkbox_selected'
+
CHECKBOX_UNSELECTED
+
+
+CHECKBOX_UNSELECTED = 'checkbox_unselected'
+
CODE
+
+
+CODE = 'code'
+
DOCUMENT_INDEX
+
+
+DOCUMENT_INDEX = 'document_index'
+
FOOTNOTE
+
+
+FOOTNOTE = 'footnote'
+
FORM
+
+
+FORM = 'form'
+
FORMULA
+
+
+FORMULA = 'formula'
+
KEY_VALUE_REGION
+
+
+KEY_VALUE_REGION = 'key_value_region'
+
LIST_ITEM
+
+
+LIST_ITEM = 'list_item'
+
PAGE_FOOTER
+
+
+PAGE_FOOTER = 'page_footer'
+
PAGE_HEADER
+
+
+PAGE_HEADER = 'page_header'
+
PARAGRAPH
+
+
+PARAGRAPH = 'paragraph'
+
PICTURE
+
+
+PICTURE = 'picture'
+
REFERENCE
+
+
+REFERENCE = 'reference'
+
SECTION_HEADER
+
+
+SECTION_HEADER = 'section_header'
+
TABLE
+
+
+TABLE = 'table'
+
TEXT
+
+
+TEXT = 'text'
+
TITLE
+
+
+TITLE = 'title'
+
ProvenanceItem
+
+
+
+ Bases: BaseModel
ProvenanceItem.
+ + + + + + + + + + + +Attributes:
+bbox
+ (BoundingBox
)
+ –
+ charspan
+ (Tuple[int, int]
)
+ –
+ page_no
+ (int
)
+ –
+
charspan
+
+
+charspan: Tuple[int, int]
+
page_no
+
+
+page_no: int
+
GroupItem
+
+
+
+ Bases: NodeItem
GroupItem.
+ + + + + + + + + +Methods:
+get_ref
+ –
+ get_ref.
+Attributes:
+children
+ (List[RefItem]
)
+ –
+ label
+ (GroupLabel
)
+ –
+ model_config
+ –
+ name
+ (str
)
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
name
+
+
+name: str = 'group'
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
get_ref
+
+
+get_ref()
+
get_ref.
+ +
GroupLabel
+
+
+
+ Bases: str
, Enum
GroupLabel.
+ + + + + + + + + + + +Attributes:
+CHAPTER
+ –
+ LIST
+ –
+ ORDERED_LIST
+ –
+ SECTION
+ –
+ SHEET
+ –
+ SLIDE
+ –
+ UNSPECIFIED
+ –
+
CHAPTER
+
+
+CHAPTER = 'chapter'
+
LIST
+
+
+LIST = 'list'
+
ORDERED_LIST
+
+
+ORDERED_LIST = 'ordered_list'
+
SECTION
+
+
+SECTION = 'section'
+
SHEET
+
+
+SHEET = 'sheet'
+
SLIDE
+
+
+SLIDE = 'slide'
+
UNSPECIFIED
+
+
+UNSPECIFIED = 'unspecified'
+
NodeItem
+
+
+
+ Bases: BaseModel
NodeItem.
+ + + + + + + + + +Methods:
+get_ref
+ –
+ get_ref.
+Attributes:
+children
+ (List[RefItem]
)
+ –
+ model_config
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
get_ref
+
+
+get_ref()
+
get_ref.
+ +
PageItem
+
+
+
+ Bases: BaseModel
PageItem.
+ + + + + + + + + + + +Attributes:
+ + + + + + + +
FloatingItem
+
+
+
+ Bases: DocItem
FloatingItem.
+ + + + + + + + + +Methods:
+caption_text
+ –
+ Computes the caption as a single text.
+get_image
+ –
+ Returns the image corresponding to this FloatingItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+captions
+ (List[RefItem]
)
+ –
+ children
+ (List[RefItem]
)
+ –
+ footnotes
+ (List[RefItem]
)
+ –
+ image
+ (Optional[ImageRef]
)
+ –
+ label
+ (DocItemLabel
)
+ –
+ model_config
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ references
+ (List[RefItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
caption_text
+
+
+caption_text(doc: DoclingDocument) -> str
+
Computes the caption as a single text.
+ +
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image corresponding to this FloatingItem.
+This function returns the PIL image from self.image if one is available. +Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
+In particular, when self.image is None, the function returns None if this +FloatingItem has no valid provenance or the doc does not contain a valid image +for the required page.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
TextItem
+
+
+
+ Bases: DocItem
TextItem.
+ + + + + + + + + +Methods:
+export_to_document_tokens
+ –
+ Export text element to document tokens format.
+get_image
+ –
+ Returns the image of this DocItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+children
+ (List[RefItem]
)
+ –
+ label
+ (DocItemLabel
)
+ –
+ model_config
+ –
+ orig
+ (str
)
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ self_ref
+ (str
)
+ –
+ text
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
orig
+
+
+orig: str
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
text
+
+
+text: str
+
export_to_document_tokens
+
+
+export_to_document_tokens(
+ doc: DoclingDocument,
+ new_line: str = "\n",
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_content: bool = True,
+ add_page_index: bool = True,
+)
+
Export text element to document tokens format.
+:param doc: "DoclingDocument": +:param new_line: str: (Default value = "\n") +:param xsize: int: (Default value = 100) +:param ysize: int: (Default value = 100) +:param add_location: bool: (Default value = True) +:param add_content: bool: (Default value = True) +:param add_page_index: bool: (Default value = True)
+ +
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image of this DocItem.
+The function returns None if this DocItem has no valid provenance or +if a valid image of the page containing this DocItem is not available +in doc.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
TableItem
+
+
+
+ Bases: FloatingItem
TableItem.
+ + + + + + + + + +Methods:
+caption_text
+ –
+ Computes the caption as a single text.
+export_to_dataframe
+ –
+ Export the table as a Pandas DataFrame.
+export_to_document_tokens
+ –
+ Export table to document tokens format.
+export_to_html
+ –
+ Export the table as html.
+export_to_markdown
+ –
+ Export the table as markdown.
+get_image
+ –
+ Returns the image corresponding to this FloatingItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+captions
+ (List[RefItem]
)
+ –
+ children
+ (List[RefItem]
)
+ –
+ data
+ (TableData
)
+ –
+ footnotes
+ (List[RefItem]
)
+ –
+ image
+ (Optional[ImageRef]
)
+ –
+ label
+ (Literal[TABLE]
)
+ –
+ model_config
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ references
+ (List[RefItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
caption_text
+
+
+caption_text(doc: DoclingDocument) -> str
+
Computes the caption as a single text.
+ +
export_to_dataframe
+
+
+export_to_dataframe() -> DataFrame
+
Export the table as a Pandas DataFrame.
+ +
export_to_document_tokens
+
+
+export_to_document_tokens(
+ doc: DoclingDocument,
+ new_line: str = "\n",
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_caption: bool = True,
+ add_content: bool = True,
+ add_cell_location: bool = True,
+ add_cell_label: bool = True,
+ add_cell_text: bool = True,
+ add_page_index: bool = True,
+)
+
Export table to document tokens format.
+:param doc: "DoclingDocument": +:param new_line: str: (Default value = "\n") +:param xsize: int: (Default value = 100) +:param ysize: int: (Default value = 100) +:param add_location: bool: (Default value = True) +:param add_caption: bool: (Default value = True) +:param add_content: bool: (Default value = True) +:param add_cell_location: bool: (Default value = True) +:param add_cell_label: bool: (Default value = True) +:param add_cell_text: bool: (Default value = True) +:param add_page_index: bool: (Default value = True)
+ +
export_to_html
+
+
+export_to_html(
+ doc: Optional[DoclingDocument] = None,
+ add_caption: bool = True,
+) -> str
+
Export the table as html.
+ +
export_to_markdown
+
+
+export_to_markdown() -> str
+
Export the table as markdown.
+ +
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image corresponding to this FloatingItem.
+This function returns the PIL image from self.image if one is available. +Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
+In particular, when self.image is None, the function returns None if this +FloatingItem has no valid provenance or the doc does not contain a valid image +for the required page.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
TableCell
+
+
+
+ Bases: BaseModel
TableCell.
+ + + + + + + + + +Methods:
+from_dict_format
+ –
+ from_dict_format.
+Attributes:
+bbox
+ (Optional[BoundingBox]
)
+ –
+ col_span
+ (int
)
+ –
+ column_header
+ (bool
)
+ –
+ end_col_offset_idx
+ (int
)
+ –
+ end_row_offset_idx
+ (int
)
+ –
+ row_header
+ (bool
)
+ –
+ row_section
+ (bool
)
+ –
+ row_span
+ (int
)
+ –
+ start_col_offset_idx
+ (int
)
+ –
+ start_row_offset_idx
+ (int
)
+ –
+ text
+ (str
)
+ –
+
col_span
+
+
+col_span: int = 1
+
column_header
+
+
+column_header: bool = False
+
end_col_offset_idx
+
+
+end_col_offset_idx: int
+
end_row_offset_idx
+
+
+end_row_offset_idx: int
+
row_header
+
+
+row_header: bool = False
+
row_section
+
+
+row_section: bool = False
+
row_span
+
+
+row_span: int = 1
+
start_col_offset_idx
+
+
+start_col_offset_idx: int
+
start_row_offset_idx
+
+
+start_row_offset_idx: int
+
text
+
+
+text: str
+
from_dict_format
+
+
+from_dict_format(data: Any) -> Any
+
from_dict_format.
+ +
TableData
+
+
+
+ Bases: BaseModel
BaseTableData.
+ + + + + + + + + + + +Attributes:
+ + + + + + + +
TableCellLabel
+
+
+
+ Bases: str
, Enum
TableCellLabel.
+ + + + + + + + + + + +Attributes:
+BODY
+ –
+ COLUMN_HEADER
+ –
+ ROW_HEADER
+ –
+ ROW_SECTION
+ –
+
BODY
+
+
+BODY = 'body'
+
COLUMN_HEADER
+
+
+COLUMN_HEADER = 'col_header'
+
ROW_HEADER
+
+
+ROW_HEADER = 'row_header'
+
ROW_SECTION
+
+
+ROW_SECTION = 'row_section'
+
KeyValueItem
+
+
+
+ Bases: DocItem
KeyValueItem.
+ + + + + + + + + +Methods:
+get_image
+ –
+ Returns the image of this DocItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+children
+ (List[RefItem]
)
+ –
+ label
+ (DocItemLabel
)
+ –
+ model_config
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image of this DocItem.
+The function returns None if this DocItem has no valid provenance or +if a valid image of the page containing this DocItem is not available +in doc.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
SectionHeaderItem
+
+
+
+ Bases: TextItem
SectionItem.
+ + + + + + + + + +Methods:
+export_to_document_tokens
+ –
+ Export text element to document tokens format.
+get_image
+ –
+ Returns the image of this DocItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+children
+ (List[RefItem]
)
+ –
+ label
+ (Literal[SECTION_HEADER]
)
+ –
+ level
+ (LevelNumber
)
+ –
+ model_config
+ –
+ orig
+ (str
)
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ self_ref
+ (str
)
+ –
+ text
+ (str
)
+ –
+
level
+
+
+level: LevelNumber
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
orig
+
+
+orig: str
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
text
+
+
+text: str
+
export_to_document_tokens
+
+
+export_to_document_tokens(
+ doc: DoclingDocument,
+ new_line: str = "\n",
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_content: bool = True,
+ add_page_index: bool = True,
+)
+
Export text element to document tokens format.
+:param doc: "DoclingDocument": +:param new_line: str: (Default value = "\n") +:param xsize: int: (Default value = 100) +:param ysize: int: (Default value = 100) +:param add_location: bool: (Default value = True) +:param add_content: bool: (Default value = True) +:param add_page_index: bool: (Default value = True)
+ +
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image of this DocItem.
+The function returns None if this DocItem has no valid provenance or +if a valid image of the page containing this DocItem is not available +in doc.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
PictureItem
+
+
+
+ Bases: FloatingItem
PictureItem.
+ + + + + + + + + +Methods:
+caption_text
+ –
+ Computes the caption as a single text.
+export_to_document_tokens
+ –
+ Export picture to document tokens format.
+export_to_html
+ –
+ Export picture to HTML format.
+export_to_markdown
+ –
+ Export picture to Markdown format.
+get_image
+ –
+ Returns the image corresponding to this FloatingItem.
+get_location_tokens
+ –
+ Get the location string for the BaseCell.
+get_ref
+ –
+ get_ref.
+Attributes:
+annotations
+ (List[PictureDataType]
)
+ –
+ captions
+ (List[RefItem]
)
+ –
+ children
+ (List[RefItem]
)
+ –
+ footnotes
+ (List[RefItem]
)
+ –
+ image
+ (Optional[ImageRef]
)
+ –
+ label
+ (Literal[PICTURE]
)
+ –
+ model_config
+ –
+ parent
+ (Optional[RefItem]
)
+ –
+ prov
+ (List[ProvenanceItem]
)
+ –
+ references
+ (List[RefItem]
)
+ –
+ self_ref
+ (str
)
+ –
+
annotations
+
+
+annotations: List[PictureDataType] = []
+
model_config
+
+
+model_config = ConfigDict(extra='forbid')
+
self_ref
+
+
+self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+
caption_text
+
+
+caption_text(doc: DoclingDocument) -> str
+
Computes the caption as a single text.
+ +
export_to_document_tokens
+
+
+export_to_document_tokens(
+ doc: DoclingDocument,
+ new_line: str = "\n",
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_caption: bool = True,
+ add_content: bool = True,
+ add_page_index: bool = True,
+)
+
Export picture to document tokens format.
+:param doc: "DoclingDocument": +:param new_line: str: (Default value = "\n") +:param xsize: int: (Default value = 100) +:param ysize: int: (Default value = 100) +:param add_location: bool: (Default value = True) +:param add_caption: bool: (Default value = True) +:param add_content: bool: Not used at the moment. (Default value = True) +:param add_page_index: bool: (Default value = True)
+ +
export_to_html
+
+
+export_to_html(
+ doc: DoclingDocument,
+ add_caption: bool = True,
+ image_mode: ImageRefMode = PLACEHOLDER,
+) -> str
+
Export picture to HTML format.
+ +
export_to_markdown
+
+
+export_to_markdown(
+ doc: DoclingDocument,
+ add_caption: bool = True,
+ image_mode: ImageRefMode = EMBEDDED,
+ image_placeholder: str = "<!-- image -->",
+) -> str
+
Export picture to Markdown format.
+ +
get_image
+
+
+get_image(doc: DoclingDocument) -> Optional[Image]
+
Returns the image corresponding to this FloatingItem.
+This function returns the PIL image from self.image if one is available. +Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
+In particular, when self.image is None, the function returns None if this +FloatingItem has no valid provenance or the doc does not contain a valid image +for the required page.
+ +
get_location_tokens
+
+
+get_location_tokens(
+ doc: DoclingDocument,
+ new_line: str,
+ xsize: int = 100,
+ ysize: int = 100,
+ add_page_index: bool = True,
+) -> str
+
Get the location string for the BaseCell.
+ +
get_ref
+
+
+get_ref()
+
get_ref.
+ +
ImageRef
+
+
+
+ Bases: BaseModel
ImageRef.
+ + + + + + + + + +Methods:
+from_pil
+ –
+ Construct ImageRef from a PIL Image.
+validate_mimetype
+ –
+ validate_mimetype.
+Attributes:
+dpi
+ (int
)
+ –
+ mimetype
+ (str
)
+ –
+ pil_image
+ (Optional[Image]
)
+ –
+ Return the PIL Image.
+size
+ (Size
)
+ –
+ uri
+ (Union[AnyUrl, Path]
)
+ –
+
dpi
+
+
+dpi: int
+
mimetype
+
+
+mimetype: str
+
pil_image
+
+
+pil_image: Optional[Image]
+
Return the PIL Image.
+
uri
+
+
+uri: Union[AnyUrl, Path]
+
from_pil
+
+
+from_pil(image: Image, dpi: int) -> Self
+
Construct ImageRef from a PIL Image.
+ +
validate_mimetype
+
+
+validate_mimetype(v)
+
validate_mimetype.
+ +
PictureClassificationClass
+
+
+
+ Bases: BaseModel
PictureClassificationClass.
+ + + + + + + + + + + +Attributes:
+class_name
+ (str
)
+ –
+ confidence
+ (float
)
+ –
+
class_name
+
+
+class_name: str
+
confidence
+
+
+confidence: float
+
PictureClassificationData
+
+
+
+ Bases: BasePictureData
PictureClassificationData.
+ + + + + + + + + + + +Attributes:
+kind
+ (Literal['classification']
)
+ –
+ predicted_classes
+ (List[PictureClassificationClass]
)
+ –
+ provenance
+ (str
)
+ –
+
kind
+
+
+kind: Literal['classification'] = 'classification'
+
provenance
+
+
+provenance: str
+
RefItem
+
+
+
+ Bases: BaseModel
RefItem.
+ + + + + + + + + +Methods:
+ + + + + +Attributes:
+cref
+ (str
)
+ –
+ model_config
+ –
+
cref
+
+
+cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
+
model_config
+
+
+model_config = ConfigDict(populate_by_name=True)
+
get_ref
+
+
+get_ref()
+
get_ref.
+ +
BoundingBox
+
+
+
+ Bases: BaseModel
BoundingBox.
+ + + + + + + + + +Methods:
+area
+ –
+ area.
+as_tuple
+ –
+ as_tuple.
+from_tuple
+ –
+ from_tuple.
+intersection_area_with
+ –
+ intersection_area_with.
+normalized
+ –
+ normalized.
+scaled
+ –
+ scaled.
+to_bottom_left_origin
+ –
+ to_bottom_left_origin.
+to_top_left_origin
+ –
+ to_top_left_origin.
+Attributes:
+b
+ (float
)
+ –
+ coord_origin
+ (CoordOrigin
)
+ –
+ height
+ –
+ height.
+l
+ (float
)
+ –
+ r
+ (float
)
+ –
+ t
+ (float
)
+ –
+ width
+ –
+ width.
+
b
+
+
+b: float
+
height
+
+
+height
+
height.
+
l
+
+
+l: float
+
r
+
+
+r: float
+
t
+
+
+t: float
+
width
+
+
+width
+
width.
+
area
+
+
+area() -> float
+
area.
+ +
as_tuple
+
+
+as_tuple()
+
as_tuple.
+ +
from_tuple
+
+
+from_tuple(coord: Tuple[float, ...], origin: CoordOrigin)
+
from_tuple.
+:param coord: Tuple[float, ...]: +:param origin: CoordOrigin:
+ +
intersection_area_with
+
+
+intersection_area_with(other: BoundingBox) -> float
+
intersection_area_with.
+:param other: "BoundingBox":
+ +
normalized
+
+
+normalized(page_size: Size) -> BoundingBox
+
normalized.
+:param page_size: Size:
+ +
scaled
+
+
+scaled(scale: float) -> BoundingBox
+
scaled.
+:param scale: float:
+ +
to_bottom_left_origin
+
+
+to_bottom_left_origin(page_height) -> BoundingBox
+
to_bottom_left_origin.
+:param page_height:
+ +
to_top_left_origin
+
+
+to_top_left_origin(page_height)
+
to_top_left_origin.
+:param page_height:
+ +
CoordOrigin
+
+
+
+ Bases: str
, Enum
CoordOrigin.
+ + + + + + + + + + + +Attributes:
+BOTTOMLEFT
+ –
+ TOPLEFT
+ –
+
BOTTOMLEFT
+
+
+BOTTOMLEFT = 'BOTTOMLEFT'
+
TOPLEFT
+
+
+TOPLEFT = 'TOPLEFT'
+
ImageRefMode
+
+
+
+ Bases: str
, Enum
ImageRefMode.
+ + + + + + + + + + + +Attributes:
+EMBEDDED
+ –
+ PLACEHOLDER
+ –
+ REFERENCED
+ –
+
EMBEDDED
+
+
+EMBEDDED = 'embedded'
+
PLACEHOLDER
+
+
+PLACEHOLDER = 'placeholder'
+
REFERENCED
+
+
+REFERENCED = 'referenced'
+
Size
+
+
+
+ Bases: BaseModel
Size.
+ + + + + + + + + +Methods:
+as_tuple
+ –
+ as_tuple.
+Attributes:
+ + + + + +
height
+
+
+height: float = 0.0
+
width
+
+
+width: float = 0.0
+
as_tuple
+
+
+as_tuple()
+
as_tuple.
+ +This is an automatically generated API reference of the main components of Docling.
+ + +
document_converter
+
+
+Classes:
+DocumentConverter
+ –
+ ConversionResult
+ –
+ ConversionStatus
+ –
+ FormatOption
+ –
+ InputFormat
+ –
+ PdfFormatOption
+ –
+ ImageFormatOption
+ –
+ StandardPdfPipeline
+ –
+ WordFormatOption
+ –
+ PowerpointFormatOption
+ –
+ MarkdownFormatOption
+ –
+ AsciiDocFormatOption
+ –
+ HTMLFormatOption
+ –
+ SimplePipeline
+ –
+ SimpleModelPipeline.
+
DocumentConverter
+
+
+DocumentConverter(
+ allowed_formats: Optional[List[InputFormat]] = None,
+ format_options: Optional[
+ Dict[InputFormat, FormatOption]
+ ] = None,
+)
+
Methods:
+convert
+ –
+ convert_all
+ –
+ initialize_pipeline
+ –
+ Initialize the conversion pipeline for the selected format.
+Attributes:
+allowed_formats
+ –
+ format_to_options
+ –
+ initialized_pipelines
+ (Dict[Type[BasePipeline], BasePipeline]
)
+ –
+
allowed_formats
+
+
+
+ instance-attribute
+
+
+allowed_formats = (
+ allowed_formats
+ if allowed_formats is not None
+ else [e for e in InputFormat]
+)
+
format_to_options
+
+
+
+ instance-attribute
+
+
+format_to_options = {format: _get_default_option(format=format) if (custom_option := get(format)) is None else custom_option for format in allowed_formats}
+
initialized_pipelines
+
+
+
+ instance-attribute
+
+
+initialized_pipelines: Dict[
+ Type[BasePipeline], BasePipeline
+] = {}
+
convert
+
+
+convert(
+ source: Union[Path, str, DocumentStream],
+ raises_on_error: bool = True,
+ max_num_pages: int = maxsize,
+ max_file_size: int = maxsize,
+) -> ConversionResult
+
convert_all
+
+
+convert_all(
+ source: Iterable[Union[Path, str, DocumentStream]],
+ raises_on_error: bool = True,
+ max_num_pages: int = maxsize,
+ max_file_size: int = maxsize,
+) -> Iterator[ConversionResult]
+
initialize_pipeline
+
+
+initialize_pipeline(format: InputFormat)
+
Initialize the conversion pipeline for the selected format.
+ +
ConversionResult
+
+
+
+ Bases: BaseModel
Attributes:
+assembled
+ (AssembledUnit
)
+ –
+ document
+ (DoclingDocument
)
+ –
+ errors
+ (List[ErrorItem]
)
+ –
+ input
+ (InputDocument
)
+ –
+ legacy_document
+ –
+ pages
+ (List[Page]
)
+ –
+ status
+ (ConversionStatus
)
+ –
+ timings
+ (Dict[str, ProfilingItem]
)
+ –
+
assembled
+
+
+
+ class-attribute
+ instance-attribute
+
+
+assembled: AssembledUnit = AssembledUnit()
+
document
+
+
+
+ class-attribute
+ instance-attribute
+
+
+document: DoclingDocument = _EMPTY_DOCLING_DOC
+
errors
+
+
+
+ class-attribute
+ instance-attribute
+
+
+errors: List[ErrorItem] = []
+
input
+
+
+
+ instance-attribute
+
+
+input: InputDocument
+
legacy_document
+
+
+
+ property
+
+
+legacy_document
+
pages
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pages: List[Page] = []
+
status
+
+
+
+ class-attribute
+ instance-attribute
+
+
+status: ConversionStatus = PENDING
+
timings
+
+
+
+ class-attribute
+ instance-attribute
+
+
+timings: Dict[str, ProfilingItem] = {}
+
ConversionStatus
+
+
+
+ Bases: str
, Enum
Attributes:
+FAILURE
+ –
+ PARTIAL_SUCCESS
+ –
+ PENDING
+ –
+ SKIPPED
+ –
+ STARTED
+ –
+ SUCCESS
+ –
+
FAILURE
+
+
+
+ class-attribute
+ instance-attribute
+
+
+FAILURE = auto()
+
PARTIAL_SUCCESS
+
+
+
+ class-attribute
+ instance-attribute
+
+
+PARTIAL_SUCCESS = auto()
+
PENDING
+
+
+
+ class-attribute
+ instance-attribute
+
+
+PENDING = auto()
+
SKIPPED
+
+
+
+ class-attribute
+ instance-attribute
+
+
+SKIPPED = auto()
+
STARTED
+
+
+
+ class-attribute
+ instance-attribute
+
+
+STARTED = auto()
+
SUCCESS
+
+
+
+ class-attribute
+ instance-attribute
+
+
+SUCCESS = auto()
+
FormatOption
+
+
+
+ Bases: BaseModel
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type[BasePipeline]
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend]
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ instance-attribute
+
+
+pipeline_cls: Type[BasePipeline]
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
InputFormat
+
+
+
+ Bases: str
, Enum
Attributes:
+ASCIIDOC
+ –
+ DOCX
+ –
+ HTML
+ –
+ IMAGE
+ –
+ MD
+ –
+ PDF
+ –
+ PPTX
+ –
+ XLSX
+ –
+
ASCIIDOC
+
+
+
+ class-attribute
+ instance-attribute
+
+
+ASCIIDOC = 'asciidoc'
+
DOCX
+
+
+
+ class-attribute
+ instance-attribute
+
+
+DOCX = 'docx'
+
HTML
+
+
+
+ class-attribute
+ instance-attribute
+
+
+HTML = 'html'
+
IMAGE
+
+
+
+ class-attribute
+ instance-attribute
+
+
+IMAGE = 'image'
+
MD
+
+
+
+ class-attribute
+ instance-attribute
+
+
+MD = 'md'
+
PDF
+
+
+
+ class-attribute
+ instance-attribute
+
+
+PDF = 'pdf'
+
PPTX
+
+
+
+ class-attribute
+ instance-attribute
+
+
+PPTX = 'pptx'
+
XLSX
+
+
+
+ class-attribute
+ instance-attribute
+
+
+XLSX = 'xlsx'
+
PdfFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = (
+ DoclingParseDocumentBackend
+)
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = StandardPdfPipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
ImageFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = (
+ DoclingParseDocumentBackend
+)
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = StandardPdfPipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
StandardPdfPipeline
+
+
+StandardPdfPipeline(pipeline_options: PdfPipelineOptions)
+
+ Bases: PaginatedPipeline
Methods:
+download_models_hf
+ –
+ execute
+ –
+ get_default_options
+ –
+ get_ocr_model
+ –
+ initialize_page
+ –
+ is_backend_supported
+ –
+ Attributes:
+artifacts_path
+ –
+ build_pipe
+ –
+ enrichment_pipe
+ –
+ glm_model
+ –
+ pipeline_options
+ (PdfPipelineOptions
)
+ –
+
artifacts_path
+
+
+
+ instance-attribute
+
+
+artifacts_path = download_models_hf()
+
build_pipe
+
+
+
+ instance-attribute
+
+
+build_pipe = [
+ PagePreprocessingModel(
+ options=PagePreprocessingOptions(
+ images_scale=images_scale
+ )
+ ),
+ ocr_model,
+ LayoutModel(
+ artifacts_path=artifacts_path / _layout_model_path
+ ),
+ TableStructureModel(
+ enabled=do_table_structure,
+ artifacts_path=artifacts_path / _table_model_path,
+ options=table_structure_options,
+ ),
+ PageAssembleModel(
+ options=PageAssembleOptions(keep_images=keep_images)
+ ),
+]
+
enrichment_pipe
+
+
+
+ instance-attribute
+
+
+enrichment_pipe = []
+
glm_model
+
+
+
+ instance-attribute
+
+
+glm_model = GlmModel(options=GlmOptions())
+
pipeline_options
+
+
+
+ instance-attribute
+
+
+pipeline_options: PdfPipelineOptions
+
download_models_hf
+
+
+
+ staticmethod
+
+
+download_models_hf(
+ local_dir: Optional[Path] = None, force: bool = False
+) -> Path
+
execute
+
+
+execute(
+ in_doc: InputDocument, raises_on_error: bool
+) -> ConversionResult
+
get_default_options
+
+
+
+ classmethod
+
+
+get_default_options() -> PdfPipelineOptions
+
get_ocr_model
+
+
+get_ocr_model() -> Optional[BaseOcrModel]
+
initialize_page
+
+
+initialize_page(
+ conv_res: ConversionResult, page: Page
+) -> Page
+
is_backend_supported
+
+
+
+ classmethod
+
+
+is_backend_supported(backend: AbstractDocumentBackend)
+
WordFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = (
+ MsWordDocumentBackend
+)
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = SimplePipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
PowerpointFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = (
+ MsPowerpointDocumentBackend
+)
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = SimplePipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
MarkdownFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = (
+ MarkdownDocumentBackend
+)
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = SimplePipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
AsciiDocFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = AsciiDocBackend
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = SimplePipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
HTMLFormatOption
+
+
+
+ Bases: FormatOption
Methods:
+set_optional_field_default
+ –
+ Attributes:
+backend
+ (Type[AbstractDocumentBackend]
)
+ –
+ model_config
+ –
+ pipeline_cls
+ (Type
)
+ –
+ pipeline_options
+ (Optional[PipelineOptions]
)
+ –
+
backend
+
+
+
+ class-attribute
+ instance-attribute
+
+
+backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(arbitrary_types_allowed=True)
+
pipeline_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_cls: Type = SimplePipeline
+
pipeline_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+pipeline_options: Optional[PipelineOptions] = None
+
set_optional_field_default
+
+
+set_optional_field_default() -> FormatOption
+
SimplePipeline
+
+
+SimplePipeline(pipeline_options: PipelineOptions)
+
+ Bases: BasePipeline
SimpleModelPipeline.
+This class is used at the moment for formats / backends +which produce straight DoclingDocument output.
+ + + + + + + + + +Methods:
+execute
+ –
+ get_default_options
+ –
+ is_backend_supported
+ –
+ Attributes:
+build_pipe
+ (List[Callable]
)
+ –
+ enrichment_pipe
+ (List[BaseEnrichmentModel]
)
+ –
+ pipeline_options
+ –
+
build_pipe
+
+
+
+ instance-attribute
+
+
+build_pipe: List[Callable] = []
+
enrichment_pipe
+
+
+
+ instance-attribute
+
+
+enrichment_pipe: List[BaseEnrichmentModel] = []
+
pipeline_options
+
+
+
+ instance-attribute
+
+
+pipeline_options = pipeline_options
+
execute
+
+
+execute(
+ in_doc: InputDocument, raises_on_error: bool
+) -> ConversionResult
+
get_default_options
+
+
+
+ classmethod
+
+
+get_default_options() -> PipelineOptions
+
is_backend_supported
+
+
+
+ classmethod
+
+
+is_backend_supported(backend: AbstractDocumentBackend)
+
Pipeline options allow you to customize the execution of the models during the conversion pipeline.
+This includes options for the OCR engines, the table model as well as enrichment options which
+can be enabled with do_xyz = True
.
This is an automatically generated API reference of all the pipeline options available in Docling.
+ + +
pipeline_options
+
+
+Classes:
+EasyOcrOptions
+ –
+ Options for the EasyOCR engine.
+OcrMacOptions
+ –
+ Options for the Mac OCR engine.
+OcrOptions
+ –
+ OCR options.
+PdfPipelineOptions
+ –
+ Options for the PDF pipeline.
+PipelineOptions
+ –
+ Base pipeline options.
+RapidOcrOptions
+ –
+ Options for the RapidOCR engine.
+TableFormerMode
+ –
+ Modes for the TableFormer model.
+TableStructureOptions
+ –
+ Options for the table structure.
+TesseractCliOcrOptions
+ –
+ Options for the TesseractCli engine.
+TesseractOcrOptions
+ –
+ Options for the Tesseract engine.
+
EasyOcrOptions
+
+
+
+ Bases: OcrOptions
Options for the EasyOCR engine.
+ + + + + + + + + + + +Attributes:
+bitmap_area_threshold
+ (float
)
+ –
+ download_enabled
+ (bool
)
+ –
+ force_full_page_ocr
+ (bool
)
+ –
+ kind
+ (Literal['easyocr']
)
+ –
+ lang
+ (List[str]
)
+ –
+ model_config
+ –
+ model_storage_directory
+ (Optional[str]
)
+ –
+ use_gpu
+ (bool
)
+ –
+
bitmap_area_threshold
+
+
+
+ class-attribute
+ instance-attribute
+
+
+bitmap_area_threshold: float = 0.05
+
download_enabled
+
+
+
+ class-attribute
+ instance-attribute
+
+
+download_enabled: bool = True
+
force_full_page_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+force_full_page_ocr: bool = False
+
kind
+
+
+
+ class-attribute
+ instance-attribute
+
+
+kind: Literal['easyocr'] = 'easyocr'
+
lang
+
+
+
+ class-attribute
+ instance-attribute
+
+
+lang: List[str] = ['fr', 'de', 'es', 'en']
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(
+ extra="forbid", protected_namespaces=()
+)
+
model_storage_directory
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_storage_directory: Optional[str] = None
+
use_gpu
+
+
+
+ class-attribute
+ instance-attribute
+
+
+use_gpu: bool = True
+
OcrMacOptions
+
+
+
+ Bases: OcrOptions
Options for the Mac OCR engine.
+ + + + + + + + + + + +Attributes:
+bitmap_area_threshold
+ (float
)
+ –
+ force_full_page_ocr
+ (bool
)
+ –
+ framework
+ (str
)
+ –
+ kind
+ (Literal['ocrmac']
)
+ –
+ lang
+ (List[str]
)
+ –
+ model_config
+ –
+ recognition
+ (str
)
+ –
+
bitmap_area_threshold
+
+
+
+ class-attribute
+ instance-attribute
+
+
+bitmap_area_threshold: float = 0.05
+
force_full_page_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+force_full_page_ocr: bool = False
+
framework
+
+
+
+ class-attribute
+ instance-attribute
+
+
+framework: str = 'vision'
+
kind
+
+
+
+ class-attribute
+ instance-attribute
+
+
+kind: Literal['ocrmac'] = 'ocrmac'
+
lang
+
+
+
+ class-attribute
+ instance-attribute
+
+
+lang: List[str] = ['fr-FR', 'de-DE', 'es-ES', 'en-US']
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(extra='forbid')
+
recognition
+
+
+
+ class-attribute
+ instance-attribute
+
+
+recognition: str = 'accurate'
+
OcrOptions
+
+
+
+ Bases: BaseModel
OCR options.
+ + + + + + + + + + + +Attributes:
+bitmap_area_threshold
+ (float
)
+ –
+ force_full_page_ocr
+ (bool
)
+ –
+ kind
+ (str
)
+ –
+ lang
+ (List[str]
)
+ –
+
bitmap_area_threshold
+
+
+
+ class-attribute
+ instance-attribute
+
+
+bitmap_area_threshold: float = 0.05
+
force_full_page_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+force_full_page_ocr: bool = False
+
kind
+
+
+
+ instance-attribute
+
+
+kind: str
+
lang
+
+
+
+ instance-attribute
+
+
+lang: List[str]
+
PdfPipelineOptions
+
+
+
+ Bases: PipelineOptions
Options for the PDF pipeline.
+ + + + + + + + + + + +Attributes:
+artifacts_path
+ (Optional[Union[Path, str]]
)
+ –
+ create_legacy_output
+ (bool
)
+ –
+ do_ocr
+ (bool
)
+ –
+ do_table_structure
+ (bool
)
+ –
+ generate_page_images
+ (bool
)
+ –
+ generate_picture_images
+ (bool
)
+ –
+ generate_table_images
+ (bool
)
+ –
+ images_scale
+ (float
)
+ –
+ ocr_options
+ (Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions]
)
+ –
+ table_structure_options
+ (TableStructureOptions
)
+ –
+
artifacts_path
+
+
+
+ class-attribute
+ instance-attribute
+
+
+artifacts_path: Optional[Union[Path, str]] = None
+
create_legacy_output
+
+
+
+ class-attribute
+ instance-attribute
+
+
+create_legacy_output: bool = True
+
do_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+do_ocr: bool = True
+
do_table_structure
+
+
+
+ class-attribute
+ instance-attribute
+
+
+do_table_structure: bool = True
+
generate_page_images
+
+
+
+ class-attribute
+ instance-attribute
+
+
+generate_page_images: bool = False
+
generate_picture_images
+
+
+
+ class-attribute
+ instance-attribute
+
+
+generate_picture_images: bool = False
+
generate_table_images
+
+
+
+ class-attribute
+ instance-attribute
+
+
+generate_table_images: bool = Field(
+ default=False,
+ deprecated="Field `generate_table_images` is deprecated. To obtain table images, set `PdfPipelineOptions.generate_page_images = True` before conversion and then use the `TableItem.get_image` function.",
+)
+
images_scale
+
+
+
+ class-attribute
+ instance-attribute
+
+
+images_scale: float = 1.0
+
ocr_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+ocr_options: Union[
+ EasyOcrOptions,
+ TesseractCliOcrOptions,
+ TesseractOcrOptions,
+ OcrMacOptions,
+] = Field(EasyOcrOptions(), discriminator="kind")
+
table_structure_options
+
+
+
+ class-attribute
+ instance-attribute
+
+
+table_structure_options: TableStructureOptions = (
+ TableStructureOptions()
+)
+
PipelineOptions
+
+
+
+ Bases: BaseModel
Base pipeline options.
+ + + + + + + + + + + +Attributes:
+create_legacy_output
+ (bool
)
+ –
+
create_legacy_output
+
+
+
+ class-attribute
+ instance-attribute
+
+
+create_legacy_output: bool = True
+
RapidOcrOptions
+
+
+
+ Bases: OcrOptions
Options for the RapidOCR engine.
+ + + + + + + + + + + +Attributes:
+bitmap_area_threshold
+ (float
)
+ –
+ cls_model_path
+ (Optional[str]
)
+ –
+ det_model_path
+ (Optional[str]
)
+ –
+ force_full_page_ocr
+ (bool
)
+ –
+ kind
+ (Literal['rapidocr']
)
+ –
+ lang
+ (List[str]
)
+ –
+ model_config
+ –
+ print_verbose
+ (bool
)
+ –
+ rec_model_path
+ (Optional[str]
)
+ –
+ text_score
+ (float
)
+ –
+ use_cls
+ (Optional[bool]
)
+ –
+ use_det
+ (Optional[bool]
)
+ –
+ use_rec
+ (Optional[bool]
)
+ –
+
bitmap_area_threshold
+
+
+
+ class-attribute
+ instance-attribute
+
+
+bitmap_area_threshold: float = 0.05
+
cls_model_path
+
+
+
+ class-attribute
+ instance-attribute
+
+
+cls_model_path: Optional[str] = None
+
det_model_path
+
+
+
+ class-attribute
+ instance-attribute
+
+
+det_model_path: Optional[str] = None
+
force_full_page_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+force_full_page_ocr: bool = False
+
kind
+
+
+
+ class-attribute
+ instance-attribute
+
+
+kind: Literal['rapidocr'] = 'rapidocr'
+
lang
+
+
+
+ class-attribute
+ instance-attribute
+
+
+lang: List[str] = ['english', 'chinese']
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(extra='forbid')
+
print_verbose
+
+
+
+ class-attribute
+ instance-attribute
+
+
+print_verbose: bool = False
+
rec_model_path
+
+
+
+ class-attribute
+ instance-attribute
+
+
+rec_model_path: Optional[str] = None
+
text_score
+
+
+
+ class-attribute
+ instance-attribute
+
+
+text_score: float = 0.5
+
use_cls
+
+
+
+ class-attribute
+ instance-attribute
+
+
+use_cls: Optional[bool] = None
+
use_det
+
+
+
+ class-attribute
+ instance-attribute
+
+
+use_det: Optional[bool] = None
+
use_rec
+
+
+
+ class-attribute
+ instance-attribute
+
+
+use_rec: Optional[bool] = None
+
TableFormerMode
+
+
+
+ Bases: str
, Enum
Modes for the TableFormer model.
+ + + + + + + + + + + +Attributes:
+ + + + + +
ACCURATE
+
+
+
+ class-attribute
+ instance-attribute
+
+
+ACCURATE = 'accurate'
+
FAST
+
+
+
+ class-attribute
+ instance-attribute
+
+
+FAST = 'fast'
+
TableStructureOptions
+
+
+
+ Bases: BaseModel
Options for the table structure.
+ + + + + + + + + + + +Attributes:
+do_cell_matching
+ (bool
)
+ –
+ mode
+ (TableFormerMode
)
+ –
+
do_cell_matching
+
+
+
+ class-attribute
+ instance-attribute
+
+
+do_cell_matching: bool = True
+
mode
+
+
+
+ class-attribute
+ instance-attribute
+
+
+mode: TableFormerMode = FAST
+
TesseractCliOcrOptions
+
+
+
+ Bases: OcrOptions
Options for the TesseractCli engine.
+ + + + + + + + + + + +Attributes:
+bitmap_area_threshold
+ (float
)
+ –
+ force_full_page_ocr
+ (bool
)
+ –
+ kind
+ (Literal['tesseract']
)
+ –
+ lang
+ (List[str]
)
+ –
+ model_config
+ –
+ path
+ (Optional[str]
)
+ –
+ tesseract_cmd
+ (str
)
+ –
+
bitmap_area_threshold
+
+
+
+ class-attribute
+ instance-attribute
+
+
+bitmap_area_threshold: float = 0.05
+
force_full_page_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+force_full_page_ocr: bool = False
+
kind
+
+
+
+ class-attribute
+ instance-attribute
+
+
+kind: Literal['tesseract'] = 'tesseract'
+
lang
+
+
+
+ class-attribute
+ instance-attribute
+
+
+lang: List[str] = ['fra', 'deu', 'spa', 'eng']
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(extra='forbid')
+
path
+
+
+
+ class-attribute
+ instance-attribute
+
+
+path: Optional[str] = None
+
tesseract_cmd
+
+
+
+ class-attribute
+ instance-attribute
+
+
+tesseract_cmd: str = 'tesseract'
+
TesseractOcrOptions
+
+
+
+ Bases: OcrOptions
Options for the Tesseract engine.
+ + + + + + + + + + + +Attributes:
+bitmap_area_threshold
+ (float
)
+ –
+ force_full_page_ocr
+ (bool
)
+ –
+ kind
+ (Literal['tesserocr']
)
+ –
+ lang
+ (List[str]
)
+ –
+ model_config
+ –
+ path
+ (Optional[str]
)
+ –
+
bitmap_area_threshold
+
+
+
+ class-attribute
+ instance-attribute
+
+
+bitmap_area_threshold: float = 0.05
+
force_full_page_ocr
+
+
+
+ class-attribute
+ instance-attribute
+
+
+force_full_page_ocr: bool = False
+
kind
+
+
+
+ class-attribute
+ instance-attribute
+
+
+kind: Literal['tesserocr'] = 'tesserocr'
+
lang
+
+
+
+ class-attribute
+ instance-attribute
+
+
+lang: List[str] = ['fra', 'deu', 'spa', 'eng']
+
model_config
+
+
+
+ class-attribute
+ instance-attribute
+
+
+model_config = ConfigDict(extra='forbid')
+
path
+
+
+
+ class-attribute
+ instance-attribute
+
+
+path: Optional[str] = None
+