diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index efdf3b1c..00ab7b41 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -221,6 +221,7 @@ class PdfPipelineOptions(PipelineOptions): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_code_enrichment: bool = False # True: perform code OCR do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code + do_picture_classification: bool = False # True: classify pictures in documents table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: Union[ diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py new file mode 100644 index 00000000..6e2d90b4 --- /dev/null +++ b/docling/models/document_picture_classifier.py @@ -0,0 +1,187 @@ +from pathlib import Path +from typing import Iterable, List, Literal, Optional, Tuple, Union + +from docling_core.types.doc import ( + DoclingDocument, + NodeItem, + PictureClassificationClass, + PictureClassificationData, + PictureItem, +) +from PIL import Image +from pydantic import BaseModel + +from docling.datamodel.pipeline_options import AcceleratorOptions +from docling.models.base_model import BaseEnrichmentModel +from docling.utils.accelerator_utils import decide_device + + +class DocumentPictureClassifierOptions(BaseModel): + """ + Options for configuring the DocumentPictureClassifier. + + Attributes + ---------- + kind : Literal["document_picture_classifier"] + Identifier for the type of classifier. + """ + + kind: Literal["document_picture_classifier"] = "document_picture_classifier" + + +class DocumentPictureClassifier(BaseEnrichmentModel): + """ + A model for classifying pictures in documents. + + This class enriches document pictures with predicted classifications + based on a predefined set of classes. 
+ + Attributes + ---------- + enabled : bool + Whether the classifier is enabled for use. + options : DocumentPictureClassifierOptions + Configuration options for the classifier. + document_picture_classifier : DocumentFigureClassifierPredictor + The underlying prediction model; only set when the classifier is enabled. + + Methods + ------- + __init__(enabled, artifacts_path, options, accelerator_options) + Initializes the classifier with specified configurations. + is_processable(doc, element) + Checks if the given element can be processed by the classifier. + __call__(doc, element_batch) + Processes a batch of elements and adds classification annotations. + """ + + images_scale = 2 + + def __init__( + self, + enabled: bool, + artifacts_path: Optional[Union[Path, str]], + options: DocumentPictureClassifierOptions, + accelerator_options: AcceleratorOptions, + ): + """ + Initializes the DocumentPictureClassifier. + + Parameters + ---------- + enabled : bool + Indicates whether the classifier is enabled. + artifacts_path : Optional[Union[Path, str]] + Path to the directory containing model artifacts; if None, they are fetched via download_models_hf(). + options : DocumentPictureClassifierOptions + Configuration options for the classifier. + accelerator_options : AcceleratorOptions + Options for configuring the device and parallelism. 
+ """ + self.enabled = enabled + self.options = options + + if self.enabled: + device = decide_device(accelerator_options.device) + from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import ( + DocumentFigureClassifierPredictor, + ) + + if artifacts_path is None: + artifacts_path = self.download_models_hf() + else: + artifacts_path = Path(artifacts_path) + + self.document_picture_classifier = DocumentFigureClassifierPredictor( + artifacts_path=artifacts_path, + device=device, + num_threads=accelerator_options.num_threads, + ) + + @staticmethod + def download_models_hf( + local_dir: Optional[Path] = None, force: bool = False + ) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + disable_progress_bars() + download_path = snapshot_download( + repo_id="ds4sd/DocumentFigureClassifier", + force_download=force, + local_dir=local_dir, + revision="v1.0.0", + ) + + return Path(download_path) + + def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: + """ + Determines if the given element can be processed by the classifier. + + Parameters + ---------- + doc : DoclingDocument + The document containing the element. + element : NodeItem + The element to be checked. + + Returns + ------- + bool + True if the element is a PictureItem and processing is enabled; False otherwise. + """ + return self.enabled and isinstance(element, PictureItem) + + def __call__( + self, + doc: DoclingDocument, + element_batch: Iterable[NodeItem], + ) -> Iterable[NodeItem]: + """ + Processes a batch of elements and enriches them with classification predictions. + + Parameters + ---------- + doc : DoclingDocument + The document containing the elements to be processed. + element_batch : Iterable[NodeItem] + A batch of pictures to classify. + + Returns + ------- + Iterable[NodeItem] + An iterable of NodeItem objects after processing. 
When enabled, each picture + gets a PictureClassificationData entry appended to its 'annotations' list. + """ + if not self.enabled: + for element in element_batch: + yield element + return + + images: List[Image.Image] = [] + elements: List[PictureItem] = [] + for el in element_batch: + assert isinstance(el, PictureItem) + elements.append(el) + img = el.get_image(doc) + assert img is not None + images.append(img) + + outputs = self.document_picture_classifier.predict(images) + + for element, output in zip(elements, outputs): + element.annotations.append( + PictureClassificationData( + provenance="DocumentPictureClassifier", + predicted_classes=[ + PictureClassificationClass( + class_name=pred[0], + confidence=pred[1], + ) + for pred in output + ], + ) + ) + + yield element diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 97bcc6b6..fe2201d6 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -19,6 +19,10 @@ ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions +from docling.models.document_picture_classifier import ( + DocumentPictureClassifier, + DocumentPictureClassifierOptions, +) from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel @@ -104,6 +108,13 @@ def __init__(self, pipeline_options: PdfPipelineOptions): ), accelerator_options=pipeline_options.accelerator_options, ), + # Document Picture Classifier + DocumentPictureClassifier( + enabled=pipeline_options.do_picture_classification, + artifacts_path=pipeline_options.artifacts_path, + options=DocumentPictureClassifierOptions(), + accelerator_options=pipeline_options.accelerator_options, + ), ] if ( diff --git a/poetry.lock b/poetry.lock index 73fc85db..c3905141 100644 --- a/poetry.lock +++ b/poetry.lock @@ -888,13 
+888,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] [[package]] name = "docling-ibm-models" -version = "3.2.1" +version = "3.3.0" description = "This package contains the AI models used by the Docling PDF conversion package" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_ibm_models-3.2.1-py3-none-any.whl", hash = "sha256:55bca5673381cc5862f4de584345020d071414c46bc1b9f6436d674e3610ec97"}, - {file = "docling_ibm_models-3.2.1.tar.gz", hash = "sha256:abd1bdc58f00600065eedbfbd34876704d5004cd20884a2c0a61ca2ee5a927dd"}, + {file = "docling_ibm_models-3.3.0-py3-none-any.whl", hash = "sha256:f1c99d345cb524239c7a2090969920e4311fd2fe22dad9bd609bc38039ec56eb"}, + {file = "docling_ibm_models-3.3.0.tar.gz", hash = "sha256:5a7497053871179d59870c830945aa8664a34aac48b7e68edf602720ee7f6c49"}, ] [package.dependencies] @@ -1046,13 +1046,13 @@ testing = ["hatch", "pre-commit", "pytest", "tox"] [[package]] name = "executing" -version = "2.1.0" +version = "2.2.0" description = "Get the currently executing AST node of a frame, and other information" optional = false python-versions = ">=3.8" files = [ - {file = "executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf"}, - {file = "executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab"}, + {file = "executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"}, + {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"}, ] [package.extras] @@ -3674,14 +3674,14 @@ files = [ [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.6.85" +version = "12.8.61" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = 
"sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a"}, - {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41"}, - {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c"}, + {file = "nvidia_nvjitlink_cu12-12.8.61-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:45fd79f2ae20bd67e8bc411055939049873bfd8fac70ff13bd4865e0b9bdab17"}, + {file = "nvidia_nvjitlink_cu12-12.8.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b80ecab31085dda3ce3b41d043be0ec739216c3fc633b8abe212d5a30026df0"}, + {file = "nvidia_nvjitlink_cu12-12.8.61-py3-none-win_amd64.whl", hash = "sha256:1166a964d25fdc0eae497574d38824305195a5283324a21ccb0ce0c802cbf41c"}, ] [[package]] @@ -4612,13 +4612,13 @@ files = [ [[package]] name = "pydantic" -version = "2.10.5" +version = "2.10.6" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.10.5-py3-none-any.whl", hash = "sha256:4dd4e322dbe55472cb7ca7e73f4b63574eecccf2835ffa2af9021ce113c83c53"}, - {file = "pydantic-2.10.5.tar.gz", hash = "sha256:278b38dbbaec562011d659ee05f63346951b3a248a6f3642e1bc68894ea2b4ff"}, + {file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"}, + {file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"}, ] [package.dependencies] @@ -6124,13 +6124,13 @@ files = [ [[package]] name = "sentence-transformers" -version = "3.3.1" +version = "3.4.0" description = "State-of-the-Art Text Embeddings" optional = false python-versions = ">=3.9" files = [ - {file = "sentence_transformers-3.3.1-py3-none-any.whl", hash = 
"sha256:abffcc79dab37b7d18d21a26d5914223dd42239cfe18cb5e111c66c54b658ae7"}, - {file = "sentence_transformers-3.3.1.tar.gz", hash = "sha256:9635dbfb11c6b01d036b9cfcee29f7716ab64cf2407ad9f403a2e607da2ac48b"}, + {file = "sentence_transformers-3.4.0-py3-none-any.whl", hash = "sha256:f7d4ad81260149172a98108a3481d8e82c11d31f40d41885f43d481149237743"}, + {file = "sentence_transformers-3.4.0.tar.gz", hash = "sha256:334288062d4b888cdd7b75913fead46b1e42bfe836f8343d23478d17f799e650"}, ] [package.dependencies] @@ -7487,13 +7487,13 @@ files = [ [[package]] name = "xlsxwriter" -version = "3.2.0" +version = "3.2.1" description = "A Python module for creating Excel XLSX files." optional = false python-versions = ">=3.6" files = [ - {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, - {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, + {file = "XlsxWriter-3.2.1-py3-none-any.whl", hash = "sha256:7e8f7c60b7a1660ef791d46ab5de78469cb978b991ca841af61f5832d2f9f4fe"}, + {file = "XlsxWriter-3.2.1.tar.gz", hash = "sha256:97618759cb264fb6a93397f660cca156ffa9561743b1823dafb60dc4474e1902"}, ] [[package]] @@ -7751,4 +7751,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "8bb0b67294a50c0340c5cc02ce60d3608ef4d1968ae50f7e0b8b4c8a26c34734" +content-hash = "7fcfc061454f229745d6f305e1fa593468a684059717195c6ae4174bec13d362" diff --git a/pyproject.toml b/pyproject.toml index c3e1fa67..6945e5a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ packages = [{include = "docling"}] python = "^3.9" pydantic = "^2.0.0" docling-core = { version = "^2.15.1", extras = ["chunking"] } -docling-ibm-models = "^3.2.1" +docling-ibm-models = "^3.3.0" deepsearch-glm = "^1.0.0" docling-parse = "^3.1.0" filetype = "^1.2.0" diff --git a/tests/data/groundtruth/docling_v1/picture_classification.doctags.txt 
b/tests/data/groundtruth/docling_v1/picture_classification.doctags.txt new file mode 100644 index 00000000..cbcde73f --- /dev/null +++ b/tests/data/groundtruth/docling_v1/picture_classification.doctags.txt @@ -0,0 +1,17 @@ + +Figures Example +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. +Figure 1: This is an example image. +
+ +Figure 1: This is an example image. +
+Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. +Figure 2: This is an example image. +
+ +Figure 2: This is an example image. +
+Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. +
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v1/picture_classification.json b/tests/data/groundtruth/docling_v1/picture_classification.json new file mode 100644 index 00000000..ad1bc5c2 --- /dev/null +++ b/tests/data/groundtruth/docling_v1/picture_classification.json @@ -0,0 +1 @@ +{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "picture_classification.pdf", "filename-prov": null, "document-hash": "959854dff729acaa22404d629a45cefcad8d942e595961185fc03a80d9fcc3a1", "#-pages": 2, "collection-name": null, "description": null, "page-hashes": [{"hash": "d9e3fc1226356b30c66012f05ad14089b00c59ea129195cd6ff8a0c68bda6f39", "model": "default", "page": 1}, {"hash": "9386884e13a97ce9662210a7e4258bbbb4f2e0e00663636160918e55b2806575", "model": "default", "page": 2}]}, "main-text": [{"prov": [{"bbox": [133.76800537109375, 654.4518432617188, 252.35513305664062, 667.1912231445312], "page": 1, "span": [0, 15], "__ref_s3_data": null}], "text": "Figures Example", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [133.76800537109375, 501.97412109375, 477.4827575683594, 642.3280639648438], "page": 1, "span": [0, 887], "__ref_s3_data": null}], "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [226.89100646972656, 254.0182647705078, 384.35479736328125, 262.86505126953125], "page": 1, "span": [0, 35], "__ref_s3_data": null}], "text": "Figure 1: This is an example image.", "type": "caption", "payload": null, "name": "Caption", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/0"}, {"prov": [{"bbox": [133.76800537109375, 122.51225280761719, 477.4817199707031, 238.95504760742188], "page": 1, "span": [0, 747], "__ref_s3_data": null}], "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [303.13299560546875, 87.43224334716797, 308.1142883300781, 96.27903747558594], "page": 1, "span": [0, 1], "__ref_s3_data": null}], "text": "1", "type": "page-footer", "payload": null, "name": "Page-footer", "font": null}, {"prov": [{"bbox": [133.76800537109375, 523.7951049804688, 477.4817199707031, 664.1490478515625], "page": 2, "span": [0, 887], "__ref_s3_data": null}], "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [226.89100646972656, 259.9422607421875, 384.35479736328125, 268.7890319824219], "page": 2, "span": [0, 35], "__ref_s3_data": null}], "text": "Figure 2: This is an example image.", "type": "caption", "payload": null, "name": "Caption", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/1"}, {"prov": [{"bbox": [133.76800537109375, 117.32023620605469, 477.4817199707031, 245.71804809570312], "page": 2, "span": [0, 804], "__ref_s3_data": null}], "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [303.13299560546875, 87.43224334716797, 308.1142883300781, 96.27903747558594], "page": 2, "span": [0, 1], "__ref_s3_data": null}], "text": "2", "type": "page-footer", "payload": null, "name": "Page-footer", "font": null}], "figures": [{"prov": [{"bbox": [134.92005920410156, 281.78173828125, 475.66351318359375, 487.109375], "page": 1, "span": [0, 35], "__ref_s3_data": null}], "text": "Figure 1: This is an example image.", "type": "figure", "payload": null, "bounding-box": null}, {"prov": [{"bbox": [218.8155517578125, 283.10589599609375, 391.96246337890625, 513.984619140625], "page": 2, "span": [0, 35], "__ref_s3_data": null}], "text": "Figure 2: This is an example image.", "type": "figure", "payload": null, "bounding-box": null}], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 792.0, "page": 1, "width": 612.0}, {"height": 792.0, "page": 2, "width": 612.0}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v1/picture_classification.md b/tests/data/groundtruth/docling_v1/picture_classification.md new file mode 100644 index 00000000..6b9d1faf --- /dev/null +++ b/tests/data/groundtruth/docling_v1/picture_classification.md @@ -0,0 +1,15 @@ +## Figures Example + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + +Figure 1: This is an example image. + + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + +Figure 2: This is an example image. + + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v1/picture_classification.pages.json b/tests/data/groundtruth/docling_v1/picture_classification.pages.json new file mode 100644 index 00000000..744f54fb --- /dev/null +++ b/tests/data/groundtruth/docling_v1/picture_classification.pages.json @@ -0,0 +1 @@ +[{"page_no": 0, "size": {"width": 612.0, "height": 792.0}, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}, {"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 3, "label": "section_header", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}, "confidence": 0.9627318382263184, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.48276, "b": 290.02588, "coord_origin": "TOPLEFT"}, "confidence": 0.9869933128356934, "cells": [{"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 4, "label": "caption", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}, "confidence": 0.9477447271347046, "cells": [{"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 553.04495, "r": 477.48172000000005, "b": 669.48775, "coord_origin": "TOPLEFT"}, "confidence": 0.9862836599349976, "cells": [{"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 5, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.8646790981292725, "cells": [{"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 2, "label": "picture", "bbox": {"l": 134.92005920410156, "t": 304.890625, "r": 475.66351318359375, "b": 510.21826171875, "coord_origin": "TOPLEFT"}, "confidence": 0.9803217053413391, "cells": [], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, 
"assembled": {"elements": [{"label": "section_header", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "section_header", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}, "confidence": 0.9627318382263184, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figures Example"}, {"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.48276, "b": 290.02588, "coord_origin": "TOPLEFT"}, "confidence": 0.9869933128356934, "cells": [{"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 4, "page_no": 0, "cluster": {"id": 4, "label": "caption", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}, "confidence": 0.9477447271347046, "cells": [{"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 1: This is an example image."}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 553.04495, "r": 477.48172000000005, "b": 669.48775, "coord_origin": "TOPLEFT"}, "confidence": 0.9862836599349976, "cells": [{"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."}, {"label": "page_footer", "id": 5, "page_no": 0, "cluster": {"id": 5, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.8646790981292725, "cells": [{"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "1"}, {"label": "picture", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 134.92005920410156, "t": 304.890625, "r": 475.66351318359375, "b": 510.21826171875, "coord_origin": "TOPLEFT"}, "confidence": 0.9803217053413391, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "body": [{"label": "section_header", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "section_header", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}, "confidence": 0.9627318382263184, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figures Example"}, {"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.48276, "b": 290.02588, "coord_origin": "TOPLEFT"}, "confidence": 0.9869933128356934, "cells": [{"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 
161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 4, "page_no": 0, "cluster": {"id": 4, "label": "caption", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}, "confidence": 0.9477447271347046, "cells": [{"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 1: This is an example image."}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 553.04495, "r": 477.48172000000005, "b": 669.48775, "coord_origin": "TOPLEFT"}, "confidence": 0.9862836599349976, "cells": [{"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."}, {"label": "picture", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 134.92005920410156, "t": 304.890625, "r": 475.66351318359375, "b": 510.21826171875, "coord_origin": "TOPLEFT"}, "confidence": 0.9803217053413391, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "headers": [{"label": "page_footer", "id": 5, "page_no": 0, "cluster": {"id": 5, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.8646790981292725, "cells": [{"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "1"}]}}, {"page_no": 1, "size": {"width": 612.0, "height": 792.0}, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 127.85095000000013, "r": 477.48172000000005, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}, "confidence": 0.987092912197113, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 3, "label": "caption", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}, "confidence": 0.9494235515594482, "cells": [{"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 546.28195, "r": 477.48172000000005, "b": 674.67976, "coord_origin": "TOPLEFT"}, "confidence": 0.9874356985092163, "cells": [{"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 4, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.888852059841156, "cells": [{"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 2, "label": "picture", "bbox": {"l": 218.8155517578125, "t": 278.0154113769531, "r": 391.96246337890625, "b": 508.89410400390625, "coord_origin": "TOPLEFT"}, "confidence": 0.9837717413902283, "cells": [], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 1, "page_no": 1, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 127.85095000000013, "r": 477.48172000000005, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}, "confidence": 0.987092912197113, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 3, "page_no": 1, "cluster": {"id": 3, "label": "caption", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}, "confidence": 0.9494235515594482, "cells": [{"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 2: This is an example image."}, {"label": "text", "id": 0, "page_no": 1, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 546.28195, "r": 477.48172000000005, "b": 674.67976, "coord_origin": "TOPLEFT"}, "confidence": 0.9874356985092163, "cells": [{"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum."}, {"label": "page_footer", "id": 4, "page_no": 1, "cluster": {"id": 4, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.888852059841156, "cells": [{"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "2"}, {"label": "picture", "id": 2, "page_no": 1, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 218.8155517578125, "t": 278.0154113769531, "r": 391.96246337890625, "b": 508.89410400390625, "coord_origin": "TOPLEFT"}, "confidence": 0.9837717413902283, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "body": [{"label": "text", "id": 1, "page_no": 1, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 127.85095000000013, "r": 477.48172000000005, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}, "confidence": 0.987092912197113, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 3, "page_no": 1, "cluster": {"id": 3, "label": "caption", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}, "confidence": 0.9494235515594482, "cells": [{"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 2: This is an example image."}, {"label": "text", "id": 0, "page_no": 1, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 546.28195, "r": 477.48172000000005, "b": 674.67976, "coord_origin": "TOPLEFT"}, "confidence": 0.9874356985092163, "cells": [{"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum."}, {"label": "picture", "id": 2, "page_no": 1, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 218.8155517578125, "t": 278.0154113769531, "r": 391.96246337890625, "b": 508.89410400390625, "coord_origin": "TOPLEFT"}, "confidence": 0.9837717413902283, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "headers": [{"label": "page_footer", "id": 4, "page_no": 1, "cluster": {"id": 4, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.888852059841156, "cells": [{"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "2"}]}}] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/picture_classification.doctags.txt b/tests/data/groundtruth/docling_v2/picture_classification.doctags.txt new file mode 100644 index 00000000..a86cbe7b --- /dev/null +++ b/tests/data/groundtruth/docling_v2/picture_classification.doctags.txt @@ -0,0 +1,15 @@ + +Figures Example +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. +
+ +Figure 1: This is an example image. +
+Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. +
+ +Figure 2: This is an example image. +
+Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. +
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/picture_classification.json b/tests/data/groundtruth/docling_v2/picture_classification.json new file mode 100644 index 00000000..e80011b3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/picture_classification.json @@ -0,0 +1 @@ +{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "picture_classification", "origin": {"mimetype": "application/pdf", "binary_hash": 6445357065749877499, "filename": "picture_classification.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}, {"cref": "#/texts/2"}, {"cref": "#/pictures/0"}, {"cref": "#/texts/3"}, {"cref": "#/texts/4"}, {"cref": "#/texts/5"}, {"cref": "#/texts/6"}, {"cref": "#/pictures/1"}, {"cref": "#/texts/7"}, {"cref": "#/texts/8"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "section_header", "prov": [{"page_no": 1, "bbox": {"l": 133.76800537109375, "t": 667.1912231445312, "r": 252.35513305664062, "b": 654.4518432617188, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 15]}], "orig": "Figures Example", "text": "Figures Example", "level": 1}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 133.76800537109375, "t": 642.3280639648438, "r": 477.4827575683594, "b": 501.97412109375, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 887]}], "orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.", "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"self_ref": "#/texts/2", "parent": {"cref": "#/body"}, "children": [], "label": "caption", "prov": [{"page_no": 1, "bbox": {"l": 226.89100646972656, "t": 262.86505126953125, "r": 384.35479736328125, "b": 254.0182647705078, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 35]}], "orig": "Figure 1: This is an example image.", "text": "Figure 1: This is an example image."}, {"self_ref": "#/texts/3", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 133.76800537109375, "t": 238.95504760742188, "r": 477.4817199707031, "b": 122.51225280761719, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 747]}], "orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.", "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."}, {"self_ref": "#/texts/4", "parent": {"cref": "#/body"}, "children": [], "label": "page_footer", "prov": [{"page_no": 1, "bbox": {"l": 303.13299560546875, "t": 96.27903747558594, "r": 308.1142883300781, "b": 87.43224334716797, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 1]}], "orig": "1", "text": "1"}, {"self_ref": "#/texts/5", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 2, "bbox": {"l": 133.76800537109375, "t": 664.1490478515625, "r": 477.4817199707031, "b": 523.7951049804688, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 887]}], "orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.", "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"self_ref": "#/texts/6", "parent": {"cref": "#/body"}, "children": [], "label": "caption", "prov": [{"page_no": 2, "bbox": {"l": 226.89100646972656, "t": 268.7890319824219, "r": 384.35479736328125, "b": 259.9422607421875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 35]}], "orig": "Figure 2: This is an example image.", "text": "Figure 2: This is an example image."}, {"self_ref": "#/texts/7", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 2, "bbox": {"l": 133.76800537109375, "t": 245.71804809570312, "r": 477.4817199707031, "b": 117.32023620605469, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 804]}], "orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.", "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum."}, {"self_ref": "#/texts/8", "parent": {"cref": "#/body"}, "children": [], "label": "page_footer", "prov": [{"page_no": 2, "bbox": {"l": 303.13299560546875, "t": 96.27903747558594, "r": 308.1142883300781, "b": 87.43224334716797, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 1]}], "orig": "2", "text": "2"}], "pictures": [{"self_ref": "#/pictures/0", "parent": {"cref": "#/body"}, "children": [], "label": "picture", "prov": [{"page_no": 1, "bbox": {"l": 134.92005920410156, "t": 487.109375, "r": 475.66351318359375, "b": 281.78173828125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 35]}], "captions": [{"cref": "#/texts/2"}], "references": [], "footnotes": [], "image": null, "annotations": []}, {"self_ref": "#/pictures/1", "parent": {"cref": "#/body"}, "children": [], "label": "picture", "prov": [{"page_no": 2, "bbox": {"l": 218.8155517578125, "t": 513.984619140625, "r": 391.96246337890625, "b": 283.10589599609375, "coord_origin": "BOTTOMLEFT"}, "charspan": 
[0, 35]}], "captions": [{"cref": "#/texts/6"}], "references": [], "footnotes": [], "image": null, "annotations": []}], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 612.0, "height": 792.0}, "image": null, "page_no": 1}, "2": {"size": {"width": 612.0, "height": 792.0}, "image": null, "page_no": 2}}} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/picture_classification.md b/tests/data/groundtruth/docling_v2/picture_classification.md new file mode 100644 index 00000000..8e233a96 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/picture_classification.md @@ -0,0 +1,17 @@ +## Figures Example + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + +Figure 1: This is an example image. + + + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + +Figure 2: This is an example image. + + + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/picture_classification.pages.json b/tests/data/groundtruth/docling_v2/picture_classification.pages.json new file mode 100644 index 00000000..744f54fb --- /dev/null +++ b/tests/data/groundtruth/docling_v2/picture_classification.pages.json @@ -0,0 +1 @@ +[{"page_no": 0, "size": {"width": 612.0, "height": 792.0}, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}, {"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 3, "label": "section_header", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}, "confidence": 0.9627318382263184, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.48276, "b": 290.02588, "coord_origin": "TOPLEFT"}, "confidence": 0.9869933128356934, "cells": [{"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 4, "label": "caption", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}, "confidence": 0.9477447271347046, "cells": [{"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 553.04495, "r": 477.48172000000005, "b": 669.48775, "coord_origin": "TOPLEFT"}, "confidence": 0.9862836599349976, "cells": [{"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 5, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.8646790981292725, "cells": [{"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 2, "label": "picture", "bbox": {"l": 134.92005920410156, "t": 304.890625, "r": 475.66351318359375, "b": 510.21826171875, "coord_origin": "TOPLEFT"}, "confidence": 0.9803217053413391, "cells": [], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, 
"assembled": {"elements": [{"label": "section_header", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "section_header", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}, "confidence": 0.9627318382263184, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figures Example"}, {"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.48276, "b": 290.02588, "coord_origin": "TOPLEFT"}, "confidence": 0.9869933128356934, "cells": [{"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 4, "page_no": 0, "cluster": {"id": 4, "label": "caption", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}, "confidence": 0.9477447271347046, "cells": [{"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 1: This is an example image."}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 553.04495, "r": 477.48172000000005, "b": 669.48775, "coord_origin": "TOPLEFT"}, "confidence": 0.9862836599349976, "cells": [{"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."}, {"label": "page_footer", "id": 5, "page_no": 0, "cluster": {"id": 5, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.8646790981292725, "cells": [{"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "1"}, {"label": "picture", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 134.92005920410156, "t": 304.890625, "r": 475.66351318359375, "b": 510.21826171875, "coord_origin": "TOPLEFT"}, "confidence": 0.9803217053413391, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "body": [{"label": "section_header", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "section_header", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}, "confidence": 0.9627318382263184, "cells": [{"id": 0, "text": "Figures Example", "bbox": {"l": 133.76801, "t": 124.80877999999996, "r": 252.35513, "b": 137.54816000000005, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figures Example"}, {"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.48276, "b": 290.02588, "coord_origin": "TOPLEFT"}, "confidence": 0.9869933128356934, "cells": [{"id": 1, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eir-", "bbox": {"l": 133.76801, "t": 149.67193999999995, "r": 477.47971, "b": 158.51873999999998, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "mod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam volup-", "bbox": {"l": 133.76801, "t": 
161.62694999999997, "r": 477.4806500000001, "b": 170.47375, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "tua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd", "bbox": {"l": 133.76801, "t": 173.58196999999996, "r": 477.47571000000005, "b": 182.42877, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ip-", "bbox": {"l": 133.76801, "t": 185.53698999999995, "r": 477.47466999999995, "b": 194.38378999999998, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "sum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor", "bbox": {"l": 133.76801, "t": 197.49199999999996, "r": 477.47668, "b": 206.33880999999997, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero", "bbox": {"l": 133.76801, "t": 209.44701999999995, "r": 477.48068000000006, "b": 218.29381999999998, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "eos et accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 221.40301999999997, "r": 355.20874, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Stet clita kasd gubergren,", "bbox": {"l": 362.60098, "t": 221.40301999999997, "r": 477.48276, "b": 230.24982, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 233.35802999999999, "r": 477.47772, "b": 242.20483000000002, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 245.31304999999998, "r": 477.47971, "b": 254.15985, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et", "bbox": {"l": 133.76801, "t": 257.26806999999997, "r": 477.47473, "b": 266.11487, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 269.22308, "r": 477.47571000000005, "b": 278.06989, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 281.17911, "r": 351.48471, "b": 290.02588, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 4, "page_no": 0, "cluster": {"id": 4, "label": "caption", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}, "confidence": 0.9477447271347046, "cells": [{"id": 14, "text": "Figure 1: This is an example image.", "bbox": {"l": 226.89101, "t": 529.13495, "r": 384.3548, "b": 537.98174, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 1: This is an example image."}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 553.04495, "r": 477.48172000000005, "b": 669.48775, "coord_origin": "TOPLEFT"}, "confidence": 0.9862836599349976, "cells": [{"id": 15, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 553.04495, "r": 477.47786999999994, "b": 561.89174, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 564.99995, "r": 477.47860999999995, "b": 573.84674, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 576.95496, "r": 477.47571000000005, "b": 585.80174, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 588.90996, "r": 477.47559, "b": 597.75674, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 600.86595, "r": 477.48169000000007, "b": 609.7127399999999, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At", "bbox": {"l": 133.76801, "t": 612.82095, "r": 477.48062, "b": 621.66774, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 624.7759599999999, "r": 477.48172000000005, "b": 633.62274, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 636.73096, "r": 477.47772, "b": 645.57774, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 648.68596, "r": 477.47971, "b": 657.53275, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua.", "bbox": {"l": 133.76801, "t": 660.64096, "r": 399.57816, "b": 669.48775, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."}, {"label": "picture", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 134.92005920410156, "t": 304.890625, "r": 475.66351318359375, "b": 510.21826171875, "coord_origin": "TOPLEFT"}, "confidence": 0.9803217053413391, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "headers": [{"label": "page_footer", "id": 5, "page_no": 0, "cluster": {"id": 5, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.8646790981292725, "cells": [{"id": 25, "text": "1", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "1"}]}}, {"page_no": 1, "size": {"width": 612.0, "height": 792.0}, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}, {"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 127.85095000000013, "r": 477.48172000000005, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}, "confidence": 0.987092912197113, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 3, "label": "caption", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}, "confidence": 0.9494235515594482, "cells": [{"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 546.28195, "r": 477.48172000000005, "b": 674.67976, "coord_origin": "TOPLEFT"}, "confidence": 0.9874356985092163, "cells": [{"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 4, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.888852059841156, "cells": [{"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 2, "label": "picture", "bbox": {"l": 218.8155517578125, "t": 278.0154113769531, "r": 391.96246337890625, "b": 508.89410400390625, "coord_origin": "TOPLEFT"}, "confidence": 0.9837717413902283, "cells": [], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 1, "page_no": 1, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 127.85095000000013, "r": 477.48172000000005, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}, "confidence": 0.987092912197113, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 3, "page_no": 1, "cluster": {"id": 3, "label": "caption", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}, "confidence": 0.9494235515594482, "cells": [{"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 2: This is an example image."}, {"label": "text", "id": 0, "page_no": 1, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 546.28195, "r": 477.48172000000005, "b": 674.67976, "coord_origin": "TOPLEFT"}, "confidence": 0.9874356985092163, "cells": [{"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum."}, {"label": "page_footer", "id": 4, "page_no": 1, "cluster": {"id": 4, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.888852059841156, "cells": [{"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "2"}, {"label": "picture", "id": 2, "page_no": 1, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 218.8155517578125, "t": 278.0154113769531, "r": 391.96246337890625, "b": 508.89410400390625, "coord_origin": "TOPLEFT"}, "confidence": 0.9837717413902283, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "body": [{"label": "text", "id": 1, "page_no": 1, "cluster": {"id": 1, "label": "text", "bbox": {"l": 133.76801, "t": 127.85095000000013, "r": 477.48172000000005, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}, "confidence": 0.987092912197113, "cells": [{"id": 0, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 127.85095000000013, "r": 477.47786999999994, "b": 136.69775000000004, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 139.80597, "r": 477.47860999999995, "b": 148.65277000000003, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 151.76099, "r": 477.47571000000005, "b": 160.60779000000002, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem", "bbox": {"l": 133.76801, "t": 163.716, "r": 477.47559, "b": 172.56281, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 175.67102, "r": 477.48169000000007, "b": 184.51782000000003, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At", "bbox": {"l": 133.76801, "t": 187.62701000000004, "r": 477.48062, "b": 196.47382000000005, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 199.58203000000003, "r": 477.48172000000005, "b": 208.42882999999995, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 211.53705000000002, "r": 477.47772, "b": 220.38385000000005, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 223.49207, "r": 477.47971, "b": 232.33887000000004, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 235.44708000000003, "r": 477.47473, "b": 244.29387999999994, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea", "bbox": {"l": 133.76801, "t": 247.40210000000002, "r": 477.47571000000005, "b": 256.24890000000005, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "takimata sanctus est Lorem ipsum dolor sit amet.", "bbox": {"l": 133.76801, "t": 259.35808999999995, "r": 351.48471, "b": 268.20489999999995, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."}, {"label": "caption", "id": 3, "page_no": 1, "cluster": {"id": 3, "label": "caption", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}, "confidence": 0.9494235515594482, "cells": [{"id": 12, "text": "Figure 2: This is an example image.", "bbox": {"l": 226.89101, "t": 523.21097, "r": 384.3548, "b": 532.05774, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Figure 2: This is an example image."}, {"label": "text", "id": 0, "page_no": 1, "cluster": {"id": 0, "label": "text", "bbox": {"l": 133.76801, "t": 546.28195, "r": 477.48172000000005, "b": 674.67976, "coord_origin": "TOPLEFT"}, "confidence": 0.9874356985092163, "cells": [{"id": 13, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy", "bbox": {"l": 148.71201, "t": 546.28195, "r": 477.47786999999994, "b": 555.12874, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam", "bbox": {"l": 133.76801, "t": 558.23695, "r": 477.47860999999995, "b": 567.08374, "coord_origin": "TOPLEFT"}}, {"id": 15, "text": "voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita", "bbox": {"l": 133.76801, "t": 570.19196, "r": 477.47571000000005, "b": 579.03874, "coord_origin": "TOPLEFT"}}, {"id": 16, "text": "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem", "bbox": {"l": 133.76801, "t": 582.14696, "r": 477.47559, "b": 590.99374, "coord_origin": "TOPLEFT"}}, {"id": 17, "text": "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod", "bbox": {"l": 133.76801, "t": 594.10196, "r": 477.48169000000007, "b": 602.94875, "coord_origin": "TOPLEFT"}}, {"id": 18, "text": "tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At", "bbox": {"l": 133.76801, "t": 606.05696, "r": 477.48062, "b": 614.90375, "coord_origin": "TOPLEFT"}}, {"id": 19, "text": "vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,", "bbox": {"l": 133.76801, "t": 618.01295, "r": 477.48172000000005, "b": 626.85974, "coord_origin": "TOPLEFT"}}, {"id": 20, "text": "no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor", "bbox": {"l": 133.76801, "t": 629.96796, "r": 477.47772, "b": 638.81474, "coord_origin": "TOPLEFT"}}, {"id": 21, "text": "sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", "bbox": {"l": 133.76801, "t": 641.92296, "r": 477.47971, "b": 650.76974, "coord_origin": "TOPLEFT"}}, {"id": 22, "text": "ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et", "bbox": {"l": 133.76801, "t": 653.87796, "r": 477.47473, "b": 662.72475, "coord_origin": "TOPLEFT"}}, {"id": 23, "text": "accusam et justo duo dolores et ea rebum.", "bbox": {"l": 133.76801, "t": 665.83296, "r": 318.01736, "b": 674.67976, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum."}, {"label": "picture", "id": 2, "page_no": 1, "cluster": {"id": 2, "label": "picture", "bbox": {"l": 218.8155517578125, "t": 278.0154113769531, "r": 391.96246337890625, "b": 508.89410400390625, "coord_origin": "TOPLEFT"}, "confidence": 0.9837717413902283, "cells": [], "children": []}, "text": "", "annotations": [], "provenance": null, "predicted_class": null, "confidence": null}], "headers": [{"label": "page_footer", "id": 4, "page_no": 1, "cluster": {"id": 4, "label": "page_footer", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}, "confidence": 0.888852059841156, "cells": [{"id": 24, "text": "2", "bbox": {"l": 303.133, "t": 695.720963, "r": 308.11429, "b": 704.567757, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "2"}]}}] \ No newline at end of file diff --git a/tests/data/picture_classification.pdf b/tests/data/picture_classification.pdf new file mode 100644 index 00000000..230f74fd Binary files /dev/null and b/tests/data/picture_classification.pdf differ diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py new file mode 100644 index 00000000..0ad87e96 --- /dev/null +++ b/tests/test_document_picture_classifier.py @@ -0,0 +1,81 @@ +from pathlib import Path + +from docling_core.types.doc import PictureClassificationData + +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline + + +def get_converter(): + + pipeline_options = PdfPipelineOptions() + pipeline_options.generate_page_images = True + + pipeline_options.do_ocr = False + 
pipeline_options.do_table_structure = False + pipeline_options.do_code_enrichment = False + pipeline_options.do_formula_enrichment = False + pipeline_options.do_picture_classification = True + pipeline_options.generate_picture_images = True + pipeline_options.images_scale = 2 + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + backend=DoclingParseV2DocumentBackend, + pipeline_cls=StandardPdfPipeline, + pipeline_options=pipeline_options, + ) + } + ) + + return converter + + +def test_picture_classifier(): + pdf_path = Path("tests/data/picture_classification.pdf") + converter = get_converter() + + print(f"converting {pdf_path}") + + doc_result: ConversionResult = converter.convert(pdf_path) + + results = doc_result.document.pictures + + assert len(results) == 2 + + res = results[0] + assert len(res.annotations) == 1 + assert type(res.annotations[0]) == PictureClassificationData + classification_data = res.annotations[0] + assert classification_data.provenance == "DocumentPictureClassifier" + assert ( + len(classification_data.predicted_classes) == 16 + ), "Number of predicted classes is not equal to 16" + confidences = [pred.confidence for pred in classification_data.predicted_classes] + assert confidences == sorted( + confidences, reverse=True + ), "Predictions are not sorted in descending order of confidence" + assert ( + classification_data.predicted_classes[0].class_name == "bar_chart" + ), "The prediction is wrong for the bar chart image." 
+ + res = results[1] + assert len(res.annotations) == 1 + assert type(res.annotations[0]) == PictureClassificationData + classification_data = res.annotations[0] + assert classification_data.provenance == "DocumentPictureClassifier" + assert ( + len(classification_data.predicted_classes) == 16 + ), "Number of predicted classes is not equal to 16" + confidences = [pred.confidence for pred in classification_data.predicted_classes] + assert confidences == sorted( + confidences, reverse=True + ), "Predictions are not sorted in descending order of confidence" + assert ( + classification_data.predicted_classes[0].class_name == "map" + ), "The prediction is wrong for the map image."