# Provenance: content of patch 5e351e9d86398c7d36bde15231f87ba403a4a4c3
# ("added the reading-order model", Peter Staar, Mon, 27 Jan 2025 05:52:11 +0100,
# Signed-off-by: Peter Staar), which creates docling/models/ds_ro_model.py.
"""Wrapper that feeds assembled page elements to the rule-based reading-order predictor."""

from typing import Dict, List

# NOTE(review): the four names imported from docling / docling_core below were
# used but never imported in the original patch; the module paths follow the
# layout used by the other docling models -- confirm against the repository.
from docling_core.types.doc import DoclingDocument
from docling_ibm_models.reading_order.reading_order_rb import (
    PageElement,
    ReadingOrderPredictor,
)
from pydantic import BaseModel, ConfigDict, TypeAdapter

from docling.datamodel.document import ConversionResult
from docling.utils.profiling import ProfilingScope, TimeRecorder


class ReadingOrderRbOptions(BaseModel):
    """Options for ``ReadingOrderRbModel``; no fields are declared yet."""

    # An empty tuple disables pydantic's protected-namespace check, so future
    # fields may start with ``model_`` without triggering a warning.
    model_config = ConfigDict(protected_namespaces=())


class ReadingOrderRbModel:
    """Runs ``ReadingOrderPredictor`` over the assembled elements of a conversion."""

    def __init__(self, options: ReadingOrderRbOptions):
        """Store *options* and instantiate the reading-order predictor."""
        self.options = options
        self.model = ReadingOrderPredictor()

    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
        """Group the assembled elements by page and predict each page's order.

        Args:
            conv_res: Conversion result; ``conv_res.assembled.elements`` is
                iterated, and each element must expose ``page_no``, ``label``
                and ``cluster.bbox``.

        Returns:
            NOTE(review): nothing is returned yet even though the signature
            promises a ``DoclingDocument`` -- the per-page predictions are
            computed but not assembled into a document in this revision.

        Raises:
            KeyError: If an element refers to a page number that is not
                present in the page lookup.
        """
        with TimeRecorder(
            conv_res, "ReadingOrderRbModel", scope=ProfilingScope.DOCUMENT
        ):
            # The original referenced ``page_no_to_page`` without defining it.
            # NOTE(review): assumes ``conv_res.pages`` yields page objects with
            # ``page_no`` and ``size.height`` -- confirm against ConversionResult.
            page_no_to_page = {page.page_no: page for page in conv_res.pages}

            pred_elements: Dict[int, List[PageElement]] = {}

            for element in conv_res.assembled.elements:
                page_no = element.page_no
                page_height = page_no_to_page[page_no].size.height

                # Convert the cluster box using the page height.
                bbox = element.cluster.bbox.to_bottom_left_origin(
                    page_height=page_height
                )

                # Fixed: the original indexed with the undefined ``prov.page_no``
                # and numbered elements from the undefined ``true_elements``.
                page_elements = pred_elements.setdefault(page_no, [])
                page_elements.append(
                    PageElement(
                        page_no=page_no,
                        # Running index of the element within its page,
                        # evaluated before the append.
                        cid=len(page_elements),
                        pid=0,
                        label=element.label,
                        bbox=bbox,
                    )
                )

            for page_no, elements in pred_elements.items():
                # TODO(review): each iteration overwrites these results and
                # nothing consumes them yet; building the returned document
                # from them is still missing.
                sorted_elements, to_captions, to_footnotes = self.model.predict_page(
                    page_elements=elements
                )