diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index b0e1007f..da72252b 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -5,6 +5,7 @@ import hashlib import html import json +import logging import mimetypes import os import re @@ -20,6 +21,7 @@ from xml.sax.saxutils import unescape import latex2mathml.converter +import latex2mathml.exceptions import pandas as pd import yaml from PIL import Image as PILImage @@ -44,6 +46,8 @@ from docling_core.types.doc.tokens import DocumentToken, TableToken from docling_core.types.doc.utils import relative_path +_logger = logging.getLogger(__name__) + Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))] LevelNumber = typing.Annotated[int, Field(ge=1, le=100)] CURRENT_VERSION: Final = "1.0.0" @@ -2487,34 +2491,47 @@ def _prepare_tag_content( ) text = "" - # If the formula is not processed correcty, use its image - if ( - item.text == "" - and item.orig != "" - and image_mode == ImageRefMode.EMBEDDED - and len(item.prov) > 0 - ): + def _image_fallback(item: TextItem): item_image = item.get_image(doc=self) if item_image is not None: img_ref = ImageRef.from_pil(item_image, dpi=72) - text = ( + return ( "
" f'{item.orig}' "
" ) + # If the formula is not processed correcty, use its image + if ( + item.text == "" + and item.orig != "" + and image_mode == ImageRefMode.EMBEDDED + and len(item.prov) > 0 + ): + text = _image_fallback(item) + # Building a math equation in MathML format # ref https://www.w3.org/TR/wai-aria-1.1/#math elif formula_to_mathml: - mathml_element = latex2mathml.converter.convert_to_element( - math_formula, display="block" - ) - annotation = SubElement( - mathml_element, "annotation", dict(encoding="TeX") - ) - annotation.text = math_formula - mathml = unescape(tostring(mathml_element, encoding="unicode")) - text = f"
{mathml}
" + try: + mathml_element = latex2mathml.converter.convert_to_element( + math_formula, display="block" + ) + annotation = SubElement( + mathml_element, "annotation", dict(encoding="TeX") + ) + annotation.text = math_formula + mathml = unescape(tostring(mathml_element, encoding="unicode")) + text = f"
{mathml}
" + except Exception as err: + _logger.warning( + "Malformed formula cannot be rendered. " + f"Error {err.__class__.__name__}, formula={math_formula}" + ) + if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0: + text = _image_fallback(item) + else: + text = f"
{math_formula}
" elif math_formula != "": text = f"
{math_formula}
"