Skip to content

Commit

Permalink
fix: image fallback for malformed equations (#149)
Browse files Browse the repository at this point in the history
fix mathml for malformed equations

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Feb 3, 2025
1 parent 285438d commit eb9b4b3
Showing 1 changed file with 34 additions and 17 deletions.
51 changes: 34 additions & 17 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hashlib
import html
import json
import logging
import mimetypes
import os
import re
Expand All @@ -20,6 +21,7 @@
from xml.sax.saxutils import unescape

import latex2mathml.converter
import latex2mathml.exceptions
import pandas as pd
import yaml
from PIL import Image as PILImage
Expand All @@ -44,6 +46,8 @@
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path

_logger = logging.getLogger(__name__)

Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
CURRENT_VERSION: Final = "1.0.0"
Expand Down Expand Up @@ -2487,34 +2491,47 @@ def _prepare_tag_content(
)
text = ""

# If the formula is not processed correctly, use its image
if (
item.text == ""
and item.orig != ""
and image_mode == ImageRefMode.EMBEDDED
and len(item.prov) > 0
):
def _image_fallback(item: TextItem):
item_image = item.get_image(doc=self)
if item_image is not None:
img_ref = ImageRef.from_pil(item_image, dpi=72)
text = (
return (
"<figure>"
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
"</figure>"
)

# If the formula is not processed correctly, use its image
if (
item.text == ""
and item.orig != ""
and image_mode == ImageRefMode.EMBEDDED
and len(item.prov) > 0
):
text = _image_fallback(item)

# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
elif formula_to_mathml:
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
annotation = SubElement(
mathml_element, "annotation", dict(encoding="TeX")
)
annotation.text = math_formula
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"<div>{mathml}</div>"
try:
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
annotation = SubElement(
mathml_element, "annotation", dict(encoding="TeX")
)
annotation.text = math_formula
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"<div>{mathml}</div>"
except Exception as err:
_logger.warning(
"Malformed formula cannot be rendered. "
f"Error {err.__class__.__name__}, formula={math_formula}"
)
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
text = _image_fallback(item)
else:
text = f"<pre>{math_formula}</pre>"

elif math_formula != "":
text = f"<pre>{math_formula}</pre>"
Expand Down

0 comments on commit eb9b4b3

Please sign in to comment.