diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 0f463001..b0e1007f 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1393,6 +1393,17 @@ class DoclingDocument(BaseModel):
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
"""
@@ -2216,11 +2227,18 @@ def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
in_list = False
- _append_text(
- f"$${item.text}$$\n",
- do_escape_underscores=False,
- do_escape_html=False,
- )
+ if item.text != "":
+ _append_text(
+ f"$${item.text}$$\n",
+ do_escape_underscores=False,
+ do_escape_html=False,
+ )
+ elif item.orig != "":
+ _append_text(
+ "\n",
+ do_escape_underscores=False,
+ do_escape_html=False,
+ )
elif isinstance(item, TextItem) and item.label in labels:
in_list = False
@@ -2467,9 +2485,27 @@ def _prepare_tag_content(
math_formula = _prepare_tag_content(
item.text, do_escape_html=False, do_replace_newline=False
)
- if formula_to_mathml:
- # Building a math equation in MathML format
- # ref https://www.w3.org/TR/wai-aria-1.1/#math
+ text = ""
+
+ # If the formula is not processed correctly, use its image
+ if (
+ item.text == ""
+ and item.orig != ""
+ and image_mode == ImageRefMode.EMBEDDED
+ and len(item.prov) > 0
+ ):
+ item_image = item.get_image(doc=self)
+ if item_image is not None:
+ img_ref = ImageRef.from_pil(item_image, dpi=72)
+ text = (
+ ""
+ f''
+ ""
+ )
+
+ # Building a math equation in MathML format
+ # ref https://www.w3.org/TR/wai-aria-1.1/#math
+ elif formula_to_mathml:
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
@@ -2480,9 +2516,15 @@ def _prepare_tag_content(
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"
{mathml}
"
- else:
+ elif math_formula != "":
text = f"
{math_formula}
"
- html_texts.append(text)
+
+ if text != "":
+ html_texts.append(text)
+ else:
+ html_texts.append(
+ '
Formula not decoded
'
+ )
elif isinstance(item, ListItem):
diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html
index ea69871b..01f2456b 100644
--- a/test/data/doc/2206.01062.yaml.html
+++ b/test/data/doc/2206.01062.yaml.html
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis
diff --git a/test/data/doc/bad_doc.yaml.html b/test/data/doc/bad_doc.yaml.html
index c6c20f87..1b143924 100644
--- a/test/data/doc/bad_doc.yaml.html
+++ b/test/data/doc/bad_doc.yaml.html
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
This is the title
diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt
index ec63d1a2..c158d024 100644
--- a/test/data/doc/constructed_doc.embedded.html.gt
+++ b/test/data/doc/constructed_doc.embedded.html.gt
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
Title of the Document
diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt
index f14a80e0..e7ba7427 100644
--- a/test/data/doc/constructed_doc.placeholder.html.gt
+++ b/test/data/doc/constructed_doc.placeholder.html.gt
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
Title of the Document
diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt
index 4678f9cb..2098b6a5 100644
--- a/test/data/doc/constructed_doc.referenced.html.gt
+++ b/test/data/doc/constructed_doc.referenced.html.gt
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
Title of the Document
diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html
index f14a80e0..e7ba7427 100644
--- a/test/data/doc/constructed_document.yaml.html
+++ b/test/data/doc/constructed_document.yaml.html
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
Title of the Document
diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html
index 46d5c090..051eaf81 100644
--- a/test/data/doc/dummy_doc.yaml.html
+++ b/test/data/doc/dummy_doc.yaml.html
@@ -56,6 +56,17 @@
math annotation {
display: none;
}
+ .formula-not-decoded {
+ background: repeating-linear-gradient(
+ 45deg, /* Angle of the stripes */
+ LightGray, /* First color */
+ LightGray 10px, /* Length of the first color */
+ White 10px, /* Second color */
+ White 20px /* Length of the second color */
+ );
+ margin: 0;
+ text-align: center;
+ }
DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis