diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 0f463001..b0e1007f 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1393,6 +1393,17 @@ class DoclingDocument(BaseModel): math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + } """ @@ -2216,11 +2227,18 @@ def _append_text(text: str, do_escape_html=True, do_escape_underscores=True): elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]: in_list = False - _append_text( - f"$${item.text}$$\n", - do_escape_underscores=False, - do_escape_html=False, - ) + if item.text != "": + _append_text( + f"$${item.text}$$\n", + do_escape_underscores=False, + do_escape_html=False, + ) + elif item.orig != "": + _append_text( + "\n", + do_escape_underscores=False, + do_escape_html=False, + ) elif isinstance(item, TextItem) and item.label in labels: in_list = False @@ -2467,9 +2485,27 @@ def _prepare_tag_content( math_formula = _prepare_tag_content( item.text, do_escape_html=False, do_replace_newline=False ) - if formula_to_mathml: - # Building a math equation in MathML format - # ref https://www.w3.org/TR/wai-aria-1.1/#math + text = "" + + # If the formula is not processed correctly, use its image + if ( + item.text == "" + and item.orig != "" + and image_mode == ImageRefMode.EMBEDDED + and len(item.prov) > 0 + ): + item_image = item.get_image(doc=self) + if item_image is not None: + img_ref = ImageRef.from_pil(item_image, dpi=72) + text = ( + "
" + f'{item.orig}' + "
" + ) + + # Building a math equation in MathML format + # ref https://www.w3.org/TR/wai-aria-1.1/#math + elif formula_to_mathml: mathml_element = latex2mathml.converter.convert_to_element( math_formula, display="block" ) @@ -2480,9 +2516,15 @@ def _prepare_tag_content( mathml = unescape(tostring(mathml_element, encoding="unicode")) text = f"
{mathml}
" - else: + elif math_formula != "": text = f"
{math_formula}
" - html_texts.append(text) + + if text != "": + html_texts.append(text) + else: + html_texts.append( + '
Formula not decoded
' + ) elif isinstance(item, ListItem): diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html index ea69871b..01f2456b 100644 --- a/test/data/doc/2206.01062.yaml.html +++ b/test/data/doc/2206.01062.yaml.html @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

diff --git a/test/data/doc/bad_doc.yaml.html b/test/data/doc/bad_doc.yaml.html index c6c20f87..1b143924 100644 --- a/test/data/doc/bad_doc.yaml.html +++ b/test/data/doc/bad_doc.yaml.html @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

This is the title

diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt index ec63d1a2..c158d024 100644 --- a/test/data/doc/constructed_doc.embedded.html.gt +++ b/test/data/doc/constructed_doc.embedded.html.gt @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

Title of the Document

diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt index f14a80e0..e7ba7427 100644 --- a/test/data/doc/constructed_doc.placeholder.html.gt +++ b/test/data/doc/constructed_doc.placeholder.html.gt @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

Title of the Document

diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt index 4678f9cb..2098b6a5 100644 --- a/test/data/doc/constructed_doc.referenced.html.gt +++ b/test/data/doc/constructed_doc.referenced.html.gt @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

Title of the Document

diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html index f14a80e0..e7ba7427 100644 --- a/test/data/doc/constructed_document.yaml.html +++ b/test/data/doc/constructed_document.yaml.html @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

Title of the Document

diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html index 46d5c090..051eaf81 100644 --- a/test/data/doc/dummy_doc.yaml.html +++ b/test/data/doc/dummy_doc.yaml.html @@ -56,6 +56,17 @@ math annotation { display: none; } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis