From 3302e783a6c44c2e34ab55d8211210d02e006099 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 11:22:53 +0100 Subject: [PATCH 01/11] remove un-needed logic the labels allowlist is checked before Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 87815319..db5f2ed4 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2481,7 +2481,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: text = f"
  • {_sanitize_text(item.text)}
  • " html_texts.append(text) - elif isinstance(item, CodeItem) and item.label in labels: + elif isinstance(item, CodeItem): text = ( "
    "
                         f"{_sanitize_text(item.text, do_escape_html=False)}"
    @@ -2489,7 +2489,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
                     )
                     html_texts.append(text.strip())
     
    -            elif isinstance(item, TextItem) and item.label in labels:
    +            elif isinstance(item, TextItem):
     
                     text = f"

    {_sanitize_text(item.text)}

    " html_texts.append(text.strip()) From de8f4aadd5b4922535b1fae2c0b9520746a406cd Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 11:24:36 +0100 Subject: [PATCH 02/11] textitem cannot have label code Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index db5f2ed4..2c7962bd 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2466,11 +2466,6 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: ) html_texts.append(text.strip()) - elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]: - - text = f"
    {_sanitize_text(item.text, do_escape_html=False)}
    " - html_texts.append(text) - elif isinstance(item, ListItem): text = f"
  • {_sanitize_text(item.text)}
  • " From d42f37565309fc6f7a56397c00e77ddd89a26ded Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 13:24:13 +0100 Subject: [PATCH 03/11] display formulas with mathml in exported html Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 25 +++++++++++++++++++ poetry.lock | 13 +++++++++- pyproject.toml | 1 + .../export/formula_mathml.html | 5 ++++ test/test_docling_doc.py | 14 +++++++++++ 5 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 test/data/docling_document/export/formula_mathml.html diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 2c7962bd..94f37e32 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -16,7 +16,10 @@ from pathlib import Path from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union from urllib.parse import quote, unquote +from xml.etree.cElementTree import SubElement, tostring +from xml.sax.saxutils import unescape +import latex2mathml.converter import pandas as pd import yaml from PIL import Image as PILImage @@ -2347,6 +2350,7 @@ def export_to_html( # noqa: C901 to_element: int = sys.maxsize, labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS, image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER, + formula_to_mathml: bool = False, page_no: Optional[int] = None, html_lang: str = "en", html_head: str = _HTML_DEFAULT_HEAD, @@ -2466,6 +2470,27 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: ) html_texts.append(text.strip()) + elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]: + + if formula_to_mathml: + # Building a math equation in MathML format + # ref https://www.w3.org/TR/wai-aria-1.1/#math + math_formula = _sanitize_text(item.text, do_escape_html=False) + mathml_element = latex2mathml.converter.convert_to_element( + math_formula, display="block" + ) + annotation = SubElement( + mathml_element, "annotation", dict(encoding="TeX") + ) + annotation.text = math_formula + mathml = unescape(tostring(mathml_element, encoding="unicode")) + text = f"
    {mathml}
    " + else: + text = ( + f"
    {_sanitize_text(item.text, do_escape_html=False)}
    " + ) + html_texts.append(text) + elif isinstance(item, ListItem): text = f"
  • {_sanitize_text(item.text)}
  • " diff --git a/poetry.lock b/poetry.lock index c039dbe1..79535c36 100644 --- a/poetry.lock +++ b/poetry.lock @@ -862,6 +862,17 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["pyfakefs", "pytest (>=6,!=8.1.*)"] type = ["pygobject-stubs", "pytest-mypy", "shtab", "types-pywin32"] +[[package]] +name = "latex2mathml" +version = "3.77.0" +description = "Pure Python library for LaTeX to MathML conversion" +optional = false +python-versions = ">=3.8.1,<4.0.0" +files = [ + {file = "latex2mathml-3.77.0-py3-none-any.whl", hash = "sha256:5531e18a2a9eae7c24e257118b6a444cbba253cd27ff3e81f1bd6c41e88e786e"}, + {file = "latex2mathml-3.77.0.tar.gz", hash = "sha256:e2f501d1878f2e489c3f6f12786bef74c62f712d2770f7f3c837eb20a55d0a1e"}, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -2682,4 +2693,4 @@ chunking = ["semchunk", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "0475894c9c8ec390cda2f68e75edbf3be6601224090e8a6d826bfc4aa4f36aeb" +content-hash = "10d96addf225f2632b23451d92cb0641f6724fd9bcd59939b45f0ad86351fb20" diff --git a/pyproject.toml b/pyproject.toml index bc11419d..3b2f0aa1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ typing-extensions = "^4.12.2" transformers = { version = "^4.34.0", optional = true } semchunk = { version = "^2.2.0", optional = true } typer = "^0.12.5" +latex2mathml = "^3.77.0" [tool.poetry.extras] chunking = ["transformers", "semchunk"] diff --git a/test/data/docling_document/export/formula_mathml.html b/test/data/docling_document/export/formula_mathml.html new file mode 100644 index 00000000..58f3435f --- /dev/null +++ b/test/data/docling_document/export/formula_mathml.html @@ -0,0 +1,5 @@ + + + +
    1x\frac{1}{x}
    + \ No newline at end of file diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index c29ae59b..e7bc1412 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -661,6 +661,20 @@ def test_version_doc(): assert doc.version == CURRENT_VERSION +def test_formula_mathml(): + doc = DoclingDocument(name="Dummy") + equation = "\\frac{1}{x}" + doc.add_text(label=DocItemLabel.FORMULA, text=equation) + + doc_html = doc.export_to_html(formula_to_mathml=True, html_head="") + + gt_html = Path("test/data/docling_document/export/formula_mathml.html").read_text( + encoding="utf8" + ) + + assert doc_html == gt_html + + def test_docitem_get_image(): # Prepare the document doc = DoclingDocument(name="Dummy") From a59501e95d4eca82caa2ebabf307c9c82ff238c4 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 13:25:38 +0100 Subject: [PATCH 04/11] expose argument in save_as_html Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 94f37e32..83df77bf 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2285,6 +2285,7 @@ def save_as_html( to_element: int = sys.maxsize, labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS, image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER, + formula_to_mathml: bool = False, page_no: Optional[int] = None, html_lang: str = "en", html_head: str = _HTML_DEFAULT_HEAD, @@ -2304,6 +2305,7 @@ def save_as_html( to_element=to_element, labels=labels, image_mode=image_mode, + formula_to_mathml=formula_to_mathml, page_no=page_no, html_lang=html_lang, html_head=html_head, From 16bc35f76918e2216330b742cc59b92402c414e1 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 13:30:28 +0100 Subject: [PATCH 05/11] rename sanitize in prepare and add \n Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 35 ++++++++++++++++++------------ 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 83df77bf..ed7c00c4 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2387,9 +2387,13 @@ def close_lists( in_ordered_list: List[bool] = [] # False - def _sanitize_text(text: str, do_escape_html=True) -> str: + def _prepare_text( + text: str, do_escape_html=True, do_replace_newline=True + ) -> str: if do_escape_html: text = html.escape(text, quote=False) + if do_replace_newline: + text = text.replace("\n", "
    ") return text for ix, (item, curr_level) in enumerate( @@ -2442,7 +2446,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: - text = f"

    {_sanitize_text(item.text)}

    " + text = f"

    {_prepare_text(item.text)}

    " html_texts.append(text.strip()) elif isinstance(item, SectionHeaderItem): @@ -2451,7 +2455,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: text = ( f"" - f"{_sanitize_text(item.text)}" + f"{_prepare_text(item.text)}" ) html_texts.append(text.strip()) @@ -2468,16 +2472,18 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: section_level = 6 text = ( - f"{_sanitize_text(item.text)}" + f"{_prepare_text(item.text)}" ) html_texts.append(text.strip()) elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]: + math_formula = _prepare_text( + item.text, do_escape_html=False, do_replace_newline=False + ) if formula_to_mathml: # Building a math equation in MathML format # ref https://www.w3.org/TR/wai-aria-1.1/#math - math_formula = _sanitize_text(item.text, do_escape_html=False) mathml_element = latex2mathml.converter.convert_to_element( math_formula, display="block" ) @@ -2488,32 +2494,34 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: mathml = unescape(tostring(mathml_element, encoding="unicode")) text = f"
    {mathml}
    " else: - text = ( - f"
    {_sanitize_text(item.text, do_escape_html=False)}
    " - ) + text = f"
    {math_formula}
    " html_texts.append(text) elif isinstance(item, ListItem): - text = f"
  • {_sanitize_text(item.text)}
  • " + text = f"
  • {_prepare_text(item.text)}
  • " html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]: - text = f"
  • {_sanitize_text(item.text)}
  • " + text = f"
  • {_prepare_text(item.text)}
  • " html_texts.append(text) elif isinstance(item, CodeItem): text = ( "
    "
    -                    f"{_sanitize_text(item.text, do_escape_html=False)}"
    +                    f"{_prepare_text(
    +                        item.text,
    +                        do_escape_html=False,
    +                        do_replace_newline=False
    +                    )}"
                         "
    " ) html_texts.append(text.strip()) elif isinstance(item, TextItem): - text = f"

    {_sanitize_text(item.text)}

    " + text = f"

    {_prepare_text(item.text)}

    " html_texts.append(text.strip()) elif isinstance(item, TableItem): @@ -2535,8 +2543,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: lines = [] lines.extend(head_lines) - for i, line in enumerate(html_texts): - lines.append(line.replace("\n", "
    ")) + lines.extend(html_texts) delim = "\n" html_text = (delim.join(lines)).strip() From dd70bc8c477dff650d3620fb223e3ba400cdaabf Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 13:35:44 +0100 Subject: [PATCH 06/11] fix mypy parsing Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index ed7c00c4..bd5fac2f 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2508,15 +2508,10 @@ def _prepare_text( html_texts.append(text) elif isinstance(item, CodeItem): - text = ( - "
    "
    -                    f"{_prepare_text(
    -                        item.text,
    -                        do_escape_html=False,
    -                        do_replace_newline=False
    -                    )}"
    -                    "
    " + code_text = _prepare_text( + item.text, do_escape_html=False, do_replace_newline=False ) + text = f"
    {code_text}
    " html_texts.append(text.strip()) elif isinstance(item, TextItem): From 01383951215f358cb9cb71064137c91e6d3e364e Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 14:08:27 +0100 Subject: [PATCH 07/11] remove unused/impossible elif Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 35 ++++++++---------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index bd5fac2f..25c4a3fd 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2387,7 +2387,7 @@ def close_lists( in_ordered_list: List[bool] = [] # False - def _prepare_text( + def _prepare_tag_content( text: str, do_escape_html=True, do_replace_newline=True ) -> str: if do_escape_html: @@ -2446,39 +2446,22 @@ def _prepare_text( elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: - text = f"

    {_prepare_text(item.text)}

    " + text = f"

    {_prepare_tag_content(item.text)}

    " html_texts.append(text.strip()) elif isinstance(item, SectionHeaderItem): - section_level: int = item.level + 1 + section_level: int = min(item.level + 1, 6) text = ( f"" - f"{_prepare_text(item.text)}" - ) - html_texts.append(text.strip()) - - elif isinstance(item, TextItem) and item.label in [ - DocItemLabel.SECTION_HEADER - ]: - - section_level = curr_level - - if section_level <= 1: - section_level = 2 - - if section_level >= 6: - section_level = 6 - - text = ( - f"{_prepare_text(item.text)}" + f"{_prepare_tag_content(item.text)}" ) html_texts.append(text.strip()) elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]: - math_formula = _prepare_text( + math_formula = _prepare_tag_content( item.text, do_escape_html=False, do_replace_newline=False ) if formula_to_mathml: @@ -2499,16 +2482,16 @@ def _prepare_text( elif isinstance(item, ListItem): - text = f"
  • {_prepare_text(item.text)}
  • " + text = f"
  • {_prepare_tag_content(item.text)}
  • " html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]: - text = f"
  • {_prepare_text(item.text)}
  • " + text = f"
  • {_prepare_tag_content(item.text)}
  • " html_texts.append(text) elif isinstance(item, CodeItem): - code_text = _prepare_text( + code_text = _prepare_tag_content( item.text, do_escape_html=False, do_replace_newline=False ) text = f"
    {code_text}
    " @@ -2516,7 +2499,7 @@ def _prepare_text( elif isinstance(item, TextItem): - text = f"

    {_prepare_text(item.text)}

    " + text = f"

    {_prepare_tag_content(item.text)}

    " html_texts.append(text.strip()) elif isinstance(item, TableItem): From 9dbbfa8fb18f5119a6f387446c942b335cd39287 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 14:14:22 +0100 Subject: [PATCH 08/11] remove strip() Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 25c4a3fd..299d786f 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2426,7 +2426,7 @@ def _prepare_tag_content( ]: text = "
      " - html_texts.append(text.strip()) + html_texts.append(text) # Increment list nesting level when entering a new list in_ordered_list.append(True) @@ -2436,7 +2436,7 @@ def _prepare_tag_content( ]: text = "
        " - html_texts.append(text.strip()) + html_texts.append(text) # Increment list nesting level when entering a new list in_ordered_list.append(False) @@ -2447,7 +2447,7 @@ def _prepare_tag_content( elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: text = f"

        {_prepare_tag_content(item.text)}

        " - html_texts.append(text.strip()) + html_texts.append(text) elif isinstance(item, SectionHeaderItem): @@ -2457,7 +2457,7 @@ def _prepare_tag_content( f"" f"{_prepare_tag_content(item.text)}" ) - html_texts.append(text.strip()) + html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]: @@ -2495,12 +2495,12 @@ def _prepare_tag_content( item.text, do_escape_html=False, do_replace_newline=False ) text = f"
        {code_text}
        " - html_texts.append(text.strip()) + html_texts.append(text) elif isinstance(item, TextItem): text = f"

        {_prepare_tag_content(item.text)}

        " - html_texts.append(text.strip()) + html_texts.append(text) elif isinstance(item, TableItem): text = item.export_to_html(doc=self, add_caption=True) From eb0cac44f6725c6d3d025319bae6271d78132853 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 14:17:05 +0100 Subject: [PATCH 09/11] add display none for latex annotation Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 3 +++ test/data/doc/2206.01062.yaml.html | 3 +++ test/data/doc/bad_doc.yaml.html | 3 +++ test/data/doc/constructed_doc.embedded.html.gt | 3 +++ test/data/doc/constructed_doc.placeholder.html.gt | 3 +++ test/data/doc/constructed_doc.referenced.html.gt | 3 +++ test/data/doc/constructed_document.yaml.html | 3 +++ test/data/doc/dummy_doc.yaml.html | 3 +++ 8 files changed, 24 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 299d786f..eb56e7cc 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1390,6 +1390,9 @@ class DoclingDocument(BaseModel): table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + } """ diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html index ba2add20..ea69871b 100644 --- a/test/data/doc/2206.01062.yaml.html +++ b/test/data/doc/2206.01062.yaml.html @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

        diff --git a/test/data/doc/bad_doc.yaml.html b/test/data/doc/bad_doc.yaml.html index 12cf61d0..c6c20f87 100644 --- a/test/data/doc/bad_doc.yaml.html +++ b/test/data/doc/bad_doc.yaml.html @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        This is the title

        diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt index 12ef29ee..ec63d1a2 100644 --- a/test/data/doc/constructed_doc.embedded.html.gt +++ b/test/data/doc/constructed_doc.embedded.html.gt @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        Title of the Document

        diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt index e63df310..f14a80e0 100644 --- a/test/data/doc/constructed_doc.placeholder.html.gt +++ b/test/data/doc/constructed_doc.placeholder.html.gt @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        Title of the Document

        diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt index da449dfb..4678f9cb 100644 --- a/test/data/doc/constructed_doc.referenced.html.gt +++ b/test/data/doc/constructed_doc.referenced.html.gt @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        Title of the Document

        diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html index e63df310..f14a80e0 100644 --- a/test/data/doc/constructed_document.yaml.html +++ b/test/data/doc/constructed_document.yaml.html @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        Title of the Document

        diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html index 7de2efc7..46d5c090 100644 --- a/test/data/doc/dummy_doc.yaml.html +++ b/test/data/doc/dummy_doc.yaml.html @@ -53,6 +53,9 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + }

        DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

        From 25c681328268af038e6246b533c5fba16fdd4c3e Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 15:24:27 +0100 Subject: [PATCH 10/11] make mathml the default Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index eb56e7cc..a3616a4a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2288,7 +2288,7 @@ def save_as_html( to_element: int = sys.maxsize, labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS, image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER, - formula_to_mathml: bool = False, + formula_to_mathml: bool = True, page_no: Optional[int] = None, html_lang: str = "en", html_head: str = _HTML_DEFAULT_HEAD, @@ -2355,7 +2355,7 @@ def export_to_html( # noqa: C901 to_element: int = sys.maxsize, labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS, image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER, - formula_to_mathml: bool = False, + formula_to_mathml: bool = True, page_no: Optional[int] = None, html_lang: str = "en", html_head: str = _HTML_DEFAULT_HEAD, @@ -2479,7 +2479,13 @@ def _prepare_tag_content( annotation.text = math_formula mathml = unescape(tostring(mathml_element, encoding="unicode")) text = f"
        {mathml}
        " - else: + + elif ( + item.text == "" + and item.orig != "" + and image_mode == ImageRefMode.EMBEDDED + ): + text = f"
        {math_formula}
        " html_texts.append(text) From 4af63cdc4ff100b136b06cba327294411765e4f3 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 15:26:11 +0100 Subject: [PATCH 11/11] revert wrong commit Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index a3616a4a..0f463001 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2480,12 +2480,7 @@ def _prepare_tag_content( mathml = unescape(tostring(mathml_element, encoding="unicode")) text = f"
        {mathml}
        " - elif ( - item.text == "" - and item.orig != "" - and image_mode == ImageRefMode.EMBEDDED - ): - + else: text = f"
        {math_formula}
        " html_texts.append(text)