Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(HTML): Export formulas with mathml #144

Merged
merged 11 commits into from
Jan 31, 2025
58 changes: 41 additions & 17 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from pathlib import Path
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
from urllib.parse import quote, unquote
from xml.etree.cElementTree import SubElement, tostring
from xml.sax.saxutils import unescape

import latex2mathml.converter
import pandas as pd
import yaml
from PIL import Image as PILImage
Expand Down Expand Up @@ -2282,6 +2285,7 @@ def save_as_html(
to_element: int = sys.maxsize,
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = False,
page_no: Optional[int] = None,
html_lang: str = "en",
html_head: str = _HTML_DEFAULT_HEAD,
Expand All @@ -2301,6 +2305,7 @@ def save_as_html(
to_element=to_element,
labels=labels,
image_mode=image_mode,
formula_to_mathml=formula_to_mathml,
page_no=page_no,
html_lang=html_lang,
html_head=html_head,
Expand Down Expand Up @@ -2347,6 +2352,7 @@ def export_to_html( # noqa: C901
to_element: int = sys.maxsize,
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = False,
page_no: Optional[int] = None,
html_lang: str = "en",
html_head: str = _HTML_DEFAULT_HEAD,
Expand Down Expand Up @@ -2381,9 +2387,13 @@ def close_lists(

in_ordered_list: List[bool] = [] # False

def _sanitize_text(text: str, do_escape_html=True) -> str:
def _prepare_text(
text: str, do_escape_html=True, do_replace_newline=True
) -> str:
if do_escape_html:
text = html.escape(text, quote=False)
if do_replace_newline:
text = text.replace("\n", "<br>")
return text

for ix, (item, curr_level) in enumerate(
Expand Down Expand Up @@ -2436,7 +2446,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{_sanitize_text(item.text)}</h1>"
text = f"<h1>{_prepare_text(item.text)}</h1>"
html_texts.append(text.strip())

elif isinstance(item, SectionHeaderItem):
Expand All @@ -2445,7 +2455,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

text = (
f"<h{(section_level)}>"
f"{_sanitize_text(item.text)}</h{(section_level)}>"
f"{_prepare_text(item.text)}</h{(section_level)}>"
)
html_texts.append(text.strip())

Expand All @@ -2462,36 +2472,51 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
section_level = 6

text = (
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
f"<h{section_level}>{_prepare_text(item.text)}</h{section_level}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:

text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
math_formula = _prepare_text(
item.text, do_escape_html=False, do_replace_newline=False
)
if formula_to_mathml:
# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
annotation = SubElement(
mathml_element, "annotation", dict(encoding="TeX")
)
annotation.text = math_formula
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"<div>{mathml}</div>"
else:
text = f"<pre>{math_formula}</pre>"
html_texts.append(text)

elif isinstance(item, ListItem):

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_text(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_text(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = (
"<pre><code>"
f"{_sanitize_text(item.text, do_escape_html=False)}"
"</code></pre>"
elif isinstance(item, CodeItem):
code_text = _prepare_text(
item.text, do_escape_html=False, do_replace_newline=False
)
text = f"<pre><code>{code_text}</code></pre>"
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in labels:
elif isinstance(item, TextItem):

text = f"<p>{_sanitize_text(item.text)}</p>"
text = f"<p>{_prepare_text(item.text)}</p>"
html_texts.append(text.strip())
elif isinstance(item, TableItem):

Expand All @@ -2513,8 +2538,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

lines = []
lines.extend(head_lines)
for i, line in enumerate(html_texts):
lines.append(line.replace("\n", "<br>"))
lines.extend(html_texts)

delim = "\n"
html_text = (delim.join(lines)).strip()
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
transformers = { version = "^4.34.0", optional = true }
semchunk = { version = "^2.2.0", optional = true }
typer = "^0.12.5"
latex2mathml = "^3.77.0"

[tool.poetry.extras]
chunking = ["transformers", "semchunk"]
Expand Down
5 changes: 5 additions & 0 deletions test/data/docling_document/export/formula_mathml.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<!DOCTYPE html>
<html lang="en">

<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mfrac><mrow><mn>1</mn></mrow><mrow><mi>x</mi></mrow></mfrac></mrow><annotation encoding="TeX">\frac{1}{x}</annotation></math></div>
</html>
14 changes: 14 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,20 @@ def test_version_doc():
assert doc.version == CURRENT_VERSION


def test_formula_mathml():
    """Check the HTML export of a single formula item against the stored file.

    The document holds one FORMULA text item and is exported with
    ``formula_to_mathml=True`` and an empty ``html_head``; the result must
    equal the contents of the ground-truth HTML file.
    """
    document = DoclingDocument(name="Dummy")
    document.add_text(label=DocItemLabel.FORMULA, text="\\frac{1}{x}")

    exported_html = document.export_to_html(formula_to_mathml=True, html_head="")

    expected_path = Path("test/data/docling_document/export/formula_mathml.html")
    expected_html = expected_path.read_text(encoding="utf8")

    assert exported_html == expected_html


def test_docitem_get_image():
# Prepare the document
doc = DoclingDocument(name="Dummy")
Expand Down