Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(HTML): Export formulas with mathml #144

Merged
merged 11 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 49 additions & 39 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from pathlib import Path
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
from urllib.parse import quote, unquote
from xml.etree.cElementTree import SubElement, tostring
from xml.sax.saxutils import unescape

import latex2mathml.converter
import pandas as pd
import yaml
from PIL import Image as PILImage
Expand Down Expand Up @@ -1387,6 +1390,9 @@ class DoclingDocument(BaseModel):
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>"""

Expand Down Expand Up @@ -2282,6 +2288,7 @@ def save_as_html(
to_element: int = sys.maxsize,
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = False,
page_no: Optional[int] = None,
html_lang: str = "en",
html_head: str = _HTML_DEFAULT_HEAD,
Expand All @@ -2301,6 +2308,7 @@ def save_as_html(
to_element=to_element,
labels=labels,
image_mode=image_mode,
formula_to_mathml=formula_to_mathml,
page_no=page_no,
html_lang=html_lang,
html_head=html_head,
Expand Down Expand Up @@ -2347,6 +2355,7 @@ def export_to_html( # noqa: C901
to_element: int = sys.maxsize,
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = False,
page_no: Optional[int] = None,
html_lang: str = "en",
html_head: str = _HTML_DEFAULT_HEAD,
Expand Down Expand Up @@ -2381,9 +2390,13 @@ def close_lists(

in_ordered_list: List[bool] = [] # False

def _sanitize_text(text: str, do_escape_html=True) -> str:
def _prepare_tag_content(
    text: str, do_escape_html=True, do_replace_newline=True
) -> str:
    """Return ``text`` prepared for placement between HTML tags.

    Args:
        text: Raw text content of a document item.
        do_escape_html: When true, escape ``&``, ``<`` and ``>``; quote
            characters are left untouched (``quote=False``).
        do_replace_newline: When true, turn every newline character into
            a ``<br>`` tag.

    Returns:
        The transformed string; ``text`` unchanged if both flags are false.
    """
    # Escaping runs first so the "<br>" inserted below is never escaped itself.
    content = html.escape(text, quote=False) if do_escape_html else text
    return content.replace("\n", "<br>") if do_replace_newline else content

for ix, (item, curr_level) in enumerate(
Expand Down Expand Up @@ -2416,7 +2429,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:

text = "<ol>"
html_texts.append(text.strip())
html_texts.append(text)

# Increment list nesting level when entering a new list
in_ordered_list.append(True)
Expand All @@ -2426,7 +2439,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:

text = "<ul>"
html_texts.append(text.strip())
html_texts.append(text)

# Increment list nesting level when entering a new list
in_ordered_list.append(False)
Expand All @@ -2436,63 +2449,61 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{_sanitize_text(item.text)}</h1>"
html_texts.append(text.strip())
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = item.level + 1
section_level: int = min(item.level + 1, 6)

text = (
f"<h{(section_level)}>"
f"{_sanitize_text(item.text)}</h{(section_level)}>"
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [
DocItemLabel.SECTION_HEADER
]:

section_level = curr_level

if section_level <= 1:
section_level = 2
html_texts.append(text)

if section_level >= 6:
section_level = 6
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:

text = (
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
math_formula = _prepare_tag_content(
item.text, do_escape_html=False, do_replace_newline=False
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:

text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
if formula_to_mathml:
# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
annotation = SubElement(
mathml_element, "annotation", dict(encoding="TeX")
)
annotation.text = math_formula
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"<div>{mathml}</div>"
else:
text = f"<pre>{math_formula}</pre>"
html_texts.append(text)

elif isinstance(item, ListItem):

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_tag_content(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_tag_content(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = (
"<pre><code>"
f"{_sanitize_text(item.text, do_escape_html=False)}"
"</code></pre>"
elif isinstance(item, CodeItem):
code_text = _prepare_tag_content(
item.text, do_escape_html=False, do_replace_newline=False
)
html_texts.append(text.strip())
text = f"<pre><code>{code_text}</code></pre>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in labels:
elif isinstance(item, TextItem):

text = f"<p>{_sanitize_text(item.text)}</p>"
html_texts.append(text.strip())
text = f"<p>{_prepare_tag_content(item.text)}</p>"
html_texts.append(text)
elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand All @@ -2513,8 +2524,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

lines = []
lines.extend(head_lines)
for i, line in enumerate(html_texts):
lines.append(line.replace("\n", "<br>"))
lines.extend(html_texts)

delim = "\n"
html_text = (delim.join(lines)).strip()
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
transformers = { version = "^4.34.0", optional = true }
semchunk = { version = "^2.2.0", optional = true }
typer = "^0.12.5"
latex2mathml = "^3.77.0"

[tool.poetry.extras]
chunking = ["transformers", "semchunk"]
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/2206.01062.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/bad_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>This is the title</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_doc.embedded.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_doc.placeholder.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_doc.referenced.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_document.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/dummy_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
Expand Down
5 changes: 5 additions & 0 deletions test/data/docling_document/export/formula_mathml.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<!DOCTYPE html>
<html lang="en">

<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mfrac><mrow><mn>1</mn></mrow><mrow><mi>x</mi></mrow></mfrac></mrow><annotation encoding="TeX">\frac{1}{x}</annotation></math></div>
</html>
14 changes: 14 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,20 @@ def test_version_doc():
assert doc.version == CURRENT_VERSION


def test_formula_mathml():
    """Check the HTML export of a formula item when MathML output is enabled.

    A document holding a single FORMULA text item is exported with
    ``formula_to_mathml=True`` and an empty ``html_head``; the result must
    equal the stored ground-truth HTML file exactly.
    """
    expected_path = Path("test/data/docling_document/export/formula_mathml.html")

    document = DoclingDocument(name="Dummy")
    document.add_text(label=DocItemLabel.FORMULA, text="\\frac{1}{x}")

    exported = document.export_to_html(formula_to_mathml=True, html_head="")

    assert exported == expected_path.read_text(encoding="utf8")


def test_docitem_get_image():
# Prepare the document
doc = DoclingDocument(name="Dummy")
Expand Down