Skip to content

Commit

Permalink
feat(HTML): Fallback showing formulas as images (#146)
Browse files Browse the repository at this point in the history
* remove unneeded logic

the labels allowlist is checked before

Signed-off-by: Michele Dolfi <[email protected]>

* textitem cannot have label code

Signed-off-by: Michele Dolfi <[email protected]>

* display formulas with mathml in exported html

Signed-off-by: Michele Dolfi <[email protected]>

* expose argument in save_as_html

Signed-off-by: Michele Dolfi <[email protected]>

* rename sanitize to prepare and add \n

Signed-off-by: Michele Dolfi <[email protected]>

* fix mypy parsing

Signed-off-by: Michele Dolfi <[email protected]>

* remove unused/impossible elif

Signed-off-by: Michele Dolfi <[email protected]>

* remove strip()

Signed-off-by: Michele Dolfi <[email protected]>

* add display none for latex annotation

Signed-off-by: Michele Dolfi <[email protected]>

* fallback showing equations as image

Signed-off-by: Michele Dolfi <[email protected]>

* add html placeholder

Signed-off-by: Michele Dolfi <[email protected]>

* markdown placeholder

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Feb 3, 2025
1 parent ed36437 commit 23477f7
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 10 deletions.
62 changes: 52 additions & 10 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1393,6 +1393,17 @@ class DoclingDocument(BaseModel):
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>"""

Expand Down Expand Up @@ -2216,11 +2227,18 @@ def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
in_list = False
_append_text(
f"$${item.text}$$\n",
do_escape_underscores=False,
do_escape_html=False,
)
if item.text != "":
_append_text(
f"$${item.text}$$\n",
do_escape_underscores=False,
do_escape_html=False,
)
elif item.orig != "":
_append_text(
"<!-- formula-not-decoded -->\n",
do_escape_underscores=False,
do_escape_html=False,
)

elif isinstance(item, TextItem) and item.label in labels:
in_list = False
Expand Down Expand Up @@ -2467,9 +2485,27 @@ def _prepare_tag_content(
math_formula = _prepare_tag_content(
item.text, do_escape_html=False, do_replace_newline=False
)
if formula_to_mathml:
# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
text = ""

# If the formula is not processed correctly, use its image
if (
item.text == ""
and item.orig != ""
and image_mode == ImageRefMode.EMBEDDED
and len(item.prov) > 0
):
item_image = item.get_image(doc=self)
if item_image is not None:
img_ref = ImageRef.from_pil(item_image, dpi=72)
text = (
"<figure>"
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
"</figure>"
)

# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
elif formula_to_mathml:
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
Expand All @@ -2480,9 +2516,15 @@ def _prepare_tag_content(
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"<div>{mathml}</div>"

else:
elif math_formula != "":
text = f"<pre>{math_formula}</pre>"
html_texts.append(text)

if text != "":
html_texts.append(text)
else:
html_texts.append(
'<div class="formula-not-decoded">Formula not decoded</div>'
)

elif isinstance(item, ListItem):

Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/2206.01062.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>
Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/bad_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h1>This is the title</h1>
Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/constructed_doc.embedded.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/constructed_doc.placeholder.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/constructed_doc.referenced.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/constructed_document.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
11 changes: 11 additions & 0 deletions test/data/doc/dummy_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
Expand Down

0 comments on commit 23477f7

Please sign in to comment.