Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add html escape in md export and fix formula escapes #143

Merged
merged 2 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 83 additions & 59 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import copy
import hashlib
import html
import json
import mimetypes
import os
Expand Down Expand Up @@ -1045,7 +1046,7 @@ def export_to_html(

text = ""
if doc is not None and add_caption and len(self.captions):
text = self.caption_text(doc)
text = html.escape(self.caption_text(doc))

if len(self.data.table_cells) == 0:
return ""
Expand All @@ -1071,7 +1072,7 @@ def export_to_html(
if colstart != j:
continue

content = cell.text.strip()
content = html.escape(cell.text.strip())
celltag = "td"
if cell.column_header:
celltag = "th"
Expand Down Expand Up @@ -2082,6 +2083,46 @@ def export_to_markdown( # noqa: C901
previous_level = 0 # Track the previous item's level
in_list = False # Track if we're currently processing list items

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def _escape_underscores(text):
"""Escape underscores but leave them intact in the URL.."""
# Firstly, identify all the URL patterns.
url_pattern = r"!\[.*?\]\((.*?)\)"
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
combined_pattern = f"({url_pattern})|({latex_pattern})"

parts = []
last_end = 0

for match in re.finditer(combined_pattern, text):
# Text to add before the URL (needs to be escaped)
before_url = text[last_end : match.start()]
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

# Add the full URL part (do not escape)
parts.append(match.group(0))
last_end = match.end()

# Add the final part of the text (which needs to be escaped)
if last_end < len(text):
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))

return "".join(parts)

def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
if do_escape_underscores and escaping_underscores:
text = _escape_underscores(text)
if do_escape_html:
text = html.escape(text, quote=False)
mdtexts.append(text)

for ix, (item, level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
):
Expand Down Expand Up @@ -2130,7 +2171,7 @@ def export_to_markdown( # noqa: C901
in_list = False
marker = "" if strict_text else "#"
text = f"{marker} {item.text}"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif (
isinstance(item, TextItem)
Expand All @@ -2143,12 +2184,12 @@ def export_to_markdown( # noqa: C901
if len(marker) < 2:
marker = "##"
text = f"{marker} {item.text}\n"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif isinstance(item, CodeItem) and item.label in labels:
in_list = False
text = f"```\n{item.text}\n```\n"
mdtexts.append(text)
_append_text(text, do_escape_underscores=False, do_escape_html=False)

elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
in_list = True
Expand All @@ -2165,85 +2206,54 @@ def export_to_markdown( # noqa: C901
marker = "-" # Markdown needs only dash as item marker.

text = f"{list_indent}{marker} {item.text}"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
in_list = False
mdtexts.append(f"$${item.text}$$\n")
_append_text(
f"$${item.text}$$\n",
do_escape_underscores=False,
do_escape_html=False,
)

elif isinstance(item, TextItem) and item.label in labels:
in_list = False
if len(item.text) and text_width > 0:
text = item.text
wrapped_text = textwrap.fill(text, width=text_width)
mdtexts.append(wrapped_text + "\n")
_append_text(wrapped_text + "\n")
elif len(item.text):
text = f"{item.text}\n"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TableItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))
md_table = item.export_to_markdown()
mdtexts.append("\n" + md_table + "\n")
_append_text("\n" + md_table + "\n")

elif isinstance(item, PictureItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))

line = item.export_to_markdown(
doc=self,
image_placeholder=image_placeholder,
image_mode=image_mode,
)

mdtexts.append(line)
_append_text(line, do_escape_html=False, do_escape_underscores=False)

elif isinstance(item, DocItem) and item.label in labels:
in_list = False
text = "<missing-text>"
mdtexts.append(text)
text = "<!-- missing-text -->"
_append_text(text, do_escape_html=False, do_escape_underscores=False)

mdtext = (delim.join(mdtexts)).strip()
mdtext = re.sub(
r"\n\n\n+", "\n\n", mdtext
) # remove cases of double or more empty lines.

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def escape_underscores(text):
"""Escape underscores but leave them intact in the URL.."""
# Firstly, identify all the URL patterns.
url_pattern = r"!\[.*?\]\((.*?)\)"
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
combined_pattern = f"({url_pattern})|({latex_pattern})"

parts = []
last_end = 0

for match in re.finditer(combined_pattern, text):
# Text to add before the URL (needs to be escaped)
before_url = text[last_end : match.start()]
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

# Add the full URL part (do not escape)
parts.append(match.group(0))
last_end = match.end()

# Add the final part of the text (which needs to be escaped)
if last_end < len(text):
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))

return "".join(parts)

if escaping_underscores:
mdtext = escape_underscores(mdtext)

return mdtext

def export_to_text( # noqa: C901
Expand Down Expand Up @@ -2371,6 +2381,11 @@ def close_lists(

in_ordered_list: List[bool] = [] # False

def _sanitize_text(text: str, do_escape_html=True) -> str:
if do_escape_html:
text = html.escape(text, quote=False)
return text

for ix, (item, curr_level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
):
Expand Down Expand Up @@ -2421,14 +2436,17 @@ def close_lists(

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{item.text}</h1>"
text = f"<h1>{_sanitize_text(item.text)}</h1>"
html_texts.append(text.strip())

elif isinstance(item, SectionHeaderItem):

section_level: int = item.level + 1

text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
text = (
f"<h{(section_level)}>"
f"{_sanitize_text(item.text)}</h{(section_level)}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [
Expand All @@ -2443,31 +2461,37 @@ def close_lists(
if section_level >= 6:
section_level = 6

text = f"<h{section_level}>{item.text}</h{section_level}>"
text = (
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:

text = f"<pre>{item.text}</pre>"
text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
html_texts.append(text)

elif isinstance(item, ListItem):

text = f"<li>{item.text}</li>"
text = f"<li>{_sanitize_text(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{item.text}</li>"
text = f"<li>{_sanitize_text(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = f"<pre><code>{item.text}</code></pre>"
text = (
"<pre><code>"
f"{_sanitize_text(item.text, do_escape_html=False)}"
"</code></pre>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in labels:

text = f"<p>{item.text}</p>"
text = f"<p>{_sanitize_text(item.text)}</p>"
html_texts.append(text.strip())
elif isinstance(item, TableItem):

Expand Down
Loading