Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Define LTR/RTL text direction in HTML export #152

Merged
merged 6 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 37 additions & 13 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path
from docling_core.types.doc.utils import (
get_html_tag_with_text_direction,
get_text_direction,
relative_path,
)

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -866,7 +870,9 @@ def export_to_html(

caption_text = ""
if len(text) > 0:
caption_text = f"<figcaption>{text}</figcaption>"
caption_text = get_html_tag_with_text_direction(
html_tag="figcaption", text=text
)

default_response = f"<figure>{caption_text}</figure>"

Expand Down Expand Up @@ -1090,15 +1096,28 @@ def export_to_html(
if colspan > 1:
opening_tag += f' colspan="{colspan}"'

text_dir = get_text_direction(content)
if text_dir == "rtl":
    # Interpolate the detected direction. The bare name `dir` is the Python
    # builtin function here, which would render as
    # dir="<built-in function dir>" in the cell's opening tag.
    opening_tag += f' dir="{text_dir}"'

body += f"<{opening_tag}>{content}</{celltag}>"
body += "</tr>"

# dir = get_text_direction(text)

if len(text) > 0 and len(body) > 0:
body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
caption_text = get_html_tag_with_text_direction(
html_tag="caption", text=text
)
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"

elif len(text) == 0 and len(body) > 0:
body = f"<table><tbody>{body}</tbody></table>"
elif len(text) > 0 and len(body) == 0:
body = f"<table><caption>{text}</caption></table>"
caption_text = get_html_tag_with_text_direction(
html_tag="caption", text=text
)
body = f"<table>{caption_text}</table>"
else:
body = "<table></table>"

Expand Down Expand Up @@ -2470,17 +2489,17 @@ def _prepare_tag_content(
continue

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
text_inner = _prepare_tag_content(item.text)
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)

text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = min(item.level + 1, 6)

text = (
f"<h{(section_level)}>"
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
# Pass the heading text through _prepare_tag_content, as the title, list-item
# and paragraph branches do, instead of handing the raw item.text to the helper.
text = get_html_tag_with_text_direction(
    html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

Expand Down Expand Up @@ -2544,13 +2563,15 @@ def _image_fallback(item: TextItem):
)

elif isinstance(item, ListItem):

text = f"<li>{_prepare_tag_content(item.text)}</li>"
text = get_html_tag_with_text_direction(
html_tag="li", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{_prepare_tag_content(item.text)}</li>"
text = get_html_tag_with_text_direction(
html_tag="li", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

elif isinstance(item, CodeItem):
Expand All @@ -2562,8 +2583,11 @@ def _image_fallback(item: TextItem):

elif isinstance(item, TextItem):

text = f"<p>{_prepare_tag_content(item.text)}</p>"
text = get_html_tag_with_text_direction(
html_tag="p", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand Down
27 changes: 27 additions & 0 deletions docling_core/types/doc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

"""Utils for document types."""

import unicodedata
from pathlib import Path


Expand Down Expand Up @@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:

# Combine and return the result
return Path(*up_segments, *down_segments)


def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Wrap ``text`` in an ``html_tag`` element, adding ``dir`` when needed.

    The direction is obtained from ``get_text_direction(text)``. When it is
    ``"ltr"`` the element is emitted without a ``dir`` attribute; for any
    other result the attribute carries that value.

    Args:
        html_tag: Tag name used for both the opening and closing tag.
        text: Element content, inserted as-is (no escaping is done here).

    Returns:
        The assembled HTML element as a string.
    """
    direction = get_text_direction(text)
    # Only a non-"ltr" result contributes an attribute to the opening tag.
    dir_attr = "" if direction == "ltr" else f' dir="{direction}"'
    return f"<{html_tag}{dir_attr}>{text}</{html_tag}>"


def get_text_direction(text: str) -> str:
    """Classify ``text`` as ``"rtl"`` or ``"ltr"`` from Unicode bidi classes.

    The text is reported as right-to-left when its first character has
    bidirectional class ``R`` or ``AL``, or when strictly more than half of
    all its characters (whitespace and punctuation included in the total)
    have one of those classes. Everything else is ``"ltr"``.

    Args:
        text: The string to inspect.

    Returns:
        ``"rtl"`` or ``"ltr"``; the empty string yields ``"ltr"``.
    """
    if not text:
        # Nothing to inspect: fall back to left-to-right.
        return "ltr"

    rtl_classes = ("R", "AL")

    def is_rtl(char: str) -> bool:
        return unicodedata.bidirectional(char) in rtl_classes

    # A leading right-to-left character decides the result on its own.
    if is_rtl(text[0]):
        return "rtl"

    rtl_total = 0
    for char in text:
        if is_rtl(char):
            rtl_total += 1

    # Same test as rtl_total > len(text) / 2, kept in integer arithmetic.
    if 2 * rtl_total > len(text):
        return "rtl"
    return "ltr"