Skip to content

Commit

Permalink
fix: Define LTR/RTL text direction in HTML export (#152)
Browse files Browse the repository at this point in the history
* fix: Detect RTL text and emit the corresponding HTML tags

Signed-off-by: Christoph Auer <[email protected]>

* proposal for cau/rtl-text

Signed-off-by: Peter Staar <[email protected]>

* fix: Form HTML tags with utility method

Signed-off-by: Christoph Auer <[email protected]>

* Update tests

Signed-off-by: Christoph Auer <[email protected]>

* Remove commented code

Signed-off-by: Christoph Auer <[email protected]>

* Add back escaping

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Peter Staar <[email protected]>
Co-authored-by: Peter Staar <[email protected]>
  • Loading branch information
cau-git and PeterStaar-IBM authored Feb 5, 2025
1 parent 327f902 commit 3cf31cb
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 13 deletions.
50 changes: 37 additions & 13 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path
from docling_core.types.doc.utils import (
get_html_tag_with_text_direction,
get_text_direction,
relative_path,
)

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -866,7 +870,9 @@ def export_to_html(

caption_text = ""
if len(text) > 0:
caption_text = f"<figcaption>{text}</figcaption>"
caption_text = get_html_tag_with_text_direction(
html_tag="figcaption", text=text
)

default_response = f"<figure>{caption_text}</figure>"

Expand Down Expand Up @@ -1090,15 +1096,28 @@ def export_to_html(
if colspan > 1:
opening_tag += f' colspan="{colspan}"'

text_dir = get_text_direction(content)
if text_dir == "rtl":
    # Interpolate the detected direction. The bare name `dir` is not bound
    # in the visible code and would resolve to the Python builtin,
    # producing an invalid attribute value.
    opening_tag += f' dir="{text_dir}"'

body += f"<{opening_tag}>{content}</{celltag}>"
body += "</tr>"

# dir = get_text_direction(text)

if len(text) > 0 and len(body) > 0:
body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
caption_text = get_html_tag_with_text_direction(
html_tag="caption", text=text
)
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"

elif len(text) == 0 and len(body) > 0:
body = f"<table><tbody>{body}</tbody></table>"
elif len(text) > 0 and len(body) == 0:
body = f"<table><caption>{text}</caption></table>"
caption_text = get_html_tag_with_text_direction(
html_tag="caption", text=text
)
body = f"<table>{caption_text}</table>"
else:
body = "<table></table>"

Expand Down Expand Up @@ -2470,17 +2489,17 @@ def _prepare_tag_content(
continue

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
text_inner = _prepare_tag_content(item.text)
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)

text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = min(item.level + 1, 6)

text = (
f"<h{(section_level)}>"
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
text = get_html_tag_with_text_direction(
html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

Expand Down Expand Up @@ -2544,13 +2563,15 @@ def _image_fallback(item: TextItem):
)

elif isinstance(item, ListItem):

text = f"<li>{_prepare_tag_content(item.text)}</li>"
text = get_html_tag_with_text_direction(
html_tag="li", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{_prepare_tag_content(item.text)}</li>"
text = get_html_tag_with_text_direction(
html_tag="li", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

elif isinstance(item, CodeItem):
Expand All @@ -2562,8 +2583,11 @@ def _image_fallback(item: TextItem):

elif isinstance(item, TextItem):

text = f"<p>{_prepare_tag_content(item.text)}</p>"
text = get_html_tag_with_text_direction(
html_tag="p", text=_prepare_tag_content(item.text)
)
html_texts.append(text)

elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand Down
27 changes: 27 additions & 0 deletions docling_core/types/doc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

"""Utils for document types."""

import unicodedata
from pathlib import Path


Expand Down Expand Up @@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:

# Combine and return the result
return Path(*up_segments, *down_segments)


def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Wrap *text* in an HTML element, adding ``dir`` when it is not LTR.

    The direction is obtained from ``get_text_direction(text)``. When it is
    ``"ltr"`` the element is emitted without a ``dir`` attribute; for any
    other value the attribute is set to that value. *text* is inserted
    verbatim: this function performs no escaping.
    """
    direction = get_text_direction(text)
    dir_attr = "" if direction == "ltr" else f' dir="{direction}"'
    return f"<{html_tag}{dir_attr}>{text}</{html_tag}>"


def get_text_direction(text: str) -> str:
    """Classify *text* as left-to-right (``"ltr"``) or right-to-left (``"rtl"``).

    Only characters with a strong bidirectional class are considered:
    ``L`` (left-to-right) and ``R`` / ``AL`` (right-to-left). Neutral and weak
    characters such as whitespace, digits and punctuation are ignored, so a
    leading bullet, number or quote no longer hides an RTL paragraph and
    spaces no longer dilute the RTL share.

    Returns ``"rtl"`` when the first strong character is right-to-left or
    when right-to-left characters form the majority of the strong characters.
    Returns ``"ltr"`` otherwise, including for empty input and for text with
    no strong characters at all.
    """
    if not text:
        return "ltr"  # Default for empty input

    rtl_classes = {"R", "AL"}
    strong_classes = rtl_classes | {"L"}

    # Bidi class of every strong character, in reading order.
    strong = [
        bidi_class
        for bidi_class in map(unicodedata.bidirectional, text)
        if bidi_class in strong_classes
    ]
    if not strong:
        return "ltr"  # Nothing but neutral/weak characters (digits, spaces, ...)

    rtl_count = sum(bidi_class in rtl_classes for bidi_class in strong)
    if strong[0] in rtl_classes or rtl_count > len(strong) / 2:
        return "rtl"
    return "ltr"

0 comments on commit 3cf31cb

Please sign in to comment.