diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index da72252b..7a01b2b2 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -44,7 +44,11 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
-from docling_core.types.doc.utils import relative_path
+from docling_core.types.doc.utils import (
+ get_html_tag_with_text_direction,
+ get_text_direction,
+ relative_path,
+)
_logger = logging.getLogger(__name__)
@@ -866,7 +870,9 @@ def export_to_html(
caption_text = ""
if len(text) > 0:
# NOTE: the comment lines below are the remainder of the collapsed diff that
# shared this span with the two helpers. They belong to definitions that are
# only partly visible here, so they are kept verbatim and not interpreted.
#   - caption_text = f"
#   {_prepare_tag_content(item.text)}
#   "
#   + text = get_html_tag_with_text_direction(
#   + html_tag="p", text=_prepare_tag_content(item.text)
#   + )
#   html_texts.append(text)
#   +
#   elif isinstance(item, TableItem):
#   text = item.export_to_html(doc=self, add_caption=True)
#   diff --git a/docling_core/types/doc/utils.py b/docling_core/types/doc/utils.py
#   index 14c4053a..c89e7836 100644
#   --- a/docling_core/types/doc/utils.py
#   +++ b/docling_core/types/doc/utils.py
#   @@ -5,6 +5,7 @@
#   """Utils for document types."""
#   +import unicodedata
#   from pathlib import Path
#   @@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
#   # Combine and return the result
#   return Path(*up_segments, *down_segments)


def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Form the HTML element with tag, text, and optional dir attribute.

    Args:
        html_tag: Element name without angle brackets, e.g. ``"p"``.
        text: Element content. It is inserted verbatim; no escaping is
            performed by this function.

    Returns:
        ``<tag>text</tag>`` when :func:`get_text_direction` reports ``"ltr"``,
        otherwise ``<tag dir="...">text</tag>`` carrying the detected
        direction.
    """
    text_dir = get_text_direction(text)

    # Only a non-LTR direction is written out as an explicit ``dir`` attribute.
    if text_dir == "ltr":
        return f"<{html_tag}>{text}</{html_tag}>"
    return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'


def get_text_direction(text: str) -> str:
    """Determine the text direction of a given string as LTR or RTL script.

    The string is classified as ``"rtl"`` when its first character has the
    Unicode bidirectional class ``R`` or ``AL``, or when strictly more than
    half of all its characters do. Anything else, including the empty string,
    is ``"ltr"``.

    Args:
        text: The string to inspect.

    Returns:
        Either ``"ltr"`` or ``"rtl"``.
    """
    if not text:
        return "ltr"  # Default for empty input

    rtl_classes = {"R", "AL"}
    rtl_count = sum(unicodedata.bidirectional(ch) in rtl_classes for ch in text)
    starts_rtl = unicodedata.bidirectional(text[0]) in rtl_classes

    # The majority test is taken over every character, so spaces, digits and
    # punctuation count toward the total length.
    if starts_rtl or rtl_count > len(text) / 2:
        return "rtl"
    return "ltr"