diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index da72252b..7a01b2b2 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -44,7 +44,11 @@ from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel from docling_core.types.doc.tokens import DocumentToken, TableToken -from docling_core.types.doc.utils import relative_path +from docling_core.types.doc.utils import ( + get_html_tag_with_text_direction, + get_text_direction, + relative_path, +) _logger = logging.getLogger(__name__) @@ -866,7 +870,9 @@ def export_to_html( caption_text = "" if len(text) > 0: - caption_text = f"
{text}
" + caption_text = get_html_tag_with_text_direction( + html_tag="figcaption", text=text + ) default_response = f"
{caption_text}
" @@ -1090,15 +1096,28 @@ def export_to_html( if colspan > 1: opening_tag += f' colspan="{colspan}"' + text_dir = get_text_direction(content) + if text_dir == "rtl": + opening_tag += f' dir="{dir}"' + body += f"<{opening_tag}>{content}" body += "" + # dir = get_text_direction(text) + if len(text) > 0 and len(body) > 0: - body = f"{body}
{text}
" + caption_text = get_html_tag_with_text_direction( + html_tag="caption", text=text + ) + body = f"{caption_text}{body}
" + elif len(text) == 0 and len(body) > 0: body = f"{body}
" elif len(text) > 0 and len(body) == 0: - body = f"
{text}
" + caption_text = get_html_tag_with_text_direction( + html_tag="caption", text=text + ) + body = f"{caption_text}
" else: body = "
" @@ -2470,17 +2489,17 @@ def _prepare_tag_content( continue elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: + text_inner = _prepare_tag_content(item.text) + text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner) - text = f"

{_prepare_tag_content(item.text)}

" html_texts.append(text) elif isinstance(item, SectionHeaderItem): section_level: int = min(item.level + 1, 6) - text = ( - f"" - f"{_prepare_tag_content(item.text)}" + text = get_html_tag_with_text_direction( + html_tag=f"h{section_level}", text=_prepare_tag_content(item.text) ) html_texts.append(text) @@ -2544,13 +2563,15 @@ def _image_fallback(item: TextItem): ) elif isinstance(item, ListItem): - - text = f"
  • {_prepare_tag_content(item.text)}
  • " + text = get_html_tag_with_text_direction( + html_tag="li", text=_prepare_tag_content(item.text) + ) html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]: - - text = f"
  • {_prepare_tag_content(item.text)}
  • " + text = get_html_tag_with_text_direction( + html_tag="li", text=_prepare_tag_content(item.text) + ) html_texts.append(text) elif isinstance(item, CodeItem): @@ -2562,8 +2583,11 @@ def _image_fallback(item: TextItem): elif isinstance(item, TextItem): - text = f"

    import unicodedata


    def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
        """Form the HTML element with tag, text, and optional dir attribute.

        Args:
            html_tag: Element name to emit, e.g. ``"p"``, ``"li"`` or ``"h1"``.
            text: Inner content of the element. It is inserted verbatim; no
                escaping is performed here.

        Returns:
            ``<tag>text</tag>`` when the text is detected as left-to-right,
            otherwise ``<tag dir="...">text</tag>`` carrying the detected
            direction.
        """
        text_dir = get_text_direction(text)
        # LTR is the HTML default, so the attribute is only written when needed.
        dir_attr = "" if text_dir == "ltr" else f' dir="{text_dir}"'
        # Always close the element so the produced markup stays balanced.
        return f"<{html_tag}{dir_attr}>{text}</{html_tag}>"


    def get_text_direction(text: str) -> str:
        """Determine the text direction of a given string as LTR or RTL script.

        The string is reported as ``"rtl"`` when its first character has the
        Unicode bidirectional class ``R`` or ``AL``, or when more than half of
        all its characters do. Everything else, including the empty string,
        is reported as ``"ltr"``.

        Args:
            text: The string to inspect.

        Returns:
            Either ``"rtl"`` or ``"ltr"``.
        """
        if not text:
            return "ltr"  # Default for empty input

        rtl_classes = {"R", "AL"}

        # A leading RTL character decides the direction on its own.
        if unicodedata.bidirectional(text[0]) in rtl_classes:
            return "rtl"

        # Otherwise fall back to a majority vote over every character.
        rtl_count = sum(unicodedata.bidirectional(ch) in rtl_classes for ch in text)
        return "rtl" if rtl_count > len(text) / 2 else "ltr"