proposal for cau/rtl-text

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Feb 5, 2025 · 537a6a6 · 537a6a6
1 parent 7a00100
commit 537a6a6
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 16 deletions.
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -44,7 +44,7 @@
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
 from docling_core.types.doc.tokens import DocumentToken, TableToken
-from docling_core.types.doc.utils import get_text_direction, relative_path
+from docling_core.types.doc.utils import get_text_direction, get_html_tag_with_text_direction, relative_path
 
 _logger = logging.getLogger(__name__)
 
@@ -866,8 +866,7 @@ def export_to_html(
 
         caption_text = ""
         if len(text) > 0:
-            dir = get_text_direction(text)
-            caption_text = f'<figcaption dir="{dir}">{text}</figcaption>'
+            caption_text = get_html_tag_with_text_direction(html_tag="figcaption", text=text) 
 
         default_response = f"<figure>{caption_text}</figure>"
 
@@ -1091,23 +1090,31 @@ def export_to_html(
                 if colspan > 1:
                     opening_tag += f' colspan="{colspan}"'
 
-                dir = get_text_direction(content)
-                opening_tag += f' dir="{dir}"'
+                text_dir = get_text_direction(content)
+                if text_dir == "rtl":  # ltr is the HTML default; only mark rtl cells
+                    opening_tag += f' dir="{text_dir}"'
 
                 body += f"<{opening_tag}>{content}</{celltag}>"
             body += "</tr>"
 
-        dir = get_text_direction(text)
+        #dir = get_text_direction(text)
 
         if len(text) > 0 and len(body) > 0:
+            caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text)
+            """
             body = (
                 f'<table><caption dir="{dir}">{text}</caption>'
                 f"<tbody>{body}</tbody></table>"
             )
+            """
+            body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
+
         elif len(text) == 0 and len(body) > 0:
             body = f"<table><tbody>{body}</tbody></table>"
         elif len(text) > 0 and len(body) == 0:
-            body = f'<table><caption dir="{dir}">{text}</caption></table>'
+            caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text)
+            #body = f'<table><caption dir="{dir}">{text}</caption></table>'
+            body = f'<table>{caption_text}</table>'
         else:
             body = "<table></table>"
 
@@ -2480,19 +2487,25 @@ def _prepare_tag_content(
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
                 text_inner = _prepare_tag_content(item.text)
-                dir = get_text_direction(item.text)
-                text = f'<h1 dir="{dir}">{text_inner}</h1>'
+                #dir = get_text_direction(item.text)
+                #text = f'<h1 dir="{dir}">{text_inner}</h1>'
+                text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
+
                 html_texts.append(text)
 
             elif isinstance(item, SectionHeaderItem):
 
                 section_level: int = min(item.level + 1, 6)
+
+                """
                 dir = get_text_direction(item.text)
 
                 text = (
                     f'<h{(section_level)} dir="{dir}">'
                     f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
                 )
+                """
+                text = get_html_tag_with_text_direction(html_tag=f"h{section_level}", text=_prepare_tag_content(item.text))  # escape like the other branches
                 html_texts.append(text)
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
@@ -2555,13 +2568,15 @@ def _image_fallback(item: TextItem):
                     )
 
             elif isinstance(item, ListItem):
-                dir = get_text_direction(item.text)
-                text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
+                # dir = get_text_direction(item.text)
+                # text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
+                text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text))
                 html_texts.append(text)
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
-                dir = get_text_direction(item.text)
-                text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
+                # dir = get_text_direction(item.text)
+                # text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
+                text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text))
                 html_texts.append(text)
 
             elif isinstance(item, CodeItem):
@@ -2572,9 +2587,12 @@ def _image_fallback(item: TextItem):
                 html_texts.append(text)
 
             elif isinstance(item, TextItem):
-                dir = get_text_direction(item.text)
-                text = f'<p dir="{dir}">{_prepare_tag_content(item.text)}</p>'
+                # dir = get_text_direction(item.text)
+                # text = f'<p dir="{dir}">{_prepare_tag_content(item.text)}</p>'
+
+                text = get_html_tag_with_text_direction(html_tag="p", text=_prepare_tag_content(item.text))
                 html_texts.append(text)
+
             elif isinstance(item, TableItem):
 
                 text = item.export_to_html(doc=self, add_caption=True)

diff --git a/docling_core/types/doc/utils.py b/docling_core/types/doc/utils.py
@@ -49,7 +49,17 @@ def relative_path(src: Path, target: Path) -> Path:
     return Path(*up_segments, *down_segments)
 
 
-def get_text_direction(text):
+def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
+    """Wrap text in html_tag, adding a quoted dir attribute unless the text is LTR."""
+    text_dir = get_text_direction(text)
+
+    if text_dir == "ltr":
+        return f"<{html_tag}>{text}</{html_tag}>"
+    # Quote the value so the markup matches the dir="..." form used for table cells.
+    return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'
+
+
+def get_text_direction(text:str) -> str:
     """Determine the text direction of a given string as LTR or RTL script."""
     if not text:
         return "ltr"  # Default for empty input