diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 3e4812b3..64233b62 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -44,7 +44,7 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
-from docling_core.types.doc.utils import get_text_direction, relative_path
+from docling_core.types.doc.utils import get_text_direction, get_html_tag_with_text_direction, relative_path
_logger = logging.getLogger(__name__)
@@ -866,8 +866,7 @@ def export_to_html(
caption_text = ""
if len(text) > 0:
- dir = get_text_direction(text)
- caption_text = f'{text}'
+ caption_text = get_html_tag_with_text_direction(html_tag="figcaption", text=text)
default_response = f"{caption_text}"
@@ -1091,23 +1090,31 @@ def export_to_html(
if colspan > 1:
opening_tag += f' colspan="{colspan}"'
- dir = get_text_direction(content)
- opening_tag += f' dir="{dir}"'
+ text_dir = get_text_direction(content)
+ if text_dir == "rtl":  # only RTL cells get an explicit dir attribute
+     opening_tag += f' dir="{text_dir}"'
body += f"<{opening_tag}>{content}{celltag}>"
body += ""
- dir = get_text_direction(text)
+ #dir = get_text_direction(text)
if len(text) > 0 and len(body) > 0:
+ caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text)
+ """
body = (
f'
{text}
'
f"{body}
"
)
+ """
+ body = f"
{caption_text}{body}
"
+
elif len(text) == 0 and len(body) > 0:
body = f"
{body}
"
elif len(text) > 0 and len(body) == 0:
- body = f'
{text}
'
+ caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text)
+ #body = f'
{text}
'
+ body = f'
{caption_text}
'
else:
body = "
"
@@ -2480,19 +2487,25 @@ def _prepare_tag_content(
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
text_inner = _prepare_tag_content(item.text)
- dir = get_text_direction(item.text)
- text = f'
{text_inner}
'
+ #dir = get_text_direction(item.text)
+ #text = f'
{text_inner}
'
+ text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
+
html_texts.append(text)
elif isinstance(item, SectionHeaderItem):
section_level: int = min(item.level + 1, 6)
+
+ """
dir = get_text_direction(item.text)
text = (
f''
f"{_prepare_tag_content(item.text)}"
)
+ """
+ text = get_html_tag_with_text_direction(html_tag=f"h{section_level}", text=_prepare_tag_content(item.text))  # prepare content like the sibling branches
html_texts.append(text)
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
@@ -2555,13 +2568,15 @@ def _image_fallback(item: TextItem):
)
elif isinstance(item, ListItem):
- dir = get_text_direction(item.text)
- text = f'
{_prepare_tag_content(item.text)}
'
+ # dir = get_text_direction(item.text)
+ # text = f'
{_prepare_tag_content(item.text)}
'
+ text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text))
html_texts.append(text)
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
- dir = get_text_direction(item.text)
- text = f'
{_prepare_tag_content(item.text)}
'
+ # dir = get_text_direction(item.text)
+ # text = f'
{_prepare_tag_content(item.text)}
'
+ text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text))
html_texts.append(text)
elif isinstance(item, CodeItem):
@@ -2572,9 +2587,12 @@ def _image_fallback(item: TextItem):
html_texts.append(text)
elif isinstance(item, TextItem):
- dir = get_text_direction(item.text)
- text = f'
{_prepare_tag_content(item.text)}
'
+ # dir = get_text_direction(item.text)
+ # text = f'
{_prepare_tag_content(item.text)}
'
+
+ text = get_html_tag_with_text_direction(html_tag="p", text=_prepare_tag_content(item.text))
html_texts.append(text)
+
elif isinstance(item, TableItem):
text = item.export_to_html(doc=self, add_caption=True)
diff --git a/docling_core/types/doc/utils.py b/docling_core/types/doc/utils.py
index ca851dfa..d95c7c12 100644
--- a/docling_core/types/doc/utils.py
+++ b/docling_core/types/doc/utils.py
@@ -49,7 +49,17 @@ def relative_path(src: Path, target: Path) -> Path:
return Path(*up_segments, *down_segments)
-def get_text_direction(text):
+def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
+    """Wrap text (inserted as-is) in an html_tag element, adding a quoted dir attribute unless it is "ltr"."""
+    text_dir = get_text_direction(text)
+
+    if text_dir == "ltr":
+        return f"<{html_tag}>{text}</{html_tag}>"
+    else:
+        return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'
+
+
+def get_text_direction(text:str) -> str:
"""Determine the text direction of a given string as LTR or RTL script."""
if not text:
return "ltr" # Default for empty input