diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 3e4812b3..64233b62 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -44,7 +44,7 @@ from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel from docling_core.types.doc.tokens import DocumentToken, TableToken -from docling_core.types.doc.utils import get_text_direction, relative_path +from docling_core.types.doc.utils import get_text_direction, get_html_tag_with_text_direction, relative_path _logger = logging.getLogger(__name__) @@ -866,8 +866,7 @@ def export_to_html( caption_text = "" if len(text) > 0: - dir = get_text_direction(text) - caption_text = f'
{text}
' + caption_text = get_html_tag_with_text_direction(html_tag="figcaption", text=text) default_response = f"
{caption_text}
" @@ -1091,23 +1090,31 @@ def export_to_html( if colspan > 1: opening_tag += f' colspan="{colspan}"' - dir = get_text_direction(content) - opening_tag += f' dir="{dir}"' + text_dir = get_text_direction(content) + if text_dir=="rtl": + opening_tag += f' dir="{dir}"' body += f"<{opening_tag}>{content}" body += "" - dir = get_text_direction(text) + #dir = get_text_direction(text) if len(text) > 0 and len(body) > 0: + caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text) + """ body = ( f'' f"{body}
{text}
" ) + """ + body = f"{caption_text}{body}
" + elif len(text) == 0 and len(body) > 0: body = f"{body}
" elif len(text) > 0 and len(body) == 0: - body = f'
{text}
' + caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text) + #body = f'
{text}
' + body = f'{caption_text}
' else: body = "
" @@ -2480,19 +2487,25 @@ def _prepare_tag_content( elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: text_inner = _prepare_tag_content(item.text) - dir = get_text_direction(item.text) - text = f'

{text_inner}

' + #dir = get_text_direction(item.text) + #text = f'

{text_inner}

' + text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner) + html_texts.append(text) elif isinstance(item, SectionHeaderItem): section_level: int = min(item.level + 1, 6) + + """ dir = get_text_direction(item.text) text = ( f'' f"{_prepare_tag_content(item.text)}" ) + """ + text = get_html_tag_with_text_direction(html_tag=f"h{section_level}", text=item.text) html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]: @@ -2555,13 +2568,15 @@ def _image_fallback(item: TextItem): ) elif isinstance(item, ListItem): - dir = get_text_direction(item.text) - text = f'
  • {_prepare_tag_content(item.text)}
  • ' + # dir = get_text_direction(item.text) + # text = f'
  • {_prepare_tag_content(item.text)}
  • ' + text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text)) html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]: - dir = get_text_direction(item.text) - text = f'
  • {_prepare_tag_content(item.text)}
  • ' + # dir = get_text_direction(item.text) + # text = f'
  • {_prepare_tag_content(item.text)}
  • ' + text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text)) html_texts.append(text) elif isinstance(item, CodeItem): @@ -2572,9 +2587,12 @@ def _image_fallback(item: TextItem): html_texts.append(text) elif isinstance(item, TextItem): - dir = get_text_direction(item.text) - text = f'

    {_prepare_tag_content(item.text)}

    ' + # dir = get_text_direction(item.text) + # text = f'

    {_prepare_tag_content(item.text)}

    ' + + text = get_html_tag_with_text_direction(html_tag="p", text=_prepare_tag_content(item.text)) html_texts.append(text) + elif isinstance(item, TableItem): text = item.export_to_html(doc=self, add_caption=True) diff --git a/docling_core/types/doc/utils.py b/docling_core/types/doc/utils.py index ca851dfa..d95c7c12 100644 --- a/docling_core/types/doc/utils.py +++ b/docling_core/types/doc/utils.py @@ -49,7 +49,17 @@ def relative_path(src: Path, target: Path) -> Path: return Path(*up_segments, *down_segments) -def get_text_direction(text): +def get_html_tag_with_text_direction(html_tag:str, text:str) -> str: + + text_dir = get_text_direction(text) + + if text_dir=="ltr": + return f"<{html_tag}>{text}" + else: + return f"<{html_tag} dir={text_dir}>{text}" + + +def get_text_direction(text:str) -> str: """Determine the text direction of a given string as LTR or RTL script.""" if not text: return "ltr" # Default for empty input