Skip to content

Commit

Permalink
proposal for cau/rtl-text
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored and cau-git committed Feb 5, 2025
1 parent 7a00100 commit 537a6a6
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 16 deletions.
48 changes: 33 additions & 15 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import get_text_direction, relative_path
from docling_core.types.doc.utils import get_text_direction, get_html_tag_with_text_direction, relative_path

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -866,8 +866,7 @@ def export_to_html(

caption_text = ""
if len(text) > 0:
dir = get_text_direction(text)
caption_text = f'<figcaption dir="{dir}">{text}</figcaption>'
caption_text = get_html_tag_with_text_direction(html_tag="figcaption", text=text)

default_response = f"<figure>{caption_text}</figure>"

Expand Down Expand Up @@ -1091,23 +1090,31 @@ def export_to_html(
if colspan > 1:
opening_tag += f' colspan="{colspan}"'

dir = get_text_direction(content)
opening_tag += f' dir="{dir}"'
text_dir = get_text_direction(content)
if text_dir=="rtl":
opening_tag += f' dir="{dir}"'

body += f"<{opening_tag}>{content}</{celltag}>"
body += "</tr>"

dir = get_text_direction(text)
#dir = get_text_direction(text)

if len(text) > 0 and len(body) > 0:
caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text)
"""
body = (
f'<table><caption dir="{dir}">{text}</caption>'
f"<tbody>{body}</tbody></table>"
)
"""
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"

elif len(text) == 0 and len(body) > 0:
body = f"<table><tbody>{body}</tbody></table>"
elif len(text) > 0 and len(body) == 0:
body = f'<table><caption dir="{dir}">{text}</caption></table>'
caption_text = get_html_tag_with_text_direction(html_tag="caption", text=text)
#body = f'<table><caption dir="{dir}">{text}</caption></table>'
body = f'<table>{caption_text}</table>'
else:
body = "<table></table>"

Expand Down Expand Up @@ -2480,19 +2487,25 @@ def _prepare_tag_content(

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
text_inner = _prepare_tag_content(item.text)
dir = get_text_direction(item.text)
text = f'<h1 dir="{dir}">{text_inner}</h1>'
#dir = get_text_direction(item.text)
#text = f'<h1 dir="{dir}">{text_inner}</h1>'
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)

html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = min(item.level + 1, 6)

"""
dir = get_text_direction(item.text)
text = (
f'<h{(section_level)} dir="{dir}">'
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
)
"""
text = get_html_tag_with_text_direction(html_tag=f"h{section_level}", text=item.text)
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
Expand Down Expand Up @@ -2555,13 +2568,15 @@ def _image_fallback(item: TextItem):
)

elif isinstance(item, ListItem):
dir = get_text_direction(item.text)
text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
# dir = get_text_direction(item.text)
# text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text))
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
dir = get_text_direction(item.text)
text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
# dir = get_text_direction(item.text)
# text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
text = get_html_tag_with_text_direction(html_tag="li", text=_prepare_tag_content(item.text))
html_texts.append(text)

elif isinstance(item, CodeItem):
Expand All @@ -2572,9 +2587,12 @@ def _image_fallback(item: TextItem):
html_texts.append(text)

elif isinstance(item, TextItem):
dir = get_text_direction(item.text)
text = f'<p dir="{dir}">{_prepare_tag_content(item.text)}</p>'
# dir = get_text_direction(item.text)
# text = f'<p dir="{dir}">{_prepare_tag_content(item.text)}</p>'

text = get_html_tag_with_text_direction(html_tag="p", text=_prepare_tag_content(item.text))
html_texts.append(text)

elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand Down
12 changes: 11 additions & 1 deletion docling_core/types/doc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,17 @@ def relative_path(src: Path, target: Path) -> Path:
return Path(*up_segments, *down_segments)


def get_text_direction(text):
def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Wrap *text* in an HTML element, adding ``dir`` only when it is not LTR.

    The direction is obtained from ``get_text_direction(text)``. When that
    call returns ``"ltr"`` the element is emitted without a ``dir``
    attribute; for any other value the element carries
    ``dir="<direction>"`` with the value double-quoted.

    Args:
        html_tag: Tag name used for both the opening and closing tag,
            e.g. ``"p"``, ``"li"`` or ``"h1"``.
        text: Inner content of the element. It is inserted verbatim; this
            function performs no HTML escaping.

    Returns:
        The complete ``<tag ...>text</tag>`` string.
    """
    text_dir = get_text_direction(text)

    # Left-to-right text gets a bare tag so the common case stays free of
    # a redundant attribute.
    if text_dir == "ltr":
        return f"<{html_tag}>{text}</{html_tag}>"

    # Quote the attribute value so the markup matches the `dir="..."` form
    # used elsewhere in the HTML export and stays valid for strict parsers.
    return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'


def get_text_direction(text:str) -> str:
"""Determine the text direction of a given string as LTR or RTL script."""
if not text:
return "ltr" # Default for empty input
Expand Down

0 comments on commit 537a6a6

Please sign in to comment.