Skip to content

Commit

Permalink
fix: Detect RTL text and add the corresponding HTML dir attributes
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Feb 5, 2025
1 parent 327f902 commit 7a00100
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 193 deletions.
37 changes: 24 additions & 13 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path
from docling_core.types.doc.utils import get_text_direction, relative_path

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -866,7 +866,8 @@ def export_to_html(

caption_text = ""
if len(text) > 0:
caption_text = f"<figcaption>{text}</figcaption>"
dir = get_text_direction(text)
caption_text = f'<figcaption dir="{dir}">{text}</figcaption>'

default_response = f"<figure>{caption_text}</figure>"

Expand Down Expand Up @@ -1090,15 +1091,23 @@ def export_to_html(
if colspan > 1:
opening_tag += f' colspan="{colspan}"'

dir = get_text_direction(content)
opening_tag += f' dir="{dir}"'

body += f"<{opening_tag}>{content}</{celltag}>"
body += "</tr>"

dir = get_text_direction(text)

if len(text) > 0 and len(body) > 0:
body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
body = (
f'<table><caption dir="{dir}">{text}</caption>'
f"<tbody>{body}</tbody></table>"
)
elif len(text) == 0 and len(body) > 0:
body = f"<table><tbody>{body}</tbody></table>"
elif len(text) > 0 and len(body) == 0:
body = f"<table><caption>{text}</caption></table>"
body = f'<table><caption dir="{dir}">{text}</caption></table>'
else:
body = "<table></table>"

Expand Down Expand Up @@ -2470,16 +2479,18 @@ def _prepare_tag_content(
continue

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
text_inner = _prepare_tag_content(item.text)
dir = get_text_direction(item.text)
text = f'<h1 dir="{dir}">{text_inner}</h1>'
html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = min(item.level + 1, 6)
dir = get_text_direction(item.text)

text = (
f"<h{(section_level)}>"
f'<h{(section_level)} dir="{dir}">'
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
)
html_texts.append(text)
Expand Down Expand Up @@ -2544,13 +2555,13 @@ def _image_fallback(item: TextItem):
)

elif isinstance(item, ListItem):

text = f"<li>{_prepare_tag_content(item.text)}</li>"
dir = get_text_direction(item.text)
text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{_prepare_tag_content(item.text)}</li>"
dir = get_text_direction(item.text)
text = f'<li dir="{dir}">{_prepare_tag_content(item.text)}</li>'
html_texts.append(text)

elif isinstance(item, CodeItem):
Expand All @@ -2561,8 +2572,8 @@ def _image_fallback(item: TextItem):
html_texts.append(text)

elif isinstance(item, TextItem):

text = f"<p>{_prepare_tag_content(item.text)}</p>"
dir = get_text_direction(item.text)
text = f'<p dir="{dir}">{_prepare_tag_content(item.text)}</p>'
html_texts.append(text)
elif isinstance(item, TableItem):

Expand Down
17 changes: 17 additions & 0 deletions docling_core/types/doc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

"""Utils for document types."""

import unicodedata
from pathlib import Path


Expand Down Expand Up @@ -46,3 +47,19 @@ def relative_path(src: Path, target: Path) -> Path:

# Combine and return the result
return Path(*up_segments, *down_segments)


def get_text_direction(text: str) -> str:
    """Determine the text direction of a given string as LTR or RTL script.

    The result is meant to be used as the value of an HTML ``dir`` attribute.

    The text is considered right-to-left when either:

    * its very first character has a right-to-left bidirectional class
      (``R`` or ``AL``), or
    * it contains more right-to-left strong characters than left-to-right
      strong characters (class ``L``).

    Digits, whitespace and punctuation carry no direction of their own, so
    they are ignored by the majority vote. Counting them (as a plain
    ``rtl_chars > len(text) / 2`` test does) misclassifies strings such as
    ``"2024: <arabic word>"`` whose only letters are right-to-left.

    Args:
        text: The string to inspect. Empty (or otherwise falsy) input is
            treated as left-to-right.

    Returns:
        ``"rtl"`` or ``"ltr"``.
    """
    if not text:
        return "ltr"  # Default for empty input

    rtl_classes = {"R", "AL"}

    # A leading RTL character decides the direction on its own.
    if unicodedata.bidirectional(text[0]) in rtl_classes:
        return "rtl"

    rtl_count = 0
    ltr_count = 0
    for char in text:
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class in rtl_classes:
            rtl_count += 1
        elif bidi_class == "L":
            ltr_count += 1

    # Ties (including text without any strong character) fall back to LTR.
    return "rtl" if rtl_count > ltr_count else "ltr"
232 changes: 116 additions & 116 deletions test/data/doc/2206.01062.yaml.html

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions test/data/doc/bad_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,6 @@
}
</style>
</head>
<h1>This is the title</h1>
<h2>This is the first section</h2>
<h1 dir="ltr">This is the title</h1>
<h2 dir="ltr">This is the first section</h2>
</html>
30 changes: 15 additions & 15 deletions test/data/doc/constructed_doc.embedded.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -69,23 +69,23 @@
}
</style>
</head>
<h1>Title of the Document</h1>
<p>Author 1<br>Affiliation 1</p>
<p>Author 2<br>Affiliation 2</p>
<h2>1. Introduction</h2>
<p>This paper introduces the biggest invention ever made. ...</p>
<h1 dir="ltr">Title of the Document</h1>
<p dir="ltr">Author 1<br>Affiliation 1</p>
<p dir="ltr">Author 2<br>Affiliation 2</p>
<h2 dir="ltr">1. Introduction</h2>
<p dir="ltr">This paper introduces the biggest invention ever made. ...</p>
<ul>
<li>list item 1</li>
<li>list item 2</li>
<li>list item 3</li>
<li dir="ltr">list item 1</li>
<li dir="ltr">list item 2</li>
<li dir="ltr">list item 3</li>
<ol>
<li>list item 3.a</li>
<li>list item 3.b</li>
<li>list item 3.c</li>
<li dir="ltr">list item 3.a</li>
<li dir="ltr">list item 3.b</li>
<li dir="ltr">list item 3.c</li>
</ol>
<li>list item 4</li>
<li dir="ltr">list item 4</li>
</ul>
<table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
<figure><figcaption>This is the caption of figure 1.</figcaption></figure>
<figure><figcaption>This is the caption of figure 2.</figcaption><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAABNElEQVR4nO2ayw7DIAwEIf//z+nBErKAEBMe9rrMmYRZNr60xDCVW7AmTt1xwtsk0uu2H3rDiPosiY/PzlLnfFPpfmqFOqdXqGP9anWOXOsSrttp37WdKMBm+65NX7pSUc9oK7YasGAf3jQeAxixJxoy0iE2Sz2AqeMnnpQqAQzaE1WxPIBZe6LU8zUDxo+fyCQdNQBx/ARX9dIA0PETSdhFA3DHT5C2iwaggQ8QQQcgAd/ACaDNCaDNCaDNCaDNCaDNNfc/w81EDw1oC4ziIgDoGJC2iwYCYAlJ2EsDAaoEruqogQBSQibpq4FgvoRSr9KA2QxVsfonZDDDk5K7GUiYKqEh02rASIa2hkhS6xdsiZxoBlSqEG4qHeLNGeTb/dO1Sw7wxVcO8NVjDvDl75L91+9/ESIkdDQ3IX0AAAAASUVORK5CYII="></figure>
<table><caption dir="ltr">This is the caption of table 1.</caption><tbody><tr><td rowspan="2" dir="ltr">Product</td><td colspan="2" dir="ltr">Years</td></tr><tr><td dir="ltr">2016</td><td dir="ltr">2017</td></tr><tr><td dir="ltr">Apple</td><td dir="ltr">49823</td><td dir="ltr">695944</td></tr></tbody></table>
<figure><figcaption dir="ltr">This is the caption of figure 1.</figcaption></figure>
<figure><figcaption dir="ltr">This is the caption of figure 2.</figcaption><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAABNElEQVR4nO2ayw7DIAwEIf//z+nBErKAEBMe9rrMmYRZNr60xDCVW7AmTt1xwtsk0uu2H3rDiPosiY/PzlLnfFPpfmqFOqdXqGP9anWOXOsSrttp37WdKMBm+65NX7pSUc9oK7YasGAf3jQeAxixJxoy0iE2Sz2AqeMnnpQqAQzaE1WxPIBZe6LU8zUDxo+fyCQdNQBx/ARX9dIA0PETSdhFA3DHT5C2iwaggQ8QQQcgAd/ACaDNCaDNCaDNCaDNCaDNNfc/w81EDw1oC4ziIgDoGJC2iwYCYAlJ2EsDAaoEruqogQBSQibpq4FgvoRSr9KA2QxVsfonZDDDk5K7GUiYKqEh02rASIa2hkhS6xdsiZxoBlSqEG4qHeLNGeTb/dO1Sw7wxVcO8NVjDvDl75L91+9/ESIkdDQ3IX0AAAAASUVORK5CYII="></figure>
</html>
30 changes: 15 additions & 15 deletions test/data/doc/constructed_doc.placeholder.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -69,23 +69,23 @@
}
</style>
</head>
<h1>Title of the Document</h1>
<p>Author 1<br>Affiliation 1</p>
<p>Author 2<br>Affiliation 2</p>
<h2>1. Introduction</h2>
<p>This paper introduces the biggest invention ever made. ...</p>
<h1 dir="ltr">Title of the Document</h1>
<p dir="ltr">Author 1<br>Affiliation 1</p>
<p dir="ltr">Author 2<br>Affiliation 2</p>
<h2 dir="ltr">1. Introduction</h2>
<p dir="ltr">This paper introduces the biggest invention ever made. ...</p>
<ul>
<li>list item 1</li>
<li>list item 2</li>
<li>list item 3</li>
<li dir="ltr">list item 1</li>
<li dir="ltr">list item 2</li>
<li dir="ltr">list item 3</li>
<ol>
<li>list item 3.a</li>
<li>list item 3.b</li>
<li>list item 3.c</li>
<li dir="ltr">list item 3.a</li>
<li dir="ltr">list item 3.b</li>
<li dir="ltr">list item 3.c</li>
</ol>
<li>list item 4</li>
<li dir="ltr">list item 4</li>
</ul>
<table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
<figure><figcaption>This is the caption of figure 1.</figcaption></figure>
<figure><figcaption>This is the caption of figure 2.</figcaption></figure>
<table><caption dir="ltr">This is the caption of table 1.</caption><tbody><tr><td rowspan="2" dir="ltr">Product</td><td colspan="2" dir="ltr">Years</td></tr><tr><td dir="ltr">2016</td><td dir="ltr">2017</td></tr><tr><td dir="ltr">Apple</td><td dir="ltr">49823</td><td dir="ltr">695944</td></tr></tbody></table>
<figure><figcaption dir="ltr">This is the caption of figure 1.</figcaption></figure>
<figure><figcaption dir="ltr">This is the caption of figure 2.</figcaption></figure>
</html>
30 changes: 15 additions & 15 deletions test/data/doc/constructed_doc.referenced.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -69,23 +69,23 @@
}
</style>
</head>
<h1>Title of the Document</h1>
<p>Author 1<br>Affiliation 1</p>
<p>Author 2<br>Affiliation 2</p>
<h2>1. Introduction</h2>
<p>This paper introduces the biggest invention ever made. ...</p>
<h1 dir="ltr">Title of the Document</h1>
<p dir="ltr">Author 1<br>Affiliation 1</p>
<p dir="ltr">Author 2<br>Affiliation 2</p>
<h2 dir="ltr">1. Introduction</h2>
<p dir="ltr">This paper introduces the biggest invention ever made. ...</p>
<ul>
<li>list item 1</li>
<li>list item 2</li>
<li>list item 3</li>
<li dir="ltr">list item 1</li>
<li dir="ltr">list item 2</li>
<li dir="ltr">list item 3</li>
<ol>
<li>list item 3.a</li>
<li>list item 3.b</li>
<li>list item 3.c</li>
<li dir="ltr">list item 3.a</li>
<li dir="ltr">list item 3.b</li>
<li dir="ltr">list item 3.c</li>
</ol>
<li>list item 4</li>
<li dir="ltr">list item 4</li>
</ul>
<table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
<figure><figcaption>This is the caption of figure 1.</figcaption></figure>
<figure><figcaption>This is the caption of figure 2.</figcaption><img src="constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png"></figure>
<table><caption dir="ltr">This is the caption of table 1.</caption><tbody><tr><td rowspan="2" dir="ltr">Product</td><td colspan="2" dir="ltr">Years</td></tr><tr><td dir="ltr">2016</td><td dir="ltr">2017</td></tr><tr><td dir="ltr">Apple</td><td dir="ltr">49823</td><td dir="ltr">695944</td></tr></tbody></table>
<figure><figcaption dir="ltr">This is the caption of figure 1.</figcaption></figure>
<figure><figcaption dir="ltr">This is the caption of figure 2.</figcaption><img src="constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png"></figure>
</html>
30 changes: 15 additions & 15 deletions test/data/doc/constructed_document.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -69,23 +69,23 @@
}
</style>
</head>
<h1>Title of the Document</h1>
<p>Author 1<br>Affiliation 1</p>
<p>Author 2<br>Affiliation 2</p>
<h2>1. Introduction</h2>
<p>This paper introduces the biggest invention ever made. ...</p>
<h1 dir="ltr">Title of the Document</h1>
<p dir="ltr">Author 1<br>Affiliation 1</p>
<p dir="ltr">Author 2<br>Affiliation 2</p>
<h2 dir="ltr">1. Introduction</h2>
<p dir="ltr">This paper introduces the biggest invention ever made. ...</p>
<ul>
<li>list item 1</li>
<li>list item 2</li>
<li>list item 3</li>
<li dir="ltr">list item 1</li>
<li dir="ltr">list item 2</li>
<li dir="ltr">list item 3</li>
<ol>
<li>list item 3.a</li>
<li>list item 3.b</li>
<li>list item 3.c</li>
<li dir="ltr">list item 3.a</li>
<li dir="ltr">list item 3.b</li>
<li dir="ltr">list item 3.c</li>
</ol>
<li>list item 4</li>
<li dir="ltr">list item 4</li>
</ul>
<table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
<figure><figcaption>This is the caption of figure 1.</figcaption></figure>
<figure><figcaption>This is the caption of figure 2.</figcaption></figure>
<table><caption dir="ltr">This is the caption of table 1.</caption><tbody><tr><td rowspan="2" dir="ltr">Product</td><td colspan="2" dir="ltr">Years</td></tr><tr><td dir="ltr">2016</td><td dir="ltr">2017</td></tr><tr><td dir="ltr">Apple</td><td dir="ltr">49823</td><td dir="ltr">695944</td></tr></tbody></table>
<figure><figcaption dir="ltr">This is the caption of figure 1.</figcaption></figure>
<figure><figcaption dir="ltr">This is the caption of figure 2.</figcaption></figure>
</html>
4 changes: 2 additions & 2 deletions test/data/doc/dummy_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
}
</style>
</head>
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
<figure><figcaption>Figure 1: Four examples of complex page layouts across different document categories</figcaption></figure>
<h1 dir="ltr">DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
<figure><figcaption dir="ltr">Figure 1: Four examples of complex page layouts across different document categories</figcaption></figure>

</html>

0 comments on commit 7a00100

Please sign in to comment.