Skip to content

Commit

Permalink
fix: parse html with omitted body tag (#818)
Browse files Browse the repository at this point in the history
* fix: parse HTML files without body tag

Parse HTML files without a 'body' tag, since the tag is optional in the HTML5 specification.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* test: ensure docling converts HTML without body tag

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam authored Jan 27, 2025
1 parent 95b293a commit a112d7a
Show file tree
Hide file tree
Showing 6 changed files with 364 additions and 3 deletions.
5 changes: 3 additions & 2 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,11 @@ def convert(self) -> DoclingDocument:

if self.is_valid():
assert self.soup is not None
content = self.soup.body or self.soup
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
for br in content.find_all("br"):
br.replace_with("\n")
doc = self.walk(self.soup.body, doc)
doc = self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
Expand Down
3 changes: 3 additions & 0 deletions tests/data/groundtruth/docling_v2/example_05.html.itxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Omitted html and body tags
item-2 at level 2: table with [4x3]
329 changes: 329 additions & 0 deletions tests/data/groundtruth/docling_v2/example_05.html.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
{
"schema_name": "DoclingDocument",
"version": "1.0.0",
"name": "example_05",
"origin": {
"mimetype": "text/html",
"binary_hash": 1499806583410518209,
"filename": "example_05.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"label": "title",
"prov": [],
"orig": "Omitted html and body tags",
"text": "Omitted html and body tags"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Header 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 2,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 3,
"text": "Header 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 2,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 2,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 3,
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 4,
"num_cols": 3,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Header 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 2,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 3,
"text": "Header 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 2,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 3,
"text": "Header 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 2,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 2,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 2,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 3,
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 2,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 3,
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
}
}
],
"key_value_items": [],
"pages": {}
}
7 changes: 7 additions & 0 deletions tests/data/groundtruth/docling_v2/example_05.html.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Omitted html and body tags

| Header 1 | Header 2 & 3 (colspan) | Header 2 & 3 (colspan) |
|----------------------------|----------------------------|----------------------------|
| Row 1 & 2, Col 1 (rowspan) | Row 1, Col 2 | Row 1, Col 3 |
| Row 1 & 2, Col 1 (rowspan) | Row 2, Col 2 & 3 (colspan) | Row 2, Col 2 & 3 (colspan) |
| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 |
20 changes: 20 additions & 0 deletions tests/data/html/example_05.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<h1>Omitted html and body tags</h1>
<table>
<tr>
<th>Header 1</th>
<th colspan="2">Header 2 & 3 (colspan)</th>
</tr>
<tr>
<td rowspan="2">Row 1 & 2, Col 1 (rowspan)</td>
<td>Row 1, Col 2</td>
<td>Row 1, Col 3</td>
</tr>
<tr>
<td colspan="2">Row 2, Col 2 & 3 (colspan)</td>
</tr>
<tr>
<td>Row 3, Col 1</td>
<td>Row 3, Col 2</td>
<td>Row 3, Col 3</td>
</tr>
</table>
3 changes: 2 additions & 1 deletion tests/test_backend_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
Expand Down Expand Up @@ -44,7 +45,7 @@ def get_html_paths():
# Define the directory you want to search
directory = Path("./tests/data/html/")

# List all PDF files in the directory and its subdirectories
# List all HTML files in the directory and its subdirectories
html_files = sorted(directory.rglob("*.html"))
return html_files

Expand Down

0 comments on commit a112d7a

Please sign in to comment.