From a112d7a03512e8a00842a100416426254d6ecfc0 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 27 Jan 2025 16:59:00 +0100 Subject: [PATCH] fix: parse html with omitted body tag (#818) * fix: parse HTML files without body tag Parse HTML files without 'body' tag, since it is optional in HTML5 specification. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * test: ensure docling converts HTML without body tag Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 5 +- .../docling_v2/example_05.html.itxt | 3 + .../docling_v2/example_05.html.json | 329 ++++++++++++++++++ .../groundtruth/docling_v2/example_05.html.md | 7 + tests/data/html/example_05.html | 20 ++ tests/test_backend_html.py | 3 +- 6 files changed, 364 insertions(+), 3 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/example_05.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_05.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_05.html.md create mode 100644 tests/data/html/example_05.html diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 66dd4a2c..3de333dc 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -78,10 +78,11 @@ def convert(self) -> DoclingDocument: if self.is_valid(): assert self.soup is not None + content = self.soup.body or self.soup # Replace
tags with newline characters - for br in self.soup.body.find_all("br"): + for br in content.find_all("br"): br.replace_with("\n") - doc = self.walk(self.soup.body, doc) + doc = self.walk(content, doc) else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init." diff --git a/tests/data/groundtruth/docling_v2/example_05.html.itxt b/tests/data/groundtruth/docling_v2/example_05.html.itxt new file mode 100644 index 00000000..3fc541a3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_05.html.itxt @@ -0,0 +1,3 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Omitted html and body tags + item-2 at level 2: table with [4x3] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json new file mode 100644 index 00000000..ae311397 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_05.html.json @@ -0,0 +1,329 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "example_05", + "origin": { + "mimetype": "text/html", + "binary_hash": 1499806583410518209, + "filename": "example_05.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "label": "title", + "prov": [], + "orig": "Omitted html and body tags", + "text": "Omitted html and body tags" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 4, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_05.html.md b/tests/data/groundtruth/docling_v2/example_05.html.md new file mode 100644 index 00000000..787f6d23 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_05.html.md @@ -0,0 +1,7 @@ +# Omitted html and body tags + +| Header 1 | Header 2 & 3 (colspan) | Header 2 & 3 (colspan) | +|----------------------------|----------------------------|----------------------------| +| Row 1 & 2, Col 1 (rowspan) | Row 1, Col 2 | Row 1, Col 3 | +| Row 1 & 2, Col 1 (rowspan) | Row 2, Col 2 & 3 (colspan) | Row 2, Col 2 & 3 (colspan) | +| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | \ No newline at end of file diff --git a/tests/data/html/example_05.html b/tests/data/html/example_05.html new file mode 100644 index 00000000..e218ff1a --- /dev/null +++ b/tests/data/html/example_05.html @@ -0,0 +1,20 @@ +

Omitted html and body tags

+ + + + + + + + + + + + + + + + + + +
Header 1Header 2 & 3 (colspan)
Row 1 & 2, Col 1 (rowspan)Row 1, Col 2Row 1, Col 3
Row 2, Col 2 & 3 (colspan)
Row 3, Col 1Row 3, Col 2Row 3, Col 3
diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 3bd27242..a4deb212 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -6,6 +6,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ( ConversionResult, + DoclingDocument, InputDocument, SectionHeaderItem, ) @@ -44,7 +45,7 @@ def get_html_paths(): # Define the directory you want to search directory = Path("./tests/data/html/") - # List all PDF files in the directory and its subdirectories + # List all HTML files in the directory and its subdirectories html_files = sorted(directory.rglob("*.html")) return html_files