diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 66dd4a2c..3de333dc 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -78,10 +78,11 @@ def convert(self) -> DoclingDocument:
if self.is_valid():
assert self.soup is not None
+ content = self.soup.body or self.soup
# Replace <br> tags with newline characters
- for br in self.soup.body.find_all("br"):
+ for br in content.find_all("br"):
br.replace_with("\n")
- doc = self.walk(self.soup.body, doc)
+ doc = self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
diff --git a/tests/data/groundtruth/docling_v2/example_05.html.itxt b/tests/data/groundtruth/docling_v2/example_05.html.itxt
new file mode 100644
index 00000000..3fc541a3
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/example_05.html.itxt
@@ -0,0 +1,3 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: title: Omitted html and body tags
+ item-2 at level 2: table with [4x3]
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json
new file mode 100644
index 00000000..ae311397
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/example_05.html.json
@@ -0,0 +1,329 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.0.0",
+ "name": "example_05",
+ "origin": {
+ "mimetype": "text/html",
+ "binary_hash": 1499806583410518209,
+ "filename": "example_05.html"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ }
+ ],
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/tables/0"
+ }
+ ],
+ "label": "title",
+ "prov": [],
+ "orig": "Omitted html and body tags",
+ "text": "Omitted html and body tags"
+ }
+ ],
+ "pictures": [],
+ "tables": [
+ {
+ "self_ref": "#/tables/0",
+ "parent": {
+ "$ref": "#/texts/0"
+ },
+ "children": [],
+ "label": "table",
+ "prov": [],
+ "captions": [],
+ "references": [],
+ "footnotes": [],
+ "data": {
+ "table_cells": [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Header 1",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 2,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 3,
+ "text": "Header 2 & 3 (colspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 2,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Row 1 & 2, Col 1 (rowspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "Row 1, Col 2",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "Row 1, Col 3",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 2,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 3,
+ "text": "Row 2, Col 2 & 3 (colspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Row 3, Col 1",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "Row 3, Col 2",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "Row 3, Col 3",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ],
+ "num_rows": 4,
+ "num_cols": 3,
+ "grid": [
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Header 1",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 2,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 3,
+ "text": "Header 2 & 3 (colspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 2,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 3,
+ "text": "Header 2 & 3 (colspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ],
+ [
+ {
+ "row_span": 2,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Row 1 & 2, Col 1 (rowspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "Row 1, Col 2",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "Row 1, Col 3",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ],
+ [
+ {
+ "row_span": 2,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Row 1 & 2, Col 1 (rowspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 2,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 3,
+ "text": "Row 2, Col 2 & 3 (colspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 2,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 3,
+ "text": "Row 2, Col 2 & 3 (colspan)",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "Row 3, Col 1",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "Row 3, Col 2",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "Row 3, Col 3",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ]
+ ]
+ }
+ }
+ ],
+ "key_value_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/example_05.html.md b/tests/data/groundtruth/docling_v2/example_05.html.md
new file mode 100644
index 00000000..787f6d23
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/example_05.html.md
@@ -0,0 +1,7 @@
+# Omitted html and body tags
+
+| Header 1 | Header 2 & 3 (colspan) | Header 2 & 3 (colspan) |
+|----------------------------|----------------------------|----------------------------|
+| Row 1 & 2, Col 1 (rowspan) | Row 1, Col 2 | Row 1, Col 3 |
+| Row 1 & 2, Col 1 (rowspan) | Row 2, Col 2 & 3 (colspan) | Row 2, Col 2 & 3 (colspan) |
+| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 |
\ No newline at end of file
diff --git a/tests/data/html/example_05.html b/tests/data/html/example_05.html
new file mode 100644
index 00000000..e218ff1a
--- /dev/null
+++ b/tests/data/html/example_05.html
@@ -0,0 +1,20 @@
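+<h1>Omitted html and body tags</h1>
+<table>
+    <tr>
+        <th>Header 1</th>
+        <th colspan="2">Header 2 & 3 (colspan)</th>
+    </tr>
+    <tr>
+        <td rowspan="2">Row 1 & 2, Col 1 (rowspan)</td>
+        <td>Row 1, Col 2</td>
+        <td>Row 1, Col 3</td>
+    </tr>
+    <tr>
+        <td colspan="2">Row 2, Col 2 & 3 (colspan)</td>
+    </tr>
+    <tr>
+        <td>Row 3, Col 1</td>
+        <td>Row 3, Col 2</td>
+        <td>Row 3, Col 3</td>
+    </tr>
+</table>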