diff --git a/tests/data/groundtruth/docling_v2/example_05.html.itxt b/tests/data/groundtruth/docling_v2/example_05.html.itxt new file mode 100644 index 00000000..3fc541a3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_05.html.itxt @@ -0,0 +1,3 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Omitted html and body tags + item-2 at level 2: table with [4x3] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json new file mode 100644 index 00000000..ae311397 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_05.html.json @@ -0,0 +1,329 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "example_05", + "origin": { + "mimetype": "text/html", + "binary_hash": 1499806583410518209, + "filename": "example_05.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "label": "title", + "prov": [], + "orig": "Omitted html and body tags", + "text": "Omitted html and body tags" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 4, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_05.html.md b/tests/data/groundtruth/docling_v2/example_05.html.md new file mode 100644 index 00000000..787f6d23 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_05.html.md @@ -0,0 +1,7 @@ +# Omitted html and body tags + +| Header 1 | Header 2 & 3 (colspan) | Header 2 & 3 (colspan) | +|----------------------------|----------------------------|----------------------------| +| Row 1 & 2, Col 1 (rowspan) | Row 1, Col 2 | Row 1, Col 3 | +| Row 1 & 2, Col 1 (rowspan) | Row 2, Col 2 & 3 (colspan) | Row 2, Col 2 & 3 (colspan) | +| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | \ No newline at end of file diff --git a/tests/data/html/example_05.html b/tests/data/html/example_05.html new file mode 100644 index 00000000..e218ff1a --- /dev/null +++ b/tests/data/html/example_05.html @@ -0,0 +1,20 @@ +
Header 1 | +Header 2 & 3 (colspan) | +|
---|---|---|
Row 1 & 2, Col 1 (rowspan) | +Row 1, Col 2 | +Row 1, Col 3 | +
Row 2, Col 2 & 3 (colspan) | +||
Row 3, Col 1 | +Row 3, Col 2 | +Row 3, Col 3 | +