diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 0894deeb..3d3d3ca9 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -4,6 +4,8 @@
from typing import Set, Union
from bs4 import BeautifulSoup
+from bs4.element import Tag
+
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@@ -25,7 +27,7 @@ def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
- skip_furniture: bool = False,
+ skip_furniture: bool = True,
):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
@@ -45,10 +47,16 @@ def __init__(
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
+ print("BytesIO")
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
+ print("file")
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
+
+ with open("./scratch/file.html", "w") as fw:
+ fw.write(html_content)
+
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
@@ -101,27 +109,38 @@ def convert(self) -> DoclingDocument:
def walk(self, element, doc):
try:
- # Iterate over elements in the body of the document
- for idx, element in enumerate(element.children):
- try:
- self.analyse_element(element, idx, doc)
- except Exception as exc_child:
-
- _log.error(" -> error treating child: ", exc_child)
- _log.error(" => element: ", element, "\n")
- raise exc_child
+ if isinstance(element, Tag) and any(element.children):
+ # Iterate over elements in the body of the document
+ for idx, child in enumerate(element.children):
+ try:
+ self.analyse_element(child, idx, doc)
+ except Exception as exc:
+ _log.error(f" -> error treating child: {exc}")
+ raise exc
+ elif isinstance(element, Tag):
+ try:
+ self.analyse_element(element, 0, doc)
+ except Exception as exc:
+ _log.error(f" -> error treating elem: {exc}")
+ raise exc
+ else:
+ _log.warn(f"ignoring element of type {type(element)}")
+
except Exception as exc:
+ _log.warn(f"error walking element: {type(element)}")
pass
return doc
def analyse_element(self, element, idx, doc):
- """
- if element.name!=None:
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
- """
+ if element.name!=None:
+ #_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+ print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+
+ #print(element.name)
+
if element.name in self.labels:
self.labels[element.name] += 1
else:
@@ -134,8 +153,11 @@ def analyse_element(self, element, idx, doc):
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
+ print(" --> detected ...")
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_paragraph(element, idx, doc)
+ print(" --> registered ...")
+
elif element.name in ["ul", "ol"]:
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_list(element, idx, doc)
@@ -151,6 +173,33 @@ def analyse_element(self, element, idx, doc):
elif element.name == "img":
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_image(element, idx, doc)
+ elif element.name == "svg":
+ if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+ #self.handle_image(element, idx, doc)
+ _log.warn("Add `svg` elements")
+
+ elif True and isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
+ try:
+ #print("\n\n\nattempt decoding: ", element['data-content'])
+
+ # Decode the data-content attribute
+ #data_content = html.unescape(element['data-content'])
+ #print(data_content)
+
+ data_content = element['data-content']
+
+ # Parse the decoded HTML content
+ content_soup = BeautifulSoup(data_content, 'html.parser')
+ print("\n\n\nsoup: ", content_soup)
+
+ for jdx, _ in enumerate(content_soup):
+ print(_)
+ self.analyse_element(_, jdx, doc)
+ except:
+ _log.warn("could not parse the `data-content` attribute")
+
+ self.walk(element, doc)
+
else:
self.walk(element, doc)
@@ -229,11 +278,16 @@ def handle_header(self, element, idx, doc):
def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:
+ print(" -> text is None ...")
return
text = element.text.strip()
+ print("doc is adding paragraph: ", text)
+
label = DocItemLabel.PARAGRAPH
if len(text) == 0:
+ print(" -> text is zero length ...")
return
+ print("doc is adding paragraph: ", text)
doc.add_text(parent=self.parents[self.level], label=label, text=text)
def handle_list(self, element, idx, doc):