diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 9dfbadb6..0894deeb 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -21,7 +21,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], skip_furniture:bool=False): + def __init__( + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + skip_furniture: bool = False, + ): super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") self.soup = None @@ -36,7 +41,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.labels = {} # type: ignore self.skip_furniture = skip_furniture - + try: if isinstance(self.path_or_stream, BytesIO): text_stream = self.path_or_stream.getvalue().decode("utf-8") @@ -84,9 +89,9 @@ def convert(self) -> DoclingDocument: for br in self.soup.body.find_all("br"): br.replace_with("\n") - self.contains_h1 = bool(soup.find('h1')) and self.skip_furniture + self.contains_h1 = bool(self.soup.find("h1")) and self.skip_furniture self.detected_h1 = False - + doc = self.walk(self.soup.body, doc) else: raise RuntimeError( @@ -124,7 +129,7 @@ def analyse_element(self, element, idx, doc): if element.name in ["h1"]: self.detected_h1 = True - + if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): self.handle_header(element, idx, doc)