From baf622ffad6dbc53ec5536945f36883de2f110c7 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 27 Jan 2025 15:12:18 +0100 Subject: [PATCH] fix: parse HTML files without body tag Parse HTML files without 'body' tag, since it is optional in HTML5 specification. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 66dd4a2c..3de333dc 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -78,10 +78,11 @@ def convert(self) -> DoclingDocument: if self.is_valid(): assert self.soup is not None + content = self.soup.body or self.soup # Replace <br> tags with newline characters - for br in self.soup.body.find_all("br"): + for br in content.find_all("br"): br.replace_with("\n") - doc = self.walk(self.soup.body, doc) + doc = self.walk(content, doc) else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init."