Skip to content

Commit

Permalink
Added handling of code blocks in HTML with the <pre> tag (#302)
Browse files Browse the repository at this point in the history
Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Nov 11, 2024
1 parent 1239ade commit 53bf2d1
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc):
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
elif element.name in ["pre"]:
self.handle_code(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
Expand Down Expand Up @@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc):
level=hlevel,
)

def handle_code(self, element, idx, doc):
    """Handles monospace code snippets (pre).

    Reads the element's text, strips surrounding whitespace and, when
    anything remains, adds it to ``doc`` as a text item labelled
    ``DocItemLabel.CODE`` under the parent at the current level.
    Elements whose text is ``None`` or whitespace-only add nothing.
    """
    raw = element.text
    if raw is None:
        return

    snippet = raw.strip()
    if not snippet:
        # Nothing but whitespace inside the tag: do not emit an empty item.
        return

    doc.add_text(
        parent=self.parents[self.level],
        label=DocItemLabel.CODE,
        text=snippet,
    )

def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:
Expand Down

0 comments on commit 53bf2d1

Please sign in to comment.