Skip to content

Commit

Permalink
reformatted the code
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Nov 8, 2024
1 parent 5c82ff9 commit 311640f
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 26 deletions.
105 changes: 81 additions & 24 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
DoclingDocument,
DocumentOrigin,
GroupLabel,
ImageRef,
Size,
TableCell,
TableData,
)
Expand Down Expand Up @@ -117,16 +119,6 @@ def walk(self, element, doc):
else:
_log.debug(f"ignoring element of type {type(element)}")

"""
elif isinstance(element, Tag):
try:
self.analyse_element(element, 0, doc)
except Exception as exc:
_log.info(f" -> error treating elem: {exc}")
raise exc
"""


except Exception as exc:
_log.debug(f"error walking element: {type(element)}")
pass
Expand Down Expand Up @@ -472,33 +464,98 @@ def extract_table_cell_text(self, cell):
)
return cell.text

def handle_figure(self, element, idx, doc):
"""Handles image tags (img)."""
def _get_imageref(self, element):

# Extract the image URI from the <img> tag
# image_uri = root.xpath('//figure//img/@src')[0]
fig_ref = None

contains_captions = element.find(["figcaption"])
if contains_captions is None:
doc.add_picture(parent=self.parents[self.level], caption=None)
img = element.find(["img"])
_log.info(img)

else:
if img is not None and img.has_attr("src"):
fig_uri = img["src"]
_log.info(fig_uri)

dpi = 128
try:
dpi = int(img["dpi"])
except:
_log.debug("could not identify `dpi` of image")

width = 128
try:
width = int(img["width"])
except:
_log.debug("could not identify `width` of image")

height = 128
try:
height = int(img["height"])
except:
_log.debug("could not identify `height` of image")

if fig_uri.endswith(".jpg"):
fig_ref = ImageRef(
mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
)

elif fig_uri.endswith(".jpeg"):
fig_ref = ImageRef(
mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
)

elif fig_uri.endswith(".png"):
fig_ref = ImageRef(
mimetype="image/png", dpi=dpi, size=Size(width, height), uri=fig_uri
)

elif fig_uri.endswith(".svg"):
fig_ref = ImageRef(
mimetype="image/svg", dpi=dpi, size=Size(width, height), uri=fig_uri
)

return fig_ref

def _get_figcaption(self, element, doc):

fig_caption = None

contains_captions = element.find(["figcaption"])
if contains_captions is not None:
texts = []
for item in contains_captions:
texts.append(item.text)

fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
)

return fig_caption

def handle_figure(self, element, idx, doc):
"""Handles image tags (img)."""

fig_ref = self._get_imageref(element)
fig_caption = self._get_figcaption(element, doc)

_log.warn(fig_ref)

doc.add_picture(
parent=self.parents[self.level], image=fig_ref, caption=fig_caption
)

def handle_image(self, element, idx, doc):
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)

fig_ref = self._get_imageref(element)

doc.add_picture(parent=self.parents[self.level], image=fig_ref, caption=None)

def handle_svg(self, element, idx, doc):
"""Handles svg tags."""
doc.add_picture(parent=self.parents[self.level], caption=None)

fig_ref = self._get_imageref(element)
fig_caption = self._get_figcaption(element, doc)

doc.add_picture(
parent=self.parents[self.level], image=fig_ref, caption=fig_caption
)
4 changes: 2 additions & 2 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ def export_documents(
if export_itxt:
fname = output_dir / f"{doc_filename}.itxt"
with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.document._export_to_indented_text())
_log.info(f"writing Indented Text output to {fname}")
fp.write(conv_res.document._export_to_indented_text())
else:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
Expand Down

0 comments on commit 311640f

Please sign in to comment.