Skip to content

Commit

Permalink
fixed some issues
Browse files (browse the repository at this point in the history)
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 24, 2025
1 parent 8ec3f17 commit 60c2a86
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 40 deletions.
88 changes: 53 additions & 35 deletions docling/backend/msexcel_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from typing import Any, List

from PIL import Image
from PIL import Image as PILImage
from pydantic import BaseModel


Expand Down Expand Up @@ -327,43 +327,61 @@ def _find_images_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:

# Iterate over images in the sheet
# Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore

image_bytes = BytesIO(image.ref.blob)
pil_image = Image.open(image_bytes)

doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
try:
pil_image = PILImage.open(image.ref)

doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except:
_log.error("could not extract the image from excel sheets")

"""
for idx, chart in enumerate(sheet._charts): # type: ignore
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)

_log.info(f"Chart {idx + 1}:")

# Chart type
_log.info(f"Type: {type(chart).__name__}")

# Title
if chart.title:
_log.info(f"Title: {chart.title}")
else:
_log.info("No title")

# Data series
for series in chart.series:
_log.info(" => series ...")
_log.info(f"Data Series: {series.title}")
_log.info(f"Values: {series.values}")
_log.info(f"Categories: {series.categories}")

# Position
# _log.info(f"Anchor Cell: {chart.anchor}")
try:
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
_log.info(f"Chart {idx + 1}:")
# Chart type
# _log.info(f"Type: {type(chart).__name__}")
print(f"Type: {type(chart).__name__}")
# Extract series data
for series_idx, series in enumerate(chart.series):
#_log.info(f"Series {series_idx + 1}:")
print(f"Series {series_idx + 1} type: {type(series).__name__}")
#print(f"x-values: {series.xVal}")
#print(f"y-values: {series.yVal}")
print(f"xval type: {type(series.xVal).__name__}")
xvals = []
for _ in series.xVal.numLit.pt:
print(f"xval type: {type(_).__name__}")
if hasattr(_, 'v'):
xvals.append(_.v)
print(f"x-values: {xvals}")
yvals = []
for _ in series.yVal:
if hasattr(_, 'v'):
yvals.append(_.v)
print(f"y-values: {yvals}")
except Exception as exc:
print(exc)
continue
"""

return doc
3 changes: 2 additions & 1 deletion tests/data/groundtruth/docling_v2/test-01.xlsx.itxt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ item-0 at level 0: unspecified: group _root_
item-6 at level 2: table with [5x3]
item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-10 at level 2: picture
30 changes: 28 additions & 2 deletions tests/data/groundtruth/docling_v2/test-01.xlsx.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion tests/data/groundtruth/docling_v2/test-01.xlsx.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,6 @@
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |
| 10 | 9 | 9 |

<!-- image -->
Binary file modified tests/data/xlsx/test-01.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_backend_msexcel.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_e2e_xlsx_conversions():
converter = get_converter()

for xlsx_path in xlsx_paths:
# print(f"converting {xlsx_path}")
print(f"converting {xlsx_path}")

gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
Expand Down

0 comments on commit 60c2a86

Please sign in to comment.