Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: robustify parser_v2 and add sanitize_cells to the library API #57

Merged
merged 40 commits into from
Dec 7, 2024
Merged
Changes from 1 commit
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
fa25762
feat: adding sanitize_cells to the parser_v2 library API
PeterStaar-IBM Nov 13, 2024
f930562
implemented the init_from
PeterStaar-IBM Nov 14, 2024
346d2ee
added method to extract text from page and bbox
PeterStaar-IBM Nov 14, 2024
65d1cb1
reformatted the code
PeterStaar-IBM Nov 14, 2024
4303cb8
fixed the nasty bugs
PeterStaar-IBM Nov 14, 2024
9a57b60
reformatted the code
PeterStaar-IBM Nov 14, 2024
9729745
merged with main
PeterStaar-IBM Nov 22, 2024
372385e
merged with main (2)
PeterStaar-IBM Nov 22, 2024
49ff147
added the S3 processing script
PeterStaar-IBM Nov 25, 2024
0aff345
working on fixing the missing fonts
PeterStaar-IBM Nov 28, 2024
4aa4739
made the v2 parser a lot more robust
PeterStaar-IBM Nov 28, 2024
5986eeb
found another bug
PeterStaar-IBM Nov 29, 2024
56474cb
refactored into annots
PeterStaar-IBM Nov 29, 2024
b5b8926
fixed the utf8 issues and be stricter with mismatching parameters
PeterStaar-IBM Nov 29, 2024
de1bef9
reformatted the python code
PeterStaar-IBM Nov 29, 2024
5fc2f9b
adopted the error versus warning messages and cleaned up the meta-dat…
PeterStaar-IBM Nov 29, 2024
643141d
fixed the sanitize-from-bbox function
PeterStaar-IBM Nov 29, 2024
ed5f0b8
updated the tests and README
PeterStaar-IBM Nov 30, 2024
f491457
updated the tests to be more robust
PeterStaar-IBM Dec 2, 2024
671a64a
restyled the code
PeterStaar-IBM Dec 2, 2024
c31e0ac
refactored the cell sanitisation with all parameters selectable and a…
PeterStaar-IBM Dec 3, 2024
a2498df
[BREAKING CHANGE]: rename docling_parse to pdf_parsers
PeterStaar-IBM Dec 4, 2024
abd946d
first attempt at fixing the ligatures
PeterStaar-IBM Dec 4, 2024
a4d08e5
added the visualization
PeterStaar-IBM Dec 4, 2024
7d4c4ab
Merge branch 'release_v3' of github.com:DS4SD/docling-parse into dev/…
cau-git Dec 4, 2024
8ea65ae
Rebase to release_v3
cau-git Dec 4, 2024
110a970
refactored the tests, updated with text output with ligatures and add…
PeterStaar-IBM Dec 5, 2024
6316d95
resolved merge conflicts
PeterStaar-IBM Dec 5, 2024
c764347
reformatted the code
PeterStaar-IBM Dec 5, 2024
1df1606
fixed the mypy errors
PeterStaar-IBM Dec 6, 2024
6999b8d
made the display text work
PeterStaar-IBM Dec 6, 2024
97c0497
updated the v1 parser with method documentation
PeterStaar-IBM Dec 6, 2024
ba6996e
added encode to see what fails
PeterStaar-IBM Dec 6, 2024
8913884
reformatted the code
PeterStaar-IBM Dec 6, 2024
cc4bde5
removed spurious utf8 test
PeterStaar-IBM Dec 6, 2024
489bbb1
refactored the parameter name and added test for sanitize_cells_in_bbox
PeterStaar-IBM Dec 6, 2024
7a70db3
added tough cases
PeterStaar-IBM Dec 6, 2024
d1b1cdf
fix the rhel build script
PeterStaar-IBM Dec 7, 2024
3914d2d
updated the extract_text_from_bbox
PeterStaar-IBM Dec 7, 2024
1fb52e9
added the function to detect the orientation
PeterStaar-IBM Dec 7, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fixed the mypy errors
Signed-off-by: Peter Staar <[email protected]>
PeterStaar-IBM committed Dec 6, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 1df160632eb73b531f4cf08ebcc781564a6912f3
46 changes: 15 additions & 31 deletions docling_parse/utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import json
import logging
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union

from PIL import Image, ImageColor, ImageDraw, ImageFont
from PIL.ImageFont import FreeTypeFont


def _draw_text_in_bounding_bbox(
img,
draw: ImageDraw.Draw,
img: Image.Image,
draw: ImageDraw.ImageDraw,
bbox: Tuple[float, float, float, float],
text: str,
font: Optional[ImageFont.ImageFont] = None,
font: Optional[Union[FreeTypeFont, ImageFont.ImageFont]] = None,
fill: str = "black",
):
) -> ImageDraw.ImageDraw:
"""
Draws text inside a bounding box by creating a temporary image,
resizing it, and pasting it into the original image at bbox.
@@ -75,7 +76,7 @@ def _draw_text_in_bounding_bbox(

# Paste the resized text image onto the original image
# draw.bitmap((paste_x, paste_y), resized_img)#, fill=None)
img.paste((paste_x, paste_y), "black", resized_img) # , fill=None)
img.paste((paste_x, paste_y), resized_img) # , "black") # , fill=None)

# draw.text((50, 50), text, font=font, fill=(0,0,0,255))

@@ -340,30 +341,18 @@ def _draw_annotations(
"""
# Create a blank white image with RGBA mode
img = Image.new("RGBA", (round(W), round(H)), (255, 255, 255, 255))
draw = ImageDraw.Draw(img)
"""
overlay = Image.new(
"RGBA", (round(W), round(H)), (255, 255, 255, 0)
) # Transparent overlay
draw = ImageDraw.Draw(overlay)
"""

# Draw each rectangle by connecting its four points
if draw_images:
for row in images:

"""
x0 = row[images_header.index("x0")]
y0 = row[images_header.index("y0")]
x1 = row[images_header.index("x1")]
y1 = row[images_header.index("y1")]

# Define the four corners of the rectangle
bl = (x0, H - y0)
br = (x1, H - y0)
tr = (x1, H - y1)
tl = (x0, H - y1)

# Draw the rectangle as a polygon
draw.polygon([bl, br, tr, tl], outline="green", fill="yellow")
"""
bbox = [
row[images_header.index("x0")],
row[images_header.index("y0")],
@@ -409,19 +398,14 @@ def _draw_annotations(
alpha=cell_alpha,
)

"""
if "glyph" in row[cells_header.index("text")]:
logging.info(f" skip cell -> {row}")
continue
"""

# Fixme: the _draw_text_in_bounding_bbox is not yet working
text = row[cells_header.index(f"text")]
if False and draw_cells_text and len(text) > 0:
if draw_cells_text and len(text) > 0:
draw = _draw_text_in_bounding_bbox(
overlay,
# overlay,
img,
draw,
bbox=[rect[0][0], rect[0][1], rect[2][0], rect[2][1]],
bbox=(rect[0][0], rect[0][1], rect[2][0], rect[2][1]),
text=text,
)

@@ -532,6 +516,6 @@ def _draw_annotations(
draw.polygon([bl, br, tr, tl], outline=outl_color, width=cropbox_width)

# Composite the overlay with the base image
img = Image.alpha_composite(img, overlay)
# img = Image.alpha_composite(img, overlay)

return img
13 changes: 8 additions & 5 deletions docling_parse/visualize.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
import os
from typing import Dict, Optional

from docling_parse.pdf_parsers import ( # type: ignore[attr-defined]
from docling_parse.pdf_parsers import ( # type: ignore[import]
pdf_parser_v1,
pdf_parser_v2,
)
@@ -333,14 +333,17 @@ def visualise_v2(

for category in categories:

img_orig = create_pil_image_of_page_v2(page, category=category)
img = create_pil_image_of_page_v2(
page, category=category, draw_cells_text=display_text
)

if interactive:
img_orig.show()
img.show()

if output_dir is not None and page_num == -1:
oname = os.path.join(
output_dir, f"{os.path.basename(pdf_path)}_page={pi}.v2.{_}.png"
output_dir,
f"{os.path.basename(pdf_path)}_page={pi}.v2.{category}.png",
)
logging.info(f"output: {oname}")

@@ -349,7 +352,7 @@ def visualise_v2(
elif output_dir is not None and page_num != -1:
oname = os.path.join(
output_dir,
f"{os.path.basename(pdf_path)}_page={page_num}.v2.{_}.png",
f"{os.path.basename(pdf_path)}_page={pi}.v2.{category}.png",
)
logging.info(f"output: {oname}")

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -109,6 +109,9 @@ show_error_codes = true
python_version = "3.9"
# plugins = ["pydantic.mypy"]

#[mypy-docling_parse.*]
#ignore_missing_imports = True

[[tool.mypy.overrides]]
module = [
"tabulate.*",
2 changes: 1 addition & 1 deletion tests/test_parse_v1.py
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@
import json
import os

from docling_parse.pdf_parsers import pdf_parser_v1 # type: ignore[attr-defined]
from docling_parse.pdf_parsers import pdf_parser_v1 # type: ignore[import]


def verify_reference_output(true_doc, pred_doc):
2 changes: 1 addition & 1 deletion tests/test_parse_v2.py
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@
import json
import os

from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[attr-defined]
from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import]
from docling_parse.utils import create_pil_image_of_page_v2