Skip to content

Commit

Permalink
updated the visualize script to use the pdf-parser
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 16, 2025
1 parent 49097d7 commit 229c0c4
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 46 deletions.
29 changes: 14 additions & 15 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,10 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
class PdfBaseElement(BaseModel):
    """Common base model for the PDF element classes of this module.

    ``PdfColoredElement`` and ``PdfBitmapResource`` derive from it and add
    their own fields on top of ``ordering``.
    """

    # Integer ordering key of the element; presumably its index in the order
    # the parser emitted it -- TODO confirm against the code that fills it in.
    ordering: int


class PdfColoredElement(PdfBaseElement):
    """PDF element that additionally carries an RGBA colour.

    The ``rgba`` field is declared a single time; a second identical
    annotation would only rebind the same attribute.
    """

    # Default colour used when the caller supplies none: r=0, g=0, b=0, a=255.
    rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)


class PdfCell(PdfColoredElement):

Expand All @@ -166,6 +167,7 @@ def to_bottom_left_origin(self, page_height: float):
def to_top_left_origin(self, page_height: float):
self.rect = self.rect.to_top_left_origin(page_height=page_height)


class PdfBitmapResource(PdfBaseElement):

rect: BoundingRectangle
Expand All @@ -176,13 +178,14 @@ def to_bottom_left_origin(self, page_height: float):

def to_top_left_origin(self, page_height: float):
self.rect = self.rect.to_top_left_origin(page_height=page_height)



class PdfLine(PdfColoredElement):

parent_id: int
points: List[Tuple[float, float]]
width: float = 1.0

coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

def __len__(self) -> int:
Expand Down Expand Up @@ -316,6 +319,13 @@ def crop_text(self, bbox: BoundingBox, eps: float = 1.0):
text += " "
text += cell.text

def export_to_textlines(self, add_location: bool = True) -> List[str]:
    """Return the text of every cell, one list entry per cell.

    Args:
        add_location: Accepted for interface compatibility; the current
            implementation does not read it, so the output is the same for
            ``True`` and ``False``.

    Returns:
        A new list with the string form of ``cell.text`` for each cell in
        ``self.cells``, in iteration order. Empty when there are no cells.
    """
    # The f-string keeps the original coercion of non-str ``text`` values.
    return [f"{cell.text}" for cell in self.cells]

def render(
self,
boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX, # media_box
Expand Down Expand Up @@ -513,17 +523,6 @@ def _draw_text_in_bounding_bbox(
return result


# class ParsedPageLabel(str, Enum):
# """ParsedPageLabel."""
#
# ORIGINAL = "orginal"
# SANITIZED = "sanitized"
#
# def __str__(self):
# """Get string value."""
# return str(self.value)


class ParsedPage(BaseModel):

original: SegmentedPage
Expand Down
11 changes: 6 additions & 5 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ def get_page(self, page_no: int) -> ParsedPage:
self._pages[page_no] = self._to_parsed_page(page) # put on cache
return self._pages[page_no]

else:
raise ValueError(
f"incorrect page_no: {page_no} for key={self._key} (min:1, max:{self.number_of_pages()})"
)
raise ValueError(
f"incorrect page_no: {page_no} for key={self._key} (min:1, max:{self.number_of_pages()})"
)

return ParsedPage()

def load_all_pages(self):
doc_dict = self._parser.parse_pdf_from_key(
Expand Down Expand Up @@ -238,7 +239,7 @@ def _to_lines(self, data: dict) -> List[PdfLine]:
line = PdfLine(
ordering=ind,
parent_id=l,
points=points,
points=points,
)
result.append(line)

Expand Down
35 changes: 22 additions & 13 deletions docling_parse/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@
import os
from typing import Dict, Optional

from docling_parse.document import ParsedPage
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
from docling_parse.pdf_parsers import ( # type: ignore[import]
pdf_parser_v1,
pdf_parser_v2,
)
from docling_parse.utils import create_pil_image_of_page_v1, create_pil_image_of_page_v2

from docling_parse.document import PageBoundaryType, ParsedPage
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
Expand Down Expand Up @@ -277,15 +276,26 @@ def visualise_py(

pdf_page: ParsedPage = pdf_doc.get_page(page_no=page_num)

if category=="both":
pdf_page.original.render(draw_cells_bbox=(not display_text), draw_cells_text=display_text).show()
pdf_page.sanitized.render(draw_cells_bbox=(not display_text), draw_cells_text=display_text).show()
elif category=="sanitized":
pdf_page.sanitized.render(draw_cells_bbox=(not display_text), draw_cells_text=display_text).show()
elif category=="original":
pdf_page.original.render(draw_cells_bbox=(not display_text), draw_cells_text=display_text).show()


if category == "both":
pdf_page.original.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
pdf_page.sanitized.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
elif category == "sanitized":
pdf_page.sanitized.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
elif category == "original":
pdf_page.original.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()

lines = pdf_page.sanitized.export_to_textlines()
print("\n".join(lines))


def main():

(
Expand Down Expand Up @@ -335,7 +345,6 @@ def main():
)
else:
return -1



if __name__ == "__main__":
Expand Down
38 changes: 25 additions & 13 deletions src/v2/pdf_states/grph.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,23 +211,35 @@ namespace pdflib
QPDFObjectHandle arr = instructions[0].obj;

//assert(arr.isArray());
if(not arr.isArray()) { LOG_S(ERROR) << "instructions[0].obj is not an array"; return; }

for(int l=0; l<arr.getArrayNItems(); l++)
//if(not arr.isArray()) { LOG_S(ERROR) << "instructions[0].obj is not an array"; return; }
if(arr.isArray())
{
QPDFObjectHandle item = arr.getArrayItem(l);

//assert(item.isNumber());
if(item.isNumber())
for(int l=0; l<arr.getArrayNItems(); l++)
{
double val = item.getNumericValue();
dash_array.push_back(val);
}
else
{
LOG_S(WARNING) << "skipping items for dash_array ...";
QPDFObjectHandle item = arr.getArrayItem(l);

//assert(item.isNumber());
if(item.isNumber())
{
double val = item.getNumericValue();
dash_array.push_back(val);
}
else
{
LOG_S(WARNING) << "skipping items for dash_array ...";
}
}
}
else if(arr.isNull())
{
LOG_S(WARNING) << "instructions[0].obj is null, re-interpreting it as an empty array";
dash_array = {};
}
else
{
LOG_S(ERROR) << "instructions[0].obj is not an array nor null, defualting to an empty array";
dash_array = {};
}

if(instructions[1].is_integer())
{
Expand Down

0 comments on commit 229c0c4

Please sign in to comment.