Skip to content

Commit

Permalink
refactored the cell sanitisation with all parameters selectable and added page_boundary
Browse files Browse the repository at this point in the history

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Dec 3, 2024
1 parent 671a64a commit c31e0ac
Show file tree
Hide file tree
Showing 13 changed files with 468 additions and 94 deletions.
17 changes: 12 additions & 5 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,19 @@ PYBIND11_MODULE(docling_parse, m) {
)

.def("parse_pdf_from_key",
pybind11::overload_cast<std::string>(&docling::docling_parser_v2::parse_pdf_from_key),
//pybind11::overload_cast<std::string>(&docling::docling_parser_v2::parse_pdf_from_key),
&docling::docling_parser_v2::parse_pdf_from_key,
"parse pdf-document using doc-key into json",
pybind11::arg("key")
pybind11::arg("key"),
pybind11::arg("page_boundary") = "crop_box" // media_box
)

.def("parse_pdf_from_key_on_page",
&docling::docling_parser_v2::parse_pdf_from_key_on_page,
"parse specific page in pdf-document using doc-key from path into json",
pybind11::arg("key"),
pybind11::arg("page")
pybind11::arg("page"),
pybind11::arg("page_boundary") = "crop_box" // media_box
)

.def("sanitize_cells",
Expand All @@ -121,7 +124,9 @@ PYBIND11_MODULE(docling_parse, m) {
pybind11::arg("page_dimension"),
pybind11::arg("page_lines"),
pybind11::arg("delta_y0")=1.0,
pybind11::arg("enforce_same_font")=true
pybind11::arg("enforce_same_font")=true,
pybind11::arg("space_width_factor_for_merge")=1.5,
pybind11::arg("space_width_factor_for_merge_with_space")=0.33
)

.def("sanitize_cells_in_bbox",
Expand All @@ -131,6 +136,8 @@ PYBIND11_MODULE(docling_parse, m) {
pybind11::arg("bbox"),
pybind11::arg("iou_cutoff")=0.99,
pybind11::arg("delta_y0")=1.0,
pybind11::arg("enforce_same_font")=true
pybind11::arg("enforce_same_font")=true,
pybind11::arg("space_width_factor_for_merge")=1.5,
pybind11::arg("space_width_factor_for_merge_with_space")=0.33
);
}
76 changes: 57 additions & 19 deletions docling_parse/visualize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import hashlib
import io
import json
import logging
Expand Down Expand Up @@ -47,6 +48,17 @@ def parse_args():
help="Version [v1, v2]",
)

# Restrict page-boundary
parser.add_argument(
"-b",
"--page-boundary",
type=str,
choices=["crop_box", "media_box"],
required=True,
default="crop_box",
help="page-boundary [crop_box, media_box]",
)

# Add an argument for the path to the PDF file
parser.add_argument(
"-i", "--input-pdf", type=str, help="Path to the PDF file", required=True
Expand Down Expand Up @@ -105,6 +117,7 @@ def parse_args():
args.output_dir,
int(args.page),
args.display_text,
args.page_boundary,
)


Expand Down Expand Up @@ -137,8 +150,8 @@ def visualise_v1(

for pi, page in enumerate(doc["pages"]):

H = page["height"]
W = page["width"]
H = page["height"]

# Create a blank white image
img = Image.new("RGB", (round(W), round(H)), "white")
Expand Down Expand Up @@ -243,25 +256,33 @@ def visualise_v2(
output_dir: str,
page_num: int,
display_text: bool,
skip_out_of_bounds: bool = True,
page_boundary: str = "crop_box", # media_box
):

parser = pdf_parser_v2(log_level)
# parser.set_loglevel_with_label(log_level)

doc_key = "key"
hash_obj = hashlib.sha256(str(pdf_path).encode())
doc_key = str(hash_obj.hexdigest())

# doc_key = "key"
logging.info(f"{doc_key}: {pdf_path}")

success = parser.load_document(doc_key, pdf_path)

if success == False:
return

logging.info(f"page_boundary: {page_boundary}")

doc = None

try:
if page_num == -1:
doc = parser.parse_pdf_from_key(doc_key)
doc = parser.parse_pdf_from_key(doc_key, page_boundary)
else:
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num, page_boundary)

except Exception as exc:
logging.info(f"Could not parse pdf-document: {exc}")
doc = None
Expand All @@ -278,8 +299,6 @@ def visualise_v2(
dimension = page[_]["dimension"]
logging.info(f"dimensions: {json.dumps(dimension, indent=2)}")

page_bbox = dimension["bbox"]

cells = page[_]["cells"]["data"]
cells_header = page[_]["cells"]["header"]

Expand All @@ -292,17 +311,13 @@ def visualise_v2(

if PIL_INSTALLED:

# W = dimension["width"]
# H = dimension["height"]
W = dimension["width"]
H = dimension["height"]

W = page_bbox[2]
H = page_bbox[3]
logging.info(f"width: {W}, height: {H}")

# Create a blank white image
# img = Image.new("RGB", (round(W), round(H)), "white")
img = Image.new(
"RGB", (round(page_bbox[2]), round(page_bbox[3])), "white"
)
img = Image.new("RGB", (round(W), round(H)), "white")
draw = ImageDraw.Draw(img)

# Draw each rectangle by connecting its four points
Expand Down Expand Up @@ -387,7 +402,11 @@ def visualise_v2(
width=1,
)

if True: # Crop-box
# draw the crop-box
if page_boundary == "media_box": # Crop-box

page_bbox = dimension["rectangles"]["crop-bbox"]

x0 = page_bbox[0]
y0 = page_bbox[1]
x1 = page_bbox[2]
Expand Down Expand Up @@ -431,12 +450,31 @@ def visualise_v2(

def main():

log_level, version, pdf, interactive, output_dir, page, display_text = parse_args()
(
log_level,
version,
pdf_path,
interactive,
output_dir,
page_num,
display_text,
page_boundary,
) = parse_args()

logging.info(f"page_boundary: {page_boundary}")

if version == "v1":
visualise_v1(log_level, pdf, interactive, output_dir, page, display_text)
visualise_v1(log_level, pdf_path, interactive, output_dir, page, display_text)
elif version == "v2":
visualise_v2(log_level, pdf, interactive, output_dir, page, display_text)
visualise_v2(
log_level=log_level,
pdf_path=pdf_path,
interactive=interactive,
output_dir=output_dir,
page_num=page_num,
display_text=display_text,
page_boundary=page_boundary,
)
else:
return -1

Expand Down
45 changes: 31 additions & 14 deletions src/pybind/docling_parser_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,21 +38,29 @@ namespace docling
nlohmann::json get_annotations(std::string key);
nlohmann::json get_table_of_contents(std::string key);

nlohmann::json parse_pdf_from_key(std::string key);
nlohmann::json parse_pdf_from_key(std::string key, std::string page_boundary);

nlohmann::json parse_pdf_from_key_on_page(std::string key, int page);
nlohmann::json parse_pdf_from_key_on_page(std::string key, int page, std::string page_boundary);

nlohmann::json sanitize_cells(nlohmann::json& original_cells,
nlohmann::json& page_dim,
nlohmann::json& page_lines,
double delta_y0,
bool enforce_same_font);

bool enforce_same_font,
double space_width_factor_for_merge, //=1.5,
double space_width_factor_for_merge_with_space); //=0.33);

nlohmann::json sanitize_cells_in_bbox(nlohmann::json& page,
std::array<double, 4> bbox,
double iou_cutoff,
double delta_y0,
bool enforce_same_font);
bool enforce_same_font,
double space_width_factor_for_merge, //=1.5,
double space_width_factor_for_merge_with_space); //=0.33);

private:

bool verify_page_boundary(std::string page_boundary);

private:

Expand Down Expand Up @@ -275,8 +283,8 @@ namespace docling

return (itr->second)->get_table_of_contents();
}
nlohmann::json docling_parser_v2::parse_pdf_from_key(std::string key)

nlohmann::json docling_parser_v2::parse_pdf_from_key(std::string key, std::string page_boundary)
{
LOG_S(INFO) << __FUNCTION__;

Expand All @@ -288,7 +296,7 @@ namespace docling
}

auto& decoder = itr->second;
decoder->decode_document();
decoder->decode_document(page_boundary);

LOG_S(INFO) << "decoding done for key: " << key;

Expand All @@ -300,7 +308,8 @@ namespace docling
return decoder->get();
}

nlohmann::json docling_parser_v2::parse_pdf_from_key_on_page(std::string key, int page)
nlohmann::json docling_parser_v2::parse_pdf_from_key_on_page(std::string key, int page,
std::string page_boundary)
{
LOG_S(INFO) << __FUNCTION__;

Expand All @@ -314,7 +323,7 @@ namespace docling
auto& decoder = itr->second;

std::vector<int> pages = {page};
decoder->decode_document(pages);
decoder->decode_document(pages, page_boundary);

LOG_S(INFO) << "decoding done for for key: " << key << " and page: " << page;

Expand All @@ -330,7 +339,9 @@ namespace docling
nlohmann::json& json_dim,
nlohmann::json& json_lines,
double delta_y0,
bool enforce_same_font)
bool enforce_same_font,
double space_width_factor_for_merge, //=1.5,
double space_width_factor_for_merge_with_space) //=0.33);
{
pdflib::pdf_resource<pdflib::PAGE_DIMENSION> dim;
dim.init_from(json_dim);
Expand All @@ -342,7 +353,9 @@ namespace docling
cells.init_from(json_cells);

pdflib::pdf_sanitator<pdflib::PAGE_CELLS> sanitizer(dim, lines);
sanitizer.sanitize(cells, delta_y0, enforce_same_font);
sanitizer.sanitize(cells, delta_y0, enforce_same_font,
space_width_factor_for_merge,
space_width_factor_for_merge_with_space);

return cells.get();
}
Expand All @@ -351,7 +364,9 @@ namespace docling
std::array<double, 4> bbox,
double iou_cutoff,
double delta_y0,
bool enforce_same_font)
bool enforce_same_font,
double space_width_factor_for_merge, //=1.5,
double space_width_factor_for_merge_with_space) //=0.33);
{
LOG_S(INFO) << __FUNCTION__
<< ", iou_cutoff: " << iou_cutoff
Expand Down Expand Up @@ -413,7 +428,9 @@ namespace docling
}

pdflib::pdf_sanitator<pdflib::PAGE_CELLS> sanitizer(dim, lines);
sanitizer.sanitize(selected_cells, delta_y0, enforce_same_font);
sanitizer.sanitize(selected_cells, delta_y0, enforce_same_font,
space_width_factor_for_merge,
space_width_factor_for_merge_with_space);

return selected_cells.get();
}
Expand Down
10 changes: 6 additions & 4 deletions src/v2/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ namespace plib
bool parse_file(std::string inp_filename,
std::string out_filename,
nlohmann::json& task,
std::string page_boundary,
bool pretty_print=true);

private:
Expand Down Expand Up @@ -136,7 +137,7 @@ namespace plib
std::ifstream ifs(inp_filename);
if(ifs)
{
parse_file(inp_filename, out_filename, val);
parse_file(inp_filename, out_filename, val, "crop_box");
}
else
{
Expand Down Expand Up @@ -182,7 +183,8 @@ namespace plib
bool parser::parse_file(std::string inp_filename,
std::string out_filename,
nlohmann::json& task,
bool pretty_print)
std::string page_boundary,
bool pretty_print)
{
pdflib::pdf_decoder<pdflib::DOCUMENT> document_decoder(timings);

Expand All @@ -200,12 +202,12 @@ namespace plib

if(task.count("page-numbers")==0)
{
document_decoder.decode_document();
document_decoder.decode_document(page_boundary);
}
else
{
std::vector<int> page_numbers = task["page-numbers"];
document_decoder.decode_document(page_numbers);
document_decoder.decode_document(page_numbers, page_boundary);
}

nlohmann::json json_document = document_decoder.get();
Expand Down
Loading

0 comments on commit c31e0ac

Please sign in to comment.