Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: added more updates to better font-parsing #87

Merged
merged 6 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ externals
build
dist
extlib_*/
tests/data/groundtruth/*.json
scratch_*

# Created by https://www.toptal.com/developers/gitignore/api/python,macos,emacs,cmake,virtualenv
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,emacs,cmake,virtualenv
Expand Down
13 changes: 11 additions & 2 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ class PdfCell(PdfColoredElement):

rect: BoundingRectangle

rect_fontbbox: Optional[BoundingRectangle] = None
rect_capheight: Optional[BoundingRectangle] = None

text: str
orig: str

Expand Down Expand Up @@ -358,6 +361,12 @@ def export_to_textlines(
for cell in self.cells:

line = ""
if add_location:
line += f"({cell.rect.r_x0:03.02f}, {cell.rect.r_y0:03.02f}) "
line += f"({cell.rect.r_x1:03.02f}, {cell.rect.r_y1:03.02f}) "
line += f"({cell.rect.r_x2:03.02f}, {cell.rect.r_y2:03.02f}) "
line += f"({cell.rect.r_x3:03.02f}, {cell.rect.r_y3:03.02f}) "

if add_fontkey:
line += f"{cell.font_key} "

Expand Down Expand Up @@ -453,8 +462,8 @@ def _draw_text_in_bounding_bbox(
width, height = round(x1 - x0), round(y0 - y1)

if width <= 2 or height <= 2:
logging.warning(f"skipping to draw text: {text}")
return img # draw
# logging.warning(f"skipping to draw text (width: {x1-x0}, height: {y1-y0}): {text}")
return img

# Use the default font if no font is provided
if font is None:
Expand Down
2 changes: 1 addition & 1 deletion docling_parse/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def visualise_py(
).show()

lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
print("\n".join(lines))
print("text-lines: \n", "\n".join(lines))

"""
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
Expand Down
63 changes: 53 additions & 10 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -463,18 +463,31 @@ namespace pdflib

std::string pdf_resource<PAGE_FONT>::get_correct_character(uint32_t c)
{
// sometimes, a font has differences-map and a cmap
// Sometimes, a font has differences-map and a cmap
// defined at the same time. So far, it seems that the
// differences should take precedence over the cmap. This
// is however not really clear (eg p 292)
// is however not really clear (eg p 292). Notice also that
// we init the cmap before we init the differences and that the
// differences inherit the content of the cmap. It is a bit
// messy and unclear here.

/*
if(diff_numb_to_char.count(c)>0 and cmap_numb_to_char.count(c)>0)
{
LOG_S(WARNING) << "there might be some confusion here: "
<< "diff["<<c<<"]: " << diff_numb_to_char.at(c) << " "
<< "cmap["<<c<<"]: " << cmap_numb_to_char.at(c);
}
*/

if(diff_initialized and diff_numb_to_char.count(c)>0)
{
return diff_numb_to_char.at(c);
}
else if(cmap_initialized and cmap_numb_to_char.count(c)>0)
{
return cmap_numb_to_char.at(c);
}
}
else if(bfonts.has_corresponding_font(font_name))
{
// check if the font-name is registered as a 'special' font, eg
Expand Down Expand Up @@ -878,6 +891,7 @@ namespace pdflib
}



/*
void pdf_resource<PAGE_FONT>::init_fontfile3()
{
Expand Down Expand Up @@ -911,18 +925,35 @@ namespace pdflib
auto buffer = qpdf_obj.getRawStreamData();

LOG_S(INFO) << "buffer-size: " << buffer->getSize();
LOG_S(INFO) << "buffer: " << buffer->getBuffer();
//LOG_S(INFO) << "buffer: " << buffer->getBuffer();

std::string filename = "fontfile.zip";
std::ofstream outFile(filename, std::ios::binary);
if (!outFile) {
LOG_S(ERROR) << "opening file for writing: " << filename << std::endl;
return;
}

outFile.write(reinterpret_cast<const char*>(buffer->getBuffer()), buffer->getSize());
outFile.close();

if (!outFile) {
LOG_S(ERROR) << "Error occurred while writing to the file: " << filename << std::endl;
} else {
LOG_S(INFO) << "Buffer successfully written to " << filename << std::endl;
}
}

{
auto buffer = qpdf_obj.getStreamData(qpdf_dl_generalized);
auto buffer = qpdf_obj.getStreamData(qpdf_dl_generalized);

LOG_S(INFO) << "buffer-size: " << buffer->getSize();
LOG_S(INFO) << "buffer: " << buffer->getBuffer();
LOG_S(INFO) << "buffer-size: " << buffer->getSize();
//LOG_S(INFO) << "buffer: " << buffer->getBuffer();
}

assert(false);
//assert(false);
}

else if(utils::json::has(keys_0, desc_font))
{
auto qpdf_obj = qpdf_desc_font.getKey("/FontDescriptor").getKey("/FontFile3");
Expand Down Expand Up @@ -963,8 +994,9 @@ namespace pdflib
else
{
LOG_S(WARNING) << "fontfile3 is not a stream ...";
}
}
}

else
{
LOG_S(WARNING) << "no fontfile3 detected ...";
Expand Down Expand Up @@ -1616,6 +1648,7 @@ namespace pdflib
// Create a regex object
std::regex re_01(R"(\/(.+)\.(.+))");
std::regex re_02(R"((\/)?(uni|UNI)([0-9A-Ea-e]{4}))");
std::regex re_03(R"((\/)(g|G)\d+)");

if(utils::json::has(keys, json_font))
{
Expand Down Expand Up @@ -1654,10 +1687,13 @@ namespace pdflib
}
else
{}

LOG_S(INFO) << name << ", in cmap: " << cmap_numb_to_char.count(numb) << ", #-names: " << name_to_descr.size() << ", type: " << subtype;

if(name_to_descr.count(name)==1 and // only for TYPE_3 fonts
if(subtype==TYPE_3 and //name_to_descr.count(name)==1 and // only for TYPE_3 fonts
cmap_numb_to_char.count(numb)==1)
{
LOG_S(WARNING) << "overloading difference from cmap";
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
}

Expand Down Expand Up @@ -1739,6 +1775,13 @@ namespace pdflib
<< diff_numb_to_char[numb]
<< " (from " << name << ")";
}
else if(std::regex_match(name, match, re_03) and cmap_numb_to_char.count(numb)==1) // if the name is of type /g23 or /G23 and we have a match in the cmap
{
LOG_S(WARNING) << "overloading difference from cmap";
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
//diff_numb_to_char[numb] = name;
//LOG_S(ERROR) << "weird differences["<<numb<<"] -> " << name;
}
else
{
diff_numb_to_char[numb] = name;
Expand Down
2 changes: 2 additions & 0 deletions src/v2/pdf_states/text.h
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,8 @@ namespace pdflib
v += values[l];

std::pair<uint32_t, std::string> item(c,v);
LOG_S(INFO) << item.first << ": " << item.second;

result.push_back(item);
}
}
Expand Down
Binary file added tests/data/regression/font_01.pdf
Binary file not shown.
Binary file added tests/data/regression/font_02.pdf
Binary file not shown.
Binary file added tests/data/regression/font_03.pdf
Binary file not shown.
Binary file added tests/data/regression/font_04.pdf
Binary file not shown.
Binary file added tests/data/regression/font_05.pdf
Binary file not shown.
41 changes: 27 additions & 14 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def verify_bitmap_resources(


def verify_cells(
true_cells: List[PdfCell], pred_cells: List[PdfCell], eps: float
true_cells: List[PdfCell], pred_cells: List[PdfCell], eps: float, filename: str
) -> bool:

assert len(true_cells) == len(pred_cells), "len(true_cells)==len(pred_cells)"
Expand All @@ -67,23 +67,28 @@ def verify_cells(
true_cell.ordering == pred_cell.ordering
), "true_cell.ordering == pred_cell.ordering"

assert (
true_cell.text == pred_cell.text
), f"true_cell.text == pred_cell.text => {true_cell.text} == {pred_cell.text} for {filename}"
assert (
true_cell.orig == pred_cell.orig
), f"true_cell.orig == pred_cell.orig => {true_cell.orig} == {pred_cell.orig} for {filename}"

true_rect = true_cell.rect.to_polygon()
pred_rect = pred_cell.rect.to_polygon()

for l in range(0, 4):
assert (
abs(true_rect[l][0] - pred_rect[l][0]) < eps
), f"abs(true_rect[l][0]-pred_rect[l][0])<eps -> abs({true_rect[l][0]}-{pred_rect[l][0]})<{eps}"
), f"abs(true_rect[{l}][0]-pred_rect[{l}][0])<eps -> abs({true_rect[l][0]}-{pred_rect[l][0]})<{eps}"

assert (
abs(true_rect[l][1] - pred_rect[l][1]) < eps
), "abs(true_rect[l][1]-pred_rect[l][1])<eps"
), f"abs(true_rect[{l}][1]-pred_rect[{l}][1])<eps -> abs({true_rect[l][1]}-{pred_rect[l][1]})<{eps} for {filename}"

# print("true-text: ", true_cell.text)
# print("pred-text: ", pred_cell.text)

assert true_cell.text == pred_cell.text, "true_cell.text == pred_cell.text"
assert true_cell.orig == pred_cell.orig, "true_cell.orig == pred_cell.orig"

assert (
true_cell.font_key == pred_cell.font_key
), "true_cell.font_key == pred_cell.font_key"
Expand Down Expand Up @@ -160,23 +165,27 @@ def verify_lines(
return True


def verify_SegmentedPdfPage(true_page: SegmentedPdfPage, pred_page: SegmentedPdfPage):
def verify_SegmentedPdfPage(
true_page: SegmentedPdfPage, pred_page: SegmentedPdfPage, filename: str
):

eps = min(true_page.dimension.width / 100.0, true_page.dimension.height / 100.0)
eps = max(true_page.dimension.width / 100.0, true_page.dimension.height / 100.0)

verify_bitmap_resources(
true_page.bitmap_resources, pred_page.bitmap_resources, eps=eps
)

verify_cells(true_page.cells, pred_page.cells, eps=eps)
verify_cells(true_page.cells, pred_page.cells, eps=eps, filename=filename)

verify_lines(true_page.lines, pred_page.lines, eps=eps)


def verify_ParsedPdfPage(true_page: ParsedPdfPage, pred_page: ParsedPdfPage):
def verify_ParsedPdfPage(
true_page: ParsedPdfPage, pred_page: ParsedPdfPage, filename: str = ""
):

verify_SegmentedPdfPage(true_page.original, pred_page.original)
verify_SegmentedPdfPage(true_page.sanitized, pred_page.sanitized)
verify_SegmentedPdfPage(true_page.original, pred_page.original, filename=filename)
verify_SegmentedPdfPage(true_page.sanitized, pred_page.sanitized, filename=filename)


def test_reference_documents_from_filenames():
Expand Down Expand Up @@ -214,7 +223,9 @@ def test_reference_documents_from_filenames():
print(f"loading from {fname}")

true_page = SegmentedPdfPage.load_from_json(fname)
verify_SegmentedPdfPage(true_page, pred_page.original)
verify_SegmentedPdfPage(
true_page, pred_page.original, filename=pdf_doc_path
)

if True:
rname = os.path.basename(pdf_doc_path)
Expand All @@ -228,7 +239,9 @@ def test_reference_documents_from_filenames():
print(f"loading from {fname}")

true_page = SegmentedPdfPage.load_from_json(fname)
verify_SegmentedPdfPage(true_page, pred_page.sanitized)
verify_SegmentedPdfPage(
true_page, pred_page.sanitized, filename=fname
)

pred_page.original.render()
# res.show()
Expand Down
8 changes: 4 additions & 4 deletions tests/test_parse_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_reference_documents_from_filenames_with_keys():
rname = os.path.basename(pdf_doc)
fname = os.path.join(GROUNDTRUTH_FOLDER, rname + ".v1.json")

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
PeterStaar-IBM marked this conversation as resolved.
Show resolved Hide resolved
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down Expand Up @@ -130,7 +130,7 @@ def test_reference_documents_from_filenames_with_keys_page_by_page():

pred_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down Expand Up @@ -188,7 +188,7 @@ def test_reference_documents_from_bytesio_with_keys():
rname = os.path.basename(pdf_doc)
fname = os.path.join(GROUNDTRUTH_FOLDER, rname + ".v1.json")

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down Expand Up @@ -234,7 +234,7 @@ def test_reference_documents_from_bytesio_with_keys_page_by_page():

pred_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down
8 changes: 4 additions & 4 deletions tests/test_parse_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def test_reference_documents_from_filenames_with_keys():
rname = os.path.basename(pdf_doc)
fname = os.path.join(GROUNDTRUTH_FOLDER, rname + ".v2.json")

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down Expand Up @@ -249,7 +249,7 @@ def test_reference_documents_from_filenames_with_keys_page_by_page():

pred_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down Expand Up @@ -309,7 +309,7 @@ def test_reference_documents_from_bytesio_with_keys():
rname = os.path.basename(pdf_doc)
fname = os.path.join(GROUNDTRUTH_FOLDER, rname + ".v2.json")

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down Expand Up @@ -355,7 +355,7 @@ def test_reference_documents_from_bytesio_with_keys_page_by_page():

pred_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

if GENERATE:
if GENERATE or (not os.path.exists(fname)):
with open(fname, "w") as fw:
fw.write(json.dumps(pred_doc, indent=2))

Expand Down
Loading