Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: added more updates to better font-parsing #87

Merged
merged 6 commits into from
Jan 27, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@ externals
build
dist
extlib_*/
scratch_*

# Created by https://www.toptal.com/developers/gitignore/api/python,macos,emacs,cmake,virtualenv
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,emacs,cmake,virtualenv
13 changes: 11 additions & 2 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
@@ -154,6 +154,9 @@ class PdfCell(PdfColoredElement):

rect: BoundingRectangle

rect_fontbbox: Optional[BoundingRectangle] = None
rect_capheight: Optional[BoundingRectangle] = None

text: str
orig: str

@@ -358,6 +361,12 @@ def export_to_textlines(
for cell in self.cells:

line = ""
if add_location:
line += f"({cell.rect.r_x0:03.02f}, {cell.rect.r_y0:03.02f}) "
line += f"({cell.rect.r_x1:03.02f}, {cell.rect.r_y1:03.02f}) "
line += f"({cell.rect.r_x2:03.02f}, {cell.rect.r_y2:03.02f}) "
line += f"({cell.rect.r_x3:03.02f}, {cell.rect.r_y3:03.02f}) "

if add_fontkey:
line += f"{cell.font_key} "

@@ -453,8 +462,8 @@ def _draw_text_in_bounding_bbox(
width, height = round(x1 - x0), round(y0 - y1)

if width <= 2 or height <= 2:
logging.warning(f"skipping to draw text: {text}")
return img # draw
# logging.warning(f"skipping to draw text (width: {x1-x0}, height: {y1-y0}): {text}")
return img

# Use the default font if no font is provided
if font is None:
2 changes: 1 addition & 1 deletion docling_parse/visualize.py
Original file line number Diff line number Diff line change
@@ -292,7 +292,7 @@ def visualise_py(
).show()

lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
print("\n".join(lines))
print("text-lines: \n", "\n".join(lines))

"""
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
63 changes: 53 additions & 10 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
@@ -463,18 +463,31 @@ namespace pdflib

std::string pdf_resource<PAGE_FONT>::get_correct_character(uint32_t c)
{
// sometimes, a font has differences-map and a cmap
// Sometimes, a font has differences-map and a cmap
// defined at the same time. So far, it seems that the
// differences should take precedence over the cmap. This
// is however not really clear (eg p 292)
// is however not really clear (eg p 292). Notice also that
// we init the cmap before we init the difference and that the
// difference inherits the content of the cmap. It is a bit
// messy and unclear here.

/*
if(diff_numb_to_char.count(c)>0 and cmap_numb_to_char.count(c)>0)
{
LOG_S(WARNING) << "there might be some confusion here: "
<< "diff["<<c<<"]: " << diff_numb_to_char.at(c) << " "
<< "cmap["<<c<<"]: " << cmap_numb_to_char.at(c);
}
*/

if(diff_initialized and diff_numb_to_char.count(c)>0)
{
return diff_numb_to_char.at(c);
}
else if(cmap_initialized and cmap_numb_to_char.count(c)>0)
{
return cmap_numb_to_char.at(c);
}
}
else if(bfonts.has_corresponding_font(font_name))
{
// check if the font-name is registered as a 'special' font, eg
@@ -878,6 +891,7 @@ namespace pdflib
}



/*
void pdf_resource<PAGE_FONT>::init_fontfile3()
{
@@ -911,18 +925,35 @@ namespace pdflib
auto buffer = qpdf_obj.getRawStreamData();

LOG_S(INFO) << "buffer-size: " << buffer->getSize();
LOG_S(INFO) << "buffer: " << buffer->getBuffer();
//LOG_S(INFO) << "buffer: " << buffer->getBuffer();

std::string filename = "fontfile.zip";
std::ofstream outFile(filename, std::ios::binary);
if (!outFile) {
LOG_S(ERROR) << "opening file for writing: " << filename << std::endl;
return;
}

outFile.write(reinterpret_cast<const char*>(buffer->getBuffer()), buffer->getSize());
outFile.close();

if (!outFile) {
LOG_S(ERROR) << "Error occurred while writing to the file: " << filename << std::endl;
} else {
LOG_S(INFO) << "Buffer successfully written to " << filename << std::endl;
}
}

{
auto buffer = qpdf_obj.getStreamData(qpdf_dl_generalized);
auto buffer = qpdf_obj.getStreamData(qpdf_dl_generalized);

LOG_S(INFO) << "buffer-size: " << buffer->getSize();
LOG_S(INFO) << "buffer: " << buffer->getBuffer();
LOG_S(INFO) << "buffer-size: " << buffer->getSize();
//LOG_S(INFO) << "buffer: " << buffer->getBuffer();
}

assert(false);
//assert(false);
}

else if(utils::json::has(keys_0, desc_font))
{
auto qpdf_obj = qpdf_desc_font.getKey("/FontDescriptor").getKey("/FontFile3");
@@ -963,8 +994,9 @@ namespace pdflib
else
{
LOG_S(WARNING) << "fontfile3 is not a stream ...";
}
}
}

else
{
LOG_S(WARNING) << "no fontfile3 detected ...";
@@ -1616,6 +1648,7 @@ namespace pdflib
// Create a regex object
std::regex re_01(R"(\/(.+)\.(.+))");
std::regex re_02(R"((\/)?(uni|UNI)([0-9A-Ea-e]{4}))");
std::regex re_03(R"((\/)(g|G)\d+)");

if(utils::json::has(keys, json_font))
{
@@ -1654,10 +1687,13 @@ namespace pdflib
}
else
{}

LOG_S(INFO) << name << ", in cmap: " << cmap_numb_to_char.count(numb) << ", #-names: " << name_to_descr.size() << ", type: " << subtype;

if(name_to_descr.count(name)==1 and // only for TYPE_3 fonts
if(subtype==TYPE_3 and //name_to_descr.count(name)==1 and // only for TYPE_3 fonts
cmap_numb_to_char.count(numb)==1)
{
LOG_S(WARNING) << "overloading difference from cmap";
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
}

@@ -1739,6 +1775,13 @@ namespace pdflib
<< diff_numb_to_char[numb]
<< " (from " << name << ")";
}
else if(std::regex_match(name, match, re_03) and cmap_numb_to_char.count(numb)==1) // if the name is of type /g23 of /G23 and we have a match in the cmap
{
LOG_S(WARNING) << "overloading difference from cmap";
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
//diff_numb_to_char[numb] = name;
//LOG_S(ERROR) << "weird differences["<<numb<<"] -> " << name;
}
else
{
diff_numb_to_char[numb] = name;
2 changes: 2 additions & 0 deletions src/v2/pdf_states/text.h
Original file line number Diff line number Diff line change
@@ -663,6 +663,8 @@ namespace pdflib
v += values[l];

std::pair<uint32_t, std::string> item(c,v);
LOG_S(INFO) << item.first << ": " << item.second;

result.push_back(item);
}
}
Loading