diff --git a/src/pybind/docling_parser_v2.h b/src/pybind/docling_parser_v2.h index b7e090da..4bd34f78 100644 --- a/src/pybind/docling_parser_v2.h +++ b/src/pybind/docling_parser_v2.h @@ -352,10 +352,12 @@ namespace docling pdflib::pdf_resource cells; cells.init_from(json_cells); - pdflib::pdf_sanitator sanitizer(dim, lines); - sanitizer.sanitize(cells, delta_y0, enforce_same_font, - space_width_factor_for_merge, - space_width_factor_for_merge_with_space); + pdflib::pdf_sanitator sanitizer;//(dim, lines); + sanitizer.sanitize_bbox(cells, delta_y0, enforce_same_font, + space_width_factor_for_merge, + space_width_factor_for_merge_with_space); + + sanitizer.sanitize_text(cells); return cells.get(); } @@ -412,10 +414,6 @@ namespace docling double iou = utils::values::compute_overlap(cells[i].x0, cells[i].y0, cells[i].x1, cells[i].y1, x0, y0, x1, y1); - //LOG_S(INFO) << "cell " << i << " => iou: " << iou; - - //if(x0<=cells[i].x0 and cells[i].x1<=x1 and - //y0<=cells[i].y0 and cells[i].y1<=y1) if(iou>iou_cutoff-1.e-3) { selected_cells.push_back(cells[i]); @@ -427,10 +425,12 @@ namespace docling return sanitized_cells; } - pdflib::pdf_sanitator sanitizer(dim, lines); - sanitizer.sanitize(selected_cells, delta_y0, enforce_same_font, - space_width_factor_for_merge, - space_width_factor_for_merge_with_space); + pdflib::pdf_sanitator sanitizer; + sanitizer.sanitize_bbox(selected_cells, delta_y0, enforce_same_font, + space_width_factor_for_merge, + space_width_factor_for_merge_with_space); + + sanitizer.sanitize_text(selected_cells); return selected_cells.get(); } diff --git a/src/v2/pdf_decoders/page.h b/src/v2/pdf_decoders/page.h index 6890b781..685d9466 100644 --- a/src/v2/pdf_decoders/page.h +++ b/src/v2/pdf_decoders/page.h @@ -466,9 +466,11 @@ namespace pdflib // sanitise the cells { + /* pdf_sanitator sanitator(page_dimension, page_lines); - + */ + pdf_sanitator sanitator; cells = page_cells; double delta_y0=1.0; @@ -476,12 +478,14 @@ namespace pdflib double space_width_factor_for_merge=1.5; double space_width_factor_for_merge_with_space=0.33; - sanitator.sanitize(cells, - delta_y0, - enforce_same_font, - space_width_factor_for_merge, - space_width_factor_for_merge_with_space); - + sanitator.sanitize_bbox(cells, + delta_y0, + enforce_same_font, + space_width_factor_for_merge, + space_width_factor_for_merge_with_space); + + sanitator.sanitize_text(cells); + LOG_S(INFO) << "#-page-cells: " << page_cells.size(); LOG_S(INFO) << "#-sani-cells: " << cells.size(); } diff --git a/src/v2/pdf_resources/page_cells.h b/src/v2/pdf_resources/page_cells.h index 05b58519..c5053070 100644 --- a/src/v2/pdf_resources/page_cells.h +++ b/src/v2/pdf_resources/page_cells.h @@ -30,6 +30,8 @@ namespace pdflib itr_type end() { return cells.end(); } itr_type erase(itr_type itr) { return cells.erase(itr); } + + pdf_resource& at(std::size_t i) { return cells.at(i); } private: diff --git a/src/v2/pdf_sanitators/cells.h b/src/v2/pdf_sanitators/cells.h index 7a983f8d..1b5a1b92 100644 --- a/src/v2/pdf_sanitators/cells.h +++ b/src/v2/pdf_sanitators/cells.h @@ -11,15 +11,21 @@ namespace pdflib { public: + /* pdf_sanitator(pdf_resource& page_dims_, pdf_resource& page_lines_); + */ + + pdf_sanitator(); ~pdf_sanitator(); - void sanitize(pdf_resource& cells, - double delta_y0, //=1.0, - bool enforce_same_font, //=true, - double space_width_factor_for_merge, //=1.5, - double space_width_factor_for_merge_with_space); //=0.33); + void sanitize_text(pdf_resource& cells); + + void sanitize_bbox(pdf_resource& cells, + double delta_y0, //=1.0, + bool enforce_same_font, //=true, + double space_width_factor_for_merge, //=1.5, + double space_width_factor_for_merge_with_space); //=0.33); private: @@ -36,28 +42,33 @@ namespace pdflib double space_width_factor_for_merge=1.5, double space_width_factor_for_merge_with_space=0.33); - void sanitise_text(); + private: - pdf_resource& page_dims; - pdf_resource& page_lines; + //pdf_resource& page_dims; + //pdf_resource& page_lines; }; + /* pdf_sanitator::pdf_sanitator(pdf_resource& page_dims_, pdf_resource& page_lines_): page_dims(page_dims_), page_lines(page_lines_) {} - + */ + + pdf_sanitator::pdf_sanitator() + {} + pdf_sanitator::~pdf_sanitator() {} - void pdf_sanitator::sanitize(pdf_resource& cells, - double delta_y0, - bool enforce_same_font, - double space_width_factor_for_merge, - double space_width_factor_for_merge_with_space) + void pdf_sanitator::sanitize_bbox(pdf_resource& cells, + double delta_y0, + bool enforce_same_font, + double space_width_factor_for_merge, + double space_width_factor_for_merge_with_space) { contract_cells_into_lines(cells, delta_y0, enforce_same_font, space_width_factor_for_merge, @@ -138,8 +149,6 @@ namespace pdflib double space_width_factor_for_merge, double space_width_factor_for_merge_with_space) { - //const double DELTA_Y0 = 1.0; - std::string font_i = cell_i.font_name; std::string font_j = cell_j.font_name; @@ -152,28 +161,30 @@ namespace pdflib std::string text_j = cell_j.text; int num_chars_i = utils::string::count_unicode_characters(text_i); - int num_chars_j = utils::string::count_unicode_characters(text_j); + //int num_chars_j = utils::string::count_unicode_characters(text_j); double len_i = std::sqrt(std::pow(cell_i.r_x1-cell_i.r_x0, 2) + std::pow(cell_i.r_y1-cell_i.r_y0, 2)); - double len_j = std::sqrt(std::pow(cell_j.r_x1-cell_j.r_x0, 2) + std::pow(cell_j.r_y1-cell_j.r_y0, 2)); + //double len_j = std::sqrt(std::pow(cell_j.r_x1-cell_j.r_x0, 2) + std::pow(cell_j.r_y1-cell_j.r_y0, 2)); double space_width_i = num_chars_i>0? len_i/num_chars_i : 0.0; - double space_width_j = num_chars_j>0? len_j/num_chars_j : 0.0; + //double space_width_j = num_chars_j>0? len_j/num_chars_j : 0.0; double space_width = cell_i.space_width; std::array bbox_i = {cell_i.x0, cell_i.y0, cell_i.x1, cell_i.y1}; std::array bbox_j = {cell_j.x0, cell_j.y0, cell_j.x1, cell_j.y1}; - LOG_S(INFO) << "l-cell: " << std::setw(10) << std::setprecision(3) << std::setfill('0') + /* + LOG_S(INFO) << "l-cell: " << std::setw(10) << std::setprecision(3) //<< std::setfill('0') << "font-sw: " << cell_i.space_width << ", computed sw: " << space_width_i << ", " << "bbox: " << bbox_i[0] << ", " << bbox_i[1] << ", " << bbox_i[2] << ", " << bbox_i[3] << ": " << cell_i.text << ", " << font_i; - LOG_S(INFO) << "r-cell: " << std::setw(10) << std::setprecision(3) << std::setfill('0') + LOG_S(INFO) << "r-cell: " << std::setw(10) << std::setprecision(3) //<< std::setfill('0') << "font-sw: " << cell_j.space_width << ", computed sw: " << space_width_j << ", " << "bbox: " << bbox_j[0] << ", " << bbox_j[1] << ", " << bbox_j[2] << ", " << bbox_j[3] << ": " << cell_j.text << ", " << font_j; - + */ + space_width = space_width_i; if(std::abs(bbox_i[1]-bbox_j[1]) merged without space!"; + //LOG_S(INFO) << " => merged without space!"; cell_i.text += cell_j.text; } else { - LOG_S(INFO) << " => merged with space!"; + //LOG_S(INFO) << " => merged with space!"; cell_i.text += " " + cell_j.text; } return true; } - LOG_S(INFO) << " => not merged"; + //LOG_S(INFO) << " => not merged"; return false; } @@ -236,6 +247,80 @@ namespace pdflib //return false; } */ + + void pdf_sanitator::sanitize_text(pdf_resource& cells) + { + std::vector > replacements = { + {R"(\f_f_i)", "ffi"}, + {R"(\f_f_l)", "ffl"}, + {R"(\f_i)", "fi"}, + {R"(\f_l)", "fl"}, + {R"(\f_f)", "ff"}, + + {R"(f_f_i)", "ffi"}, + {R"(f_f_l)", "ffl"}, + {R"(f_i)", "fi"}, + {R"(f_l)", "fl"}, + {R"(f_f)", "ff"}, + + {"\uFB00", "ff"}, + {"\uFB01", "fi"}, + {"\uFB02", "fl"}, + {"\uFB03", "ffi"}, + {"\uFB04", "ffl"}, + + {"\u2000", " "}, + {"\u2001", " "}, + {"\u2002", " "}, + {"\u2003", " "}, + {"\u2004", " "}, + {"\u2005", " "}, + {"\u2006", " "}, + {"\u2007", " "}, + {"\u2008", " "}, + {"\u2009", " "}, + {"\u200A", " "}, + + {"\u200B", ""}, + {"\u200C", ""}, + {"\u200D", ""}, + {"\u200E", ""}, + {"\u200F", ""}, + + {"\u2010", "-"}, + {"\u2011", "-"}, + {"\u2012", "-"}, + {"\u2013", "-"}, + {"\u2014", "-"}, + {"\u2015", "-"}, + + {"\u2018", "'"}, + {"\u2019", "'"}, + {"\u201A", ","}, + {"\u201B", "'"}, + {"\u201C", "'"}, + {"\u201D", "'"}, + {"\u201E", "'"}, + {"\u201F", "'"}, + + {"\u2212", "-"}, + }; + + for(auto pair:replacements) + { + LOG_S(INFO) << "`" << pair.first << "` => `" << pair.second << "`"; + } + + for(int i=0; i& pair:replacements) + { + utils::string::replace(text, pair.first, pair.second); + } + } + } } diff --git a/src/v2/utils/string.h b/src/v2/utils/string.h index 472ec72d..a29d31bc 100644 --- a/src/v2/utils/string.h +++ b/src/v2/utils/string.h @@ -169,7 +169,31 @@ namespace utils return vec_to_utf8(vec); } + std::string replace(std::string& text, const std::string& word_0, const std::string& word_1) + { + if(word_0==word_1) + { + return text; + } + + std::size_t pos=0; + while(pos