-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Panos Vagenas <[email protected]>
- Loading branch information
Showing
25 changed files
with
1,324 additions
and
574 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes
File renamed without changes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
import json | ||
import logging | ||
import time | ||
from pathlib import Path | ||
from typing import Iterable | ||
|
||
from docling.datamodel.base_models import ConversionStatus | ||
from docling.datamodel.document import ConversionResult, DocumentConversionInput | ||
from docling.document_converter import DocumentConverter | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
|
||
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Export each conversion result to JSON, text, Markdown and doctags files.

    Args:
        conv_results: Results produced by ``DocumentConverter.convert``.
        output_dir: Directory the export files are written to (created if
            missing).

    Returns:
        Tuple ``(success_count, partial_success_count, failure_count)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    # One renderer per target file suffix; replaces four copies of the same
    # open/write boilerplate.
    renderers = (
        (".json", lambda res: json.dumps(res.render_as_dict())),
        (".txt", lambda res: res.render_as_text()),
        (".md", lambda res: res.render_as_markdown()),
        (".doctags", lambda res: res.render_as_doctags()),
    )

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            for suffix, render in renderers:
                with (output_dir / f"{doc_filename}{suffix}").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(render(conv_res))

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count
||
def main():
    """Convert a batch of test PDFs and export each result in four formats.

    Raises:
        RuntimeError: If any document fails to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
        Path("./tests/data/2203.01017v2.pdf"),
        Path("./tests/data/2305.03393v1.pdf"),
        Path("./tests/data/redp5110.pdf"),
        Path("./tests/data/redp5695.pdf"),
    ]

    # Documents can also be supplied as in-memory streams, e.g.:
    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
    # conv_input = DocumentConversionInput.from_streams(docs)

    doc_converter = DocumentConverter()

    # Renamed from `input` so the builtin is not shadowed.
    conv_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conv_input)
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} out of "
            f"{len(input_doc_paths)} documents."
        )
|
||
|
||
# Script entry point: run the example when executed directly.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
import json | ||
import logging | ||
import time | ||
from pathlib import Path | ||
from typing import Iterable | ||
|
||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend | ||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend | ||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions | ||
from docling.datamodel.document import ConversionResult, DocumentConversionInput | ||
from docling.datamodel.pipeline_options import ( | ||
TesseractCliOcrOptions, | ||
TesseractOcrOptions, | ||
) | ||
from docling.document_converter import DocumentConverter | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
|
||
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Write every successful conversion result to four export files.

    Each successfully converted document produces a ``.json``, ``.txt``,
    ``.md`` and ``.doctags`` file named after the input file's stem.

    Args:
        conv_results: Results produced by ``DocumentConverter.convert``.
        output_dir: Target directory, created if it does not exist.

    Returns:
        Tuple ``(success_count, failure_count)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        # Guard clause: anything that is not a full success is counted as a
        # failure and skipped.
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem

        # Export Deep Search document JSON format:
        json_path = output_dir / f"{doc_filename}.json"
        with json_path.open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(conv_res.render_as_dict()))

        # Export Text format:
        txt_path = output_dir / f"{doc_filename}.txt"
        with txt_path.open("w", encoding="utf-8") as fp:
            fp.write(conv_res.render_as_text())

        # Export Markdown format:
        md_path = output_dir / f"{doc_filename}.md"
        with md_path.open("w", encoding="utf-8") as fp:
            fp.write(conv_res.render_as_markdown())

        # Export Document Tags format:
        doctags_path = output_dir / f"{doc_filename}.doctags"
        with doctags_path.open("w", encoding="utf-8") as fp:
            fp.write(conv_res.render_as_doctags())

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return success_count, failure_count
|
||
|
||
def main():
    """Convert a test PDF with a selectable pipeline/backend combination.

    The commented-out sections below show alternative ``PipelineOptions`` and
    PDF backend configurations; uncomment one section at a time to compare
    their output.

    Raises:
        RuntimeError: If any document fails to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

    # PyPdfium without EasyOCR
    # ------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # PyPdfium with EasyOCR
    # ---------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # Docling Parse without EasyOCR
    # -----------------------------
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    # Docling Parse with EasyOCR
    # --------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract
    # ----------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract CLI
    # --------------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    ###########################################################################

    # Define input files (renamed from `input` so the builtin is not shadowed).
    conv_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conv_input)
    success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} out of "
            f"{len(input_doc_paths)} documents."
        )
|
||
|
||
# Script entry point: run the example when executed directly.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import logging | ||
import time | ||
from pathlib import Path | ||
from typing import Tuple | ||
|
||
from docling.datamodel.base_models import ( | ||
AssembleOptions, | ||
ConversionStatus, | ||
FigureElement, | ||
PageElement, | ||
TableElement, | ||
) | ||
from docling.datamodel.document import DocumentConversionInput | ||
from docling.document_converter import DocumentConverter | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
IMAGE_RESOLUTION_SCALE = 2.0 | ||
|
||
|
||
def main():
    """Convert a PDF while keeping page images, then export PNG renderings.

    Writes one PNG per page plus one PNG per figure/table element into
    ``./scratch``.

    Raises:
        RuntimeError: If any document fails to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    # Important: For operating with page images, we must keep them, otherwise
    # the DocumentConverter will destroy them for cleaning up memory.
    # This is done by setting AssembleOptions.images_scale, which also defines
    # the scale of images. scale=1 corresponds to a standard 72 DPI image.
    assemble_options = AssembleOptions()
    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(assemble_options=assemble_options)

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    success_count = 0
    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export page images; page numbers in the filenames are 1-based.
        for page in conv_res.pages:
            page_no = page.page_no + 1
            page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
            with page_image_filename.open("wb") as fp:
                page.image.save(fp, format="PNG")

        # Export figures and tables as individual images.
        for element, image in conv_res.render_element_images(
            element_types=(FigureElement, TableElement)
        ):
            element_image_filename = (
                output_dir / f"{doc_filename}-element-{element.id}.png"
            )
            with element_image_filename.open("wb") as fp:
                image.save(fp, "PNG")

        success_count += 1

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} out of "
            f"{len(input_doc_paths)} documents."
        )
|
||
|
||
# Script entry point: run the example when executed directly.
if __name__ == "__main__":
    main()
Oops, something went wrong.