Commit

docs: introduce docs site (#141)
Signed-off-by: Panos Vagenas <[email protected]>
vagenas authored Oct 14, 2024
1 parent 2b1e72d commit d504432
Showing 25 changed files with 1,324 additions and 574 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -1,6 +1,6 @@
<p align="center">
<a href="https://github.com/ds4sd/docling">
-  <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
+  <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
</a>
</p>

@@ -200,8 +200,8 @@ To see all available options (export formats etc.) run `docling --help`.

### RAG
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb)

## Advanced features

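The notebooks linked above wire Docling into LlamaIndex and LangChain. Purely for orientation, here is a minimal sketch of the Docling side of such a pipeline, built only from the conversion API introduced by the example files in this commit; the chunking is a naive placeholder, not what the notebooks use:

    from pathlib import Path

    from docling.datamodel.document import DocumentConversionInput
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    conv_input = DocumentConversionInput.from_paths([Path("./tests/data/2206.01062.pdf")])

    for conv_res in converter.convert(conv_input):
        markdown = conv_res.render_as_markdown()
        # Placeholder chunking; a real pipeline would use the framework's splitter.
        chunks = [markdown[i : i + 1000] for i in range(0, len(markdown), 1000)]
        # ... feed `chunks` into the LlamaIndex / LangChain indexing step ...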
File renamed without changes
File renamed without changes
105 changes: 105 additions & 0 deletions docs/examples/batch_convert.py
@@ -0,0 +1,105 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))

            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_text())

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_markdown())

            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_doctags())

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
        Path("./tests/data/2203.01017v2.pdf"),
        Path("./tests/data/2305.03393v1.pdf"),
        Path("./tests/data/redp5110.pdf"),
        Path("./tests/data/redp5695.pdf"),
    ]

    # Alternatively, documents can be provided as in-memory streams:
    # buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

    doc_converter = DocumentConverter()

    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input)
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} out of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
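The commented-out lines in main() above hint at stream-based input. A minimal sketch of that variant, assuming DocumentStream is importable from docling.datamodel.base_models:

    from io import BytesIO
    from pathlib import Path

    from docling.datamodel.base_models import DocumentStream  # assumed import location
    from docling.datamodel.document import DocumentConversionInput
    from docling.document_converter import DocumentConverter

    # Read a PDF into memory and wrap it as a named stream.
    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
    docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]

    conv_input = DocumentConversionInput.from_streams(docs)
    conv_results = DocumentConverter().convert(conv_input)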
175 changes: 175 additions & 0 deletions docs/examples/custom_convert.py
@@ -0,0 +1,175 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))

            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_text())

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_markdown())

            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_doctags())

        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return success_count, failure_count


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]

    ###########################################################################

    # The following sections contain combinations of PipelineOptions
    # and PDF backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

    # PyPdfium without EasyOCR
    # ------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # PyPdfium with EasyOCR
    # ---------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # Docling Parse without EasyOCR
    # -----------------------------
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    # Docling Parse with EasyOCR
    # --------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract
    # ----------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract CLI
    # --------------------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    ###########################################################################

    # Define input files
    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input)
    success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} out of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
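Since only one of the configuration blocks above can be active at a time, a small hypothetical helper (not part of the example) could make switching setups less error-prone; it uses only classes already imported in the file:

    from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
    from docling.datamodel.base_models import PipelineOptions
    from docling.datamodel.pipeline_options import TesseractOcrOptions
    from docling.document_converter import DocumentConverter


    def make_converter(pdf_backend=DoclingParseDocumentBackend, ocr=False, use_tesseract=False):
        """Build a DocumentConverter for one of the setups shown above."""
        pipeline_options = PipelineOptions()
        pipeline_options.do_ocr = ocr
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        if use_tesseract:  # otherwise the default OCR engine is used when ocr=True
            pipeline_options.ocr_options = TesseractOcrOptions()
        return DocumentConverter(pipeline_options=pipeline_options, pdf_backend=pdf_backend)


    # e.g.: doc_converter = make_converter(ocr=True, use_tesseract=True)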
85 changes: 85 additions & 0 deletions docs/examples/export_figures.py
@@ -0,0 +1,85 @@
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import (
    AssembleOptions,
    ConversionStatus,
    FigureElement,
    PageElement,
    TableElement,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    # Important: to operate on page images, we must keep them, otherwise the
    # DocumentConverter discards them to free memory.
    # This is done by setting AssembleOptions.images_scale, which also defines
    # the scale of the images. scale=1 corresponds to a standard 72 DPI image.
    assemble_options = AssembleOptions()
    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(assemble_options=assemble_options)

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    success_count = 0
    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export page images
        for page in conv_res.pages:
            page_no = page.page_no + 1
            page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
            with page_image_filename.open("wb") as fp:
                page.image.save(fp, format="PNG")

        # Export figures and tables
        for element, image in conv_res.render_element_images(
            element_types=(FigureElement, TableElement)
        ):
            element_image_filename = (
                output_dir / f"{doc_filename}-element-{element.id}.png"
            )
            with element_image_filename.open("wb") as fp:
                image.save(fp, "PNG")

        success_count += 1

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} out of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
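Note: with IMAGE_RESOLUTION_SCALE = 2.0, pages and elements render at roughly 144 DPI (2 × the 72 DPI base), which is usually crisp enough for figures embedded in docs.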