Skip to content

Commit

Permalink
Mark DoclingPdfParser API as experimental
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Jan 16, 2025
1 parent 229c0c4 commit 132b663
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
6 changes: 6 additions & 0 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Parser for PDF files"""

import hashlib
import warnings
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterator, List, Tuple, Union
Expand Down Expand Up @@ -283,6 +284,11 @@ def __init__(self, loglevel: str = "fatal"):
loglevel (str): Logging level as a string.
One of ['fatal', 'error', 'warning', 'info']
"""
warnings.warn(
"This API is currently experimental and may change in upcoming versions without notice.",
category=UserWarning,
stacklevel=2,
)
self.parser = pdf_parser_v2(level=loglevel)

def set_loglevel(self, loglevel: str):
Expand Down
11 changes: 9 additions & 2 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def test_reference_documents_from_filenames():
) # default: True
assert pdf_doc is not None

# PdfDocument.iterate_pages() will automatically populate pages as they are yielded.
# No need to call PdfDocument.load_all_pages() before.
for page_no, page in pdf_doc.iterate_pages():
print(" -> Page ", page_no, end=" ")
print("has ", len(page.sanitized.cells), "cells.")
Expand All @@ -47,9 +49,11 @@ def test_load_lazy_or_eager():

pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename, lazy=False)

# The lazy doc has no pages populated, the eager one has them.
# The lazy doc has no pages populated, since they have not been iterated yet.
# The eager doc has its pages pre-populated before the first iteration.
assert pdf_doc_case1._pages != pdf_doc_case2._pages

# This method triggers the pre-loading on the lazy document after creation.
pdf_doc_case1.load_all_pages()

# After loading the pages of the lazy doc, the two documents are equal.
Expand All @@ -71,6 +75,9 @@ def test_load_two_distinct_docs():
pdf_doc_case1.load_all_pages()
pdf_doc_case2.load_all_pages()

# The two PdfDocument instances must be non-equal. This confirms
# that no internal state is overwritten by accident when loading more than
# one document with the same DoclingPdfParser instance.
assert pdf_doc_case1._pages != pdf_doc_case2._pages


Expand All @@ -82,7 +89,7 @@ def test_serialize_and_reload():
pdf_doc: PdfDocument = parser.load(path_or_stream=filename, lazy=True)

# TODO: a proper serialization model must still be established for a full PdfDocument

# We can serialize the pages dict the following way.
page_adapter = TypeAdapter(Dict[int, ParsedPage])

json_pages = page_adapter.dump_json(pdf_doc._pages)
Expand Down

0 comments on commit 132b663

Please sign in to comment.