Skip to content

Commit 435f071

Browse files
committed
better page count system
1 parent bb475ec commit 435f071

File tree

3 files changed, with 34 additions and 31 deletions.

3 files changed, with 34 additions and 31 deletions.

mindee/input/sources/local_input_source.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class LocalInputSource:
     file_mimetype: str
     input_type: InputType
     filepath: Optional[str]
+    _page_count: Optional[int] = None
 
     def __init__(self, input_type: InputType):
         self.input_type = input_type
@@ -107,11 +108,14 @@ def page_count(self) -> int:
 
         :return: The number of pages.
         """
-        if self.is_pdf():
-            self.file_object.seek(0)
-            pdf = pdfium.PdfDocument(self.file_object)
-            return len(pdf)
-        return 1
+        if self._page_count is None:
+            if self.is_pdf():
+                self.file_object.seek(0)
+                pdf = pdfium.PdfDocument(self.file_object)
+                self._page_count = len(pdf)
+            else:
+                self._page_count = 1
+        return self._page_count
 
     def count_doc_pages(self) -> int:
         """Deprecated. Use ``page_count`` instead."""
@@ -177,6 +181,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
         bytes_io = io.BytesIO()
         new_pdf.save(bytes_io)
         self.file_object = bytes_io
+        self._page_count = len(new_pdf)
 
     def is_pdf_empty(self) -> bool:
         """

tests/input/test_apply_page_options.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def test_pdf_reconstruct_ok():
4242
@pytest.mark.parametrize("numb_pages", [1, 2, 3])
4343
def test_process_pdf_cut_n_pages(numb_pages: int):
4444
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
45+
assert input_source.page_count == 12
4546
input_source.process_pdf(
4647
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages]
4748
)
@@ -52,6 +53,7 @@ def test_process_pdf_cut_n_pages(numb_pages: int):
5253
@pytest.mark.parametrize("numb_pages", [1, 2, 3])
5354
def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
5455
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
56+
assert input_source.page_count == 12
5557
input_source.apply_page_options(
5658
PageOptions(on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages])
5759
)
@@ -61,7 +63,7 @@ def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
6163

6264
def test_pdf_keep_5_first_pages():
6365
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
64-
assert input_source.is_pdf() is True
66+
assert input_source.page_count == 12
6567
input_source.process_pdf(
6668
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, 1, 2, 3, 4]
6769
)
@@ -70,11 +72,11 @@ def test_pdf_keep_5_first_pages():
7072

7173
def test_pdf_keep_invalid_pages():
7274
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
73-
assert input_source.is_pdf() is True
75+
assert input_source.page_count == 12
7476
input_source.process_pdf(
7577
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, 1, 17]
7678
)
77-
assert input_source.count_doc_pages() == 2
79+
assert input_source.page_count == 2
7880

7981

8082
def test_pdf_remove_5_last_pages():
@@ -83,7 +85,7 @@ def test_pdf_remove_5_last_pages():
8385
input_source.process_pdf(
8486
behavior=REMOVE, on_min_pages=2, page_indexes=[-5, -4, -3, -2, -1]
8587
)
86-
assert input_source.count_doc_pages() == 7
88+
assert input_source.page_count == 7
8789

8890

8991
def test_pdf_remove_5_first_pages():
@@ -92,14 +94,14 @@ def test_pdf_remove_5_first_pages():
9294
input_source.process_pdf(
9395
behavior=REMOVE, on_min_pages=2, page_indexes=list(range(5))
9496
)
95-
assert input_source.count_doc_pages() == 7
97+
assert input_source.page_count == 7
9698

9799

98100
def test_pdf_remove_invalid_pages():
99101
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
100102
assert input_source.is_pdf() is True
101103
input_source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=[16])
102-
assert input_source.count_doc_pages() == 12
104+
assert input_source.page_count == 12
103105

104106

105107
def test_pdf_keep_no_pages():
@@ -129,23 +131,23 @@ def test_pdf_input_from_file():
129131
input_source = FileInput(fp)
130132
assert input_source.is_pdf() is True
131133
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
132-
assert input_source.count_doc_pages() == 1
134+
assert input_source.page_count == 1
133135

134136

135137
def test_pdf_input_from_base64():
136138
with open(PRODUCT_DATA_DIR / "invoices" / "invoice_10p.txt", "rt") as fp:
137139
input_source = Base64Input(fp.read(), filename="invoice_10p.pdf")
138140
assert input_source.is_pdf() is True
139141
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
140-
assert input_source.count_doc_pages() == 1
142+
assert input_source.page_count == 1
141143

142144

143145
def test_pdf_input_from_bytes():
144146
with open(PRODUCT_DATA_DIR / "invoices" / "invoice_10p.pdf", "rb") as fp:
145147
input_source = BytesInput(fp.read(), filename="invoice_10p.pdf")
146148
assert input_source.is_pdf() is True
147149
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
148-
assert input_source.count_doc_pages() == 1
150+
assert input_source.page_count == 1
149151

150152

151153
def test_pdf_blank_check():
@@ -158,4 +160,4 @@ def test_pdf_blank_check():
158160
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
159161

160162
input_not_blank = PathInput(FILE_TYPES_DIR / "pdf" / "not_blank_image_only.pdf")
161-
assert input_not_blank.count_doc_pages() == 1
163+
assert input_not_blank.page_count == 1

tests/input/test_inputs.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
Base64Input,
99
BytesInput,
1010
FileInput,
11+
LocalInputSource,
1112
PathInput,
1213
UrlInputSource,
1314
)
@@ -57,42 +58,37 @@ def test_pdf_input_from_url():
5758
)
5859

5960

60-
@pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES)
61-
def test_image_input_from_path(filename, mimetype):
62-
input_source = PathInput(FILE_TYPES_DIR / filename)
61+
def _assert_image(input_source: LocalInputSource, mimetype: str) -> None:
6362
assert input_source.file_mimetype == mimetype
6463
assert input_source.is_pdf() is False
6564
assert input_source.page_count == 1
66-
assert isinstance(input_source.file_object, io.BufferedReader)
65+
assert isinstance(input_source.file_object.read(15), bytes)
66+
67+
68+
@pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES)
69+
def test_image_input_from_path(filename, mimetype):
70+
input_source = PathInput(FILE_TYPES_DIR / filename)
71+
_assert_image(input_source, mimetype)
6772

6873

6974
@pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES)
7075
def test_image_input_from_file(filename, mimetype):
7176
with open(FILE_TYPES_DIR / filename, "rb") as fp:
7277
input_source = FileInput(fp)
73-
assert input_source.file_mimetype == mimetype
74-
assert input_source.is_pdf() is False
75-
assert input_source.page_count == 1
76-
assert isinstance(input_source.file_object, io.BufferedReader)
78+
_assert_image(input_source, mimetype)
7779

7880

7981
@pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES)
8082
def test_image_input_from_bytes(filename, mimetype):
8183
file_bytes = open(FILE_TYPES_DIR / filename, "rb").read()
8284
input_source = BytesInput(file_bytes, filename=filename)
83-
assert input_source.file_mimetype == mimetype
84-
assert input_source.is_pdf() is False
85-
assert input_source.page_count == 1
86-
assert isinstance(input_source.file_object, io.BytesIO)
85+
_assert_image(input_source, mimetype)
8786

8887

8988
def test_image_input_from_base64():
9089
base64_input = open(FILE_TYPES_DIR / "receipt.txt", "r").read()
9190
input_source = Base64Input(base64_input, filename="receipt.jpg")
92-
assert input_source.file_mimetype == "image/jpeg"
93-
assert input_source.is_pdf() is False
94-
assert input_source.page_count == 1
95-
assert isinstance(input_source.file_object, io.BytesIO)
91+
_assert_image(input_source, mimetype="image/jpeg")
9692

9793

9894
def test_txt_input_from_path():

0 commit comments

Comments
 (0)