diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd932ab..022a7a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,6 +27,11 @@ jobs: image: ghcr.io/data-house/pdfact:main ports: - 4567:4567 + liteparse: + image: ghcr.io/run-llama/liteparse-server:main + command: "bun run start-slim:bun" + ports: + - 5000:5000 steps: - uses: actions/checkout@v6 @@ -40,7 +45,7 @@ jobs: python-version: ${{ matrix.python }} - name: Pull dependencies - run: uv sync --all-extras --all-groups + run: uv sync --all-extras --all-groups --frozen - name: Create .env file with fake API keys run: | diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7b54fdb..312684e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -26,10 +26,10 @@ jobs: python-version: 3.13 - name: Ruff lint - run: uv run ruff check --exit-zero . + run: uv run --frozen ruff check --exit-zero . - name: Ruff format - run: uv run ruff format . + run: uv run --frozen ruff format . - name: Commit changes uses: stefanzweifel/git-auto-commit-action@v7 diff --git a/.github/workflows/update-docs.yml b/.github/workflows/update-docs.yml index 35a8321..4544207 100644 --- a/.github/workflows/update-docs.yml +++ b/.github/workflows/update-docs.yml @@ -27,7 +27,7 @@ jobs: python-version: 3.13 - name: Install dependencies - run: uv sync + run: uv sync --frozen - name: Generate reference docs run: uv run python scripts/generate_docs.py diff --git a/docs/howto/configure_liteparse.md b/docs/howto/configure_liteparse.md new file mode 100644 index 0000000..1511d84 --- /dev/null +++ b/docs/howto/configure_liteparse.md @@ -0,0 +1,317 @@ +--- +title: Configure LiteParse +description: How to set up the LiteParse driver against a self-hosted LiteParse instance, configure OCR, DPI, and page selection, and override options on a per-document basis. 
+--- + +# Configure LiteParse + +This guide shows you how to configure the [LiteParse](https://www.llamaindex.ai/blog/liteparse-local-document-parsing-for-ai-agents) driver for document processing using a self-hosted [LiteParse](https://github.com/run-llama/liteparse-server) instance. + + +## Quick Start + +### Step 1: Start LiteParse + +Parxy comes with a sample Docker Compose file that includes LiteParse. Generate it in your current directory with: + +```bash +parxy docker +``` + +Then pull the image and start the service: + +```bash +docker compose pull liteparse && docker compose up -d liteparse +``` + +### Step 2: Parse a Document + +```python +from parxy_core.facade.parxy import Parxy + +doc = Parxy.parse("document.pdf", driver_name="liteparse") +print(f"Processed {len(doc.pages)} pages") +``` + +No `.env` configuration is required when LiteParse is running on the default address (`http://localhost:5000`). + +## Configuration Options + +### Environment Variables + +All LiteParse configuration uses environment variables with the `PARXY_LITEPARSE_` prefix: + +#### Connection + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_LITEPARSE_BASE_URL` | string | `http://localhost:5000` | Base URL of the LiteParse server | +| `PARXY_LITEPARSE_TIMEOUT` | float | `30.0` | HTTP request timeout in seconds | + +#### OCR + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_LITEPARSE_OCR_ENABLED` | bool | `true` | Enable OCR on pages that contain no embedded text | +| `PARXY_LITEPARSE_OCR_LANGUAGE` | string | `en` | Language code for OCR (e.g. 
`en`, `de`, `fr`) | +| `PARXY_LITEPARSE_OCR_SERVER_URL` | string | None | URL of an external HTTP OCR service; when set, LiteParse delegates OCR over HTTP instead of using in-process Tesseract | +| `PARXY_LITEPARSE_NUM_WORKERS` | int | `4` | Number of pages to OCR in parallel | + +#### Processing + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_LITEPARSE_MAX_PAGES` | int | None | Maximum number of pages to process (all pages when unset) | +| `PARXY_LITEPARSE_DPI` | int | `150` | DPI used when rasterising pages for OCR | + +#### Features + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_LITEPARSE_PRECISE_BOUNDING_BOX` | bool | `true` | Use precise bounding-box calculation | +| `PARXY_LITEPARSE_PRESERVE_VERY_SMALL_TEXT` | bool | `false` | Include very small text that would normally be filtered out | +| `PARXY_LITEPARSE_PRESERVE_LAYOUT_ALIGNMENT_ACROSS_PAGES` | bool | `false` | Preserve cross-page layout alignment | + +### Example `.env` file + +```bash +PARXY_LITEPARSE_BASE_URL=http://liteparse-server:5000 +PARXY_LITEPARSE_TIMEOUT=60 +PARXY_LITEPARSE_OCR_LANGUAGE=de +PARXY_LITEPARSE_DPI=200 +``` + +## Supported Extraction Levels + +| Level | Description | +|-------|-------------| +| `page` | Page-level text only — all text items concatenated per page | +| `block` | Page + individual `TextBlock` items with bounding boxes and font metadata, one per text item returned by LiteParse | + +```python +# Page-level extraction (default) +doc = Parxy.parse("document.pdf", driver_name="liteparse", level="page") + +# Block-level extraction +doc = Parxy.parse("document.pdf", driver_name="liteparse", level="block") +``` + +## Bounding Boxes and Style + +At `block` level each `TextBlock` includes the exact position and font information reported by LiteParse: + +```python +doc = Parxy.parse("document.pdf", driver_name="liteparse", level="block") + +for page in doc.pages: + 
print(f"Page {page.number}: {page.width} x {page.height}") + if page.blocks: + for block in page.blocks: + if block.bbox: + print( + f" [{block.bbox.x0:.1f}, {block.bbox.y0:.1f}] " + f"→ [{block.bbox.x1:.1f}, {block.bbox.y1:.1f}] '{block.text}'" + ) + if block.style: + print(f" font: {block.style.font_name}, size: {block.style.font_size}") +``` + +Each page also exposes `source_data` with the raw LiteParse response for that page (including the full `textItems` array), which is useful when you need fields not mapped to the Parxy model: + +```python +for page in doc.pages: + raw = page.source_data # original LiteParse page JSON + if raw: + for item in raw.get("textItems", []): + print(item["confidence"]) # OCR confidence score (0–1) +``` + +## Input Types + +### Local Files + +```python +doc = Parxy.parse("/path/to/document.pdf", driver_name="liteparse") +``` + +### Bytes / BytesIO + +```python +import io + +with open("document.pdf", "rb") as f: + data = io.BytesIO(f.read()) + +doc = Parxy.parse(data, driver_name="liteparse") +``` + +## Per-Call Configuration Overrides + +Any `LiteParseConfig` field can be overridden for a single call by passing it as a keyword argument to `Parxy.parse()`. 
+ +```python +from parxy_core.facade.parxy import Parxy + +# Default configuration +doc1 = Parxy.parse("report.pdf", driver_name="liteparse") + +# Higher DPI for a document with small text +doc2 = Parxy.parse( + "small-text-report.pdf", + driver_name="liteparse", + dpi=300, +) + +# German OCR for a specific document +doc3 = Parxy.parse( + "german-contract.pdf", + driver_name="liteparse", + ocr_language="de", +) + +# Extract only the first three pages +doc4 = Parxy.parse( + "large-report.pdf", + driver_name="liteparse", + target_pages="1,2,3", +) + +# Open a password-protected PDF +doc5 = Parxy.parse( + "protected.pdf", + driver_name="liteparse", + password="s3cr3t", +) +``` + +### Supported Per-Call Options + +All `LiteParseConfig` fields (see [environment variables](#environment-variables) above) can be passed as snake_case keyword arguments. In addition: + +| Option | Type | Description | +|--------|------|-------------| +| `target_pages` | string | Comma-separated 1-based page numbers to extract (e.g. `"1,3,5"`). Useful for sampling or previewing large documents without processing all pages | +| `password` | string | Password for encrypted PDF documents | + +## Use Cases + +### Scanned Documents + +LiteParse uses Tesseract OCR by default for pages without embedded text. 
Use `block` level to also get per-word bounding boxes and OCR confidence scores: + +```python +doc = Parxy.parse( + "scanned-invoice.pdf", + driver_name="liteparse", + level="block", + ocr_enabled=True, + dpi=300, # higher DPI improves OCR accuracy on small text +) + +for page in doc.pages: + if page.blocks: + for block in page.blocks: + raw_item = next( + (i for i in (page.source_data or {}).get("textItems", []) if i.get("str") == block.text), + None, + ) + confidence = raw_item["confidence"] if raw_item else None + print(f"{block.text!r} confidence={confidence}") +``` + +### Non-English Documents + +Set `ocr_language` to the primary language of the document for better OCR accuracy: + +```python +doc = Parxy.parse( + "french-report.pdf", + driver_name="liteparse", + ocr_language="fr", +) +``` + +### Sampling Large Documents + +Process only a subset of pages to preview content or reduce processing time: + +```python +# Preview first and last pages of a 100-page document +doc = Parxy.parse( + "large-document.pdf", + driver_name="liteparse", + target_pages="1,2,99,100", +) +``` + +### Password-Protected PDFs + +```python +doc = Parxy.parse( + "confidential.pdf", + driver_name="liteparse", + password="document-password", +) +``` + +### Using an External OCR Service + +When `ocr_server_url` is set, LiteParse delegates OCR via HTTP instead of running Tesseract in-process. This is useful when OCR is handled by a dedicated service: + +```bash +PARXY_LITEPARSE_OCR_SERVER_URL=http://ocr-service:8080 +``` + +## Troubleshooting + +### Connection Errors + +If you see `Could not connect to LiteParse service`: + +1. Verify LiteParse is running: `curl http://localhost:5000` +2. Check that `PARXY_LITEPARSE_BASE_URL` matches the actual address +3. 
Ensure no firewall or network policy blocks port 5000 + +### Timeout Errors + +For large documents or slow hardware, increase the default timeout: + +```bash +PARXY_LITEPARSE_TIMEOUT=120 +``` + +Or set it programmatically on a dedicated driver instance: + +```python +from parxy_core.models.config import LiteParseConfig +from parxy_core.drivers import LiteParseDriver + +driver = LiteParseDriver(config=LiteParseConfig(timeout=120)) +doc = driver.parse("large-document.pdf") +``` + +### Invalid Base URL + +The driver validates the base URL on startup: + +```python +# Raises ValueError: Invalid base URL +from parxy_core.models.config import LiteParseConfig +from parxy_core.drivers import LiteParseDriver + +LiteParseDriver(config=LiteParseConfig(base_url="not-a-url")) +``` + +### Poor OCR Quality + +If extracted text looks garbled: + +1. Increase DPI (`dpi=300`) for documents with small or dense text +2. Set the correct `ocr_language` for the document's language +3. Enable `preserve_very_small_text=True` if small annotations are missing + +## See Also + +- [LiteParse documentation](https://developers.llamaindex.ai/liteparse/) +- [LiteParse Server Repository](https://github.com/run-llama/liteparse-server) +- [Getting Started Tutorial](../tutorials/getting_started.md) diff --git a/docs/supported_services.md b/docs/supported_services.md index c9c6fb0..85b9ad9 100644 --- a/docs/supported_services.md +++ b/docs/supported_services.md @@ -20,6 +20,7 @@ Parxy supports the following document processing services and libraries. The **E | [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ | | [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | Preview | `pdfminer` | ✅ | ✅ | | [**Docling**](https://docling-project.github.io/docling/) | Preview | `docling` | ✅ | ✅ | +| [**LiteParse**](https://github.com/run-llama/liteparse) | Preview | `liteparse` | ✅ | ✅ | Status meanings: **Live** = stable; **Preview** = functional but the API may change. 
diff --git a/pytest.ini b/pytest.ini index 4a820ef..f85f960 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,5 +2,6 @@ filterwarnings = ignore:.*Swig.* ignore:.*no current event loop.* + ignore::DeprecationWarning:docling norecursedirs = .git worktrees parxy.worktrees diff --git a/src/parxy_cli/compose.example.yaml b/src/parxy_cli/compose.example.yaml index 5f0d63e..e0f6481 100644 --- a/src/parxy_cli/compose.example.yaml +++ b/src/parxy_cli/compose.example.yaml @@ -8,6 +8,16 @@ services: networks: - parxy + ## LiteParse service + liteparse: + image: "ghcr.io/run-llama/liteparse-server:main" + command: ["bun", "run", "start-slim:bun"] # Disable instrumentation + ports: + - "5000:5000" + networks: + - parxy + + ## Docling Service docling: image: ghcr.io/docling-project/docling-serve-cu128:v1.18.0 restart: unless-stopped diff --git a/src/parxy_core/drivers/__init__.py b/src/parxy_core/drivers/__init__.py index 110555b..eb215d6 100644 --- a/src/parxy_core/drivers/__init__.py +++ b/src/parxy_core/drivers/__init__.py @@ -14,3 +14,4 @@ from parxy_core.drivers.pdfplumber import PDFPlumberDriver as PDFPlumberDriver from parxy_core.drivers.pdfminer import PDFMinerDriver as PDFMinerDriver from parxy_core.drivers.docling import DoclingDriver as DoclingDriver +from parxy_core.drivers.liteparse import LiteParseDriver as LiteParseDriver diff --git a/src/parxy_core/drivers/factory.py b/src/parxy_core/drivers/factory.py index af05ed5..2fb2373 100644 --- a/src/parxy_core/drivers/factory.py +++ b/src/parxy_core/drivers/factory.py @@ -13,12 +13,14 @@ from parxy_core.drivers.pdfplumber import PDFPlumberDriver from parxy_core.drivers.pdfminer import PDFMinerDriver from parxy_core.drivers.docling import DoclingDriver +from parxy_core.drivers.liteparse import LiteParseDriver from parxy_core.models import ( PdfActConfig, LandingAIConfig, LlamaParseConfig, LlmWhispererConfig, UnstructuredLocalConfig, + LiteParseConfig, ParxyConfig, DoclingConfig, ) @@ -226,6 +228,9 @@ def 
def _to_camel_case(name: str) -> str:
    """Convert a snake_case name to camelCase.

    The first segment is kept unchanged and every following segment is
    title-cased, e.g. ``ocr_server_url`` -> ``ocrServerUrl``.
    """
    parts = name.split('_')
    return parts[0] + ''.join(p.title() for p in parts[1:])


def _map_text_block(item: dict, page_number: int) -> TextBlock:
    """Map one LiteParse ``textItems`` entry to a Parxy ``TextBlock``.

    Args:
        item: Text item dict. ``x``, ``y``, ``width`` and ``height`` are
            required; ``str``, ``fontName`` and ``fontSize`` are optional.
        page_number: Page number stored on the resulting block.

    Returns:
        A ``TextBlock`` whose bounding box spans ``(x, y)`` to
        ``(x + width, y + height)``. ``style`` is ``None`` when neither a
        font name nor a font size is present.

    Raises:
        KeyError: If one of the required geometry keys is missing.
    """
    x, y, w, h = item['x'], item['y'], item['width'], item['height']
    style = Style(
        font_name=item.get('fontName'),
        font_size=item.get('fontSize'),
    )
    return TextBlock(
        type='text',
        text=item.get('str', ''),
        page=page_number,
        bbox=BoundingBox(x0=x, y0=y, x1=x + w, y1=y + h),
        style=style if style.font_name or style.font_size else None,
    )


def _map_page(p: dict, level: str) -> Page:
    """Map one LiteParse page dict to a Parxy ``Page``.

    Args:
        p: Page dict from the LiteParse response; ``pageNum`` is required.
        level: Extraction level. Blocks are built from ``textItems`` only
            when this is ``'block'``; otherwise ``blocks`` is ``None``.

    Returns:
        A ``Page`` that also keeps the raw page dict in ``source_data``.
    """
    page_number = p['pageNum']
    blocks = (
        [_map_text_block(item, page_number) for item in p.get('textItems', [])]
        if level == 'block'
        else None
    )
    return Page(
        number=page_number,
        width=p.get('width'),
        height=p.get('height'),
        text=p.get('text', ''),
        blocks=blocks,
        source_data=p,
    )


class LiteParseDriver(Driver):
    """PDF/document parser using the self-hosted LiteParse HTTP service.

    Calls POST /parse on the LiteParse server and maps the ParsedPage
    response array to the Parxy Document model.
    """

    supported_levels = ['page', 'block']

    _config: LiteParseConfig

    def _initialize_driver(self):
        """Validate the driver prerequisites and the configured base URL.

        Raises:
            ImportError: If ``httpx`` is not available.
            ValueError: If ``base_url`` is not a valid URL.
        """
        # NOTE(review): httpx is imported unconditionally at module level, so
        # this guard looks unreachable unless the import becomes optional.
        if httpx is None:
            raise ImportError(
                'httpx is required. Install with: pip install parxy[liteparse]'
            )

        if validators.url(self._config.base_url, simple_host=True) is not True:
            raise ValueError(
                f'Invalid base URL. Expected URL, found [{self._config.base_url}].'
            )

        return self

    def _build_parse_config(
        self,
        target_pages: str | None = None,
        password: str | None = None,
        **overrides,
    ) -> str:
        """Serialize parse config fields to camelCase JSON for the LiteParse API.

        Args:
            target_pages: Optional page selection forwarded as ``targetPages``.
            password: Optional document password forwarded as ``password``.
            **overrides: Per-call values replacing the configured ones. Keys
                that are not request-level config fields are ignored.

        Returns:
            JSON string with ``None`` values removed and ``outputFormat``
            always set to ``'json'``.
        """
        # base_url and timeout configure the HTTP client, not the request body.
        data = self._config.model_dump(exclude={'base_url', 'timeout'})
        data.update({k: v for k, v in overrides.items() if k in data})
        result = {_to_camel_case(k): v for k, v in data.items() if v is not None}
        result['outputFormat'] = 'json'
        if target_pages is not None:
            result['targetPages'] = target_pages
        if password is not None:
            result['password'] = password
        return json.dumps(result)

    def _handle(
        self,
        file: str | io.BytesIO | bytes,
        level: str = 'page',
        target_pages: str | None = None,
        password: str | None = None,
        **kwargs,
    ) -> Document:
        """Send the document to LiteParse and map the response to a ``Document``.

        Args:
            file: Path, bytes or ``BytesIO`` of the document to parse.
            level: Extraction level, ``'page'`` or ``'block'``.
            target_pages: Optional page selection for this call.
            password: Optional password for this call.
            **kwargs: Any ``LiteParseConfig`` field except ``base_url`` is
                treated as a per-call override; the remaining keyword
                arguments are forwarded to the tracing span.

        Raises:
            ParsingException: If the service is unreachable or answers with
                a status other than 200 or 429.
            RateLimitException: If the service answers with HTTP 429.
        """

        filename, stream = self.handle_file_input(file)

        # Separate LiteParse config overrides from tracing/span kwargs
        config_fields = set(LiteParseConfig.model_fields) - {'base_url'}
        config_overrides = {}
        for key in list(kwargs.keys()):
            if key in config_fields:
                config_overrides[key] = kwargs.pop(key)

        # ``timeout`` is a client-side setting that never reaches the request
        # payload. Honour a per-call value here instead of silently dropping it.
        timeout = config_overrides.pop('timeout', None)
        if timeout is None:
            timeout = self._config.timeout

        with self._trace_parse(filename, stream, **kwargs) as span:
            url = urljoin(self._config.base_url.rstrip('/') + '/', 'parse')
            fname = Path(filename).name if filename else 'document.pdf'
            parse_config = self._build_parse_config(
                target_pages=target_pages, password=password, **config_overrides
            )

            try:
                with httpx.Client(timeout=timeout) as client:  # type: ignore[union-attr]
                    response = client.post(
                        url,
                        data={'config': parse_config},
                        files={'file': (fname, stream, 'application/octet-stream')},
                    )
            except httpx.ConnectError as e:  # type: ignore[union-attr]
                raise ParsingException(
                    message=f'Could not connect to LiteParse service at {self._config.base_url}',
                    service='LiteParse',
                ) from e

            if response.status_code == 429:
                raise RateLimitException(
                    message='Rate limit exceeded',
                    service='LiteParse',
                )

            if response.status_code != 200:
                raise ParsingException(
                    message=f'LiteParse service returned HTTP {response.status_code}',
                    service='LiteParse',
                    details={'status_code': response.status_code},
                )

            data = response.json()
            pages = [_map_page(p, level) for p in data.get('pages', [])]

            span.set_attribute('output.pages', len(pages))

            return Document(
                filename=filename,
                pages=pages,
            )
All env variables must start with `parxy_liteparse_`""" + + base_url: str = 'http://localhost:5000' + """Base URL of the LiteParse server.""" + + timeout: float = 30.0 + """HTTP request timeout in seconds. Default 30.""" + + # OCR + ocr_language: Optional[str] = 'en' + """OCR language code (e.g. 'en', 'de'). Default 'en'.""" + + ocr_enabled: Optional[bool] = True + """Enable OCR on bitmap pages. Default True.""" + + ocr_server_url: Optional[str] = None + """If set, delegates OCR to an external HTTP OCR service instead of in-process Tesseract.""" + + num_workers: Optional[int] = 4 + """Number of pages to OCR in parallel. Default 4.""" + + # Processing + max_pages: Optional[int] = None + """Maximum number of pages to process. Default None (all pages).""" + + dpi: Optional[int] = 150 + """Rendering DPI for rasterised pages. Default 150.""" + + # Features + precise_bounding_box: Optional[bool] = True + """Use precise bounding-box calculation. Default True.""" + + preserve_very_small_text: Optional[bool] = False + """Include very small text that would normally be filtered. Default False.""" + + preserve_layout_alignment_across_pages: Optional[bool] = False + """Preserve cross-page layout alignment. 
Default False.""" + + model_config = SettingsConfigDict( + env_prefix='parxy_liteparse_', env_file='.env', extra='ignore' + ) diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index 9765c6c..8d03072 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -13,6 +13,24 @@ class BoundingBox(BaseModel): x1: float y1: float + @property + def width(self) -> float: + return self.x1 - self.x0 + + @property + def height(self) -> float: + return self.y1 - self.y0 + + def to_pixels(self, dpi: float = 150) -> 'BoundingBox': + """Return a new BoundingBox scaled from PDF points to pixels at the given DPI.""" + scale = dpi / 72 + return BoundingBox( + x0=self.x0 * scale, + y0=self.y0 * scale, + x1=self.x1 * scale, + y1=self.y1 * scale, + ) + class Style(BaseModel): font_name: Optional[str] = None diff --git a/tests/drivers/test_liteparse.py b/tests/drivers/test_liteparse.py new file mode 100644 index 0000000..4f61094 --- /dev/null +++ b/tests/drivers/test_liteparse.py @@ -0,0 +1,598 @@ +import json +import os +import pytest +import httpx +from unittest.mock import Mock, patch, MagicMock + +from parxy_core.models import Page +from parxy_core.models.models import TextBlock, BoundingBox +from parxy_core.models.config import LiteParseConfig +from parxy_core.drivers import LiteParseDriver +from parxy_core.exceptions import ( + FileNotFoundException, + ParsingException, + RateLimitException, +) + + +def _liteparse_service_available(base_url: str = 'http://localhost:5000') -> bool: + try: + with httpx.Client(timeout=2.0) as client: + client.get(base_url) + return True + except Exception: + return False + + +def _make_mock_client(status_code: int, json_body: dict) -> MagicMock: + mock_response = MagicMock() + mock_response.status_code = status_code + mock_response.json.return_value = json_body + + mock_client = MagicMock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + 
mock_client.post.return_value = mock_response + return mock_client + + +class TestLiteParseDriver: + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def test_liteparse_driver_can_be_created(self): + driver = LiteParseDriver(config=LiteParseConfig()) + + assert driver.supported_levels == ['page', 'block'] + + def test_liteparse_driver_unrecognized_level_handled(self): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('empty-doc.pdf') + + with pytest.raises(ValueError) as excinfo: + driver.parse(path, level='custom') + + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + + def test_liteparse_driver_handle_not_existing_file(self): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path) + + def test_liteparse_driver_read_empty_document_page_level(self): + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': '1', + 'textItems': [], + }, + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.outline is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].number == 1 + assert document.pages[0].width == 612.0 + assert document.pages[0].height == 792.0 + assert document.pages[0].blocks is None + assert document.pages[0].text == '1' + + def 
test_liteparse_driver_read_document_block_level(self): + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': 'Revenue grew 15%\nCosts stayed flat', + 'textItems': [ + { + 'str': 'Revenue grew 15%', + 'x': 72.0, + 'y': 200.0, + 'width': 150.0, + 'height': 12.0, + 'w': 150.0, + 'h': 12.0, + 'r': 0, + 'fontName': 'Arial', + 'fontSize': 12.0, + 'confidence': 1, + }, + { + 'str': 'Costs stayed flat', + 'x': 72.0, + 'y': 220.0, + 'width': 130.0, + 'height': 12.0, + 'w': 130.0, + 'h': 12.0, + 'r': 0, + 'fontName': 'Arial', + 'fontSize': 10.0, + 'confidence': 1, + }, + ], + } + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + page = document.pages[0] + assert page.blocks is not None + assert len(page.blocks) == 2 + + block = page.blocks[0] + assert isinstance(block, TextBlock) + assert block.text == 'Revenue grew 15%' + assert block.page == 1 + assert isinstance(block.bbox, BoundingBox) + assert block.bbox.x0 == 72.0 + assert block.bbox.y0 == 200.0 + assert block.bbox.x1 == 222.0 # 72 + 150 + assert block.bbox.y1 == 212.0 # 200 + 12 + assert block.style is not None + assert block.style.font_name == 'Arial' + assert block.style.font_size == 12.0 + + def test_liteparse_driver_page_level_has_no_blocks(self): + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': 'Hello', + 'textItems': [ + { + 'str': 'Hello', + 'x': 10, + 'y': 10, + 'width': 50, + 'height': 12, + 'w': 50, + 'h': 12, + 'r': 0, + 'fontName': 'Arial', + 'fontSize': 12, + 'confidence': 1, + }, + ], + } + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = 
LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document.pages[0].blocks is None + + def test_liteparse_driver_read_document(self): + expected_text = ( + 'This is the header\n' + 'This is a test PDF to be used as input in unit\n' + 'tests\n' + 'This is a heading 1\n' + 'This is a paragraph below heading 1' + ) + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': expected_text, + 'textItems': [], + }, + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].number == 1 + assert document.pages[0].blocks is None + assert document.pages[0].text == expected_text + + def test_liteparse_driver_rate_limit_raises_exception(self): + mock_client = _make_mock_client(status_code=429, json_body={}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + + with pytest.raises(RateLimitException): + driver.parse(path) + + def test_liteparse_driver_http_error_raises_parsing_exception(self): + mock_client = _make_mock_client(status_code=500, json_body={}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + + with pytest.raises(ParsingException): + driver.parse(path) + + def 
test_liteparse_driver_connection_error_raises_parsing_exception(self): + import httpx + + mock_client = MagicMock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.side_effect = httpx.ConnectError('Connection refused') + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + + with pytest.raises(ParsingException): + driver.parse(path) + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_liteparse_driver_tracing_span_created(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': '1', + 'textItems': [], + } + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('empty-doc.pdf') + driver.parse(path, level='page') + + mock_tracer.span.assert_called() + span_calls = mock_tracer.span.call_args_list + doc_processing_call = [ + c for c in span_calls if c[0][0] == 'document-processing' + ][0] + assert doc_processing_call[1]['driver'] == 'LiteParseDriver' + assert doc_processing_call[1]['level'] == 'page' + + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.processed' + assert count_call[1]['driver'] == 'LiteParseDriver' + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_liteparse_driver_tracing_exception_recorded(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = 
Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level='page') + + mock_tracer.error.assert_called_once() + assert mock_tracer.error.call_args[0][0] == 'Parsing failed' + mock_tracer.count.assert_called_once() + + def test_liteparse_driver_records_elapsed_time(self): + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': '1', + 'textItems': [], + } + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + assert document.parsing_metadata is not None + assert 'driver_elapsed_time' in document.parsing_metadata + assert isinstance(document.parsing_metadata['driver_elapsed_time'], float) + assert document.parsing_metadata['driver_elapsed_time'] > 0 + + def test_liteparse_driver_custom_base_url(self): + driver = LiteParseDriver( + config=LiteParseConfig(base_url='http://my-server:8080') + ) + + assert driver._config.base_url == 'http://my-server:8080' + + def test_liteparse_driver_posts_to_correct_url(self): + mock_client = _make_mock_client( + status_code=200, + json_body={ + 'pages': [ + { + 'pageNum': 1, + 'width': 612.0, + 'height': 792.0, + 'text': '1', + 'textItems': [], + } + ] + }, + ) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver( + config=LiteParseConfig(base_url='http://my-server:8080') + ) + path = self.__fixture_path('empty-doc.pdf') + driver.parse(path, level='page') + + call_url = 
mock_client.post.call_args[0][0] + assert call_url == 'http://my-server:8080/parse' + + def test_liteparse_driver_sends_config_json_to_api(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + driver.parse(self.__fixture_path('empty-doc.pdf'), level='page') + + call_kwargs = mock_client.post.call_args.kwargs + assert 'data' in call_kwargs + assert 'config' in call_kwargs['data'] + config = json.loads(call_kwargs['data']['config']) + assert isinstance(config, dict) + + def test_liteparse_driver_config_fields_are_camel_case(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig(ocr_language='de', dpi=300)) + driver.parse(self.__fixture_path('empty-doc.pdf'), level='page') + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert 'ocrLanguage' in config + assert config['ocrLanguage'] == 'de' + assert 'numWorkers' in config + assert 'preciseBoundingBox' in config + assert config['dpi'] == 300 + + def test_liteparse_driver_output_format_always_json(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + driver.parse(self.__fixture_path('empty-doc.pdf'), level='page') + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert config['outputFormat'] == 'json' + + def test_liteparse_driver_none_config_fields_excluded(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = 
LiteParseDriver(config=LiteParseConfig()) + driver.parse(self.__fixture_path('empty-doc.pdf'), level='page') + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert 'ocrServerUrl' not in config + assert 'targetPages' not in config + assert 'password' not in config + + def test_liteparse_driver_kwarg_overrides_config_value(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig(dpi=150)) + driver.parse(self.__fixture_path('empty-doc.pdf'), level='page', dpi=300) + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert config['dpi'] == 300 + + def test_liteparse_driver_target_pages_and_password_are_per_request(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + driver.parse( + self.__fixture_path('empty-doc.pdf'), + level='page', + target_pages='1,2', + password='s3cr3t', + ) + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert config['targetPages'] == '1,2' + assert config['password'] == 's3cr3t' + + def test_liteparse_driver_multiple_kwarg_overrides(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + driver.parse( + self.__fixture_path('empty-doc.pdf'), + level='page', + ocr_language='fr', + dpi=200, + target_pages='1,2', + preserve_very_small_text=True, + ) + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert config['ocrLanguage'] == 'fr' + assert config['dpi'] == 200 + assert config['targetPages'] == '1,2' + assert 
config['preserveVerySmallText'] is True + + def test_liteparse_driver_kwarg_overrides_do_not_affect_tracing(self): + mock_client = _make_mock_client(status_code=200, json_body={'pages': []}) + + with patch( + 'parxy_core.drivers.liteparse.httpx.Client', return_value=mock_client + ): + driver = LiteParseDriver(config=LiteParseConfig()) + # dpi is a config override; it must not bleed into the span kwargs + driver.parse(self.__fixture_path('empty-doc.pdf'), level='page', dpi=300) + + config = json.loads(mock_client.post.call_args.kwargs['data']['config']) + assert config['dpi'] == 300 + + +@pytest.mark.skipif( + not _liteparse_service_available(), + reason='LiteParse service not available at http://localhost:5000', +) +class TestLiteParseDriverIntegration: + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def test_liteparse_driver_read_document_page_level(self): + expected_text = ( + 'This is the header\n\n' + 'This is a test PDF to be used as input in unit\n' + 'tests\n\n' + 'This is a heading 1\n' + 'This is a paragraph below heading 1\n\n\n\n\n\n\n\n\n\n\n1' + ) + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].number == 1 + assert document.pages[0].width == 612 + assert document.pages[0].height == 792 + assert document.pages[0].blocks is None + assert document.pages[0].text == expected_text + + def test_liteparse_driver_read_document_block_level(self): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + 
assert document is not None + assert len(document.pages) == 1 + assert document.pages[0].number == 1 + + def test_liteparse_driver_returns_parsing_metadata(self): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document.parsing_metadata is not None + assert 'driver_elapsed_time' in document.parsing_metadata + assert isinstance(document.parsing_metadata['driver_elapsed_time'], float) + assert document.parsing_metadata['driver_elapsed_time'] > 0 + + def test_liteparse_driver_target_pages_returns_subset(self): + driver = LiteParseDriver(config=LiteParseConfig()) + path = self.__fixture_path('pdf-headings-images-tables.pdf') + + document_all = driver.parse(path, level='page') + document_first = driver.parse(path, level='page', target_pages='1') + + assert len(document_all.pages) > 1 + assert len(document_first.pages) == 1 + assert document_first.pages[0].number == 1 diff --git a/tests/test_models.py b/tests/test_models.py index 32f44bf..2b6c2c7 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,3 +1,5 @@ +import pytest + from parxy_core.models import ( BoundingBox, Style, @@ -21,6 +23,44 @@ def test_bounding_box(self): assert bbox.x1 == 100.0 assert bbox.y1 == 50.0 + def test_bounding_box_width_and_height(self): + bbox = BoundingBox(x0=10.0, y0=20.0, x1=110.0, y1=70.0) + assert bbox.width == 100.0 + assert bbox.height == 50.0 + + def test_bounding_box_to_pixels_at_72dpi_is_identity(self): + bbox = BoundingBox(x0=72.0, y0=144.0, x1=216.0, y1=288.0) + px = bbox.to_pixels(dpi=72) + assert px.x0 == pytest.approx(72.0) + assert px.y0 == pytest.approx(144.0) + assert px.x1 == pytest.approx(216.0) + assert px.y1 == pytest.approx(288.0) + + def test_bounding_box_to_pixels_scales_correctly(self): + bbox = BoundingBox(x0=72.0, y0=72.0, x1=144.0, y1=144.0) + px = bbox.to_pixels(dpi=144) + scale = 144 / 72 # 2.0 + assert px.x0 == pytest.approx(72.0 * scale) + 
assert px.y0 == pytest.approx(72.0 * scale) + assert px.x1 == pytest.approx(144.0 * scale) + assert px.y1 == pytest.approx(144.0 * scale) + + def test_bounding_box_to_pixels_preserves_width_and_height_ratio(self): + bbox = BoundingBox(x0=0.0, y0=0.0, x1=72.0, y1=36.0) + px = bbox.to_pixels(dpi=150) + scale = 150 / 72 + assert px.width == pytest.approx(bbox.width * scale) + assert px.height == pytest.approx(bbox.height * scale) + + def test_bounding_box_to_pixels_default_dpi_is_150(self): + bbox = BoundingBox(x0=0.0, y0=0.0, x1=72.0, y1=72.0) + assert bbox.to_pixels() == bbox.to_pixels(dpi=150) + + def test_bounding_box_to_pixels_returns_new_instance(self): + bbox = BoundingBox(x0=0.0, y0=0.0, x1=72.0, y1=72.0) + px = bbox.to_pixels(dpi=150) + assert px is not bbox + def test_style(self): style = Style( font_name='Arial',