diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..61ed91c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,40 @@ +name: CI + +on: + push: + branches: + - "main" + pull_request: + types: [opened, synchronize, reopened] + + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: Test + runs-on: ubuntu-latest + + strategy: + fail-fast: true + matrix: + python: [3.12, 3.13] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6.7.0 + with: + enable-cache: true + python-version: ${{ matrix.python }} + + - name: Pull dependencies + run: uv sync --all-extras --all-groups + + - name: Execute tests + run: uv run pytest diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..a715cf9 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,35 @@ +name: Lint + +on: + push: + paths: + - '**.py' + - '.github/workflows/lint.yml' + +permissions: + contents: write + +jobs: + lint: + name: Lint Python code + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - uses: astral-sh/setup-uv@v6.7.0 + + - name: Ruff lint + run: uv run ruff check --exit-zero . + + - name: Ruff format + run: uv run ruff format . 
+ + - name: Commit changes + uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: Fix styling + + diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..c5e9e2e --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,69 @@ +name: Publish on PyPI + +on: + push: + branches: + - "main" + release: + types: [created] + +permissions: + contents: read + +jobs: + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6.7.0 + with: + python-version: "3.12" + + - name: Build wheel + run: | + uv build + + # Smoke test the build for packaging errors + - name: Smoke test (wheel) + run: uv run --isolated --no-project --with dist/*.whl tests/smoke_test.py + - name: Smoke test (source distribution) + run: uv run --isolated --no-project --with dist/*.tar.gz tests/smoke_test.py + + - name: Upload build for publishing + uses: actions/upload-artifact@v4 + with: + name: parxy_release + if-no-files-found: error + retention-days: 1 + path: dist/* + + pypi: + name: Upload release to PyPI + runs-on: ubuntu-latest + needs: build + environment: pypi + permissions: + id-token: write + steps: + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6.7.0 + if: github.event_name == 'release' + with: + enable-cache: false + ignore-empty-workdir: true + python-version: "3.12" + + - name: Download build + uses: actions/download-artifact@v4 + with: + name: parxy_release + path: dist + + - name: Publish package distributions to PyPI + if: github.event_name == 'release' + run: uv publish \ No newline at end of file diff --git a/README.md b/README.md index 6211681..f6fb869 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ 
-[![CI](https://github.com/OneOffTech/parxy/actions/workflows/ci.yml/badge.svg)](https://github.com/OneOffTech/parxy/actions/workflows/ci.yml) [![Build Docker Image](https://github.com/OneOffTech/parxy/actions/workflows/docker.yml/badge.svg)](https://github.com/OneOffTech/parxy/actions/workflows/docker.yml) - [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv) +![pypi](https://img.shields.io/pypi/v/parxy.svg) +[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://docs.pydantic.dev/latest/contributing/#badges) [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv) [![CI](https://github.com/OneOffTech/parxy/actions/workflows/ci.yml/badge.svg)](https://github.com/OneOffTech/parxy/actions/workflows/ci.yml) # OneOffTech Parxy @@ -36,13 +36,13 @@ Parxy is available as a standalone command line and a library. The quickest way Use with minimal footprint (fewer drivers supported): ```bash -uvx --from "git+https://github.com/oneofftech/parxy.git" parxy --help +uvx parxy --help ``` Use all supported drivers: ```bash -uvx --from "git+https://github.com/oneofftech/parxy.git[all]" parxy --help +uvx parxy[all] --help ``` See [Supported services](#supported-services) for the list of included drivers and their extras for the installation. 
diff --git a/pyproject.toml b/pyproject.toml index b7fd2a0..3445707 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "parxy-core" +name = "parxy" version = "0.1.0" description = "Parxy document processing gateway" readme = "README.md" diff --git a/src/__main__.py b/src/__main__.py index 4acafff..a03744e 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -1,4 +1,4 @@ from parxy_cli import cli -if __name__ == "__main__": - cli() \ No newline at end of file +if __name__ == '__main__': + cli() diff --git a/src/parxy_cli/cli.py b/src/parxy_cli/cli.py index 12bab4e..b5dd445 100644 --- a/src/parxy_cli/cli.py +++ b/src/parxy_cli/cli.py @@ -8,7 +8,6 @@ import typer from rich import print from rich.console import Console -from rich.table import Table from parxy_core.facade import Parxy @@ -93,7 +92,7 @@ def parse( # Process each file for file_path in files: try: - console.print(f'----') + console.print('----') # Parse the document doc = Parxy.parse( diff --git a/src/parxy_core/drivers/abstract_driver.py b/src/parxy_core/drivers/abstract_driver.py index 3c87cd8..2e871af 100644 --- a/src/parxy_core/drivers/abstract_driver.py +++ b/src/parxy_core/drivers/abstract_driver.py @@ -121,7 +121,7 @@ def parse( except Exception as ex: self._logger.error( - f'Error while parsing file', + 'Error while parsing file', file, self.__class__.__name__, exc_info=True, diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py index 3c2908e..d3ebe67 100644 --- a/src/parxy_core/drivers/llamaparse.py +++ b/src/parxy_core/drivers/llamaparse.py @@ -1,6 +1,5 @@ import io -from typing import TYPE_CHECKING, Dict, Any, Optional, Union -from logging import Logger +from typing import TYPE_CHECKING # Type hints that will be available at runtime when llama_cloud_services is installed if TYPE_CHECKING: @@ -51,7 +50,7 @@ def _initialize_driver(self): except ImportError as e: raise ImportError( 'LlamaParse dependencies not installed. 
' - "Install with 'pip install parxy-core[llama]'" + "Install with 'pip install parxy[llama]'" ) from e self.__client = LlamaParse(**self._config) @@ -123,7 +122,7 @@ def _handle( # For all other errors, raise as parsing exception raise ParsingException(str(ex), self.__class__) from ex - if not res.error is None: + if res.error is not None: raise ParsingException( res.error, self.__class__, res.model_dump(exclude={'file_name'}) ) diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py index d71f380..5516a4b 100644 --- a/src/parxy_core/drivers/llmwhisperer.py +++ b/src/parxy_core/drivers/llmwhisperer.py @@ -2,9 +2,7 @@ import validators -from typing import TYPE_CHECKING, Dict, Any - -from logging import Logger +from typing import TYPE_CHECKING # Type hints that will be available at runtime when llm whisperer is installed @@ -50,7 +48,7 @@ def _initialize_driver(self): except ImportError as e: raise ImportError( 'LlmWhisperer dependencies not installed. ' - "Install with 'pip install parxy-core[llmwhisperer]'" + "Install with 'pip install parxy[llmwhisperer]'" ) from e self.__client = LLMWhispererClientV2(**self._config) diff --git a/src/parxy_core/drivers/pdfact.py b/src/parxy_core/drivers/pdfact.py index f845e02..bc528a9 100644 --- a/src/parxy_core/drivers/pdfact.py +++ b/src/parxy_core/drivers/pdfact.py @@ -1,11 +1,10 @@ import io -from typing import Optional, Dict, Any +from typing import Optional import requests import validators from urllib.parse import urljoin -from logging import Logger from parxy_core.drivers import Driver from parxy_core.models import ( diff --git a/src/parxy_core/drivers/unstructured_local.py b/src/parxy_core/drivers/unstructured_local.py index 3519162..2e0e1ca 100644 --- a/src/parxy_core/drivers/unstructured_local.py +++ b/src/parxy_core/drivers/unstructured_local.py @@ -43,7 +43,7 @@ def _initialize_driver(self): except ImportError as e: raise ImportError( 'Unstructured dependencies not installed. 
' - "Install with 'pip install parxy-core[unstructured_local]'" + "Install with 'pip install parxy[unstructured_local]'" ) from e def _handle( diff --git a/src/parxy_core/exceptions/__init__.py b/src/parxy_core/exceptions/__init__.py index dcd54e7..89a0c95 100644 --- a/src/parxy_core/exceptions/__init__.py +++ b/src/parxy_core/exceptions/__init__.py @@ -1,4 +1,10 @@ -from parxy_core.exceptions.authentication_exception import AuthenticationException as AuthenticationException +from parxy_core.exceptions.authentication_exception import ( + AuthenticationException as AuthenticationException, +) from parxy_core.exceptions.parsing_exception import ParsingException as ParsingException -from parxy_core.exceptions.file_not_found_exception import FileNotFoundException as FileNotFoundException -from parxy_core.exceptions.unsupported_format_exception import UnsupportedFormatException as UnsupportedFormatException +from parxy_core.exceptions.file_not_found_exception import ( + FileNotFoundException as FileNotFoundException, +) +from parxy_core.exceptions.unsupported_format_exception import ( + UnsupportedFormatException as UnsupportedFormatException, +) diff --git a/src/parxy_core/exceptions/authentication_exception.py b/src/parxy_core/exceptions/authentication_exception.py index e2a0e8a..ce10b35 100644 --- a/src/parxy_core/exceptions/authentication_exception.py +++ b/src/parxy_core/exceptions/authentication_exception.py @@ -1,6 +1,6 @@ class AuthenticationException(Exception): """Exception raised for authentication errors. - + This exception should be raised when authentication fails with external services or APIs, such as invalid API keys, expired tokens, or insufficient permissions. 
@@ -51,7 +51,7 @@ def __str__(self) -> str: str Formatted error message including service name and details """ - base_message = f"Authentication failed for {self.service}: {self.message}" + base_message = f'Authentication failed for {self.service}: {self.message}' if self.details: - return f"{base_message}\nDetails: {self.details}" - return base_message \ No newline at end of file + return f'{base_message}\nDetails: {self.details}' + return base_message diff --git a/src/parxy_core/exceptions/file_not_found_exception.py b/src/parxy_core/exceptions/file_not_found_exception.py index 77c3a1e..3826f18 100644 --- a/src/parxy_core/exceptions/file_not_found_exception.py +++ b/src/parxy_core/exceptions/file_not_found_exception.py @@ -1,6 +1,6 @@ class FileNotFoundException(FileNotFoundError): """Exception raised for file not found errors. - + This exception is raised when a file cannot be accessed for parsing. Attributes @@ -50,7 +50,7 @@ def __str__(self) -> str: str Formatted error message including service name and details """ - base_message = f"Parsing failed for {self.service}: {self.message}" + base_message = f'Parsing failed for {self.service}: {self.message}' if self.details: - return f"{base_message}\nDetails: {self.details}" - return base_message \ No newline at end of file + return f'{base_message}\nDetails: {self.details}' + return base_message diff --git a/src/parxy_core/exceptions/parsing_exception.py b/src/parxy_core/exceptions/parsing_exception.py index 552b883..6c13928 100644 --- a/src/parxy_core/exceptions/parsing_exception.py +++ b/src/parxy_core/exceptions/parsing_exception.py @@ -1,6 +1,6 @@ class ParsingException(Exception): """Exception raised for parsing errors. - + This exception is raised when parsing document fails. 
Attributes @@ -50,7 +50,7 @@ def __str__(self) -> str: str Formatted error message including service name and details """ - base_message = f"Parsing failed for {self.service}: {self.message}" + base_message = f'Parsing failed for {self.service}: {self.message}' if self.details: - return f"{base_message}\nDetails: {self.details}" - return base_message \ No newline at end of file + return f'{base_message}\nDetails: {self.details}' + return base_message diff --git a/src/parxy_core/exceptions/unsupported_format_exception.py b/src/parxy_core/exceptions/unsupported_format_exception.py index 34e0bde..d2d5f3a 100644 --- a/src/parxy_core/exceptions/unsupported_format_exception.py +++ b/src/parxy_core/exceptions/unsupported_format_exception.py @@ -1,6 +1,6 @@ class UnsupportedFormatException(Exception): """Exception raised for file format not supported. - + This exception is raised when a file is of a format not supported by the parsing service. Attributes @@ -50,7 +50,7 @@ def __str__(self) -> str: str Formatted error message including service name and details """ - base_message = f"Unsupported format for {self.service}: {self.message}" + base_message = f'Unsupported format for {self.service}: {self.message}' if self.details: - return f"{base_message}\nDetails: {self.details}" - return base_message \ No newline at end of file + return f'{base_message}\nDetails: {self.details}' + return base_message diff --git a/src/parxy_core/facade/parxy.py b/src/parxy_core/facade/parxy.py index 46ab656..435d1ab 100644 --- a/src/parxy_core/facade/parxy.py +++ b/src/parxy_core/facade/parxy.py @@ -1,7 +1,7 @@ """Facade for accessing Parxy document parsing functionality.""" import io -from typing import Optional, Dict, Any, Callable +from typing import Optional, Dict, Callable from parxy_core.drivers import DriverFactory, Driver from parxy_core.models import Document @@ -9,7 +9,7 @@ class Parxy: """Static facade for accessing Parxy document processing features. 
- + This class provides a simplified interface to the document parsing functionality. It maintains a single DriverFactory instance and provides static methods for common operations like parsing documents and accessing specific drivers. @@ -18,14 +18,14 @@ class Parxy: ------- Parse a document with default driver: >>> doc = Parxy.parse('path/to/document.pdf') - + Use a specific driver: >>> doc = Parxy.driver(Parxy.PYMUPDF).parse('path/to/document.pdf') - + """ # Constants for common document processing drivers - + PYMUPDF = 'pymupdf' PDFACT = 'pdfact' LLAMAPARSE = 'llamaparse' @@ -37,12 +37,12 @@ class Parxy: def __new__(cls): """Prevent instantiation of this static class.""" - raise TypeError(f"{cls.__name__} is a static class and cannot be instantiated") + raise TypeError(f'{cls.__name__} is a static class and cannot be instantiated') @classmethod def _get_factory(cls) -> DriverFactory: """Get or create the DriverFactory instance. - + Returns ------- DriverFactory @@ -53,9 +53,14 @@ def _get_factory(cls) -> DriverFactory: return cls._factory @classmethod - def parse(cls, file: str | io.BytesIO | bytes, level: str = "block", driver_name: Optional[str] = None) -> Document: + def parse( + cls, + file: str | io.BytesIO | bytes, + level: str = 'block', + driver_name: Optional[str] = None, + ) -> Document: """Parse a document using the specified or default driver. - + Parameters ---------- file : str | io.BytesIO | bytes @@ -64,7 +69,7 @@ def parse(cls, file: str | io.BytesIO | bytes, level: str = "block", driver_name The level of detail for parsing, by default "block" driver_name : str, optional Name of the driver to use. If None, uses the default driver - + Returns ------- Document @@ -75,12 +80,12 @@ def parse(cls, file: str | io.BytesIO | bytes, level: str = "block", driver_name @classmethod def driver(cls, name: Optional[str] = None) -> Driver: """Get a driver instance by name. - + Parameters ---------- name : str, optional Name of the driver to get. 
If None, returns the default driver - + Returns ------- Driver @@ -91,7 +96,7 @@ def driver(cls, name: Optional[str] = None) -> Driver: @classmethod def drivers(cls) -> Dict[str, Driver]: """Get the list of supported drivers. - + Returns ------- Driver @@ -99,11 +104,10 @@ def drivers(cls) -> Dict[str, Driver]: """ return cls._get_factory().get_supported_drivers() - @classmethod def extend(cls, name: str, callback: Callable[[], Driver]) -> 'DriverFactory': """Register a new driver with the factory. - + Parameters ---------- name : str @@ -113,4 +117,4 @@ def extend(cls, name: str, callback: Callable[[], Driver]) -> 'DriverFactory': config : Dict[str, Any], optional Initial configuration for the driver """ - return cls._get_factory().extend(name=name, callback=callback) \ No newline at end of file + return cls._get_factory().extend(name=name, callback=callback) diff --git a/src/parxy_core/models/__init__.py b/src/parxy_core/models/__init__.py index 189f5a3..3f4f8aa 100644 --- a/src/parxy_core/models/__init__.py +++ b/src/parxy_core/models/__init__.py @@ -1,4 +1,3 @@ - # Use an explicit re-export https://github.com/astral-sh/ruff/issues/5697#issuecomment-1631647211 from parxy_core.models.models import ( diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index bcc284a..583e4c8 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -29,7 +29,7 @@ class Character(BaseModel): source_data: Optional[dict[str, Any]] = None def isEmpty(self) -> bool: - return not self.text or self.text.strip() == "" + return not self.text or self.text.strip() == '' class Span(BaseModel): @@ -41,7 +41,7 @@ class Span(BaseModel): source_data: Optional[dict[str, Any]] = None def isEmpty(self) -> bool: - return not self.text or self.text.strip() == "" + return not self.text or self.text.strip() == '' class Line(BaseModel): @@ -53,7 +53,7 @@ class Line(BaseModel): source_data: Optional[dict[str, Any]] = None def isEmpty(self) -> bool: - 
return not self.text or self.text.strip() == "" + return not self.text or self.text.strip() == '' class Block(BaseModel, ABC): @@ -75,15 +75,13 @@ class TextBlock(BaseModel): text: str def isEmpty(self) -> bool: - return not self.text or self.text.strip() == "" + return not self.text or self.text.strip() == '' -class ImageBlock(Block): - ... +class ImageBlock(Block): ... -class TableBlock(Block): - ... +class TableBlock(Block): ... class Page(BaseModel): @@ -95,7 +93,7 @@ class Page(BaseModel): source_data: Optional[dict[str, Any]] = None def isEmpty(self) -> bool: - return not self.text or self.text.strip() == "" + return not self.text or self.text.strip() == '' class Metadata(BaseModel): @@ -120,90 +118,94 @@ class Document(BaseModel): def isEmpty(self) -> bool: return all(page.isEmpty() for page in self.pages) - def text(self, page_separator: str = "---") -> str: + def text(self, page_separator: str = '---') -> str: """Get the full text content of the document. - + Parameters ---------- page_separator : str, optional String to use as separator between pages, by default "---" Set to empty string or None to disable page separation - + Returns ------- str The concatenated text of all pages with optional separators """ if not self.pages: - return "" - + return '' + # Filter out empty pages texts = [page.text.strip() for page in self.pages if page.text] - + if not texts: - return "" - + return '' + # Add separator between pages if specified if page_separator: - return f"\n{page_separator}\n".join(texts) - - return "\n".join(texts) + return f'\n{page_separator}\n'.join(texts) + + return '\n'.join(texts) def markdown(self) -> str: """Get the document content formatted as Markdown. - + The method attempts to preserve the document structure by: 1. Converting TextBlocks to paragraphs based on their category 2. Preserving line breaks where meaningful 3. 
Adding section headers based on block levels - + Returns ------- str The document content formatted as Markdown """ if not self.pages: - return "" - + return '' + markdown_parts = [] - + for page in self.pages: if not page.blocks: if page.text.strip(): markdown_parts.append(page.text.strip()) continue - + page_parts = [] - + for block in page.blocks: if isinstance(block, TextBlock): # Handle different block categories - if block.category and block.category.lower() in ['heading', 'title', 'header']: + if block.category and block.category.lower() in [ + 'heading', + 'title', + 'header', + ]: # Determine heading level (h1-h6) based on block level or default to h2 level = min(block.level or 2, 6) - page_parts.append(f"{'#' * level} {block.text.strip()}") + page_parts.append(f'{"#" * level} {block.text.strip()}') elif block.category and block.category.lower() == 'list': # Convert to bullet points for line in block.text.splitlines(): if line.strip(): - page_parts.append(f"- {line.strip()}") + page_parts.append(f'- {line.strip()}') else: # Regular paragraph if block.text.strip(): page_parts.append(block.text.strip()) - + elif isinstance(block, ImageBlock): # Placeholder for images - could be enhanced with actual image data - page_parts.append("![Image]") - + page_parts.append('![Image]') + elif isinstance(block, TableBlock): # Placeholder for tables - could be enhanced with actual table data - page_parts.append("| Table content |") - + page_parts.append('| Table content |') + if page_parts: - markdown_parts.append("\n\n".join(page_parts)) - - return "\n\n".join(markdown_parts) + markdown_parts.append('\n\n'.join(page_parts)) + + return '\n\n'.join(markdown_parts) class HierarchyLevel(IntEnum): @@ -217,8 +219,7 @@ class HierarchyLevel(IntEnum): def estimate_lines_from_block( - block: TextBlock, - default_font_size: float = 11 + block: TextBlock, default_font_size: float = 11 ) -> TextBlock: """Estimate line-level layout inside a text block by splitting text and assigning 
bounding boxes. @@ -268,8 +269,8 @@ def estimate_lines_from_block( bbox=line_bbox, style=block.style, page=block.page, - source_data={"source": "split_from_block"}, - spans=None + source_data={'source': 'split_from_block'}, + spans=None, ) block.lines.append(line) return block diff --git a/src/parxy_core/tracing/__init__.py b/src/parxy_core/tracing/__init__.py index d7dd26b..b647c33 100644 --- a/src/parxy_core/tracing/__init__.py +++ b/src/parxy_core/tracing/__init__.py @@ -1,3 +1,3 @@ from .tracer import ( Tracer as Tracer, -) \ No newline at end of file +) diff --git a/src/parxy_core/tracing/tracer.py b/src/parxy_core/tracing/tracer.py index b3e47a5..f26823d 100644 --- a/src/parxy_core/tracing/tracer.py +++ b/src/parxy_core/tracing/tracer.py @@ -2,26 +2,23 @@ import json import os from datetime import datetime -from typing import Optional, Dict, Any, Callable +from typing import Any + class Tracer: - """Tracing service to store debug traces from document processing services. - - """ + """Tracing service to store debug traces from document processing services.""" def __init__(self, enabled=False, path='storage/traces'): """Initialize the tracing service.""" self._enabled = enabled self._directory = path - - + def _get_storage_directory(self) -> str: return os.path.join( os.path.dirname(os.path.dirname(__file__)), self._directory, ) - def trace( self, driver: str, @@ -29,7 +26,7 @@ def trace( file_source: str | io.BytesIO | bytes, ) -> None: """Save processing trace for debugging purposes. 
- + Parameters ---------- content : Any @@ -37,7 +34,6 @@ def trace( file_source : str | io.BytesIO | bytes The original input file/source """ - if not self._enabled: return @@ -48,29 +44,29 @@ def trace( os.makedirs(trace_dir, exist_ok=True) # Generate filename - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') if isinstance(file_source, str): source_name = os.path.splitext(os.path.basename(file_source))[0] elif isinstance(file_source, io.BytesIO) and hasattr(file_source, 'name'): source_name = os.path.splitext(os.path.basename(file_source.name))[0] else: - source_name = "unnamed_source" + source_name = 'unnamed_source' - filename = f"{source_name}_{timestamp}.json" + filename = f'{source_name}_{timestamp}.json' filepath = os.path.join(trace_dir, filename) # Prepare trace data trace_data = { - "timestamp": timestamp, - "driver": driver, - "source": str(file_source), - "config": self._config + 'timestamp': timestamp, + 'driver': driver, + 'source': str(file_source), + 'config': self._config, } if hasattr(content, 'model_dump'): - trace_data["output"] = content.model_dump() + trace_data['output'] = content.model_dump() else: - trace_data["output"] = str(content) + trace_data['output'] = str(content) # else: # trace_data["error"] = { # "type": error.__class__.__name__ if error else "Unknown", @@ -80,5 +76,5 @@ def trace( # Save trace file with open(filepath, 'w', encoding='utf-8') as f: json.dump(trace_data, f, indent=2, ensure_ascii=False) - - self._logger.debug(f"Trace saved to {filepath}") + + self._logger.debug(f'Trace saved to {filepath}') diff --git a/tests/drivers/test_llamaparse.py b/tests/drivers/test_llamaparse.py index 191dbff..8b4e858 100644 --- a/tests/drivers/test_llamaparse.py +++ b/tests/drivers/test_llamaparse.py @@ -5,39 +5,39 @@ AuthenticationException, FileNotFoundException, ) -from parxy_core.models import ( - TextBlock, Page -) +from parxy_core.models import TextBlock, Page from 
parxy_core.drivers import LlamaParseDriver from parxy_core.models import LlamaParseConfig +@pytest.mark.skipif( + os.getenv('GITHUB_ACTIONS') == 'true', + reason='External service required, skipping tests in GitHub Actions.', +) class TestLlamaParseDriver: - def __fixture_path(self, file: str) -> str: current_dir = os.path.dirname(os.path.abspath(__file__)) - fixtures_dir = os.path.join(os.path.dirname(current_dir), "fixtures") + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') return os.path.join(fixtures_dir, file) - def test_llamaparse_driver_can_be_created(self): driver = LlamaParseDriver(LlamaParseConfig().model_dump()) - assert driver.supported_levels == ["page", "block"] + assert driver.supported_levels == ['page', 'block'] def test_llamaparse_driver_handle_invalid_key(self): driver = LlamaParseDriver(LlamaParseConfig(api_key='invalid').model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(AuthenticationException) as excinfo: driver.parse(path) - + def test_llamaparse_driver_handle_not_existing_file(self): driver = LlamaParseDriver(LlamaParseConfig().model_dump()) - path = self.__fixture_path("non-existing-file.pdf") + path = self.__fixture_path('non-existing-file.pdf') with pytest.raises(FileNotFoundException) as excinfo: driver.parse(path) @@ -45,18 +45,18 @@ def test_llamaparse_driver_handle_not_existing_file(self): def test_llamaparse_driver_unrecognized_level_handled(self): driver = LlamaParseDriver(LlamaParseConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(ValueError) as excinfo: driver.parse(path, level='custom') - assert "not supported" in str(excinfo.value) - assert "[custom]" in str(excinfo.value) - + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + def test_llamaparse_driver_read_empty_document_block_level(self): driver = 
LlamaParseDriver(LlamaParseConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path) assert document is not None @@ -68,11 +68,11 @@ def test_llamaparse_driver_read_empty_document_block_level(self): assert document.pages[0].text == '1' assert len(document.pages[0].blocks) == 1 assert isinstance(document.pages[0].blocks[0], TextBlock) - + def test_llamaparse_driver_read_empty_document_page_level(self): driver = LlamaParseDriver(LlamaParseConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -81,12 +81,12 @@ def test_llamaparse_driver_read_empty_document_page_level(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "1" - + assert document.pages[0].text == '1' + def test_llamaparse_driver_read_document(self): driver = LlamaParseDriver(LlamaParseConfig().model_dump()) - path = self.__fixture_path("test-doc.pdf") + path = self.__fixture_path('test-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -95,5 +95,7 @@ def test_llamaparse_driver_read_document(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "This is the header\n\nThis is a test PDF to be used as input in unit\ntests\n\nThis is a heading 1\nThis is a paragraph below heading 1\n\n\n1" - + assert ( + document.pages[0].text + == 'This is the header\n\nThis is a test PDF to be used as input in unit\ntests\n\nThis is a heading 1\nThis is a paragraph below heading 1\n\n\n1' + ) diff --git a/tests/drivers/test_llmwhisperer.py b/tests/drivers/test_llmwhisperer.py index 2da2d63..8389ad3 100644 --- a/tests/drivers/test_llmwhisperer.py +++ 
b/tests/drivers/test_llmwhisperer.py @@ -5,39 +5,39 @@ AuthenticationException, FileNotFoundException, ) -from parxy_core.models import ( - TextBlock, Page -) +from parxy_core.models import Page from parxy_core.drivers import LlmWhispererDriver from parxy_core.models import LlmWhispererConfig +@pytest.mark.skipif( + os.getenv('GITHUB_ACTIONS') == 'true', + reason='External service required, skipping tests in GitHub Actions.', +) class TestLlmWhispererDriver: - def __fixture_path(self, file: str) -> str: current_dir = os.path.dirname(os.path.abspath(__file__)) - fixtures_dir = os.path.join(os.path.dirname(current_dir), "fixtures") + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') return os.path.join(fixtures_dir, file) - def test_llmwhisperer_driver_can_be_created(self): driver = LlmWhispererDriver(LlmWhispererConfig().model_dump()) - assert driver.supported_levels == ["page"] + assert driver.supported_levels == ['page', 'block'] def test_llmwhisperer_driver_handle_invalid_key(self): driver = LlmWhispererDriver(LlmWhispererConfig(api_key='invalid').model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(AuthenticationException) as excinfo: driver.parse(path) - + def test_llmwhisperer_driver_handle_not_existing_file(self): driver = LlmWhispererDriver(LlmWhispererConfig().model_dump()) - path = self.__fixture_path("non-existing-file.pdf") + path = self.__fixture_path('non-existing-file.pdf') with pytest.raises(FileNotFoundException) as excinfo: driver.parse(path) @@ -45,18 +45,18 @@ def test_llmwhisperer_driver_handle_not_existing_file(self): def test_llmwhisperer_driver_unrecognized_level_handled(self): driver = LlmWhispererDriver(LlmWhispererConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(ValueError) as excinfo: driver.parse(path, level='custom') - assert "not supported" in str(excinfo.value) 
- assert "[custom]" in str(excinfo.value) - + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + def test_llmwhisperer_driver_read_empty_document_page_level(self): driver = LlmWhispererDriver(LlmWhispererConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -65,12 +65,12 @@ def test_llmwhisperer_driver_read_empty_document_page_level(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "\n\n1 \n" - + assert document.pages[0].text == '\n\n1 \n' + def test_llmwhisperer_driver_read_document(self): driver = LlmWhispererDriver(LlmWhispererConfig().model_dump()) - path = self.__fixture_path("test-doc.pdf") + path = self.__fixture_path('test-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -79,5 +79,7 @@ def test_llmwhisperer_driver_read_document(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "\n\nThis is the header \n\nThis is a test PDF to be used as input in unit \n\ntests \n\nThis is a heading 1 \nThis is a paragraph below heading 1 \n\n 1 \n" - + assert ( + document.pages[0].text + == '\n\nThis is the header \n\nThis is a test PDF to be used as input in unit \n\ntests \n\nThis is a heading 1 \nThis is a paragraph below heading 1 \n\n 1 \n' + ) diff --git a/tests/drivers/test_pdfact.py b/tests/drivers/test_pdfact.py index 1e6e1e6..9c4b370 100644 --- a/tests/drivers/test_pdfact.py +++ b/tests/drivers/test_pdfact.py @@ -1,49 +1,50 @@ import os import pytest -from parxy_core.models import ( - TextBlock, Page -) +from parxy_core.models import TextBlock, Page from parxy_core.drivers import PdfActDriver from parxy_core.models import PdfActConfig +@pytest.mark.skipif( + 
os.getenv('GITHUB_ACTIONS') == 'true', + reason='External service required, skipping tests in GitHub Actions.', +) class TestPdfActDriver: - def __fixture_path(self, file: str) -> str: current_dir = os.path.dirname(os.path.abspath(__file__)) - fixtures_dir = os.path.join(os.path.dirname(current_dir), "fixtures") + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') return os.path.join(fixtures_dir, file) - def test_pdfact_driver_can_be_created(self): driver = PdfActDriver(PdfActConfig().model_dump()) - assert driver.supported_levels == ["page", "paragraph", "block"] - - def test_pdfact_driver_requires_valid_base_url(self): + assert driver.supported_levels == ['page', 'paragraph', 'block'] + def test_pdfact_driver_requires_valid_base_url(self): with pytest.raises(ValueError) as excinfo: PdfActDriver(PdfActConfig(base_url='invalid-host').model_dump()) - assert "Invalid base URL. Expected URL, found [invalid-host]." in str(excinfo.value) + assert 'Invalid base URL. Expected URL, found [invalid-host].' 
in str( + excinfo.value + ) def test_pdfact_driver_unrecognized_level_handled(self): driver = PdfActDriver(PdfActConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(ValueError) as excinfo: driver.parse(path, level='custom') - assert "not supported" in str(excinfo.value) - assert "[custom]" in str(excinfo.value) - + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + def test_pdfact_driver_read_empty_document_block_level(self): driver = PdfActDriver(PdfActConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path) assert document is not None @@ -55,11 +56,11 @@ def test_pdfact_driver_read_empty_document_block_level(self): assert document.pages[0].text == '1' assert len(document.pages[0].blocks) == 1 assert isinstance(document.pages[0].blocks[0], TextBlock) - + def test_pdfact_driver_read_empty_document_page_level(self): driver = PdfActDriver(PdfActConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -68,12 +69,12 @@ def test_pdfact_driver_read_empty_document_page_level(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "1" - + assert document.pages[0].text == '1' + def test_pdfact_driver_read_document(self): driver = PdfActDriver(PdfActConfig().model_dump()) - path = self.__fixture_path("test-doc.pdf") + path = self.__fixture_path('test-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -82,6 +83,7 @@ def test_pdfact_driver_read_document(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "This is the 
header\nThis is a test PDF to be used as input in unit tests\nThis is a heading 1\nThis is a paragraph below heading 1\n1" - - + assert ( + document.pages[0].text + == 'This is the header\nThis is a test PDF to be used as input in unit tests\nThis is a heading 1\nThis is a paragraph below heading 1\n1' + ) diff --git a/tests/drivers/test_pymupdf.py b/tests/drivers/test_pymupdf.py index 002706c..b93c1b2 100644 --- a/tests/drivers/test_pymupdf.py +++ b/tests/drivers/test_pymupdf.py @@ -1,50 +1,46 @@ import os import pytest -from parxy_core.models import ( - TextBlock, Page -) +from parxy_core.models import TextBlock, Page from parxy_core.drivers import PyMuPdfDriver from parxy_core.exceptions import FileNotFoundException class TestPymuPdfDriver: - def __fixture_path(self, file: str) -> str: current_dir = os.path.dirname(os.path.abspath(__file__)) - fixtures_dir = os.path.join(os.path.dirname(current_dir), "fixtures") + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') return os.path.join(fixtures_dir, file) - def test_pymupdf_driver_can_be_created(self): driver = PyMuPdfDriver() - assert driver.supported_levels == ["page", "block", "line", "span", "character"] + assert driver.supported_levels == ['page', 'block', 'line', 'span', 'character'] def test_pymupdf_driver_unrecognized_level_handled(self): driver = PyMuPdfDriver() - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(ValueError) as excinfo: driver.parse(path, level='custom') - assert "not supported" in str(excinfo.value) - assert "[custom]" in str(excinfo.value) + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) def test_pymupdf_driver_handle_not_existing_file(self): driver = PyMuPdfDriver() - path = self.__fixture_path("non-existing-file.pdf") + path = self.__fixture_path('non-existing-file.pdf') with pytest.raises(FileNotFoundException) as excinfo: driver.parse(path) - + def 
test_pymupdf_driver_read_empty_document_block_level(self): driver = PyMuPdfDriver() - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path) assert document is not None @@ -52,24 +48,24 @@ def test_pymupdf_driver_read_empty_document_block_level(self): assert document.outline is None assert document.metadata is not None assert document.metadata.title - assert document.metadata.title == "Test document" - assert document.metadata.author == "Data House Author" - assert document.metadata.subject == "" - assert document.metadata.keywords == "" - assert document.metadata.creator == "Microsoft® Word for Microsoft 365" - assert document.metadata.producer == "Microsoft® Word for Microsoft 365" - assert document.metadata.created_at == "2023-11-13T18:43:06" + assert document.metadata.title == 'Test document' + assert document.metadata.author == 'Data House Author' + assert document.metadata.subject == '' + assert document.metadata.keywords == '' + assert document.metadata.creator == 'Microsoft® Word for Microsoft 365' + assert document.metadata.producer == 'Microsoft® Word for Microsoft 365' + assert document.metadata.created_at == '2023-11-13T18:43:06' assert document.metadata.updated_at == document.metadata.created_at assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "1 \n \n " + assert document.pages[0].text == '1 \n \n ' assert len(document.pages[0].blocks) == 2 assert isinstance(document.pages[0].blocks[0], TextBlock) - + def test_pymupdf_driver_read_empty_document_page_level(self): driver = PyMuPdfDriver() - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -77,23 +73,23 @@ def test_pymupdf_driver_read_empty_document_page_level(self): assert document.outline is None assert document.metadata is not None assert document.metadata.title - 
assert document.metadata.title == "Test document" - assert document.metadata.author == "Data House Author" - assert document.metadata.subject == "" - assert document.metadata.keywords == "" - assert document.metadata.creator == "Microsoft® Word for Microsoft 365" - assert document.metadata.producer == "Microsoft® Word for Microsoft 365" - assert document.metadata.created_at == "2023-11-13T18:43:06" + assert document.metadata.title == 'Test document' + assert document.metadata.author == 'Data House Author' + assert document.metadata.subject == '' + assert document.metadata.keywords == '' + assert document.metadata.creator == 'Microsoft® Word for Microsoft 365' + assert document.metadata.producer == 'Microsoft® Word for Microsoft 365' + assert document.metadata.created_at == '2023-11-13T18:43:06' assert document.metadata.updated_at == document.metadata.created_at assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) assert document.pages[0].blocks is None - assert document.pages[0].text == "1 \n \n " - + assert document.pages[0].text == '1 \n \n ' + def test_pymupdf_driver_read_document(self): driver = PyMuPdfDriver() - path = self.__fixture_path("test-doc.pdf") + path = self.__fixture_path('test-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -101,17 +97,18 @@ def test_pymupdf_driver_read_document(self): assert document.outline is None assert document.metadata is not None assert document.metadata.title - assert document.metadata.title == "Test document" - assert document.metadata.author == "Data House Author" - assert document.metadata.subject == "" - assert document.metadata.keywords == "" - assert document.metadata.creator == "Microsoft® Word for Microsoft 365" - assert document.metadata.producer == "Microsoft® Word for Microsoft 365" - assert document.metadata.created_at == "2023-05-09T11:34:41" + assert document.metadata.title == 'Test document' + assert document.metadata.author == 'Data House Author' + 
assert document.metadata.subject == '' + assert document.metadata.keywords == '' + assert document.metadata.creator == 'Microsoft® Word for Microsoft 365' + assert document.metadata.producer == 'Microsoft® Word for Microsoft 365' + assert document.metadata.created_at == '2023-05-09T11:34:41' assert document.metadata.updated_at == document.metadata.created_at assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) assert document.pages[0].blocks is None - assert document.pages[0].text == "This is the header \n \n1 \n \nThis is a test PDF to be used as input in unit \ntests \n \nThis is a heading 1 \nThis is a paragraph below heading 1 \n \n " - - + assert ( + document.pages[0].text + == 'This is the header \n \n1 \n \nThis is a test PDF to be used as input in unit \ntests \n \nThis is a heading 1 \nThis is a paragraph below heading 1 \n \n ' + ) diff --git a/tests/drivers/test_unstructured_local.py b/tests/drivers/test_unstructured_local.py index 6f87e98..d9a77f1 100644 --- a/tests/drivers/test_unstructured_local.py +++ b/tests/drivers/test_unstructured_local.py @@ -2,34 +2,29 @@ import pytest from parxy_core.exceptions import ( - AuthenticationException, FileNotFoundException, ) -from parxy_core.models import ( - TextBlock, Page -) +from parxy_core.models import Page from parxy_core.drivers import UnstructuredLocalDriver from parxy_core.models import UnstructuredLocalConfig class TestUnstructuredLocalDriver: - def __fixture_path(self, file: str) -> str: current_dir = os.path.dirname(os.path.abspath(__file__)) - fixtures_dir = os.path.join(os.path.dirname(current_dir), "fixtures") + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') return os.path.join(fixtures_dir, file) - def test_unstructured_local_driver_can_be_created(self): driver = UnstructuredLocalDriver(UnstructuredLocalConfig().model_dump()) - assert driver.supported_levels == ["page", "block"] - + assert driver.supported_levels == ['page', 'block'] + def 
test_unstructured_local_driver_handle_not_existing_file(self): driver = UnstructuredLocalDriver(UnstructuredLocalConfig().model_dump()) - path = self.__fixture_path("non-existing-file.pdf") + path = self.__fixture_path('non-existing-file.pdf') with pytest.raises(FileNotFoundException) as excinfo: driver.parse(path) @@ -37,18 +32,18 @@ def test_unstructured_local_driver_handle_not_existing_file(self): def test_unstructured_local_driver_unrecognized_level_handled(self): driver = UnstructuredLocalDriver(UnstructuredLocalConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') with pytest.raises(ValueError) as excinfo: driver.parse(path, level='custom') - assert "not supported" in str(excinfo.value) - assert "[custom]" in str(excinfo.value) - + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + def test_unstructured_local_driver_read_empty_document_page_level(self): driver = UnstructuredLocalDriver(UnstructuredLocalConfig().model_dump()) - path = self.__fixture_path("empty-doc.pdf") + path = self.__fixture_path('empty-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -57,12 +52,12 @@ def test_unstructured_local_driver_read_empty_document_page_level(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == "1" - + assert document.pages[0].text == '1' + def test_unstructured_local_driver_read_document(self): driver = UnstructuredLocalDriver(UnstructuredLocalConfig().model_dump()) - path = self.__fixture_path("test-doc.pdf") + path = self.__fixture_path('test-doc.pdf') document = driver.parse(path, level='page') assert document is not None @@ -71,5 +66,7 @@ def test_unstructured_local_driver_read_document(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == 
"This is the header\nThis is a test PDF to be used as input in unit tests\nThis is a heading 1 This is a paragraph below heading 1\n1" - + assert ( + document.pages[0].text + == 'This is the header\nThis is a test PDF to be used as input in unit tests\nThis is a heading 1 This is a paragraph below heading 1\n1' + ) diff --git a/tests/smoke_test.py b/tests/smoke_test.py new file mode 100644 index 0000000..4c052e8 --- /dev/null +++ b/tests/smoke_test.py @@ -0,0 +1,13 @@ +"""Check that basic features work before publishing on Pypi + +Catch cases where e.g. files are missing so the import doesn't work. It is +recommended to check that e.g. assets are included.""" + +from parxy_core.facade import Parxy +from parxy_core.drivers import PyMuPdfDriver + +driver = Parxy.driver() +if isinstance(driver, PyMuPdfDriver): + print('Smoke test succeeded') +else: + raise RuntimeError('Failed to obtain PyMuPdfDriver') diff --git a/tests/test_base.py b/tests/test_base.py index 6f4e747..9b0513c 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,4 +1,3 @@ - class TestBase: def test_example(self): - assert True \ No newline at end of file + assert True diff --git a/tests/test_factory.py b/tests/test_factory.py index 370366f..7d0a9a4 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -1,3 +1,4 @@ +import os import pytest from parxy_core.drivers import DriverFactory @@ -13,55 +14,59 @@ class CustomDriverExample(Driver): """Example custom driver for testing.""" - supported_levels = ["page"] - def _handle(self, file, level="page") -> Document: + supported_levels = ['page'] + + def _handle(self, file, level='page') -> Document: return Document(pages=[]) class TestDriverFactory: - def test_build_required_to_create_instance(self): with pytest.raises(Exception) as excinfo: DriverFactory() - - assert "Use `DriverFactory.build()` to create an instance." in str(excinfo.value) - + + assert 'Use `DriverFactory.build()` to create an instance.' 
in str( + excinfo.value + ) + def test_singleton(self): factory_one = DriverFactory.build() factory_two = DriverFactory.build() - + assert factory_one is factory_two - + def test_unrecognized_driver(self): with pytest.raises(ValueError) as excinfo: DriverFactory.build().driver('unrecognized') - - assert "Driver [unrecognized] not supported" in str(excinfo.value) + + assert 'Driver [unrecognized] not supported' in str(excinfo.value) def test_register_custom_driver(self): - - DriverFactory.build().forget_drivers().extend('custom', lambda: CustomDriverExample()) - + DriverFactory.build().forget_drivers().extend( + 'custom', lambda: CustomDriverExample() + ) + driver = DriverFactory.build().driver('custom') - + document = driver.parse('example.pdf', level='page') assert isinstance(driver, CustomDriverExample) - + assert document is not None assert document.isEmpty() - def test_no_duplicate_driver_can_be_registered(self): """Test that registering a duplicate driver name raises ValueError.""" - DriverFactory.build().forget_drivers().extend('custom', lambda: CustomDriverExample()) - + DriverFactory.build().forget_drivers().extend( + 'custom', lambda: CustomDriverExample() + ) + with pytest.raises(ValueError) as excinfo: DriverFactory.build().extend('custom', lambda: CustomDriverExample()) - assert "Driver [custom] already registered" in str(excinfo.value) + assert 'Driver [custom] already registered' in str(excinfo.value) def test_default_driver_fallback_to_pymupdf(self): DriverFactory.reset() @@ -69,7 +74,12 @@ def test_default_driver_fallback_to_pymupdf(self): def test_default_driver_name_read_from_configuration(self): DriverFactory.reset() - assert DriverFactory.build().initialize(ParxyConfig(default_driver='pdfact')).default_driver_name() == 'pdfact' + assert ( + DriverFactory.build() + .initialize(ParxyConfig(default_driver='pdfact')) + .default_driver_name() + == 'pdfact' + ) def test_default_driver_instantiated(self): DriverFactory.reset() @@ -86,6 +96,10 @@ def 
test_pdfact_driver_instantiated(self): driver = DriverFactory.build().driver('pdfact') assert isinstance(driver, PdfActDriver) + @pytest.mark.skipif( + os.getenv('GITHUB_ACTIONS') == 'true', + reason='External service required, skipping tests in GitHub Actions.', + ) def test_llamaparse_driver_instantiated(self): DriverFactory.reset() driver = DriverFactory.build().driver('llamaparse') @@ -99,4 +113,4 @@ def test_llmwhisperer_driver_instantiated(self): def test_unstructured_local_driver_instantiated(self): DriverFactory.reset() driver = DriverFactory.build().driver('unstructured_local') - assert isinstance(driver, UnstructuredLocalDriver) \ No newline at end of file + assert isinstance(driver, UnstructuredLocalDriver) diff --git a/tests/test_models.py b/tests/test_models.py index f8c149a..ac24b05 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,6 +1,15 @@ from parxy_core.models import ( - BoundingBox, Style, Character, Span, Line, - TextBlock, ImageBlock, TableBlock, Page, Metadata, Document + BoundingBox, + Style, + Character, + Span, + Line, + TextBlock, + ImageBlock, + TableBlock, + Page, + Metadata, + Document, ) @@ -14,30 +23,25 @@ def test_bounding_box(self): def test_style(self): style = Style( - font_name="Arial", + font_name='Arial', font_size=12.0, - font_style="normal", - color="#000000", + font_style='normal', + color='#000000', alpha=255, - weight=400.0 + weight=400.0, ) - assert style.font_name == "Arial" + assert style.font_name == 'Arial' assert style.font_size == 12.0 - assert style.font_style == "normal" - assert style.color == "#000000" + assert style.font_style == 'normal' + assert style.color == '#000000' assert style.alpha == 255 assert style.weight == 400.0 def test_character(self): bbox = BoundingBox(x0=0.0, y0=0.0, x1=10.0, y1=10.0) - style = Style(font_name="Times") - char = Character( - text="A", - bbox=bbox, - style=style, - page=1 - ) - assert char.text == "A" + style = Style(font_name='Times') + char = 
Character(text='A', bbox=bbox, style=style, page=1) + assert char.text == 'A' assert char.bbox == bbox assert char.style == style assert char.page == 1 @@ -46,18 +50,9 @@ def test_character(self): def test_span(self): bbox = BoundingBox(x0=0.0, y0=0.0, x1=50.0, y1=20.0) style = Style(font_size=12.0) - chars = [ - Character(text="H", page=1), - Character(text="i", page=1) - ] - span = Span( - text="Hi", - bbox=bbox, - style=style, - characters=chars, - page=1 - ) - assert span.text == "Hi" + chars = [Character(text='H', page=1), Character(text='i', page=1)] + span = Span(text='Hi', bbox=bbox, style=style, characters=chars, page=1) + assert span.text == 'Hi' assert span.bbox == bbox assert span.style == style assert len(span.characters) == 2 @@ -66,19 +61,10 @@ def test_span(self): def test_line(self): bbox = BoundingBox(x0=0.0, y0=0.0, x1=100.0, y1=30.0) - style = Style(font_name="Helvetica") - spans = [ - Span(text="Hello", page=1), - Span(text="World", page=1) - ] - line = Line( - text="Hello World", - bbox=bbox, - style=style, - spans=spans, - page=1 - ) - assert line.text == "Hello World" + style = Style(font_name='Helvetica') + spans = [Span(text='Hello', page=1), Span(text='World', page=1)] + line = Line(text='Hello World', bbox=bbox, style=style, spans=spans, page=1) + assert line.text == 'Hello World' assert line.bbox == bbox assert line.style == style assert len(line.spans) == 2 @@ -88,111 +74,90 @@ def test_line(self): def test_text_block(self): bbox = BoundingBox(x0=0.0, y0=0.0, x1=200.0, y1=100.0) style = Style(font_size=14.0) - lines = [ - Line(text="First line", page=1), - Line(text="Second line", page=1) - ] + lines = [Line(text='First line', page=1), Line(text='Second line', page=1)] block = TextBlock( - type="text", + type='text', bbox=bbox, page=1, - category="paragraph", + category='paragraph', style=style, level=1, lines=lines, - text="First line\nSecond line" + text='First line\nSecond line', ) - assert block.type == "text" + assert block.type 
== 'text' assert block.bbox == bbox - assert block.category == "paragraph" + assert block.category == 'paragraph' assert block.style == style assert block.level == 1 assert len(block.lines) == 2 - assert block.text == "First line\nSecond line" + assert block.text == 'First line\nSecond line' assert not block.isEmpty() def test_image_block(self): bbox = BoundingBox(x0=0.0, y0=0.0, x1=300.0, y1=200.0) - image = ImageBlock( - type="image", - bbox=bbox, - page=1 - ) - assert image.type == "image" + image = ImageBlock(type='image', bbox=bbox, page=1) + assert image.type == 'image' assert image.bbox == bbox assert image.page == 1 def test_table_block(self): bbox = BoundingBox(x0=0.0, y0=0.0, x1=400.0, y1=300.0) - table = TableBlock( - type="table", - bbox=bbox, - page=1 - ) - assert table.type == "table" + table = TableBlock(type='table', bbox=bbox, page=1) + assert table.type == 'table' assert table.bbox == bbox assert table.page == 1 def test_page(self): - text_block = TextBlock( - type="text", - text="Sample text", - page=1 - ) - image_block = ImageBlock( - type="image", - page=1 - ) + text_block = TextBlock(type='text', text='Sample text', page=1) + image_block = ImageBlock(type='image', page=1) page = Page( number=1, width=612.0, height=792.0, blocks=[text_block, image_block], - text="Sample text" + text='Sample text', ) assert page.number == 1 assert page.width == 612.0 assert page.height == 792.0 assert len(page.blocks) == 2 - assert page.text == "Sample text" + assert page.text == 'Sample text' assert not page.isEmpty() def test_metadata(self): metadata = Metadata( - title="Test Document", - author="John Doe", - subject="Testing", - keywords="test,document", - creator="Test App", - producer="PDF Library", - created_at="2025-08-18", - updated_at="2025-08-18" + title='Test Document', + author='John Doe', + subject='Testing', + keywords='test,document', + creator='Test App', + producer='PDF Library', + created_at='2025-08-18', + updated_at='2025-08-18', ) - assert 
metadata.title == "Test Document" - assert metadata.author == "John Doe" - assert metadata.subject == "Testing" - assert metadata.keywords == "test,document" - assert metadata.creator == "Test App" - assert metadata.producer == "PDF Library" - assert metadata.created_at == "2025-08-18" - assert metadata.updated_at == "2025-08-18" + assert metadata.title == 'Test Document' + assert metadata.author == 'John Doe' + assert metadata.subject == 'Testing' + assert metadata.keywords == 'test,document' + assert metadata.creator == 'Test App' + assert metadata.producer == 'PDF Library' + assert metadata.created_at == '2025-08-18' + assert metadata.updated_at == '2025-08-18' def test_document(self): - metadata = Metadata(title="Test Document") - page = Page( - number=1, - text="Page content" - ) + metadata = Metadata(title='Test Document') + page = Page(number=1, text='Page content') doc = Document( - filename="test.pdf", - language="en", + filename='test.pdf', + language='en', metadata=metadata, pages=[page], - outline=["Chapter 1", "Chapter 2"] + outline=['Chapter 1', 'Chapter 2'], ) - assert doc.filename == "test.pdf" - assert doc.language == "en" + assert doc.filename == 'test.pdf' + assert doc.language == 'en' assert doc.metadata == metadata assert len(doc.pages) == 1 - assert doc.outline == ["Chapter 1", "Chapter 2"] - assert not doc.isEmpty() \ No newline at end of file + assert doc.outline == ['Chapter 1', 'Chapter 2'] + assert not doc.isEmpty() diff --git a/tests/test_parxy_facade.py b/tests/test_parxy_facade.py index 32ef3d2..9d11122 100644 --- a/tests/test_parxy_facade.py +++ b/tests/test_parxy_facade.py @@ -2,27 +2,24 @@ from parxy_core.facade import Parxy from parxy_core.drivers import DriverFactory -from parxy_core.drivers import Driver from parxy_core.drivers import PyMuPdfDriver from parxy_core.drivers import PdfActDriver -from parxy_core.drivers import LlamaParseDriver -from parxy_core.drivers import LlmWhispererDriver -from parxy_core.models import Document 
-from parxy_core.models import ParxyConfig + class TestParxyFacade: - def test_build_required_to_create_instance(self): with pytest.raises(TypeError) as excinfo: Parxy() - - assert "Parxy is a static class and cannot be instantiated" in str(excinfo.value) - + + assert 'Parxy is a static class and cannot be instantiated' in str( + excinfo.value + ) + def test_unrecognized_driver(self): with pytest.raises(ValueError) as excinfo: Parxy.driver('unrecognized') - - assert "Driver [unrecognized] not supported" in str(excinfo.value) + + assert 'Driver [unrecognized] not supported' in str(excinfo.value) def test_default_driver_instantiated(self): driver = Parxy.driver() @@ -39,5 +36,5 @@ def test_driver_factory_returned(self): def test_manager_is_singleton(self): factory_one = Parxy._get_factory() factory_two = Parxy._get_factory() - + assert factory_one is factory_two diff --git a/uv.lock b/uv.lock index 7053937..cb055bc 100644 --- a/uv.lock +++ b/uv.lock @@ -1718,7 +1718,7 @@ wheels = [ ] [[package]] -name = "parxy-core" +name = "parxy" version = "0.1.0" source = { editable = "." } dependencies = [