From 5f229d430b3f95628bdb391f03658f74f154fc61 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 20 Feb 2026 14:15:19 +0100 Subject: [PATCH] Align markdown command to parse command --- docs/tutorials/using_cli.md | 59 +++++- src/parxy_cli/commands/markdown.py | 270 ++++++++++++++++++--------- tests/commands/test_markdown.py | 285 ++++++++++++++++++++++------- 3 files changed, 444 insertions(+), 170 deletions(-) diff --git a/docs/tutorials/using_cli.md b/docs/tutorials/using_cli.md index 58401ad..a350ed1 100644 --- a/docs/tutorials/using_cli.md +++ b/docs/tutorials/using_cli.md @@ -12,7 +12,7 @@ The Parxy CLI lets you: |------------------|-------------------------------------------------------------------------------------------------------------| | `parxy parse` | Extract text content from documents with customizable detail levels and output formats. Process files or folders with multiple drivers. | | `parxy preview` | Interactive document viewer with metadata, table of contents, and scrollable content preview | -| `parxy markdown` | Convert parsed documents into Markdown format (optionally combine multiple files) | +| `parxy markdown` | Convert documents to Markdown files, with support for multiple drivers and folder processing | | `parxy pdf:merge`| Merge multiple PDF files into one, with support for page ranges | | `parxy pdf:split`| Split a PDF file into individual pages | | `parxy drivers` | List available document processing drivers | @@ -176,27 +176,68 @@ This is ideal for quick document inspection before running a full parsing operat ## Converting to Markdown -The `markdown` command converts parsed documents into Markdown format, preserving structure such as headings and lists. +The `markdown` command converts documents to Markdown format, preserving structure such as headings and lists. It follows the same conventions as the `parse` command: output files are prefixed with the driver name and saved next to the source file by default. + +### Basic Usage ```bash parxy markdown document.pdf ``` -Output is printed to the console by default. To save Markdown files to disk: +This creates a `pymupdf-document.md` file in the same directory as the source file. + +### Processing Multiple Files and Folders ```bash -parxy markdown -o output/ document1.pdf document2.pdf +# Parse multiple files +parxy markdown doc1.pdf doc2.pdf doc3.pdf + +# Parse all PDFs in a folder (non-recursive by default) +parxy markdown /path/to/folder + +# Parse recursively +parxy markdown /path/to/folder --recursive + +# Limit recursion depth +parxy markdown /path/to/folder --recursive --max-depth 2 ``` -Each document will be saved as a `.md` file. +### Output Directory + +```bash +parxy markdown document.pdf -o output/ +``` -To combine multiple documents into a single Markdown file: +### Using Multiple Drivers + +Run the same documents through multiple drivers for comparison: + +```bash +parxy markdown document.pdf -d pymupdf -d llamaparse +``` + +This produces `pymupdf-document.md` and `llamaparse-document.md`. + +### Inline Output + +Use `--inline` with a single file to print markdown directly to stdout with a YAML frontmatter header — useful for shell pipelines: ```bash -parxy markdown --combine -o output/ doc1.pdf doc2.pdf doc3.pdf +parxy markdown document.pdf --inline +parxy markdown document.pdf --inline | your-tool ``` -This will generate a file named `combined_output.md` in the output directory. +Output format: + +```markdown +--- +file: "document.pdf" +pages: 10 +--- + +# Document heading +... +``` ## Manipulating PDFs @@ -317,7 +358,7 @@ With the CLI, you can use Parxy as a **standalone document parsing tool** — id |------------------|--------------------------------------------------------------| | `parxy parse` | Extract text from documents with multiple formats & drivers | | `parxy preview` | Interactive document viewer with metadata and TOC | -| `parxy markdown` | Generate Markdown output | +| `parxy markdown` | Generate Markdown files with driver prefix naming | | `parxy pdf:merge`| Merge multiple PDF files with page range support | | `parxy pdf:split`| Split PDF files into individual pages | | `parxy drivers` | List supported drivers | diff --git a/src/parxy_cli/commands/markdown.py b/src/parxy_cli/commands/markdown.py index 1aaab87..dfba9f6 100644 --- a/src/parxy_cli/commands/markdown.py +++ b/src/parxy_cli/commands/markdown.py @@ -1,12 +1,16 @@ -import os -import time +"""Markdown export command for Parxy document processing.""" + +from datetime import timedelta +from pathlib import Path from typing import Optional, List, Annotated import typer from parxy_core.facade import Parxy + from parxy_cli.models import Level from parxy_cli.console.console import Console +from parxy_cli.commands.parse import collect_files, format_timedelta app = typer.Typer() @@ -15,22 +19,18 @@ @app.command() def markdown( - files: Annotated[ + inputs: Annotated[ List[str], typer.Argument( - help='One or more files to parse', - exists=True, - file_okay=True, - dir_okay=False, - readable=True, + help='One or more files or folders to parse. Use --recursive to search subdirectories.', ), ], - driver: Annotated[ - Optional[str], + drivers: Annotated[ + Optional[List[str]], typer.Option( '--driver', '-d', - help='Driver to use for parsing (default: pymupdf or PARXY_DEFAULT_DRIVER)', + help='Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER)', ), ] = None, level: Annotated[ @@ -41,99 +41,189 @@ def markdown( help='Extraction level', ), ] = Level.BLOCK, - env_file: Annotated[ - str, - typer.Option( - '--env', - '-e', - help='Path to .env file with configuration', - exists=True, - file_okay=True, - dir_okay=False, - readable=True, - ), - ] = '.env', output_dir: Annotated[ Optional[str], typer.Option( '--output', '-o', - help='Directory to save markdown files. If not specified, output will be printed to console', + help='Directory to save markdown files. If not specified, files are saved next to the source files.', dir_okay=True, file_okay=False, ), ] = None, - combine: Annotated[ + inline: Annotated[ + bool, + typer.Option( + '--inline', + '-i', + help='Output markdown to stdout with file name as YAML frontmatter. Only valid with a single file.', + ), + ] = False, + recursive: Annotated[ + bool, + typer.Option( + '--recursive', + '-r', + help='Recursively search subdirectories when processing folders', + ), + ] = False, + max_depth: Annotated[ + Optional[int], + typer.Option( + '--max-depth', + help='Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc.', + min=0, + ), + ] = None, + stop_on_failure: Annotated[ bool, typer.Option( - '--combine', - '-c', - help='Combine all documents into a single markdown file', + '--stop-on-failure', + help='Stop processing files immediately if an error occurs with any file', ), ] = False, + workers: Annotated[ + int, + typer.Option( + '--workers', + '-w', + help='Number of parallel workers to use. Defaults to cpu count.', + min=1, + ), + ] = None, ): - """Parse documents to Markdown.""" + """Parse documents to Markdown. + + Examples: + + # Parse a single file + parxy markdown document.pdf + + # Parse with a specific driver and output to a folder + parxy markdown document.pdf -d pymupdf -o output/ + + # Parse all PDFs in a folder (non-recursive by default) + parxy markdown /path/to/folder + + # Parse recursively with multiple drivers + parxy markdown /path/to/folder --recursive -d pymupdf -d llamaparse + + # Output to stdout as YAML-frontmattered markdown (single file only) + parxy markdown document.pdf --inline + """ + console.action('Markdown export', space_after=False) + + # Collect all files + files = collect_files(inputs, recursive=recursive, max_depth=max_depth) + + if not files: + console.warning('No suitable files found to process.', panel=True) + raise typer.Exit(1) + + if inline and len(files) > 1: + console.error('--inline can only be used with a single file') + raise typer.Exit(1) + + # Use default driver if none specified + if not drivers: + drivers = [Parxy.default_driver()] + + output_path = Path(output_dir) if output_dir else None + + total_tasks = len(files) * len(drivers) + error_count = 0 + try: - # Create output directory if specified - if output_dir: - os.makedirs(output_dir, exist_ok=True) - - # For combined output - combined_content = [] - - # Process each file - for file_path in files: - try: - with console.shimmer(f'Processing {file_path}...'): - # Parse the document - doc = Parxy.parse( - file=file_path, - level=level.value, - driver_name=driver, + with console.shimmer( + f'Processing {len(files)} file{"s" if len(files) > 1 else ""} with {len(drivers)} driver{"s" if len(drivers) > 1 else ""}...' + ): + with console.progress('Processing documents') as progress: + task = progress.add_task('', total=total_tasks) + + batch_tasks = [str(f) for f in files] + + for result in Parxy.batch_iter( + tasks=batch_tasks, + drivers=drivers, + level=level.value, + workers=workers, + ): + file_name = ( + Path(result.file).name + if isinstance(result.file, str) + else 'document' ) - console.action(file_path) - console.faint(f'{len(doc.pages)} pages extracted.') - - # Prepare markdown content - file_info = f"""```yaml -file: "{file_path}" -pages: {len(doc.pages)} -```""" - header = f'# {os.path.basename(file_path)}\n' - content = doc.markdown() - - markdown_content = f'{file_info}\n{header}\n{content}' - - if output_dir and not combine: - # Generate output filename - base_name = os.path.splitext(os.path.basename(file_path))[0] - output_path = os.path.join(output_dir, f'{base_name}.md') - - # Save to file - with open(output_path, 'w', encoding='utf-8') as f: - f.write(markdown_content) - console.success(f'Saved to: {output_path}.') - - elif not output_dir: - # Print to console - console.markdown(markdown_content) - console.rule() - console.newline() - - if combine: - combined_content.append(markdown_content) - - except Exception as e: - console.error(f'Error processing {file_path}: {str(e)}') - - # Save combined content if requested - if combine and output_dir and combined_content: - output_path = os.path.join(output_dir, 'combined_output.md') - with open(output_path, 'w', encoding='utf-8') as f: - f.write('\n\n---\n\n'.join(combined_content)) - console.success(f'Combined output saved to: {output_path}') - - except Exception as e: - console.error(f'Error: {str(e)}') - raise typer.Exit() + if result.success: + doc = result.document + file_path = ( + Path(result.file) + if isinstance(result.file, str) + else Path('document') + ) + + content = doc.markdown() + + if inline: + frontmatter = f'---\nfile: "{result.file}"\npages: {len(doc.pages)}\n---\n\n' + console.print(frontmatter + content) + else: + if output_path: + output_path.mkdir(parents=True, exist_ok=True) + save_dir = output_path + else: + save_dir = file_path.parent + + base_name = file_path.stem + if result.driver: + base_name = f'{result.driver}-{base_name}' + + out_file = save_dir / f'{base_name}.md' + out_file.write_text(content, encoding='utf-8') + + console.print( + f'[faint]⎿ [/faint] {file_name} via {result.driver} to [success]{out_file}[/success] [faint]({len(doc.pages)} pages)[/faint]' + ) + else: + console.print( + f'[faint]⎿ [/faint] {file_name} via {result.driver} error. [error]{result.error}[/error]' + ) + error_count += 1 + + if stop_on_failure: + console.newline() + console.info( + 'Stopping due to error (--stop-on-failure flag is set)' + ) + raise typer.Exit(1) + + progress.update(task, advance=1) + + elapsed_time = format_timedelta( + timedelta(seconds=max(0, progress.tasks[0].elapsed)) + ) + except KeyboardInterrupt: + console.newline() + console.warning('Interrupted by user') + raise typer.Exit(130) + + if not inline: + console.newline() + + if error_count == len(files) * len(drivers): + console.error('All files were not processed due to errors') + return + + if error_count > 0: + console.warning( + f'Processed {len(files)} file{"s" if len(files) > 1 else ""} with warnings using {len(drivers)} driver{"s" if len(drivers) > 1 else ""}' + ) + console.print( + f'[faint]⎿ [/faint] [highlight]{error_count} files errored[/highlight]' + ) + return + + if not inline: + console.success( + f'Processed {len(files)} file{"s" if len(files) > 1 else ""} using {len(drivers)} driver{"s" if len(drivers) > 1 else ""} (took {elapsed_time})' + ) diff --git a/tests/commands/test_markdown.py b/tests/commands/test_markdown.py index a44b907..88b4d74 100644 --- a/tests/commands/test_markdown.py +++ b/tests/commands/test_markdown.py @@ -1,12 +1,13 @@ """Test suite for the markdown command.""" -from unittest.mock import patch, MagicMock +from pathlib import Path +from unittest.mock import patch import pytest from typer.testing import CliRunner from click.utils import strip_ansi from parxy_cli.commands.markdown import app -from parxy_core.models import Document, Page +from parxy_core.models import Document, Page, BatchResult @pytest.fixture @@ -21,117 +22,259 @@ def mock_document(): return Document(pages=[Page(number=0, text='# Test heading\n\nTest content')]) -def test_markdown_command_calls_facade_correctly(runner, mock_document): - """Test that the markdown command correctly invokes the Parxy facade.""" +@pytest.fixture +def pdf_file(tmp_path): + """Fixture providing a real temporary PDF file.""" + pdf = tmp_path / 'test.pdf' + pdf.write_bytes(b'%PDF-1.4 fake content') + return pdf + + +def test_markdown_command_saves_file_with_driver_prefix( + runner, mock_document, pdf_file +): + """Test that output file is named with driver prefix, saved next to source file.""" with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: - # Setup the mock to return our test document - mock_parxy.parse.return_value = mock_document + mock_parxy.default_driver.return_value = 'pymupdf' + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf_file), + driver='pymupdf', + document=mock_document, + error=None, + ) + ] + ) - # Run the command with a test file - result = runner.invoke(app, ['test.pdf']) + result = runner.invoke(app, [str(pdf_file)]) - # Assert the command executed successfully assert result.exit_code == 0 - # Assert Parxy.parse was called with the correct arguments - mock_parxy.parse.assert_called_once_with( - file='test.pdf', - level='block', # default level - driver_name=None, # default driver + mock_parxy.batch_iter.assert_called_once_with( + tasks=[str(pdf_file)], + drivers=['pymupdf'], + level='block', + workers=None, + ) + + expected_output = pdf_file.parent / 'pymupdf-test.md' + assert expected_output.exists() + assert '# Test heading' in expected_output.read_text() + + +def test_markdown_command_with_output_directory( + runner, mock_document, pdf_file, tmp_path +): + """Test that files are saved in the specified output directory with driver prefix.""" + + output_dir = tmp_path / 'output' + + with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: + mock_parxy.default_driver.return_value = 'pymupdf' + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf_file), + driver='pymupdf', + document=mock_document, + error=None, + ) + ] ) - # Clean ANSI color codes from output and verify content - cleaned_output = strip_ansi(result.stdout) - assert 'test.pdf' in cleaned_output - assert 'pages: 1' in cleaned_output - assert '# Test heading' in cleaned_output + result = runner.invoke(app, [str(pdf_file), '--output', str(output_dir)]) + + assert result.exit_code == 0 + expected_output = output_dir / 'pymupdf-test.md' + assert expected_output.exists() + assert '# Test heading' in expected_output.read_text() -def test_markdown_command_with_custom_options(runner, mock_document): - """Test that the markdown command correctly handles custom options.""" + +def test_markdown_command_with_custom_level(runner, mock_document, pdf_file): + """Test that the --level option is passed through to batch_iter.""" with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: - mock_parxy.parse.return_value = mock_document + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf_file), + driver='llamaparse', + document=mock_document, + error=None, + ) + ] + ) - # Run command with custom options result = runner.invoke( - app, ['test.pdf', '--driver', 'pymupdf', '--level', 'page'] + app, [str(pdf_file), '--driver', 'llamaparse', '--level', 'page'] ) assert result.exit_code == 0 - # Assert Parxy.parse was called with custom options - mock_parxy.parse.assert_called_once_with( - file='test.pdf', level='page', driver_name='pymupdf' + mock_parxy.batch_iter.assert_called_once_with( + tasks=[str(pdf_file)], + drivers=['llamaparse'], + level='page', + workers=None, ) -def test_markdown_command_with_output_directory(runner, mock_document, tmp_path): - """Test that the markdown command correctly handles file output.""" +def test_markdown_command_with_multiple_drivers( + runner, mock_document, pdf_file, tmp_path +): + """Test that multiple drivers produce separate output files.""" + + output_dir = tmp_path / 'output' with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: - mock_parxy.parse.return_value = mock_document + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf_file), + driver='pymupdf', + document=mock_document, + error=None, + ), + BatchResult( + file=str(pdf_file), + driver='llamaparse', + document=mock_document, + error=None, + ), + ] + ) + + result = runner.invoke( + app, + [ + str(pdf_file), + '--driver', + 'pymupdf', + '--driver', + 'llamaparse', + '--output', + str(output_dir), + ], + ) - # Create output path using tmp_path fixture - output_dir = tmp_path / 'output' + assert result.exit_code == 0 + assert (output_dir / 'pymupdf-test.md').exists() + assert (output_dir / 'llamaparse-test.md').exists() + + +def test_markdown_command_inline_outputs_to_stdout(runner, mock_document, pdf_file): + """Test that --inline prints YAML-frontmattered markdown to stdout without saving a file.""" + + with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: + mock_parxy.default_driver.return_value = 'pymupdf' + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf_file), + driver='pymupdf', + document=mock_document, + error=None, + ) + ] + ) - # Run command with output directory - result = runner.invoke(app, ['test.pdf', '--output', str(output_dir)]) + result = runner.invoke(app, [str(pdf_file), '--inline']) assert result.exit_code == 0 - # Verify the output file was created - output_file = output_dir / 'test.md' - assert output_file.exists() - # Verify file content contains markdown - content = output_file.read_text() - assert 'file: "test.pdf"' in content - assert 'pages: 1' in content - assert '# Test heading' in content + cleaned = strip_ansi(result.stdout) + assert '---' in cleaned + assert 'file:' in cleaned + assert 'pages: 1' in cleaned + assert '# Test heading' in cleaned + + # No output file should be written + assert not (pdf_file.parent / 'pymupdf-test.md').exists() + +def test_markdown_command_inline_rejected_with_multiple_files(runner, tmp_path): + """Test that --inline exits with an error when more than one file is provided.""" -def test_markdown_command_with_combine_option(runner, mock_document, tmp_path): - """Test that the markdown command correctly handles combining multiple files.""" + pdf1 = tmp_path / 'a.pdf' + pdf2 = tmp_path / 'b.pdf' + pdf1.write_bytes(b'%PDF fake') + pdf2.write_bytes(b'%PDF fake') with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: - mock_parxy.parse.return_value = mock_document + mock_parxy.default_driver.return_value = 'pymupdf' + mock_parxy.batch_iter.return_value = iter([]) - # Create output path using tmp_path fixture - output_dir = tmp_path / 'output' + result = runner.invoke(app, [str(pdf1), str(pdf2), '--inline']) - # Run command with combine option and multiple files - result = runner.invoke( - app, ['test1.pdf', 'test2.pdf', '--output', str(output_dir), '--combine'] + assert result.exit_code == 1 + assert '--inline' in strip_ansi(result.stdout) + + +def test_markdown_command_handles_errors(runner, pdf_file): + """Test that per-file errors are reported and processing continues.""" + + with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: + mock_parxy.default_driver.return_value = 'pymupdf' + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf_file), + driver='pymupdf', + document=None, + error='Parse failed', + ) + ] ) - assert result.exit_code == 0 + result = runner.invoke(app, [str(pdf_file)]) - # Verify the combined output file was created - output_file = output_dir / 'combined_output.md' - assert output_file.exists() + cleaned = strip_ansi(result.stdout) + assert 'Parse failed' in cleaned - # Verify file contains both documents - content = output_file.read_text() - assert '# test1.pdf' in content - assert '# test2.pdf' in content - assert content.count('# Test heading') == 2 # One for each input file +def test_markdown_command_stop_on_failure(runner, mock_document, tmp_path): + """Test that --stop-on-failure exits immediately on first error.""" -def test_markdown_command_handles_errors(runner): - """Test that the markdown command properly handles and displays errors.""" + pdf1 = tmp_path / 'a.pdf' + pdf2 = tmp_path / 'b.pdf' + pdf1.write_bytes(b'%PDF fake') + pdf2.write_bytes(b'%PDF fake') with patch('parxy_cli.commands.markdown.Parxy') as mock_parxy: - # Setup the mock to raise an exception - mock_parxy.parse.side_effect = Exception('Test error') + mock_parxy.default_driver.return_value = 'pymupdf' + mock_parxy.batch_iter.return_value = iter( + [ + BatchResult( + file=str(pdf1), + driver='pymupdf', + document=None, + error='Parse failed', + ), + BatchResult( + file=str(pdf2), + driver='pymupdf', + document=mock_document, + error=None, + ), + ] + ) - # Run the command - result = runner.invoke(app, ['test.pdf']) + result = runner.invoke(app, [str(pdf1), str(pdf2), '--stop-on-failure']) - # Clean ANSI codes and verify error message - cleaned_output = strip_ansi(result.stdout) - assert 'Error processing test.pdf' in cleaned_output - assert 'Test error' in cleaned_output + assert result.exit_code == 1 + assert 'stopping due to error' in strip_ansi(result.stdout).lower() - # Unlike parse command, markdown continues on individual file errors - assert result.exit_code == 0 + +def test_markdown_command_no_files_found(runner, tmp_path): + """Test that the command exits with an error when no PDF files are found.""" + + empty_dir = tmp_path / 'empty' + empty_dir.mkdir() + + with patch('parxy_cli.commands.markdown.Parxy'): + result = runner.invoke(app, [str(empty_dir)]) + + assert result.exit_code == 1