diff --git a/packages/moss-cli/src/moss_cli/commands/doc.py b/packages/moss-cli/src/moss_cli/commands/doc.py index be1b4c34..55d2b01b 100644 --- a/packages/moss-cli/src/moss_cli/commands/doc.py +++ b/packages/moss-cli/src/moss_cli/commands/doc.py @@ -30,7 +30,7 @@ def _client(ctx: typer.Context) -> MossClient: def add( ctx: typer.Context, index_name: str = typer.Argument(..., help="Index name"), - file: str = typer.Option(..., "--file", "-f", help="Path to JSON/CSV document file, or '-' for stdin"), + file: str = typer.Option(..., "--file", "-f", help="Path to JSON/CSV document file, or document file (PDF, DOCX, etc.), or '-' for stdin"), profile: Optional[str] = typer.Option( None, "--profile", help="Credential profile name" ), diff --git a/packages/moss-cli/src/moss_cli/documents.py b/packages/moss-cli/src/moss_cli/documents.py index 9c0631a3..34a1fdf0 100644 --- a/packages/moss-cli/src/moss_cli/documents.py +++ b/packages/moss-cli/src/moss_cli/documents.py @@ -1,4 +1,4 @@ -"""Load documents from JSON/CSV files or stdin.""" +"""Load documents from JSON/CSV files, document files, or stdin.""" from __future__ import annotations @@ -11,9 +11,17 @@ import typer from moss import DocumentInfo +# Import moss-doc-parser for file parsing +try: + from moss_doc_parser import FileTypeDetector + + DOC_PARSER_AVAILABLE = True +except ImportError: + DOC_PARSER_AVAILABLE = False + def load_documents(file_path: str) -> List[DocumentInfo]: - """Load documents from a JSON/CSV file or stdin ('-').""" + """Load documents from a JSON/CSV file, document file, or stdin ('-').""" if file_path == "-": raw = sys.stdin.read() return _parse_json_docs(raw, source="stdin") @@ -22,9 +30,13 @@ def load_documents(file_path: str) -> List[DocumentInfo]: if not path.exists(): raise typer.BadParameter(f"File not found: {file_path}") + # Check if it's a supported document file for parsing suffix = path.suffix.lower() + if DOC_PARSER_AVAILABLE and suffix in ['.pdf', '.docx', '.pptx', '.html', '.htm', 
def _parse_document_file(file_path: str) -> List[DocumentInfo]:
    """Parse a rich document file into ``DocumentInfo`` objects.

    The parser is chosen by ``FileTypeDetector`` from moss-doc-parser, and each
    parsed document is copied field by field (``id``, ``text``, ``metadata``).

    Args:
        file_path: Path to the document file (PDF, DOCX, PPTX, HTML, Markdown).

    Returns:
        One ``DocumentInfo`` per document produced by the parser.

    Raises:
        typer.BadParameter: If moss-doc-parser is not installed, or if parser
            detection, parsing, or conversion fails for any reason.
    """
    if not DOC_PARSER_AVAILABLE:
        raise typer.BadParameter(
            f"Document parsing not available. Please install moss-doc-parser to parse {file_path}"
        )

    try:
        parser = FileTypeDetector().get_parser_for_file(file_path)
        parse_result = parser.parse(file_path)

        # NOTE(review): the parsers put integers (page/paragraph numbers) into
        # metadata; confirm DocumentInfo accepts non-string metadata values.
        return [
            DocumentInfo(id=doc.id, text=doc.text, metadata=doc.metadata)
            for doc in parse_result.documents
        ]
    except Exception as e:
        # Deliberately broad: any parser failure is reported as a CLI usage
        # error. Chain the cause so the original traceback is not lost.
        raise typer.BadParameter(
            f"Failed to parse document {file_path}: {e}"
        ) from e
class BaseParser(ABC):
    """Interface that every concrete document parser must implement."""

    @abstractmethod
    def parse(self, file_path: str) -> ParseResult:
        """Parse the file at ``file_path``.

        Args:
            file_path: Path to the file to parse.

        Returns:
            A ParseResult carrying the parsed documents together with the
            source path and the parse time.
        """

    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """Report which file extensions this parser handles.

        Returns:
            Extensions without the leading dot, e.g. ``['pdf', 'docx']``.
        """
+ + Raises: + ValueError: If no parser is available for the file type. + """ + # First try extension-based detection + extension = file_path.lower().split(".")[-1] if "." in file_path else "" + if extension in self._parsers: + return self._parsers[extension]() + + # Fallback to magic byte detection if available + if self._magic_available: + try: + mime_type = self._magic.from_file(file_path) + mime_to_extension = { + "application/pdf": "pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx", + "text/html": "html", + "text/plain": "md", # Assume markdown for plain text + # Note: This means plain text files without extensions will be processed as Markdown. + # For files with known extensions (like .txt, .csv), extension-based detection takes precedence. + # This is an acceptable trade-off as the markdown parser gracefully handles plain text. + } + + extension = mime_to_extension.get(mime_type) + if extension and extension in self._parsers: + return self._parsers[extension]() + except Exception: + pass # Fall through to extension-based detection failure + + raise ValueError(f"No parser available for file: {file_path}") + + def get_supported_extensions(self) -> List[str]: + """Get list of all supported file extensions. + + Returns: + List of supported file extensions (without the dot). 
class DocxParser(BaseParser):
    """Parser for DOCX files."""

    def parse(self, file_path: str) -> ParseResult:
        """Parse a DOCX file and emit one document per non-empty paragraph.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            ParseResult whose documents hold the stripped paragraph text.
            ``paragraph_number`` is the 1-based position among *all*
            paragraphs (empty ones included), while ``total_paragraphs``
            counts only the non-empty ones.
        """
        start_time = time.time()

        doc = DocxDocument(file_path)

        # Read the paragraph list and each paragraph's text exactly once.
        # The previous version re-scanned every paragraph to recompute
        # ``total_paragraphs`` for each emitted document (quadratic work).
        # NOTE(review): text inside tables is presumably not part of
        # ``doc.paragraphs`` -- confirm whether table content should be indexed.
        stripped = [
            (para_num, paragraph.text.strip())
            for para_num, paragraph in enumerate(doc.paragraphs)
        ]
        non_empty = [(para_num, text) for para_num, text in stripped if text]
        total_paragraphs = len(non_empty)

        documents = [
            MossDocument(
                id=f"{file_path}_para_{para_num}",
                text=text,
                metadata={
                    "source_file": file_path,
                    "paragraph_number": para_num + 1,  # 1-indexed for humans
                    "total_paragraphs": total_paragraphs,
                },
            )
            for para_num, text in non_empty
        ]

        parse_time_ms = (time.time() - start_time) * 1000
        return ParseResult(
            documents=documents,
            source_path=file_path,
            parse_time_ms=parse_time_ms,
        )

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser supports.

        Returns:
            List of file extensions (without the dot).
        """
        return ["docx"]
class MarkdownParser(BaseParser):
    """Parser for Markdown files."""

    # Heading tags, in the order they are passed to ``find_all``.
    _HEADER_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    # Content tag -> (label used in the document id, value of metadata["type"]).
    _BLOCK_KINDS = {
        "p": ("para", "paragraph"),
        "pre": ("code", "code"),
        "li": ("li", "list_item"),
    }

    def __init__(self):
        self.md = markdown.Markdown(extensions=["extra", "toc"])

    def parse(self, file_path: str) -> ParseResult:
        """Parse a Markdown file and extract structured content.

        The Markdown is rendered to HTML, then every heading, paragraph,
        ``pre`` block and list item becomes one document, in document order.
        Non-heading documents record the most recent heading as context.

        Args:
            file_path: Path to the Markdown file (read as UTF-8).

        Returns:
            ParseResult containing structured documents (headers, paragraphs,
            code blocks, list items).
        """
        start_time = time.time()

        with open(file_path, "r", encoding="utf-8") as f:
            markdown_content = f.read()

        # The converter instance is reused across calls, so clear its state.
        self.md.reset()
        html = self.md.convert(markdown_content)
        soup = BeautifulSoup(html, "html.parser")

        documents: List[MossDocument] = []
        current_header = ""
        header_level = 0

        # NOTE(review): a <p> nested inside an <li> would be matched twice
        # (once per tag), duplicating its text -- confirm whether loose lists
        # should be de-duplicated.
        for element in soup.find_all([*self._HEADER_TAGS, *self._BLOCK_KINDS]):
            tag = element.name
            text = element.get_text().strip()

            if tag.startswith("h"):
                # Track heading context even when the heading text is empty.
                header_level = int(tag[1])
                current_header = text
                if text:
                    documents.append(
                        self._build_document(
                            file_path,
                            len(documents),
                            "header",
                            "header",
                            {"header_level": header_level, "header_text": text},
                        )
                    )
                continue

            kind = self._BLOCK_KINDS.get(tag)
            if kind is None or not text:
                continue
            label, type_name = kind
            documents.append(
                self._build_document(
                    file_path,
                    len(documents),
                    label,
                    type_name,
                    {
                        "header_context": current_header,
                        "header_level": header_level,
                    },
                    text=text,
                )
            )

        parse_time_ms = (time.time() - start_time) * 1000
        return ParseResult(
            documents=documents,
            source_path=file_path,
            parse_time_ms=parse_time_ms,
        )

    @staticmethod
    def _build_document(file_path, index, label, type_name, extra, text=None):
        """Build one MossDocument with the id/metadata layout shared by all element kinds."""
        metadata = {
            "source_file": file_path,
            "type": type_name,
            **extra,
            "element_index": index,
        }
        # Headers carry their own text in ``header_text``; reuse it as the body.
        body = extra["header_text"] if text is None else text
        return MossDocument(
            id=f"{file_path}_{label}_{index}",
            text=body,
            metadata=metadata,
        )

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser supports.

        Returns:
            List of file extensions (without the dot).
        """
        return ["md", "markdown"]
class PPTXParser(BaseParser):
    """Parser for PPTX files."""

    def parse(self, file_path: str) -> ParseResult:
        """Parse a PPTX file and emit one document per slide that has text.

        Args:
            file_path: Path to the PPTX file.

        Returns:
            ParseResult containing one document per non-empty slide. Each
            document's text is the slide's paragraphs joined by newlines.
        """
        start_time = time.time()

        prs = Presentation(file_path)
        total_slides = len(prs.slides)  # loop-invariant, computed once

        documents = []
        for slide_num, slide in enumerate(prs.slides):
            text = "\n".join(self._slide_paragraphs(slide)).strip()
            if not text:  # Only add non-empty slides
                continue
            documents.append(
                MossDocument(
                    id=f"{file_path}_slide_{slide_num}",
                    text=text,
                    metadata={
                        "source_file": file_path,
                        "slide_number": slide_num + 1,  # 1-indexed for humans
                        "total_slides": total_slides,
                    },
                )
            )

        parse_time_ms = (time.time() - start_time) * 1000
        return ParseResult(
            documents=documents,
            source_path=file_path,
            parse_time_ms=parse_time_ms,
        )

    @staticmethod
    def _slide_paragraphs(slide) -> List[str]:
        """Collect the non-blank paragraph texts of every text-bearing shape on a slide."""
        lines = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # Runs are pieces of one paragraph, so concatenate them
                # directly; joining runs with newlines would break a sentence
                # apart wherever its formatting changes.
                line = "".join(run.text for run in paragraph.runs)
                if line.strip():
                    lines.append(line)
        return lines

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser supports.

        Returns:
            List of file extensions (without the dot).
        """
        return ["pptx"]
+ mock_docx.return_value.paragraphs = [mock_paragraph] + + parser = DocxParser() + result = parser.parse("dummy_path.docx") + + self.assertIsInstance(result, ParseResult) + self.assertEqual(len(result.documents), 1) + self.assertEqual(result.documents[0].text, "This is a paragraph.") + self.assertIn("source_file", result.documents[0].metadata) + self.assertEqual(result.documents[0].metadata["paragraph_number"], 1) + self.assertEqual(result.documents[0].metadata["total_paragraphs"], 1) + + mock_docx.assert_called_once_with("dummy_path.docx") + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/packages/moss-doc-parser/tests/test_html_parser.py b/packages/moss-doc-parser/tests/test_html_parser.py new file mode 100644 index 00000000..17ee4c64 --- /dev/null +++ b/packages/moss-doc-parser/tests/test_html_parser.py @@ -0,0 +1,40 @@ +import unittest +from unittest.mock import patch, mock_open, MagicMock + +from moss_doc_parser.parsers.html import HTMLParser +from moss_doc_parser.types import ParseResult + + +class TestHTMLParser(unittest.TestCase): + @patch( + "builtins.open", + new_callable=mock_open, + read_data="
Paragraph text.
", + ) + @patch("moss_doc_parser.parsers.html.BeautifulSoup") + def test_parse_returns_documents(self, mock_bs, mock_file): + # Setup mock for BeautifulSoup + mock_soup = MagicMock() + mock_soup.get_text.return_value = "Title\nParagraph text." + mock_bs.return_value = mock_soup + + parser = HTMLParser() + result = parser.parse("dummy_path.html") + + # Assertions + self.assertIsInstance(result, ParseResult) + # Should have documents for each text block (title and paragraph) + self.assertGreaterEqual(len(result.documents), 1) + self.assertIn("source_file", result.documents[0].metadata) + self.assertEqual(result.documents[0].metadata["block_number"], 1) + + # Check that open was called with the right path + mock_file.assert_called_once_with("dummy_path.html", "r", encoding="utf-8") + # Check that BeautifulSoup was called with the file handle and parser + mock_bs.assert_called_once() + # Check that get_text was called + mock_soup.get_text.assert_called_once() + + +if __name__ == "__main__": + unittest.main() diff --git a/packages/moss-doc-parser/tests/test_markdown_parser.py b/packages/moss-doc-parser/tests/test_markdown_parser.py new file mode 100644 index 00000000..f9f4275e --- /dev/null +++ b/packages/moss-doc-parser/tests/test_markdown_parser.py @@ -0,0 +1,62 @@ +import unittest +from unittest.mock import patch, mock_open, MagicMock + +from moss_doc_parser.parsers.markdown import MarkdownParser +from moss_doc_parser.types import ParseResult + + +class TestMarkdownParser(unittest.TestCase): + @patch( + "builtins.open", + new_callable=mock_open, + read_data="# Title\n\nThis is a paragraph.", + ) + @patch("moss_doc_parser.parsers.markdown.markdown.Markdown") + @patch("moss_doc_parser.parsers.markdown.BeautifulSoup") + def test_parse_returns_documents(self, mock_bs, mock_md, mock_file): + # Setup mock for markdown conversion + mock_md_instance = MagicMock() + mock_md_instance.convert.return_value = ( + "This is a paragraph.
" + ) + mock_md.return_value = mock_md_instance + + # Setup mock for BeautifulSoup + mock_soup = MagicMock() + # We need to mock the find_all method to return a list of elements + # For simplicity, we'll return two elements: a header and a paragraph + mock_h1 = MagicMock() + mock_h1.name = "h1" + mock_h1.get_text.return_value = "Title" + mock_p = MagicMock() + mock_p.name = "p" + mock_p.get_text.return_value = "This is a paragraph." + mock_soup.find_all.return_value = [mock_h1, mock_p] + mock_bs.return_value = mock_soup + + parser = MarkdownParser() + result = parser.parse("dummy_path.md") + + # Assertions + self.assertIsInstance(result, ParseResult) + # We expect at least two documents: one for the header and one for the paragraph + self.assertGreaterEqual(len(result.documents), 2) + # Check that the first document is the header + self.assertEqual(result.documents[0].text, "Title") + self.assertEqual(result.documents[0].metadata["type"], "header") + # Check that the second document is the paragraph + self.assertEqual(result.documents[1].text, "This is a paragraph.") + self.assertEqual(result.documents[1].metadata["type"], "paragraph") + + # Check that open was called with the right path + mock_file.assert_called_once_with("dummy_path.md", "r", encoding="utf-8") + # Check that markdown was called + mock_md.assert_called_once() + # Check that BeautifulSoup was called with the markdown output + mock_bs.assert_called_once() + # Check that find_all was called on the soup object + mock_soup.find_all.assert_called_once() + + +if __name__ == "__main__": + unittest.main() diff --git a/packages/moss-doc-parser/tests/test_pdf_parser.py b/packages/moss-doc-parser/tests/test_pdf_parser.py new file mode 100644 index 00000000..2b2f2836 --- /dev/null +++ b/packages/moss-doc-parser/tests/test_pdf_parser.py @@ -0,0 +1,37 @@ +import unittest +from unittest.mock import patch, mock_open, MagicMock + +from moss_doc_parser.parsers.pdf import PDFParser +from moss_doc_parser.types import 
async def create_index_from_files(
    self,
    name: str,
    file_paths: List[str],
    model_id: Optional[str] = None,
) -> MutationResult:
    """Create a new index by parsing files and adding their contents.

    Supports PDF, DOCX, PPTX, HTML, and Markdown files.
    Files are parsed into DocumentInfo objects before index creation.

    Args:
        name: Name of the index to create
        file_paths: List of file paths to parse and add to the index
        model_id: Optional model ID to use for embeddings

    Returns:
        MutationResult containing job information

    Raises:
        ImportError: If the optional moss-doc-parser package is not installed.
        ValueError: If any file cannot be detected, parsed, or converted; the
            underlying exception is attached as ``__cause__``.
    """
    # Import moss-doc-parser here to avoid a hard dependency.
    try:
        from moss_doc_parser import FileTypeDetector
    except ImportError as e:
        raise ImportError(
            "moss-doc-parser is required for create_index_from_files. "
            "Install it with: pip install moss-doc-parser"
        ) from e

    detector = FileTypeDetector()

    def _parse_all() -> List[DocumentInfo]:
        """Parse every file in order and flatten the results."""
        docs: List[DocumentInfo] = []
        for file_path in file_paths:
            try:
                parser = detector.get_parser_for_file(file_path)
                parse_result = parser.parse(file_path)
                docs.extend(
                    DocumentInfo(id=doc.id, text=doc.text, metadata=doc.metadata)
                    for doc in parse_result.documents
                )
            except Exception as e:
                raise ValueError(f"Failed to parse file {file_path}: {e}") from e
        return docs

    # Parsing is synchronous file reading and decoding; run it in a worker
    # thread so the event loop is not blocked, like the index call below.
    all_docs = await asyncio.to_thread(_parse_all)

    # Create index with parsed documents
    resolved_model_id = self._resolve_model_id(all_docs, model_id)
    return await asyncio.to_thread(
        self._manage.create_index,
        name,
        all_docs,
        resolved_model_id,
    )