usemoss · PredictiveManish · Apr 29, 2026 · Apr 29, 2026 · May 1, 2026 · May 1, 2026
@@ -30,7 +30,7 @@ def _client(ctx: typer.Context) -> MossClient:
 def add(
     ctx: typer.Context,
     index_name: str = typer.Argument(..., help="Index name"),
-    file: str = typer.Option(..., "--file", "-f", help="Path to JSON/CSV document file, or '-' for stdin"),
+    file: str = typer.Option(..., "--file", "-f", help="Path to JSON/CSV document file, or document file (PDF, DOCX, etc.), or '-' for stdin"),
     profile: Optional[str] = typer.Option(
         None, "--profile", help="Credential profile name"
     ),

@@ -1,4 +1,4 @@
-"""Load documents from JSON/CSV files or stdin."""
+"""Load documents from JSON/CSV files, document files, or stdin."""
 
 from __future__ import annotations
 
@@ -11,9 +11,17 @@
 import typer
 from moss import DocumentInfo
 
+# Import moss-doc-parser for file parsing
+try:
+    from moss_doc_parser import FileTypeDetector
+
+    DOC_PARSER_AVAILABLE = True
+except ImportError:
+    DOC_PARSER_AVAILABLE = False
+
 
 def load_documents(file_path: str) -> List[DocumentInfo]:
-    """Load documents from a JSON/CSV file or stdin ('-')."""
+    """Load documents from a JSON/CSV file, document file, or stdin ('-')."""
     if file_path == "-":
         raw = sys.stdin.read()
         return _parse_json_docs(raw, source="stdin")
@@ -22,9 +30,13 @@ def load_documents(file_path: str) -> List[DocumentInfo]:
     if not path.exists():
         raise typer.BadParameter(f"File not found: {file_path}")
 
+    # Check if it's a supported document file for parsing
     suffix = path.suffix.lower()
+    if DOC_PARSER_AVAILABLE and suffix in ['.pdf', '.docx', '.pptx', '.html', '.htm', '.md', '.markdown']:
+        return _parse_document_file(str(path))
+
+    # Otherwise treat as JSON/CSV
     content = path.read_text()
-
     if suffix == ".csv":
         return _parse_csv_docs(content)
     elif suffix == ".jsonl":
@@ -35,6 +47,33 @@ def load_documents(file_path: str) -> List[DocumentInfo]:
         return _parse_json_docs(content, source=file_path)
 
 
+def _parse_document_file(file_path: str) -> List[DocumentInfo]:
+    """Parse a document file using moss-doc-parser and convert to DocumentInfo objects."""
+    if not DOC_PARSER_AVAILABLE:
+        raise typer.BadParameter(
+            f"Document parsing not available. Please install moss-doc-parser to parse {file_path}"
+        )
+
+    try:
+        detector = FileTypeDetector()
+        parser = detector.get_parser_for_file(file_path)
+        parse_result = parser.parse(file_path)
+
+        # Convert MossDocument objects to DocumentInfo objects
+        docs = []
+        for doc in parse_result.documents:
+            docs.append(
+                DocumentInfo(
+                    id=doc.id,
+                    text=doc.text,
+                    metadata=doc.metadata,
+                )
+            )
+        return docs
+    except Exception as e:
+        raise typer.BadParameter(f"Failed to parse document {file_path}: {str(e)}")
+
+
 def _parse_json_docs(raw: str, source: str = "input") -> List[DocumentInfo]:
     try:
         data = json.loads(raw)

@@ -0,0 +1,67 @@
+[build-system]
+# `license-files` under [project] is PEP 639 metadata; older setuptools
+# releases reject it, so require a version that understands it.
+requires = ["setuptools>=77.0.3", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "moss-doc-parser"
+version = "0.1.0"
+description = "Document parsing utilities for Moss semantic search"
+readme = "README.md"
+license-files = ["LICENSE"]
+authors = [
+    { name = "InferEdge Inc.", email = "[email protected]" }
+]
+keywords = ["search", "semantic", "document", "parser", "moss"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: BSD License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+requires-python = ">=3.10"
+dependencies = [
+    "pypdf>=3.0",
+    "python-docx>=1.0",
+    "python-pptx>=0.6",
+    "beautifulsoup4>=4.12",
+    "python-magic>=0.4",
+    "markdown>=3.0",
+    "typing-extensions>=4.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "black>=24.0.0",
+    "isort>=5.0.0",
+    "flake8>=7.0.0",
+    "mypy>=1.0.0",
+    "build>=1.0.0",
+    "twine>=5.0.0",
+]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-dir]
+"" = "src"
+
+[tool.black]
+line-length = 88
+target-version = ['py310']
+
+[tool.isort]
+profile = "black"
+line_length = 88
+
+[tool.mypy]
+python_version = "3.10"
+warn_return_any = false
+warn_unused_configs = true
+disallow_untyped_defs = true
+ignore_missing_imports = true
@@ -0,0 +1,12 @@
+"""Moss document parser package."""
+
+from .detector import FileTypeDetector
+from .base import BaseParser
+from .types import MossDocument, ParseResult
+
+__all__ = [
+    "FileTypeDetector",
+    "BaseParser",
+    "MossDocument",
+    "ParseResult",
+]
@@ -0,0 +1,31 @@
+"""Abstract base class for document parsers."""
+
+from abc import ABC, abstractmethod
+from typing import List
+
+from .types import ParseResult
+
+
+class BaseParser(ABC):
+    """Abstract base class for all document parsers."""
+
+    @abstractmethod
+    def parse(self, file_path: str) -> ParseResult:
+        """Parse a file and return a list of MossDocument objects.
+
+        Args:
+            file_path: Path to the file to parse.
+
+        Returns:
+            ParseResult containing the parsed documents and metadata.
+        """
+        pass
+
+    @abstractmethod
+    def supported_extensions(self) -> List[str]:
+        """Return a list of file extensions this parser supports.
+
+        Returns:
+            List of file extensions (without the dot, e.g., ['pdf', 'docx']).
+        """
+        pass
@@ -0,0 +1,83 @@
+"""File type detector for document parsers."""
+
+from pathlib import Path
+from typing import Dict, List, Type
+
+from .base import BaseParser
+from .parsers.docx import DocxParser
+from .parsers.html import HTMLParser
+from .parsers.markdown import MarkdownParser
+from .parsers.pdf import PDFParser
+from .parsers.pptx import PPTXParser
+
+
+class FileTypeDetector:
+    """Detects file type and returns appropriate parser."""
+
+    def __init__(self):
+        self._parsers: Dict[str, Type[BaseParser]] = {
+            "pdf": PDFParser,
+            "docx": DocxParser,
+            "pptx": PPTXParser,
+            "html": HTMLParser,
+            "htm": HTMLParser,
+            "md": MarkdownParser,
+            "markdown": MarkdownParser,
+        }
+        # Try to initialize python-magic, but make it optional
+        self._magic_available = False
+        self._magic = None
+        try:
+            import magic
+
+            self._magic = magic.Magic(mime=True)
+            self._magic_available = True
+        except ImportError:
+            pass  # magic not available, we'll rely on extension-based detection
+
+    def get_parser_for_file(self, file_path: str) -> BaseParser:
+        """Get the appropriate parser for a file based on its content type.
+
+        Args:
+            file_path: Path to the file to analyze.
+
+        Returns:
+            An instance of the appropriate parser class.
+
+        Raises:
+            ValueError: If no parser is available for the file type.
+        """
+        # First try extension-based detection
+        extension = file_path.lower().split(".")[-1] if "." in file_path else ""
+        if extension in self._parsers:
+            return self._parsers[extension]()
+
+        # Fallback to magic byte detection if available
+        if self._magic_available:
+            try:
+                mime_type = self._magic.from_file(file_path)
+                mime_to_extension = {
+                    "application/pdf": "pdf",
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+                    "text/html": "html",
+                    "text/plain": "md",  # Assume markdown for plain text
+                    # Note: This means plain text files without extensions will be processed as Markdown.
+                    # For files with known extensions (like .txt, .csv), extension-based detection takes precedence.
+                    # This is an acceptable trade-off as the markdown parser gracefully handles plain text.
+                }
+
+                extension = mime_to_extension.get(mime_type)
+                if extension and extension in self._parsers:
+                    return self._parsers[extension]()
+            except Exception:
+                pass  # Fall through to extension-based detection failure
+
+        raise ValueError(f"No parser available for file: {file_path}")
+
+    def get_supported_extensions(self) -> List[str]:
+        """Get list of all supported file extensions.
+
+        Returns:
+            List of supported file extensions (without the dot).
+        """
+        return list(self._parsers.keys())
@@ -0,0 +1,61 @@
+"""DOCX document parser."""
+
+import time
+from typing import Dict, List
+
+from docx import Document as DocxDocument
+
+from ..base import BaseParser
+from ..types import MossDocument, ParseResult
+
+
+class DocxParser(BaseParser):
+    """Parser for DOCX files."""
+
+    def parse(self, file_path: str) -> ParseResult:
+        """Parse a DOCX file and extract text from paragraphs.
+
+        Args:
+            file_path: Path to the DOCX file.
+
+        Returns:
+            ParseResult containing one document per paragraph (or chunked if needed).
+        """
+        start_time = time.time()
+
+        documents = []
+        doc = DocxDocument(file_path)
+
+        for para_num, paragraph in enumerate(doc.paragraphs):
+            text = paragraph.text
+            if text.strip():  # Only add non-empty paragraphs
+                doc_id = f"{file_path}_para_{para_num}"
+                metadata = {
+                    "source_file": file_path,
+                    "paragraph_number": para_num + 1,  # 1-indexed for humans
+                    "total_paragraphs": len(
+                        [p for p in doc.paragraphs if p.text.strip()]
+                    ),
+                }
+                documents.append(
+                    MossDocument(
+                        id=doc_id,
+                        text=text.strip(),
+                        metadata=metadata,
+                    )
+                )
+
+        parse_time_ms = (time.time() - start_time) * 1000
+        return ParseResult(
+            documents=documents,
+            source_path=file_path,
+            parse_time_ms=parse_time_ms,
+        )
+
+    def supported_extensions(self) -> List[str]:
+        """Return a list of file extensions this parser supports.
+
+        Returns:
+            List of file extensions (without the dot).
+        """
+        return ["docx"]