Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/moss-cli/src/moss_cli/commands/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _client(ctx: typer.Context) -> MossClient:
def add(
ctx: typer.Context,
index_name: str = typer.Argument(..., help="Index name"),
file: str = typer.Option(..., "--file", "-f", help="Path to JSON/CSV document file, or '-' for stdin"),
file: str = typer.Option(..., "--file", "-f", help="Path to JSON/CSV document file, or document file (PDF, DOCX, etc.), or '-' for stdin"),
profile: Optional[str] = typer.Option(
None, "--profile", help="Credential profile name"
),
Expand Down
45 changes: 42 additions & 3 deletions packages/moss-cli/src/moss_cli/documents.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Load documents from JSON/CSV files or stdin."""
"""Load documents from JSON/CSV files, document files, or stdin."""

from __future__ import annotations

Expand All @@ -11,9 +11,17 @@
import typer
from moss import DocumentInfo

# Import moss-doc-parser for file parsing
try:
from moss_doc_parser import FileTypeDetector

DOC_PARSER_AVAILABLE = True
except ImportError:
DOC_PARSER_AVAILABLE = False


def load_documents(file_path: str) -> List[DocumentInfo]:
"""Load documents from a JSON/CSV file or stdin ('-')."""
"""Load documents from a JSON/CSV file, document file, or stdin ('-')."""
if file_path == "-":
raw = sys.stdin.read()
return _parse_json_docs(raw, source="stdin")
Expand All @@ -22,9 +30,13 @@ def load_documents(file_path: str) -> List[DocumentInfo]:
if not path.exists():
raise typer.BadParameter(f"File not found: {file_path}")

# Check if it's a supported document file for parsing
suffix = path.suffix.lower()
if DOC_PARSER_AVAILABLE and suffix in ['.pdf', '.docx', '.pptx', '.html', '.htm', '.md', '.markdown']:
return _parse_document_file(str(path))

# Otherwise treat as JSON/CSV
content = path.read_text()

if suffix == ".csv":
return _parse_csv_docs(content)
elif suffix == ".jsonl":
Expand All @@ -35,6 +47,33 @@ def load_documents(file_path: str) -> List[DocumentInfo]:
return _parse_json_docs(content, source=file_path)


def _parse_document_file(file_path: str) -> List[DocumentInfo]:
    """Parse a document file with moss-doc-parser and convert it to DocumentInfo objects.

    Args:
        file_path: Path to the document file (e.g. PDF, DOCX, PPTX, HTML, Markdown).

    Returns:
        One DocumentInfo per document produced by the selected parser, carrying
        over its id, text and metadata unchanged.

    Raises:
        typer.BadParameter: If moss-doc-parser is not installed, or if parser
            selection, parsing, or conversion fails for any reason. In the
            latter case the underlying exception is attached as the cause.
    """
    if not DOC_PARSER_AVAILABLE:
        raise typer.BadParameter(
            f"Document parsing not available. Please install moss-doc-parser to parse {file_path}"
        )

    try:
        parser = FileTypeDetector().get_parser_for_file(file_path)
        parse_result = parser.parse(file_path)

        # Convert MossDocument objects to DocumentInfo objects.
        # NOTE(review): parsers may emit non-string metadata values (the DOCX
        # parser stores integer paragraph numbers) — confirm DocumentInfo
        # accepts them as-is.
        return [
            DocumentInfo(
                id=doc.id,
                text=doc.text,
                metadata=doc.metadata,
            )
            for doc in parse_result.documents
        ]
    except Exception as exc:
        # CLI boundary: surface every failure as a parameter error, but keep
        # the original exception as the explicit cause for debugging.
        raise typer.BadParameter(
            f"Failed to parse document {file_path}: {exc}"
        ) from exc


def _parse_json_docs(raw: str, source: str = "input") -> List[DocumentInfo]:
try:
data = json.loads(raw)
Expand Down
67 changes: 67 additions & 0 deletions packages/moss-doc-parser/pyproject.toml
Comment thread
PredictiveManish marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
[build-system]
# The [project] table uses the PEP 639 `license-files` key, which setuptools
# only accepts from 77.0 onwards; older releases fail pyproject validation.
requires = ["setuptools>=77.0", "wheel"]
build-backend = "setuptools.build_meta"

# Core package metadata for the moss-doc-parser distribution.
[project]
name = "moss-doc-parser"
version = "0.1.0"
description = "Document parsing utilities for Moss semantic search"
readme = "README.md"
# PEP 639 style key: paths of the license files to ship with the distribution.
license-files = ["LICENSE"]
authors = [
{ name = "InferEdge Inc.", email = "[email protected]" }
]
keywords = ["search", "semantic", "document", "parser", "moss"]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Software Development :: Libraries :: Python Modules",
]
requires-python = ">=3.10"
# Runtime dependencies. python-docx provides the `docx` module imported by the
# DOCX parser; python-magic provides the `magic` module that the file type
# detector imports optionally (it falls back to extension-based detection when
# the import fails). The remaining packages presumably back the PDF, PPTX,
# HTML and Markdown parsers — verify against those modules.
dependencies = [
"pypdf>=3.0",
"python-docx>=1.0",
"python-pptx>=0.6",
"beautifulsoup4>=4.12",
"python-magic>=0.4",
"markdown>=3.0",
"typing-extensions>=4.0.0",
]

# Extras installed with `pip install moss-doc-parser[dev]`: test, format,
# lint, type-check and release tooling.
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"black>=24.0.0",
"isort>=5.0.0",
"flake8>=7.0.0",
"mypy>=1.0.0",
"build>=1.0.0",
"twine>=5.0.0",
]

# src/ layout: packages are discovered under the src directory.
[tool.setuptools.packages.find]
where = ["src"]

# Maps the root package namespace to src/ to match the discovery rule above.
[tool.setuptools.package-dir]
"" = "src"

[tool.black]
line-length = 88
target-version = ['py310']

# Keep isort's output compatible with black (same profile and line length).
[tool.isort]
profile = "black"
line_length = 88

[tool.mypy]
python_version = "3.10"
warn_return_any = false
warn_unused_configs = true
# Every function, including __init__, must carry type annotations.
disallow_untyped_defs = true
# Third-party packages without type stubs are treated as untyped instead of erroring.
ignore_missing_imports = true
12 changes: 12 additions & 0 deletions packages/moss-doc-parser/src/moss_doc_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Moss document parser package."""

from .detector import FileTypeDetector
from .base import BaseParser
from .types import MossDocument, ParseResult

__all__ = [
"FileTypeDetector",
"BaseParser",
"MossDocument",
"ParseResult",
]
31 changes: 31 additions & 0 deletions packages/moss-doc-parser/src/moss_doc_parser/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Abstract base class for document parsers."""

from abc import ABC, abstractmethod
from typing import List

from .types import ParseResult


class BaseParser(ABC):
    """Interface that every concrete document parser implements.

    Subclasses must provide both `parse` and `supported_extensions`; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def parse(self, file_path: str) -> ParseResult:
        """Parse the file at *file_path*.

        Args:
            file_path: Path to the file to parse.

        Returns:
            A ParseResult holding the parsed documents and parse metadata.
        """

    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """Report which file extensions this parser handles.

        Returns:
            Extensions without the leading dot, e.g. ``['pdf', 'docx']``.
        """
83 changes: 83 additions & 0 deletions packages/moss-doc-parser/src/moss_doc_parser/detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""File type detector for document parsers."""

from typing import Dict, List, Type

from .base import BaseParser
from .parsers.html import HTMLParser
from .parsers.docx import DocxParser
from .parsers.markdown import MarkdownParser
from .parsers.pdf import PDFParser
from .parsers.pptx import PPTXParser
Comment thread
PredictiveManish marked this conversation as resolved.


class FileTypeDetector:
    """Detects a file's type and returns the appropriate parser.

    Detection is extension-based first. When the extension is unknown and the
    optional `magic` module could be imported, the MIME type reported for the
    file is used as a fallback.
    """

    # MIME type -> key into the parser registry, used by the content-based
    # fallback. Built once instead of on every lookup.
    _MIME_TO_EXTENSION: Dict[str, str] = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
        "text/html": "html",
        # Assume markdown for plain text. This means plain text files without
        # an extension are processed as Markdown. Files with a registered
        # extension never reach this table because extension-based detection
        # takes precedence.
        "text/plain": "md",
    }

    def __init__(self) -> None:
        """Build the extension -> parser registry and probe for `magic`."""
        self._parsers: Dict[str, Type[BaseParser]] = {
            "pdf": PDFParser,
            "docx": DocxParser,
            "pptx": PPTXParser,
            "html": HTMLParser,
            "htm": HTMLParser,
            "md": MarkdownParser,
            "markdown": MarkdownParser,
        }
        # Content-based detection is optional: if `magic` cannot be imported
        # we rely on extension-based detection only.
        self._magic_available = False
        self._magic = None
        try:
            import magic

            self._magic = magic.Magic(mime=True)
            self._magic_available = True
        except ImportError:
            pass

    def get_parser_for_file(self, file_path: str) -> BaseParser:
        """Get the appropriate parser for a file.

        Args:
            file_path: Path to the file to analyze.

        Returns:
            A new instance of the parser class registered for the file's
            extension or, failing that, for its detected MIME type.

        Raises:
            ValueError: If no parser is available for the file type. When the
                MIME lookup itself failed, that failure is attached as the
                cause instead of being discarded.
        """
        # First try extension-based detection: everything after the last dot,
        # lower-cased; empty when the path contains no dot at all.
        _, dot, tail = file_path.lower().rpartition(".")
        extension = tail if dot else ""
        if extension in self._parsers:
            return self._parsers[extension]()

        # Fall back to content-based detection if available.
        detection_error = None
        if self._magic_available:
            try:
                mime_type = self._magic.from_file(file_path)
            except Exception as exc:
                # Best-effort fallback: a failed lookup means "unknown type",
                # but the reason is kept for the error raised below.
                detection_error = exc
            else:
                mapped = self._MIME_TO_EXTENSION.get(mime_type)
                if mapped is not None and mapped in self._parsers:
                    return self._parsers[mapped]()

        raise ValueError(
            f"No parser available for file: {file_path}"
        ) from detection_error

    def get_supported_extensions(self) -> List[str]:
        """Get list of all supported file extensions.

        Returns:
            List of supported file extensions (without the dot), in
            registration order.
        """
        return list(self._parsers)
Empty file.
61 changes: 61 additions & 0 deletions packages/moss-doc-parser/src/moss_doc_parser/parsers/docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""DOCX document parser."""

import time
from typing import Dict, List

from docx import Document as DocxDocument

from ..base import BaseParser
from ..types import MossDocument, ParseResult


class DocxParser(BaseParser):
    """Parser for DOCX files."""

    def parse(self, file_path: str) -> ParseResult:
        """Parse a DOCX file into one document per non-blank paragraph.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            ParseResult containing one MossDocument per paragraph whose text
            is not blank, the source path, and the elapsed parse time in
            milliseconds.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()

        # Read every paragraph's text once; the non-blank count is the same
        # for all emitted documents, so compute it up front rather than once
        # per paragraph.
        paragraph_texts = [
            paragraph.text for paragraph in DocxDocument(file_path).paragraphs
        ]
        total_paragraphs = sum(1 for text in paragraph_texts if text.strip())

        # NOTE(review): paragraph_number is the 1-based position among ALL
        # paragraphs (blank ones included) while total_paragraphs counts only
        # non-blank ones, so the number can exceed the total. Kept as-is so
        # ids and metadata stay stable; confirm which meaning is intended.
        documents = [
            MossDocument(
                id=f"{file_path}_para_{para_num}",
                text=text.strip(),
                metadata={
                    "source_file": file_path,
                    "paragraph_number": para_num + 1,  # 1-indexed for humans
                    "total_paragraphs": total_paragraphs,
                },
            )
            for para_num, text in enumerate(paragraph_texts)
            if text.strip()  # Only add non-empty paragraphs
        ]

        parse_time_ms = (time.perf_counter() - start_time) * 1000
        return ParseResult(
            documents=documents,
            source_path=file_path,
            parse_time_ms=parse_time_ms,
        )

    def supported_extensions(self) -> List[str]:
        """Return a list of file extensions this parser supports.

        Returns:
            List of file extensions (without the dot).
        """
        return ["docx"]
Loading
Loading