invopop · pmenendz · Jul 23, 2025 · Jul 20, 2025 · Jul 20, 2025 · Jul 23, 2025
diff --git a/config.yaml b/config.yaml
@@ -24,4 +24,14 @@ mcp:
 chat:
   welcome_message: "Welcome to Invopop Expert! Ask questions about GOBL, Invopop and the invopop/gobl library (type 'exit' to quit)"
   input_prompt: "Enter your multi-line question. Press Enter on an empty line to send."
-  max_history: 50
+  max_history: 50
+
+vector_store:
+  id: ""  # OpenAI vector store ID queried by the official docs search tool
+  max_results: 10
+  score_threshold: 0.1
+  default_filters:
+    country: "es"  # Spain
+    subject: "verifactu"  # VeriFactu-specific docs
+
+checkpointer: "memory"
diff --git a/src/expert/agent.py b/src/expert/agent.py
@@ -10,6 +10,7 @@
 from opik.integrations.langchain import OpikTracer
 
 from .config import Config
+from .official_docs_tool import OfficialDocsTool
 
 AVAILABLE_REPOS = [
     "invopop/gobl",
@@ -31,6 +32,10 @@ def __init__(self, config: Config):
         self._load_prompts()
         self.opik_config = None
         self.mcp_client = None
+        self.official_docs_tool = OfficialDocsTool(
+            vector_store_id=self.config.vector_store_config["id"],
+            max_results=self.config.vector_store_config.get("max_results", 10),
+        )
 
     def _load_prompts(self):
         """Load prompt templates from files."""
@@ -88,6 +93,9 @@ async def setup(self):
             )
             renamed_tools.append(renamed_tool)
 
+        # Add the official docs tool
+        renamed_tools.append(self.official_docs_tool.get_tool())
+
         # Create the agent
         llm_config = self.config.llm_config
         provider = llm_config.get("provider", "openai")
@@ -170,6 +178,11 @@ async def get_response_with_context(self, messages: list[dict], thread_id: str)
                                     f"🔍 Searching {repo_name} repo:",
                                     tool_call["function"]["arguments"],
                                 )
+                            elif func_name == "search_official":
+                                print(
+                                    "🔍 Searching official docs:",
+                                    tool_call["function"]["arguments"],
+                                )
                     # Update the final message content
                     final_message_content = message.content
 

diff --git a/src/expert/config.py b/src/expert/config.py
@@ -101,6 +101,11 @@ def chat_config(self) -> dict[str, Any]:
         """Get chat configuration."""
         return self.config.get("chat", {})
 
+    @property
+    def vector_store_config(self) -> dict[str, Any]:
+        """Get vector store configuration ({} when the section is missing or null)."""
+        return self.config.get("vector_store") or {}
+
     @property
     def checkpointer(self) -> BaseCheckpointSaver | None:
         """Get checkpointer configuration."""

diff --git a/src/expert/official_docs_tool.py b/src/expert/official_docs_tool.py
@@ -0,0 +1,117 @@
+"""RAG search tool using OpenAI Vector Store."""
+
+from pathlib import Path
+
+from langchain_core.tools import StructuredTool
+from openai import OpenAI
+from pydantic import BaseModel, Field
+
+
+class OfficialDocsInput(BaseModel):
+    """Arguments of the official docs search tool; both filters are optional."""
+
+    query: str = Field(description="The search query to find relevant documentation")
+    country: str | None = Field(
+        description="Country filter for the search (for now only 'es' for Spain)", default="es"
+    )
+    subject: str | None = Field(
+        description="Subject filter for the search\n"
+        "('regulation' for general regulations or 'verifactu' for VeriFactu-specific docs)",
+        default="verifactu",
+    )
+
+
+class OfficialDocsTool:
+    """RAG search tool using OpenAI Vector Store, exposed as a LangChain tool."""
+
+    # (attribute key, label) pairs shown after each result heading, in this order.
+    _ATTRIBUTE_LABELS = (
+        ("subject", "Subject"),
+        ("type", "Type"),
+        ("country", "Country"),
+        ("format", "Format"),
+        ("url", "Source"),
+    )
+
+    def __init__(self, vector_store_id: str, max_results: int = 10):
+        """Create the OpenAI client and load the tool description.
+
+        Args:
+            vector_store_id: ID of the OpenAI vector store to query.
+            max_results: Maximum number of results included in a response.
+        """
+        self.client = OpenAI()
+        self.vector_store_id = vector_store_id
+        self.max_results = max_results
+
+        # Read as UTF-8 regardless of locale; a missing file raises OSError here.
+        description_path = Path(__file__).parent / "prompts" / "official_docs_description.md"
+        self.official_docs_description = description_path.read_text(encoding="utf-8").strip()
+
+    def search(
+        self, query: str, country: str | None = "es", subject: str | None = "verifactu"
+    ) -> str:
+        """Search the vector store and format the matches as Markdown.
+
+        Args:
+            query: The search query.
+            country: Country filter (default: "es"); a falsy value disables it.
+            subject: Subject filter (default: "verifactu"); a falsy value disables it.
+
+        Returns:
+            The formatted results, a "no documents" notice, or an error message.
+            Failures are returned as text rather than raised, so the caller
+            always receives a string.
+        """
+        try:
+            # Only filters that have a value become equality clauses.
+            clauses = [
+                {"type": "eq", "key": key, "value": value}
+                for key, value in (("country", country), ("subject", subject))
+                if value
+            ]
+
+            # Request `max_results` hits explicitly (the API defaults to 10 and
+            # accepts 1-50), otherwise a larger configured limit is never honoured.
+            params: dict = {"max_num_results": min(max(self.max_results, 1), 50)}
+            if clauses:
+                # Leave `filters` out entirely instead of sending an explicit null.
+                params["filters"] = {"type": "and", "filters": clauses}
+            results = self.client.vector_stores.search(
+                vector_store_id=self.vector_store_id, query=query, **params
+            )
+
+            hits = results.model_dump().get("data") or []
+            if not hits:
+                return (
+                    "No relevant documents found for your query with filters "
+                    f"(country: {country}, subject: {subject})."
+                )
+
+            formatted_results = []
+            for i, hit in enumerate(hits[: self.max_results], 1):
+                texts = [c["text"] for c in hit.get("content") or [] if c.get("text")]
+                # `attributes` may be None in the dumped response; treat it as empty.
+                attributes = hit.get("attributes") or {}
+                attr_info = [
+                    f"{label}: {attributes[key]}"
+                    for key, label in self._ATTRIBUTE_LABELS
+                    if attributes.get(key)
+                ]
+                attr_str = f" ({', '.join(attr_info)})" if attr_info else ""
+                result_text = "\n".join(texts) if texts else "No content available"
+                formatted_results.append(f"## Result {i}{attr_str}\n\n{result_text}")
+
+            filter_info = f"**Filtered by:** Country='{country}', Subject='{subject}'\n\n"
+            return filter_info + "\n\n---\n\n".join(formatted_results)
+        except Exception as e:  # tool boundary: report the failure as the tool output
+            return f"Error performing search: {e}"
+
+    def get_tool(self) -> StructuredTool:
+        """Return `search` wrapped as the "search_official" LangChain tool."""
+        return StructuredTool.from_function(
+            func=self.search,
+            name="search_official",
+            description=self.official_docs_description,
+            args_schema=OfficialDocsInput,
+        )
diff --git a/src/expert/prompts/official_docs_description.md b/src/expert/prompts/official_docs_description.md
@@ -0,0 +1,31 @@
+Search through official VeriFactu documentation using semantic search with country and subject filters.
+
+This tool provides access to official Spanish government documentation for VeriFactu, filterable by country and subject.
+
+**Filters Available:**
+- Country: Currently supports 'es' (Spain)
+- Subject: 'verifactu' for VeriFactu-specific docs or 'regulation' for general regulations
+
+**Documentation Content:**
+- Official VeriFactu regulations and legal requirements
+- VeriFactu XML schema specifications and field definitions
+- Validation rules and business requirements
+- Error codes and troubleshooting guides
+- Technical implementation requirements
+- Spanish tax compliance procedures
+
+**Use this tool to find relevant information about:**
+- VeriFactu XML format and structure
+- Required vs optional fields and elements
+- Validation rules and constraints
+- Error codes and their meanings
+- Spanish tax compliance requirements
+- Official government regulations and updates
+- Troubleshooting common VeriFactu issues
+
+**Parameters:**
+- query: Your search question
+- country: Filter by country (default: 'es' for Spain)
+- subject: Filter by subject (default: 'verifactu' for VeriFactu docs; 'regulation' for general regulations)
+
+The search returns filtered, relevant excerpts from official documentation to help answer compliance questions.