diff --git a/config.yaml b/config.yaml index c5962de..b19ef76 100644 --- a/config.yaml +++ b/config.yaml @@ -24,4 +24,14 @@ mcp: chat: welcome_message: "Welcome to Invopop Expert! Ask questions about GOBL, Invopop and the invopop/gobl library (type 'exit' to quit)" input_prompt: "Enter your multi-line question. Press Enter on an empty line to send." - max_history: 50 \ No newline at end of file + max_history: 50 + +vector_store: + id: "" #vector store id + max_results: 10 + score_threshold: 0.1 + default_filters: + country: "es" # Spain + subject: "verifactu" # VeriFactu-specific docs + +checkpointer: "memory" \ No newline at end of file diff --git a/src/expert/agent.py b/src/expert/agent.py index dcc0272..315f18b 100644 --- a/src/expert/agent.py +++ b/src/expert/agent.py @@ -10,6 +10,7 @@ from opik.integrations.langchain import OpikTracer from .config import Config +from .official_docs_tool import OfficialDocsTool AVAILABLE_REPOS = [ "invopop/gobl", @@ -31,6 +32,10 @@ def __init__(self, config: Config): self._load_prompts() self.opik_config = None self.mcp_client = None + self.official_docs_tool = OfficialDocsTool( + vector_store_id=self.config.vector_store_config["id"], + max_results=self.config.vector_store_config.get("max_results", 10), + ) def _load_prompts(self): """Load prompt templates from files.""" @@ -88,6 +93,9 @@ async def setup(self): ) renamed_tools.append(renamed_tool) + # Add the official docs tool + renamed_tools.append(self.official_docs_tool.get_tool()) + # Create the agent llm_config = self.config.llm_config provider = llm_config.get("provider", "openai") @@ -170,6 +178,11 @@ async def get_response_with_context(self, messages: list[dict], thread_id: str) f"🔍 Searching {repo_name} repo:", tool_call["function"]["arguments"], ) + elif func_name == "search_official": + print( + "🔍 Searching official docs:", + tool_call["function"]["arguments"], + ) # Update the final message content final_message_content = message.content diff --git a/src/expert/config.py b/src/expert/config.py index 126bbf9..e770a55 100644 --- a/src/expert/config.py +++ b/src/expert/config.py @@ -101,6 +101,11 @@ def chat_config(self) -> dict[str, Any]: """Get chat configuration.""" return self.config.get("chat", {}) + @property + def vector_store_config(self) -> dict[str, Any]: + """Get vector store configuration.""" + return self.config.get("vector_store", {}) + @property def checkpointer(self) -> BaseCheckpointSaver | None: """Get checkpointer configuration.""" diff --git a/src/expert/official_docs_tool.py b/src/expert/official_docs_tool.py new file mode 100644 index 0000000..66853e1 --- /dev/null +++ b/src/expert/official_docs_tool.py @@ -0,0 +1,117 @@ +"""RAG search tool using OpenAI Vector Store.""" + +from pathlib import Path + +from langchain_core.tools import StructuredTool +from openai import OpenAI +from pydantic import BaseModel, Field + + +class OfficialDocsInput(BaseModel): + """Input schema for RAG search tool.""" + + query: str = Field(description="The search query to find relevant documentation") + country: str | None = Field( + default="es", description="Country filter for the search (for now only 'es' for Spain)" + ) + subject: str | None = Field( + default="verifactu", + description="""Subject filter for the search +('regulation' for general regulations or 'verifactu' for VeriFactu-specific docs)""", + ) + + +class OfficialDocsTool: + """RAG search tool using OpenAI Vector Store.""" + + def __init__(self, vector_store_id: str, max_results: int = 10): + """Initialize the RAG search tool.""" + self.client = OpenAI() + self.vector_store_id = vector_store_id + self.max_results = max_results + + prompts_dir = Path(__file__).parent / "prompts" + + with open(prompts_dir / "official_docs_description.md") as f: + self.official_docs_description = f.read().strip() + + def search(self, query: str, country: str = "es", subject: str = "verifactu") -> str: + """ + Search the vector store for relevant documents with filters. + Args: + query: The search query + country: Country filter (default: "es") + subject: Subject filter (default: "verifactu") + Returns: + Formatted string with search results + """ + try: + # Build filters based on parameters + filters = {"type": "and", "filters": []} + + # Add country filter + if country: + filters["filters"].append({"type": "eq", "key": "country", "value": country}) + + # Add subject filter + if subject: + filters["filters"].append({"type": "eq", "key": "subject", "value": subject}) + + # Use OpenAI client with built-in filter support + results = self.client.vector_stores.search( + vector_store_id=self.vector_store_id, + query=query, + filters=filters if filters["filters"] else None, + ) + + # Convert to dict for easier handling + results_dict = results.model_dump() + + if not results_dict.get("data"): + return f"""No relevant documents found for your query with filters + (country: {country}, subject: {subject}).""" + + # Format the results + formatted_results = [] + for i, result in enumerate(results_dict["data"][: self.max_results], 1): + content_texts = [] + if result.get("content"): + for content in result["content"]: + if content.get("text"): + content_texts.append(content["text"]) + + # Get attributes for context + attributes = result.get("attributes", {}) + attr_info = [] + if attributes.get("subject"): + attr_info.append(f"Subject: {attributes['subject']}") + if attributes.get("type"): + attr_info.append(f"Type: {attributes['type']}") + if attributes.get("country"): + attr_info.append(f"Country: {attributes['country']}") + if attributes.get("format"): + attr_info.append(f"Format: {attributes['format']}") + if attributes.get("url"): + attr_info.append(f"Source: {attributes['url']}") + + attr_str = f" ({', '.join(attr_info)})" if attr_info else "" + + result_text = "\n".join(content_texts) if content_texts else "No content available" + formatted_results.append(f"## Result {i}{attr_str}\n\n{result_text}") + + # Add filter info to the response + filter_info = f"**Filtered by:** Country='{country}', Subject='{subject}'\n\n" + + return filter_info + "\n\n---\n\n".join(formatted_results) + + except Exception as e: + return f"Error performing search: {str(e)}" + + def get_tool(self) -> StructuredTool: + """Get the LangChain tool for this RAG search.""" + return StructuredTool.from_function( + func=self.search, + name="search_official", + description=self.official_docs_description, + args_schema=OfficialDocsInput, + ) diff --git a/src/expert/prompts/official_docs_description.md b/src/expert/prompts/official_docs_description.md new file mode 100644 index 0000000..6869fef --- /dev/null +++ b/src/expert/prompts/official_docs_description.md @@ -0,0 +1,31 @@ +Search through official VeriFactu documentation using semantic search with country and subject filters. + +This tool provides modular access to official Spanish government documentation for VeriFactu: + +**Filters Available:** +- Country: Currently supports 'es' (Spain) +- Subject: 'verifactu' for VeriFactu-specific docs or 'regulation' for general regulations + +**Documentation Content:** +- Official VeriFactu regulations and legal requirements +- VeriFactu XML schema specifications and field definitions +- Validation rules and business requirements +- Error codes and troubleshooting guides +- Technical implementation requirements +- Spanish tax compliance procedures + +**Use this tool to find relevant information about:** +- VeriFactu XML format and structure +- Required vs optional fields and elements +- Validation rules and constraints +- Error codes and their meanings +- Spanish tax compliance requirements +- Official government regulations and updates +- Troubleshooting common VeriFactu issues + +**Parameters:** +- query: Your search question +- country: Filter by country (default: 'es' for Spain) +- subject: Filter by subject ('verifactu' for VeriFactu docs, 'regulation' for general regulations) + +The search returns filtered, relevant excerpts from official documentation to help answer compliance questions. \ No newline at end of file