feat: introducing configurable retrieval workflows (QuivrHQ#3227)
# Description

Major PR which, among other things, introduces the possibility of easily
customizing the retrieval workflows. Workflows are based on LangGraph and can
be customized through a [yaml configuration file](core/tests/test_llm_endpoint.py)
and by implementing the node logic in
[quivr_rag_langgraph.py](https://github.com/QuivrHQ/quivr/blob/1a0c98437a357e7bbc8039f3fd49912052a1640b/backend/core/quivr_core/quivr_rag_langgraph.py).
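
As a quick illustration, here is a minimal sketch of loading a workflow
configuration with the new `quivr_core` API (assumed usage based on the code in
this PR; the YAML path mirrors the `BRAIN_CONFIG_PATH` default added to
`.env.example`, and the exact schema is defined in `quivr_core.config`):

```python
from quivr_core.config import RetrievalConfig

# Load the retrieval workflow definition from YAML. The path below is the
# default used in .env.example and may differ in your deployment.
retrieval_config = RetrievalConfig.from_yaml("config/retrieval_config_workflow.yaml")

# The resulting config exposes, among other things, the LLM settings that the
# LangGraph nodes implemented in quivr_rag_langgraph.py rely on.
print(retrieval_config.llm_config)
```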

This is a first, simple implementation that will evolve significantly in the
coming weeks to enable more complex workflows (for instance, with conditional
nodes). We also plan to adopt a similar approach for the ingestion part, i.e.
to enable users to easily customize the ingestion pipeline.
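
On the API side, the config file locations are resolved from environment
variables (`BRAIN_CONFIG_PATH`, `CHAT_LLM_CONFIG_PATH`; see the `.env.example`
and `utils.py` changes below). A rough sketch of that resolution, using the
helpers added in this PR:

```python
from quivr_api.modules.chat.controller.chat.utils import (
    RetrievalConfigPathEnv,
    get_config_file_path,
)

# Resolves to the BRAIN_CONFIG_PATH environment variable if set, otherwise
# falls back to "config/retrieval_config_workflow.yaml".
config_path = get_config_file_path(RetrievalConfigPathEnv.RAG)
```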

Closes CORE-195, CORE-203, CORE-204

## Checklist before requesting a review

Please delete options that are not relevant.

- [X] My code follows the style guidelines of this project
- [X] I have performed a self-review of my code
- [X] I have commented hard-to-understand areas
- [X] I have ideally added tests that prove my fix is effective or that
my feature works
- [X] New and existing unit tests pass locally with my changes
- [X] Any dependent changes have been merged

## Screenshots (if appropriate):
jacopo-chevallard authored Sep 23, 2024
1 parent 5e48594 commit ef90e8e
Showing 85 changed files with 5,348 additions and 782 deletions.
26 changes: 23 additions & 3 deletions .env.example
@@ -1,8 +1,30 @@
#### QUIVR Configuration
# This file is used to configure the Quivr stack. It is used by the `docker-compose.yml` file to configure the stack.

# API KEYS
# OPENAI. Update this to use your API key. To skip OpenAI integration use a fake key, for example: tk-aabbccddAABBCCDDEeFfGgHhIiJKLmnopjklMNOPqQqQqQqQ
OPENAI_API_KEY=CHANGE_ME
OPENAI_API_KEY=your-openai-api-key
# ANTHROPIC_API_KEY=your-anthropic-api-key
# MISTRAL_API_KEY=your-mistral-api-key
# GROQ_API_KEY=your-groq-api-key

COHERE_API_KEY=your-cohere-api-key
# JINA_API_KEY=your-jina-api-key

# UNSTRUCTURED_API_KEY=your-unstructured-api-key
# UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general

# LLAMA_PARSE_API_KEY=your-llamaparse-api-key

# Configuration files path
BRAIN_CONFIG_PATH=config/retrieval_config_workflow.yaml
CHAT_LLM_CONFIG_PATH=config/chat_llm_config.yaml

# LangSmith
# LANGCHAIN_TRACING_V2=true
# LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
# LANGCHAIN_API_KEY=your-langchain-api-key
# LANGCHAIN_PROJECT=your-langchain-project-name

# LOCAL
# OLLAMA_API_BASE_URL=http://host.docker.internal:11434 # Uncomment to activate ollama. This is the local url for the ollama api
@@ -32,7 +54,6 @@ EXTERNAL_SUPABASE_URL=http://localhost:54321
SUPABASE_SERVICE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU
PG_DATABASE_URL=postgresql://postgres:[email protected]:54322/postgres
PG_DATABASE_ASYNC_URL=postgresql+asyncpg://postgres:[email protected]:54322/postgres
ANTHROPIC_API_KEY=null
JWT_SECRET_KEY=super-secret-jwt-token-with-at-least-32-characters-long
AUTHENTICATE=true
TELEMETRY_ENABLED=true
@@ -41,7 +62,6 @@ CELEBRY_BROKER_QUEUE_NAME=quivr-preview.fifo
QUIVR_DOMAIN=http://localhost:3000/
BACKEND_URL=http://localhost:5050
EMBEDDING_DIM=1536
#COHERE_API_KEY=CHANGE_ME
DEACTIVATE_STRIPE=true

#RESEND
6 changes: 4 additions & 2 deletions .pre-commit-config.yaml
@@ -21,17 +21,19 @@ repos:
hooks:
# Run the linter.
- id: ruff
args: [--fix]
args: [--fix, --isolated]
additional_dependencies: []
# Run the formatter.
- id: ruff-format
args: [--isolated]
additional_dependencies: []
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.10.1
hooks:
- id: mypy
name: mypy
additional_dependencies: ["types-aiofiles"]
args: ["--ignore-missing-imports", "--no-incremental", "--follow-imports=skip"]
additional_dependencies: ["types-aiofiles", "types-pyyaml", "pydantic", "sqlmodel"]
ci:
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks
15 changes: 3 additions & 12 deletions .vscode/settings.json
@@ -16,7 +16,6 @@
"**/.docusaurus/": true,
"**/node_modules/": true
},
"json.sortOnSave.enable": true,
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
@@ -25,19 +24,10 @@
"source.fixAll": "explicit"
}
},
"python.formatting.provider": "black",
"python.analysis.extraPaths": [
"./backend"
],
"python.sortImports.path": "isort",
"python.linting.mypyEnabled": true,
"python.defaultInterpreterPath": "python3",
"python.linting.enabled": true,
"python.linting.flake8Enabled": true,
"python.linting.pycodestyleEnabled": true,
"python.linting.pylintEnabled": true,
"python.linting.pycodestyleCategorySeverity.W": "Error",
"python.linting.flake8CategorySeverity.W": "Error",
"python.testing.pytestArgs": [
"-v",
"--color=yes",
@@ -53,5 +43,6 @@
"reportMissingImports": "error",
"reportUnusedImport": "warning",
"reportGeneralTypeIssues": "warning"
}
}
},
"makefile.configureOnOpen": false
}
2 changes: 2 additions & 0 deletions backend/Dockerfile.dev
@@ -33,6 +33,8 @@ COPY core/pyproject.toml core/README.md ./core/
COPY core/quivr_core/__init__.py ./core/quivr_core/__init__.py
COPY worker/pyproject.toml worker/README.md ./worker/
COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py
COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/
COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py

RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock

21 changes: 9 additions & 12 deletions backend/api/quivr_api/modules/brain/entity/brain_entity.py
@@ -4,6 +4,7 @@
from uuid import UUID

from pydantic import BaseModel
from quivr_core.config import BrainConfig
from sqlalchemy.dialects.postgresql import ENUM as PGEnum
from sqlalchemy.ext.asyncio import AsyncAttrs
from sqlmodel import TIMESTAMP, Column, Field, Relationship, SQLModel, text
@@ -58,43 +59,39 @@ class Brain(AsyncAttrs, SQLModel, table=True):
default=BrainType.integration,
),
)
brain_chat_history: List["ChatHistory"] = Relationship( # noqa: F821
brain_chat_history: List["ChatHistory"] = Relationship( # type: ignore # noqa: F821
back_populates="brain", sa_relationship_kwargs={"lazy": "select"}
)
prompt_id: UUID | None = Field(default=None, foreign_key="prompts.id")
prompt: Prompt | None = Relationship( # noqa: f821
prompt: Prompt | None = Relationship( # noqa: F821
back_populates="brain", sa_relationship_kwargs={"lazy": "joined"}
)
knowledges: List[KnowledgeDB] = Relationship(
back_populates="brains", link_model=KnowledgeBrain
)


# TODO : add
# "meaning" "public"."vector",
# "tags" "public"."tags"[]


class BrainEntity(BaseModel):
brain_id: UUID
name: str
class BrainEntity(BrainConfig):
last_update: datetime | None = None
brain_type: BrainType | None = None
description: Optional[str] = None
temperature: Optional[float] = None
meaning: Optional[str] = None
openai_api_key: Optional[str] = None
tags: Optional[List[str]] = None
model: Optional[str] = None
max_tokens: Optional[int] = None
status: Optional[str] = None
prompt_id: Optional[UUID] = None
last_update: datetime
brain_type: BrainType
integration: Optional[IntegrationEntity] = None
integration_description: Optional[IntegrationDescriptionEntity] = None
snippet_emoji: Optional[str] = None
snippet_color: Optional[str] = None

@property
def id(self) -> UUID:
return self.brain_id

def dict(self, **kwargs):
data = super().dict(
**kwargs,
@@ -100,7 +100,7 @@ def delete_integration_brain(self, brain_id, user_id):

def get_integration_brain_by_type_integration(
self, integration_name
) -> List[IntegrationEntity]:
) -> List[IntegrationEntity] | None:
response = (
self.db.table("integrations_user")
.select("*, integrations ()")
33 changes: 4 additions & 29 deletions backend/api/quivr_api/modules/brain/service/brain_service.py
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Tuple, Dict
from uuid import UUID

from fastapi import HTTPException
@@ -54,7 +54,7 @@ def find_brain_from_question(
chat_id: UUID,
history,
vector_store: CustomSupabaseVectorStore,
) -> (Optional[BrainEntity], dict[str, str]):
) -> Tuple[Optional[BrainEntity], Dict[str, str]]:
"""Find the brain to use for a question.
Args:
@@ -106,12 +106,12 @@ def find_brain_from_question(
brain_id_to_use = list_brains[0]["id"]
brain_to_use = self.get_brain_by_id(brain_id_to_use)

return brain_to_use, metadata
return brain_to_use, metadata # type: ignore

def create_brain(
self,
user_id: UUID,
brain: Optional[CreateBrainProperties],
brain: CreateBrainProperties | None = None,
) -> BrainEntity:
if brain is None:
brain = CreateBrainProperties()
@@ -226,28 +226,3 @@ def get_brain_details(
)

return brain

def get_connected_brains(self, brain_id: UUID) -> list[BrainEntity]:
return self.composite_brains_connections_repository.get_connected_brains(
brain_id
)

def update_secret_value(
self,
user_id: UUID,
brain_id: UUID,
secret_name: str,
secret_value: str,
) -> None:
"""Update an existing secret."""
self.external_api_secrets_repository.delete_secret(
user_id=user_id,
brain_id=brain_id,
secret_name=secret_name,
)
self.external_api_secrets_repository.create_secret(
user_id=user_id,
brain_id=brain_id,
secret_name=secret_name,
secret_value=secret_value,
)
46 changes: 46 additions & 0 deletions backend/api/quivr_api/modules/chat/controller/chat/utils.py
@@ -1,15 +1,61 @@
import time
import os
from enum import Enum

from fastapi import HTTPException
from quivr_api.logger import get_logger
from quivr_api.modules.models.entity.model import Model
from quivr_api.modules.models.service.model_service import ModelService
from quivr_api.modules.user.entity.user_identity import UserIdentity
from quivr_api.modules.user.service.user_usage import UserUsage
from quivr_core.config import RetrievalConfig

logger = get_logger(__name__)


class RetrievalConfigPathEnv(Enum):
CHAT_WITH_LLM = ("CHAT_LLM_CONFIG_PATH", "chat_llm_config.yaml")
RAG = ("BRAIN_CONFIG_PATH", "config/retrieval_config_workflow.yaml")

@property
def env_var(self) -> str:
return self.value[0]

@property
def default_path(self) -> str:
return self.value[1]


def get_config_file_path(
config_path_env: RetrievalConfigPathEnv, current_path: str | None = None
) -> str:
# Get the environment variable or fallback to the default path
_path = os.getenv(config_path_env.env_var, config_path_env.default_path)

if not current_path:
return _path

return os.path.join(current_path, _path)


def load_and_merge_retrieval_configuration(
config_file_path: str, sqlmodel: Model
) -> RetrievalConfig:
retrieval_config = RetrievalConfig.from_yaml(config_file_path)
field_mapping = {
"env_variable_name": "env_variable_name",
"endpoint_url": "llm_base_url",
}

retrieval_config.llm_config.set_from_sqlmodel(
sqlmodel=sqlmodel, mapping=field_mapping
)

retrieval_config.llm_config.set_llm_model(sqlmodel.name)

return retrieval_config


# TODO: rewrite
async def find_model_and_generate_metadata(
brain_model: str | None,