diff --git a/backend/Dockerfile b/backend/Dockerfile index fa07a26b4abb..5d20cc09bb79 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -24,6 +24,7 @@ RUN apt-get clean && apt-get update && apt-get install -y \ libreoffice \ libpq-dev \ gcc \ + libhdf5-serial-dev \ pandoc && \ rm -rf /var/lib/apt/lists/* && apt-get clean @@ -46,6 +47,8 @@ COPY core/pyproject.toml core/README.md ./core/ COPY core/quivr_core/__init__.py ./core/quivr_core/__init__.py COPY worker/pyproject.toml worker/README.md ./worker/ COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py +COPY worker/diff-assistant/pyproject.toml worker/diff-assistant/README.md ./worker/diff-assistant/ +COPY worker/diff-assistant/quivr_diff_assistant/__init__.py ./worker/diff-assistant/quivr_diff_assistant/__init__.py COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/ COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev index 8efe6696c813..9fb458b16546 100644 --- a/backend/Dockerfile.dev +++ b/backend/Dockerfile.dev @@ -23,6 +23,7 @@ RUN apt-get clean && apt-get update && apt-get install -y \ libreoffice \ libpq-dev \ gcc \ + libhdf5-serial-dev \ pandoc && \ rm -rf /var/lib/apt/lists/* && apt-get clean @@ -33,6 +34,8 @@ COPY core/pyproject.toml core/README.md ./core/ COPY core/quivr_core/__init__.py ./core/quivr_core/__init__.py COPY worker/pyproject.toml worker/README.md ./worker/ COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py +COPY worker/diff-assistant/pyproject.toml worker/diff-assistant/README.md ./worker/diff-assistant/ +COPY worker/diff-assistant/quivr_diff_assistant/__init__.py ./worker/diff-assistant/quivr_diff_assistant/__init__.py COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/ COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py diff --git a/backend/api/quivr_api/modules/assistant/controller/assistant_routes.py b/backend/api/quivr_api/modules/assistant/controller/assistant_routes.py index 9d2e303bb6c5..289cd8ce84ae 100644 --- a/backend/api/quivr_api/modules/assistant/controller/assistant_routes.py +++ b/backend/api/quivr_api/modules/assistant/controller/assistant_routes.py @@ -2,7 +2,7 @@ from typing import Annotated, List from uuid import uuid4 -from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile +from fastapi import APIRouter, Depends, File, HTTPException, Request, UploadFile from quivr_api.celery_config import celery from quivr_api.logger import get_logger @@ -16,6 +16,7 @@ from quivr_api.modules.assistant.entity.assistant_entity import ( AssistantSettings, ) +from quivr_api.modules.assistant.entity.task_entity import TaskMetadata from quivr_api.modules.assistant.services.tasks_service import TasksService from quivr_api.modules.dependencies import get_service from quivr_api.modules.upload.service.upload_file import ( @@ -64,12 +65,15 @@ async def create_task( current_user: UserIdentityDep, tasks_service: TasksServiceDep, request: Request, - input: InputAssistant, + input: str = File(...), files: List[UploadFile] = None, ): + input = InputAssistant.model_validate_json(input) + assistant = next( (assistant for assistant in assistants if assistant.id == input.id), None ) + if assistant is None: raise HTTPException(status_code=404, detail="Assistant not found") @@ -80,7 +84,7 @@ async def create_task( raise HTTPException(status_code=400, detail=error) else: print("Assistant input is 
valid.") - notification_uuid = uuid4() + notification_uuid = f"{assistant.name}-{str(uuid4())[:8]}" # Process files dynamically for upload_file in files: @@ -96,8 +100,14 @@ async def create_task( task = CreateTask( assistant_id=input.id, - pretty_id=str(notification_uuid), + assistant_name=assistant.name, + pretty_id=notification_uuid, settings=input.model_dump(mode="json"), + task_metadata=TaskMetadata( + input_files=[file.filename for file in files] + ).model_dump(mode="json") + if files + else None, # type: ignore ) task_created = await tasks_service.create_task(task, current_user.id) diff --git a/backend/api/quivr_api/modules/assistant/controller/assistants_definition.py b/backend/api/quivr_api/modules/assistant/controller/assistants_definition.py index 0ade87168ab4..a20b0c6f9857 100644 --- a/backend/api/quivr_api/modules/assistant/controller/assistants_definition.py +++ b/backend/api/quivr_api/modules/assistant/controller/assistants_definition.py @@ -1,8 +1,11 @@ from quivr_api.modules.assistant.dto.inputs import InputAssistant from quivr_api.modules.assistant.dto.outputs import ( AssistantOutput, + ConditionalInput, + InputBoolean, InputFile, Inputs, + InputSelectText, Pricing, ) @@ -166,10 +169,10 @@ def validate_assistant_input( assistant1 = AssistantOutput( id=1, - name="Assistant 1", - description="Assistant 1 description", + name="Compliance Check", + description="Allows analyzing the compliance of the information contained in documents against charter or regulatory requirements.", pricing=Pricing(), - tags=["tag1", "tag2"], + tags=["Disabled"], input_description="Input description", output_description="Output description", inputs=Inputs( @@ -183,19 +186,66 @@ def validate_assistant_input( assistant2 = AssistantOutput( id=2, - name="Assistant 2", - description="Assistant 2 description", + name="Consistency Check", + description="Ensures that the information in one document is replicated identically in another document.", pricing=Pricing(), - tags=["tag1", "tag2"], + tags=[], input_description="Input description", output_description="Output description", icon_url="https://example.com/icon.png", inputs=Inputs( files=[ - InputFile(key="file_1", description="File description"), - InputFile(key="file_2", description="File description"), + InputFile(key="Document 1", description="File description"), + InputFile(key="Document 2", description="File description"), + ], + select_texts=[ + InputSelectText( + key="DocumentsType", + description="Select Documents Type", + options=[ + "Etiquettes VS Cahier des charges", + "Fiche Dev VS Cahier des charges", + ], + ), + ], + ), +) + +assistant3 = AssistantOutput( + id=3, + name="Difference Detection", + description="Highlights differences between one document and another after modifications.", + pricing=Pricing(), + tags=[], + input_description="Input description", + output_description="Output description", + icon_url="https://example.com/icon.png", + inputs=Inputs( + files=[ + InputFile(key="Document 1", description="File description"), + InputFile(key="Document 2", description="File description"), + ], + booleans=[ + InputBoolean( + key="Hard-to-Read Document?", description="Boolean description" + ), + ], + select_texts=[ + InputSelectText( + key="DocumentsType", + description="Select Documents Type", + options=["Etiquettes", "Cahier des charges"], + ), + ], + conditional_inputs=[ + ConditionalInput( + key="DocumentsType", + conditional_key="Hard-to-Read Document?", + condition="equals", + value="Etiquettes", + ), ], ), ) -assistants = 
[assistant1, assistant2] +assistants = [assistant1, assistant2, assistant3] diff --git a/backend/api/quivr_api/modules/assistant/dto/inputs.py b/backend/api/quivr_api/modules/assistant/dto/inputs.py index 929f95535cf2..0847224dd2a4 100644 --- a/backend/api/quivr_api/modules/assistant/dto/inputs.py +++ b/backend/api/quivr_api/modules/assistant/dto/inputs.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Dict, List, Optional from uuid import UUID from pydantic import BaseModel, root_validator @@ -7,7 +7,9 @@ class CreateTask(BaseModel): pretty_id: str assistant_id: int + assistant_name: str settings: dict + task_metadata: Dict | None = None class BrainInput(BaseModel): diff --git a/backend/api/quivr_api/modules/assistant/dto/outputs.py b/backend/api/quivr_api/modules/assistant/dto/outputs.py index 40574e5bfcbf..4703be843d8c 100644 --- a/backend/api/quivr_api/modules/assistant/dto/outputs.py +++ b/backend/api/quivr_api/modules/assistant/dto/outputs.py @@ -61,6 +61,21 @@ class InputSelectNumber(BaseModel): default: Optional[int] = None +class ConditionalInput(BaseModel): + """ + Conditional input is a list of inputs that are conditional to the value of another input. + key: The key of the input that is conditional. + conditional_key: The key that determines if the input is shown. + """ + + key: str + conditional_key: str + condition: Optional[str] = ( + None # e.g. "equals", "contains", "starts_with", "ends_with", "regex", "in", "not_in", "is_empty", "is_not_empty" + ) + value: Optional[str] = None + + class Inputs(BaseModel): files: Optional[List[InputFile]] = None urls: Optional[List[InputUrl]] = None @@ -70,6 +85,7 @@ class Inputs(BaseModel): select_texts: Optional[List[InputSelectText]] = None select_numbers: Optional[List[InputSelectNumber]] = None brain: Optional[BrainInput] = None + conditional_inputs: Optional[List[ConditionalInput]] = None class Pricing(BaseModel): diff --git a/backend/api/quivr_api/modules/assistant/entity/task_entity.py b/backend/api/quivr_api/modules/assistant/entity/task_entity.py index 01d5f33b255b..7972c07001a7 100644 --- a/backend/api/quivr_api/modules/assistant/entity/task_entity.py +++ b/backend/api/quivr_api/modules/assistant/entity/task_entity.py @@ -1,10 +1,15 @@ from datetime import datetime -from typing import Dict +from typing import Dict, List, Optional from uuid import UUID +from pydantic import BaseModel from sqlmodel import JSON, TIMESTAMP, BigInteger, Column, Field, SQLModel, text +class TaskMetadata(BaseModel): + input_files: Optional[List[str]] = None + + class Task(SQLModel, table=True): __tablename__ = "tasks" # type: ignore @@ -17,6 +22,7 @@ class Task(SQLModel, table=True): ), ) assistant_id: int + assistant_name: str pretty_id: str user_id: UUID status: str = Field(default="pending") @@ -29,6 +35,4 @@ class Task(SQLModel, table=True): ) settings: Dict = Field(default_factory=dict, sa_column=Column(JSON)) answer: str | None = Field(default=None) - - class Config: - arbitrary_types_allowed = True + task_metadata: Dict | None = Field(default_factory=dict, sa_column=Column(JSON)) diff --git a/backend/api/quivr_api/modules/assistant/repository/tasks.py b/backend/api/quivr_api/modules/assistant/repository/tasks.py index 7977a2f56dd3..cc1aae78da6c 100644 --- a/backend/api/quivr_api/modules/assistant/repository/tasks.py +++ b/backend/api/quivr_api/modules/assistant/repository/tasks.py @@ -3,7 +3,7 @@ from sqlalchemy import exc from sqlalchemy.ext.asyncio import AsyncSession -from sqlmodel import select +from sqlmodel import 
col, select from quivr_api.modules.assistant.dto.inputs import CreateTask from quivr_api.modules.assistant.entity.task_entity import Task @@ -21,9 +21,11 @@ async def create_task(self, task: CreateTask, user_id: UUID) -> Task: try: task_to_create = Task( assistant_id=task.assistant_id, + assistant_name=task.assistant_name, pretty_id=task.pretty_id, user_id=user_id, settings=task.settings, + task_metadata=task.task_metadata, # type: ignore ) self.session.add(task_to_create) await self.session.commit() @@ -40,7 +42,9 @@ async def get_task_by_id(self, task_id: UUID, user_id: UUID) -> Task: return response.one() async def get_tasks_by_user_id(self, user_id: UUID) -> Sequence[Task]: - query = select(Task).where(Task.user_id == user_id) + query = ( + select(Task).where(Task.user_id == user_id).order_by(col(Task.id).desc()) + ) response = await self.session.exec(query) return response.all() diff --git a/backend/api/quivr_api/modules/brain/service/brain_service.py b/backend/api/quivr_api/modules/brain/service/brain_service.py index e5b403d8f03e..7b9da881c7ff 100644 --- a/backend/api/quivr_api/modules/brain/service/brain_service.py +++ b/backend/api/quivr_api/modules/brain/service/brain_service.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple, Dict +from typing import Dict, Optional, Tuple from uuid import UUID from fastapi import HTTPException diff --git a/backend/api/quivr_api/modules/chat/controller/chat/utils.py b/backend/api/quivr_api/modules/chat/controller/chat/utils.py index 5306f4ecbeb7..ce5e684221df 100644 --- a/backend/api/quivr_api/modules/chat/controller/chat/utils.py +++ b/backend/api/quivr_api/modules/chat/controller/chat/utils.py @@ -1,5 +1,5 @@ -import time import os +import time from enum import Enum from fastapi import HTTPException diff --git a/backend/api/quivr_api/modules/chat/controller/chat_routes.py b/backend/api/quivr_api/modules/chat/controller/chat_routes.py index a42d7fe7fb7a..f89e792c81fa 100644 --- a/backend/api/quivr_api/modules/chat/controller/chat_routes.py +++ b/backend/api/quivr_api/modules/chat/controller/chat_routes.py @@ -1,9 +1,10 @@ +import os from typing import Annotated, List, Optional from uuid import UUID -import os from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request from fastapi.responses import StreamingResponse +from quivr_core.config import RetrievalConfig from quivr_api.logger import get_logger from quivr_api.middlewares.auth import AuthBearer, get_current_user @@ -36,7 +37,6 @@ from quivr_api.modules.vector.service.vector_service import VectorService from quivr_api.utils.telemetry import maybe_send_telemetry from quivr_api.utils.uuid_generator import generate_uuid_from_string -from quivr_core.config import RetrievalConfig logger = get_logger(__name__) diff --git a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py index dcbb7a5b66d8..e08f3c0abcdb 100644 --- a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py +++ b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py @@ -2,8 +2,8 @@ from enum import Enum from typing import Any, Dict, List, Optional from uuid import UUID -from pydantic import BaseModel +from pydantic import BaseModel from quivr_core.models import KnowledgeStatus from sqlalchemy import JSON, TIMESTAMP, Column, text from sqlalchemy.ext.asyncio import AsyncAttrs diff --git a/backend/api/quivr_api/modules/knowledge/repository/storage.py b/backend/api/quivr_api/modules/knowledge/repository/storage.py index 
e53165e22282..ad35659dbbd0 100644 --- a/backend/api/quivr_api/modules/knowledge/repository/storage.py +++ b/backend/api/quivr_api/modules/knowledge/repository/storage.py @@ -86,4 +86,3 @@ async def remove_file(self, storage_path: str): except Exception as e: logger.error(e) raise e - diff --git a/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py index a0f49a07ed30..7381b6e917dd 100644 --- a/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py +++ b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py @@ -527,7 +527,9 @@ async def test_should_process_knowledge_prev_error( assert new.file_sha1 -@pytest.mark.skip(reason="Bug: UnboundLocalError: cannot access local variable 'response'") +@pytest.mark.skip( + reason="Bug: UnboundLocalError: cannot access local variable 'response'" +) @pytest.mark.asyncio(loop_scope="session") async def test_get_knowledge_storage_path(session: AsyncSession, test_data: TestData): _, [knowledge, _] = test_data diff --git a/backend/api/quivr_api/modules/misc/controller/misc_routes.py b/backend/api/quivr_api/modules/misc/controller/misc_routes.py index 590b3cd0e3aa..054798b34c18 100644 --- a/backend/api/quivr_api/modules/misc/controller/misc_routes.py +++ b/backend/api/quivr_api/modules/misc/controller/misc_routes.py @@ -1,9 +1,8 @@ - from fastapi import APIRouter, Depends, HTTPException from quivr_api.logger import get_logger from quivr_api.modules.dependencies import get_async_session -from sqlmodel.ext.asyncio.session import AsyncSession from sqlmodel import text +from sqlmodel.ext.asyncio.session import AsyncSession logger = get_logger(__name__) @@ -20,7 +19,6 @@ async def root(): @misc_router.get("/healthz", tags=["Health"]) async def healthz(session: AsyncSession = Depends(get_async_session)): - try: result = await session.execute(text("SELECT 1")) if not result: diff --git a/backend/api/quivr_api/modules/rag_service/rag_service.py b/backend/api/quivr_api/modules/rag_service/rag_service.py index e7b2e0b3332a..b7d73f2febd3 100644 --- a/backend/api/quivr_api/modules/rag_service/rag_service.py +++ b/backend/api/quivr_api/modules/rag_service/rag_service.py @@ -2,7 +2,6 @@ import os from uuid import UUID, uuid4 -from quivr_api.utils.uuid_generator import generate_uuid_from_string from quivr_core.brain import Brain as BrainCore from quivr_core.chat import ChatHistory as ChatHistoryCore from quivr_core.config import LLMEndpointConfig, RetrievalConfig @@ -29,6 +28,7 @@ from quivr_api.modules.prompt.service.prompt_service import PromptService from quivr_api.modules.user.entity.user_identity import UserIdentity from quivr_api.modules.vector.service.vector_service import VectorService +from quivr_api.utils.uuid_generator import generate_uuid_from_string from quivr_api.vectorstore.supabase import CustomSupabaseVectorStore from .utils import generate_source diff --git a/backend/api/quivr_api/modules/rag_service/utils.py b/backend/api/quivr_api/modules/rag_service/utils.py index 068a2db28c5e..afc12082eac8 100644 --- a/backend/api/quivr_api/modules/rag_service/utils.py +++ b/backend/api/quivr_api/modules/rag_service/utils.py @@ -68,7 +68,7 @@ async def generate_source( try: file_name = doc.metadata["file_name"] file_path = await knowledge_service.get_knowledge_storage_path( - file_name=file_name, brain_id=brain_id + file_name=file_name, brain_id=brain_id ) if file_path in generated_urls: source_url = generated_urls[file_path] diff --git 
a/backend/api/quivr_api/modules/sync/repository/sync_user.py b/backend/api/quivr_api/modules/sync/repository/sync_user.py index a27507628ca6..09ff5007d7b4 100644 --- a/backend/api/quivr_api/modules/sync/repository/sync_user.py +++ b/backend/api/quivr_api/modules/sync/repository/sync_user.py @@ -93,9 +93,7 @@ def get_syncs_user(self, user_id: UUID, sync_user_id: int | None = None): sync_user_id, ) query = ( - self.db.from_("syncs_user") - .select("*") - .eq("user_id", user_id) + self.db.from_("syncs_user").select("*").eq("user_id", user_id) # .neq("status", "REMOVED") ) if sync_user_id: @@ -170,9 +168,9 @@ def update_sync_user( ) state_str = json.dumps(state) - self.db.from_("syncs_user").update(sync_user_input.model_dump(exclude_unset=True)).eq( - "user_id", str(sync_user_id) - ).eq("state", state_str).execute() + self.db.from_("syncs_user").update( + sync_user_input.model_dump(exclude_unset=True) + ).eq("user_id", str(sync_user_id)).eq("state", state_str).execute() logger.info("Sync user updated successfully") def update_sync_user_status(self, sync_user_id: int, status: str): diff --git a/backend/api/quivr_api/modules/sync/service/sync_notion.py b/backend/api/quivr_api/modules/sync/service/sync_notion.py index 32326e148f4b..5eca27d57fda 100644 --- a/backend/api/quivr_api/modules/sync/service/sync_notion.py +++ b/backend/api/quivr_api/modules/sync/service/sync_notion.py @@ -1,9 +1,9 @@ +import time from datetime import datetime, timezone from typing import List, Sequence from uuid import UUID from notion_client import Client -import time from quivr_api.logger import get_logger from quivr_api.modules.dependencies import BaseService @@ -165,7 +165,6 @@ async def store_notion_pages( def fetch_notion_pages( notion_client: Client, start_cursor: str | None = None, iteration: int = 0 ) -> NotionSearchResult: - if iteration > 10: return NotionSearchResult(results=[], has_more=False, next_cursor=None) search_result = notion_client.search( @@ -177,7 +176,9 @@ def fetch_notion_pages( if "code" in search_result and search_result["code"] == "rate_limited": # Wait 10 seconds time.sleep(10) - search_result = fetch_notion_pages(notion_client, start_cursor=start_cursor, iteration=iteration+1) + search_result = fetch_notion_pages( + notion_client, start_cursor=start_cursor, iteration=iteration + 1 + ) return NotionSearchResult.model_validate(search_result) diff --git a/backend/api/quivr_api/modules/sync/tests/test_notion_service.py b/backend/api/quivr_api/modules/sync/tests/test_notion_service.py index d866a3d11733..526114c5ef8b 100644 --- a/backend/api/quivr_api/modules/sync/tests/test_notion_service.py +++ b/backend/api/quivr_api/modules/sync/tests/test_notion_service.py @@ -74,7 +74,9 @@ def handler(request): assert len(result) == 0 -@pytest.mark.skip(reason="Bug: httpx.ConnectError: [Errno -2] Name or service not known'") +@pytest.mark.skip( + reason="Bug: httpx.ConnectError: [Errno -2] Name or service not known'" +) @pytest.mark.asyncio(loop_scope="session") async def test_store_notion_pages_success( session: AsyncSession, diff --git a/backend/api/quivr_api/modules/sync/tests/test_syncutils.py b/backend/api/quivr_api/modules/sync/tests/test_syncutils.py index 767a944029e4..3c20f70d9679 100644 --- a/backend/api/quivr_api/modules/sync/tests/test_syncutils.py +++ b/backend/api/quivr_api/modules/sync/tests/test_syncutils.py @@ -271,7 +271,10 @@ async def test_process_sync_file_not_supported(syncutils: SyncUtils): sync_active=sync_active, ) -@pytest.mark.skip(reason="Bug: UnboundLocalError: cannot access 
local variable 'response'") + +@pytest.mark.skip( + reason="Bug: UnboundLocalError: cannot access local variable 'response'" +) @pytest.mark.asyncio(loop_scope="session") async def test_process_sync_file_noprev( monkeypatch, @@ -327,8 +330,8 @@ def _send_task(*args, **kwargs): assert created_km.file_sha1 is None assert created_km.created_at is not None assert created_km.metadata == {"sync_file_id": "1"} - assert len(created_km.brains)> 0 - assert created_km.brains[0]["brain_id"]== brain_1.brain_id + assert len(created_km.brains) > 0 + assert created_km.brains[0]["brain_id"] == brain_1.brain_id # Assert celery task in correct assert task["args"] == ("process_file_task",) @@ -345,8 +348,9 @@ def _send_task(*args, **kwargs): ) - -@pytest.mark.skip(reason="Bug: UnboundLocalError: cannot access local variable 'response'") +@pytest.mark.skip( + reason="Bug: UnboundLocalError: cannot access local variable 'response'" +) @pytest.mark.asyncio(loop_scope="session") async def test_process_sync_file_with_prev( monkeypatch, @@ -424,7 +428,7 @@ def _send_task(*args, **kwargs): assert created_km.created_at assert created_km.updated_at == created_km.created_at # new line assert created_km.metadata == {"sync_file_id": str(dbfiles[0].id)} - assert created_km.brains[0]["brain_id"]== brain_1.brain_id + assert created_km.brains[0]["brain_id"] == brain_1.brain_id # Check file content changed assert check_file_exists(str(brain_1.brain_id), sync_file.name) diff --git a/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py b/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py index 0395a16ff922..f9391881a6b2 100644 --- a/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py +++ b/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py @@ -1,13 +1,14 @@ +import asyncio +import base64 +import re from enum import Enum from io import BytesIO from pathlib import Path from typing import List + from langchain_core.messages import HumanMessage from langchain_openai import ChatOpenAI -import base64 from pdf2image import convert_from_path -import asyncio -import re # BASE_OCR_PROMPT = """ # Transcribe the content of this file into markdown. Be mindful of the formatting. 
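The `create_task` change in `assistant_routes.py` above moves the endpoint from a JSON body to multipart form data: the `InputAssistant` payload now arrives as a JSON string in a form field (`input: str = File(...)`) alongside the uploaded files, and is re-validated server-side with `InputAssistant.model_validate_json`. A minimal client sketch of that contract, assuming `httpx`; the URL, auth header, and payload fields are illustrative assumptions, not values taken from this diff:

```python
import json

import httpx

# Hypothetical InputAssistant payload -- the full schema lives in
# quivr_api.modules.assistant.dto.inputs; only `id` is visible in this diff.
input_payload = {"id": 3}

with open("before.pdf", "rb") as before, open("after.pdf", "rb") as after:
    response = httpx.post(
        "http://localhost:5050/assistant/task",  # assumed host and route
        # The Pydantic payload travels as a JSON string in a plain form field,
        # since a JSON body cannot be mixed with UploadFile parts in one request.
        data={"input": json.dumps(input_payload)},
        files=[
            ("files", ("before.pdf", before, "application/pdf")),
            ("files", ("after.pdf", after, "application/pdf")),
        ],
        headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
    )
response.raise_for_status()
print(response.json())
```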
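The `ConditionalInput` model added to `dto/outputs.py` above ties the visibility of one input to the value of another, with candidate condition names listed in the code comment ("equals", "contains", "starts_with", ...). The diff ships no evaluator, so the following is only a sketch of how a frontend might interpret one entry, under the assumption that `key` names the input whose current value is tested:

```python
import re


def condition_met(values: dict[str, str], key: str,
                  condition: str | None, value: str | None) -> bool:
    """Decide whether the dependent input (conditional_key) should be shown.

    Assumed semantics: `key` names the input whose current value is tested
    against `condition`/`value`; a missing or unknown condition shows the input.
    """
    current = values.get(key, "")
    match condition:
        case "equals":
            return current == value
        case "contains":
            return (value or "") in current
        case "starts_with":
            return current.startswith(value or "")
        case "ends_with":
            return current.endswith(value or "")
        case "regex":
            return re.search(value or "", current) is not None
        case "in":
            return current in (value or "").split(",")
        case "not_in":
            return current not in (value or "").split(",")
        case "is_empty":
            return current == ""
        case "is_not_empty":
            return current != ""
        case _:
            return True


# Mirrors assistant3: the boolean input is shown only when
# DocumentsType equals "Etiquettes".
assert condition_met({"DocumentsType": "Etiquettes"}, "DocumentsType", "equals", "Etiquettes")
```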
diff --git a/backend/core/MegaParse/megaparse/utils.py b/backend/core/MegaParse/megaparse/utils.py index 7dea8352481d..b16f022ebe91 100644 --- a/backend/core/MegaParse/megaparse/utils.py +++ b/backend/core/MegaParse/megaparse/utils.py @@ -1,9 +1,11 @@ from docx.document import Document as DocumentObject +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from docx.section import Section +from docx.section import _Footer as Footer +from docx.section import _Header as Header from docx.table import Table from docx.text.paragraph import Paragraph -from docx.section import Section, _Header as Header, _Footer as Footer -from docx.oxml.text.paragraph import CT_P -from docx.oxml.table import CT_Tbl def print_element(element): diff --git a/backend/core/MegaParse/tests/test_import.py b/backend/core/MegaParse/tests/test_import.py index 72e196c3a9af..840d7baf41e2 100644 --- a/backend/core/MegaParse/tests/test_import.py +++ b/backend/core/MegaParse/tests/test_import.py @@ -1,5 +1,4 @@ import pytest - from megaparse.Converter import MegaParse diff --git a/backend/core/examples/simple_question.py b/backend/core/examples/simple_question.py index b7732d3e2cbc..35ffe1d8291c 100644 --- a/backend/core/examples/simple_question.py +++ b/backend/core/examples/simple_question.py @@ -2,7 +2,6 @@ from quivr_core import Brain from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph - if __name__ == "__main__": with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file: diff --git a/backend/core/quivr_core/chat.py b/backend/core/quivr_core/chat.py index b8d3b1057774..458c7fafbaa5 100644 --- a/backend/core/quivr_core/chat.py +++ b/backend/core/quivr_core/chat.py @@ -1,7 +1,7 @@ +from copy import deepcopy from datetime import datetime -from typing import Any, Generator, Tuple, List +from typing import Any, Generator, List, Tuple from uuid import UUID, uuid4 -from copy import deepcopy from langchain_core.messages import AIMessage, HumanMessage diff --git a/backend/core/quivr_core/config.py b/backend/core/quivr_core/config.py index b974d3220ed7..4c5f9d8513a8 100644 --- a/backend/core/quivr_core/config.py +++ b/backend/core/quivr_core/config.py @@ -2,9 +2,9 @@ from enum import Enum from typing import Dict, List, Optional from uuid import UUID -from sqlmodel import SQLModel from megaparse.config import MegaparseConfig +from sqlmodel import SQLModel from quivr_core.base_config import QuivrBaseConfig from quivr_core.processor.splitter import SplitterConfig diff --git a/backend/core/quivr_core/prompts.py b/backend/core/quivr_core/prompts.py index fa30cb5b8490..48ec90a05e11 100644 --- a/backend/core/quivr_core/prompts.py +++ b/backend/core/quivr_core/prompts.py @@ -1,14 +1,14 @@ import datetime -from pydantic import ConfigDict, create_model -from langchain_core.prompts.base import BasePromptTemplate from langchain_core.prompts import ( ChatPromptTemplate, HumanMessagePromptTemplate, + MessagesPlaceholder, PromptTemplate, SystemMessagePromptTemplate, - MessagesPlaceholder, ) +from langchain_core.prompts.base import BasePromptTemplate +from pydantic import ConfigDict, create_model class CustomPromptsDict(dict): diff --git a/backend/core/quivr_core/quivr_rag_langgraph.py b/backend/core/quivr_core/quivr_rag_langgraph.py index 7a18f83a111c..12d0bea450ec 100644 --- a/backend/core/quivr_core/quivr_rag_langgraph.py +++ b/backend/core/quivr_core/quivr_rag_langgraph.py @@ -1,7 +1,7 @@ import logging +from enum import Enum from typing import Annotated, AsyncGenerator, Optional, Sequence, 
TypedDict from uuid import uuid4 -from enum import Enum # TODO(@aminediro): this is the only dependency to langchain package, we should remove it from langchain.retrievers import ContextualCompressionRetriever @@ -12,7 +12,7 @@ from langchain_core.messages import BaseMessage from langchain_core.messages.ai import AIMessageChunk from langchain_core.vectorstores import VectorStore -from langgraph.graph import START, END, StateGraph +from langgraph.graph import END, START, StateGraph from langgraph.graph.message import add_messages from quivr_core.chat import ChatHistory diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 491e98cfe6f4..1565ff6c9a92 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -41,7 +41,7 @@ dev-dependencies = [ ] [tool.rye.workspace] -members = [".", "core", "worker", "api", "docs", "core/examples/chatbot", "core/MegaParse"] +members = [".", "core", "worker", "api", "docs", "core/examples/chatbot", "core/MegaParse", "worker/diff-assistant"] [tool.hatch.metadata] allow-direct-references = true diff --git a/backend/requirements-dev.lock b/backend/requirements-dev.lock index 0623ba330324..d96ff296597c 100644 --- a/backend/requirements-dev.lock +++ b/backend/requirements-dev.lock @@ -20,7 +20,10 @@ # via quivr-worker -e file:core/MegaParse # via quivr-core + # via quivr-diff-assistant -e file:worker +-e file:worker/diff-assistant + # via quivr-worker aiofiles==23.2.1 # via chainlit # via quivr-core @@ -43,6 +46,8 @@ anthropic==0.34.1 # via langchain-anthropic antlr4-python3-runtime==4.9.3 # via omegaconf +anyascii==0.3.2 + # via python-doctr anyio==3.7.1 # via anthropic # via asyncer @@ -126,6 +131,7 @@ click==8.1.7 # via mkdocs # via mkdocstrings # via nltk + # via python-oxmsg # via uvicorn click-didyoumean==0.3.1 # via celery @@ -178,6 +184,7 @@ defusedxml==0.7.1 # via fpdf2 # via langchain-anthropic # via nbconvert + # via python-doctr deprecated==1.2.14 # via llama-index-core # via llama-index-legacy @@ -188,6 +195,8 @@ deprecated==1.2.14 # via pikepdf deprecation==2.1.0 # via postgrest +diff-match-patch==20230430 + # via quivr-diff-assistant dirtyjson==1.0.8 # via llama-index-core # via llama-index-legacy @@ -198,6 +207,7 @@ distro==1.9.0 # via openai docx2txt==0.8 # via quivr-core + # via quivr-diff-assistant dropbox==12.0.2 # via quivr-api ecdsa==0.19.0 @@ -214,6 +224,7 @@ executing==2.0.1 # via stack-data faiss-cpu==1.8.0.post1 # via quivr-core + # via quivr-diff-assistant fastapi==0.110.3 # via chainlit # via quivr-api @@ -298,6 +309,9 @@ h11==0.14.0 # via wsproto h2==4.1.0 # via httpx +h5py==3.10.0 + # via python-doctr + # via quivr-diff-assistant hpack==4.0.0 # via h2 httpcore==1.0.5 @@ -325,6 +339,7 @@ httpx==0.27.0 httpx-sse==0.4.0 # via cohere huggingface-hub==0.24.6 + # via python-doctr # via timm # via tokenizers # via transformers @@ -371,6 +386,7 @@ jmespath==1.0.1 # via botocore joblib==1.4.2 # via nltk + # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpath-python==1.0.6 @@ -399,11 +415,12 @@ kiwisolver==1.4.5 # via matplotlib kombu==5.4.0 # via celery -langchain==0.2.14 +langchain==0.2.16 # via langchain-community # via megaparse # via quivr-api # via quivr-core + # via quivr-diff-assistant langchain-anthropic==0.1.23 # via quivr-core # via quivr-monorepo @@ -414,7 +431,7 @@ langchain-community==0.2.12 # via megaparse # via quivr-api # via quivr-core -langchain-core==0.2.38 +langchain-core==0.2.41 # via langchain # via langchain-anthropic # via langchain-cohere @@ -428,18 +445,20 @@ langchain-core==0.2.38 # 
via quivr-core langchain-experimental==0.0.64 # via langchain-cohere -langchain-openai==0.1.22 +langchain-openai==0.1.25 # via megaparse # via quivr-api + # via quivr-diff-assistant langchain-text-splitters==0.2.2 # via langchain langdetect==1.0.9 + # via python-doctr # via unstructured langgraph==0.2.14 # via quivr-core langgraph-checkpoint==1.0.6 # via langgraph -langsmith==0.1.100 +langsmith==0.1.126 # via langchain # via langchain-community # via langchain-core @@ -453,14 +472,15 @@ literalai==0.0.607 # via chainlit llama-cloud==0.0.13 # via llama-index-indices-managed-llama-cloud -llama-index==0.10.67.post1 +llama-index==0.11.12 # via megaparse -llama-index-agent-openai==0.2.9 + # via quivr-diff-assistant +llama-index-agent-openai==0.3.4 # via llama-index # via llama-index-program-openai -llama-index-cli==0.1.13 +llama-index-cli==0.3.1 # via llama-index -llama-index-core==0.10.67 +llama-index-core==0.11.12 # via llama-index # via llama-index-agent-openai # via llama-index-cli @@ -473,35 +493,39 @@ llama-index-core==0.10.67 # via llama-index-readers-file # via llama-index-readers-llama-parse # via llama-parse -llama-index-embeddings-openai==0.1.11 +llama-index-embeddings-openai==0.2.5 # via llama-index # via llama-index-cli -llama-index-indices-managed-llama-cloud==0.2.7 +llama-index-indices-managed-llama-cloud==0.3.1 # via llama-index llama-index-legacy==0.9.48.post3 # via llama-index -llama-index-llms-openai==0.1.30 +llama-index-llms-openai==0.2.9 # via llama-index # via llama-index-agent-openai # via llama-index-cli # via llama-index-multi-modal-llms-openai # via llama-index-program-openai # via llama-index-question-gen-openai -llama-index-multi-modal-llms-openai==0.1.9 + # via quivr-diff-assistant +llama-index-multi-modal-llms-openai==0.2.1 # via llama-index -llama-index-program-openai==0.1.7 +llama-index-program-openai==0.2.0 # via llama-index # via llama-index-question-gen-openai -llama-index-question-gen-openai==0.1.3 +llama-index-question-gen-openai==0.2.0 # via llama-index -llama-index-readers-file==0.1.33 +llama-index-readers-file==0.2.2 # via llama-index -llama-index-readers-llama-parse==0.1.6 + # via quivr-diff-assistant +llama-index-readers-llama-parse==0.3.0 # via llama-index -llama-parse==0.4.9 +llama-parse==0.5.6 # via llama-index-readers-llama-parse # via megaparse # via quivr-api +llvmlite==0.43.0 + # via numba lxml==5.3.0 # via pikepdf # via python-docx @@ -535,7 +559,9 @@ marshmallow==3.22.0 marshmallow-enum==1.5.1 # via unstructured-client matplotlib==3.9.2 + # via mplcursors # via pycocotools + # via quivr-diff-assistant # via unstructured-inference matplotlib-inline==0.1.7 # via ipykernel @@ -576,6 +602,8 @@ mkdocstrings-python==1.11.1 # via mkdocstrings monotonic==1.6 # via posthog +mplcursors==0.5.3 + # via quivr-diff-assistant mpmath==1.3.0 # via sympy msal==1.30.0 @@ -608,6 +636,7 @@ networkx==3.2.1 # via torch # via unstructured nltk==3.9.1 + # via llama-index # via llama-index-core # via llama-index-legacy # via unstructured @@ -615,16 +644,20 @@ nodeenv==1.9.1 # via pre-commit notion-client==2.2.1 # via quivr-api +numba==0.60.0 + # via quivr-diff-assistant numpy==1.26.3 # via chainlit # via contourpy # via faiss-cpu + # via h5py # via langchain # via langchain-community # via layoutparser # via llama-index-core # via llama-index-legacy # via matplotlib + # via numba # via onnx # via onnxruntime # via opencv-python @@ -633,12 +666,18 @@ numpy==1.26.3 # via pdf2docx # via pgvector # via pycocotools + # via python-doctr + # via quivr-diff-assistant + # via 
scikit-learn # via scipy + # via shapely # via torchvision # via transformers # via unstructured oauthlib==3.2.2 # via requests-oauthlib +olefile==0.47 + # via python-oxmsg omegaconf==2.3.0 # via effdet onnx==1.16.2 @@ -646,21 +685,25 @@ onnx==1.16.2 # via unstructured-inference onnxruntime==1.19.0 # via unstructured-inference -openai==1.42.0 +openai==1.47.1 # via langchain-openai # via litellm # via llama-index-agent-openai - # via llama-index-core + # via llama-index-embeddings-openai # via llama-index-legacy # via llama-index-llms-openai # via quivr-api + # via quivr-diff-assistant # via quivr-worker opencv-python==4.10.0.84 # via layoutparser + # via python-doctr + # via quivr-diff-assistant # via unstructured-inference opencv-python-headless==4.10.0.84 # via pdf2docx openpyxl==3.1.5 + # via quivr-diff-assistant # via unstructured opentelemetry-api==1.27.0 # via opentelemetry-exporter-otlp-proto-grpc @@ -720,8 +763,9 @@ paginate==0.5.7 pandas==2.2.2 # via langchain-cohere # via layoutparser - # via llama-index-core # via llama-index-legacy + # via llama-index-readers-file + # via quivr-diff-assistant # via unstructured pandocfilters==1.5.1 # via nbconvert @@ -747,6 +791,8 @@ pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' # via ipython pgvector==0.3.2 # via quivr-api +pi-heif==0.18.0 + # via unstructured pikepdf==9.1.1 # via unstructured pillow==10.2.0 @@ -756,13 +802,12 @@ pillow==10.2.0 # via matplotlib # via pdf2image # via pdfplumber + # via pi-heif # via pikepdf - # via pillow-heif + # via python-doctr # via python-pptx # via torchvision # via unstructured-pytesseract -pillow-heif==0.18.0 - # via unstructured platformdirs==4.2.2 # via black # via jupyter-core @@ -819,6 +864,8 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth +pyclipper==1.3.0.post5 + # via python-doctr pycocotools==2.0.8 # via effdet pycodestyle==2.12.1 @@ -839,6 +886,7 @@ pydantic==2.8.2 # via litellm # via literalai # via llama-cloud + # via llama-index-core # via openai # via postgrest # via pydantic-settings @@ -879,9 +927,12 @@ pyparsing==3.1.2 # via unstructured-client pypdf==4.3.1 # via llama-index-readers-file + # via quivr-diff-assistant # via unstructured pypdfium2==4.30.0 # via pdfplumber + # via python-doctr + # via quivr-diff-assistant pyproject-api==1.6.1 # via tox pyreadline3==3.4.1 ; sys_platform == 'win32' @@ -910,6 +961,8 @@ python-dateutil==2.9.0.post0 # via realtime # via storage3 # via unstructured-client +python-doctr==0.9.0 + # via quivr-diff-assistant python-docx==1.1.2 # via megaparse # via pdf2docx @@ -921,6 +974,7 @@ python-dotenv==1.0.1 # via pydantic-settings # via pytest-dotenv # via quivr-api + # via quivr-diff-assistant # via quivr-worker python-engineio==4.9.1 # via python-socketio @@ -929,11 +983,14 @@ python-iso639==2024.4.27 python-jose==3.3.0 # via quivr-api python-magic==0.4.27 + # via quivr-diff-assistant # via unstructured python-multipart==0.0.9 # via chainlit # via quivr-api # via unstructured-inference +python-oxmsg==0.0.1 + # via unstructured python-pptx==1.0.2 # via megaparse # via unstructured @@ -967,6 +1024,7 @@ pyzmq==26.1.1 # via ipykernel # via jupyter-client rapidfuzz==3.9.6 + # via python-doctr # via unstructured # via unstructured-inference realtime==2.0.2 @@ -1021,14 +1079,20 @@ s3transfer==0.10.2 safetensors==0.4.4 # via timm # via transformers +scikit-learn==1.5.2 + # via quivr-diff-assistant scipy==1.14.1 # via layoutparser + # via python-doctr + # via scikit-learn sentencepiece==0.2.0 # via transformers 
sentry-sdk==2.13.0 # via quivr-api setuptools==70.0.0 # via opentelemetry-instrumentation +shapely==2.0.6 + # via python-doctr simple-websocket==1.0.0 # via python-engineio six==1.16.0 @@ -1091,6 +1155,8 @@ tenacity==8.5.0 # via llama-index-legacy termcolor==2.4.0 # via fire +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via litellm @@ -1141,6 +1207,7 @@ tqdm==4.66.5 # via llama-index-core # via nltk # via openai + # via python-doctr # via transformers # via unstructured traitlets==5.14.3 @@ -1180,6 +1247,7 @@ typing-extensions==4.12.2 # via pydantic-core # via pyee # via python-docx + # via python-oxmsg # via python-pptx # via realtime # via resend @@ -1199,9 +1267,10 @@ tzdata==2024.1 # via pandas unidecode==1.3.8 # via quivr-api -unstructured==0.15.7 +unstructured==0.15.13 # via megaparse # via quivr-core + # via quivr-diff-assistant unstructured-client==0.6.0 # via unstructured unstructured-inference==0.7.36 diff --git a/backend/requirements.lock b/backend/requirements.lock index 3d8c76fcdb4b..ff4e2f9fbc3f 100644 --- a/backend/requirements.lock +++ b/backend/requirements.lock @@ -20,7 +20,10 @@ # via quivr-worker -e file:core/MegaParse # via quivr-core + # via quivr-diff-assistant -e file:worker +-e file:worker/diff-assistant + # via quivr-worker aiofiles==24.1.0 # via quivr-core aiohappyeyeballs==2.4.0 @@ -42,6 +45,8 @@ anthropic==0.34.2 # via langchain-anthropic antlr4-python3-runtime==4.9.3 # via omegaconf +anyascii==0.3.2 + # via python-doctr anyio==4.4.0 # via anthropic # via httpx @@ -108,6 +113,7 @@ click==8.1.7 # via mkdocs # via mkdocstrings # via nltk + # via python-oxmsg # via uvicorn click-didyoumean==0.3.1 # via celery @@ -155,12 +161,15 @@ defusedxml==0.7.1 # via fpdf2 # via langchain-anthropic # via nbconvert + # via python-doctr deprecated==1.2.14 # via llama-index-core # via llama-index-legacy # via pikepdf deprecation==2.1.0 # via postgrest +diff-match-patch==20230430 + # via quivr-diff-assistant dirtyjson==1.0.8 # via llama-index-core # via llama-index-legacy @@ -169,6 +178,7 @@ distro==1.9.0 # via openai docx2txt==0.8 # via quivr-core + # via quivr-diff-assistant dropbox==12.0.2 # via quivr-api ecdsa==0.19.0 @@ -183,6 +193,7 @@ executing==2.1.0 # via stack-data faiss-cpu==1.8.0.post1 # via quivr-core + # via quivr-diff-assistant fastapi==0.112.1 # via quivr-api # via sentry-sdk @@ -256,6 +267,9 @@ h11==0.14.0 # via uvicorn h2==4.1.0 # via httpx +h5py==3.10.0 + # via python-doctr + # via quivr-diff-assistant hpack==4.0.0 # via h2 httpcore==1.0.5 @@ -281,6 +295,7 @@ httpx==0.27.0 httpx-sse==0.4.0 # via cohere huggingface-hub==0.24.6 + # via python-doctr # via timm # via tokenizers # via transformers @@ -322,6 +337,7 @@ jmespath==1.0.1 # via botocore joblib==1.4.2 # via nltk + # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpath-python==1.0.6 @@ -350,11 +366,12 @@ kiwisolver==1.4.5 # via matplotlib kombu==5.4.0 # via celery -langchain==0.2.14 +langchain==0.2.16 # via langchain-community # via megaparse # via quivr-api # via quivr-core + # via quivr-diff-assistant langchain-anthropic==0.1.23 # via quivr-core # via quivr-monorepo @@ -365,7 +382,7 @@ langchain-community==0.2.12 # via megaparse # via quivr-api # via quivr-core -langchain-core==0.2.38 +langchain-core==0.2.41 # via langchain # via langchain-anthropic # via langchain-cohere @@ -379,18 +396,20 @@ langchain-core==0.2.38 # via quivr-core langchain-experimental==0.0.64 # via langchain-cohere -langchain-openai==0.1.22 +langchain-openai==0.1.25 # via megaparse # via 
quivr-api + # via quivr-diff-assistant langchain-text-splitters==0.2.2 # via langchain langdetect==1.0.9 + # via python-doctr # via unstructured langgraph==0.2.19 # via quivr-core langgraph-checkpoint==1.0.9 # via langgraph -langsmith==0.1.100 +langsmith==0.1.126 # via langchain # via langchain-community # via langchain-core @@ -400,14 +419,15 @@ litellm==1.43.19 # via quivr-api llama-cloud==0.0.13 # via llama-index-indices-managed-llama-cloud -llama-index==0.10.67.post1 +llama-index==0.11.12 # via megaparse -llama-index-agent-openai==0.2.9 + # via quivr-diff-assistant +llama-index-agent-openai==0.3.4 # via llama-index # via llama-index-program-openai -llama-index-cli==0.1.13 +llama-index-cli==0.3.1 # via llama-index -llama-index-core==0.10.67 +llama-index-core==0.11.12 # via llama-index # via llama-index-agent-openai # via llama-index-cli @@ -420,35 +440,39 @@ llama-index-core==0.10.67 # via llama-index-readers-file # via llama-index-readers-llama-parse # via llama-parse -llama-index-embeddings-openai==0.1.11 +llama-index-embeddings-openai==0.2.5 # via llama-index # via llama-index-cli -llama-index-indices-managed-llama-cloud==0.2.7 +llama-index-indices-managed-llama-cloud==0.3.1 # via llama-index llama-index-legacy==0.9.48.post3 # via llama-index -llama-index-llms-openai==0.1.30 +llama-index-llms-openai==0.2.9 # via llama-index # via llama-index-agent-openai # via llama-index-cli # via llama-index-multi-modal-llms-openai # via llama-index-program-openai # via llama-index-question-gen-openai -llama-index-multi-modal-llms-openai==0.1.9 + # via quivr-diff-assistant +llama-index-multi-modal-llms-openai==0.2.1 # via llama-index -llama-index-program-openai==0.1.7 +llama-index-program-openai==0.2.0 # via llama-index # via llama-index-question-gen-openai -llama-index-question-gen-openai==0.1.3 +llama-index-question-gen-openai==0.2.0 # via llama-index -llama-index-readers-file==0.1.33 +llama-index-readers-file==0.2.2 # via llama-index -llama-index-readers-llama-parse==0.1.6 + # via quivr-diff-assistant +llama-index-readers-llama-parse==0.3.0 # via llama-index -llama-parse==0.4.9 +llama-parse==0.5.6 # via llama-index-readers-llama-parse # via megaparse # via quivr-api +llvmlite==0.43.0 + # via numba lxml==5.3.0 # via pikepdf # via python-docx @@ -482,7 +506,9 @@ marshmallow==3.22.0 marshmallow-enum==1.5.1 # via unstructured-client matplotlib==3.9.2 + # via mplcursors # via pycocotools + # via quivr-diff-assistant # via unstructured-inference matplotlib-inline==0.1.7 # via ipykernel @@ -521,6 +547,8 @@ mkdocstrings-python==1.11.1 # via mkdocstrings monotonic==1.6 # via posthog +mplcursors==0.5.3 + # via quivr-diff-assistant mpmath==1.3.0 # via sympy msal==1.30.0 @@ -549,20 +577,25 @@ networkx==3.2.1 # via torch # via unstructured nltk==3.9.1 + # via llama-index # via llama-index-core # via llama-index-legacy # via unstructured notion-client==2.2.1 # via quivr-api +numba==0.60.0 + # via quivr-diff-assistant numpy==1.26.3 # via contourpy # via faiss-cpu + # via h5py # via langchain # via langchain-community # via layoutparser # via llama-index-core # via llama-index-legacy # via matplotlib + # via numba # via onnx # via onnxruntime # via opencv-python @@ -571,12 +604,18 @@ numpy==1.26.3 # via pdf2docx # via pgvector # via pycocotools + # via python-doctr + # via quivr-diff-assistant + # via scikit-learn # via scipy + # via shapely # via torchvision # via transformers # via unstructured oauthlib==3.2.2 # via requests-oauthlib +olefile==0.47 + # via python-oxmsg omegaconf==2.3.0 # via effdet 
onnx==1.16.2 @@ -584,21 +623,25 @@ onnx==1.16.2 # via unstructured-inference onnxruntime==1.19.0 # via unstructured-inference -openai==1.42.0 +openai==1.47.1 # via langchain-openai # via litellm # via llama-index-agent-openai - # via llama-index-core + # via llama-index-embeddings-openai # via llama-index-legacy # via llama-index-llms-openai # via quivr-api + # via quivr-diff-assistant # via quivr-worker opencv-python==4.10.0.84 # via layoutparser + # via python-doctr + # via quivr-diff-assistant # via unstructured-inference opencv-python-headless==4.10.0.84 # via pdf2docx openpyxl==3.1.5 + # via quivr-diff-assistant # via unstructured orjson==3.10.7 # via langsmith @@ -624,8 +667,9 @@ paginate==0.5.7 pandas==2.2.2 # via langchain-cohere # via layoutparser - # via llama-index-core # via llama-index-legacy + # via llama-index-readers-file + # via quivr-diff-assistant # via unstructured pandocfilters==1.5.1 # via nbconvert @@ -650,6 +694,8 @@ pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' # via ipython pgvector==0.3.2 # via quivr-api +pi-heif==0.18.0 + # via unstructured pikepdf==9.1.1 # via unstructured pillow==10.2.0 @@ -659,13 +705,12 @@ pillow==10.2.0 # via matplotlib # via pdf2image # via pdfplumber + # via pi-heif # via pikepdf - # via pillow-heif + # via python-doctr # via python-pptx # via torchvision # via unstructured-pytesseract -pillow-heif==0.18.0 - # via unstructured platformdirs==4.3.2 # via jupyter-core # via mkdocs-get-deps @@ -712,6 +757,8 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth +pyclipper==1.3.0.post5 + # via python-doctr pycocotools==2.0.8 # via effdet pycparser==2.22 ; platform_python_implementation != 'PyPy' or implementation_name == 'pypy' @@ -728,6 +775,7 @@ pydantic==2.8.2 # via langsmith # via litellm # via llama-cloud + # via llama-index-core # via openai # via postgrest # via pydantic-settings @@ -765,9 +813,12 @@ pyparsing==3.1.2 # via unstructured-client pypdf==4.3.1 # via llama-index-readers-file + # via quivr-diff-assistant # via unstructured pypdfium2==4.30.0 # via pdfplumber + # via python-doctr + # via quivr-diff-assistant pyreadline3==3.4.1 ; sys_platform == 'win32' # via humanfriendly python-dateutil==2.9.0.post0 @@ -781,6 +832,8 @@ python-dateutil==2.9.0.post0 # via realtime # via storage3 # via unstructured-client +python-doctr==0.9.0 + # via quivr-diff-assistant python-docx==1.1.2 # via megaparse # via pdf2docx @@ -790,16 +843,20 @@ python-dotenv==1.0.1 # via megaparse # via pydantic-settings # via quivr-api + # via quivr-diff-assistant # via quivr-worker python-iso639==2024.4.27 # via unstructured python-jose==3.3.0 # via quivr-api python-magic==0.4.27 + # via quivr-diff-assistant # via unstructured python-multipart==0.0.9 # via quivr-api # via unstructured-inference +python-oxmsg==0.0.1 + # via unstructured python-pptx==1.0.2 # via megaparse # via unstructured @@ -830,6 +887,7 @@ pyzmq==26.2.0 # via ipykernel # via jupyter-client rapidfuzz==3.9.6 + # via python-doctr # via unstructured # via unstructured-inference realtime==2.0.2 @@ -882,12 +940,18 @@ s3transfer==0.10.2 safetensors==0.4.4 # via timm # via transformers +scikit-learn==1.5.2 + # via quivr-diff-assistant scipy==1.14.1 # via layoutparser + # via python-doctr + # via scikit-learn sentencepiece==0.2.0 # via transformers sentry-sdk==2.13.0 # via quivr-api +shapely==2.0.6 + # via python-doctr six==1.16.0 # via asttokens # via bleach @@ -945,6 +1009,8 @@ tenacity==8.5.0 # via llama-index-legacy termcolor==2.4.0 # via fire 
+threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via litellm @@ -992,6 +1058,7 @@ tqdm==4.66.5 # via llama-index-core # via nltk # via openai + # via python-doctr # via transformers # via unstructured traitlets==5.14.3 @@ -1029,6 +1096,7 @@ typing-extensions==4.12.2 # via pydantic-core # via pyee # via python-docx + # via python-oxmsg # via python-pptx # via realtime # via resend @@ -1048,9 +1116,10 @@ tzdata==2024.1 # via pandas unidecode==1.3.8 # via quivr-api -unstructured==0.15.7 +unstructured==0.15.13 # via megaparse # via quivr-core + # via quivr-diff-assistant unstructured-client==0.8.1 # via unstructured unstructured-inference==0.7.36 diff --git a/backend/supabase/migrations/20240925083103_assistants-name.sql b/backend/supabase/migrations/20240925083103_assistants-name.sql new file mode 100644 index 000000000000..79489e148da5 --- /dev/null +++ b/backend/supabase/migrations/20240925083103_assistants-name.sql @@ -0,0 +1,9 @@ +alter table "public"."tasks" add column "assistant_name" text; + +alter + publication supabase_realtime add table tasks; + + + + + diff --git a/backend/supabase/migrations/20240925124019_assistants-metadata.sql b/backend/supabase/migrations/20240925124019_assistants-metadata.sql new file mode 100644 index 000000000000..782ab3d97771 --- /dev/null +++ b/backend/supabase/migrations/20240925124019_assistants-metadata.sql @@ -0,0 +1,4 @@ + +alter table "public"."tasks" add column "task_metadata" jsonb; + + diff --git a/backend/worker/diff-assistant/.env.exemple b/backend/worker/diff-assistant/.env.exemple new file mode 100644 index 000000000000..72086a54591d --- /dev/null +++ b/backend/worker/diff-assistant/.env.exemple @@ -0,0 +1,2 @@ +OPENAI_API_KEY = myopenaikey +LLAMA_PARSE_API_KEY = myllamaparsekey \ No newline at end of file diff --git a/backend/worker/diff-assistant/.gitignore b/backend/worker/diff-assistant/.gitignore new file mode 100644 index 000000000000..f1a81184a3b3 --- /dev/null +++ b/backend/worker/diff-assistant/.gitignore @@ -0,0 +1,15 @@ +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv +.env +.DS_Store + +#pkl +*.pkl diff --git a/backend/worker/diff-assistant/.python-version b/backend/worker/diff-assistant/.python-version new file mode 100644 index 000000000000..2419ad5b0a32 --- /dev/null +++ b/backend/worker/diff-assistant/.python-version @@ -0,0 +1 @@ +3.11.9 diff --git a/backend/worker/diff-assistant/README.md b/backend/worker/diff-assistant/README.md new file mode 100644 index 000000000000..32d85962e567 --- /dev/null +++ b/backend/worker/diff-assistant/README.md @@ -0,0 +1,3 @@ +# diff-assistant + +Quivr assistants for comparing two documents: checking compliance against a charter or regulatory specification, verifying consistency between paired documents, and detecting differences between versions of the same document.
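Among the `quivr-diff-assistant` dependencies pinned in the lock files above is `diff-match-patch`, and the notebook output near the end of this diff shows its characteristic `(op, text)` tuples. A minimal sketch of producing such a diff between two extracted document texts, with placeholder strings standing in for real parser output:

```python
from diff_match_patch import diff_match_patch

# Placeholder inputs; in the assistant these would be texts extracted
# from the "before" and "after" documents by the parser.
text_before = "Crepes sweetened with rum, baked, frozen. Cane sugar 16.6%."
text_after = "Crepes sweetened with rum, baked, frozen. Cane sugar 16.4%."

dmp = diff_match_patch()
diffs = dmp.diff_main(text_before, text_after)
dmp.diff_cleanupSemantic(diffs)  # merge character-level noise into readable chunks

# Each tuple is (op, text) with op 0 = unchanged, -1 = deleted, 1 = inserted --
# the same format visible in the notebook output below.
for op, text in diffs:
    print(op, repr(text))
```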
diff --git a/backend/worker/diff-assistant/data/cdc/Cas2-1-2_MUFFIN_MIRTYLLE.pdf b/backend/worker/diff-assistant/data/cdc/Cas2-1-2_MUFFIN_MIRTYLLE.pdf new file mode 100644 index 000000000000..8bc6268fa422 Binary files /dev/null and b/backend/worker/diff-assistant/data/cdc/Cas2-1-2_MUFFIN_MIRTYLLE.pdf differ diff --git a/backend/worker/diff-assistant/data/cdc/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.docx b/backend/worker/diff-assistant/data/cdc/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.docx new file mode 100644 index 000000000000..f24db287c246 Binary files /dev/null and b/backend/worker/diff-assistant/data/cdc/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.docx differ diff --git a/backend/worker/diff-assistant/data/cdc/Cas3-1-1_AP_Merveilleux chocolat blanc.docx b/backend/worker/diff-assistant/data/cdc/Cas3-1-1_AP_Merveilleux chocolat blanc.docx new file mode 100644 index 000000000000..51c44e640480 Binary files /dev/null and b/backend/worker/diff-assistant/data/cdc/Cas3-1-1_AP_Merveilleux chocolat blanc.docx differ diff --git a/backend/worker/diff-assistant/data/cdc/Cas3-1-1_AV_Merveilleux chocolat blanc.docx b/backend/worker/diff-assistant/data/cdc/Cas3-1-1_AV_Merveilleux chocolat blanc.docx new file mode 100644 index 000000000000..4332d9fee6f1 Binary files /dev/null and b/backend/worker/diff-assistant/data/cdc/Cas3-1-1_AV_Merveilleux chocolat blanc.docx differ diff --git a/backend/worker/diff-assistant/data/etiquettes/Cas3-2-3-AP.pdf b/backend/worker/diff-assistant/data/etiquettes/Cas3-2-3-AP.pdf new file mode 100644 index 000000000000..5b79960517df Binary files /dev/null and b/backend/worker/diff-assistant/data/etiquettes/Cas3-2-3-AP.pdf differ diff --git a/backend/worker/diff-assistant/data/etiquettes/Cas3-2-3-AV.pdf b/backend/worker/diff-assistant/data/etiquettes/Cas3-2-3-AV.pdf new file mode 100644 index 000000000000..0b929186f480 Binary files /dev/null and b/backend/worker/diff-assistant/data/etiquettes/Cas3-2-3-AV.pdf differ diff --git a/backend/worker/diff-assistant/data/etiquettes/etiquette_0_after.pdf b/backend/worker/diff-assistant/data/etiquettes/etiquette_0_after.pdf new file mode 100644 index 000000000000..b281a31827e4 Binary files /dev/null and b/backend/worker/diff-assistant/data/etiquettes/etiquette_0_after.pdf differ diff --git a/backend/worker/diff-assistant/data/etiquettes/etiquette_0_before.pdf b/backend/worker/diff-assistant/data/etiquettes/etiquette_0_before.pdf new file mode 100644 index 000000000000..464cb7cdb556 Binary files /dev/null and b/backend/worker/diff-assistant/data/etiquettes/etiquette_0_before.pdf differ diff --git a/backend/worker/diff-assistant/data/fiche_dev_produit/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.xlsx b/backend/worker/diff-assistant/data/fiche_dev_produit/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.xlsx new file mode 100644 index 000000000000..a7532e845fbb Binary files /dev/null and b/backend/worker/diff-assistant/data/fiche_dev_produit/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.xlsx differ diff --git a/backend/worker/diff-assistant/notebooks/use_case_3/test_etiquette.ipynb b/backend/worker/diff-assistant/notebooks/use_case_3/test_etiquette.ipynb new file mode 100644 index 000000000000..623448895539 --- /dev/null +++ b/backend/worker/diff-assistant/notebooks/use_case_3/test_etiquette.ipynb @@ -0,0 +1,958 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from diff_algorithm import DiffAlgorithm\n", + "from parser import 
DeadlyParser\n", + "\n", + "file_path_after = \"/Users/chloed./Documents/quivr/diff-assistant/src/cdp3/test_docs/etiquette_0_before.pdf\"\n", + "file_path_before = \"/Users/chloed./Documents/quivr/diff-assistant/src/cdp3/test_docs/etiquette_0_after.pdf\"\n", + "complex_file = \"/Users/chloed./Documents/quivr/diff-assistant/src/cdp3/test_docs/Cas3-2-3.pdf\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "parser = DeadlyParser()\n", + "parsed_before = parser.parse(file_path_before)\n", + "parsed_after = parser.parse(file_path_after)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "text_before = parsed_before.render()\n", + "text_after = parsed_after.render()" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CUDA device False\n", + "\n", + "0: 1024x800 2 Pictures, 2 Section-headers, 18 Texts, 1091.6ms\n", + "Speed: 24.9ms preprocess, 1091.6ms inference, 84.6ms postprocess per image at shape (1, 3, 1024, 800)\n" + ] + } + ], + "source": [ + "from PIL import Image\n", + "import pypdfium2 as pdfium\n", + "import torchvision.transforms as transforms\n", + "\n", + "import torch\n", + "from ultralytics import YOLOv10\n", + "\n", + "print(\"CUDA device\", torch.cuda.is_available())\n", + "\n", + "device = torch.device(\"mps\") # Default CUDA device\n", + "\n", + "model = YOLOv10(\"./yolov10x_best.pt\").to(device)\n", + "\n", + "pdf = pdfium.PdfDocument(file_path_after)\n", + "page = pdf[0] # load a page\n", + "\n", + "bitmap = page.render(scale=500 / 72)\n", + "\n", + "pil_image = bitmap.to_pil()\n", + "\n", + "# Create a transform to convert PIL image to tensor\n", + "to_tensor = transforms.ToTensor()\n", + "\n", + "# Convert PIL image to tensor (this also normalizes values to [0, 1])\n", + "tensor_image = to_tensor(pil_image)\n", + "\n", + "# Add batch dimension\n", + "tensor_image = tensor_image.unsqueeze(0).to(device)\n", + "\n", + "# Assuming your model is already on the CUDA device\n", + "model = model.to(device)\n", + "\n", + "# Perform inference\n", + "with torch.no_grad():\n", + " results = model.predict(source=pil_image, imgsz=1024, conf=0.35, batch=1)\n", + "\n", + "\n", + "annotated_image = results[0].plot()[:, :, ::-1]\n", + "\n", + "im = Image.fromarray(annotated_image)\n", + "\n", + "im.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([0.8352, 0.8235, 0.8203, 0.8113, 0.7984, 0.7860, 0.6394, 0.5778, 0.5666, 0.5546, 0.5365, 0.5300, 0.4666, 0.4322, 0.4222, 0.3932, 0.3926, 0.3901], device='mps:0')\n", + "tensor([6., 9., 7., 9., 6., 9., 6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.], device='mps:0')\n" + ] + } + ], + "source": [ + "print(results[0].boxes.conf)\n", + "print(results[0].boxes.cls)\n", + "results[0].boxes.xyxyn" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from io import BytesIO\n", + "import base64\n", + "def check_transcription(file_path, text):\n", + " pdf = pdfium.PdfDocument(file_path)\n", + " page = pdf[0] # load a page\n", + " \n", + " bitmap = page.render(scale=500 / 72)\n", + " \n", + " pil_image_before = 
bitmap.to_pil()\n", + " \n", + " buffered = BytesIO()\n", + " pil_image_before.save(buffered, format=\"PNG\")\n", + " img_str = base64.b64encode(buffered.getvalue()).decode()\n", + " \n", + " chat = ChatOpenAI(model=\"gpt-4o\", temperature=0)\n", + " result = chat.invoke(\n", + " [\n", + " HumanMessage(\n", + " content=[\n", + " {\"type\": \"text\", \"text\": f\"Can you correct this entire text retranscription, respond only with the corrected transcription: {text}\"},\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{img_str}\",\n", + " \"detail\": \"auto\",\n", + " },\n", + " },\n", + " ]\n", + " )\n", + " ]\n", + " )\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "result_before = check_transcription(file_path_before, text_before)\n", + "result_after = check_transcription(file_path_after, text_after)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coup de pâtes\n", + "TRADITION & INNOVATION\n", + "\n", + "50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\n", + "50 Thin crêpes sweetened with rum Negrita® (folded in four) D270 55g\n", + "\n", + "25514\n", + "Rhum NEGRITA\n", + "50 Crêpes fines sucrées au rhum cuites, surgelées -\n", + "50 Crêpes sweetened with rum, baked, frozen\n", + "\n", + "Ingrédients : LAIT entier, farine de BLE, sucre de canne 16.4%, ŒUFS entiers*, beurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.6%, sel, poudres à lever: E500-E331-amidon de BLE.\n", + "* Œufs issus de poules élevées au sol\n", + "\n", + "Ingredients : Whole MILK, WHEAT flour, cane sugar 16.4%, whole EGGS*, concentrated butter (MILK), water, Negrita rum (colouring: E150a) 3.6%, salt, raising agents: E500-E331-WHEAT starch.\n", + "* Barn eggs\n", + "\n", + "Conseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et maintien à 4°C, le produit se conserve au maximum pendant 24 heures. Suggestion: possibilité de décongeler les crêpes 30 secondes au four à micro-ondes.\n", + "How to prepare the product: Defrost the product 1 hour at 0°C - +4°C. After thawing, preserve the product at +4°C for 24 hours maximum. Suggestion: Defrost the crêpe 30 sec in the microwave.\n", + "\n", + "Informations nutritionnelles pour 100g / Average nutritional values for 100g:\n", + "Valeur énergétique/Energy: 1495 kJ / 356 kcal\n", + "Matières grasses totales/Fat (g): 11.4\n", + "- dont Acides Gras Saturés/of which saturated fatty acids (g): 5.9\n", + "Glucides/Carbohydrates (g): 49.5\n", + "- dont sucres/of which sugar (g): 25.2\n", + "Protéines/Proteins (g): 8.0\n", + "Sel/Salt (g): 0.45\n", + "\n", + "A conserver à -18°C : Ne jamais recongeler un produit décongelé\n", + "Store at -18°C: Don't refreeze, once defrosted\n", + "\n", + "Coup de pâtes\n", + "50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\n", + "50 Crêpes fines sucrées au rhum cuites, surgelées -\n", + "50 Crêpes sweetened with rum, baked, frozen\n", + "\n", + "N° DE LOT / BATCH : 116241 13:17\n", + "A consommer de préférence avant le / Best before : 25/10/2025\n", + "\n", + "25514\n", + "FAB : A04A\n", + "\n", + "(01)03604380255141(15)251025(10)116241(91)0316175\n", + "EAN No: 03604380255141\n", + "\n", + "Poids net / Net weight : 2750 g\n", + "\n", + "C.I: 7142 COUP DE PATES S.A.S. 
ZAC DU BEL AIR - 14-16 AVENUE JOSEPH PAXTON - FERRIERES EN BRIE - 77164 MARNE LA VALLEE CEDEX 3 - FRANCE\n" + ] + } + ], + "source": [ + "print(result_after.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, '50 CREPES FINES SUCREES AU\\nCoupdegal'),\n", + " (-1, 'g'),\n", + " (1, 'o'),\n", + " (0, '\\nRHUM NEGRITAO (PLIEES EN QUATRE)\\nTRA'),\n", + " (-1, 'C'),\n", + " (1, 'D'),\n", + " (0, 'ITION '),\n", + " (-1, '&'),\n", + " (1, 'a'),\n", + " (0, ' INNO'),\n", + " (-1, 'V'),\n", + " (1, 'Y'),\n", + " (0, 'AT'),\n", + " (-1, 'IG'),\n", + " (1, ':O'),\n", + " (0, 'N\\nD270 55g\\n50 Thin cr'),\n", + " (-1, 'ê'),\n", + " (1, 'è'),\n", + " (0, 'pes sweetened with rum Negrita'),\n", + " (-1, 'g'),\n", + " (1, 'e'),\n", + " (0, '\\n(folded in four) D270 55g\\n25514 R'),\n", + " (-1, 'k'),\n", + " (1, 'h'),\n", + " (0, 'um'),\n", + " (-1, 'y'),\n", + " (0, '\\n'),\n", + " (1, 'NEGRITA '),\n", + " (0, '50 Crêpes fines sucrées au rhum cuites, surgelées -\\n'),\n", + " (-1, 'NEGRITA\\n'),\n", + " (0, '50 Cr'),\n", + " (-1, 'è'),\n", + " (1, 'ê'),\n", + " (0,\n", + " 'pes sweetened with rum, baked, frozen\\nIngrédients : LAIT entier, farine de BLE, sucre de canne 16.'),\n", + " (-1, '6'),\n", + " (1, '4'),\n", + " (0,\n", + " '%, CEUFS entiers*,\\nbeurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.'),\n", + " (-1, '7'),\n", + " (1, '6'),\n", + " (0,\n", + " '%, sel, poudres à\\nlever: E500-E331-amidon de BLE.\\n* CEufs issus de poules élevées au sol\\nIngredients : Whole MILK, WHEAT flour, cane sugar 16.'),\n", + " (-1, '6'),\n", + " (1, '4'),\n", + " (0, '%, whole EGGS*, con'),\n", + " (-1, 'c'),\n", + " (1, 'ç'),\n", + " (0, 'entrated\\nbutter (MILK), water, Negrita rum (colouring: E150a) 3.'),\n", + " (-1, '7'),\n", + " (1, '6'),\n", + " (0,\n", + " \"%, salt, raising agents:\\nE500-E331-WHEAT starch.\\n* Barn eggs\\nConseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et\\nmaintien à 4°C, le produit se conserve au maximum pendant 24 heures\"),\n", + " (1, '.'),\n", + " (0, '\\nSuggestion: possibilité de décongeler les cr'),\n", + " (-1, 'è'),\n", + " (1, 'é'),\n", + " (0, 'pes 30 secondes au four à micr'),\n", + " (-1, 'o'),\n", + " (1, 'c'),\n", + " (0, '-ondes.\\n'),\n", + " (-1, 'BPA le 24.09.2020 '),\n", + " (0, \"How to prepare the products: Defrost the product 1 hour at 0'C-+4\"),\n", + " (1, '°'),\n", + " (0, 'C. After thawing,\\npreserve the product at +4'),\n", + " (-1, '°'),\n", + " (1, '*'),\n", + " (0,\n", + " 'C for 24 hours maximum. 
Suggestion: Defrost the crèpe 30 sec\\nin the microwave.\\nInformations nutritionnelles pour 1 Average nutritional values for 100g:\\nValeur '),\n", + " (-1, 'e'),\n", + " (1, 'é'),\n", + " (0, 'nerg'),\n", + " (-1, 'e'),\n", + " (1, 'é'),\n", + " (0, 'tique/Energy: 149'),\n", + " (-1, '7'),\n", + " (1, '5'),\n", + " (0, ' kJ / 356 kcal\\nMatières grasses totales/Fat (g): 11.'),\n", + " (-1, '6'),\n", + " (1, '4'),\n", + " (0, '\\n- dont Acides Gras Saturés/of which saturated fatty acids (g): '),\n", + " (-1, '6'),\n", + " (1, '5'),\n", + " (0, '.'),\n", + " (-1, '1'),\n", + " (1, '9'),\n", + " (0, '\\nGiu'),\n", + " (-1, 'c'),\n", + " (1, 'ri'),\n", + " (0, 'des/Car'),\n", + " (-1, 'p'),\n", + " (1, 'b'),\n", + " (0, 'o'),\n", + " (-1, 'n'),\n", + " (1, 'h'),\n", + " (0, 'y'),\n", + " (-1, 'ct'),\n", + " (1, 'di'),\n", + " (0, 'ates (g): 4'),\n", + " (-1, '8'),\n", + " (1, '9'),\n", + " (0, '.'),\n", + " (-1, '9'),\n", + " (1, '5'),\n", + " (0, '\\n'),\n", + " (-1, '- '),\n", + " (0, 'dont sucres/of which sugar (g): 2'),\n", + " (-1, '4'),\n", + " (1, '5'),\n", + " (0, '.'),\n", + " (-1, '1'),\n", + " (1, '2'),\n", + " (0, '\\nProtéines'),\n", + " (-1, '/'),\n", + " (0, 'Proteins (g): 8.0\\nSel/Salt (g): 0.4'),\n", + " (-1, '8'),\n", + " (1, '5'),\n", + " (0,\n", + " \"\\nA conserver à -18°C : Ne jamais recongeler un produit décongelé\\nStore at -18°C: Don't refreeze, once defrosted\\n\"),\n", + " (-1, 'Fabriqué en France - Made in France\\n'),\n", + " (0, 'Cou'),\n", + " (-1, 'y'),\n", + " (0, 'pde'),\n", + " (-1, 'g'),\n", + " (1, ' '),\n", + " (0, 'al'),\n", + " (-1, 'g'),\n", + " (0, '\\n50 CREPES FINES SUCREES AU RHUM\\nNEGRITA'),\n", + " (-1, 'O'),\n", + " (1, 'B'),\n", + " (0, ' (PLIEES EN QUATRE) D270\\nT'),\n", + " (-1, 'R'),\n", + " (1, 'W'),\n", + " (0, 'ADITION & INNOVAT'),\n", + " (-1, ':'),\n", + " (1, 'I'),\n", + " (0, 'ON\\n55g\\n50 Cr'),\n", + " (-1, 'ê'),\n", + " (1, 'è'),\n", + " (0,\n", + " 'pes fines sucrées au rhum cuites, surgelées -\\nNo DE LOTI\\n50 Crèpes sweetened with rum, baked, frozen\\nBATCH : '),\n", + " (-1, '084'),\n", + " (1, '116'),\n", + " (0, '2'),\n", + " (-1, '0'),\n", + " (1, '4'),\n", + " (0, '1 1'),\n", + " (-1, '5'),\n", + " (1, '3'),\n", + " (0, ':'),\n", + " (-1, '4'),\n", + " (1, '1'),\n", + " (0, '7\\nA consommer de pr'),\n", + " (-1, 'è'),\n", + " (1, 'é'),\n", + " (0, 'f'),\n", + " (-1, 'è'),\n", + " (1, 'é'),\n", + " (0, 'rence avant'),\n", + " (-1, ' '),\n", + " (0, 'le '),\n", + " (-1, 'I'),\n", + " (1, '/'),\n", + " (0, '\\n25514\\nBest before : 2'),\n", + " (-1, '4'),\n", + " (1, '5'),\n", + " (0, '/'),\n", + " (1, '1'),\n", + " (0, '0'),\n", + " (-1, '9'),\n", + " (0, '/202'),\n", + " (-1, '1'),\n", + " (1, '5'),\n", + " (0, '\\n'),\n", + " (1, 'FAB :\\nA'),\n", + " (0, '0'),\n", + " (1, '4A\\n'),\n", + " (0, '0'),\n", + " (-1, '9'),\n", + " (1, '1.'),\n", + " (0, '0'),\n", + " (1, '9'),\n", + " (0, '80'),\n", + " (-1, '43'),\n", + " (1, '.9'),\n", + " (0, '80'),\n", + " (1, '2'),\n", + " (0, '55141052'),\n", + " (1, '5'),\n", + " (0, '10'),\n", + " (-1, '9'),\n", + " (0, '2'),\n", + " (-1, '4'),\n", + " (1, '5'),\n", + " (0, '10'),\n", + " (-1, '08'),\n", + " (1, '1162'),\n", + " (0, '4'),\n", + " (-1, '20'),\n", + " (0, '1 (91)0316'),\n", + " (-1, '4'),\n", + " (1, '1'),\n", + " (0, '7'),\n", + " (-1, '6'),\n", + " (1, '5'),\n", + " (0, '\\nEAN No: 03604380255141'),\n", + " (-1, ' FAB : 00001 '),\n", + " (1, '\\n'),\n", + " (0, 'Poids net'),\n", + " (-1, '\\n:\\n'),\n", + " (1, '! 
'),\n", + " (0, '2750\\nNet weight'),\n", + " (1, ': :'),\n", + " (0, '\\ng\\n'),\n", + " (-1, '\\n'),\n", + " (1, 'Ci: 7142 '),\n", + " (0, 'COUP DE'),\n", + " (1, 'F'),\n", + " (0, ' PATES'),\n", + " (-1, 'E'),\n", + " (1, 'O'),\n", + " (0, ' S'),\n", + " (-1, '.'),\n", + " (0, 'A.S'),\n", + " (-1, '-;'),\n", + " (0, ' ZA'),\n", + " (-1, 'C'),\n", + " (1, 'Ç'),\n", + " (0, ' DU BEL AIR'),\n", + " (-1, '-'),\n", + " (0, ' 14-16 AVENUE'),\n", + " (1, '.'),\n", + " (0, ' '),\n", + " (-1, 'J'),\n", + " (1, 'V'),\n", + " (0, 'OSEPH'),\n", + " (-1, ' '),\n", + " (0, 'PAXTON-\\nFERRIERES EN'),\n", + " (-1, 'I'),\n", + " (0, ' BRIE 77'),\n", + " (-1, '6'),\n", + " (1, '8'),\n", + " (0, '14 MARNE LA VALLEE CEDEX 3'),\n", + " (1, '- FRANÇE')]" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dmp= DiffAlgorithm()\n", + "diff_main = dmp.diff_main(result_before.content, result_after.content)\n", + "#diff_main = dmp.diff_main(text_before, text_after)\n", + "#result = dmp.to_pretty_json(diff_main, parsed_before)\n", + "diff_main" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "#split differences and send to llm \n", + "cleaned_diff = []\n", + "for cat, content in diff_main:\n", + " if content.strip() and content != \"\\n\":\n", + " cleaned_diff.append((cat, content))" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [], + "source": [ + "def format_difference(main_diff):\n", + " text_modified = \"\"\n", + " sub_stack = 0\n", + " for op, data in main_diff:\n", + " if op == 0: \n", + " text_modified += data if sub_stack == 0 else f\"_]] {data}\"\n", + " elif op == -1: \n", + " if sub_stack == 0:\n", + " text_modified += f\"[[{data}->\"\n", + " sub_stack += 1\n", + " else:\n", + " text_modified += f\"{data}->\"\n", + " elif op == 1: \n", + " if sub_stack > 0:\n", + " text_modified += f\"{data}]]\"\n", + " sub_stack -= 1\n", + " else:\n", + " text_modified += f\"[[ _ ->{data}]]\"\n", + " return text_modified" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\"50 CREPES FINES SUCREES AU\\nCoupdegal[[g->o]]\\nRHUM NEGRITAO (PLIEES EN QUATRE)\\nTRA[[C->D]]ITION [[&->a]] INNO[[V->Y]]AT[[IG->:O]]N\\nD270 55g\\n50 Thin cr[[ê->è]]pes sweetened with rum Negrita[[g->e]]\\n(folded in four) D270 55g\\n25514 R[[k->h]]um[[y->NEGRITA ]]50 Crêpes fines sucrées au rhum cuites, surgelées -\\n[[NEGRITA\\n->_]] 50 Crè->ê]]pes sweetened with rum, baked, frozen\\nIngrédients : LAIT entier, farine de BLE, sucre de canne 16.[[6->4]]%, CEUFS entiers*,\\nbeurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.[[7->6]]%, sel, poudres à\\nlever: E500-E331-amidon de BLE.\\n* CEufs issus de poules élevées au sol\\nIngredients : Whole MILK, WHEAT flour, cane sugar 16.[[6->4]]%, whole EGGS*, con[[c->ç]]entrated\\nbutter (MILK), water, Negrita rum (colouring: E150a) 3.[[7->6]]%, salt, raising agents:\\nE500-E331-WHEAT starch.\\n* Barn eggs\\nConseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et\\nmaintien à 4°C, le produit se conserve au maximum pendant 24 heures[[ _ ->.]]\\nSuggestion: possibilité de décongeler les cr[[è->é]]pes 30 secondes au four à micr[[o->c]]-ondes.\\n[[BPA le 24.09.2020 ->_]] How to prepare the products: Defrost the product 1 hour at 0'C-+4°]]C. 
After thawing,\\npreserve the product at +4[[°->*]]C for 24 hours maximum. Suggestion: Defrost the crèpe 30 sec\\nin the microwave.\\nInformations nutritionnelles pour 1 Average nutritional values for 100g:\\nValeur [[e->é]]nerg[[e->é]]tique/Energy: 149[[7->5]] kJ / 356 kcal\\nMatières grasses totales/Fat (g): 11.[[6->4]]\\n- dont Acides Gras Saturés/of which saturated fatty acids (g): [[6->5]].[[1->9]]\\nGiu[[c->ri]]des/Car[[p->b]]o[[n->h]]y[[ct->di]]ates (g): 4[[8->9]].[[9->5]][[- ->_]] dont sucres/of which sugar (g): 24->5]].[[1->2]]\\nProtéines[[/->_]] Proteins (g): 8.0\\nSel/Salt (g): 0.48->5]]\\nA conserver à -18°C : Ne jamais recongeler un produit décongelé\\nStore at -18°C: Don't refreeze, once defrosted\\n[[Fabriqué en France - Made in France\\n->_]] Couy->_]] pdeg->_]] alg->_]] \\n50 CREPES FINES SUCREES AU RHUM\\nNEGRITAO->B]] (PLIEES EN QUATRE) D270\\nT[[R->W]]ADITION & INNOVAT[[:->I]]ON\\n55g\\n50 Cr[[ê->è]]pes fines sucrées au rhum cuites, surgelées -\\nNo DE LOTI\\n50 Crèpes sweetened with rum, baked, frozen\\nBATCH : [[084->116]]2[[0->4]]1 1[[5->3]]:[[4->1]]7\\nA consommer de pr[[è->é]]f[[è->é]]rence avantle [[I->/]]\\n25514\\nBest before : 2[[4->5]]/[[ _ ->1]]0[[9->_]] /2021->5]][[ _ ->FAB :\\nA]]0[[ _ ->4A\\n]]0[[9->1.]]0[[ _ ->9]]80[[43->.9]]80[[ _ ->2]]55141052[[ _ ->5]]10[[9->_]] 24->5]]10[[08->1162]]4[[20->_]] 1 (91)03164->1]]7[[6->5]]\\nEAN No: 03604380255141[[ FAB : 00001 ->_]] Poids net\\n:\\n->! ]]2750\\nNet weight[[ _ ->: :]]\\ng\\n[[ _ ->Ci: 7142 ]]COUP DE[[ _ ->F]] PATES[[E->O]] S[[.->_]] A.S-;->_]] ZAC->Ç]] DU BEL AIR[[-->_]] 14-16 AVENUE.]][[J->V]]OSEPHPAXTON-\\nFERRIERES EN[[I->_]] BRIE 776->8]]14 MARNE LA VALLEE CEDEX 3[[ _ ->- FRANÇE]]\"" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "format_difference(cleaned_diff)" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "import os\n", + "\n", + "llm = ChatOpenAI(\n", + " model=\"gpt-4o\",\n", + " temperature=0,\n", + " max_tokens=None,\n", + " timeout=None,\n", + " max_retries=2,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [], + "source": [ + "section_diffs = [cleaned_diff]" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "report = []\n", + "#modified_section_names = []\n", + "for section in section_diffs:\n", + " if len(section) == 1 and section[0][0] == 0:\n", + " print(\"No differences found in this section.\")\n", + " continue\n", + " else:\n", + " text_modified = format_difference(section)\n", + " #modified_section_names.append(section[0][1].split(\"\\n\")[1].split(\"#\")[-1].strip())\n", + " messages = [\n", + " (\n", + " \"human\",\n", + " f\"\"\"You are tasked with analyzing and reporting differences in text for a Quality engineer. The input text contains differences marked with special tokens. 
Your job is to parse these differences and create a clear, concise report.\n", + "\n", + " Here is the text containing the differences:\n", + "\n", + " \n", + " {text_modified}\n", + " \n", + "\n", + " RULE #1 : If there are no [[->]] tokens, it indicates no changes to report, inventing changes means death.\n", + " The differences are marked using the following format:\n", + " - [[before->after]] indicates a change from \"before\" to \"after\"\n", + " - If there is no \"before\" text, it indicates an addition\n", + " - If there is no \"after\" text, it indicates a deletion\n", + " - If there is no [[ ]] token, it indicates no changes to report\n", + " - Make sense of the difference and do not keep the '[' in the report.\n", + " - \"_\" alone means empty.\n", + "\n", + " Follow these steps to create your report:\n", + "\n", + " 1. Carefully read through the entire text.\n", + " 2. Identify each instance of [[ ]] tokens.\n", + " 3. For each instance, determine the modification that was made.\n", + " Present your report in the following format:\n", + " \n", + " In the section ..., the modification found are :\n", + " * the **black** cat was changed to : the **red** cat\n", + " * ...\n", + " \n", + " Note that there might be no modifications in some sections. In that case, simply state that no differences were found.\n", + "\n", + "\n", + " Remember, your goal is to create a clear and concise report that allows the Quality engineer to quickly verify the differences. Focus on accuracy and readability in your output, give every indication possible to make it easier to find the modification.\n", + " The report should be written in a professional and formal tone and in French.\"\"\",\n", + " ),\n", + " ]\n", + " response = llm.invoke(messages)\n", + " report.append(response.content)\n", + "\n", + "#print(\"The modified Sections are : \", modified_section_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Dans la section \"50 CREPES FINES SUCREES AU\", les modifications trouvées sont :\n", + "* Coupdegal**g** a été changé en : Coupdegal**o**\n", + "\n", + "Dans la section \"RHUM NEGRITAO (PLIEES EN QUATRE)\", les modifications trouvées sont :\n", + "* TRA**C**ITION a été changé en : TRA**D**ITION\n", + "* TRA**&** INNO**V**ATION a été changé en : TRA**a** INNO**Y**ATION\n", + "* INNO**V**ATION a été changé en : INNO**Y**ATION\n", + "* INNO**IG**N a été changé en : INNO**:O**N\n", + "\n", + "Dans la section \"50 Thin crêpes sweetened with rum Negrita\", les modifications trouvées sont :\n", + "* cr**ê**pes a été changé en : cr**è**pes\n", + "* Negrita**g** a été changé en : Negrita**e**\n", + "\n", + "Dans la section \"25514 Rhum NEGRITA 50 Crêpes fines sucrées au rhum cuites, surgelées\", les modifications trouvées sont :\n", + "* R**k**um a été changé en : R**h**um\n", + "* Rhum**y** a été changé en : Rhum**NEGRITA**\n", + "* NEGRITA a été changé en : (supprimé)\n", + "* Crè**ê**pes a été changé en : Crè**e**pes\n", + "\n", + "Dans la section \"Ingrédients\", les modifications trouvées sont :\n", + "* sucre de canne 16.**6**% a été changé en : sucre de canne 16.**4**%\n", + "* rhum Negrita (colorant: E150a) 3.**7**% a été changé en : rhum Negrita (colorant: E150a) 3.**6**%\n", + "\n", + "Dans la section \"Ingredients\", les modifications trouvées sont :\n", + "* cane sugar 16.**6**% a été changé en : cane sugar 16.**4**%\n", + "* con**c**entrated 
butter a été changé en : con**ç**entrated butter\n", + "* Negrita rum (colouring: E150a) 3.**7**% a été changé en : Negrita rum (colouring: E150a) 3.**6**%\n", + "\n", + "Dans la section \"Conseil d'utilisation\", les modifications trouvées sont :\n", + "* 24 heures** _ ** a été changé en : 24 heures**.**\n", + "* cr**è**pes a été changé en : cr**é**pes\n", + "* micr**o**-ondes a été changé en : micr**c**-ondes\n", + "* BPA le 24.09.2020 a été changé en : (supprimé)\n", + "\n", + "Dans la section \"How to prepare the products\", les modifications trouvées sont :\n", + "* 0'C-+4**°**C a été changé en : 0'C-+4**C**\n", + "* +4**°**C a été changé en : +4**C**\n", + "\n", + "Dans la section \"Valeur énergétique/Energy\", les modifications trouvées sont :\n", + "* Valeur **e**nerg**e**tique a été changé en : Valeur **é**nerg**é**tique\n", + "* 149**7** kJ a été changé en : 149**5** kJ\n", + "\n", + "Dans la section \"Matières grasses totales/Fat (g)\", les modifications trouvées sont :\n", + "* 11.**6** a été changé en : 11.**4**\n", + "\n", + "Dans la section \"Acides Gras Saturés/of which saturated fatty acids (g)\", les modifications trouvées sont :\n", + "* **6**.1 a été changé en : **5**.9\n", + "\n", + "Dans la section \"Glucides/Carbohydrates (g)\", les modifications trouvées sont :\n", + "* Giu**c**des a été changé en : Giu**ri**des\n", + "* Car**p**o**n**y**ct**ates a été changé en : Car**b**o**h**y**di**ates\n", + "* 4**8**.9 a été changé en : 4**9**.5\n", + "* 24**-**1 a été changé en : 24**.2**\n", + "\n", + "Dans la section \"Protéines/Proteins (g)\", les modifications trouvées sont :\n", + "* Protéines**/** a été changé en : Protéines\n", + "\n", + "Dans la section \"Sel/Salt (g)\", les modifications trouvées sont :\n", + "* 0.48**->5** a été changé en : 0.48**5**\n", + "\n", + "Dans la section \"A conserver à -18°C\", les modifications trouvées sont :\n", + "* Fabriqué en France - Made in France a été changé en : (supprimé)\n", + "\n", + "Dans la section \"50 CREPES FINES SUCREES AU RHUM\", les modifications trouvées sont :\n", + "* NEGRITAO**->B** a été changé en : NEGRITAO**B**\n", + "* T**R**ADITION a été changé en : T**W**ADITION\n", + "* INNOVAT**:**ON a été changé en : INNOVAT**I**ON\n", + "\n", + "Dans la section \"50 Crêpes fines sucrées au rhum cuites, surgelées\", les modifications trouvées sont :\n", + "* cr**ê**pes a été changé en : cr**è**pes\n", + "\n", + "Dans la section \"BATCH\", les modifications trouvées sont :\n", + "* 084**2**0 a été changé en : 116**4**1\n", + "* 1**5**:4**7** a été changé en : 1**3**:1**7**\n", + "\n", + "Dans la section \"A consommer de préférence avant le\", les modifications trouvées sont :\n", + "* 2**4**/10/2021 a été changé en : 2**5**/10/2021\n", + "* FAB : A0 a été changé en : FAB : 4A\n", + "* 09.0 a été changé en : 1.0\n", + "* 98043 a été changé en : 980\n", + "* 255141052 a été changé en : 2551410525\n", + "* 109 a été changé en : 109\n", + "* 24.10.08 a été changé en : 24.10.1162\n", + "* 20 a été changé en : 20\n", + "* 1 (91)03164 a été changé en : 1 (91)03164-17\n", + "\n", + "Dans la section \"EAN No\", les modifications trouvées sont :\n", + "* EAN No: 03604380255141 FAB : 00001 a été changé en : EAN No: 03604380255141\n", + "\n", + "Dans la section \"Poids net\", les modifications trouvées sont :\n", + "* Poids net : 2750 a été changé en : Poids net : 2750\n", + "\n", + "Dans la section \"Net weight\", les modifications trouvées sont :\n", + "* Net weight : : a été changé en : Net weight : :\n", + "\n", + "Dans la section 
\"COUP DE PATES\", les modifications trouvées sont :\n", + "* COUP DE PATES a été changé en : COUP DE F PATES\n", + "* S.A.S a été changé en : S.A.S\n", + "* ZAC DU BEL AIR a été changé en : ZAC DU BEL AIR\n", + "* 14-16 AVENUE a été changé en : 14-16 AVENUE\n", + "* JOSEPH PAXTON a été changé en : JOSEPH PAXTON\n", + "* FERRIERES EN BRIE 77614 MARNE LA VALLEE CEDEX 3 a été changé en : FERRIERES EN BRIE 77614 MARNE LA VALLEE CEDEX 3\n", + "\n" + ] + } + ], + "source": [ + "print(report[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(1, 'Coup de pâtes\\nTRADITION & INNOVATION\\n\\n'),\n", + " (0,\n", + " '50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\\n50 Thin crêpes sweetened with rum Negrita® (folded in four) D270 55g\\n'),\n", + " (0, '25514'),\n", + " (0, 'Rhum NEGRITA\\n50 Crêpes fines sucrées au rhum cuites, surgelées -'),\n", + " (0, '50 Crêpes sweetened with rum, baked, frozen'),\n", + " (0, '\\nIngrédients : LAIT entier, farine de BLE, sucre de canne 16.'),\n", + " (-1, '6'),\n", + " (1, '4'),\n", + " (0,\n", + " '%, ŒUFS entiers*, beurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.'),\n", + " (-1, '7'),\n", + " (1, '6'),\n", + " (0,\n", + " '%, sel, poudres à lever: E500-E331-amidon de BLE.\\n* Œufs issus de poules élevées au sol\\n'),\n", + " (0, 'Ingredients : Whole MILK, WHEAT flour, cane sugar 16.'),\n", + " (-1, '6'),\n", + " (1, '4'),\n", + " (0,\n", + " '%, whole EGGS*, concentrated butter (MILK), water, Negrita rum (colouring: E150a) 3.'),\n", + " (-1, '7'),\n", + " (1, '6'),\n", + " (0, '%, salt, raising agents: E500-E331-WHEAT starch.\\n* Barn eggs\\n'),\n", + " (0,\n", + " \"Conseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et maintien à 4°C, le produit se conserve au maximum pendant 24 heures\"),\n", + " (1, '. '),\n", + " (0,\n", + " 'Suggestion: possibilité de décongeler les crêpes 30 secondes au four à micro-ondes.\\n'),\n", + " (-1, 'BPA le 24.09.2020 '),\n", + " (0, 'How to prepare the product'),\n", + " (-1, 's'),\n", + " (0, ': Defrost the product 1 hour at 0°C'),\n", + " (0, '-'),\n", + " (1, ' +'),\n", + " (0, '4°C. After thawing, preserve the product at '),\n", + " (1, '+'),\n", + " (0,\n", + " '4°C for 24 hours maximum. 
Suggestion: Defrost the crêpe 30 sec in the microwave.\\n'),\n", + " (0,\n", + " 'Informations nutritionnelles pour 100g / Average nutritional values for 100g:\\nValeur énergétique/Energy: 149'),\n", + " (-1, '7'),\n", + " (1, '5'),\n", + " (0, ' kJ / 356 kcal\\nMatières grasses totales/Fat (g): 11.'),\n", + " (-1, '6'),\n", + " (1, '4'),\n", + " (0, '\\n- dont Acides Gras Saturés/of which saturated fatty acids (g): '),\n", + " (-1, '6'),\n", + " (1, '5'),\n", + " (0, '.'),\n", + " (-1, '1'),\n", + " (1, '9'),\n", + " (0, '\\nGlucides/Carbohydrates (g): 4'),\n", + " (-1, '8'),\n", + " (1, '9'),\n", + " (0, '.'),\n", + " (-1, '9'),\n", + " (1, '5'),\n", + " (0, '\\n- dont sucres/of which sugar (g): 2'),\n", + " (-1, '4'),\n", + " (1, '5'),\n", + " (0, '.'),\n", + " (-1, '1'),\n", + " (1, '2'),\n", + " (0, '\\nProtéines/Proteins (g): 8.0\\nSel/Salt (g): 0.4'),\n", + " (-1, '8'),\n", + " (1, '5\\n'),\n", + " (0,\n", + " \"\\nA conserver à -18°C : Ne jamais recongeler un produit décongelé\\nStore at -18°C: Don't refreeze, once defrosted\\n\"),\n", + " (-1, 'Fabriq'),\n", + " (1, '\\nCo'),\n", + " (0, 'u'),\n", + " (-1, 'é'),\n", + " (1, 'p'),\n", + " (-1, 'en France - Ma'),\n", + " (0, 'de '),\n", + " (-1, 'in Franc'),\n", + " (1, 'pât'),\n", + " (0, 'e'),\n", + " (1, 's'),\n", + " (0,\n", + " '\\n50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\\n50 Crêpes fines sucrées au rhum cuites, surgelées -'),\n", + " (0, '50 Crêpes sweetened with rum, baked, frozen\\n'),\n", + " (0, 'N° DE LOT / BATCH : '),\n", + " (-1, '08'),\n", + " (1, '1162'),\n", + " (0, '4'),\n", + " (-1, '20'),\n", + " (0, '1 1'),\n", + " (-1, '5'),\n", + " (1, '3'),\n", + " (0, ':'),\n", + " (-1, '4'),\n", + " (1, '1'),\n", + " (0, '7\\nA consommer de préférence avant le / Best before : 2'),\n", + " (-1, '4'),\n", + " (1, '5'),\n", + " (0, '/'),\n", + " (1, '1'),\n", + " (0, '0'),\n", + " (-1, '9'),\n", + " (0, '/202'),\n", + " (-1, '1'),\n", + " (1, '5\\n'),\n", + " (0, '\\n25514\\n'),\n", + " (1, 'FAB : A04A\\n\\n'),\n", + " (0, '(01)03604380255141(15)2'),\n", + " (1, '5'),\n", + " (0, '10'),\n", + " (-1, '9'),\n", + " (0, '2'),\n", + " (-1, '4'),\n", + " (1, '5'),\n", + " (0, '(10)'),\n", + " (-1, '08'),\n", + " (1, '1162'),\n", + " (0, '4'),\n", + " (-1, '20'),\n", + " (0, '1'),\n", + " (0, '(91)0316'),\n", + " (-1, '4'),\n", + " (1, '1'),\n", + " (0, '7'),\n", + " (-1, '6'),\n", + " (1, '5'),\n", + " (0, '\\nEAN N'),\n", + " (-1, '°'),\n", + " (1, 'o'),\n", + " (0, ': 03604380255141'),\n", + " (-1, ' FAB : 00001 '),\n", + " (0, 'Poids net / Net weight : 2750 g\\n'),\n", + " (1, '\\nC.I: 7142 '),\n", + " (0, 'COUP DE PATES'),\n", + " (-1, '®'),\n", + " (0, ' S.A.S'),\n", + " (-1, ' -'),\n", + " (1, '.'),\n", + " (0, ' ZAC DU BEL AIR - 14-16 AVENUE JOSEPH PAXTON - FERRIERES EN BRIE - 77'),\n", + " (-1, '6'),\n", + " (0, '1'),\n", + " (1, '6'),\n", + " (0, '4 MARNE LA VALLEE CEDEX 3'),\n", + " (1, ' - FRANCE')]" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cleaned_diff" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/backend/worker/diff-assistant/pyproject.toml b/backend/worker/diff-assistant/pyproject.toml new file mode 100644 index 000000000000..01ee4317cd2e --- /dev/null +++ b/backend/worker/diff-assistant/pyproject.toml @@ -0,0 +1,53 @@ +[project] +name = "quivr-diff-assistant" +version = "0.1.0" +description = "Diff Assistant" +authors = [ + { name = "Stan Girard", email = "stan@quivr.app" } ] + +dependencies = [ + "python-doctr>=0.9.0", + "matplotlib>=3.9.2", + "mplcursors>=0.5.3", + "diff-match-patch>=20230430", + "scikit-learn>=1.5.1", + "numpy>=1.16.0", + "unstructured>=0.15.9", + "python-magic>=0.4.27", + "pypdfium2>=4.30.0", + "numba>=0.60.0", + "docx2txt>=0.8", + "openpyxl>=3.1.5", + "faiss-cpu>=1.8.0.post1", + "llama-index>=0.11.8", + "openai>=1.44.1", + "pandas>=2.2.2", + "pypdf>=4.3.1", + "llama-index-readers-file>=0.2.1", + "llama-index-llms-openai>=0.2.3", + "python-dotenv>=1.0.1", + "langchain>=0.2.16", + "langchain-openai>=0.1.24", + "opencv-python>=4.10.0.84", + "megaparse>=0.0.31", + "h5py==3.10.0", +] +readme = "README.md" +requires-python = ">= 3.11" # the sources use PEP 604 unions (str | Path), so 3.8 was too low + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +dev-dependencies = [ + "pytest>=8.3.2", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["quivr_diff_assistant"] diff --git a/frontend/app/assistants/AssistantModal/InputsStep/InputsStep.module.scss b/backend/worker/diff-assistant/quivr_diff_assistant/__init__.py similarity index 100% rename from frontend/app/assistants/AssistantModal/InputsStep/InputsStep.module.scss rename to backend/worker/diff-assistant/quivr_diff_assistant/__init__.py diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/main_uc2.py b/backend/worker/diff-assistant/quivr_diff_assistant/main_uc2.py new file mode 100644 index 000000000000..6de22fd92c94 --- /dev/null +++ b/backend/worker/diff-assistant/quivr_diff_assistant/main_uc2.py @@ -0,0 +1,221 @@ +import asyncio +from enum import Enum + +import pandas as pd +import streamlit as st +from dotenv import load_dotenv +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.output_parsers import StrOutputParser +from langchain_openai import ChatOpenAI +from llama_index.core import SimpleDirectoryReader, VectorStoreIndex +from llama_index.core.node_parser import UnstructuredElementNodeParser +from llama_index.core.query_engine import RetrieverQueryEngine +from llama_index.core.retrievers import RecursiveRetriever +from llama_index.core.schema import Document +from llama_index.llms.openai import OpenAI +from quivr_diff_assistant.utils.utils import COMPARISON_PROMPT + +from quivr_diff_assistant.use_case_3.parser import DeadlyParser + +load_dotenv() + +# Set pandas display options +pd.set_option("display.max_rows", None) +pd.set_option("display.max_columns", None) +pd.set_option("display.width", None) +pd.set_option("display.max_colwidth", None) + + +def load_and_process_document(file_path, pickle_file): + print(file_path) + reader = SimpleDirectoryReader(input_files=[file_path]) + docs = reader.load_data() + print(len(docs), " and", len(docs[0].text)) + if len(docs) == 1 and len(docs[0].text) < 9: # (almost) no text extracted: likely a scanned document + print("No text found with classical parse, switching to OCR ...") + parser = DeadlyParser() + doc = parser.deep_parse(file_path) + docs = [Document.from_langchain_format(doc)] # classmethod; no intermediate instance needed + + node_parser = UnstructuredElementNodeParser() + + raw_nodes = node_parser.get_nodes_from_documents(docs) + + base_nodes, node_mappings = 
node_parser.get_base_nodes_and_mappings(raw_nodes) + return base_nodes, node_mappings + + +def create_query_engine(base_nodes, node_mappings): + vector_index = VectorStoreIndex(base_nodes) + vector_retriever = vector_index.as_retriever(similarity_top_k=5) + recursive_retriever = RecursiveRetriever( + "vector", + retriever_dict={"vector": vector_retriever}, + node_dict=node_mappings, + verbose=True, + ) + return RetrieverQueryEngine.from_args( + recursive_retriever, llm=OpenAI(temperature=0, model="gpt-4") + ) + + +def compare_responses(response1, response2): + llm = OpenAI(temperature=0, model="gpt-4") + prompt = f""" + Compare the following two responses and determine if they convey the same information: + Response for document 1: {response1} + Response for document 2: {response2} + Are these responses essentially the same? Provide a brief explanation for your conclusion. Differences in format are not important; focus on the content and the numbers. + If there are any specific differences, please highlight them with bullet points. Respond in French and in markdown format. + """ + return llm.complete(prompt) + + +class ComparisonTypes(str, Enum): + CDC_ETIQUETTE = "Cahier des Charges - Etiquette" + CDC_FICHE_DEV = "Cahier des Charges - Fiche Dev" + + +def llm_comparator( + document: str, cdc: str, llm: BaseChatModel, comparison_type: ComparisonTypes +): + chain = COMPARISON_PROMPT | llm | StrOutputParser() + + if comparison_type == ComparisonTypes.CDC_ETIQUETTE: + text_1 = "Etiquette" + elif comparison_type == ComparisonTypes.CDC_FICHE_DEV: + text_1 = "Fiche Dev" + + return chain.stream( + { + "document": document, + "text_1": text_1, + "cdc": cdc, + "text_2": "Cahier des Charges", + } + ) + + +async def test_main(): + cdc_doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Mendiant Lait_QD PC F03 - FR Cahier des charges produit -rev 2021-v2.pdf" + doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Proposition étiquette Mendiant Lait croustillant.pdf" + + cdc_doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-1-3_12_CDC_70690_Entremets rond vanille pécan individuel_2024.06.28 VALIDE.docx" + doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-1-3_CDP_R&D_TABL_01_Fiche développement produit - Entremets vanille pécan 28 06 2024.xlsx" + + comparison_type = ComparisonTypes.CDC_FICHE_DEV + + llm = ChatOpenAI( + model="gpt-4o", + temperature=0.1, + max_tokens=None, + max_retries=2, + ) + + parser = DeadlyParser() + parsed_cdc_doc = await parser.aparse(cdc_doc) + + if comparison_type == ComparisonTypes.CDC_ETIQUETTE: + parsed_doc = await parser.deep_aparse(doc, llm=llm) + else: + parsed_doc = await parser.aparse(doc) + + print("\n\n Cahier des Charges") + print(parsed_cdc_doc.page_content) + + print("\n\n Other document") + print(parsed_doc.page_content) + + comparison = llm_comparator( + document=parsed_doc.page_content, + cdc=parsed_cdc_doc.page_content, + llm=llm, + comparison_type=comparison_type, + ) + + print("\n\n Comparison") + print("".join(comparison)) # llm_comparator returns a stream of chunks; drain it before printing + + +def get_document_path(doc): + try: + with open(doc.name, "wb") as temp_file: + temp_file.write(doc.getbuffer()) + path = temp_file.name + except Exception: # doc is already a path on disk rather than an uploaded file + path = doc + + return path + + +async def parse_documents(cdc_doc, doc, comparison_type: ComparisonTypes, llm): + parser = DeadlyParser() + + # Schedule the coroutines as tasks + cdc_task = asyncio.create_task(parser.aparse(get_document_path(cdc_doc))) + + if comparison_type == ComparisonTypes.CDC_ETIQUETTE: + doc_task = 
asyncio.create_task( + parser.deep_aparse(get_document_path(doc), llm=llm) + ) + else: + doc_task = asyncio.create_task(parser.aparse(get_document_path(doc))) + + # Optionally, do other work here while tasks are running + + # Await the tasks to get the results + parsed_cdc_doc = await cdc_task + print("\n\n Cahier des Charges: \n", parsed_cdc_doc.page_content) + + parsed_doc = await doc_task + print("\n\n Other doc: \n", parsed_doc.page_content) + + return parsed_cdc_doc, parsed_doc + + +def main(): + st.title("Document Comparison Tool : Use Case 2") + + # File uploaders for two documents + cdc_doc = st.file_uploader( + "Upload Cahier des Charges", type=["docx", "xlsx", "pdf", "txt"] + ) + doc = st.file_uploader( + "Upload Etiquette / Fiche Dev", type=["docx", "xlsx", "pdf", "txt"] + ) + + comparison_type = st.selectbox( + "Select document types", + [ComparisonTypes.CDC_ETIQUETTE.value, ComparisonTypes.CDC_FICHE_DEV.value], + ) + + if st.button("Process Documents and Questions"): + if not cdc_doc or not doc: + st.error("Please upload both documents before launching the processing.") + return + + with st.spinner("Processing files..."): + llm = ChatOpenAI( + model="gpt-4o", + temperature=0.1, + max_tokens=None, + max_retries=2, + ) + + parsed_cdc_doc, parsed_doc = asyncio.run( + parse_documents(cdc_doc, doc, comparison_type=comparison_type, llm=llm) + ) + + comparison = llm_comparator( + document=parsed_doc.page_content, + cdc=parsed_cdc_doc.page_content, + llm=llm, + comparison_type=comparison_type, + ) + # Run the async function using asyncio.run() + # comparison = asyncio.run(process_documents(cdc_doc, doc, comparison_type)) + st.write_stream(comparison) + + +if __name__ == "__main__": + main() diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/main_uc3.py b/backend/worker/diff-assistant/quivr_diff_assistant/main_uc3.py new file mode 100644 index 000000000000..c406a6b8740c --- /dev/null +++ b/backend/worker/diff-assistant/quivr_diff_assistant/main_uc3.py @@ -0,0 +1,125 @@ +import asyncio +import os +import tempfile +from enum import Enum +from pathlib import Path + +import streamlit as st +from diff_match_patch import diff_match_patch + +# get environment variables +from dotenv import load_dotenv +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_openai import ChatOpenAI +from quivr_diff_assistant.use_case_3.diff_type import DiffResult, llm_comparator +from quivr_diff_assistant.use_case_3.llm_reporter import redact_report +from quivr_diff_assistant.use_case_3.parser import DeadlyParser + +load_dotenv() + + +class DocumentType(Enum): + ETIQUETTE = "etiquette" + CAHIER_DES_CHARGES = "cdc" + + +async def create_modification_report( + before_file: str | Path, + after_file: str | Path, + type: DocumentType, + llm: BaseChatModel, + partition: bool = False, + use_llm_comparator: bool = False, + parser=DeadlyParser(), # note: the OCR predictor is instantiated once, at import time +) -> str: + if type == DocumentType.ETIQUETTE: + print("parsing before file") + before_text = parser.deep_parse(before_file, partition=partition, llm=llm) + print("parsing after file") + after_text = parser.deep_parse(after_file, partition=partition, llm=llm) + elif type == DocumentType.CAHIER_DES_CHARGES: + before_text = await parser.aparse(before_file) + after_text = await parser.aparse(after_file) + + print(before_text.page_content) + print(after_text.page_content) + text_after_sections = after_text.page_content.split("\n# ") # sections are delimited by markdown H1 headings + text_before_sections = before_text.page_content.split("\n# ") + assert len(text_after_sections) == len(text_before_sections) + + if use_llm_comparator: + print("using 
llm comparator") + return llm_comparator( + before_text.page_content, after_text.page_content, llm=llm + ) + print("using diff match patch") + dmp = diff_match_patch() + section_diffs = [] + for after_section, before_section in zip( + text_after_sections, text_before_sections, strict=False + ): + main_diff: list[tuple[int, str]] = dmp.diff_main(before_section, after_section) # diff runs from "before" to "after" + section_diffs.append(DiffResult(main_diff)) + + return redact_report(section_diffs, llm=llm) + + +def save_uploaded_file(uploaded_file): + with tempfile.NamedTemporaryFile( + delete=False, suffix=os.path.splitext(uploaded_file.name)[1] + ) as tmp_file: + tmp_file.write(uploaded_file.getvalue()) + return tmp_file.name + + +st.title("Document Modification Report Generator : Use Case 3") + +# File uploaders +before_file = st.file_uploader("Upload 'Before' file", type=["pdf", "docx"]) +after_file = st.file_uploader("Upload 'After' file", type=["pdf", "docx"]) + +# Document type selector +doc_type = st.selectbox("Select document type", ["ETIQUETTE", "CAHIER_DES_CHARGES"]) + +# Complexity of document +complexity = st.checkbox("Complex document (lots of text to OCRise)") + +# Process button +if st.button("Process"): + if before_file and after_file: + with st.spinner("Processing files..."): + # Save uploaded files + before_path = save_uploaded_file(before_file) + after_path = save_uploaded_file(after_file) + + # Initialize LLM + openai_gpt4o = ChatOpenAI( + model="gpt-4o", + temperature=0, + max_tokens=None, + max_retries=2, + ) + use_llm_comparator = doc_type == "ETIQUETTE" + + # Generate report + print("generating report") + report = asyncio.run( + create_modification_report( + before_path, + after_path, + DocumentType[doc_type], + openai_gpt4o, + partition=complexity, + use_llm_comparator=use_llm_comparator, + ) + ) + print("report generated") + # Display results + st.subheader("Modification Report") + st.write(report) + + # Clean up temporary files + os.unlink(before_path) + os.unlink(after_path) + else: + st.error("Please upload both 'Before' and 'After' files.") diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/use_case_2/with_quivr_core.py b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_2/with_quivr_core.py new file mode 100644 index 000000000000..6f17414f8d7d --- /dev/null +++ b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_2/with_quivr_core.py @@ -0,0 +1,59 @@ +# from langchain_openai import OpenAIEmbeddings +# from rich.console import Console +# from rich.panel import Panel +# from rich.prompt import Prompt + +# from quivr_core import Brain +# from quivr_core.config import LLMEndpointConfig +# from quivr_core.llm.llm_endpoint import LLMEndpoint +# from quivr_core.quivr_rag import QuivrQARAG + + +# if __name__ == "__main__": +# brain_1 = Brain.from_files( +# name="cdc_brain", +# file_paths=["data/cdc/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.docx"], +# llm=LLMEndpoint.from_config( +# LLMEndpointConfig(model="gpt-4o-mini", temperature=0.0) +# ), +# embedder=OpenAIEmbeddings(), +# ) + +# brain_2 = Brain.from_files( +# name="etiquette_brain", +# file_paths=[ +# "data/fiche_dev_produit/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.xlsx" +# ], +# llm=LLMEndpoint.from_config( +# LLMEndpointConfig(model="gpt-4o-mini", temperature=0.0) +# ), +# embedder=OpenAIEmbeddings(), +# ) + +# # Check brain info +# brain_1.print_info() +# brain_2.print_info() + +# console = Console() +# console.print(Panel.fit("Ask what to compare : ", style="bold magenta")) + +# 
while True: +# # Get user input +# section = Prompt.ask("[bold cyan]Section[/bold cyan]") + +# # Check if user wants to exit +# if section.lower() == "exit": +# console.print(Panel("Goodbye!", style="bold yellow")) +# break + +# question = ( +# f"Quelle est/sont le(s) {section} ? Answer only with exact text citation." +# ) +# response_1 = brain_1.ask(question) +# response_2 = brain_2.ask(question, rag_pipeline=QuivrQARAG) +# # Print the answer with typing effect +# console.print(f"[bold green]Quivr CDC[/bold green]: {response_1.answer}") +# console.print() +# console.print(f"[bold blue]Quivr Fiche Dev[/bold blue]: {response_2.answer}") + +# console.print("-" * console.width) diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/__init__.py b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/diff_type.py b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/diff_type.py new file mode 100644 index 000000000000..56646a6797ca --- /dev/null +++ b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/diff_type.py @@ -0,0 +1,105 @@ +from typing import List, Tuple + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts.prompt import PromptTemplate + +DIFF_PROMPT = PromptTemplate.from_template( + template=""" + You need to compare two texts and report all the differences. Your job is to parse these differences and create a clear, concise report. \ + Organize the report by sections and provide a detailed explanation of each difference. \ + Be specific on differences; the report will be reviewed and verified by a Quality engineer. + Here are the different sections of the report: + * Dénominations, comprenant: + * dénomination légale: nom du produit tel qu’il est défini par la réglementation, \ + en général cela inclut aussi des informations sur son état (cuit, cru, gelé ...) + * dénomination commerciale: nom du produit tel qu’il est vendu au consommateur + * Ingrédients et allergènes, comprenant: + * liste d’ingrédients + * traces d’allergènes + * Une sous-section pour chaque sous-produit s'il y a lieu; + * Eléments de traçabilité, comprenant: + * le code-barre EAN + * le code article + * DDM - date de durabilité minimale + * numéro de lot + * date de fabrication + * adresse de l'entreprise + * Conseils d’utilisation / de manipulation produit, comprenant : + * Conditions de remise en oeuvre + * Durée de vie + * Conditions de transport + * Conditions de conservation : « A conserver à -18°C / Ne pas recongeler un produit décongelé » + * Temps de décongélation + * Température de préchauffage + * Poids du produit + * Valeurs / informations nutritionnelles + * Autres + + Notes: + -> Coup de Pates: Tradition & Innovation, est l'entreprise productrice / marque du produit. + + Chaque section doit être organisée comme suit et séparée par des lignes entre chaque avant et après : + + ## section_name + + **Avant** : ... + + **Après** : ... + + **Modifications**: + * ... + * ... + + + -----TEXT BEFORE MODIFICATION----- + {before_text} + -----TEXT AFTER MODIFICATION----- + {after_text} + + The report should be written in a professional and formal tone and in French. 
+ """ +) + + +class DiffResult: + def __init__(self, diffs: List[Tuple[int, str]]) -> None: + self.diffs = diffs + + def remove_dummy_diffs(self) -> None: + cleaned_diff = [] + for cat, content in self.diffs: + if content.strip() and content != "\n": + cleaned_diff.append((cat, content)) + + self.diffs = cleaned_diff + + def format_diffs(self) -> str: + text_modified = "" + + sub_stack = 0 + for op, data in self.diffs: + if op == 0: + text_modified += data if sub_stack == 0 else f"_]] {data}" + elif op == -1: + if sub_stack == 0: + text_modified += f"[[{data}->" + sub_stack += 1 + else: + text_modified += f"{data}->" + elif op == 1: + if sub_stack > 0: + text_modified += f"{data}]]" + sub_stack -= 1 + else: + text_modified += f"[[ _ ->{data}]]" + + return text_modified + + def __str__(self) -> str: + return self.format_diffs() + + +def llm_comparator(before_text: str, after_text: str, llm: BaseChatModel) -> str: + chain = DIFF_PROMPT | llm + result = chain.invoke({"before_text": before_text, "after_text": after_text}) + return str(result.content) diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/llm_reporter.py b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/llm_reporter.py new file mode 100644 index 000000000000..abae58afe09e --- /dev/null +++ b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/llm_reporter.py @@ -0,0 +1,74 @@ +from typing import List + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts.prompt import PromptTemplate + +from quivr_diff_assistant.use_case_3.diff_type import DiffResult + +REPORT_PROMPT = PromptTemplate.from_template( + template="""You are tasked with analyzing and reporting differences in text for a Quality engineer. The input text contains differences marked with special tokens. Your job is to parse these differences and create a clear, concise report. + + Here is the text containing the differences: + + + {text_modified} + + + RULE #1 : If there are no [[->]] tokens, it indicates no changes to report, inventing changes means death. + The differences are marked using the following format: + - [[before->after]] indicates a change from "before" to "after" + - If there is no "before" text, it indicates an addition + - If there is no "after" text, it indicates a deletion + - If there is no [[ ]] token, it indicates no changes to report + - Make sense of the difference and do not keep the '[' in the report. + - "_" alone means empty. + + Follow these steps to create your report: + + 1. Carefully read through the entire text. + 2. Identify each instance of [[ ]] tokens. + 3. For each instance, determine the modification that was made. + Present your report in the following markdown format: + + # Title (Difference Report) + ## Section Name + ### Subsection Name (if applicable) + * Original: Original text + * Modified: Modified text + * Changes: + * Change 1 + * Change 2 + * Change 3 + + Avoid repetitive info; only report the changes. + Keep the checkboxes when possible and compare the corresponding checkboxes. + + + Every modification should be clearly stated with the original text and the modified text. + Note that there might be no modifications in some sections. In that case, simply return nothing. + Try to make the report as clear and concise as possible, a point for each modification found with details, avoid big comparisons. + + + Remember, your goal is to create a clear and concise report that allows the Quality engineer to quickly verify the differences. 
Focus on accuracy and readability in your output, give every indication possible to make it easier to find the modification. + The report should be written in a professional and formal tone and in French.""", +) + + +def redact_report(difference_per_section: List[DiffResult], llm: BaseChatModel) -> str: + report_per_section = [] + combined_diffs = "" + for section in difference_per_section: + if len(section.diffs) == 1 and section.diffs[0][0] == 0: + print("No differences found in this section.") + continue + combined_diffs += str(section) + + chain = REPORT_PROMPT | llm + result = chain.invoke({"text_modified": str(combined_diffs)}) + report_per_section.append(result.content) + + report_text = "" + + for rep in report_per_section: + report_text += "\n".join(rep.split("\n")[1:-1]) + "\n\n" + return report_text diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/parser.py b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/parser.py new file mode 100644 index 000000000000..4f6690968b60 --- /dev/null +++ b/backend/worker/diff-assistant/quivr_diff_assistant/use_case_3/parser.py @@ -0,0 +1,287 @@ +""" +All of this needs to be in MegaParse, this is just a placeholder for now. +""" + +import base64 +from typing import List + +import cv2 +import numpy as np +from doctr.io import DocumentFile +from doctr.io.elements import Document as doctrDocument +from doctr.models import ocr_predictor +from doctr.models.predictor.pytorch import OCRPredictor +from doctr.utils.common_types import AbstractFile +from langchain_core.documents import Document +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import HumanMessage +from megaparse import MegaParse # FIXME: @chloedia Version problems +from quivr_api.logger import get_logger + +logger = get_logger(__name__) + + +""" +This needs to be in megaparse @chloedia +""" + + +class DeadlyParser: + def __init__(self): + self.predictor: OCRPredictor = ocr_predictor( + pretrained=True, det_arch="fast_base", reco_arch="crnn_vgg16_bn" + ) + + async def deep_aparse( + self, + file: AbstractFile, + partition: bool = False, + llm: BaseChatModel | None = None, + ) -> Document: + """ + Parse the OCR output from the input file and return the extracted text. 
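+ + Illustrative usage (a sketch: the file name and the ChatOpenAI instance are assumptions, not fixtures of this module): + + parser = DeadlyParser() + llm = ChatOpenAI(model="gpt-4o", temperature=0) + doc = await parser.deep_aparse("etiquette.pdf", partition=True, llm=llm) + print(doc.page_content)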
+ """ + try: + docs = DocumentFile.from_pdf(file, scale=int(500 / 72)) + if partition: + cropped_image = crop_to_content(docs[0]) + # cv2.imshow("cropped", cropped_image) + # cv2.waitKey(0) # Wait for a key press + + docs = split_image(cropped_image) + # for i, sub_image in enumerate(docs): + # cv2.imshow(f"sub_image_{i}", sub_image) + # cv2.waitKey(0) # Wait for a key press + # cv2.destroyAllWindows() + + print("ocr start") + raw_results: doctrDocument = self.predictor(docs) + print("ocr done") + if llm: + entire_content = "" + print("ocr llm start") + for raw_result, img in zip(raw_results.pages, docs, strict=False): + if raw_result.render() == "": + continue + _, buffer = cv2.imencode(".png", img) + img_str64 = base64.b64encode(buffer.tobytes()).decode("utf-8") + + processed_result = llm.invoke( + [ + HumanMessage( + content=[ + { + "type": "text", + "text": f"Can you correct this entire text retranscription, respond only with the corrected transcription: {raw_result.render()},\n\n do not transcribe logos or images.", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_str64}", + "detail": "auto", + }, + }, + ] + ) + ] + ) + assert isinstance( + processed_result.content, str + ), "The LLM did not return a string" + entire_content += processed_result.content + print("ocr llm done") + return Document(page_content=entire_content) + + return Document(page_content=raw_results.render()) + except Exception as e: + logger.error(f"Error in deep_aparse: {str(e)}", exc_info=True) + raise # raw_results may be unbound here, so rendering it would mask the real error + + def deep_parse( + self, + file: AbstractFile, + partition: bool = False, + llm: BaseChatModel | None = None, + ) -> Document: + """ + Parse the OCR output from the input file and return the extracted text. + """ + try: + logger.info("Starting document processing") + + # Reduce image scale to lower memory usage + docs = DocumentFile.from_pdf(file, scale=int(300 / 72)) + logger.info("Document loaded") + + if partition: + logger.info("Partitioning document") + cropped_image = crop_to_content(docs[0]) + docs = split_image(cropped_image) + + logger.info("Starting OCR") + raw_results: doctrDocument = self.predictor(docs) + logger.info("OCR completed") + + if llm: + entire_content = "" + logger.info("Starting LLM processing") + for i, (raw_result, img) in enumerate( + zip(raw_results.pages, docs, strict=False) + ): + if raw_result.render() == "": + continue + _, buffer = cv2.imencode(".png", img) + img_str64 = base64.b64encode(buffer.tobytes()).decode("utf-8") + + processed_result = llm.invoke( + [ + HumanMessage( + content=[ + { + "type": "text", + "text": f"Can you correct this entire text retranscription, respond only with the corrected transcription: {raw_result.render()},\n\n do not transcribe logos or images.", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_str64}", + "detail": "auto", + }, + }, + ] + ) + ] + ) + assert isinstance( + processed_result.content, str + ), "The LLM did not return a string" + entire_content += processed_result.content + logger.info("LLM processing completed") + return Document(page_content=entire_content) + + return Document(page_content=raw_results.render()) + except Exception as e: + logger.error(f"Error in deep_parse: {str(e)}", exc_info=True) + raise + + def parse(self, file_path) -> Document: + """ + Parse with megaparse + """ + mp = MegaParse(file_path) + return mp.load() + + async def aparse(self, file_path) -> Document: + """ + Parse with megaparse + """ + mp = MegaParse(file_path) + return await mp.aload() + 
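# Minimal usage sketch for the class (illustrative; "label.pdf" and llm are supplied by the caller): parse() is the fast MegaParse path, deep_parse() the OCR fallback for scanned documents. + # + # parser = DeadlyParser() + # doc = parser.parse("label.pdf") + # if not doc.page_content.strip(): # classical parsing found no text + # doc = parser.deep_parse("label.pdf", partition=True, llm=llm) + # + 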
# except: + # reader = SimpleDirectoryReader(input_files=[file_path]) + # docs = reader.load_data() + # for doc in docs: + # print(doc) + # pause + # return "".join([doc.text for doc in docs]) + + +# FIXME: When time @chloedia optimize this function and discount random points on the scan +def crop_to_content(image: np.ndarray) -> np.ndarray: + """Crop the image to the text area.""" + # Convert to grayscale + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image + + # Apply threshold to get image with only black and white + _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + + # Create rectangular kernel for dilation + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + + # Dilate to connect text into blocks + dilated = cv2.dilate(thresh, kernel, iterations=5) + + # Find contours + contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + if contours: + # Find the bounding rectangles of all contours + bounding_rects = [cv2.boundingRect(c) for c in contours] + + # Combine all bounding rectangles + x = min(rect[0] for rect in bounding_rects) + y = min(rect[1] for rect in bounding_rects) + max_x = max(rect[0] + rect[2] for rect in bounding_rects) + max_y = max(rect[1] + rect[3] for rect in bounding_rects) + w = max_x - x + h = max_y - y + + # Add padding + padding = 10 + x = max(0, x - padding) + y = max(0, y - padding) + w = min(image.shape[1] - x, w + 2 * padding) + h = min(image.shape[0] - y, h + 2 * padding) + + # Crop the image + return image[y : y + h, x : x + w] + else: + return image + + +# FIXME: When time @chloedia optimize this function +def split_image(image: np.ndarray) -> List[np.ndarray]: + """Split the image into 4 parts along the y-axis, avoiding splitting letters.""" + if len(image.shape) == 3: + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + gray = image + + # Apply threshold + _, thresh = cv2.threshold( + gray, 250, 255, cv2.THRESH_BINARY + ) # Adjust threshold for white pixels + + # Find horizontal projection + h_proj = np.sum(thresh, axis=1) + + # Calculate the ideal height for each part + total_height = image.shape[0] + ideal_height = total_height // 4 + + sub_images = [] + start = 0 + + for i in range(3): # We'll make 3 cuts to create 4 parts + target_end = (i + 1) * ideal_height + + # Look for the best cut point around the target end + best_cut = target_end + max_whitespace = 0 + + search_start = max(target_end - ideal_height // 2, 0) + search_end = min(target_end + ideal_height // 2, total_height) + + for j in range(search_start, search_end): + # Check for a continuous white line + if np.all(thresh[j, :] == 255): + whitespace = np.sum( + h_proj[max(0, j - 5) : min(total_height, j + 6)] + == 255 * image.shape[1] + ) + if whitespace > max_whitespace: + max_whitespace = whitespace + best_cut = j + + # If no suitable white line is found, use the target end + if max_whitespace == 0: + best_cut = target_end + + # Make the cut + sub_images.append(image[start:best_cut, :]) + start = best_cut + + # Add the last part + sub_images.append(image[start:, :]) + + return sub_images diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/utils/__init__.py b/backend/worker/diff-assistant/quivr_diff_assistant/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/utils/utils.py b/backend/worker/diff-assistant/quivr_diff_assistant/utils/utils.py new file mode 100644 index 
diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/utils/__init__.py b/backend/worker/diff-assistant/quivr_diff_assistant/utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/backend/worker/diff-assistant/quivr_diff_assistant/utils/utils.py b/backend/worker/diff-assistant/quivr_diff_assistant/utils/utils.py
new file mode 100644
index 000000000000..dd1cfe674fcd
--- /dev/null
+++ b/backend/worker/diff-assistant/quivr_diff_assistant/utils/utils.py
@@ -0,0 +1,91 @@
+from langchain_core.prompts.prompt import PromptTemplate
+
+COMPARISON_PROMPT = PromptTemplate.from_template(
+    template="""
+    You are provided with two texts <document> and <cdc>. You need to consider the information contained in <document> \
+    and compare it with the corresponding information contained in <cdc>. \
+    Keep in mind that <cdc> contains non-relevant information for this task, and that in <cdc> you \
+    should only focus on the information corresponding to the information contained in <document>. \
+    You need to report all the differences between the information contained in <document> and <cdc>. \
+    Your job is to parse these differences and create a clear, concise report. \
+    Organize the report by sections and provide a detailed explanation of each difference. \
+    Be specific about each difference: the report will be reviewed and verified by a highly trained quality engineer.
+    Here are the different sections of the report:
+    * Dénominations, comprenant :
+        * dénomination légale : nom du produit tel qu'il est défini par la réglementation, \
+          en général cela inclut aussi des informations sur son état (cuit, cru, gelé...)
+        * dénomination commerciale : nom du produit tel qu'il est vendu au consommateur
+    * Ingrédients et allergènes (si présents dans plusieurs langues, comparer langue par langue), comprenant :
+        * liste d'ingrédients
+        * traces d'allergènes
+        * une sous-section pour chaque sous-produit s'il y a lieu ;
+    * Eléments de traçabilité, comprenant :
+        * le code-barres EAN
+        * le code article
+        * numéro de lot
+        * date de fabrication
+        * adresse de l'entreprise
+    * Conseils d'utilisation / de manipulation produit, comprenant :
+        * Conditions / conseils de remise en oeuvre
+        * Durée de vie
+        * Durée de conservation (à compter de la date de production, à température ambiante / réfrigérée)
+        * DDM - date de durabilité minimale
+        * Conditions de transport
+        * Conditions de conservation : « A conserver à -18°C / Ne pas recongeler un produit décongelé »
+        * Temps de décongélation
+        * Température de préchauffage
+    * Caractéristiques / paramètres physiques produit (unité de négoce), comprenant :
+        * poids de la pièce
+        * dimensions de la pièce
+        * poids du produit / unité de négoce (typiquement, carton)
+        * dimensions du produit / unité de négoce (typiquement, carton)
+        * nombre de pièces par unité de négoce (typiquement, carton) / colis
+        * poids du colis / carton
+    * Données palettisation / données techniques sur palette (unité de transport)
+        * hauteur palette
+        * dimensions de l'unité de négoce (typiquement, carton) / colis
+        * nombre de colis par couche / palette
+    * Valeurs / informations nutritionnelles
+    * Autres
+
+    Notes:
+    -> Coup de Pates : Tradition & Innovation, est l'entreprise productrice / marque du produit.
+
+    Chaque section doit être organisée comme suit :
+    ## Section name
+    **<document>** :
+    * ...
+    * ...
+
+    **<cdc>** :
+    * ...
+    * ...
+
+    **Differences**:
+    * ...
+    * ...
+
+    Beginning of <document>
+    {document}
+    End of <document>
+
+    Beginning of <cdc>
+    {cdc}
+    End of <cdc>
+
+    You need to consider all the information contained in <document> and compare it \
+    with the corresponding information contained in <cdc>.
+    The report should be written in a professional and formal tone and in French, \
+    and it should follow the structure outlined above. If <cdc> doesn't contain a particular piece of information, \
+    then you should ignore that information for <document> as well and avoid reporting any differences.
+
+    In the report you should replace every occurrence of <document> with {text_1} and every occurrence of <cdc> with {text_2}.
+
+    ## Dénominations
+    **{text_1}** :
+    *
+    """
+)
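A quick render of the template is a useful sanity check that its four variables ({document}, {cdc}, {text_1}, {text_2}) line up with the callers in this PR; the values below are illustrative placeholders only, not data from the change:

```python
from quivr_diff_assistant.utils.utils import COMPARISON_PROMPT

# Illustrative values only; real inputs come from the parsed documents.
rendered = COMPARISON_PROMPT.format(
    document="Poids net : 500 g",   # parsed Etiquette / Fiche Dev text
    cdc="Poids net : 450 g",        # parsed Cahier des Charges text
    text_1="Etiquette",             # label substituted for <document> in the report
    text_2="Cahier des Charges",    # label substituted for <cdc> in the report
)
print(rendered[:400])
```

diff --git a/backend/worker/diff-assistant/requirements-dev.lock b/backend/worker/diff-assistant/requirements-dev.lock new file mode 100644 index 000000000000..e957ff01dae9 --- /dev/null +++ b/backend/worker/diff-assistant/requirements-dev.lock @@ -0,0 +1,760 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +aiosignal==1.3.1 + # via aiohttp +altair==5.4.1 + # via streamlit +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyascii==0.3.2 + # via python-doctr +anyio==4.4.0 + # via httpx + # via openai +appnope==0.1.4 + # via ipykernel +asttokens==2.4.1 + # via stack-data +attrs==24.2.0 + # via aiohttp + # via jsonschema + # via referencing +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via llama-index-readers-file + # via unstructured +blinker==1.8.2 + # via streamlit +cachetools==5.5.0 + # via google-auth + # via streamlit +certifi==2024.7.4 + # via httpcore + # via httpx + # via requests + # via unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via pdfminer-six + # via requests + # via unstructured-client +click==8.1.7 + # via nltk + # via python-oxmsg + # via streamlit +cobble==0.1.4 + # via mammoth +coloredlogs==15.0.1 + # via onnxruntime +comm==0.2.2 + # via ipykernel +contourpy==1.2.1 + # via matplotlib +cryptography==43.0.1 + # via pdfminer-six +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via langchain-community + # via llama-index-core + # via llama-index-legacy + # via unstructured + # via unstructured-client +debugpy==1.8.5 + # via ipykernel +decorator==5.1.1 + # via ipython +deepdiff==7.0.1 + # via unstructured-client +defusedxml==0.7.1 + # via python-doctr +deprecated==1.2.14 + # via llama-index-core + # via llama-index-legacy + # via pikepdf +diff-match-patch==20230430 + # via diff-assistant +dirtyjson==1.0.8 + # via llama-index-core + # via llama-index-legacy +distro==1.9.0 + # via openai +docx2txt==0.8 + # via diff-assistant +effdet==0.4.1 + # via unstructured +emoji==2.12.1 + # via unstructured +et-xmlfile==1.1.0 + # via openpyxl +executing==2.0.1 + # via stack-data +faiss-cpu==1.8.0.post1 + # via diff-assistant +filelock==3.15.4 + # via huggingface-hub + # via torch + # via transformers +filetype==1.2.0 + # via unstructured +fire==0.6.0 + # via pdf2docx +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.53.1 + # via matplotlib + # via pdf2docx +frozenlist==1.4.1 + # via aiohttp + # via aiosignal +fsspec==2024.6.1 + # via huggingface-hub + # via llama-index-core + # via llama-index-legacy + # via torch +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via streamlit +google-api-core==2.19.2 + # via google-cloud-vision +google-auth==2.34.0 + # via google-api-core + # via google-cloud-vision +google-cloud-vision==3.7.4 + # via unstructured +googleapis-common-protos==1.65.0 + # via google-api-core + # via grpcio-status +greenlet==3.0.3 + # via sqlalchemy +grpcio==1.66.1 + # via google-api-core + # via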
grpcio-status +grpcio-status==1.66.1 + # via google-api-core +h11==0.14.0 + # via httpcore +h5py==3.11.0 + # via python-doctr +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via langsmith + # via llama-cloud + # via llama-index-core + # via llama-index-legacy + # via openai + # via unstructured-client +huggingface-hub==0.24.6 + # via python-doctr + # via timm + # via tokenizers + # via transformers + # via unstructured-inference +humanfriendly==10.0 + # via coloredlogs +idna==3.7 + # via anyio + # via httpx + # via requests + # via unstructured-client + # via yarl +iniconfig==2.0.0 + # via pytest +iopath==0.1.10 + # via layoutparser +ipykernel==6.29.5 + # via diff-assistant +ipython==8.26.0 + # via ipykernel +jedi==0.19.1 + # via ipython +jinja2==3.1.4 + # via altair + # via pydeck + # via torch +jiter==0.5.0 + # via openai +joblib==1.4.2 + # via nltk + # via scikit-learn +jsonpatch==1.33 + # via langchain-core +jsonpath-python==1.0.6 + # via unstructured-client +jsonpointer==3.0.0 + # via jsonpatch +jsonschema==4.23.0 + # via altair +jsonschema-specifications==2023.12.1 + # via jsonschema +jupyter-client==8.6.2 + # via ipykernel +jupyter-core==5.7.2 + # via ipykernel + # via jupyter-client +kiwisolver==1.4.5 + # via matplotlib +langchain==0.2.16 + # via diff-assistant + # via langchain-community + # via megaparse +langchain-community==0.2.16 + # via megaparse +langchain-core==0.2.39 + # via langchain + # via langchain-community + # via langchain-openai + # via langchain-text-splitters + # via megaparse +langchain-openai==0.1.24 + # via diff-assistant + # via megaparse +langchain-text-splitters==0.2.4 + # via langchain +langdetect==1.0.9 + # via python-doctr + # via unstructured +langsmith==0.1.118 + # via langchain + # via langchain-community + # via langchain-core +layoutparser==0.3.4 + # via unstructured-inference +llama-cloud==0.0.17 + # via llama-index-indices-managed-llama-cloud +llama-index==0.11.8 + # via diff-assistant + # via megaparse +llama-index-agent-openai==0.3.1 + # via llama-index + # via llama-index-llms-openai + # via llama-index-program-openai +llama-index-cli==0.3.1 + # via llama-index +llama-index-core==0.11.8 + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-embeddings-openai + # via llama-index-indices-managed-llama-cloud + # via llama-index-llms-openai + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai + # via llama-index-readers-file + # via llama-index-readers-llama-parse + # via llama-parse +llama-index-embeddings-openai==0.2.4 + # via llama-index + # via llama-index-cli +llama-index-indices-managed-llama-cloud==0.3.0 + # via llama-index +llama-index-legacy==0.9.48.post3 + # via llama-index +llama-index-llms-openai==0.2.3 + # via diff-assistant + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai +llama-index-multi-modal-llms-openai==0.2.0 + # via llama-index +llama-index-program-openai==0.2.0 + # via llama-index + # via llama-index-question-gen-openai +llama-index-question-gen-openai==0.2.0 + # via llama-index +llama-index-readers-file==0.2.1 + # via diff-assistant + # via llama-index +llama-index-readers-llama-parse==0.3.0 + # via llama-index +llama-parse==0.5.3 + # via llama-index-readers-llama-parse + # via megaparse +llvmlite==0.43.0 + # via numba +lxml==5.3.0 + # via pikepdf + # via python-docx + # 
via python-pptx + # via unstructured +mammoth==1.8.0 + # via megaparse +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.22.0 + # via dataclasses-json + # via unstructured-client +matplotlib==3.9.2 + # via diff-assistant + # via mplcursors + # via pycocotools + # via unstructured-inference +matplotlib-inline==0.1.7 + # via ipykernel + # via ipython +mdurl==0.1.2 + # via markdown-it-py +megaparse==0.0.31 + # via diff-assistant +mplcursors==0.5.3 + # via diff-assistant +mpmath==1.3.0 + # via sympy +multidict==6.0.5 + # via aiohttp + # via yarl +mypy-extensions==1.0.0 + # via typing-inspect + # via unstructured-client +narwhals==1.6.2 + # via altair +nest-asyncio==1.6.0 + # via ipykernel + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +networkx==3.3 + # via llama-index-core + # via llama-index-legacy + # via torch +nltk==3.9.1 + # via llama-index + # via llama-index-core + # via llama-index-legacy + # via unstructured +numba==0.60.0 + # via diff-assistant +numpy==1.26.4 + # via contourpy + # via diff-assistant + # via faiss-cpu + # via h5py + # via langchain + # via langchain-community + # via layoutparser + # via llama-index-core + # via llama-index-legacy + # via matplotlib + # via numba + # via onnx + # via onnxruntime + # via opencv-python + # via opencv-python-headless + # via pandas + # via pdf2docx + # via pyarrow + # via pycocotools + # via pydeck + # via python-doctr + # via scikit-learn + # via scipy + # via shapely + # via streamlit + # via torchvision + # via transformers + # via unstructured +olefile==0.47 + # via python-oxmsg +omegaconf==2.3.0 + # via effdet +onnx==1.16.2 + # via python-doctr + # via unstructured + # via unstructured-inference +onnxruntime==1.19.2 + # via unstructured-inference +openai==1.44.1 + # via diff-assistant + # via langchain-openai + # via llama-index-agent-openai + # via llama-index-embeddings-openai + # via llama-index-legacy + # via llama-index-llms-openai +opencv-python==4.10.0.84 + # via diff-assistant + # via layoutparser + # via python-doctr + # via unstructured-inference +opencv-python-headless==4.10.0.84 + # via pdf2docx +openpyxl==3.1.5 + # via diff-assistant +ordered-set==4.1.0 + # via deepdiff +orjson==3.10.7 + # via langsmith +packaging==24.1 + # via altair + # via faiss-cpu + # via huggingface-hub + # via ipykernel + # via langchain-core + # via marshmallow + # via matplotlib + # via onnxruntime + # via pikepdf + # via pytest + # via streamlit + # via transformers + # via unstructured-client + # via unstructured-pytesseract +pandas==2.2.2 + # via diff-assistant + # via layoutparser + # via llama-index-legacy + # via llama-index-readers-file + # via streamlit +parso==0.8.4 + # via jedi +pdf2docx==0.5.8 + # via megaparse +pdf2image==1.17.0 + # via layoutparser + # via unstructured +pdfminer-six==20231228 + # via pdfplumber + # via unstructured +pdfplumber==0.11.4 + # via layoutparser + # via megaparse +pexpect==4.9.0 + # via ipython +pi-heif==0.18.0 + # via unstructured +pikepdf==9.2.1 + # via unstructured +pillow==10.4.0 + # via layoutparser + # via llama-index-core + # via matplotlib + # via pdf2image + # via pdfplumber + # via pi-heif + # via pikepdf + # via python-doctr + # via python-pptx + # via streamlit + # via torchvision + # via unstructured-pytesseract +platformdirs==4.2.2 + # via jupyter-core +pluggy==1.5.0 + # via pytest +portalocker==2.10.1 + # via iopath +prompt-toolkit==3.0.47 + # via ipython +proto-plus==1.24.0 + # via google-api-core + # via 
google-cloud-vision +protobuf==5.27.3 + # via google-api-core + # via google-cloud-vision + # via googleapis-common-protos + # via grpcio-status + # via onnx + # via onnxruntime + # via proto-plus + # via streamlit +psutil==6.0.0 + # via ipykernel + # via unstructured +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyarrow==17.0.0 + # via streamlit +pyasn1==0.6.1 + # via pyasn1-modules + # via rsa +pyasn1-modules==0.4.1 + # via google-auth +pyclipper==1.3.0.post5 + # via python-doctr +pycocotools==2.0.8 + # via effdet +pycparser==2.22 + # via cffi +pycryptodome==3.20.0 + # via megaparse +pydantic==2.8.2 + # via langchain + # via langchain-core + # via langsmith + # via llama-cloud + # via llama-index-core + # via openai +pydantic-core==2.20.1 + # via pydantic +pydeck==0.9.1 + # via streamlit +pygments==2.18.0 + # via ipython + # via rich +pymupdf==1.24.10 + # via pdf2docx +pymupdfb==1.24.10 + # via pymupdf +pyparsing==3.1.2 + # via matplotlib +pypdf==4.3.1 + # via diff-assistant + # via llama-index-readers-file + # via unstructured + # via unstructured-client +pypdfium2==4.30.0 + # via diff-assistant + # via pdfplumber + # via python-doctr +pytest==8.3.2 +python-dateutil==2.9.0.post0 + # via jupyter-client + # via matplotlib + # via pandas + # via unstructured-client +python-doctr==0.9.0 + # via diff-assistant +python-docx==1.1.2 + # via megaparse + # via pdf2docx +python-dotenv==1.0.1 + # via diff-assistant + # via megaparse +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via diff-assistant + # via unstructured +python-multipart==0.0.9 + # via unstructured-inference +python-oxmsg==0.0.1 + # via unstructured +python-pptx==1.0.2 + # via megaparse +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via huggingface-hub + # via langchain + # via langchain-community + # via langchain-core + # via layoutparser + # via llama-index-core + # via omegaconf + # via timm + # via transformers +pyzmq==26.1.1 + # via ipykernel + # via jupyter-client +rapidfuzz==3.9.6 + # via python-doctr + # via unstructured + # via unstructured-inference +referencing==0.35.1 + # via jsonschema + # via jsonschema-specifications +regex==2024.7.24 + # via nltk + # via tiktoken + # via transformers +requests==2.32.3 + # via google-api-core + # via huggingface-hub + # via langchain + # via langchain-community + # via langsmith + # via llama-index-core + # via llama-index-legacy + # via requests-toolbelt + # via streamlit + # via tiktoken + # via transformers + # via unstructured + # via unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rich==13.8.0 + # via streamlit +rpds-py==0.20.0 + # via jsonschema + # via referencing +rsa==4.9 + # via google-auth +safetensors==0.4.5 + # via timm + # via transformers +scikit-learn==1.5.1 + # via diff-assistant +scipy==1.14.1 + # via layoutparser + # via python-doctr + # via scikit-learn +shapely==2.0.6 + # via python-doctr +six==1.16.0 + # via asttokens + # via fire + # via langdetect + # via python-dateutil + # via unstructured-client +smmap==5.0.1 + # via gitdb +sniffio==1.3.1 + # via anyio + # via httpx + # via openai +soupsieve==2.6 + # via beautifulsoup4 +sqlalchemy==2.0.32 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +stack-data==0.6.3 + # via ipython +streamlit==1.38.0 + # via diff-assistant +striprtf==0.0.26 + # via llama-index-readers-file +sympy==1.13.2 + # via onnxruntime + # via torch +tabulate==0.9.0 + # via unstructured +tenacity==8.5.0 + # via 
langchain + # via langchain-community + # via langchain-core + # via llama-index-core + # via llama-index-legacy + # via streamlit +termcolor==2.4.0 + # via fire +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.7.0 + # via langchain-openai + # via llama-index-core + # via llama-index-legacy +timm==1.0.9 + # via effdet + # via unstructured-inference +tokenizers==0.19.1 + # via transformers +toml==0.10.2 + # via streamlit +torch==2.3.1 + # via diff-assistant + # via effdet + # via python-doctr + # via timm + # via torchvision + # via unstructured-inference +torchvision==0.18.1 + # via effdet + # via python-doctr + # via timm +tornado==6.4.1 + # via ipykernel + # via jupyter-client + # via streamlit +tqdm==4.66.5 + # via huggingface-hub + # via iopath + # via llama-index-core + # via nltk + # via openai + # via python-doctr + # via transformers + # via unstructured +traitlets==5.14.3 + # via comm + # via ipykernel + # via ipython + # via jupyter-client + # via jupyter-core + # via matplotlib-inline +transformers==4.44.2 + # via unstructured-inference +typing-extensions==4.12.2 + # via altair + # via emoji + # via huggingface-hub + # via iopath + # via ipython + # via langchain-core + # via llama-index-core + # via llama-index-legacy + # via openai + # via pydantic + # via pydantic-core + # via python-docx + # via python-oxmsg + # via python-pptx + # via sqlalchemy + # via streamlit + # via torch + # via typing-inspect + # via unstructured + # via unstructured-client +typing-inspect==0.9.0 + # via dataclasses-json + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +tzdata==2024.1 + # via pandas +unstructured==0.15.9 + # via diff-assistant + # via megaparse +unstructured-client==0.25.5 + # via unstructured +unstructured-inference==0.7.36 + # via unstructured +unstructured-pytesseract==0.3.13 + # via unstructured +urllib3==2.2.2 + # via requests + # via unstructured-client +wcwidth==0.2.13 + # via prompt-toolkit +wrapt==1.16.0 + # via deprecated + # via llama-index-core + # via unstructured +xlsxwriter==3.2.0 + # via python-pptx +yarl==1.9.7 + # via aiohttp diff --git a/backend/worker/diff-assistant/requirements.lock b/backend/worker/diff-assistant/requirements.lock new file mode 100644 index 000000000000..421906344b62 --- /dev/null +++ b/backend/worker/diff-assistant/requirements.lock @@ -0,0 +1,754 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. 
+aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +aiosignal==1.3.1 + # via aiohttp +altair==5.4.1 + # via streamlit +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyascii==0.3.2 + # via python-doctr +anyio==4.4.0 + # via httpx + # via openai +appnope==0.1.4 + # via ipykernel +asttokens==2.4.1 + # via stack-data +attrs==24.2.0 + # via aiohttp + # via jsonschema + # via referencing +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via llama-index-readers-file + # via unstructured +blinker==1.8.2 + # via streamlit +cachetools==5.5.0 + # via google-auth + # via streamlit +certifi==2024.7.4 + # via httpcore + # via httpx + # via requests + # via unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via pdfminer-six + # via requests + # via unstructured-client +click==8.1.7 + # via nltk + # via python-oxmsg + # via streamlit +cobble==0.1.4 + # via mammoth +coloredlogs==15.0.1 + # via onnxruntime +comm==0.2.2 + # via ipykernel +contourpy==1.2.1 + # via matplotlib +cryptography==43.0.1 + # via pdfminer-six +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via langchain-community + # via llama-index-core + # via llama-index-legacy + # via unstructured + # via unstructured-client +debugpy==1.8.5 + # via ipykernel +decorator==5.1.1 + # via ipython +deepdiff==7.0.1 + # via unstructured-client +defusedxml==0.7.1 + # via python-doctr +deprecated==1.2.14 + # via llama-index-core + # via llama-index-legacy + # via pikepdf +diff-match-patch==20230430 + # via diff-assistant +dirtyjson==1.0.8 + # via llama-index-core + # via llama-index-legacy +distro==1.9.0 + # via openai +docx2txt==0.8 + # via diff-assistant +effdet==0.4.1 + # via unstructured +emoji==2.12.1 + # via unstructured +et-xmlfile==1.1.0 + # via openpyxl +executing==2.0.1 + # via stack-data +faiss-cpu==1.8.0.post1 + # via diff-assistant +filelock==3.15.4 + # via huggingface-hub + # via torch + # via transformers +filetype==1.2.0 + # via unstructured +fire==0.6.0 + # via pdf2docx +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.53.1 + # via matplotlib + # via pdf2docx +frozenlist==1.4.1 + # via aiohttp + # via aiosignal +fsspec==2024.6.1 + # via huggingface-hub + # via llama-index-core + # via llama-index-legacy + # via torch +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via streamlit +google-api-core==2.19.2 + # via google-cloud-vision +google-auth==2.34.0 + # via google-api-core + # via google-cloud-vision +google-cloud-vision==3.7.4 + # via unstructured +googleapis-common-protos==1.65.0 + # via google-api-core + # via grpcio-status +greenlet==3.0.3 + # via sqlalchemy +grpcio==1.66.1 + # via google-api-core + # via grpcio-status +grpcio-status==1.66.1 + # via google-api-core +h11==0.14.0 + # via httpcore +h5py==3.11.0 + # via python-doctr +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via langsmith + # via llama-cloud + # via llama-index-core + # via llama-index-legacy + # via openai + # via unstructured-client +huggingface-hub==0.24.6 + # via python-doctr + # via timm + # via tokenizers + # via transformers + # via unstructured-inference +humanfriendly==10.0 + # via coloredlogs +idna==3.7 + # via anyio + # via httpx + # via requests + # via unstructured-client + # via yarl +iopath==0.1.10 + # via layoutparser +ipykernel==6.29.5 + # via diff-assistant +ipython==8.26.0 + # via 
ipykernel +jedi==0.19.1 + # via ipython +jinja2==3.1.4 + # via altair + # via pydeck + # via torch +jiter==0.5.0 + # via openai +joblib==1.4.2 + # via nltk + # via scikit-learn +jsonpatch==1.33 + # via langchain-core +jsonpath-python==1.0.6 + # via unstructured-client +jsonpointer==3.0.0 + # via jsonpatch +jsonschema==4.23.0 + # via altair +jsonschema-specifications==2023.12.1 + # via jsonschema +jupyter-client==8.6.2 + # via ipykernel +jupyter-core==5.7.2 + # via ipykernel + # via jupyter-client +kiwisolver==1.4.5 + # via matplotlib +langchain==0.2.16 + # via diff-assistant + # via langchain-community + # via megaparse +langchain-community==0.2.16 + # via megaparse +langchain-core==0.2.39 + # via langchain + # via langchain-community + # via langchain-openai + # via langchain-text-splitters + # via megaparse +langchain-openai==0.1.24 + # via diff-assistant + # via megaparse +langchain-text-splitters==0.2.4 + # via langchain +langdetect==1.0.9 + # via python-doctr + # via unstructured +langsmith==0.1.118 + # via langchain + # via langchain-community + # via langchain-core +layoutparser==0.3.4 + # via unstructured-inference +llama-cloud==0.0.17 + # via llama-index-indices-managed-llama-cloud +llama-index==0.11.8 + # via diff-assistant + # via megaparse +llama-index-agent-openai==0.3.1 + # via llama-index + # via llama-index-llms-openai + # via llama-index-program-openai +llama-index-cli==0.3.1 + # via llama-index +llama-index-core==0.11.8 + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-embeddings-openai + # via llama-index-indices-managed-llama-cloud + # via llama-index-llms-openai + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai + # via llama-index-readers-file + # via llama-index-readers-llama-parse + # via llama-parse +llama-index-embeddings-openai==0.2.4 + # via llama-index + # via llama-index-cli +llama-index-indices-managed-llama-cloud==0.3.0 + # via llama-index +llama-index-legacy==0.9.48.post3 + # via llama-index +llama-index-llms-openai==0.2.3 + # via diff-assistant + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai +llama-index-multi-modal-llms-openai==0.2.0 + # via llama-index +llama-index-program-openai==0.2.0 + # via llama-index + # via llama-index-question-gen-openai +llama-index-question-gen-openai==0.2.0 + # via llama-index +llama-index-readers-file==0.2.1 + # via diff-assistant + # via llama-index +llama-index-readers-llama-parse==0.3.0 + # via llama-index +llama-parse==0.5.3 + # via llama-index-readers-llama-parse + # via megaparse +llvmlite==0.43.0 + # via numba +lxml==5.3.0 + # via pikepdf + # via python-docx + # via python-pptx + # via unstructured +mammoth==1.8.0 + # via megaparse +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.22.0 + # via dataclasses-json + # via unstructured-client +matplotlib==3.9.2 + # via diff-assistant + # via mplcursors + # via pycocotools + # via unstructured-inference +matplotlib-inline==0.1.7 + # via ipykernel + # via ipython +mdurl==0.1.2 + # via markdown-it-py +megaparse==0.0.31 + # via diff-assistant +mplcursors==0.5.3 + # via diff-assistant +mpmath==1.3.0 + # via sympy +multidict==6.0.5 + # via aiohttp + # via yarl +mypy-extensions==1.0.0 + # via typing-inspect + # via unstructured-client +narwhals==1.6.2 + # via altair +nest-asyncio==1.6.0 
+ # via ipykernel + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +networkx==3.3 + # via llama-index-core + # via llama-index-legacy + # via torch +nltk==3.9.1 + # via llama-index + # via llama-index-core + # via llama-index-legacy + # via unstructured +numba==0.60.0 + # via diff-assistant +numpy==1.26.4 + # via contourpy + # via diff-assistant + # via faiss-cpu + # via h5py + # via langchain + # via langchain-community + # via layoutparser + # via llama-index-core + # via llama-index-legacy + # via matplotlib + # via numba + # via onnx + # via onnxruntime + # via opencv-python + # via opencv-python-headless + # via pandas + # via pdf2docx + # via pyarrow + # via pycocotools + # via pydeck + # via python-doctr + # via scikit-learn + # via scipy + # via shapely + # via streamlit + # via torchvision + # via transformers + # via unstructured +olefile==0.47 + # via python-oxmsg +omegaconf==2.3.0 + # via effdet +onnx==1.16.2 + # via python-doctr + # via unstructured + # via unstructured-inference +onnxruntime==1.19.2 + # via unstructured-inference +openai==1.44.1 + # via diff-assistant + # via langchain-openai + # via llama-index-agent-openai + # via llama-index-embeddings-openai + # via llama-index-legacy + # via llama-index-llms-openai +opencv-python==4.10.0.84 + # via diff-assistant + # via layoutparser + # via python-doctr + # via unstructured-inference +opencv-python-headless==4.10.0.84 + # via pdf2docx +openpyxl==3.1.5 + # via diff-assistant +ordered-set==4.1.0 + # via deepdiff +orjson==3.10.7 + # via langsmith +packaging==24.1 + # via altair + # via faiss-cpu + # via huggingface-hub + # via ipykernel + # via langchain-core + # via marshmallow + # via matplotlib + # via onnxruntime + # via pikepdf + # via streamlit + # via transformers + # via unstructured-client + # via unstructured-pytesseract +pandas==2.2.2 + # via diff-assistant + # via layoutparser + # via llama-index-legacy + # via llama-index-readers-file + # via streamlit +parso==0.8.4 + # via jedi +pdf2docx==0.5.8 + # via megaparse +pdf2image==1.17.0 + # via layoutparser + # via unstructured +pdfminer-six==20231228 + # via pdfplumber + # via unstructured +pdfplumber==0.11.4 + # via layoutparser + # via megaparse +pexpect==4.9.0 + # via ipython +pi-heif==0.18.0 + # via unstructured +pikepdf==9.2.1 + # via unstructured +pillow==10.4.0 + # via layoutparser + # via llama-index-core + # via matplotlib + # via pdf2image + # via pdfplumber + # via pi-heif + # via pikepdf + # via python-doctr + # via python-pptx + # via streamlit + # via torchvision + # via unstructured-pytesseract +platformdirs==4.2.2 + # via jupyter-core +portalocker==2.10.1 + # via iopath +prompt-toolkit==3.0.47 + # via ipython +proto-plus==1.24.0 + # via google-api-core + # via google-cloud-vision +protobuf==5.27.3 + # via google-api-core + # via google-cloud-vision + # via googleapis-common-protos + # via grpcio-status + # via onnx + # via onnxruntime + # via proto-plus + # via streamlit +psutil==6.0.0 + # via ipykernel + # via unstructured +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyarrow==17.0.0 + # via streamlit +pyasn1==0.6.1 + # via pyasn1-modules + # via rsa +pyasn1-modules==0.4.1 + # via google-auth +pyclipper==1.3.0.post5 + # via python-doctr +pycocotools==2.0.8 + # via effdet +pycparser==2.22 + # via cffi +pycryptodome==3.20.0 + # via megaparse +pydantic==2.8.2 + # via langchain + # via langchain-core + # via langsmith + # via llama-cloud + # via llama-index-core + # via openai 
+pydantic-core==2.20.1 + # via pydantic +pydeck==0.9.1 + # via streamlit +pygments==2.18.0 + # via ipython + # via rich +pymupdf==1.24.10 + # via pdf2docx +pymupdfb==1.24.10 + # via pymupdf +pyparsing==3.1.2 + # via matplotlib +pypdf==4.3.1 + # via diff-assistant + # via llama-index-readers-file + # via unstructured + # via unstructured-client +pypdfium2==4.30.0 + # via diff-assistant + # via pdfplumber + # via python-doctr +python-dateutil==2.9.0.post0 + # via jupyter-client + # via matplotlib + # via pandas + # via unstructured-client +python-doctr==0.9.0 + # via diff-assistant +python-docx==1.1.2 + # via megaparse + # via pdf2docx +python-dotenv==1.0.1 + # via diff-assistant + # via megaparse +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via diff-assistant + # via unstructured +python-multipart==0.0.9 + # via unstructured-inference +python-oxmsg==0.0.1 + # via unstructured +python-pptx==1.0.2 + # via megaparse +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via huggingface-hub + # via langchain + # via langchain-community + # via langchain-core + # via layoutparser + # via llama-index-core + # via omegaconf + # via timm + # via transformers +pyzmq==26.1.1 + # via ipykernel + # via jupyter-client +rapidfuzz==3.9.6 + # via python-doctr + # via unstructured + # via unstructured-inference +referencing==0.35.1 + # via jsonschema + # via jsonschema-specifications +regex==2024.7.24 + # via nltk + # via tiktoken + # via transformers +requests==2.32.3 + # via google-api-core + # via huggingface-hub + # via langchain + # via langchain-community + # via langsmith + # via llama-index-core + # via llama-index-legacy + # via requests-toolbelt + # via streamlit + # via tiktoken + # via transformers + # via unstructured + # via unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rich==13.8.0 + # via streamlit +rpds-py==0.20.0 + # via jsonschema + # via referencing +rsa==4.9 + # via google-auth +safetensors==0.4.5 + # via timm + # via transformers +scikit-learn==1.5.1 + # via diff-assistant +scipy==1.14.1 + # via layoutparser + # via python-doctr + # via scikit-learn +shapely==2.0.6 + # via python-doctr +six==1.16.0 + # via asttokens + # via fire + # via langdetect + # via python-dateutil + # via unstructured-client +smmap==5.0.1 + # via gitdb +sniffio==1.3.1 + # via anyio + # via httpx + # via openai +soupsieve==2.6 + # via beautifulsoup4 +sqlalchemy==2.0.32 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +stack-data==0.6.3 + # via ipython +streamlit==1.38.0 + # via diff-assistant +striprtf==0.0.26 + # via llama-index-readers-file +sympy==1.13.2 + # via onnxruntime + # via torch +tabulate==0.9.0 + # via unstructured +tenacity==8.5.0 + # via langchain + # via langchain-community + # via langchain-core + # via llama-index-core + # via llama-index-legacy + # via streamlit +termcolor==2.4.0 + # via fire +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.7.0 + # via langchain-openai + # via llama-index-core + # via llama-index-legacy +timm==1.0.9 + # via effdet + # via unstructured-inference +tokenizers==0.19.1 + # via transformers +toml==0.10.2 + # via streamlit +torch==2.3.1 + # via diff-assistant + # via effdet + # via python-doctr + # via timm + # via torchvision + # via unstructured-inference +torchvision==0.18.1 + # via effdet + # via python-doctr + # via timm +tornado==6.4.1 + # via ipykernel + # via jupyter-client + # via streamlit +tqdm==4.66.5 + # via huggingface-hub + # via iopath + # via 
llama-index-core + # via nltk + # via openai + # via python-doctr + # via transformers + # via unstructured +traitlets==5.14.3 + # via comm + # via ipykernel + # via ipython + # via jupyter-client + # via jupyter-core + # via matplotlib-inline +transformers==4.44.2 + # via unstructured-inference +typing-extensions==4.12.2 + # via altair + # via emoji + # via huggingface-hub + # via iopath + # via ipython + # via langchain-core + # via llama-index-core + # via llama-index-legacy + # via openai + # via pydantic + # via pydantic-core + # via python-docx + # via python-oxmsg + # via python-pptx + # via sqlalchemy + # via streamlit + # via torch + # via typing-inspect + # via unstructured + # via unstructured-client +typing-inspect==0.9.0 + # via dataclasses-json + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +tzdata==2024.1 + # via pandas +unstructured==0.15.9 + # via diff-assistant + # via megaparse +unstructured-client==0.25.5 + # via unstructured +unstructured-inference==0.7.36 + # via unstructured +unstructured-pytesseract==0.3.13 + # via unstructured +urllib3==2.2.2 + # via requests + # via unstructured-client +wcwidth==0.2.13 + # via prompt-toolkit +wrapt==1.16.0 + # via deprecated + # via llama-index-core + # via unstructured +xlsxwriter==3.2.0 + # via python-pptx +yarl==1.9.7 + # via aiohttp diff --git a/backend/worker/diff-assistant/tests/conftest.py b/backend/worker/diff-assistant/tests/conftest.py new file mode 100644 index 000000000000..fb82a30e724d --- /dev/null +++ b/backend/worker/diff-assistant/tests/conftest.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.fixture +def hello_message(): + return "Hello from diff-assistant!" diff --git a/backend/worker/diff-assistant/tests/test_hello.py b/backend/worker/diff-assistant/tests/test_hello.py new file mode 100644 index 000000000000..a8fb1175823f --- /dev/null +++ b/backend/worker/diff-assistant/tests/test_hello.py @@ -0,0 +1,5 @@ +from use_case_3 import hello + + +def test_hello(hello_message): + assert hello() == hello_message diff --git a/backend/worker/pyproject.toml b/backend/worker/pyproject.toml index eff842739bdc..f4893ed8bee9 100644 --- a/backend/worker/pyproject.toml +++ b/backend/worker/pyproject.toml @@ -8,6 +8,7 @@ authors = [ dependencies = [ "quivr-core[all]", "quivr-api", + "quivr-diff-assistant", "celery[redis]>=5.0.0", "python-dotenv>=1.0.0", "playwright>=1.0.0", @@ -48,3 +49,7 @@ path = "../quivr-core" [[tool.rye.sources]] name = "quivr-api" path = "../quivr-api" + +[[tool.rye.sources]] +name = "quivr-diff-assistant" +path = "./diff-assistant" diff --git a/backend/worker/quivr_worker/assistants/assistants.py b/backend/worker/quivr_worker/assistants/assistants.py index b44f7273ebbb..1571072bb0b7 100644 --- a/backend/worker/quivr_worker/assistants/assistants.py +++ b/backend/worker/quivr_worker/assistants/assistants.py @@ -5,6 +5,8 @@ upload_file_storage, ) +from quivr_worker.assistants.cdp_use_case_2 import process_cdp_use_case_2 +from quivr_worker.assistants.cdp_use_case_3 import process_cdp_use_case_3 from quivr_worker.utils.pdf_generator.pdf_generator import PDFGenerator, PDFModel @@ -15,19 +17,29 @@ async def process_assistant( tasks_service: TasksService, user_id: str, ): + print(task_id) task = await tasks_service.get_task_by_id(task_id, user_id) # type: ignore - - await tasks_service.update_task(task_id, {"status": "in_progress"}) - - print(task) - - task_result = {"status": "completed", "answer": "#### Assistant answer"} + assistant_name = task.assistant_name + output = "" + if 
assistant_id == 3:
+        output = await process_cdp_use_case_3(
+            assistant_id, notification_uuid, task_id, tasks_service, user_id
+        )
+    elif assistant_id == 2:
+        output = await process_cdp_use_case_2(
+            assistant_id, notification_uuid, task_id, tasks_service, user_id
+        )
+    else:
+        await tasks_service.update_task(task_id, {"status": "processing"})
+
+    task_result = {"status": "completed", "answer": output}
     output_dir = f"{assistant_id}/{notification_uuid}"
     os.makedirs(output_dir, exist_ok=True)
     output_path = f"{output_dir}/output.pdf"
-    generated_pdf = PDFGenerator(PDFModel(title="Test", content="Test"))
+    generated_pdf = PDFGenerator(PDFModel(title=assistant_name, content=output))
     generated_pdf.print_pdf()
     generated_pdf.output(output_path)
@@ -36,5 +48,4 @@ async def process_assistant(
     # Now delete the file
     os.remove(output_path)
-
     await tasks_service.update_task(task_id, task_result)
diff --git a/backend/worker/quivr_worker/assistants/cdp_use_case_2.py b/backend/worker/quivr_worker/assistants/cdp_use_case_2.py
new file mode 100644
index 000000000000..632e4cad067c
--- /dev/null
+++ b/backend/worker/quivr_worker/assistants/cdp_use_case_2.py
@@ -0,0 +1,312 @@
+import asyncio  # used by asyncio.create_task in parse_documents below
+import random
+import string
+from enum import Enum
+
+import pandas as pd
+
+# get environment variables
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.output_parsers import StrOutputParser
+from langchain_openai import ChatOpenAI
+from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
+from llama_index.core.node_parser import UnstructuredElementNodeParser
+from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core.retrievers import RecursiveRetriever
+from llama_index.core.schema import Document
+from llama_index.llms.openai import OpenAI
+from quivr_api.logger import get_logger
+from quivr_api.modules.assistant.dto.inputs import InputAssistant
+from quivr_api.modules.assistant.services.tasks_service import TasksService
+from quivr_api.modules.dependencies import get_supabase_client
+from quivr_diff_assistant.use_case_3.parser import DeadlyParser
+from quivr_diff_assistant.utils.utils import COMPARISON_PROMPT
+
+logger = get_logger(__name__)
+
+# Set pandas display options
+pd.set_option("display.max_rows", None)
+pd.set_option("display.max_columns", None)
+pd.set_option("display.width", None)
+pd.set_option("display.max_colwidth", None)
+
+
+def load_and_process_document(file_path, pickle_file):
+    print(file_path)
+    reader = SimpleDirectoryReader(input_files=[file_path])
+    docs = reader.load_data()
+    print(len(docs), " and", len(docs[0].text))
+    if len(docs) == 1 and len(docs[0].text) < 9:
+        print("No text found with classical parse, switching to OCR ...")
+        parser = DeadlyParser()
+        doc = parser.deep_parse(file_path)
+        docs = [Document.from_langchain_format(doc)]
+
+    node_parser = UnstructuredElementNodeParser()
+
+    raw_nodes = node_parser.get_nodes_from_documents(docs)
+
+    base_nodes, node_mappings = node_parser.get_base_nodes_and_mappings(raw_nodes)
+    return base_nodes, node_mappings
+
+
+def create_query_engine(base_nodes, node_mappings):
+    vector_index = VectorStoreIndex(base_nodes)
+    vector_retriever = vector_index.as_retriever(similarity_top_k=5)
+    recursive_retriever = RecursiveRetriever(
+        "vector",
+        retriever_dict={"vector": vector_retriever},
+        node_dict=node_mappings,
+        verbose=True,
+    )
+    return RetrieverQueryEngine.from_args(
+        recursive_retriever, llm=OpenAI(temperature=0,
model="gpt-4") + ) + + +def compare_responses(response1, response2): + llm = OpenAI(temperature=0, model="gpt-4") + prompt = f""" + Compare the following two responses and determine if they convey the same information: + Response for document 1: {response1} + Response for document 2: {response2} + Are these responses essentially the same? Provide a brief explanation for your conclusion. The difference in format are not important, focus on the content and the numbers. + If there are any specific differences, please highlight them with bullet points. Respond in french and in a markdown format. + """ + return llm.complete(prompt) + + +class ComparisonTypes(str, Enum): + CDC_ETIQUETTE = "Cahier des Charges - Etiquette" + CDC_FICHE_DEV = "Cahier des Charges - Fiche Dev" + + +def llm_comparator( + document: str, cdc: str, llm: BaseChatModel, comparison_type: ComparisonTypes +): + chain = COMPARISON_PROMPT | llm | StrOutputParser() + + if comparison_type == ComparisonTypes.CDC_ETIQUETTE: + text_1 = "Etiquette" + elif comparison_type == ComparisonTypes.CDC_FICHE_DEV: + text_1 = "Fiche Dev" + + return chain.stream( + { + "document": document, + "text_1": text_1, + "cdc": cdc, + "text_2": "Cahier des Charges", + } + ) + + +async def process_cdp_use_case_2( + assistant_id: str, + notification_uuid: str, + task_id: int, + tasks_service: TasksService, + user_id: str, +) -> str: + task = await tasks_service.get_task_by_id(task_id, user_id) # type: ignore + logger.info(f"Task: {task} 📝") + # Parse settings into InputAssistant + input_assistant = InputAssistant.model_validate(task.settings) + assert input_assistant.inputs.files is not None + assert len(input_assistant.inputs.files) == 2 + + # Get the value of the "Document 1" key and "Document 2" key. The input files might not be in the order of "Document 1" and "Document 2" + # So we need to find the correct order + logger.info(f"Input assistant: {input_assistant} 📂") + before_file_key = input_assistant.inputs.files[0].key + after_file_key = input_assistant.inputs.files[1].key + + before_file_value = input_assistant.inputs.files[0].value + after_file_value = input_assistant.inputs.files[1].value + + if before_file_key == "Document 2": + before_file_value = input_assistant.inputs.files[1].value + after_file_value = input_assistant.inputs.files[0].value + + # Get the files from supabase + supabase_client = get_supabase_client() + path = f"{task.assistant_id}/{task.pretty_id}/" + logger.info(f"Path: {path} 📁") + await tasks_service.update_task(task_id, {"status": "processing"}) + + before_file_data = supabase_client.storage.from_("quivr").download( + f"{path}{before_file_value}" + ) + after_file_data = supabase_client.storage.from_("quivr").download( + f"{path}{after_file_value}" + ) + + # Generate a random string of 8 characters + random_string = "".join(random.choices(string.ascii_letters + string.digits, k=8)) + + # Write temp files with the original name without using save_uploaded_file + # because the file is already in the quivr bucket + before_file_path = f"/tmp/{random_string}_{before_file_value}" + after_file_path = f"/tmp/{random_string}_{after_file_value}" + with open(before_file_path, "wb") as f: + f.write(before_file_data) + with open(after_file_path, "wb") as f: + f.write(after_file_data) + assert input_assistant.inputs.select_texts is not None + value_use_case = input_assistant.inputs.select_texts[0].value + + ## Get the document type + document_type = None + if value_use_case == "Etiquettes VS Cahier des charges": + document_type = 
ComparisonTypes.CDC_ETIQUETTE + elif value_use_case == "Fiche Dev VS Cahier des charges": + document_type = ComparisonTypes.CDC_FICHE_DEV + else: + logger.error(f"❌ Document type not supported: {value_use_case}") + raise ValueError(f"Document type not supported: {value_use_case}") + parser = DeadlyParser() + logger.info(f"Document type: {document_type} 📄") + llm = ChatOpenAI( + model="gpt-4o", + temperature=0.1, + max_tokens=None, + max_retries=2, + ) + + before_file_parsed = await parser.aparse(before_file_path) + logger.info("Before file parsed 📜") + after_file_parsed = None + if document_type == ComparisonTypes.CDC_ETIQUETTE: + logger.info("Parsing after file with deep parse 🔍") + after_file_parsed = await parser.deep_aparse(after_file_path, llm=llm) + else: + logger.info("Parsing after file with classical parse 🔍") + after_file_parsed = await parser.aparse(after_file_path) + + logger.info("Comparing documents ⚖️") + comparison = llm_comparator( + document=after_file_parsed.page_content, + cdc=before_file_parsed.page_content, + llm=llm, + comparison_type=document_type, + ) + + logger.info(f"Comparison: {comparison} ✅") + return "".join(comparison) + + +async def test_main(): + cdc_doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Mendiant Lait_QD PC F03 - FR Cahier des charges produit -rev 2021-v2.pdf" + doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Proposition étiquette Mendiant Lait croustillant.pdf" + + comparison_type = ComparisonTypes.CDC_FICHE_DEV + + llm = ChatOpenAI( + model="gpt-4o", + temperature=0.1, + max_tokens=None, + max_retries=2, + ) + + parser = DeadlyParser() + parsed_cdc_doc = await parser.aparse(cdc_doc) + + if comparison_type == ComparisonTypes.CDC_ETIQUETTE: + parsed_doc = await parser.deep_aparse(doc, llm=llm) + else: + parsed_doc = await parser.aparse(doc) + + print("\n\n Cahier des Charges") + print(parsed_cdc_doc.page_content) + + print("\n\n Other document") + print(parsed_doc.page_content) + + comparison = llm_comparator( + document=parsed_doc.page_content, + cdc=parsed_cdc_doc.page_content, + llm=llm, + comparison_type=comparison_type, + ) + + print("\n\n Comparison") + print(comparison) + + +def get_document_path(doc): + try: + with open(doc.name, "wb") as temp_file: + temp_file.write(doc.getbuffer()) + path = temp_file.name + except: + path = doc + + return path + + +async def parse_documents(cdc_doc, doc, comparison_type: ComparisonTypes, llm): + parser = DeadlyParser() + + # Schedule the coroutines as tasks + cdc_task = asyncio.create_task(parser.aparse(get_document_path(cdc_doc))) + + if comparison_type == ComparisonTypes.CDC_ETIQUETTE: + doc_task = asyncio.create_task( + parser.deep_aparse(get_document_path(doc), llm=llm) + ) + else: + doc_task = asyncio.create_task(parser.aparse(get_document_path(doc))) + + # Optionally, do other work here while tasks are running + + # Await the tasks to get the results + parsed_cdc_doc = await cdc_task + print("\n\n Cahier de Charges: \n", parsed_cdc_doc.page_content) + + parsed_doc = await doc_task + print("\n\n Other doc: \n", parsed_doc.page_content) + + return parsed_cdc_doc, parsed_doc + + +# def main(): +# st.title("Document Comparison Tool : Use Case 2") + +# # File uploaders for two documents +# cdc_doc = st.file_uploader( +# "Upload Cahier des Charges", type=["docx", "xlsx", "pdf", "txt"] +# ) +# doc = st.file_uploader( +# "Upload Etiquette / Fiche Dev", type=["docx", "xlsx", "pdf", "txt"] +# ) + +# comparison_type = st.selectbox( +# "Select document types", +# 
[ComparisonTypes.CDC_ETIQUETTE.value, ComparisonTypes.CDC_FICHE_DEV.value], +# ) + +# if st.button("Process Documents and Questions"): +# if not cdc_doc or not doc: +# st.error("Please upload both documents before launching the processing.") +# return + +# with st.spinner("Processing files..."): +# llm = ChatOpenAI( +# model="gpt-4o", +# temperature=0.1, +# max_tokens=None, +# max_retries=2, +# ) + +# parsed_cdc_doc, parsed_doc = asyncio.run( +# parse_documents(cdc_doc, doc, comparison_type=comparison_type, llm=llm) +# ) + +# comparison = llm_comparator( +# document=parsed_doc.page_content, +# cdc=parsed_cdc_doc.page_content, +# llm=llm, +# comparison_type=comparison_type, +# ) +# # Run the async function using asyncio.run() +# # comparison = asyncio.run(process_documents(cdc_doc, doc, comparison_type)) +# st.write_stream(comparison) diff --git a/backend/worker/quivr_worker/assistants/cdp_use_case_3.py b/backend/worker/quivr_worker/assistants/cdp_use_case_3.py new file mode 100644 index 000000000000..a657589e7107 --- /dev/null +++ b/backend/worker/quivr_worker/assistants/cdp_use_case_3.py @@ -0,0 +1,224 @@ +import os +import random +import string +import tempfile +from enum import Enum +from pathlib import Path + +from diff_match_patch import diff_match_patch + +# get environment variables +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_openai import ChatOpenAI +from quivr_api.logger import get_logger +from quivr_api.modules.assistant.dto.inputs import InputAssistant +from quivr_api.modules.assistant.services.tasks_service import TasksService +from quivr_api.modules.dependencies import get_supabase_client +from quivr_diff_assistant.use_case_3.diff_type import DiffResult, llm_comparator +from quivr_diff_assistant.use_case_3.llm_reporter import redact_report +from quivr_diff_assistant.use_case_3.parser import DeadlyParser + +logger = get_logger(__name__) + + +class DocumentType(Enum): + ETIQUETTE = "etiquette" + CAHIER_DES_CHARGES = "cdc" + + +async def process_cdp_use_case_3( + assistant_id: str, + notification_uuid: str, + task_id: int, + tasks_service: TasksService, + user_id: str, +) -> str: + task = await tasks_service.get_task_by_id(task_id, user_id) # type: ignore + + # Parse settings into InputAssistant + input_assistant = InputAssistant.model_validate(task.settings) + assert input_assistant.inputs.files is not None + assert len(input_assistant.inputs.files) == 2 + + # Get the value of the "Document 1" key and "Document 2" key. 
The input files might not be in the order of "Document 1" and "Document 2" + # So we need to find the correct order + before_file_key = input_assistant.inputs.files[0].key + after_file_key = input_assistant.inputs.files[1].key + + before_file_value = input_assistant.inputs.files[0].value + after_file_value = input_assistant.inputs.files[1].value + + if before_file_key == "Document 2": + before_file_value = input_assistant.inputs.files[1].value + after_file_value = input_assistant.inputs.files[0].value + + # Get the files from supabase + supabase_client = get_supabase_client() + path = f"{task.assistant_id}/{task.pretty_id}/" + + await tasks_service.update_task(task_id, {"status": "processing"}) + + # Before file key - parsed from the + before_file_data = supabase_client.storage.from_("quivr").download( + f"{path}{before_file_value}" + ) + after_file_data = supabase_client.storage.from_("quivr").download( + f"{path}{after_file_value}" + ) + + # Generate a random string of 8 characters + random_string = "".join(random.choices(string.ascii_letters + string.digits, k=8)) + + # Write temp files with the original name without using save_uploaded_file + # because the file is already in the quivr bucket + before_file_path = f"/tmp/{random_string}_{before_file_value}" + after_file_path = f"/tmp/{random_string}_{after_file_value}" + with open(before_file_path, "wb") as f: + f.write(before_file_data) + with open(after_file_path, "wb") as f: + f.write(after_file_data) + + assert input_assistant.inputs.select_texts is not None + value_use_case = input_assistant.inputs.select_texts[0].value + + ## Get the document type + document_type = None + if value_use_case == "Etiquettes": + document_type = DocumentType.ETIQUETTE + elif value_use_case == "Cahier des charges": + document_type = DocumentType.CAHIER_DES_CHARGES + else: + raise ValueError(f"Invalid value for use case: {value_use_case}") + + ## Get the hard to read document boolean value + assert input_assistant.inputs.booleans is not None + hard_to_read_document = input_assistant.inputs.booleans[0].value + + assert before_file_data is not None + assert after_file_data is not None + + openai_gpt4o = ChatOpenAI( + model="gpt-4o", + temperature=0, + max_tokens=None, + max_retries=2, + ) + + llm_comparator = True if document_type == DocumentType.ETIQUETTE else False + report = await create_modification_report( + before_file=before_file_path, + after_file=after_file_path, + type=document_type, + llm=openai_gpt4o, + partition=hard_to_read_document, + use_llm_comparator=llm_comparator, + ) + + os.unlink(before_file_path) + os.unlink(after_file_path) + return report + + +async def create_modification_report( + before_file: str | Path | bytes, + after_file: str | Path | bytes, + type: DocumentType, + llm: BaseChatModel, + partition: bool = False, + use_llm_comparator: bool = False, + parser=DeadlyParser(), +) -> str: + if type == DocumentType.ETIQUETTE: + logger.debug("parsing before file") + before_text = parser.deep_parse(before_file, partition=partition, llm=llm) + logger.debug("parsing after file") + after_text = parser.deep_parse(after_file, partition=partition, llm=llm) + elif type == DocumentType.CAHIER_DES_CHARGES: + before_text = await parser.aparse(before_file) + after_text = await parser.aparse(after_file) + + logger.debug(before_text.page_content) + logger.debug(after_text.page_content) + text_after_sections = before_text.page_content.split("\n# ") + text_before_sections = after_text.page_content.split("\n# ") + assert len(text_after_sections) == 
len(text_before_sections) + + if use_llm_comparator: + logger.debug("using llm comparator") + llm_comparator_result = llm_comparator( + before_text.page_content, after_text.page_content, llm=llm + ) + return llm_comparator_result + logger.debug("using diff match patch") + dmp = diff_match_patch() + section_diffs = [] + for after_section, before_section in zip( + text_after_sections, text_before_sections, strict=False + ): + main_diff: list[tuple[int, str]] = dmp.diff_main(after_section, before_section) + section_diffs.append(DiffResult(main_diff)) + + logger.debug(section_diffs) + report = redact_report(section_diffs, llm=llm) + return report + + +def save_uploaded_file(uploaded_file): + with tempfile.NamedTemporaryFile( + delete=False, suffix=os.path.splitext(uploaded_file.name)[1] + ) as tmp_file: + tmp_file.write(uploaded_file.getvalue()) + return tmp_file.name + + +# st.title("Document Modification Report Generator : Use Case 3") + +# # File uploaders +# before_file = st.file_uploader("Upload 'Before' file", type=["pdf", "docx"]) +# after_file = st.file_uploader("Upload 'After' file", type=["pdf", "docx"]) + +# # Document type selector +# doc_type = st.selectbox("Select document type", ["ETIQUETTE", "CAHIER_DES_CHARGES"]) + +# # Complexity of document +# complexity = st.checkbox("Complex document (lot of text of OCRise)") + +# # Process button +# if st.button("Process"): +# if before_file and after_file: +# with st.spinner("Processing files..."): +# # Save uploaded files +# before_path = save_uploaded_file(before_file) +# after_path = save_uploaded_file(after_file) + +# # Initialize LLM +# openai_gpt4o = ChatOpenAI( +# model="gpt-4o", +# temperature=0, +# max_tokens=None, +# max_retries=2, +# ) +# use_llm_comparator = True if doc_type == "ETIQUETTE" else False + +# # Generate report +# logger.debug("generating report") +# report = asyncio.run( +# create_modification_report( +# before_path, +# after_path, +# DocumentType[doc_type], +# openai_gpt4o, +# partition=complexity, +# use_llm_comparator=use_llm_comparator, +# ) +# ) +# logger.debug("report generated") +# # Display results +# st.subheader("Modification Report") +# st.write(report) + +# # Clean up temporary files +# os.unlink(before_path) +# os.unlink(after_path) +# else: +# st.error("Please upload both 'Before' and 'After' files.") diff --git a/backend/worker/quivr_worker/celery_worker.py b/backend/worker/quivr_worker/celery_worker.py index bc6588d65f25..b1feb1510f57 100644 --- a/backend/worker/quivr_worker/celery_worker.py +++ b/backend/worker/quivr_worker/celery_worker.py @@ -2,6 +2,7 @@ import os from uuid import UUID +import torch from celery.schedules import crontab from celery.signals import worker_process_init from dotenv import load_dotenv @@ -32,8 +33,8 @@ from sqlmodel import Session, text from sqlmodel.ext.asyncio.session import AsyncSession -from quivr_worker.celery_monitor import is_being_executed from quivr_worker.assistants.assistants import process_assistant +from quivr_worker.celery_monitor import is_being_executed from quivr_worker.check_premium import check_is_premium from quivr_worker.process.process_s3_file import process_uploaded_file from quivr_worker.process.process_url import process_url_func @@ -46,6 +47,9 @@ from quivr_worker.syncs.store_notion import fetch_and_store_notion_files_async from quivr_worker.utils.utils import _patch_json +torch.set_num_threads(1) + + load_dotenv() get_logger("quivr_core") @@ -130,6 +134,8 @@ async def aprocess_assistant_task( task_id: int, user_id: str, ): + global 
async_engine + assert async_engine async with AsyncSession(async_engine) as async_session: try: await async_session.execute( diff --git a/backend/worker/quivr_worker/utils/pdf_generator/pdf_generator.py b/backend/worker/quivr_worker/utils/pdf_generator/pdf_generator.py index 13bdfcc8375b..4b1e3be51352 100644 --- a/backend/worker/quivr_worker/utils/pdf_generator/pdf_generator.py +++ b/backend/worker/quivr_worker/utils/pdf_generator/pdf_generator.py @@ -34,9 +34,9 @@ def __init__(self, pdf_model: PDFModel, *args, **kwargs): ) def header(self): - # Logo - logo_path = os.path.join(os.path.dirname(__file__), "logo.png") - self.image(logo_path, 10, 10, 20) # Adjust size as needed + # # Logo + # logo_path = os.path.join(os.path.dirname(__file__), "logo.png") + # self.image(logo_path, 10, 10, 20) # Adjust size as needed # Move cursor to right of image self.set_xy(20, 15) @@ -59,15 +59,31 @@ def footer(self): def chapter_body(self): self.set_font("DejaVu", "", 12) - self.multi_cell( - 0, - 10, - self.pdf_model.content, - markdown=True, - new_x=XPos.RIGHT, - new_y=YPos.TOP, - ) - self.ln() + content_lines = self.pdf_model.content.split("\n") + for line in content_lines: + if line.startswith("# "): + self.ln() # Add line break before header + self.set_font("DejaVu", "B", 16) + self.multi_cell(0, 10, line[2:], markdown=False) + elif line.startswith("## "): + self.ln() # Add line break before header + self.set_font("DejaVu", "B", 14) + self.multi_cell(0, 10, line[3:], markdown=False) + elif line.startswith("### "): + self.ln() # Add line break before header + self.set_font("DejaVu", "B", 12) + self.multi_cell(0, 10, line[4:], markdown=False) + else: + self.set_font("DejaVu", "", 12) + self.multi_cell( + 0, + 10, + line, + markdown=True, + new_x=XPos.RIGHT, + new_y=YPos.TOP, + ) + self.ln() def print_pdf(self): self.add_page() @@ -78,7 +94,11 @@ def print_pdf(self): pdf_model = PDFModel( title="Summary of Legal Services Rendered by Orrick", content=""" +# Main Header +## Sub Header +### Sub Sub Header **Summary:** +This is a summary of the legal services rendered. 
""", ) pdf = PDFGenerator(pdf_model) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 89a6dad6bab6..2f137a2d6580 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -53,7 +53,7 @@ services: volumes: - ./backend/:/app/ command: > - /bin/bash -c "python -m celery -A quivr_worker.celery_worker worker -l info -E" + /bin/bash -c "python -m celery -A quivr_worker.celery_worker worker -l info -E -P solo" restart: always depends_on: - redis diff --git a/frontend/app/assistants/AssistantModal/AssistantModal.module.scss b/frontend/app/assistants/AssistantModal/AssistantModal.module.scss deleted file mode 100644 index 5a06ee35fedd..000000000000 --- a/frontend/app/assistants/AssistantModal/AssistantModal.module.scss +++ /dev/null @@ -1,29 +0,0 @@ -@use "styles/Spacings.module.scss"; - -.modal_content_container { - padding: Spacings.$spacing05; - display: flex; - flex-direction: column; - height: 100%; - justify-content: space-between; - - .modal_content_wrapper { - display: flex; - flex-direction: column; - gap: Spacings.$spacing05; - - .message_wrapper { - display: flex; - flex-direction: column; - } - - .title { - font-weight: 600; - } - } - - .button { - display: flex; - align-self: flex-end; - } -} \ No newline at end of file diff --git a/frontend/app/assistants/AssistantModal/AssistantModal.tsx b/frontend/app/assistants/AssistantModal/AssistantModal.tsx deleted file mode 100644 index fe8988babaa8..000000000000 --- a/frontend/app/assistants/AssistantModal/AssistantModal.tsx +++ /dev/null @@ -1,151 +0,0 @@ -import { useState } from "react"; - -import { Assistant } from "@/lib/api/assistants/types"; -import { useAssistants } from "@/lib/api/assistants/useAssistants"; -import { Stepper } from "@/lib/components/AddBrainModal/components/Stepper/Stepper"; -import { StepValue } from "@/lib/components/AddBrainModal/types/types"; -import { MessageInfoBox } from "@/lib/components/ui/MessageInfoBox/MessageInfoBox"; -import { Modal } from "@/lib/components/ui/Modal/Modal"; -import { QuivrButton } from "@/lib/components/ui/QuivrButton/QuivrButton"; -import { Step } from "@/lib/types/Modal"; - -import styles from "./AssistantModal.module.scss"; -import { InputsStep } from "./InputsStep/InputsStep"; -import { OutputsStep } from "./OutputsStep/OutputsStep"; - -interface AssistantModalProps { - isOpen: boolean; - setIsOpen: (value: boolean) => void; - assistant: Assistant; -} - -export const AssistantModal = ({ - isOpen, - setIsOpen, - assistant, -}: AssistantModalProps): JSX.Element => { - const steps: Step[] = [ - { - label: "Inputs", - value: "FIRST_STEP", - }, - { - label: "Outputs", - value: "SECOND_STEP", - }, - ]; - const [currentStep, setCurrentStep] = useState("FIRST_STEP"); - const [emailOutput, setEmailOutput] = useState(true); - const [brainOutput, setBrainOutput] = useState(""); - const [files, setFiles] = useState<{ key: string; file: File | null }[]>( - assistant.inputs.files.map((fileInput) => ({ - key: fileInput.key, - file: null, - })) - ); - const { processAssistant } = useAssistants(); - - const handleFileChange = (file: File, inputKey: string) => { - setFiles((prevFiles) => - prevFiles.map((fileObj) => - fileObj.key === inputKey ? 
{ ...fileObj, file } : fileObj - ) - ); - }; - - const handleSetIsOpen = (value: boolean) => { - if (!value) { - setCurrentStep("FIRST_STEP"); - } - setIsOpen(value); - }; - - const handleProcessAssistant = async () => { - handleSetIsOpen(false); - await processAssistant( - { - name: assistant.name, - inputs: { - files: files.map((file) => ({ - key: file.key, - value: (file.file as File).name, - })), - urls: [], - texts: [], - }, - outputs: { - email: { - activated: emailOutput, - }, - brain: { - activated: brainOutput !== "", - value: brainOutput, - }, - }, - }, - files.map((file) => file.file as File) - ); - }; - - return ( - } - > -
-
- - {currentStep === "FIRST_STEP" ? ( - -
- Expected Input - {assistant.input_description} -
-
- ) : ( - -
- Output - {assistant.output_description} -
-
- )} - {currentStep === "FIRST_STEP" ? ( - - ) : ( - - )} -
-
- {currentStep === "FIRST_STEP" ? ( - setCurrentStep("SECOND_STEP")} - disabled={!!files.find((file) => !file.file)} - /> - ) : ( - handleProcessAssistant()} - disabled={!emailOutput && brainOutput === ""} - /> - )} -
-
-
- ); -}; diff --git a/frontend/app/assistants/AssistantModal/InputsStep/InputsStep.tsx b/frontend/app/assistants/AssistantModal/InputsStep/InputsStep.tsx deleted file mode 100644 index ca3a1bcf7180..000000000000 --- a/frontend/app/assistants/AssistantModal/InputsStep/InputsStep.tsx +++ /dev/null @@ -1,28 +0,0 @@ -import { capitalCase } from "change-case"; - -import { AssistantInputs } from "@/lib/api/assistants/types"; -import { FileInput } from "@/lib/components/ui/FileInput/FileInput"; - -interface InputsStepProps { - inputs: AssistantInputs; - onFileChange: (file: File, inputKey: string) => void; // -} - -export const InputsStep = ({ - inputs, - onFileChange, -}: InputsStepProps): JSX.Element => { - return ( -
- {inputs.files.map((fileInput) => ( - onFileChange(file, fileInput.key)} - /> - ))} -
- ); -}; diff --git a/frontend/app/assistants/AssistantModal/OutputsStep/OutputsStep.module.scss b/frontend/app/assistants/AssistantModal/OutputsStep/OutputsStep.module.scss deleted file mode 100644 index b2fedc2fc510..000000000000 --- a/frontend/app/assistants/AssistantModal/OutputsStep/OutputsStep.module.scss +++ /dev/null @@ -1,16 +0,0 @@ -@use "styles/Spacings.module.scss"; - -.outputs_wrapper { - display: flex; - flex-direction: column; - gap: Spacings.$spacing03; - - .message_wrapper { - width: 100%; - } - - .brain_selector { - padding-block: Spacings.$spacing02; - max-width: 250px; - } -} \ No newline at end of file diff --git a/frontend/app/assistants/AssistantModal/OutputsStep/OutputsStep.tsx b/frontend/app/assistants/AssistantModal/OutputsStep/OutputsStep.tsx deleted file mode 100644 index 3903b1d59851..000000000000 --- a/frontend/app/assistants/AssistantModal/OutputsStep/OutputsStep.tsx +++ /dev/null @@ -1,83 +0,0 @@ -import { useMemo, useState } from "react"; - -import { formatMinimalBrainsToSelectComponentInput } from "@/app/chat/[chatId]/components/ActionsBar/components/KnowledgeToFeed/utils/formatMinimalBrainsToSelectComponentInput"; -import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox"; -import { MessageInfoBox } from "@/lib/components/ui/MessageInfoBox/MessageInfoBox"; -import { SingleSelector } from "@/lib/components/ui/SingleSelector/SingleSelector"; -import { requiredRolesForUpload } from "@/lib/config/upload"; -import { useBrainContext } from "@/lib/context/BrainProvider/hooks/useBrainContext"; - -import styles from "./OutputsStep.module.scss"; - -interface OutputsStepProps { - setEmailOutput: (value: boolean) => void; - setBrainOutput: (value: string) => void; -} - -export const OutputsStep = ({ - setEmailOutput, - setBrainOutput, -}: OutputsStepProps): JSX.Element => { - const [existingBrainChecked, setExistingBrainChecked] = - useState(false); - const [selectedBrainId, setSelectedBrainId] = useState(""); - const { allBrains } = useBrainContext(); - - const brainsWithUploadRights = formatMinimalBrainsToSelectComponentInput( - useMemo( - () => - allBrains.filter( - (brain) => - requiredRolesForUpload.includes(brain.role) && !!brain.max_files - ), - [allBrains] - ) - ); - - return ( -
- - It can take a few minutes to process. - - - { - if (existingBrainChecked) { - setBrainOutput(""); - setSelectedBrainId(""); - } - setExistingBrainChecked(!existingBrainChecked); - }} - /> - {existingBrainChecked && ( -
- { - setBrainOutput(brain); - setSelectedBrainId(brain); - }} - selectedOption={ - selectedBrainId - ? { - value: selectedBrainId, - label: allBrains.find( - (brain) => brain.id === selectedBrainId - )?.name as string, - } - : undefined - } - placeholder="Select a brain" - iconName="brain" - /> -
- )} -
- ); -}; diff --git a/frontend/app/assistants/page.module.scss b/frontend/app/assistants/page.module.scss deleted file mode 100644 index 0d475ea641b2..000000000000 --- a/frontend/app/assistants/page.module.scss +++ /dev/null @@ -1,20 +0,0 @@ -@use "styles/Spacings.module.scss"; - -.content_wrapper { - padding: Spacings.$spacing06; - display: flex; - flex-direction: column; - gap: Spacings.$spacing05; - - .assistants_grid { - display: flex; - gap: Spacings.$spacing03; - flex-wrap: wrap; - } - - .message_wrapper { - display: flex; - flex-direction: column; - gap: Spacings.$spacing02; - } -} \ No newline at end of file diff --git a/frontend/app/assistants/page.tsx b/frontend/app/assistants/page.tsx deleted file mode 100644 index fb24129b0b53..000000000000 --- a/frontend/app/assistants/page.tsx +++ /dev/null @@ -1,109 +0,0 @@ -"use client"; -import { redirect, usePathname } from "next/navigation"; -import { useEffect, useState } from "react"; - -import { Assistant } from "@/lib/api/assistants/types"; -import { useAssistants } from "@/lib/api/assistants/useAssistants"; -import { PageHeader } from "@/lib/components/PageHeader/PageHeader"; -import { BrainCard } from "@/lib/components/ui/BrainCard/BrainCard"; -import { MessageInfoBox } from "@/lib/components/ui/MessageInfoBox/MessageInfoBox"; -import { useSupabase } from "@/lib/context/SupabaseProvider"; -import { redirectToLogin } from "@/lib/router/redirectToLogin"; - -import { AssistantModal } from "./AssistantModal/AssistantModal"; -import styles from "./page.module.scss"; - -const Assistants = (): JSX.Element => { - const pathname = usePathname(); - const { session } = useSupabase(); - const [assistants, setAssistants] = useState([]); - const [assistantModalOpened, setAssistantModalOpened] = - useState(false); - const [currentAssistant, setCurrentAssistant] = useState( - null - ); - - const { getAssistants } = useAssistants(); - - useEffect(() => { - // REMOVE FOR NOW ACCESS TO QUIVR ASSISTANTS - redirect("/search"); - if (session === null) { - redirectToLogin(); - } - - void (async () => { - try { - const res = await getAssistants(); - if (res) { - setAssistants(res); - } - } catch (error) { - console.error(error); - } - })(); - }, [pathname, session]); - - return ( - <> -
- -
- -
- - A Quivr Assistant is an AI agent that apply specific processes - to an input in order to generate a usable output. - - - For now, you can try the summary assistant, that summarizes a - document and send the result by email or upload it in one of - your brains. - - But don't worry! Other assistants are cooking! -
-
- -
- - Feature still in Beta. Please provide feedbacks - on the chat below! - -
-
-
- {assistants.map((assistant) => { - return ( - { - setAssistantModalOpened(true); - setCurrentAssistant(assistant); - }} - key={assistant.name} - cardKey={assistant.name} - /> - ); - })} -
-
-
- {currentAssistant && ( - - )} - - ); -}; - -export default Assistants; diff --git a/frontend/app/chat/[chatId]/components/ChatDialogueArea/components/ChatDialogue/components/ChatItem/QADisplay/components/MessageRow/components/MessageContent/MessageContent.module.scss b/frontend/app/chat/[chatId]/components/ChatDialogueArea/components/ChatDialogue/components/ChatItem/QADisplay/components/MessageRow/components/MessageContent/MessageContent.module.scss index 54d2e027ff74..22f7bd5d7b8d 100644 --- a/frontend/app/chat/[chatId]/components/ChatDialogueArea/components/ChatDialogue/components/ChatItem/QADisplay/components/MessageRow/components/MessageContent/MessageContent.module.scss +++ b/frontend/app/chat/[chatId]/components/ChatDialogueArea/components/ChatDialogue/components/ChatItem/QADisplay/components/MessageRow/components/MessageContent/MessageContent.module.scss @@ -7,6 +7,7 @@ } .markdown { + font-size: Typography.$small; p { margin: 0; padding: 0; diff --git a/frontend/app/globals.css b/frontend/app/globals.css index 9c506122d6a7..5fa74f1c5a44 100644 --- a/frontend/app/globals.css +++ b/frontend/app/globals.css @@ -3,9 +3,7 @@ @import "tailwindcss/utilities"; @import './colors.css'; -* { - @apply scrollbar; -} + main { @apply max-w-screen-xl mx-auto flex flex-col; @@ -64,6 +62,7 @@ div:focus { --background-blur: rgba(0, 0, 0, 0.9); --background-success: var(--success-lightest); --background-error: var(--dangerous-lightest); + --background-pending: var(--background-3); /* Borders */ --border-0: var(--grey-5); @@ -101,6 +100,7 @@ body.dark_mode { --background-blur: rgba(0, 0, 0, 0.9); --background-success: var(--black-5); --background-error: var(--black-5); + --background-pending: var(--black-5); /* Borders */ --border-0: var(--black-5); diff --git a/frontend/app/quality-assistant/AssistantTab/AssistantCard/AssistantCard.module.scss b/frontend/app/quality-assistant/AssistantTab/AssistantCard/AssistantCard.module.scss new file mode 100644 index 000000000000..33978171560e --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/AssistantCard/AssistantCard.module.scss @@ -0,0 +1,41 @@ +@use "styles/Radius.module.scss"; +@use "styles/Spacings.module.scss"; +@use "styles/Typography.module.scss"; + +.assistant_tab_wrapper { + display: flex; + flex-direction: column; + align-items: center; + gap: Spacings.$spacing05; + border-radius: Radius.$normal; + border: 1px solid var(--border-0); + padding: Spacings.$spacing05; + width: 250px; + cursor: pointer; + height: 100%; + + &.disabled { + pointer-events: none; + opacity: 0.3; + } + + .header { + display: flex; + align-self: flex-start; + align-items: center; + gap: Spacings.$spacing03; + + .title { + @include Typography.H3; + } + } + + .description { + font-size: Typography.$small; + font-style: italic; + } + + &:hover { + background-color: var(--background-3); + } +} diff --git a/frontend/app/quality-assistant/AssistantTab/AssistantCard/AssistantCard.tsx b/frontend/app/quality-assistant/AssistantTab/AssistantCard/AssistantCard.tsx new file mode 100644 index 000000000000..ffc2e45f560f --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/AssistantCard/AssistantCard.tsx @@ -0,0 +1,29 @@ +"use client"; + +import { Icon } from "@/lib/components/ui/Icon/Icon"; + +import styles from "./AssistantCard.module.scss"; + +import { Assistant } from "../../types/assistant"; + +interface AssistantCardProps { + assistant: Assistant; +} + +const AssistantCard = ({ assistant }: AssistantCardProps): JSX.Element => { + return ( +
+
+ + {assistant.name} +
+ {assistant.description} +
+ ); +}; + +export default AssistantCard; diff --git a/frontend/app/quality-assistant/AssistantTab/AssistantTab.module.scss b/frontend/app/quality-assistant/AssistantTab/AssistantTab.module.scss new file mode 100644 index 000000000000..66388788390c --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/AssistantTab.module.scss @@ -0,0 +1,62 @@ +@use "styles/Spacings.module.scss"; +@use "styles/Typography.module.scss"; + +.assistant_tab_wrapper { + height: 100%; + display: flex; + flex-direction: column; + justify-content: space-between; + + .content_section { + display: flex; + flex-direction: column; + gap: Spacings.$spacing06; + + .title { + @include Typography.H2; + } + + .assistant_choice_wrapper { + display: flex; + gap: Spacings.$spacing05; + align-items: stretch; + flex-wrap: wrap; + } + } + + .form_wrapper { + display: flex; + flex-direction: column; + gap: Spacings.$spacing06; + + .title { + @include Typography.H2; + } + + .file_inputs_wrapper { + display: flex; + justify-content: space-between; + width: 100%; + gap: Spacings.$spacing05; + + .file_input_wrapper { + width: 100%; + display: flex; + flex-direction: column; + gap: Spacings.$spacing03; + + .file_header { + display: flex; + align-items: center; + gap: Spacings.$spacing03; + font-size: Typography.$small; + } + } + } + } + + .buttons_wrapper { + display: flex; + justify-content: space-between; + } +} diff --git a/frontend/app/quality-assistant/AssistantTab/AssistantTab.tsx b/frontend/app/quality-assistant/AssistantTab/AssistantTab.tsx new file mode 100644 index 000000000000..1fadcbc6d4ee --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/AssistantTab.tsx @@ -0,0 +1,267 @@ +"use client"; + +import { useEffect, useState } from "react"; + +import { useAssistants } from "@/lib/api/assistants/useAssistants"; +import { FileInput } from "@/lib/components/ui/FileInput/FileInput"; +import { Icon } from "@/lib/components/ui/Icon/Icon"; +import QuivrButton from "@/lib/components/ui/QuivrButton/QuivrButton"; + +import AssistantCard from "./AssistantCard/AssistantCard"; +import styles from "./AssistantTab.module.scss"; +import BooleansInputs from "./BooleansInputs/BooleansInputs"; +import SelectorsInputs from "./SelectorsInput/SelectorsInputs"; + +import { Assistant, ProcessAssistantData } from "../types/assistant"; + +export interface ProcessAssistantInput { + input: ProcessAssistantData; + files: File[]; +} + +interface AssistantTabProps { + setSelectedTab: (tab: string) => void; +} + +const FILE_TYPES = ["pdf", "docx", "doc", "txt"]; + +const useAssistantData = () => { + const [assistants, setAssistants] = useState([]); + const [assistantChoosed, setAssistantChoosed] = useState< + Assistant | undefined + >(undefined); + const { getAssistants } = useAssistants(); + + useEffect(() => { + void (async () => { + try { + const res = await getAssistants(); + setAssistants(res); + } catch (error) { + console.error(error); + } + })(); + }, []); + + return { assistants, assistantChoosed, setAssistantChoosed }; +}; + +const useFormStates = (assistantChoosed: Assistant | undefined) => { + const [booleanStates, setBooleanStates] = useState<{ + [key: string]: boolean | null; + }>({}); + const [selectTextStates, setSelectTextStates] = useState<{ + [key: string]: string | null; + }>({}); + const [fileStates, setFileStates] = useState<{ [key: string]: File }>({}); + const [isFormValid, setIsFormValid] = useState(false); + + useEffect(() => { + if (assistantChoosed?.inputs.booleans) { + const initialBooleanStates = 
assistantChoosed.inputs.booleans.reduce( + (acc, input) => ({ ...acc, [input.key]: false }), + {} + ); + setBooleanStates(initialBooleanStates); + } + if (assistantChoosed?.inputs.select_texts) { + const initialSelectTextStates = + assistantChoosed.inputs.select_texts.reduce( + (acc, input) => ({ ...acc, [input.key]: input.options[0] }), + {} + ); + setSelectTextStates(initialSelectTextStates); + } + }, [assistantChoosed]); + + return { + booleanStates, + setBooleanStates, + selectTextStates, + setSelectTextStates, + fileStates, + setFileStates, + isFormValid, + setIsFormValid, + }; +}; + +const validateForm = ( + assistantChoosed: Assistant | undefined, + booleanStates: { [x: string]: boolean | null }, + fileStates: { [x: string]: File | undefined }, + selectTextStates: { [x: string]: string | null } +) => { + if (!assistantChoosed) { + return false; + } + + const allBooleansSet = + assistantChoosed.inputs.booleans?.every( + (input) => + booleanStates[input.key] !== undefined && + booleanStates[input.key] !== null + ) ?? true; + + const allFilesSet = assistantChoosed.inputs.files.every( + (input) => fileStates[input.key] !== undefined + ); + + const allSelectTextsSet = + assistantChoosed.inputs.select_texts?.every( + (input) => + selectTextStates[input.key] !== undefined && + selectTextStates[input.key] !== null + ) ?? true; + + return allBooleansSet && allFilesSet && allSelectTextsSet; +}; + +const AssistantTab = ({ setSelectedTab }: AssistantTabProps): JSX.Element => { + const { assistants, assistantChoosed, setAssistantChoosed } = + useAssistantData(); + const { + booleanStates, + setBooleanStates, + selectTextStates, + setSelectTextStates, + fileStates, + setFileStates, + isFormValid, + setIsFormValid, + } = useFormStates(assistantChoosed); + const { processTask } = useAssistants(); + const [loading, setLoading] = useState(false); + + const handleFileChange = (key: string, file: File) => { + setFileStates((prevState) => ({ + ...prevState, + [key]: file, + })); + }; + + useEffect(() => { + setIsFormValid( + validateForm( + assistantChoosed, + booleanStates, + fileStates, + selectTextStates + ) + ); + }, [booleanStates, fileStates, selectTextStates, assistantChoosed]); + + const handleSubmit = async () => { + if (assistantChoosed) { + const processAssistantData: ProcessAssistantData = { + id: assistantChoosed.id, + name: assistantChoosed.name, + inputs: { + files: Object.keys(fileStates).map((key) => ({ + key, + value: fileStates[key].name, + })), + booleans: Object.keys(booleanStates).map((key) => ({ + key, + value: booleanStates[key] ?? null, + })), + select_texts: Object.keys(selectTextStates).map((key) => ({ + key, + value: selectTextStates[key], + })), + }, + }; + + const processAssistantInput: ProcessAssistantInput = { + input: processAssistantData, + files: Object.values(fileStates), + }; + + setLoading(true); + await processTask(processAssistantInput); + setSelectedTab("Process"); + setLoading(false); + } + }; + + const resetForm = () => { + setBooleanStates({}); + setSelectTextStates({}); + setFileStates({}); + setIsFormValid(false); + }; + + const handleBack = () => { + resetForm(); + setAssistantChoosed(undefined); + }; + + return ( +
+ {!assistantChoosed ? ( +
+ Choose an assistant +
+ {assistants.map((assistant, index) => ( +
setAssistantChoosed(assistant)}> + +
+ ))} +
+
+ ) : ( +
+ {assistantChoosed.name} +
+ {assistantChoosed.inputs.files.map((input, index) => ( +
+
+ + {input.key} +
+ handleFileChange(input.key, file)} + acceptedFileTypes={FILE_TYPES} + /> +
+ ))} +
+ + +
+ )} + {assistantChoosed && ( +
+ handleBack()} + /> + +
+ )} +
+ ); +}; + +export default AssistantTab; diff --git a/frontend/app/quality-assistant/AssistantTab/BooleansInputs/BooleansInputs.module.scss b/frontend/app/quality-assistant/AssistantTab/BooleansInputs/BooleansInputs.module.scss new file mode 100644 index 000000000000..8884e026675e --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/BooleansInputs/BooleansInputs.module.scss @@ -0,0 +1,5 @@ +@use "styles/Variables.module.scss"; + +.boolean_inputs_wrapper { + width: Variables.$assistantInputWidth; +} diff --git a/frontend/app/quality-assistant/AssistantTab/BooleansInputs/BooleansInputs.tsx b/frontend/app/quality-assistant/AssistantTab/BooleansInputs/BooleansInputs.tsx new file mode 100644 index 000000000000..b723808b7fa5 --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/BooleansInputs/BooleansInputs.tsx @@ -0,0 +1,74 @@ +"use client"; + +import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox"; + +import styles from "./BooleansInputs.module.scss"; + +import { ConditionalInput } from "../../types/assistant"; + +interface BooleansInputsProps { + booleans: { key: string; description: string }[]; + conditionalInputs?: ConditionalInput[]; + booleanStates: { [key: string]: boolean | null }; + setBooleanStates: React.Dispatch< + React.SetStateAction<{ [key: string]: boolean | null }> + >; + selectTextStates: { [key: string]: string | null }; +} + +const BooleansInputs = ({ + booleans, + conditionalInputs, + booleanStates, + setBooleanStates, + selectTextStates, +}: BooleansInputsProps): JSX.Element => { + const handleCheckboxChange = (key: string, checked: boolean) => { + setBooleanStates((prevState: { [key: string]: boolean | null }) => ({ + ...prevState, + [key]: checked, + })); + }; + + const checkCondition = (conditionalInput: ConditionalInput): boolean => { + const { key, condition, value } = conditionalInput; + const targetValue = + booleanStates[key]?.toString() ?? selectTextStates[key] ?? ""; + + if (condition === "equals") { + return targetValue === value; + } else { + return targetValue !== value; + } + }; + + return ( +
+        {booleans.map((input, index) => {
+          // Default to visible when no conditional input targets this checkbox
+          const shouldShow =
+            conditionalInputs?.every((conditionalInput) => {
+              if (conditionalInput.conditional_key === input.key) {
+                return checkCondition(conditionalInput);
+              }
+
+              return true;
+            }) ?? true;
+
+          if (!shouldShow) {
+            return null;
+          }
+
+          return (
+
+ handleCheckboxChange(input.key, checked)} + /> +
+ ); + })} +
+ ); +}; + +export default BooleansInputs; diff --git a/frontend/app/quality-assistant/AssistantTab/SelectorsInput/SelectorsInputs.module.scss b/frontend/app/quality-assistant/AssistantTab/SelectorsInput/SelectorsInputs.module.scss new file mode 100644 index 000000000000..7bb6337530a9 --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/SelectorsInput/SelectorsInputs.module.scss @@ -0,0 +1,5 @@ +@use "styles/Variables.module.scss"; + +.select_texts_wrapper { + width: Variables.$assistantInputWidth; +} diff --git a/frontend/app/quality-assistant/AssistantTab/SelectorsInput/SelectorsInputs.tsx b/frontend/app/quality-assistant/AssistantTab/SelectorsInput/SelectorsInputs.tsx new file mode 100644 index 000000000000..1a07f47d9bc5 --- /dev/null +++ b/frontend/app/quality-assistant/AssistantTab/SelectorsInput/SelectorsInputs.tsx @@ -0,0 +1,49 @@ +import React from "react"; + +import { SingleSelector } from "@/lib/components/ui/SingleSelector/SingleSelector"; + +import styles from "./SelectorsInputs.module.scss"; + +interface SelectorsInputsProps { + selectTexts: { key: string; options: string[] }[]; + selectTextStates: { [key: string]: string | null }; + setSelectTextStates: React.Dispatch< + React.SetStateAction<{ [key: string]: string | null }> + >; +} + +const SelectorsInputs = ({ + selectTexts, + selectTextStates, + setSelectTextStates, +}: SelectorsInputsProps): JSX.Element => { + const handleSelectTextChange = (key: string, value: string) => { + setSelectTextStates((prevState) => ({ + ...prevState, + [key]: value, + })); + }; + + return ( +
+ {selectTexts.map((input, index) => ( +
+ { + return { label: option, value: option }; + })} + onChange={(value) => handleSelectTextChange(input.key, value)} + selectedOption={{ + label: selectTextStates[input.key] ?? input.options[0], + value: selectTextStates[input.key] ?? input.options[0], + }} + /> +
+ ))} +
+ ); +}; + +export default SelectorsInputs; diff --git a/frontend/app/quality-assistant/ProcessTab/Process/ProcessLine.module.scss b/frontend/app/quality-assistant/ProcessTab/Process/ProcessLine.module.scss new file mode 100644 index 000000000000..3f3c6fc396cf --- /dev/null +++ b/frontend/app/quality-assistant/ProcessTab/Process/ProcessLine.module.scss @@ -0,0 +1,193 @@ +@use "styles/Radius.module.scss"; +@use "styles/Spacings.module.scss"; +@use "styles/Typography.module.scss"; +@use "styles/Variables.module.scss"; + +.process_wrapper { + padding-inline: Spacings.$spacing06; + overflow: hidden; + display: flex; + gap: Spacings.$spacing02; + justify-content: space-between; + align-items: center; + border: 1px solid var(--border-0); + padding-block: Spacings.$spacing03; + position: relative; + overflow: visible; + font-size: Typography.$small; + border-bottom: none; + + &.last { + border-radius: 0 0 Radius.$normal Radius.$normal; + border-bottom: 1px solid var(--border-0); + } + + &.clickable { + cursor: pointer; + + &:hover { + background-color: var(--background-1); + } + } + + .left { + display: flex; + align-items: center; + gap: calc(Spacings.$spacing06 + 6px); + overflow: hidden; + + .left_fields { + display: flex; + align-items: center; + overflow: hidden; + + .assistant { + font-size: Typography.$small; + min-width: Variables.$menuSectionWidth; + max-width: Variables.$menuSectionWidth; + } + + .files { + font-size: Typography.$tiny; + color: var(--text-4); + overflow: hidden; + + .filename { + @include Typography.EllipsisOverflow; + } + } + } + } + + .right { + display: flex; + gap: Spacings.$spacing05; + align-items: center; + + .date { + font-size: Typography.$very_tiny; + width: 150px; + display: flex; + align-items: center; + justify-content: center; + @include Typography.EllipsisOverflow; + } + + .status { + width: 100px; + display: flex; + align-items: center; + justify-content: center; + @include Typography.EllipsisOverflow; + } + } +} + +.markdown { + p { + margin: 0; + padding-block: Spacings.$spacing06; + align-items: center; + } + + ul { + list-style-type: disc; + margin-top: 0; + padding: 0; + margin-left: Spacings.$spacing05; + display: flex; + flex-direction: column; + gap: Spacings.$spacing03; + + li { + white-space-collapse: collapse; + } + } + + ol { + list-style-type: decimal; + padding-left: Spacings.$spacing05; + list-style-position: outside; + + li { + white-space-collapse: collapse; + } + } + + h1 { + @include Typography.H1; + } + + h2 { + @include Typography.H2; + } + + h3 { + @include Typography.H3; + } + + table { + width: 100%; + border-collapse: collapse; + margin: Spacings.$spacing05 0; + } + + thead { + background-color: var(--background-1); + } + + tr { + border-bottom: 1px solid var(--border-0); + } + + th, + td { + padding: Spacings.$spacing03; + text-align: left; + } + + th { + font-weight: bold; + } + + pre[class*="language-"] { + background: var(--background-5); + color: var(--white-0); + padding: Spacings.$spacing05; + border-radius: Radius.$normal; + overflow: auto; + margin: 0 0 Spacings.$spacing05 0; + white-space: pre-wrap; + font-size: Typography.$small; + font-family: "Courier New", Courier, monospace; + } + + code[class*="language-"] { + background: none; + color: inherit; + border-radius: Radius.$normal; + font-family: "Courier New", Courier, monospace; + font-size: Typography.$small; + white-space: pre-wrap; + } + + code { + background: var(--background-5); + color: var(--white-0); + padding: Spacings.$spacing01; + border-radius: 
Radius.$normal; + font-family: "Courier New", Courier, monospace; + font-size: Typography.$medium; + } + + .code_block { + .icon { + position: absolute; + right: 0; + padding: Spacings.$spacing05; + } + code { + white-space: pre-wrap; + } + } +} diff --git a/frontend/app/quality-assistant/ProcessTab/Process/ProcessLine.tsx b/frontend/app/quality-assistant/ProcessTab/Process/ProcessLine.tsx new file mode 100644 index 000000000000..c7077523d9a1 --- /dev/null +++ b/frontend/app/quality-assistant/ProcessTab/Process/ProcessLine.tsx @@ -0,0 +1,173 @@ +"use client"; + +import { capitalCase } from "change-case"; +import format from "date-fns/format"; +import { fr } from "date-fns/locale"; +import { saveAs } from "file-saver"; +import { useState } from "react"; +import ReactMarkdown from "react-markdown"; +import gfm from "remark-gfm"; + +import { useAssistants } from "@/lib/api/assistants/useAssistants"; +import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox"; +import { Icon } from "@/lib/components/ui/Icon/Icon"; +import { LoaderIcon } from "@/lib/components/ui/LoaderIcon/LoaderIcon"; +import { Modal } from "@/lib/components/ui/Modal/Modal"; +import { Tag } from "@/lib/components/ui/Tag/Tag"; +import { useDevice } from "@/lib/hooks/useDevice"; + +import styles from "./ProcessLine.module.scss"; + +import { Process } from "../../types/process"; + +interface ProcessLineProps { + process: Process; + last?: boolean; + selected: boolean; + setSelected: (selected: boolean, event: React.MouseEvent) => void; +} + +const ProcessLine = ({ + process, + last, + selected, + setSelected, +}: ProcessLineProps): JSX.Element => { + const [showResult, setShowResult] = useState(false); + const [downloadUrl, setDownloadUrl] = useState(null); + const { isMobile } = useDevice(); + const { downloadTaskResult } = useAssistants(); + + const handleMouseEnter = async () => { + if (process.status === "completed" && !downloadUrl) { + const res: string = await downloadTaskResult(process.id); + setDownloadUrl(res); + } + }; + + const handleDownload = async () => { + if (downloadUrl) { + const response = await fetch( + downloadUrl.replace("host.docker.internal", "localhost") + ); + const blob = await response.blob(); + const formattedDate = format( + new Date(process.creation_time), + "yyyy-MM-dd", + { locale: fr } + ); + const fileName = `${process.assistant_name}_${formattedDate}.pdf`; + saveAs(blob, fileName); + } + }; + + return ( + <> +
{ + if (process.status === "completed") { + setShowResult(!showResult); + } + }} + onMouseEnter={() => void handleMouseEnter()} + > +
+ setSelected(checked, event)} + /> +
+ {process.assistant_name} + + {process.task_metadata.input_files.map((file, index) => ( +
+ {file} +
+ ))} +
+
+
+
+ {!isMobile && ( + <> + + {format( + new Date(process.creation_time), + "d MMMM yyyy '-' HH:mm:ss", + { + locale: fr, + } + )} + +
+ +
+ + )} +
) => { + event.stopPropagation(); + }} + > + {process.status === "processing" ? ( + + ) : downloadUrl ? ( +
void handleDownload()}> + +
+ ) : ( + + )} +
+
+
+ + } + > + {process.answer && ( +
+ + {process.answer.replace(/\\n/g, "\n")} + +
+ )} +
+ + ); +}; + +export default ProcessLine; diff --git a/frontend/app/quality-assistant/ProcessTab/ProcessTab.module.scss b/frontend/app/quality-assistant/ProcessTab/ProcessTab.module.scss new file mode 100644 index 000000000000..eda0ade1cec5 --- /dev/null +++ b/frontend/app/quality-assistant/ProcessTab/ProcessTab.module.scss @@ -0,0 +1,122 @@ +@use "styles/Radius.module.scss"; +@use "styles/ScreenSizes.module.scss"; +@use "styles/Spacings.module.scss"; +@use "styles/Typography.module.scss"; +@use "styles/Variables.module.scss"; + +.process_tab_wrapper { + display: flex; + flex-direction: column; + gap: Spacings.$spacing05; + padding-bottom: Spacings.$spacing10; + border-radius: Radius.$normal; + + @media screen and (max-width: ScreenSizes.$small) { + overflow-x: auto; + } + + .title { + @include Typography.H2; + } + + .table_header { + display: flex; + justify-content: space-between; + align-items: center; + gap: Spacings.$spacing03; + + .search { + width: 250px; + } + } + + .first_line { + display: flex; + justify-content: space-between; + padding-left: calc(Spacings.$spacing06); + padding-right: calc(Spacings.$spacing11 + 6px); + padding-block: Spacings.$spacing02; + font-weight: 500; + background-color: var(--background-1); + font-size: Typography.$small; + border: 1px solid var(--border-0); + border-radius: Radius.$normal Radius.$normal 0 0; + border-bottom: none; + + &.empty { + border: 1px solid var(--border-0); + border-radius: Radius.$normal; + } + + .left { + display: flex; + align-items: center; + gap: calc(Spacings.$spacing06 + 6px); + + .left_fields { + display: flex; + align-items: center; + + .field { + display: flex; + align-items: center; + gap: Spacings.$spacing02; + cursor: pointer; + + .icon { + visibility: hidden; + } + + &:hover { + .icon { + visibility: visible; + } + } + + &.assistant { + width: Variables.$menuSectionWidth; + } + } + } + } + + .right { + display: flex; + gap: calc(Spacings.$spacing12 + Spacings.$spacing06 + 2px); + + .status { + display: flex; + align-items: center; + gap: Spacings.$spacing02; + cursor: pointer; + + .icon { + visibility: hidden; + } + + &:hover { + .icon { + visibility: visible; + } + } + } + + .date { + display: flex; + align-items: center; + gap: Spacings.$spacing02; + cursor: pointer; + + .icon { + visibility: hidden; + } + + &:hover { + .icon { + visibility: visible; + } + } + } + } + } +} diff --git a/frontend/app/quality-assistant/ProcessTab/ProcessTab.tsx b/frontend/app/quality-assistant/ProcessTab/ProcessTab.tsx new file mode 100644 index 000000000000..8ca419b75008 --- /dev/null +++ b/frontend/app/quality-assistant/ProcessTab/ProcessTab.tsx @@ -0,0 +1,239 @@ +"use client"; + +import { useEffect, useState } from "react"; + +import { useAssistants } from "@/lib/api/assistants/useAssistants"; +import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox"; +import { Icon } from "@/lib/components/ui/Icon/Icon"; +import { QuivrButton } from "@/lib/components/ui/QuivrButton/QuivrButton"; +import { TextInput } from "@/lib/components/ui/TextInput/TextInput"; +import { useSupabase } from "@/lib/context/SupabaseProvider"; +import { filterAndSort, updateSelectedItems } from "@/lib/helpers/table"; +import { useDevice } from "@/lib/hooks/useDevice"; + +import ProcessLine from "./Process/ProcessLine"; +import styles from "./ProcessTab.module.scss"; + +import { Process } from "../types/process"; + +const ProcessTab = (): JSX.Element => { + const [processes, setProcesses] = useState([]); + const [searchQuery, setSearchQuery] = useState(""); 
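+  // Multi-select state: lastSelectedIndex lets handleSelect grow the selection with shift-click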
+ const [selectedProcess, setSelectedProcess] = useState([]); + const [allChecked, setAllChecked] = useState(false); + const [sortConfig, setSortConfig] = useState<{ + key: keyof Process; + direction: "ascending" | "descending"; + }>({ key: "creation_time", direction: "descending" }); + const [filteredProcess, setFilteredProcess] = useState([]); + const [lastSelectedIndex, setLastSelectedIndex] = useState( + null + ); + const [loading, setLoading] = useState(false); + + const { getTasks, deleteTask } = useAssistants(); + const { supabase } = useSupabase(); + const { isMobile } = useDevice(); + + const loadTasks = async () => { + try { + const res = await getTasks(); + setProcesses(res); + setFilteredProcess(res); + } catch (error) { + console.error(error); + } + }; + + const handleStatusChange = () => { + void loadTasks(); + }; + + useEffect(() => { + void loadTasks(); + }, []); + + useEffect(() => { + const channel = supabase + .channel("tasks") + .on( + "postgres_changes", + { event: "UPDATE", schema: "public", table: "tasks" }, + handleStatusChange + ) + .subscribe(); + + return () => { + void supabase.removeChannel(channel); + }; + }, []); + + useEffect(() => { + setFilteredProcess( + filterAndSort( + processes, + searchQuery, + sortConfig, + (process) => process[sortConfig.key] + ) + ); + }, [processes, searchQuery, sortConfig]); + + const handleDelete = async () => { + setLoading(true); + await Promise.all( + selectedProcess.map(async (process) => await deleteTask(process.id)) + ); + + const remainingProcesses = processes.filter( + (process) => + !selectedProcess.some((selected) => selected.id === process.id) + ); + + setProcesses(remainingProcesses); + setFilteredProcess( + filterAndSort( + remainingProcesses, + searchQuery, + sortConfig, + (process) => process[sortConfig.key] + ) + ); + + setSelectedProcess([]); + setAllChecked(false); + setLoading(false); + }; + + const handleSelect = ( + process: Process, + index: number, + event: React.MouseEvent + ) => { + const newSelectedProcess = updateSelectedItems({ + item: process, + index, + event, + lastSelectedIndex, + filteredList: filteredProcess, + selectedItems: selectedProcess, + }); + setSelectedProcess(newSelectedProcess.selectedItems); + setLastSelectedIndex(newSelectedProcess.lastSelectedIndex); + }; + + const handleSort = (key: keyof Process) => { + setSortConfig((prevSortConfig) => { + let direction: "ascending" | "descending" = "ascending"; + if ( + prevSortConfig.key === key && + prevSortConfig.direction === "ascending" + ) { + direction = "descending"; + } + + return { key, direction }; + }); + }; + + return ( +
+ My Results +
+
+ +
+ +
+
+
+
+ { + setAllChecked(checked); + setSelectedProcess(checked ? filteredProcess : []); + }} + /> +
+
handleSort("assistant_name")} + > + Assistant +
+ +
+
+
handleSort("name")}> + Files +
+ +
+
+
+
+
+ {!isMobile && ( + <> +
handleSort("creation_time")} + > + Date +
+ +
+
+
handleSort("status")} + > + Statut +
+ +
+
+ + )} +
+
+
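+          {/* One ProcessLine per task; a completed line opens its result and offers a PDF download */}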
+ {filteredProcess.map((process, index) => ( +
+ item.id === process.id + )} + setSelected={(_selected, event) => + handleSelect(process, index, event) + } + /> +
+ ))} +
+
+
+ ); +}; + +export default ProcessTab; diff --git a/frontend/app/quality-assistant/page.module.scss b/frontend/app/quality-assistant/page.module.scss new file mode 100644 index 000000000000..27b57110dba4 --- /dev/null +++ b/frontend/app/quality-assistant/page.module.scss @@ -0,0 +1,20 @@ +@use "styles/Spacings.module.scss"; + +.page_wrapper { + display: flex; + flex-direction: column; + gap: Spacings.$spacing05; + width: 100%; + height: 100vh; + overflow: hidden; + + .content_wrapper { + padding-inline: Spacings.$spacing09; + padding-block: Spacings.$spacing05; + overflow-y: auto; + display: flex; + flex-direction: column; + gap: Spacings.$spacing05; + height: 100%; + } +} diff --git a/frontend/app/quality-assistant/page.tsx b/frontend/app/quality-assistant/page.tsx new file mode 100644 index 000000000000..a4da3eb48c40 --- /dev/null +++ b/frontend/app/quality-assistant/page.tsx @@ -0,0 +1,47 @@ +"use client"; + +import { useState } from "react"; + +import PageHeader from "@/lib/components/PageHeader/PageHeader"; +import { Tabs } from "@/lib/components/ui/Tabs/Tabs"; +import { Tab } from "@/lib/types/Tab"; + +import AssistantTab from "./AssistantTab/AssistantTab"; +import ProcessTab from "./ProcessTab/ProcessTab"; +import styles from "./page.module.scss"; + +const QualityAssistant = (): JSX.Element => { + const [selectedTab, setSelectedTab] = useState("Assistants"); + + const qualityAssistantTab: Tab[] = [ + { + label: "Assistants", + isSelected: selectedTab === "Assistants", + onClick: () => setSelectedTab("Assistants"), + iconName: "assistant", + }, + { + label: "Process", + isSelected: selectedTab === "Process", + onClick: () => setSelectedTab("Process"), + iconName: "waiting", + }, + ]; + + return ( +
+
+ +
+
+ + {selectedTab === "Assistants" && ( + + )} + {selectedTab === "Process" && } +
+
+ ); +}; + +export default QualityAssistant; diff --git a/frontend/app/quality-assistant/types/assistant.ts b/frontend/app/quality-assistant/types/assistant.ts new file mode 100644 index 000000000000..3da1bca154dd --- /dev/null +++ b/frontend/app/quality-assistant/types/assistant.ts @@ -0,0 +1,123 @@ +interface Pricing { + cost: number; + description: string; +} + +interface InputFile { + key: string; + allowed_extensions: string[]; + required: boolean; + description: string; +} + +interface InputUrl { + key: string; + required: boolean; + description: string; +} + +interface InputText { + key: string; + required: boolean; + description: string; + validation_regex: string; +} + +interface InputBoolean { + key: string; + required: boolean; + description: string; +} + +interface InputNumber { + key: string; + required: boolean; + description: string; + min: number; + max: number; + increment: number; + default: number; +} + +interface SelectText { + key: string; + required: boolean; + description: string; + options: string[]; + default: string; +} + +interface SelectNumber { + key: string; + required: boolean; + description: string; + options: number[]; + default: number; +} + +interface Brain { + required: boolean; + description: string; + type: string; +} + +interface Inputs { + files: InputFile[]; + urls: InputUrl[]; + texts: InputText[]; + booleans?: InputBoolean[]; + numbers: InputNumber[]; + select_texts?: SelectText[]; + select_numbers: SelectNumber[]; + brain: Brain; + conditional_inputs?: ConditionalInput[]; +} + +export interface Assistant { + id: number; + name: string; + description: string; + pricing: Pricing; + tags: string[]; + input_description: string; + output_description: string; + inputs: Inputs; + icon_url: string; +} + +interface ProcessAssistantInputFile { + key: string; + value: string; +} + +export interface ConditionalInput { + key: string; + conditional_key: string; + condition: "equals" | "not_equals"; + value: string; +} + +export interface ProcessAssistantData { + id: number; + name: string; + inputs: { + files?: ProcessAssistantInputFile[]; + urls?: { key: string; value: string }[]; + texts?: { key: string; value: string }[]; + booleans?: { key: string; value: boolean | null }[]; + numbers?: { key: string; value: number }[]; + select_texts?: { key: string; value: string | null }[]; + select_numbers?: { key: string; value: number }[]; + brain?: { value: string }; + conditional_inputs?: ConditionalInput[]; + }; +} + +export interface ProcessAssistantInput { + input: ProcessAssistantData; + files: File[]; +} + +export interface ResultDownload { + data: string; +} diff --git a/frontend/app/quality-assistant/types/process.ts b/frontend/app/quality-assistant/types/process.ts new file mode 100644 index 000000000000..712bd6df32c7 --- /dev/null +++ b/frontend/app/quality-assistant/types/process.ts @@ -0,0 +1,13 @@ +export interface ProcessMetadata { + input_files: string[]; +} + +export interface Process { + answer: string; + id: number; + name: string; + creation_time: string; + status: "pending" | "processing" | "completed" | "error"; + assistant_name: string; + task_metadata: ProcessMetadata; +} diff --git a/frontend/app/studio/[brainId]/BrainManagementTabs/BrainManagementTabs.module.scss b/frontend/app/studio/[brainId]/BrainManagementTabs/BrainManagementTabs.module.scss index bc2d3efc5b99..5d884f035b08 100644 --- a/frontend/app/studio/[brainId]/BrainManagementTabs/BrainManagementTabs.module.scss +++ 
b/frontend/app/studio/[brainId]/BrainManagementTabs/BrainManagementTabs.module.scss @@ -14,6 +14,7 @@ margin-left: -(Spacings.$spacing05 + Spacings.$spacing03); gap: Spacings.$spacing03; align-items: center; + padding-top: Spacings.$spacing05; .tabs { width: 100%; diff --git a/frontend/app/studio/[brainId]/BrainManagementTabs/components/KnowledgeTab/KnowledgeTable/KnowledgeTable.tsx b/frontend/app/studio/[brainId]/BrainManagementTabs/components/KnowledgeTab/KnowledgeTable/KnowledgeTable.tsx index a7ac8f90ebcf..7bfd982907a1 100644 --- a/frontend/app/studio/[brainId]/BrainManagementTabs/components/KnowledgeTab/KnowledgeTable/KnowledgeTable.tsx +++ b/frontend/app/studio/[brainId]/BrainManagementTabs/components/KnowledgeTab/KnowledgeTable/KnowledgeTable.tsx @@ -4,6 +4,7 @@ import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox"; import { Icon } from "@/lib/components/ui/Icon/Icon"; import { QuivrButton } from "@/lib/components/ui/QuivrButton/QuivrButton"; import { TextInput } from "@/lib/components/ui/TextInput/TextInput"; +import { updateSelectedItems } from "@/lib/helpers/table"; import { useDevice } from "@/lib/hooks/useDevice"; import { isUploadedKnowledge, Knowledge } from "@/lib/types/Knowledge"; @@ -58,51 +59,6 @@ const filterAndSortKnowledge = ( return filteredList; }; -const updateSelectedKnowledge = ({ - knowledge, - index, - event, - lastSelectedIndex, - filteredKnowledgeList, - selectedKnowledge, -}: { - knowledge: Knowledge; - index: number; - event: React.MouseEvent; - lastSelectedIndex: number | null; - filteredKnowledgeList: Knowledge[]; - selectedKnowledge: Knowledge[]; -}): { selectedKnowledge: Knowledge[]; lastSelectedIndex: number | null } => { - if (event.shiftKey && lastSelectedIndex !== null) { - const start = Math.min(lastSelectedIndex, index); - const end = Math.max(lastSelectedIndex, index); - const range = filteredKnowledgeList.slice(start, end + 1); - - const newSelected = [...selectedKnowledge]; - range.forEach((item) => { - if (!newSelected.some((selectedItem) => selectedItem.id === item.id)) { - newSelected.push(item); - } - }); - - return { selectedKnowledge: newSelected, lastSelectedIndex: index }; - } else { - const isSelected = selectedKnowledge.some( - (item) => item.id === knowledge.id - ); - const newSelectedKnowledge = isSelected - ? selectedKnowledge.filter( - (selectedItem) => selectedItem.id !== knowledge.id - ) - : [...selectedKnowledge, knowledge]; - - return { - selectedKnowledge: newSelectedKnowledge, - lastSelectedIndex: isSelected ? 
null : index,
-    };
-  }
-};
-
 const KnowledgeTable = React.forwardRef(
   ({ knowledgeList }, ref) => {
     const [selectedKnowledge, setSelectedKnowledge] = useState([]);
@@ -131,15 +87,15 @@ const KnowledgeTable = React.forwardRef(
       index: number,
       event: React.MouseEvent
     ) => {
-      const newSelectedKnowledge = updateSelectedKnowledge({
-        knowledge,
+      const newSelectedKnowledge = updateSelectedItems({
+        item: knowledge,
         index,
         event,
         lastSelectedIndex,
-        filteredKnowledgeList,
-        selectedKnowledge,
+        filteredList: filteredKnowledgeList,
+        selectedItems: selectedKnowledge,
       });
-      setSelectedKnowledge(newSelectedKnowledge.selectedKnowledge);
+      setSelectedKnowledge(newSelectedKnowledge.selectedItems);
       setLastSelectedIndex(newSelectedKnowledge.lastSelectedIndex);
     };
 
diff --git a/frontend/lib/api/assistants/assistants.ts b/frontend/lib/api/assistants/assistants.ts
index 7db2c0dfe267..8e02aae0d92d 100644
--- a/frontend/lib/api/assistants/assistants.ts
+++ b/frontend/lib/api/assistants/assistants.ts
@@ -1,38 +1,63 @@
 import { AxiosInstance } from "axios";
 
-import { Assistant, ProcessAssistantRequest } from "./types";
+import {
+  Assistant,
+  ProcessAssistantInput,
+} from "@/app/quality-assistant/types/assistant";
+import { Process } from "@/app/quality-assistant/types/process";
 
 export const getAssistants = async (
   axiosInstance: AxiosInstance
-): Promise<Assistant[]> => {
-  return (await axiosInstance.get<Assistant[]>("/assistants")).data;
+): Promise<Assistant[]> => {
+  return (await axiosInstance.get<Assistant[]>(`/assistants`)).data;
 };
 
-export const processAssistant = async (
+export const getTasks = async (
+  axiosInstance: AxiosInstance
+): Promise<Process[]> => {
+  return (await axiosInstance.get<Process[]>(`/assistants/tasks`)).data;
+};
+
+export const processTask = async (
   axiosInstance: AxiosInstance,
-  input: ProcessAssistantRequest,
-  files: File[]
-): Promise => {
+  processAssistantInput: ProcessAssistantInput
+): Promise<Process> => {
   const formData = new FormData();
-  formData.append(
-    "input",
-    JSON.stringify({
-      name: input.name,
-      inputs: {
-        files: input.inputs.files,
-        urls: input.inputs.urls,
-        texts: input.inputs.texts,
+  formData.append("input", JSON.stringify(processAssistantInput.input));
+
+  processAssistantInput.files.forEach((file) => {
+    if (file instanceof File) {
+      formData.append("files", file);
+    } else {
+      console.error("The item is not a valid file", file);
+    }
+  });
+
+  const response = await axiosInstance.post(
+    `/assistants/task`,
+    formData,
+    {
+      headers: {
+        "Content-Type": "multipart/form-data",
       },
-      outputs: input.outputs,
-    })
+    }
   );
 
-  files.forEach((file) => {
-    formData.append("files", file);
-  });
+  return response.data;
+};
+
+export const deleteTask = async (
+  axiosInstance: AxiosInstance,
+  taskId: number
+): Promise<void> => {
+  await axiosInstance.delete(`/assistants/task/${taskId}`);
+};
 
-  return (
-    await axiosInstance.post("/assistant/process", formData)
-  ).data;
+export const downloadTaskResult = async (
+  axiosInstance: AxiosInstance,
+  taskId: number
+): Promise<string> => {
+  return (await axiosInstance(`/assistants/task/${taskId}/download`))
+    .data;
 };
diff --git a/frontend/lib/api/assistants/useAssistants.ts b/frontend/lib/api/assistants/useAssistants.ts
index 90630ec2d97f..04df53d10743 100644
--- a/frontend/lib/api/assistants/useAssistants.ts
+++ b/frontend/lib/api/assistants/useAssistants.ts
@@ -1,7 +1,13 @@
+import { ProcessAssistantInput } from "@/app/quality-assistant/types/assistant";
 import { useAxios } from "@/lib/hooks";
 
-import { getAssistants, processAssistant } from "./assistants";
-import { ProcessAssistantRequest
} from "./types"; +import { + deleteTask, + downloadTaskResult, + getAssistants, + getTasks, + processTask, +} from "./assistants"; // eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types export const useAssistants = () => { @@ -9,7 +15,11 @@ export const useAssistants = () => { return { getAssistants: async () => getAssistants(axiosInstance), - processAssistant: async (input: ProcessAssistantRequest, files: File[]) => - processAssistant(axiosInstance, input, files), + getTasks: async () => getTasks(axiosInstance), + processTask: async (processAssistantInput: ProcessAssistantInput) => + processTask(axiosInstance, processAssistantInput), + deleteTask: async (taskId: number) => deleteTask(axiosInstance, taskId), + downloadTaskResult: async (taskId: number) => + downloadTaskResult(axiosInstance, taskId), }; }; diff --git a/frontend/lib/components/Menu/Menu.tsx b/frontend/lib/components/Menu/Menu.tsx index 2d40cb50f830..33667f00f475 100644 --- a/frontend/lib/components/Menu/Menu.tsx +++ b/frontend/lib/components/Menu/Menu.tsx @@ -1,5 +1,6 @@ import { MotionConfig } from "framer-motion"; import { usePathname, useRouter } from "next/navigation"; +import { useFeatureFlagEnabled } from 'posthog-js/react'; import { useState } from "react"; import { MenuControlButton } from "@/app/chat/[chatId]/components/ActionsBar/components/ChatInput/components/MenuControlButton/MenuControlButton"; @@ -17,11 +18,13 @@ import { HomeButton } from "./components/HomeButton/HomeButton"; import { Notifications } from "./components/Notifications/Notifications"; import { NotificationsButton } from "./components/NotificationsButton/NotificationsButton"; import { ProfileButton } from "./components/ProfileButton/ProfileButton"; +import { QualityAssistantButton } from "./components/QualityAssistantButton/QualityAssistantButton"; import { SocialsButtons } from "./components/SocialsButtons/SocialsButtons"; import { StudioButton } from "./components/StudioButton/StudioButton"; import { ThreadsButton } from "./components/ThreadsButton/ThreadsButton"; import { UpgradeToPlusButton } from "./components/UpgradeToPlusButton/UpgradeToPlusButton"; + const showUpgradeButton = process.env.NEXT_PUBLIC_SHOW_TOKENS === "true"; export const Menu = (): JSX.Element => { @@ -31,6 +34,8 @@ export const Menu = (): JSX.Element => { const pathname = usePathname() ?? ""; const [isLogoHovered, setIsLogoHovered] = useState(false); const { isDarkMode } = useUserSettingsContext(); + const flagEnabled = useFeatureFlagEnabled('show-quality-assistant') + useChatsList(); @@ -44,6 +49,7 @@ export const Menu = (): JSX.Element => { "/library", "/search", "studio", + "/quality-assistant", "/user", ]; @@ -59,9 +65,8 @@ export const Menu = (): JSX.Element => {
@@ -83,6 +88,8 @@ export const Menu = (): JSX.Element => {
+ + {flagEnabled && } diff --git a/frontend/lib/components/Menu/components/QualityAssistantButton/QualityAssistantButton.module.scss b/frontend/lib/components/Menu/components/QualityAssistantButton/QualityAssistantButton.module.scss new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/frontend/lib/components/Menu/components/QualityAssistantButton/QualityAssistantButton.tsx b/frontend/lib/components/Menu/components/QualityAssistantButton/QualityAssistantButton.tsx new file mode 100644 index 000000000000..45a58f005bbb --- /dev/null +++ b/frontend/lib/components/Menu/components/QualityAssistantButton/QualityAssistantButton.tsx @@ -0,0 +1,21 @@ +import Link from "next/link"; +import { usePathname } from "next/navigation"; + +import { MenuButton } from "@/lib/components/Menu/components/MenuButton/MenuButton"; + +export const QualityAssistantButton = (): JSX.Element => { + const pathname = usePathname() ?? ""; + const isSelected = pathname.includes("/quality-assistant"); + + return ( + + + + ); +}; diff --git a/frontend/lib/components/ui/FileInput/FileInput.module.scss b/frontend/lib/components/ui/FileInput/FileInput.module.scss index 030051960717..93d8a34b01d4 100644 --- a/frontend/lib/components/ui/FileInput/FileInput.module.scss +++ b/frontend/lib/components/ui/FileInput/FileInput.module.scss @@ -1,24 +1,62 @@ @use "styles/Radius.module.scss"; +@use "styles/ScreenSizes.module.scss"; @use "styles/Spacings.module.scss"; @use "styles/Typography.module.scss"; -.header_wrapper { - display: flex; - justify-content: space-between; - align-items: center; - border: 1px solid var(--border-2); - border-radius: Radius.$big; - padding: Spacings.$spacing03; - cursor: pointer; - - &:hover { - background-color: var(--background-2); - border-color: var(--accent); +.file_input_wrapper { + width: 100%; + height: 200px; + + &.drag_active { + .header_wrapper { + border: 3px dashed var(--accent); + background-color: var(--background-3); + } } - .placeholder { - color: var(--text-2); - font-size: Typography.$small; + .header_wrapper { + display: flex; + justify-content: space-between; + align-items: center; + border: 1px solid var(--border-2); + border-radius: Radius.$big; + padding: Spacings.$spacing03; + cursor: pointer; + height: 100%; + width: 100%; + + &:hover { + border: 3px dashed var(--accent); + } + + &.drag_active { + border: 3px dashed var(--accent); + background-color: var(--background-3); + } + + .box_content { + width: 100%; + display: flex; + flex-direction: column; + column-gap: Spacings.$spacing05; + justify-content: center; + align-items: center; + height: 100%; + + .input { + display: flex; + gap: Spacings.$spacing02; + padding: Spacings.$spacing05; + + @media (max-width: ScreenSizes.$small) { + flex-direction: column; + } + + .clickable { + font-weight: bold; + } + } + } } } @@ -29,4 +67,4 @@ .error_message { font-size: Typography.$tiny; color: var(--dangerous); -} \ No newline at end of file +} diff --git a/frontend/lib/components/ui/FileInput/FileInput.tsx b/frontend/lib/components/ui/FileInput/FileInput.tsx index 650a742b25cf..69543b15ad96 100644 --- a/frontend/lib/components/ui/FileInput/FileInput.tsx +++ b/frontend/lib/components/ui/FileInput/FileInput.tsx @@ -1,35 +1,36 @@ import { useRef, useState } from "react"; - -import { iconList } from "@/lib/helpers/iconList"; +import { Accept, useDropzone } from "react-dropzone"; import styles from "./FileInput.module.scss"; -import { FieldHeader } from "../FieldHeader/FieldHeader"; import { Icon } from "../Icon/Icon"; interface 
diff --git a/frontend/lib/components/ui/FileInput/FileInput.module.scss b/frontend/lib/components/ui/FileInput/FileInput.module.scss
index 030051960717..93d8a34b01d4 100644
--- a/frontend/lib/components/ui/FileInput/FileInput.module.scss
+++ b/frontend/lib/components/ui/FileInput/FileInput.module.scss
@@ -1,24 +1,62 @@
 @use "styles/Radius.module.scss";
+@use "styles/ScreenSizes.module.scss";
 @use "styles/Spacings.module.scss";
 @use "styles/Typography.module.scss";
 
-.header_wrapper {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  border: 1px solid var(--border-2);
-  border-radius: Radius.$big;
-  padding: Spacings.$spacing03;
-  cursor: pointer;
-
-  &:hover {
-    background-color: var(--background-2);
-    border-color: var(--accent);
+.file_input_wrapper {
+  width: 100%;
+  height: 200px;
+
+  &.drag_active {
+    .header_wrapper {
+      border: 3px dashed var(--accent);
+      background-color: var(--background-3);
+    }
   }
 
-  .placeholder {
-    color: var(--text-2);
-    font-size: Typography.$small;
+  .header_wrapper {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    border: 1px solid var(--border-2);
+    border-radius: Radius.$big;
+    padding: Spacings.$spacing03;
+    cursor: pointer;
+    height: 100%;
+    width: 100%;
+
+    &:hover {
+      border: 3px dashed var(--accent);
+    }
+
+    &.drag_active {
+      border: 3px dashed var(--accent);
+      background-color: var(--background-3);
+    }
+
+    .box_content {
+      width: 100%;
+      display: flex;
+      flex-direction: column;
+      column-gap: Spacings.$spacing05;
+      justify-content: center;
+      align-items: center;
+      height: 100%;
+
+      .input {
+        display: flex;
+        gap: Spacings.$spacing02;
+        padding: Spacings.$spacing05;
+
+        @media (max-width: ScreenSizes.$small) {
+          flex-direction: column;
+        }
+
+        .clickable {
+          font-weight: bold;
+        }
+      }
+    }
   }
 }
@@ -29,4 +67,4 @@
 .error_message {
   font-size: Typography.$tiny;
   color: var(--dangerous);
-}
\ No newline at end of file
+}
diff --git a/frontend/lib/components/ui/FileInput/FileInput.tsx b/frontend/lib/components/ui/FileInput/FileInput.tsx
index 650a742b25cf..69543b15ad96 100644
--- a/frontend/lib/components/ui/FileInput/FileInput.tsx
+++ b/frontend/lib/components/ui/FileInput/FileInput.tsx
@@ -1,35 +1,36 @@
 import { useRef, useState } from "react";
-
-import { iconList } from "@/lib/helpers/iconList";
+import { Accept, useDropzone } from "react-dropzone";
 
 import styles from "./FileInput.module.scss";
 
-import { FieldHeader } from "../FieldHeader/FieldHeader";
 import { Icon } from "../Icon/Icon";
 
 interface FileInputProps {
   label: string;
-  icon: keyof typeof iconList;
   onFileChange: (file: File) => void;
   acceptedFileTypes?: string[];
 }
 
 export const FileInput = (props: FileInputProps): JSX.Element => {
-  const [currentFile, setcurrentFile] = useState<File | null>(null);
+  const [currentFile, setCurrentFile] = useState<File | null>(null);
   const [errorMessage, setErrorMessage] = useState("");
   const fileInputRef = useRef<HTMLInputElement>(null);
 
-  const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+  const handleFileChange = (file: File) => {
+    const fileExtension = file.name.split(".").pop();
+    if (props.acceptedFileTypes?.includes(fileExtension || "")) {
+      props.onFileChange(file);
+      setCurrentFile(file);
+      setErrorMessage("");
+    } else {
+      setErrorMessage("Wrong extension");
+    }
+  };
+
+  const handleInputChange = (event: React.ChangeEvent<HTMLInputElement>) => {
     const file = event.target.files?.[0];
    if (file) {
-      const fileExtension = file.name.split(".").pop();
-      if (props.acceptedFileTypes?.includes(fileExtension || "")) {
-        props.onFileChange(file);
-        setcurrentFile(file);
-        setErrorMessage("");
-      } else {
-        setErrorMessage("Wrong extension");
-      }
+      handleFileChange(file);
     }
   };
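The next hunk replaces the old `accept` attribute, which naively derived MIME types as `application/${type}` (wrong for images, CSV, and plain text), with an explicit extension-to-MIME map reduced into react-dropzone's Accept shape. A standalone sketch of that reduction — the helper name and the two sample extensions are illustrative, not part of the diff:

```ts
// Illustrative helper; toAccept and the sample extensions are assumptions.
import { Accept } from "react-dropzone";

const mimeTypes: { [key: string]: string } = {
  pdf: "application/pdf",
  png: "image/png",
};

// react-dropzone's Accept maps a MIME type to an array of allowed extensions;
// an empty array means "accept anything matching this MIME type".
export const toAccept = (extensions: string[]): Accept =>
  extensions.reduce((acc, ext) => {
    const mime = mimeTypes[ext];
    if (mime) {
      acc[mime] = [];
    }

    return acc;
  }, {} as Accept);

// toAccept(["pdf", "png"]) -> { "application/pdf": [], "image/png": [] }
```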
@@ -37,30 +38,70 @@ export const FileInput = (props: FileInputProps): JSX.Element => {
     fileInputRef.current?.click();
   };
 
+  const mimeTypes: { [key: string]: string } = {
+    pdf: "application/pdf",
+    doc: "application/msword",
+    docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    xls: "application/vnd.ms-excel",
+    xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    csv: "text/csv",
+    txt: "text/plain",
+    jpg: "image/jpeg",
+    jpeg: "image/jpeg",
+    png: "image/png",
+  };
+
+  const accept: Accept | undefined = props.acceptedFileTypes?.reduce(
+    (acc, type) => {
+      const mimeType = mimeTypes[type];
+      if (mimeType) {
+        acc[mimeType] = [];
+      }
+
+      return acc;
+    },
+    {} as Accept
+  );
+
+  const { getRootProps, getInputProps, isDragActive } = useDropzone({
+    onDrop: (acceptedFiles) => {
+      const file = acceptedFiles[0];
+      if (file) {
+        handleFileChange(file);
+      }
+    },
+    accept,
+  });
+
   return (
-
-
-        Click here to {currentFile ? "change your" : "upload a"} file
-
+
+
+
+
+            Choose file
+            or drag it here
+          {currentFile && (
+            {currentFile.name}
+          )}
-      <input
-        accept={props.acceptedFileTypes
-          ?.map((type) => `application/${type}`)
-          .join(",")}
-        style={{ display: "none" }}
-      />
-      {currentFile && (
-        {currentFile.name}
-      )}
+      {errorMessage !== "" && (
+        {errorMessage}
+      )}
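In the render above, only text nodes and bare +/- markers survive; the surrounding JSX tags were stripped. A self-contained sketch of the dropzone pattern it implements — the class names and visible strings come from the diff, while the element layout and props are assumptions:

```tsx
// Hypothetical markup; only the class names and strings are taken from the diff.
import { useDropzone } from "react-dropzone";

import styles from "./FileInput.module.scss";

export const DropzoneSketch = (props: {
  onFile: (file: File) => void;
}): JSX.Element => {
  const { getRootProps, getInputProps, isDragActive } = useDropzone({
    onDrop: (accepted) => {
      if (accepted[0]) {
        props.onFile(accepted[0]);
      }
    },
  });

  // getRootProps wires click and drag events to the wrapper;
  // isDragActive toggles the dashed-border styling defined in the SCSS.
  return (
    <div
      className={`${styles.file_input_wrapper} ${
        isDragActive ? styles.drag_active : ""
      }`}
      {...getRootProps()}
    >
      <div className={styles.header_wrapper}>
        <div className={styles.box_content}>
          <div className={styles.input}>
            <span className={styles.clickable}>Choose file</span>
            <span>or drag it here</span>
          </div>
        </div>
      </div>
      {/* getInputProps renders the hidden <input type="file"> for click-to-browse */}
      <input {...getInputProps()} />
    </div>
  );
};
```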
diff --git a/frontend/lib/components/ui/Tag/Tag.module.scss b/frontend/lib/components/ui/Tag/Tag.module.scss
index 6a7caa788b08..ca4c6ed34b53 100644
--- a/frontend/lib/components/ui/Tag/Tag.module.scss
+++ b/frontend/lib/components/ui/Tag/Tag.module.scss
@@ -28,4 +28,9 @@
     color: var(--success);
     background-color: var(--background-success);
   }
+
+  &.grey {
+    color: var(--text-4);
+    background-color: var(--background-pending);
+  }
 }
diff --git a/frontend/lib/helpers/table.ts b/frontend/lib/helpers/table.ts
new file mode 100644
index 000000000000..65367c2146b3
--- /dev/null
+++ b/frontend/lib/helpers/table.ts
@@ -0,0 +1,117 @@
+import { UUID } from "crypto";
+
+interface SortConfig<T> {
+  key: keyof T;
+  direction: "ascending" | "descending";
+}
+
+interface HasId {
+  id: number | UUID;
+}
+
+const getAllValues = <T>(obj: T): string[] => {
+  let values: string[] = [];
+  for (const key in obj) {
+    if (Object.prototype.hasOwnProperty.call(obj, key)) {
+      const value = (obj as Record<string, unknown>)[key];
+      if (typeof value === "string" || typeof value === "number") {
+        values.push(value.toString());
+      } else if (Array.isArray(value)) {
+        values = values.concat(value.map((v: string) => v.toString()));
+      } else if (typeof value === "object" && value !== null) {
+        values = values.concat(getAllValues(value));
+      }
+    }
+  }
+
+  return values;
+};
+
+export const filterAndSort = <T>(
+  dataList: T[],
+  searchQuery: string,
+  sortConfig: SortConfig<T>,
+  getComparableValue: (item: T) => unknown
+): T[] => {
+  let filteredList = dataList.filter((item) =>
+    getAllValues(item).some((value) =>
+      value.toLowerCase().includes(searchQuery.toLowerCase())
+    )
+  );
+
+  const compareValues = (
+    a: string | number,
+    b: string | number,
+    direction: "ascending" | "descending"
+  ) => {
+    if (a < b) {
+      return direction === "ascending" ? -1 : 1;
+    }
+    if (a > b) {
+      return direction === "ascending" ? 1 : -1;
+    }
+
+    return 0;
+  };
+
+  // Apply the sort configuration
+  if (sortConfig.key) {
+    filteredList = filteredList.sort((a, b) => {
+      const aValue = getComparableValue(a);
+      const bValue = getComparableValue(b);
+
+      // Only compare values that are strings or numbers
+      if (
+        (typeof aValue === "string" || typeof aValue === "number") &&
+        (typeof bValue === "string" || typeof bValue === "number")
+      ) {
+        return compareValues(aValue, bValue, sortConfig.direction);
+      }
+
+      return 0;
+    });
+  }
+
+  return filteredList;
+};
+
+export const updateSelectedItems = <T extends HasId>(params: {
+  item: T;
+  index: number;
+  event: React.MouseEvent;
+  lastSelectedIndex: number | null;
+  filteredList: T[];
+  selectedItems: T[];
+}): { selectedItems: T[]; lastSelectedIndex: number | null } => {
+  const { item, index, event, lastSelectedIndex, filteredList, selectedItems } =
+    params;
+
+  if (event.shiftKey && lastSelectedIndex !== null) {
+    const start = Math.min(lastSelectedIndex, index);
+    const end = Math.max(lastSelectedIndex, index);
+    const range = filteredList.slice(start, end + 1);
+
+    const newSelected = [...selectedItems];
+    range.forEach((rangeItem) => {
+      if (
+        !newSelected.some((selectedItem) => selectedItem.id === rangeItem.id)
+      ) {
+        newSelected.push(rangeItem);
+      }
+    });
+
+    return { selectedItems: newSelected, lastSelectedIndex: index };
+  } else {
+    const isSelected = selectedItems.some(
+      (selectedItem) => selectedItem.id === item.id
+    );
+    const newSelectedItems = isSelected
+      ? selectedItems.filter((selectedItem) => selectedItem.id !== item.id)
+      : [...selectedItems, item];
+
+    return {
+      selectedItems: newSelectedItems,
+      lastSelectedIndex: isSelected ? null : index,
+    };
+  }
+};
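Both helpers are pure functions, so a table component can own the state and delegate to them. A usage sketch — the Task shape, hook name, and sort key are illustrative, not taken from this diff:

```ts
// Illustrative consumer; Task, useTaskTable, and the sort key are assumptions.
import React, { useState } from "react";

import { filterAndSort, updateSelectedItems } from "@/lib/helpers/table";

interface Task {
  id: number;
  name: string;
}

export const useTaskTable = (tasks: Task[]) => {
  const [searchQuery, setSearchQuery] = useState("");
  const [selectedItems, setSelectedItems] = useState<Task[]>([]);
  const [lastSelectedIndex, setLastSelectedIndex] = useState<number | null>(
    null
  );

  // Full-text filter over every string/number field, then sort by name.
  const filteredList = filterAndSort(
    tasks,
    searchQuery,
    { key: "name", direction: "ascending" },
    (task) => task.name
  );

  // Shift-click extends the selection from the last clicked row.
  const onRowClick = (task: Task, index: number, event: React.MouseEvent) => {
    const next = updateSelectedItems({
      item: task,
      index,
      event,
      lastSelectedIndex,
      filteredList,
      selectedItems,
    });
    setSelectedItems(next.selectedItems);
    setLastSelectedIndex(next.lastSelectedIndex);
  };

  return { filteredList, selectedItems, setSearchQuery, onRowClick };
};
```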
diff --git a/frontend/next.config.js b/frontend/next.config.js
index a6dc189b5d11..f0f920171feb 100644
--- a/frontend/next.config.js
+++ b/frontend/next.config.js
@@ -56,6 +56,7 @@ const ContentSecurityPolicy = {
     "*.intercomcdn.com",
     "https://*.vercel.app",
     process.env.NEXT_PUBLIC_FRONTEND_URL,
+    "http://host.docker.internal:54321",
   ],
   "connect-src": [
     "'self'",
@@ -72,7 +73,8 @@ const ContentSecurityPolicy = {
     "https://vitals.vercel-insights.com/v1/vitals",
     "https://us.posthog.com",
     "*.posthog.com",
-    "https://us.i.posthog.com"
+    "https://us.i.posthog.com",
+    "http://host.docker.internal:54321",
   ],
   "img-src": [
     "'self'",
diff --git a/frontend/package.json b/frontend/package.json
index 17c035a961bb..6eb729f700e2 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -72,6 +72,7 @@
     "eslint": "8.46.0",
     "eslint-config-next": "^14.1.0",
     "eslint-plugin-prefer-arrow": "1.2.3",
+    "file-saver": "^2.0.5",
     "framer-motion": "10.15.0",
     "front-matter": "4.0.2",
     "headlessui": "0.0.0",
@@ -117,6 +118,7 @@
     "@tailwindcss/typography": "0.5.9",
     "@testing-library/jest-dom": "6.1.3",
     "@testing-library/react": "14.0.0",
+    "@types/file-saver": "^2.0.7",
     "@types/prismjs": "^1.26.4",
     "@types/react-katex": "^3.0.4",
     "@types/uuid": "^10.0.0",
diff --git a/frontend/styles/_Variables.module.scss b/frontend/styles/_Variables.module.scss
index b9f1881fb841..075cc8c8ed76 100644
--- a/frontend/styles/_Variables.module.scss
+++ b/frontend/styles/_Variables.module.scss
@@ -2,3 +2,5 @@ $searchBarHeight: 62px;
 $pageHeaderHeight: 48px;
 $menuWidth: 230px;
 $brainButtonHeight: 105px;
+$menuSectionWidth: 175px;
+$assistantInputWidth: 300px;
diff --git a/frontend/yarn.lock b/frontend/yarn.lock
index 5f15c8a5d3c2..b993d6c9f699 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -2750,6 +2750,11 @@
   resolved "https://registry.npmjs.org/@types/estree/-/estree-1.0.1.tgz"
   integrity sha512-LG4opVs2ANWZ1TJoKc937iMmNstM/d0ae1vNbnBvBhqCSezgVUOzcLCqbI5elV8Vy6WKwKjaqR+zO9VKirBBCA==
 
+"@types/file-saver@^2.0.7":
+  version "2.0.7"
+  resolved "https://registry.yarnpkg.com/@types/file-saver/-/file-saver-2.0.7.tgz#8dbb2f24bdc7486c54aa854eb414940bbd056f7d"
+  integrity sha512-dNKVfHd/jk0SkR/exKGj2ggkB45MAkzvWCaqLUUgkyjITkGNzH8H+yUwr+BLJUBjZOe9w8X3wgmXhZDRg1ED6A==
+
 "@types/hast@^2.0.0":
   version "2.3.5"
   resolved "https://registry.npmjs.org/@types/hast/-/hast-2.3.5.tgz"
@@ -4760,6 +4765,11 @@ file-entry-cache@^6.0.1:
   dependencies:
     flat-cache "^3.0.4"
 
+file-saver@^2.0.5:
+  version "2.0.5"
+  resolved "https://registry.npmjs.org/file-saver/-/file-saver-2.0.5.tgz"
+  integrity sha512-P9bmyZ3h/PRG+Nzga+rbdI4OEpNDzAVyy74uVO9ATgzLK6VtAsYybF/+TOCvrc0MO793d6+42lLyZTw7/ArVzA==
+
 file-selector@^0.6.0:
   version "0.6.0"
   resolved "https://registry.npmjs.org/file-selector/-/file-selector-0.6.0.tgz"
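The only runtime dependency added here is file-saver (plus its type stubs), which suggests assistant task results become downloadable from the browser. A minimal sketch of typical usage — the endpoint and filename are hypothetical, not taken from this diff:

```ts
// Hypothetical download helper; the endpoint and filename are assumptions.
import { saveAs } from "file-saver";

export const downloadTaskResult = async (taskId: string): Promise<void> => {
  const response = await fetch(`/assistants/task/${taskId}/download`);
  const blob = await response.blob();

  // saveAs (from file-saver) triggers the browser's save dialog for the blob.
  saveAs(blob, `task-${taskId}.pdf`);
};
```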