diff --git a/backend/app/alembic/versions/041_add_config_in_evals_run_table.py b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py
new file mode 100644
index 00000000..449768b3
--- /dev/null
+++ b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
+"""add config in evals run table
+
+Revision ID: 041
+Revises: 040
+Create Date: 2025-12-15 14:03:22.082746
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel.sql.sqltypes
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "041"
+down_revision = "040"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config_id",
+            sa.Uuid(),
+            nullable=True,
+            comment="Reference to the stored config used",
+        ),
+    )
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config_version",
+            sa.Integer(),
+            nullable=True,
+            comment="Version of the config used",
+        ),
+    )
+    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.drop_column("evaluation_run", "config")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "evaluation_run",
+        sa.Column(
+            "config",
+            postgresql.JSONB(astext_type=sa.Text()),
+            autoincrement=False,
+            nullable=False,
+            comment="Evaluation configuration (model, instructions, etc.)",
+        ),
+    )
+    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_column("evaluation_run", "config_version")
+    op.drop_column("evaluation_run", "config_id")
+    # ### end Alembic commands ###
diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py
index 058950d6..5d574137 100644
--- a/backend/app/api/routes/evaluation.py
+++ b/backend/app/api/routes/evaluation.py
@@ -3,12 +3,21 @@
 import logging
 import re
 from pathlib import Path
-
-from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile
+from uuid import UUID
+
+from fastapi import (
+    APIRouter,
+    Body,
+    File,
+    Form,
+    HTTPException,
+    Query,
+    UploadFile,
+)
 
 from app.api.deps import AuthContextDep, SessionDep
 from app.core.cloud import get_cloud_storage
-from app.crud.assistants import get_assistant_by_id
+from app.crud.config.version import ConfigVersionCrud
 from app.crud.evaluations import (
     create_evaluation_dataset,
     create_evaluation_run,
@@ -27,6 +36,9 @@
     DatasetUploadResponse,
     EvaluationRunPublic,
 )
+from app.models.llm.request import LLMCallConfig
+from app.services.llm.jobs import resolve_config_blob
+from app.services.llm.providers import LLMProvider
 from app.utils import (
     APIResponse,
     get_langfuse_client,
@@ -34,6 +46,7 @@
     load_description,
 )
 
+
 logger = logging.getLogger(__name__)
 
 # File upload security constants
@@ -430,20 +443,9 @@ def evaluate(
     experiment_name: str = Body(
         ..., description="Name for this evaluation experiment/run"
     ),
-    config: dict = Body(default_factory=dict, description="Evaluation configuration"),
-    assistant_id: str
-    | None = Body(
-        None, description="Optional assistant ID to fetch configuration from"
-    ),
+    config_id: UUID = Body(..., description="Stored config ID"),
+    config_version: int = Body(..., ge=1, description="Stored config version"),
 ) -> APIResponse[EvaluationRunPublic]:
-    logger.info(
-        f"[evaluate] Starting evaluation | experiment_name={experiment_name} | "
-        f"dataset_id={dataset_id} | "
-        f"org_id={auth_context.organization.id} | "
- f"assistant_id={assistant_id} | " - f"config_keys={list(config.keys())}" - ) - # Step 1: Fetch dataset from database dataset = get_dataset_by_id( session=_session, @@ -459,12 +461,6 @@ def evaluate( f"organization/project", ) - logger.info( - f"[evaluate] Found dataset | id={dataset.id} | name={dataset.name} | " - f"object_store_url={'present' if dataset.object_store_url else 'None'} | " - f"langfuse_id={dataset.langfuse_dataset_id}" - ) - dataset_name = dataset.name # Get API clients @@ -487,63 +483,35 @@ def evaluate( "Please ensure Langfuse credentials were configured when the dataset was created.", ) - # Handle assistant_id if provided - if assistant_id: - # Fetch assistant details from database - assistant = get_assistant_by_id( - session=_session, - assistant_id=assistant_id, - project_id=auth_context.project.id, - ) - - if not assistant: - raise HTTPException( - status_code=404, detail=f"Assistant {assistant_id} not found" - ) + config_version_crud = ConfigVersionCrud( + session=_session, config_id=config_id, project_id=auth_context.project.id + ) - logger.info( - f"[evaluate] Found assistant in DB | id={assistant.id} | " - f"model={assistant.model} | instructions=" - f"{assistant.instructions[:50] if assistant.instructions else 'None'}..." + config, error = resolve_config_blob( + config_crud=config_version_crud, + config=LLMCallConfig(id=config_id, version=config_version), + ) + if error: + raise HTTPException( + status_code=400, + detail=f"Failed to resolve config from stored config: {error}", ) - - # Build config from assistant (use provided config values to override - # if present) - config = { - "model": config.get("model", assistant.model), - "instructions": config.get("instructions", assistant.instructions), - "temperature": config.get("temperature", assistant.temperature), - } - - # Add tools if vector stores are available - vector_store_ids = config.get( - "vector_store_ids", assistant.vector_store_ids or [] + elif config.completion.provider != LLMProvider.OPENAI: + raise HTTPException( + status_code=422, + detail="Only 'openai' provider is supported for evaluation configs", ) - if vector_store_ids and len(vector_store_ids) > 0: - config["tools"] = [ - { - "type": "file_search", - "vector_store_ids": vector_store_ids, - } - ] - - logger.info("[evaluate] Using config from assistant") - else: - logger.info("[evaluate] Using provided config directly") - # Validate that config has minimum required fields - if not config.get("model"): - raise HTTPException( - status_code=400, - detail="Config must include 'model' when assistant_id is not provided", - ) - # Create EvaluationRun record + logger.info("[evaluate] Successfully resolved config from config management") + + # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, run_name=experiment_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config, + config_id=config_id, + config_version=config_version, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) @@ -555,7 +523,7 @@ def evaluate( openai_client=openai_client, session=_session, eval_run=eval_run, - config=config, + config=config.completion.params, ) logger.info( diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index 5ca0aacd..8344c3e9 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -5,6 +5,7 @@ create_evaluation_run, get_evaluation_run_by_id, list_evaluation_runs, + 
 )
 from app.crud.evaluations.cron import (
     process_all_pending_evaluations,
@@ -39,6 +40,7 @@
     "create_evaluation_run",
     "get_evaluation_run_by_id",
     "list_evaluation_runs",
+    "resolve_model_from_config",
     # Cron
     "process_all_pending_evaluations",
     "process_all_pending_evaluations_sync",
diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py
index b2b118df..b64aa820 100644
--- a/backend/app/crud/evaluations/core.py
+++ b/backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
 import logging
 from typing import Any
+from uuid import UUID
 
 from langfuse import Langfuse
 from sqlmodel import Session, select
 
 from app.core.util import now
+from app.crud.config.version import ConfigVersionCrud
 from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
 from app.models import EvaluationRun
+from app.models.llm.request import LLMCallConfig
+from app.services.llm.jobs import resolve_config_blob
 
 
 logger = logging.getLogger(__name__)
@@ -16,7 +20,8 @@ def create_evaluation_run(
     run_name: str,
     dataset_name: str,
     dataset_id: int,
-    config: dict,
+    config_id: UUID,
+    config_version: int,
     organization_id: int,
     project_id: int,
 ) -> EvaluationRun:
@@ -28,7 +33,8 @@ def create_evaluation_run(
         run_name: Name of the evaluation run/experiment
         dataset_name: Name of the dataset being used
         dataset_id: ID of the dataset
-        config: Configuration dict for the evaluation
+        config_id: UUID of the stored config
+        config_version: Version number of the config
         organization_id: Organization ID
         project_id: Project ID
@@ -39,7 +45,8 @@ def create_evaluation_run(
         run_name=run_name,
         dataset_name=dataset_name,
         dataset_id=dataset_id,
-        config=config,
+        config_id=config_id,
+        config_version=config_version,
         status="pending",
         organization_id=organization_id,
         project_id=project_id,
@@ -56,7 +63,10 @@ def create_evaluation_run(
         logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
         raise
 
-    logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
+    logger.info(
+        f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
+        f"config_id={config_id}, config_version={config_version}"
+    )
 
     return eval_run
 
@@ -293,3 +303,47 @@ def save_score(
         f"traces={len(score.get('traces', []))}"
     )
     return eval_run
+
+
+def resolve_model_from_config(
+    session: Session,
+    eval_run: EvaluationRun,
+) -> str:
+    """
+    Resolve the model name from the evaluation run's config.
+
+    Args:
+        session: Database session
+        eval_run: EvaluationRun instance
+
+    Returns:
+        Model name from config
+
+    Raises:
+        ValueError: If config is missing, invalid, or has no model
+    """
+    if not eval_run.config_id or not eval_run.config_version:
+        raise ValueError(
+            f"Evaluation run {eval_run.id} has no config reference "
+            f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
+        )
+
+    config_version_crud = ConfigVersionCrud(
+        session=session,
+        config_id=eval_run.config_id,
+        project_id=eval_run.project_id,
+    )
+
+    config, error = resolve_config_blob(
+        config_crud=config_version_crud,
+        config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
+    )
+
+    if error or config is None:
+        raise ValueError(
+            f"Config resolution failed for evaluation {eval_run.id} "
+            f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
+        )
+
+    model = config.completion.params.get("model")
+    return model
diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py
index 70e37421..22bd4852 100644
--- a/backend/app/crud/evaluations/embeddings.py
+++ b/backend/app/crud/evaluations/embeddings.py
@@ -364,19 +364,7 @@ def start_embedding_batch(
     logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")
 
     # Get embedding model from config (default: text-embedding-3-large)
-    embedding_model = eval_run.config.get(
-        "embedding_model", "text-embedding-3-large"
-    )
-
-    # Validate and fallback to default if invalid
-    try:
-        validate_embedding_model(embedding_model)
-    except ValueError as e:
-        logger.warning(
-            f"Invalid embedding model '{embedding_model}' in config: {e}. "
-            f"Falling back to text-embedding-3-large"
-        )
-        embedding_model = "text-embedding-3-large"
+    embedding_model = "text-embedding-3-large"
 
     # Step 1: Build embedding JSONL with trace_ids
     jsonl_data = build_embedding_jsonl(
diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py
index 12b89266..653a2baf 100644
--- a/backend/app/crud/evaluations/processing.py
+++ b/backend/app/crud/evaluations/processing.py
@@ -26,7 +26,7 @@
     upload_batch_results_to_object_store,
 )
 from app.crud.evaluations.batch import fetch_dataset_items
-from app.crud.evaluations.core import update_evaluation_run
+from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config
 from app.crud.evaluations.embeddings import (
     calculate_average_similarity,
     parse_embedding_results,
@@ -253,16 +253,16 @@ async def process_completed_evaluation(
     if not results:
         raise ValueError("No valid results found in batch output")
 
-    # Extract model from config for cost tracking
-    model = eval_run.config.get("model") if eval_run.config else None
-
     # Step 5: Create Langfuse dataset run with traces
+    # Use model stored at creation time for cost tracking
+    model = resolve_model_from_config(session=session, eval_run=eval_run)
+
     trace_id_mapping = create_langfuse_dataset_run(
         langfuse=langfuse,
         dataset_name=eval_run.dataset_name,
+        model=model,
         run_name=eval_run.run_name,
         results=results,
-        model=model,
     )
 
     # Store object store URL in database
diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py
index f99fbb27..6ae4542f 100644
--- a/backend/app/models/evaluation.py
+++ b/backend/app/models/evaluation.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional
+from uuid import UUID
 
 from pydantic import BaseModel, Field
 from sqlalchemy import Column, Index, Text, UniqueConstraint
@@ -193,15 +194,17 @@ class EvaluationRun(SQLModel, table=True):
         sa_column_kwargs={"comment": "Name of the Langfuse dataset used"},
     )
 
-    # Config field - dict requires sa_column
-    config: dict[str, Any] = SQLField(
-        default_factory=dict,
-        sa_column=Column(
-            JSONB,
-            nullable=False,
-            comment="Evaluation configuration (model, instructions, etc.)",
-        ),
-        description="Evaluation configuration",
+    config_id: UUID = SQLField(
+        foreign_key="config.id",
+        nullable=True,
+        description="Reference to the stored config used for this evaluation",
+        sa_column_kwargs={"comment": "Reference to the stored config used"},
+    )
+    config_version: int = SQLField(
+        nullable=True,
+        ge=1,
+        description="Version of the config used for this evaluation",
+        sa_column_kwargs={"comment": "Version of the config used"},
     )
 
     # Dataset reference
@@ -339,7 +342,8 @@ class EvaluationRunPublic(SQLModel):
     id: int
     run_name: str
     dataset_name: str
-    config: dict[str, Any]
+    config_id: UUID | None
+    config_version: int | None
     dataset_id: int
     batch_job_id: int | None
     embedding_batch_job_id: int | None
diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
index c4eb3f0b..ca3d3dcb 100644
--- a/backend/app/tests/api/routes/test_evaluation.py
+++ b/backend/app/tests/api/routes/test_evaluation.py
@@ -1,11 +1,13 @@
 import io
 from unittest.mock import Mock, patch
+from uuid import uuid4
 
 import pytest
 from sqlmodel import select
 
 from app.crud.evaluations.batch import build_evaluation_jsonl
 from app.models import EvaluationDataset, EvaluationRun
+from app.tests.utils.test_data import create_test_config
 
 
 # Helper function to create CSV file-like object
@@ -494,16 +496,20 @@ def sample_evaluation_config(self):
         }
 
     def test_start_batch_evaluation_invalid_dataset_id(
-        self, client, user_api_key_header, sample_evaluation_config
+        self, client, user_api_key_header, db, user_api_key
     ):
         """Test batch evaluation fails with invalid dataset_id."""
+        # Create a valid config to use
+        config = create_test_config(db, project_id=user_api_key.project.id)
+
         # Try to start evaluation with non-existent dataset_id
         response = client.post(
             "/api/v1/evaluations",
             json={
                 "experiment_name": "test_evaluation_run",
                 "dataset_id": 99999,  # Non-existent
-                "config": sample_evaluation_config,
+                "config_id": str(config.id),
+                "config_version": 1,
             },
             headers=user_api_key_header,
         )
@@ -516,32 +522,27 @@ def test_start_batch_evaluation_invalid_dataset_id(
         assert "not found" in error_str.lower() or "not accessible" in error_str.lower()
 
     def test_start_batch_evaluation_missing_model(self, client, user_api_key_header):
-        """Test batch evaluation fails when model is missing from config."""
-        # We don't need a real dataset for this test - the validation should happen
-        # before dataset lookup. Use any dataset_id and expect config validation error
-        invalid_config = {
-            "instructions": "You are a helpful assistant",
-            "temperature": 0.5,
-        }
-
+        """Test batch evaluation fails with invalid config_id."""
+        # Test with a non-existent config_id (random UUID)
         response = client.post(
             "/api/v1/evaluations",
             json={
-                "experiment_name": "test_no_model",
-                "dataset_id": 1,  # Dummy ID, error should come before this is checked
-                "config": invalid_config,
+                "experiment_name": "test_no_config",
+                "dataset_id": 1,  # Dummy ID, config validation happens first
+                "config_id": str(uuid4()),  # Non-existent config
+                "config_version": 1,
             },
             headers=user_api_key_header,
         )
 
-        # Should fail with either 400 (model missing) or 404 (dataset not found)
+        # Should fail with either 400 (config not found) or 404 (dataset/config not found)
         assert response.status_code in [400, 404]
         response_data = response.json()
         error_str = response_data.get(
             "detail", response_data.get("message", str(response_data))
         )
-        # Should fail with either "model" missing or "dataset not found" (both acceptable)
-        assert "model" in error_str.lower() or "not found" in error_str.lower()
+        # Should mention config or not found
+        assert "config" in error_str.lower() or "not found" in error_str.lower()
 
     def test_start_batch_evaluation_without_authentication(
         self, client, sample_evaluation_config
@@ -728,11 +729,15 @@ def test_get_evaluation_run_trace_info_not_completed(
         self, client, user_api_key_header, db, user_api_key, create_test_dataset
     ):
         """Test requesting trace info for incomplete evaluation returns error."""
+        # Create a config for the evaluation run
+        config = create_test_config(db, project_id=user_api_key.project.id)
+
         eval_run = EvaluationRun(
             run_name="test_pending_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="pending",
             total_items=3,
             organization_id=user_api_key.organization_id,
@@ -759,11 +764,15 @@ def test_get_evaluation_run_trace_info_completed(
         self, client, user_api_key_header, db, user_api_key, create_test_dataset
     ):
         """Test requesting trace info for completed evaluation returns cached scores."""
+        # Create a config for the evaluation run
+        config = create_test_config(db, project_id=user_api_key.project.id)
+
         eval_run = EvaluationRun(
             run_name="test_completed_run",
             dataset_name=create_test_dataset.name,
             dataset_id=create_test_dataset.id,
-            config={"model": "gpt-4o"},
+            config_id=config.id,
+            config_version=1,
             status="completed",
             total_items=3,
             score={