60 changes: 60 additions & 0 deletions backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
"""add config in evals run table

Revision ID: 041
Revises: 040
Create Date: 2025-12-15 14:03:22.082746

"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "041"
down_revision = "040"
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"evaluation_run",
sa.Column(
"config_id",
sa.Uuid(),
nullable=True,
comment="Reference to the stored config used",
),
)
op.add_column(
"evaluation_run",
sa.Column(
"config_version",
sa.Integer(),
nullable=True,
comment="Version of the config used",
),
)
op.create_foreign_key("evaluation_run_config_id_fkey", "evaluation_run", "config", ["config_id"], ["id"])
op.drop_column("evaluation_run", "config")
# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"evaluation_run",
sa.Column(
"config",
postgresql.JSONB(astext_type=sa.Text()),
autoincrement=False,
nullable=False,
comment="Evaluation configuration (model, instructions, etc.)",
),
)
op.drop_constraint("evaluation_run_config_id_fkey", "evaluation_run", type_="foreignkey")
op.drop_column("evaluation_run", "config_version")
op.drop_column("evaluation_run", "config_id")
# ### end Alembic commands ###
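One caveat worth noting on this migration: the upgrade drops the existing config JSONB payloads outright, and the downgrade re-adds the column with nullable=False, which will fail if any evaluation_run rows were inserted after the upgrade. A minimal sketch of a more tolerant downgrade step, assuming an empty-object placeholder is acceptable (the server_default and the follow-up alter_column are illustrative additions, not part of this revision):

    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=False,
            # temporary default so rows created after the upgrade satisfy NOT NULL
            server_default=sa.text("'{}'::jsonb"),
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
    # drop the default again so application code must always supply a config
    op.alter_column("evaluation_run", "config", server_default=None)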
110 changes: 39 additions & 71 deletions backend/app/api/routes/evaluation.py
@@ -3,12 +3,21 @@
import logging
import re
from pathlib import Path

from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile
from uuid import UUID

from fastapi import (
APIRouter,
Body,
File,
Form,
HTTPException,
Query,
UploadFile,
)

from app.api.deps import AuthContextDep, SessionDep
from app.core.cloud import get_cloud_storage
from app.crud.assistants import get_assistant_by_id
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations import (
create_evaluation_dataset,
create_evaluation_run,
@@ -27,13 +36,17 @@
DatasetUploadResponse,
EvaluationRunPublic,
)
from app.models.llm.request import LLMCallConfig
from app.services.llm.jobs import resolve_config_blob
from app.services.llm.providers import LLMProvider
from app.utils import (
APIResponse,
get_langfuse_client,
get_openai_client,
load_description,
)


logger = logging.getLogger(__name__)

# File upload security constants
@@ -430,20 +443,9 @@ def evaluate(
experiment_name: str = Body(
..., description="Name for this evaluation experiment/run"
),
config: dict = Body(default_factory=dict, description="Evaluation configuration"),
assistant_id: str
| None = Body(
None, description="Optional assistant ID to fetch configuration from"
),
config_id: UUID = Body(..., description="Stored config ID"),
config_version: int = Body(..., ge=1, description="Stored config version"),
) -> APIResponse[EvaluationRunPublic]:
logger.info(
f"[evaluate] Starting evaluation | experiment_name={experiment_name} | "
f"dataset_id={dataset_id} | "
f"org_id={auth_context.organization.id} | "
f"assistant_id={assistant_id} | "
f"config_keys={list(config.keys())}"
)

# Step 1: Fetch dataset from database
dataset = get_dataset_by_id(
session=_session,
@@ -459,12 +461,6 @@ def evaluate(
f"organization/project",
)

logger.info(
f"[evaluate] Found dataset | id={dataset.id} | name={dataset.name} | "
f"object_store_url={'present' if dataset.object_store_url else 'None'} | "
f"langfuse_id={dataset.langfuse_dataset_id}"
)

dataset_name = dataset.name

# Get API clients
@@ -487,63 +483,35 @@
"Please ensure Langfuse credentials were configured when the dataset was created.",
)

# Handle assistant_id if provided
if assistant_id:
# Fetch assistant details from database
assistant = get_assistant_by_id(
session=_session,
assistant_id=assistant_id,
project_id=auth_context.project.id,
)

if not assistant:
raise HTTPException(
status_code=404, detail=f"Assistant {assistant_id} not found"
)
config_version_crud = ConfigVersionCrud(
session=_session, config_id=config_id, project_id=auth_context.project.id
)

logger.info(
f"[evaluate] Found assistant in DB | id={assistant.id} | "
f"model={assistant.model} | instructions="
f"{assistant.instructions[:50] if assistant.instructions else 'None'}..."
config, error = resolve_config_blob(
config_crud=config_version_crud,
config=LLMCallConfig(id=config_id, version=config_version),
)
if error:
raise HTTPException(
status_code=400,
detail=f"Failed to resolve config from stored config: {error}",
)

# Build config from assistant (use provided config values to override
# if present)
config = {
"model": config.get("model", assistant.model),
"instructions": config.get("instructions", assistant.instructions),
"temperature": config.get("temperature", assistant.temperature),
}

# Add tools if vector stores are available
vector_store_ids = config.get(
"vector_store_ids", assistant.vector_store_ids or []
elif config.completion.provider != LLMProvider.OPENAI:
raise HTTPException(
status_code=422,
detail="Only 'openai' provider is supported for evaluation configs",
)
if vector_store_ids and len(vector_store_ids) > 0:
config["tools"] = [
{
"type": "file_search",
"vector_store_ids": vector_store_ids,
}
]

logger.info("[evaluate] Using config from assistant")
else:
logger.info("[evaluate] Using provided config directly")
# Validate that config has minimum required fields
if not config.get("model"):
raise HTTPException(
status_code=400,
detail="Config must include 'model' when assistant_id is not provided",
)

# Create EvaluationRun record
logger.info("[evaluate] Successfully resolved config from config management")

# Create EvaluationRun record with config references
eval_run = create_evaluation_run(
session=_session,
run_name=experiment_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
config=config,
config_id=config_id,
config_version=config_version,
organization_id=auth_context.organization.id,
project_id=auth_context.project.id,
)
@@ -555,7 +523,7 @@ def evaluate(
openai_client=openai_client,
session=_session,
eval_run=eval_run,
config=config,
config=config.completion.params,
)

logger.info(
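For orientation, a hedged sketch of a client call after this change: the inline config dict and assistant_id are replaced by a stored config reference. Only experiment_name, config_id, and config_version are confirmed by the new Body(...) parameters above; the route path, auth header, and the dataset_id field below are assumptions, since they sit in collapsed parts of this hunk.

    import requests

    response = requests.post(
        "https://<host>/api/v1/evaluation/evaluate",  # assumed route path
        headers={"X-API-KEY": "<key>"},  # assumed auth scheme
        json={
            "dataset_id": 42,  # assumed field name; the handler reads a dataset_id
            "experiment_name": "baseline-run-1",
            "config_id": "1f0b6c1e-9d1a-4f7b-8a9e-3c2d5e6f7a8b",  # stored config UUID
            "config_version": 1,
        },
        timeout=60,
    )
    response.raise_for_status()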
2 changes: 2 additions & 0 deletions backend/app/crud/evaluations/__init__.py
@@ -5,6 +5,7 @@
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
resolve_model_from_config,
)
from app.crud.evaluations.cron import (
process_all_pending_evaluations,
@@ -39,6 +40,7 @@
"create_evaluation_run",
"get_evaluation_run_by_id",
"list_evaluation_runs",
"resolve_model_from_config",
# Cron
"process_all_pending_evaluations",
"process_all_pending_evaluations_sync",
62 changes: 58 additions & 4 deletions backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
import logging
from typing import Any
from uuid import UUID

from langfuse import Langfuse
from sqlmodel import Session, select

from app.core.util import now
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
from app.models import EvaluationRun
from app.models.llm.request import LLMCallConfig
from app.services.llm.jobs import resolve_config_blob

logger = logging.getLogger(__name__)

@@ -16,7 +20,8 @@ def create_evaluation_run(
run_name: str,
dataset_name: str,
dataset_id: int,
config: dict,
config_id: UUID,
config_version: int,
organization_id: int,
project_id: int,
) -> EvaluationRun:
@@ -28,7 +33,8 @@ def create_evaluation_run(
run_name: Name of the evaluation run/experiment
dataset_name: Name of the dataset being used
dataset_id: ID of the dataset
config: Configuration dict for the evaluation
config_id: UUID of the stored config
config_version: Version number of the config
organization_id: Organization ID
project_id: Project ID

@@ -39,7 +45,8 @@
run_name=run_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
config=config,
config_id=config_id,
config_version=config_version,
status="pending",
organization_id=organization_id,
project_id=project_id,
@@ -56,7 +63,10 @@
logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
raise

logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
logger.info(
f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
f"config_id={config_id}, config_version={config_version}"
)

return eval_run

@@ -293,3 +303,47 @@ def save_score(
f"traces={len(score.get('traces', []))}"
)
return eval_run


def resolve_model_from_config(
session: Session,
eval_run: EvaluationRun,
) -> str:
"""
Resolve the model name from the evaluation run's config.

Args:
session: Database session
eval_run: EvaluationRun instance

Returns:
Model name from config

Raises:
ValueError: If config is missing, invalid, or has no model
"""
if not eval_run.config_id or not eval_run.config_version:
raise ValueError(
f"Evaluation run {eval_run.id} has no config reference "
f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
)

config_version_crud = ConfigVersionCrud(
session=session,
config_id=eval_run.config_id,
project_id=eval_run.project_id,
)

config, error = resolve_config_blob(
config_crud=config_version_crud,
config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
)

if error or config is None:
raise ValueError(
f"Config resolution failed for evaluation {eval_run.id} "
f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
)

model = config.completion.params.get("model")
if not model:
    raise ValueError(f"Config {eval_run.config_id} v{eval_run.config_version} does not specify a model")
return model
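A brief usage sketch for the new helper; the calling context (session, eval_run, logger) is assumed, while the signature and the ValueError contract come from the function above. Callers that previously read the model out of eval_run.config would now resolve it through the stored config and handle missing or stale references explicitly:

    try:
        model = resolve_model_from_config(session=session, eval_run=eval_run)
    except ValueError as exc:
        logger.error(f"Cannot process evaluation run {eval_run.id}: {exc}")
        raise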
14 changes: 1 addition & 13 deletions backend/app/crud/evaluations/embeddings.py
@@ -364,19 +364,7 @@ def start_embedding_batch(
logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")

# Embedding model is now fixed to text-embedding-3-large (config-based selection removed)
embedding_model = eval_run.config.get(
"embedding_model", "text-embedding-3-large"
)

# Validate and fallback to default if invalid
try:
validate_embedding_model(embedding_model)
except ValueError as e:
logger.warning(
f"Invalid embedding model '{embedding_model}' in config: {e}. "
f"Falling back to text-embedding-3-large"
)
embedding_model = "text-embedding-3-large"
embedding_model = "text-embedding-3-large"

# Step 1: Build embedding JSONL with trace_ids
jsonl_data = build_embedding_jsonl(