
Langfuse eval library & runner #224


Draft: wants to merge 1 commit into main.
4 changes: 4 additions & 0 deletions .env.example
@@ -73,3 +73,7 @@ LANGFUSE_HOST="this_is_not_a_secret"
# Misc

CI=""

+# For CI and running CLI commands as demo org
+LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY=""
+LOCAL_CREDENTIALS_API_KEY=""
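A minimal sketch of how a CLI entry point might read these at startup; the consuming code is an assumption, only the variable names come from this diff:

import os

# Hypothetical startup check; variable names taken from .env.example above.
openai_key = os.environ.get("LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY", "")
demo_api_key = os.environ.get("LOCAL_CREDENTIALS_API_KEY", "")
if not (openai_key and demo_api_key):
    raise SystemExit("Set the LOCAL_CREDENTIALS_* variables in .env first.")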
6 changes: 6 additions & 0 deletions Taskfile.yml
@@ -0,0 +1,6 @@
version: '3'

includes:
  backend:
    taskfile: ./backend/Taskfile.yml
    dir: ./backend
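With this include, Task namespaces the backend tasks under backend:, so from the repository root they run as, for example, task backend:eval or task backend:seed.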
21 changes: 21 additions & 0 deletions backend/Taskfile.yml
@@ -0,0 +1,21 @@
version: '3'

tasks:
  eval:
    desc: Run evaluation on kunji dataset
    dotenv:
      - ../.env
    cmds:
      - uv run ai-cli eval responses {{.CLI_ARGS}}

  seed:
    desc: Seed the database with initial data
    dotenv:
      - ../.env
    cmds:
      - uv run python -m app.seed_data.seed_data

  logs:
    desc: Show backend logs from docker compose
    cmds:
      - docker compose -f ../docker-compose.yml logs -f backend
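Task forwards anything after a literal -- on the command line into {{.CLI_ARGS}}, so extra options reach the eval CLI as, for example: task backend:eval -- --model gpt-4o. The --model flag here is illustrative; the actual options are whatever ai-cli eval responses defines.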
39 changes: 4 additions & 35 deletions backend/app/cli/bench/commands.py
@@ -1,6 +1,5 @@
import os
import csv
import json
import logging
import time
from datetime import datetime
@@ -11,6 +10,8 @@
import typer
from tqdm import tqdm

+from ..utils import estimate_cost

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s - %(message)s"
)
@@ -45,7 +46,7 @@ class ResponsesDatasetConfig:


def load_instructions(filename: str) -> str:
-    with open(os.path.join(os.path.dirname(__file__), "data", filename), "r") as file:
+    with open(os.path.join(os.path.dirname(__file__), "..", "data", filename), "r") as file:
        return file.read()
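Both path fixes in this file point at the same move: the data directory apparently now sits one level up, at app/cli/data instead of app/cli/bench/data, matching the "..", "data" segments added here and in load_and_dedupe_csv below.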


@@ -99,38 +100,6 @@ class BenchItem:
    model: str


-def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
-    GPT_4o_MINI_2024_07_18_COSTING = {
-        "input": 0.15,
-        "cached_input": 0.075,
-        "output": 0.60,
-    }
-
-    GPT_4o_2024_08_06_COSTING = {
-        "input": 2.50,
-        "cached_input": 1.25,
-        "output": 10.00,
-    }
-
-    usd_per_1m = {
-        "gpt-4o": GPT_4o_2024_08_06_COSTING,
-        "gpt-4o-2024-08-06": GPT_4o_2024_08_06_COSTING,
-        "gpt-4o-mini": GPT_4o_MINI_2024_07_18_COSTING,
-        "gpt-4o-mini-2024-07-18": GPT_4o_MINI_2024_07_18_COSTING,
-        # Extend with more models as needed: https://platform.openai.com/docs/pricing
-    }
-
-    pricing = usd_per_1m.get(model.lower())
-    if not pricing:
-        logging.warning(f"No pricing found for model '{model}'. Returning cost = 0.")
-        return 0.0
-
-    # cached_input is ignored for now; this is only meant as an upper bound
-    # on the cost of running the benchmark.
-    input_cost = (input_tokens / 1_000_000) * pricing["input"]
-    output_cost = (output_tokens / 1_000_000) * pricing["output"]
-    return input_cost + output_cost
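With the helper moved out of this file, a quick sanity check of the pricing arithmetic; the app.cli.utils module path is an assumption inferred from the "from ..utils import estimate_cost" line added above:

# Sketch only: assumes the function kept this behavior after the move.
from app.cli.utils import estimate_cost

# 200k input tokens at $2.50/1M plus 10k output tokens at $10.00/1M
# = 0.50 + 0.10 = 0.60 USD for gpt-4o.
assert abs(estimate_cost("gpt-4o", 200_000, 10_000) - 0.60) < 1e-9

# Unknown models log a warning and report 0.0.
assert estimate_cost("some-unknown-model", 1_000, 1_000) == 0.0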


def output_csv(items: List[BenchItem]):
    filename = f"bench_results_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv"
    file_exists = os.path.exists(filename)
@@ -224,7 +193,7 @@ def load_and_dedupe_csv(
) -> List[dict]:
    """Load and deduplicate CSV data for benchmarking."""
    csv_file_path = os.path.join(
-        os.path.dirname(__file__), "data", dataset_config.filename
+        os.path.dirname(__file__), "..", "data", dataset_config.filename
    )
    with open(csv_file_path, "r") as file:
        csv_reader = csv.DictReader(file)
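The rest of load_and_dedupe_csv is collapsed in this view. A minimal sketch of what first-seen-wins deduplication might look like, assuming rows are keyed on a single column; the key_column name is hypothetical, since the real key is not visible in this diff:

import csv

def dedupe_rows(csv_file_path: str, key_column: str = "question") -> list[dict]:
    # Hypothetical stand-in for the collapsed body: keep the first row seen
    # for each key value and drop later duplicates.
    with open(csv_file_path, "r") as file:
        rows = list(csv.DictReader(file))
    seen: set[str] = set()
    deduped: list[dict] = []
    for row in rows:
        key = (row.get(key_column) or "").strip()
        if key and key not in seen:
            seen.add(key)
            deduped.append(row)
    return deduped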