
Langfuse eval library & runner #224


Draft: wants to merge 1 commit into main.
4 changes: 4 additions & 0 deletions .env.example
@@ -73,3 +73,7 @@ LANGFUSE_HOST="this_is_not_a_secret"
# Misc

CI=""

+# For CI and running CLI commands as demo org
+LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY=""
+LOCAL_CREDENTIALS_API_KEY=""
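A minimal sketch of how a CLI entry point might read these at startup; the consuming code is an assumption, only the variable names come from this diff:

import os

# Hypothetical startup check; variable names taken from .env.example above.
openai_key = os.environ.get("LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY", "")
demo_api_key = os.environ.get("LOCAL_CREDENTIALS_API_KEY", "")
if not (openai_key and demo_api_key):
    raise SystemExit("Set the LOCAL_CREDENTIALS_* variables in .env first.")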
6 changes: 6 additions & 0 deletions Taskfile.yml
@@ -0,0 +1,6 @@
version: '3'

includes:
  backend:
    taskfile: ./backend/Taskfile.yml
    dir: ./backend
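With this include, Task namespaces the backend tasks under backend:, so from the repository root they run as, for example, task backend:eval or task backend:seed.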
21 changes: 21 additions & 0 deletions backend/Taskfile.yml
@@ -0,0 +1,21 @@
version: '3'

tasks:
  eval:
    desc: Run evaluation on kunji dataset
    dotenv:
      - ../.env
    cmds:
      - uv run ai-cli eval responses {{.CLI_ARGS}}

  seed:
    desc: Seed the database with initial data
    dotenv:
      - ../.env
    cmds:
      - uv run python -m app.seed_data.seed_data

  logs:
    desc: Show backend logs from docker compose
    cmds:
      - docker compose -f ../docker-compose.yml logs -f backend
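Task forwards anything after a literal -- on the command line into {{.CLI_ARGS}}, so extra options reach the eval CLI as, for example: task backend:eval -- --model gpt-4o. The --model flag here is illustrative; the actual options are whatever ai-cli eval responses defines.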
39 changes: 4 additions & 35 deletions backend/app/cli/bench/commands.py
@@ -1,6 +1,5 @@
import os
import csv
import json
import logging
import time
from datetime import datetime
@@ -11,6 +10,8 @@
import typer
from tqdm import tqdm

+from ..utils import estimate_cost

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s - %(message)s"
)
@@ -45,7 +46,7 @@ class ResponsesDatasetConfig:


def load_instructions(filename: str) -> str:
-    with open(os.path.join(os.path.dirname(__file__), "data", filename), "r") as file:
+    with open(os.path.join(os.path.dirname(__file__), "..", "data", filename), "r") as file:
        return file.read()
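Both path fixes in this file point at the same move: the data directory apparently now sits one level up, at app/cli/data instead of app/cli/bench/data, matching the "..", "data" segments added here and in load_and_dedupe_csv below.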


@@ -99,38 +100,6 @@ class BenchItem:
    model: str


-def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
-    GPT_4o_MINI_2024_07_18_COSTING = {
-        "input": 0.15,
-        "cached_input": 0.075,
-        "output": 0.60,
-    }
-
-    GPT_4o_2024_08_06_COSTING = {
-        "input": 2.50,
-        "cached_input": 1.25,
-        "output": 10.00,
-    }
-
-    usd_per_1m = {
-        "gpt-4o": GPT_4o_2024_08_06_COSTING,
-        "gpt-4o-2024-08-06": GPT_4o_2024_08_06_COSTING,
-        "gpt-4o-mini": GPT_4o_MINI_2024_07_18_COSTING,
-        "gpt-4o-mini-2024-07-18": GPT_4o_MINI_2024_07_18_COSTING,
-        # Extend with more models as needed: https://platform.openai.com/docs/pricing
-    }
-
-    pricing = usd_per_1m.get(model.lower())
-    if not pricing:
-        logging.warning(f"No pricing found for model '{model}'. Returning cost = 0.")
-        return 0.0
-
-    # cached_input is ignored for now; this is only meant as an upper bound
-    # on the cost of running the benchmark.
-    input_cost = (input_tokens / 1_000_000) * pricing["input"]
-    output_cost = (output_tokens / 1_000_000) * pricing["output"]
-    return input_cost + output_cost
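With the helper moved out of this file, a quick sanity check of the pricing arithmetic; the app.cli.utils module path is an assumption inferred from the "from ..utils import estimate_cost" line added above:

# Sketch only: assumes the function kept this behavior after the move.
from app.cli.utils import estimate_cost

# 200k input tokens at $2.50/1M plus 10k output tokens at $10.00/1M
# = 0.50 + 0.10 = 0.60 USD for gpt-4o.
assert abs(estimate_cost("gpt-4o", 200_000, 10_000) - 0.60) < 1e-9

# Unknown models log a warning and report 0.0.
assert estimate_cost("some-unknown-model", 1_000, 1_000) == 0.0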


def output_csv(items: List[BenchItem]):
    filename = f"bench_results_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv"
    file_exists = os.path.exists(filename)
@@ -224,7 +193,7 @@ def load_and_dedupe_csv(
) -> List[dict]:
    """Load and deduplicate CSV data for benchmarking."""
    csv_file_path = os.path.join(
-        os.path.dirname(__file__), "data", dataset_config.filename
+        os.path.dirname(__file__), "..", "data", dataset_config.filename
    )
    with open(csv_file_path, "r") as file:
        csv_reader = csv.DictReader(file)
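The rest of load_and_dedupe_csv is collapsed in this view. A minimal sketch of what first-seen-wins deduplication might look like, assuming rows are keyed on a single column; the key_column name is hypothetical, since the real key is not visible in this diff:

import csv

def dedupe_rows(csv_file_path: str, key_column: str = "question") -> list[dict]:
    # Hypothetical stand-in for the collapsed body: keep the first row seen
    # for each key value and drop later duplicates.
    with open(csv_file_path, "r") as file:
        rows = list(csv.DictReader(file))
    seen: set[str] = set()
    deduped: list[dict] = []
    for row in rows:
        key = (row.get(key_column) or "").strip()
        if key and key not in seen:
            seen.add(key)
            deduped.append(row)
    return deduped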