New experiments (#30)
* update litellm

* use latest models in evaluator

* improve gpt4 evaluator

* add simple evaluator

* create archive for previous experiment

* add simple evaluator by default

* gpt4o-2024-08-06

* fix claude region

* results for claude 3.5 sonnet

* add fireworks ai configurations

* update evaluator config

* new experiment for llama3.1

* results for llama3.1

* include all evaluators in results

* notebooks

* update questions

* update deps

* BLOCK_NONE is restricted for now

* vertex_location is different between claude and gemini

* alibaba now has openai compatible endpoint, use it so we have caching

* add experiment and result for qwen-max-2024-09-19

* fix model names

* make archive for 20240910 experiments

* update dependencies

* update questions

* update scripts and notebooks

* add xai

* default to use 3 evaluators

* experiment for xai

* add 60 days ttl to keys because the default for new litellm config is too low (see the cache sketch after this list)

* add grok result

* archive previous results

* archive grok results

* new experiment and results

* add claude evaluator

* move folder
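
One of the bullets above changes the litellm cache TTL; the concrete values appear in the script diff further down. As a minimal standalone sketch (host, port, and TTL copied from that diff; everything else is assumed boilerplate), the cache setup looks like this:

```python
import litellm

# Sketch of the cache change referenced above: cache model/evaluator responses
# in Redis and set a 60-day TTL explicitly, because the default TTL in the new
# litellm config is too low for long-running experiment reruns.
litellm.cache = litellm.Cache(
    type="redis",
    host="127.0.0.1",    # local Redis instance used by the experiment scripts
    port=26379,
    ttl=60 * 24 * 3600,  # 60 days, in seconds
)
```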
semio authored Dec 20, 2024
1 parent 3fe7eba commit f751b33
Showing 51 changed files with 22,630 additions and 3,665 deletions.
4 changes: 4 additions & 0 deletions automation-api/.env.example
@@ -15,6 +15,10 @@ VERTEXAI_PROJECT="gapminder-ai"
 VERTEXAI_LOCATIONS="asia-southeast1,asia-east2,asia-northeast1"
 # follow the guide in automation-api/DEV.md#obtaining-developer-specific-service-account-credentials-base64-encoded
 VERTEX_SERVICE_ACCOUNT_CREDENTIALS=""
+# fireworks
+FIREWORKS_API_KEY=""
+# for xai
+XAI_API_KEY=""
 
 # For local development / notebooks etc
 SERVICE_ACCOUNT_CREDENTIALS=""
6,590 changes: 3,411 additions & 3,179 deletions automation-api/poetry.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions automation-api/pyproject.toml
@@ -70,6 +70,7 @@ duckdb = "^0.10.2"
 duckdb-engine = "^0.12.0"
 jupysql = "^0.10.10"
 anthropic = {extras = ["vertex"], version = "^0.25.9"}
+fireworks-ai = "^0.15.1"
 
 
 
@@ -85,6 +86,11 @@ ipykernel = "^6.6.0"
 jupytext = "^1.14.4"
 pytest-mock = "^3.6.1"
 
+[[tool.poetry.source]]
+name = "pytorch_cpu"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "explicit"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,125 @@
"""
ClaudeEvaluator is an evaluator that uses Anthropic's Claude model for evaluations.
The evaluator interfaces with Claude via litellm to present tasks and interpret
the model's responses to determine the quality or correctness of a given
experiment result.
"""
import copy
import logging

import litellm
from claude_evaluator_config import ClaudeEvaluatorConfig
from evaluator_common import (
CLASSIFY_STR,
calculate_choice_score,
choices_to_string,
completion_with_backpff,
extract_choice_from_response,
format_template,
)
from yival.evaluators.base_evaluator import BaseEvaluator
from yival.schemas.evaluator_config import (
EvaluatorOutput,
EvaluatorType,
MethodCalculationMethod,
MetricCalculatorConfig,
)
from yival.schemas.experiment_config import (
ExperimentResult,
InputData,
MultimodalOutput,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ClaudeEvaluator(BaseEvaluator):
"""Evaluator using Claude for evaluation."""

default_config = ClaudeEvaluatorConfig(name="claude_evaluator") # type: ignore

def __init__(self, config: ClaudeEvaluatorConfig):
super().__init__(config)
self.config = config

def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
"""Evaluate the experiment result using Claude."""
format_dict = copy.deepcopy(experiment_result.input_data.content)
format_dict["raw_output"] = experiment_result.raw_output.text_output

prompt = format_template(self.config.prompt, format_dict)
if isinstance(prompt, str):
prompt = [{"role": "user", "content": prompt}]

prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
choices=choices_to_string(self.config.choices)
)
response = completion_with_backpff(
model=self.config.model_name,
messages=prompt,
temperature=0.0,
n=1,
max_tokens=2000,
request_timeout=60,
caching=True,
)
response_content = response["choices"][0]["message"]["content"]
choice = extract_choice_from_response(response_content, self.config.choices)
score = calculate_choice_score(choice, self.config.choice_scores)
return EvaluatorOutput(
name=self.config.name,
result=score if score is not None else choice,
display_name=self.config.display_name,
metric_calculators=self.config.metric_calculators,
)


BaseEvaluator.register_evaluator(
"claude_evaluator", ClaudeEvaluator, ClaudeEvaluatorConfig
)


def main():
"""Main function to test the ClaudeEvaluator."""
from example_evaluator_data import (
choice_scores,
choices,
content,
prompt,
raw_output,
)

litellm.set_verbose = True

evaluator_config = ClaudeEvaluatorConfig(
name="claude_evaluator",
display_name="correctness test",
metric_calculators=[
MetricCalculatorConfig(
MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
)
],
prompt=prompt,
choices=choices,
evaluator_type=EvaluatorType.INDIVIDUAL,
choice_scores=choice_scores,
)
input_data_example = InputData(content=content)

experiment_result_example = ExperimentResult(
input_data=input_data_example,
combination={"wrapper1": "var1", "wrapper2": "var2"},
raw_output=MultimodalOutput(text_output=raw_output),
latency=150.0,
token_usage=50,
)

evaluator = ClaudeEvaluator(evaluator_config)
result = evaluator.evaluate(experiment_result_example)
print("Result: ", result.result)


if __name__ == "__main__":
main()
@@ -0,0 +1,18 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional, Union

from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType


@dataclass
class ClaudeEvaluatorConfig(EvaluatorConfig):
    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
    prompt: Union[str, List[Dict[str, str]]] = ""
    choices: List[str] = field(default_factory=list)
    model_name: str = "claude-3-5-sonnet-20241022"
    description: str = "This is an evaluator that uses Anthropic's Claude model."
    scale_description: str = "0-4"
    choice_scores: Optional[Dict[str, float]] = None

    def asdict(self) -> Dict[str, Any]:
        return asdict(self)
@@ -47,7 +47,6 @@ def __init__(self, config: GPT4EvaluatorConfig):
 
     def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
         """Evaluate the experiment result using OpenAI's prompt-based evaluation."""
-        assert isinstance(self.config, GPT4EvaluatorConfig)
         format_dict = copy.deepcopy(experiment_result.input_data.content)
         format_dict["raw_output"] = experiment_result.raw_output.text_output
 
@@ -9,7 +9,7 @@ class Llama3EvaluatorConfig(EvaluatorConfig):
     evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
     prompt: Union[str, List[Dict[str, str]]] = ""
     choices: List[str] = field(default_factory=list)
-    model_name: str = "replicate/meta/meta-llama-3-70b-instruct"
+    model_name: str = "fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct"
     description: str = "This is the description of the evaluator."
     scale_description: str = "0-4"
     choice_scores: Optional[Dict[str, float]] = None
@@ -1,18 +1,18 @@
 safety_settings = [
     {
         "category": "HARM_CATEGORY_HARASSMENT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_HATE_SPEECH",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
 ]
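
These thresholds are imported and forwarded to Vertex AI Gemini calls by the comparison script further down. A hedged sketch of that path, with a placeholder model id (taken from the script's commented-out examples) and a placeholder message:

```python
import litellm
from yival_experiments.custom_configuration.llms.palm_completion import safety_settings

# Sketch only: pass the BLOCK_ONLY_HIGH thresholds through litellm to a
# Vertex AI Gemini model, mirroring what the comparison script does for
# Google-vendor models.
response = litellm.completion(
    model="vertex_ai/gemini-pro-experimental",  # placeholder model id
    messages=[{"role": "user", "content": "Hello"}],
    safety_settings=safety_settings,
)
print(response["choices"][0]["message"]["content"])
```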
@@ -12,9 +12,6 @@
 
 # load env vars
 from lib.config import read_config
-from yival_experiments.custom_configuration.llms.alibaba_complete import (
-    llm_complete as alibaba_llm_complete,
-)
 from yival_experiments.custom_configuration.llms.palm_completion import safety_settings
 
 read_config()
@@ -26,26 +23,31 @@
 #     vendor="OpenAI"
 # )
 # default_model_config = dict(
-#     model_id="vertex_ai/gemini-1.5-pro-preview-0409",
+#     model_id="vertex_ai/gemini-pro-experimental",
 #     params={"temperature": 0.5},
 #     vendor="Google",
 # )
+# default_model_config = dict(
+#     model_id="vertex_ai/claude-3-opus@20240229",
+#     params={"temperature": 0.5},
+#     vendor="Anthropic",
+# )
+# default_model_config = dict(
+#     model_id="replicate/meta/meta-llama-3-70b-instruct",
+#     params={"temperature": 0.5},
+#     vendor="Meta",
+# )
 default_model_config = dict(
-    model_id="vertex_ai/claude-3-opus@20240229",
-    params={"temperature": 0.5},
-    vendor="Anthropic",
-)
-default_model_config = dict(
-    model_id="replicate/meta/meta-llama-3-70b-instruct",
-    params={"temperature": 0.5},
-    vendor="Meta",
+    model_id="qwen-max", params={"temperature": 0.5}, vendor="Alibaba"
 )
 # set this to see verbose outputs
 litellm.set_verbose = True
 # enable caching in the evaluator.
 # litellm.cache = litellm.Cache()
 # to not use Redis for caching: uncomment the line above and comment the line below.
-litellm.cache = litellm.Cache(type="redis", host="127.0.0.1", port=26379)
+litellm.cache = litellm.Cache(
+    type="redis", host="127.0.0.1", port=26379, ttl=60 * 24 * 3600
+)
 
 
 def model_compare(
@@ -96,10 +98,10 @@ def model_compare(
     litellm_params = dict(
         model=model["model_id"],
         messages=litellm_messages,
-        caching=False,
+        caching=True,
         num_retries=10,
         request_timeout=60,
-        **model["params"]
+        **model["params"],
     )
     if model["vendor"] == "Google":
         # choose a vertex project location
@@ -109,23 +111,16 @@
         # google allows changing content filters. We will disable all
         litellm_params["safety_settings"] = safety_settings
     elif model["vendor"] == "Anthropic":
-        if "opus" in model["model_id"]:
-            # there is only one location where claude Opus is available.
-            litellm.vertex_location = "us-east5"
-        else:
-            litellm.vertex_location = "us-central1"
-
+        # all Anthropic models are available in us-east5
+        litellm.vertex_location = "us-east5"
+    elif model["vendor"] == "Alibaba":
+        # Alibaba has openai compatible endpoints
+        litellm_params["model"] = f"openai/{litellm_params['model']}"
+        litellm_params["api_key"] = os.getenv("DASHSCOPE_API_KEY")
+        litellm_params["api_base"] = "https://dashscope.aliyuncs.com/compatible-mode/v1"
     try:
-        if model["vendor"] == "Alibaba":
-            # FIXME: alibaba's complete function doesn't support system prompt.
-            output = alibaba_llm_complete(
-                model_name=model["model_id"], prompt=prompt, **model["params"]
-            )
-            response = Response(output=output).output
-            response_text = response["choices"][0]["message"]["content"]
-        else:
-            response = Response(output=completion(**litellm_params)).output
-            response_text = response["choices"][0]["message"]["content"]
+        response = Response(output=completion(**litellm_params)).output
+        response_text = response["choices"][0]["message"]["content"]
     except KeyboardInterrupt:
         raise
     except Exception as e:
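
Stripped of the surrounding experiment plumbing, the Alibaba branch above boils down to the sketch below: qwen-max is routed through DashScope's OpenAI-compatible endpoint (hence the openai/ model prefix), which is what makes litellm's response caching usable for it. This assumes DASHSCOPE_API_KEY is set and that litellm.cache has been configured as shown earlier.

```python
import os

import litellm

# Sketch of the Alibaba branch in the diff above: treat DashScope's
# compatible-mode endpoint as an OpenAI-style API so litellm can cache responses.
response = litellm.completion(
    model="openai/qwen-max",
    messages=[{"role": "user", "content": "Hello"}],
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    api_base="https://dashscope.aliyuncs.com/compatible-mode/v1",
    caching=True,    # requires litellm.cache to be configured
    temperature=0.5,
)
print(response["choices"][0]["message"]["content"])
```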
