-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* update litellm * use latest models in evaluator * improve gpt4 evaluator * add simple evaluator * create archive for previous experiment * add simple evaluator by default * gpt4o-2024-08-06 * fix claude region * results for claude 3.5 sonnet * add fireworks ai configurations * update evaluator config * new experiment for llama3.1 * results for llama3.1 * include all evaluators in results * notebooks * update questions * update deps * BLOCK_NONE is restricted for now * vertex_location is different between claude and gemini * alibaba now has openai compatible endpoint, use it so we have caching * add experiment and result for qwen-max-2024-09-19 * fix model names * make archive for 20240910 experiments * update dependencies * update questions * update scripts and notebooks * add xai * default to use 3 evaluators * experiment for xai * add 60 days ttl to keys because the default for new litellm config is too low * add grok result * archive previous results * archive grok results * new experiment and results * add claude evaluator * move folder
- Loading branch information
Showing
51 changed files
with
22,630 additions
and
3,665 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
125 changes: 125 additions & 0 deletions
125
automation-api/yival_experiments/custom_configuration/claude_evaluator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
""" | ||
ClaudeEvaluator is an evaluator that uses Anthropic's Claude model for evaluations. | ||
The evaluator interfaces with Claude via litellm to present tasks and interpret | ||
the model's responses to determine the quality or correctness of a given | ||
experiment result. | ||
""" | ||
import copy | ||
import logging | ||
|
||
import litellm | ||
from claude_evaluator_config import ClaudeEvaluatorConfig | ||
from evaluator_common import ( | ||
CLASSIFY_STR, | ||
calculate_choice_score, | ||
choices_to_string, | ||
completion_with_backpff, | ||
extract_choice_from_response, | ||
format_template, | ||
) | ||
from yival.evaluators.base_evaluator import BaseEvaluator | ||
from yival.schemas.evaluator_config import ( | ||
EvaluatorOutput, | ||
EvaluatorType, | ||
MethodCalculationMethod, | ||
MetricCalculatorConfig, | ||
) | ||
from yival.schemas.experiment_config import ( | ||
ExperimentResult, | ||
InputData, | ||
MultimodalOutput, | ||
) | ||
|
||
logging.basicConfig(level=logging.INFO) | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class ClaudeEvaluator(BaseEvaluator):
    """Grade an experiment result by asking a Claude model to pick a choice.

    The configured prompt is filled in with the experiment's input content
    and raw text output, a classification instruction listing the allowed
    choices is appended, and the model's reply is mapped back to one of
    those choices (and to a score when one is configured for it).
    """

    default_config = ClaudeEvaluatorConfig(name="claude_evaluator")  # type: ignore

    def __init__(self, config: ClaudeEvaluatorConfig):
        """Store ``config`` after letting the base evaluator initialise."""
        super().__init__(config)
        self.config = config

    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
        """Return the evaluator output for a single experiment result.

        Args:
            experiment_result: The result to grade; its input content and
                ``raw_output.text_output`` are used to fill the prompt.

        Returns:
            An ``EvaluatorOutput`` whose ``result`` is the score computed
            for the extracted choice, or the choice itself when the score
            is ``None``.
        """
        cfg = self.config

        # Work on a deep copy so the experiment's own input data is untouched.
        template_values = copy.deepcopy(experiment_result.input_data.content)
        template_values["raw_output"] = experiment_result.raw_output.text_output

        messages = format_template(cfg.prompt, template_values)
        if isinstance(messages, str):
            # A plain string prompt becomes a single user message.
            messages = [{"role": "user", "content": messages}]

        # Append the classification instruction to the final message.
        final_message = messages[-1]
        final_message["content"] = (
            final_message["content"]
            + "\n\n"
            + CLASSIFY_STR.format(choices=choices_to_string(cfg.choices))
        )

        request_kwargs = dict(
            model=cfg.model_name,
            messages=messages,
            temperature=0.0,
            n=1,
            max_tokens=2000,
            request_timeout=60,
            caching=True,
        )
        completion = completion_with_backpff(**request_kwargs)

        reply_text = completion["choices"][0]["message"]["content"]
        selected = extract_choice_from_response(reply_text, cfg.choices)
        numeric_score = calculate_choice_score(selected, cfg.choice_scores)

        # Prefer the numeric score; fall back to the raw choice otherwise.
        if numeric_score is not None:
            outcome = numeric_score
        else:
            outcome = selected

        return EvaluatorOutput(
            name=cfg.name,
            result=outcome,
            display_name=cfg.display_name,
            metric_calculators=cfg.metric_calculators,
        )
|
||
|
||
BaseEvaluator.register_evaluator( | ||
"claude_evaluator", ClaudeEvaluator, ClaudeEvaluatorConfig | ||
) | ||
|
||
|
||
def main():
    """Smoke-test ClaudeEvaluator against the example evaluator data.

    Builds a sample experiment result and evaluator configuration from
    ``example_evaluator_data``, runs one evaluation with litellm verbose
    output enabled, and prints the resulting value.
    """
    from example_evaluator_data import (
        choice_scores,
        choices,
        content,
        prompt,
        raw_output,
    )

    litellm.set_verbose = True

    # Sample experiment result to be graded.
    sample_result = ExperimentResult(
        input_data=InputData(content=content),
        combination={"wrapper1": "var1", "wrapper2": "var2"},
        raw_output=MultimodalOutput(text_output=raw_output),
        latency=150.0,
        token_usage=50,
    )

    # Evaluator configuration using the example prompt and choices.
    average_metric = MetricCalculatorConfig(
        MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
    )
    sample_config = ClaudeEvaluatorConfig(
        name="claude_evaluator",
        display_name="correctness test",
        metric_calculators=[average_metric],
        prompt=prompt,
        choices=choices,
        evaluator_type=EvaluatorType.INDIVIDUAL,
        choice_scores=choice_scores,
    )

    output = ClaudeEvaluator(sample_config).evaluate(sample_result)
    print("Result: ", output.result)
|
||
|
||
if __name__ == "__main__": | ||
main() |
18 changes: 18 additions & 0 deletions
18
automation-api/yival_experiments/custom_configuration/claude_evaluator_config.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from dataclasses import asdict, dataclass, field | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType | ||
|
||
|
||
@dataclass
class ClaudeEvaluatorConfig(EvaluatorConfig):
    """Configuration for an evaluator backed by Anthropic's Claude model.

    Extends ``EvaluatorConfig`` with the prompt, the allowed choices, the
    model name and an optional per-choice score table.
    """

    # Kind of evaluation; defaults to evaluating each result individually.
    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
    # Prompt given either as one string or as a list of string-to-string
    # dicts (presumably chat messages with "role"/"content" keys).
    prompt: Union[str, List[Dict[str, str]]] = ""
    # Choice labels; a fresh empty list is created for every instance.
    choices: List[str] = field(default_factory=list)
    # Name of the model used for evaluation.
    model_name: str = "claude-3-5-sonnet-20241022"
    description: str = "This is an evaluator that uses Anthropic's Claude model."
    scale_description: str = "0-4"
    # Optional string-to-float mapping (presumably choice label -> score).
    choice_scores: Optional[Dict[str, float]] = None

    def asdict(self) -> Dict[str, Any]:
        """Return this configuration converted to a plain dictionary."""
        # ``asdict`` here resolves to the module-level dataclasses function,
        # not to this method, so the call does not recurse.
        as_plain_dict: Dict[str, Any] = asdict(self)
        return as_plain_dict
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
8 changes: 4 additions & 4 deletions
8
automation-api/yival_experiments/custom_configuration/llms/palm_completion.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,18 @@ | ||
# Per-category safety thresholds. Every category uses BLOCK_ONLY_HIGH, which
# replaces the earlier BLOCK_NONE values. Each entry carries exactly one
# "threshold" key, so the effective value no longer depends on Python
# silently keeping the last of two duplicate keys.
# NOTE(review): presumably passed to the Gemini/PaLM completion call in this
# module; confirm against the caller.
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_ONLY_HIGH",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_ONLY_HIGH",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_ONLY_HIGH",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_ONLY_HIGH",
    },
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.