New experiments (#30)
* update litellm

* use latest models in evaluator

* improve gpt4 evaluator

* add simple evaluator

* create archive for previous experiment

* add simple evaluator by default

* gpt4o-2024-08-06

* fix claude region

* results for claude 3.5 sonnet

* add fireworks ai configurations

* update evaluator config

* new experiment for llama3.1

* results for llama3.1

* include all evaluators in results

* notebooks

* update questions

* update deps

* BLOCK_NONE is restricted for now

* vertex_location is different between claude and gemini

* alibaba now has openai compatible endpoint, use it so we have caching

* add experiment and result for qwen-max-2024-09-19

* fix model names

* make archive for 20240910 experiments

* update dependencies

* update questions

* update scripts and notebooks

* add xai

* default to use 3 evaluators

* experiment for xai

* add 60 days ttl to keys because the default for new litellm config is too low (see the cache sketch after this list)

* add grok result

* archive previous results

* archive grok results

* new experiment and results

* add claude evaluator

* move folder
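
One of the bullets above changes the litellm cache TTL; the concrete values appear in the script diff further down. As a minimal standalone sketch (host, port, and TTL copied from that diff; everything else is assumed boilerplate), the cache setup looks like this:

```python
import litellm

# Sketch of the cache change referenced above: cache model/evaluator responses
# in Redis and set a 60-day TTL explicitly, because the default TTL in the new
# litellm config is too low for long-running experiment reruns.
litellm.cache = litellm.Cache(
    type="redis",
    host="127.0.0.1",    # local Redis instance used by the experiment scripts
    port=26379,
    ttl=60 * 24 * 3600,  # 60 days, in seconds
)
```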
semio authored Dec 20, 2024
1 parent 3fe7eba commit f751b33
Showing 51 changed files with 22,630 additions and 3,665 deletions.
4 changes: 4 additions & 0 deletions automation-api/.env.example
@@ -15,6 +15,10 @@ VERTEXAI_PROJECT="gapminder-ai"
 VERTEXAI_LOCATIONS="asia-southeast1,asia-east2,asia-northeast1"
 # follow the guide in automation-api/DEV.md#obtaining-developer-specific-service-account-credentials-base64-encoded
 VERTEX_SERVICE_ACCOUNT_CREDENTIALS=""
+# fireworks
+FIREWORKS_API_KEY=""
+# for xai
+XAI_API_KEY=""
 
 # For local development / notebooks etc
 SERVICE_ACCOUNT_CREDENTIALS=""
6,590 changes: 3,411 additions & 3,179 deletions automation-api/poetry.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions automation-api/pyproject.toml
@@ -70,6 +70,7 @@ duckdb = "^0.10.2"
 duckdb-engine = "^0.12.0"
 jupysql = "^0.10.10"
 anthropic = {extras = ["vertex"], version = "^0.25.9"}
+fireworks-ai = "^0.15.1"
 
 
 
@@ -85,6 +86,11 @@ ipykernel = "^6.6.0"
 jupytext = "^1.14.4"
 pytest-mock = "^3.6.1"
 
+[[tool.poetry.source]]
+name = "pytorch_cpu"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "explicit"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,125 @@
"""
ClaudeEvaluator is an evaluator that uses Anthropic's Claude model for evaluations.
The evaluator interfaces with Claude via litellm to present tasks and interpret
the model's responses to determine the quality or correctness of a given
experiment result.
"""
import copy
import logging

import litellm
from claude_evaluator_config import ClaudeEvaluatorConfig
from evaluator_common import (
CLASSIFY_STR,
calculate_choice_score,
choices_to_string,
completion_with_backpff,
extract_choice_from_response,
format_template,
)
from yival.evaluators.base_evaluator import BaseEvaluator
from yival.schemas.evaluator_config import (
EvaluatorOutput,
EvaluatorType,
MethodCalculationMethod,
MetricCalculatorConfig,
)
from yival.schemas.experiment_config import (
ExperimentResult,
InputData,
MultimodalOutput,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ClaudeEvaluator(BaseEvaluator):
"""Evaluator using Claude for evaluation."""

default_config = ClaudeEvaluatorConfig(name="claude_evaluator") # type: ignore

def __init__(self, config: ClaudeEvaluatorConfig):
super().__init__(config)
self.config = config

def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
"""Evaluate the experiment result using Claude."""
format_dict = copy.deepcopy(experiment_result.input_data.content)
format_dict["raw_output"] = experiment_result.raw_output.text_output

prompt = format_template(self.config.prompt, format_dict)
if isinstance(prompt, str):
prompt = [{"role": "user", "content": prompt}]

prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
choices=choices_to_string(self.config.choices)
)
response = completion_with_backpff(
model=self.config.model_name,
messages=prompt,
temperature=0.0,
n=1,
max_tokens=2000,
request_timeout=60,
caching=True,
)
response_content = response["choices"][0]["message"]["content"]
choice = extract_choice_from_response(response_content, self.config.choices)
score = calculate_choice_score(choice, self.config.choice_scores)
return EvaluatorOutput(
name=self.config.name,
result=score if score is not None else choice,
display_name=self.config.display_name,
metric_calculators=self.config.metric_calculators,
)


BaseEvaluator.register_evaluator(
"claude_evaluator", ClaudeEvaluator, ClaudeEvaluatorConfig
)


def main():
"""Main function to test the ClaudeEvaluator."""
from example_evaluator_data import (
choice_scores,
choices,
content,
prompt,
raw_output,
)

litellm.set_verbose = True

evaluator_config = ClaudeEvaluatorConfig(
name="claude_evaluator",
display_name="correctness test",
metric_calculators=[
MetricCalculatorConfig(
MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
)
],
prompt=prompt,
choices=choices,
evaluator_type=EvaluatorType.INDIVIDUAL,
choice_scores=choice_scores,
)
input_data_example = InputData(content=content)

experiment_result_example = ExperimentResult(
input_data=input_data_example,
combination={"wrapper1": "var1", "wrapper2": "var2"},
raw_output=MultimodalOutput(text_output=raw_output),
latency=150.0,
token_usage=50,
)

evaluator = ClaudeEvaluator(evaluator_config)
result = evaluator.evaluate(experiment_result_example)
print("Result: ", result.result)


if __name__ == "__main__":
main()
@@ -0,0 +1,18 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional, Union

from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType


@dataclass
class ClaudeEvaluatorConfig(EvaluatorConfig):
    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
    prompt: Union[str, List[Dict[str, str]]] = ""
    choices: List[str] = field(default_factory=list)
    model_name: str = "claude-3-5-sonnet-20241022"
    description: str = "This is an evaluator that uses Anthropic's Claude model."
    scale_description: str = "0-4"
    choice_scores: Optional[Dict[str, float]] = None

    def asdict(self) -> Dict[str, Any]:
        return asdict(self)
@@ -47,7 +47,6 @@ def __init__(self, config: GPT4EvaluatorConfig):
 
     def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
         """Evaluate the experiment result using OpenAI's prompt-based evaluation."""
-        assert isinstance(self.config, GPT4EvaluatorConfig)
         format_dict = copy.deepcopy(experiment_result.input_data.content)
         format_dict["raw_output"] = experiment_result.raw_output.text_output
 
@@ -9,7 +9,7 @@ class Llama3EvaluatorConfig(EvaluatorConfig):
     evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
     prompt: Union[str, List[Dict[str, str]]] = ""
     choices: List[str] = field(default_factory=list)
-    model_name: str = "replicate/meta/meta-llama-3-70b-instruct"
+    model_name: str = "fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct"
     description: str = "This is the description of the evaluator."
     scale_description: str = "0-4"
     choice_scores: Optional[Dict[str, float]] = None
@@ -1,18 +1,18 @@
 safety_settings = [
     {
         "category": "HARM_CATEGORY_HARASSMENT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_HATE_SPEECH",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
 ]
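
These thresholds are imported and forwarded to Vertex AI Gemini calls by the comparison script further down. A hedged sketch of that path, with a placeholder model id (taken from the script's commented-out examples) and a placeholder message:

```python
import litellm
from yival_experiments.custom_configuration.llms.palm_completion import safety_settings

# Sketch only: pass the BLOCK_ONLY_HIGH thresholds through litellm to a
# Vertex AI Gemini model, mirroring what the comparison script does for
# Google-vendor models.
response = litellm.completion(
    model="vertex_ai/gemini-pro-experimental",  # placeholder model id
    messages=[{"role": "user", "content": "Hello"}],
    safety_settings=safety_settings,
)
print(response["choices"][0]["message"]["content"])
```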
@@ -12,9 +12,6 @@
 
 # load env vars
 from lib.config import read_config
-from yival_experiments.custom_configuration.llms.alibaba_complete import (
-    llm_complete as alibaba_llm_complete,
-)
 from yival_experiments.custom_configuration.llms.palm_completion import safety_settings
 
 read_config()
@@ -26,26 +23,31 @@
 #     vendor="OpenAI"
 # )
 # default_model_config = dict(
-#     model_id="vertex_ai/gemini-1.5-pro-preview-0409",
+#     model_id="vertex_ai/gemini-pro-experimental",
 #     params={"temperature": 0.5},
 #     vendor="Google",
 # )
+# default_model_config = dict(
+#     model_id="vertex_ai/claude-3-opus@20240229",
+#     params={"temperature": 0.5},
+#     vendor="Anthropic",
+# )
+# default_model_config = dict(
+#     model_id="replicate/meta/meta-llama-3-70b-instruct",
+#     params={"temperature": 0.5},
+#     vendor="Meta",
+# )
 default_model_config = dict(
-    model_id="vertex_ai/claude-3-opus@20240229",
-    params={"temperature": 0.5},
-    vendor="Anthropic",
-)
-default_model_config = dict(
-    model_id="replicate/meta/meta-llama-3-70b-instruct",
-    params={"temperature": 0.5},
-    vendor="Meta",
+    model_id="qwen-max", params={"temperature": 0.5}, vendor="Alibaba"
 )
 # set this to see verbose outputs
 litellm.set_verbose = True
 # enable caching in the evaluator.
 # litellm.cache = litellm.Cache()
 # to not use Redis for caching: uncomment the line above and comment the line below.
-litellm.cache = litellm.Cache(type="redis", host="127.0.0.1", port=26379)
+litellm.cache = litellm.Cache(
+    type="redis", host="127.0.0.1", port=26379, ttl=60 * 24 * 3600
+)
 
 
 def model_compare(
@@ -96,10 +98,10 @@ def model_compare(
     litellm_params = dict(
         model=model["model_id"],
         messages=litellm_messages,
-        caching=False,
+        caching=True,
         num_retries=10,
         request_timeout=60,
-        **model["params"]
+        **model["params"],
     )
     if model["vendor"] == "Google":
         # choose a vertex project location
@@ -109,23 +111,16 @@
         # google allows changing content filters. We will disable all
         litellm_params["safety_settings"] = safety_settings
     elif model["vendor"] == "Anthropic":
-        if "opus" in model["model_id"]:
-            # there is only one location where claude Opus is available.
-            litellm.vertex_location = "us-east5"
-        else:
-            litellm.vertex_location = "us-central1"
-
+        # all Anthropic models are available in us-east5
+        litellm.vertex_location = "us-east5"
+    elif model["vendor"] == "Alibaba":
+        # Alibaba has openai compatible endpoints
+        litellm_params["model"] = f"openai/{litellm_params['model']}"
+        litellm_params["api_key"] = os.getenv("DASHSCOPE_API_KEY")
+        litellm_params["api_base"] = "https://dashscope.aliyuncs.com/compatible-mode/v1"
     try:
-        if model["vendor"] == "Alibaba":
-            # FIXME: alibaba's complete function doesn't support system prompt.
-            output = alibaba_llm_complete(
-                model_name=model["model_id"], prompt=prompt, **model["params"]
-            )
-            response = Response(output=output).output
-            response_text = response["choices"][0]["message"]["content"]
-        else:
-            response = Response(output=completion(**litellm_params)).output
-            response_text = response["choices"][0]["message"]["content"]
+        response = Response(output=completion(**litellm_params)).output
+        response_text = response["choices"][0]["message"]["content"]
     except KeyboardInterrupt:
         raise
     except Exception as e:
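
Stripped of the surrounding experiment plumbing, the Alibaba branch above boils down to the sketch below: qwen-max is routed through DashScope's OpenAI-compatible endpoint (hence the openai/ model prefix), which is what makes litellm's response caching usable for it. This assumes DASHSCOPE_API_KEY is set and that litellm.cache has been configured as shown earlier.

```python
import os

import litellm

# Sketch of the Alibaba branch in the diff above: treat DashScope's
# compatible-mode endpoint as an OpenAI-style API so litellm can cache responses.
response = litellm.completion(
    model="openai/qwen-max",
    messages=[{"role": "user", "content": "Hello"}],
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    api_base="https://dashscope.aliyuncs.com/compatible-mode/v1",
    caching=True,    # requires litellm.cache to be configured
    temperature=0.5,
)
print(response["choices"][0]["message"]["content"])
```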
