add one new experiment

Gapminder · Jan 21, 2025 · 5df5db3 · 5df5db3
1 parent 7ef8593
commit 5df5db3
Show file tree

Hide file tree

Showing 8 changed files with 5,577 additions and 0 deletions.
diff --git a/experiments/20250120/ai_eval_sheets/evaluators.csv b/experiments/20250120/ai_eval_sheets/evaluators.csv
@@ -0,0 +1,4 @@
+evaluator
+vertex_ai/gemini-1.5-pro-002
+gpt-4o-2024-11-20
+anthropic/claude-3-5-sonnet-20241022
diff --git a/experiments/20250120/ai_eval_sheets/gen_ai_model_configs.csv b/experiments/20250120/ai_eval_sheets/gen_ai_model_configs.csv
@@ -0,0 +1,3 @@
+model_config_id,model_id,model_parameters,repeat_times,memory,memory_size
+mc039,qwen-max-2024-09-19,"{""temperature"": 0.01}",1,False,0
+mc041,vertex_ai/gemini-1.5-pro-002,"{""temperature"": 0.01}",1,False,0
diff --git a/experiments/20250120/ai_eval_sheets/gen_ai_models.csv b/experiments/20250120/ai_eval_sheets/gen_ai_models.csv
@@ -0,0 +1,48 @@
+model_id,vendor,model_name
+gpt-3.5-turbo-0613,OpenAI,GPT3.5 June 2023
+gpt-4-0613,OpenAI,GPT4 June 2023
+nan,OpenAI,"GPT4 (with vision, not publicly available yet)"
+nan,Microsoft,Bing
+nan,Meta,LLaMA
+spark,iFlyTek,Spark
+tiiuae/falcon-7b-instruct,HuggingFace,Falcon 7B
+google/flan-t5-large,HuggingFace,Flan T5 Large (780M)
+TheBloke/orca_mini_13B-GGML,HuggingFace,orca mini 13b (GGML ver.)
+bard,Google,Bard
+palm/text-bison,Google,PaLM (Text Bison)
+palm/chat-bison,Google,PaLM (Chat Bison)
+fakellm,Dummy,Dummy model
+claude,Anthropic,Claude
+qwen-turbo,Alibaba,Qianwen
+qwen-plus,Alibaba,Qianwen Plus
+qwen-v1,Alibaba,Qianwen V1
+nan,nan,Amazon Titan
+nan,nan,OpenAssistant
+nan,nan,TruthGPT
+replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3,Meta,llama2 (hosted on replicate)
+gemini/gemini-pro,Google,Gemini Pro
+gemini/gemini-1.0-pro,Google,Gemini Pro (1.0)
+gemini/gemini-1.5-pro-latest,Google,Gemini Pro (1.5)
+qwen-max-1201,Alibaba,Qianwen Max 2023-12-01
+gpt-3.5-turbo-1106,OpenAI,GPT3.5 Nov 2023
+gpt-4-1106-preview,OpenAI,GPT4 Turbo Nov 2023
+gpt-4-0125-preview,OpenAI,GPT4 Turbo Jan 2024
+vertex_ai/gemini-1.5-pro-preview-0409,Google,Gemini Pro 1.5 on Vertex AI
+qwen-max-0403,Alibaba,Qianwen Max 2024-04-03
+vertex_ai/claude-3-sonnet@20240229,Anthropic,Claude 3 Sonnet on Vertex AI
+vertex_ai/claude-3-opus@20240229,Anthropic,Claude 3 Opus on Vertex AI
+gpt-4o-2024-05-13,OpenAI,GPT-4-o 2024-05-13
+replicate/meta/meta-llama-3-70b-instruct,Meta,Llama3 70B instruction fine tuned
+qwen-max-0428,Alibaba,Qianwen Max 2024-04-28
+gpt-4o-2024-08-06,OpenAI,OpenAI gpt-4o-2024-08-06
+vertex_ai/claude-3-5-sonnet@20240620,Anthropic,Claude 3.5 Sonnet on Vertex AI
+replicate/meta/meta-llama-3.1-405b-instruct,Meta,Llama 3.1 405B
+fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct,Meta,Llama 3.1 405B Instruct on fireworks
+qwen-max-2024-09-19,Alibaba,Qianwen Max 2024-09-19
+gpt-4o-2024-11-20,OpenAI,GPT-4-o 2024-11-20
+vertex_ai/claude-3-5-sonnet-v2@20241022,Anthropic,Claude 3.5 Sonnet 20241022 on Vertex AI
+xai/grok-beta,XAI,XAI grok beta
+vertex_ai/gemini-1.5-pro-002,Google,Gemini Pro 1.5 002 on Vertex AI
+o1-preview-2024-09-12,OpenAI,OpenAI o1 model
+fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct,Meta,Llama 3.3 70B Instruct on Fireworks.ai
+anthropic/claude-3-5-sonnet-20241022,Anthropic,Claude 3.5 Sonnet 20241022
diff --git a/experiments/20250120/ai_eval_sheets/metrics.csv b/experiments/20250120/ai_eval_sheets/metrics.csv
@@ -0,0 +1,25 @@
+name,description,prompt,choices,choice_scores
+correctness,Is the answer correct?,"You are assessing a submitted answer on a given task based on a criterion. Here is the data:
+- Task: Answer a question.
+- Is the answer correct?
+[Input]: {question_text}
+Possible Options and Correctness of Options:
+A. {option_a} ({option_a_correctness})
+B. {option_b} ({option_b_correctness})
+C. {option_c} ({option_c_correctness})
+[Result]: {raw_output}
+[Assessment requirements]:
+Assess the answer by selecting one of the following options. Please note that the question was asked in some context so the answer would contain some more contextual information. You should conclude which option this answer has chosen.
+A The answer looks like some kind of exception / error message; or it's an equivocal answer; or it doesn't answer to the question at all.
+B The answer is not correct, and matches/is close to the Very Wrong answer.
+C The answer is not correct, and matches/is close to the Wrong answer.
+D The answer is correct, i.e. matches the Correct answer.
+[Instruction]:
+First, write out in a step by step manner your reasoning to be sure that your
+conclusion is correct.
+Avoid simply stating the correct answer at the outset.
+Then print only a single choice from A,B,C,D (without quotes or punctuation)
+on its own line corresponding to the correct answer.
+At the end, repeat just the answer by itself on a new line.
+Reasoning:
+","A, B, C, D","0, 1, 2, 3"
diff --git a/experiments/20250120/ai_eval_sheets/prompt_variations.csv b/experiments/20250120/ai_eval_sheets/prompt_variations.csv