Skip to content

Commit 731e29d

Browse files
Rag metric update again (#1948)
* modified and new
  Signed-off-by: dafnapension <[email protected]>
* delete old
  Signed-off-by: dafnapension <[email protected]>

---------

Signed-off-by: dafnapension <[email protected]>
Co-authored-by: Yoav Katz <[email protected]>
1 parent 800b2ba commit 731e29d

File tree

31 files changed

+43
-43
lines changed

31 files changed

+43
-43
lines changed

prepare/metrics/llm_as_judge/rag_judge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
generic_engine_label = "generic_inference_engine"
2626
inference_models = {
27-
"llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
27+
"llama_3_3_70b_instruct_wml": "engines.classification.llama_3_3_70b_instruct_wml",
2828
generic_engine_label: GenericInferenceEngine(),
2929
}
3030

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
55
"task": "tasks.rag_eval.answer_correctness.binary",
66
"format": null,
77
"main_score": "answer_correctness_q_a_gt_loose",
88
"prediction_field": "answer",
99
"infer_log_probs": false,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
55
"task": "tasks.rag_eval.answer_correctness.binary",
66
"format": null,
77
"main_score": "answer_correctness_q_a_gt_loose_logprobs",
88
"prediction_field": "answer",
99
"infer_log_probs": true,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
55
"task": "tasks.rag_eval.answer_relevance.binary",
66
"format": null,
77
"main_score": "answer_relevance_q_a",
88
"prediction_field": "answer",
99
"infer_log_probs": false,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
55
"task": "tasks.rag_eval.answer_relevance.binary",
66
"format": null,
77
"main_score": "answer_relevance_q_a_logprobs",
88
"prediction_field": "answer",
99
"infer_log_probs": true,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a_logprobs"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a_logprobs"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
55
"task": "tasks.rag_eval.context_relevance.binary",
66
"format": null,
77
"main_score": "context_relevance_q_c_ares",
88
"prediction_field": "contexts",
99
"infer_log_probs": false,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
55
"task": "tasks.rag_eval.context_relevance.binary",
66
"format": null,
77
"main_score": "context_relevance_q_c_ares_logprobs",
88
"prediction_field": "contexts",
99
"infer_log_probs": true,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares_logprobs"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares_logprobs"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
55
"task": "tasks.rag_eval.correctness_holistic.binary",
66
"format": null,
77
"main_score": "correctness_holistic_q_c_a",
88
"prediction_field": "answer",
99
"infer_log_probs": false,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
55
"task": "tasks.rag_eval.correctness_holistic.binary",
66
"format": null,
77
"main_score": "correctness_holistic_q_c_a_logprobs",
88
"prediction_field": "answer",
99
"infer_log_probs": true,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
1111
}

src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json renamed to src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"__type__": "task_based_ll_mas_judge",
3-
"inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
3+
"inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
44
"template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
55
"task": "tasks.rag_eval.faithfulness.binary",
66
"format": null,
77
"main_score": "faithfulness_c_a",
88
"prediction_field": "answer",
99
"infer_log_probs": false,
10-
"__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a"
10+
"__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a"
1111
}

0 commit comments

Comments
 (0)