Rag metric update again (#1948)

dafnapension · yoavkatz · web-flow · commit 731e29dfcb7e · 2025-12-01T12:12:23.000+02:00
* modified and new

Signed-off-by: dafnapension <dafnashein@yahoo.com>

* delete old

Signed-off-by: dafnapension <dafnashein@yahoo.com>

---------

Signed-off-by: dafnapension <dafnashein@yahoo.com>
Co-authored-by: Yoav Katz <68273864+yoavkatz@users.noreply.github.com>
diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py
@@ -24,7 +24,7 @@
 
 generic_engine_label = "generic_inference_engine"
 inference_models = {
-    "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
+    "llama_3_3_70b_instruct_wml": "engines.classification.llama_3_3_70b_instruct_wml",
     generic_engine_label: GenericInferenceEngine(),
 }
 
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
     "task": "tasks.rag_eval.answer_correctness.binary",
     "format": null,
     "main_score": "answer_correctness_q_a_gt_loose",
     "prediction_field": "answer",
     "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
     "task": "tasks.rag_eval.answer_correctness.binary",
     "format": null,
     "main_score": "answer_correctness_q_a_gt_loose_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
     "task": "tasks.rag_eval.answer_relevance.binary",
     "format": null,
     "main_score": "answer_relevance_q_a",
     "prediction_field": "answer",
     "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
     "task": "tasks.rag_eval.answer_relevance.binary",
     "format": null,
     "main_score": "answer_relevance_q_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a_logprobs"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a_logprobs"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
     "task": "tasks.rag_eval.context_relevance.binary",
     "format": null,
     "main_score": "context_relevance_q_c_ares",
     "prediction_field": "contexts",
     "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
     "task": "tasks.rag_eval.context_relevance.binary",
     "format": null,
     "main_score": "context_relevance_q_c_ares_logprobs",
     "prediction_field": "contexts",
     "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares_logprobs"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares_logprobs"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
     "task": "tasks.rag_eval.correctness_holistic.binary",
     "format": null,
     "main_score": "correctness_holistic_q_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
     "task": "tasks.rag_eval.correctness_holistic.binary",
     "format": null,
     "main_score": "correctness_holistic_q_c_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
     "main_score": "faithfulness_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
     "main_score": "faithfulness_c_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a_logprobs"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a_logprobs"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
     "main_score": "faithfulness_q_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_q_c_a"
 }
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
@@ -1,11 +1,11 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
     "main_score": "faithfulness_q_c_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
 }
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
     "task": "tasks.rag_eval.answer_correctness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
     "task": "tasks.rag_eval.answer_correctness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric",
     "task": "tasks.rag_eval.answer_correctness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
     "task": "tasks.rag_eval.answer_relevance.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
     "task": "tasks.rag_eval.answer_relevance.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric",
     "task": "tasks.rag_eval.answer_relevance.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
     "task": "tasks.rag_eval.context_relevance.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
     "task": "tasks.rag_eval.context_relevance.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric",
     "task": "tasks.rag_eval.context_relevance.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
     "task": "tasks.rag_eval.correctness_holistic.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
     "task": "tasks.rag_eval.correctness_holistic.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_numeric",
     "task": "tasks.rag_eval.correctness_holistic.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_verbal",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json
@@ -1,6 +1,6 @@
 {
     "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal",
     "task": "tasks.rag_eval.faithfulness.binary",
     "format": null,

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@`
`24`	`24`
`25`	`25`	`generic_engine_label = "generic_inference_engine"`
`26`	`26`	`inference_models = {`
`27`		`- "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",`
	`27`	`+ "llama_3_3_70b_instruct_wml": "engines.classification.llama_3_3_70b_instruct_wml",`
`28`	`28`	`generic_engine_label: GenericInferenceEngine(),`
`29`	`29`	`}`
`30`	`30`
Original file line number	Diff line number	Diff line change
`@@ -1,11 +1,11 @@`
`1`	`1`	`{`
`2`	`2`	`"__type__": "task_based_ll_mas_judge",`
`3`		`- "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",`
	`3`	`+ "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",`
`4`	`4`	`"template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",`
`5`	`5`	`"task": "tasks.rag_eval.answer_correctness.binary",`
`6`	`6`	`"format": null,`
`7`	`7`	`"main_score": "answer_correctness_q_a_gt_loose",`
`8`	`8`	`"prediction_field": "answer",`
`9`	`9`	`"infer_log_probs": false,`
`10`		`- "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose"`
	`10`	`+ "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose"`
`11`	`11`	`}`