
Remove IBM GenAI support and move legacy GenAI metrics to use CrossProviderInferenceEngine #1508

Open
wants to merge 11 commits into base: main
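In short, judge metrics that previously instantiated IbmGenAiInferenceEngine now construct either WMLInferenceEngineGeneration (the legacy ensemble and idk judges) or CrossProviderInferenceEngine (the rating judges). A minimal before/after sketch, using only constructor arguments that appear in the hunks below; the variable names are illustrative, not part of the diff:

from unitxt.inference import (
    CrossProviderInferenceEngine,
    WMLInferenceEngineGeneration,
)

# Before this PR (removed):
#   from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams
#   judge_engine = IbmGenAiInferenceEngine(
#       model_name="meta-llama/llama-3-70b-instruct",
#       parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256),
#   )

# After: the legacy ensemble/idk judges call watsonx.ai generation directly ...
wml_judge_engine = WMLInferenceEngineGeneration(
    model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=256
)

# ... while the rating judges are built on the provider-agnostic engine.
cross_provider_judge_engine = CrossProviderInferenceEngine(
    model="llama-3-70b-instruct", max_tokens=252
)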
2 changes: 1 addition & 1 deletion examples/evaluate_ensemble_judge.py
@@ -29,7 +29,7 @@
reference_fields={},
prediction_type="str",
metrics=[
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_ibmgenai_judges"
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_wml_judges"
],
),
templates=TemplatesDict(
2 changes: 1 addition & 1 deletion examples/evaluate_grounded_ensemble_judge.py
@@ -34,7 +34,7 @@
reference_fields={},
prediction_type="str",
metrics=[
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_ibmgenai_judges"
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_wml_judges"
],
),
templates=TemplatesDict(
4 changes: 1 addition & 3 deletions examples/evaluate_idk_judge.py
@@ -33,9 +33,7 @@
input_fields={"inquiry": "str"},
reference_fields={},
prediction_type="str",
- metrics=[
- "metrics.llm_as_judge.conversation_answer_idk.llama3_v1_ibmgenai_judges"
- ],
+ metrics=["metrics.llm_as_judge.conversation_answer_idk.llama3_v1_wml_judges"],
),
templates=TemplatesDict(
{
4 changes: 2 additions & 2 deletions examples/evaluate_llm_as_judge_from_template.py
@@ -22,8 +22,8 @@

# List of metrics to evaluate
metrics_to_check = [
"metrics.llm_as_judge.rating.llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn",
"metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn",
"metrics.llm_as_judge.rating.llama_3_8b_instruct.mt_bench_single_turn",
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
]

for metric_to_check in metrics_to_check:
2 changes: 1 addition & 1 deletion examples/evaluate_using_metrics_ensemble.py
@@ -11,7 +11,7 @@
ensemble_metric = MetricsEnsemble(
metrics=[
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
"metrics.llm_as_judge.rating.llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn",
"metrics.llm_as_judge.rating.llama_3_8b_instruct.mt_bench_single_turn",
],
weights=[0.75, 0.25],
)
2 changes: 1 addition & 1 deletion prepare/cards/scigen.py
@@ -28,7 +28,7 @@
}
),
],
task="tasks.generation.from_pair[metrics=[metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference]]",
task="tasks.generation.from_pair[metrics=[metrics.llm_as_judge.rating.llama_3_1_70b_instruct.table2text_single_turn_with_reference]]",
templates=[
"templates.generation.from_pair.default[postprocessors=[processors.lower_case]]"
],
15 changes: 4 additions & 11 deletions prepare/engines/classification/classification_engines.py
@@ -1,7 +1,6 @@
from unitxt import add_to_catalog
from unitxt.inference import (
AzureOpenAIInferenceEngine,
- IbmGenAiInferenceEngine,
RITSInferenceEngine,
WMLInferenceEngineGeneration,
)
@@ -15,13 +14,7 @@ def get_inference_engine(model_name, framework_name):
random_seed=42,
decoding_method="greedy",
)
if framework_name == "ibm_gen_ai":
return IbmGenAiInferenceEngine(
model_name=model_name,
max_new_tokens=5,
random_seed=42,
decoding_method="greedy",
)

if framework_name == "openai":
return AzureOpenAIInferenceEngine(
model_name=model_name,
@@ -38,12 +31,12 @@ def get_inference_engine(model_name, framework_name):


model_names_to_infer_framework = {
"meta-llama/llama-3-1-70b-instruct": ["ibm_wml", "rits", "ibm_gen_ai"],
"meta-llama/llama-3-1-70b-instruct": ["ibm_wml", "rits"],
"meta-llama/llama-3-3-70b-instruct": ["ibm_wml", "rits"],
"gpt-4-turbo-2024-04-09": ["openai"],
"gpt-4o-2024-08-06": ["openai"],
"mistralai/mixtral-8x7b-instruct-v01": ["ibm_wml", "ibm_gen_ai", "rits"],
"meta-llama/llama-3-1-405b-instruct-fp8": ["ibm_gen_ai", "rits"],
"mistralai/mixtral-8x7b-instruct-v01": ["ibm_wml", "rits"],
"meta-llama/llama-3-1-405b-instruct-fp8": ["rits"],
"meta-llama/llama-3-405b-instruct": ["ibm_wml"],
}

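For reference, a rough equivalent of the deleted "ibm_gen_ai" branch using the WML engine this script keeps; the decoding settings mirror the removed branch and the model name comes from the mapping above (an illustrative sketch, not part of the diff):

from unitxt.inference import WMLInferenceEngineGeneration

# Stand-in for the removed ibm_gen_ai branch: same greedy decoding and
# 5-token budget, now served through watsonx.ai instead of IBM GenAI.
engine = WMLInferenceEngineGeneration(
    model_name="meta-llama/llama-3-1-70b-instruct",
    max_new_tokens=5,
    random_seed=42,
    decoding_method="greedy",
)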
Empty file.
11 changes: 0 additions & 11 deletions prepare/engines/ibm_genai/llama3.py

This file was deleted.

14 changes: 5 additions & 9 deletions prepare/metrics/llm_as_judge/conversation_groundedness.py
@@ -1,17 +1,13 @@
import json

from unitxt import add_to_catalog
- from unitxt.inference import (
- IbmGenAiInferenceEngine,
- IbmGenAiInferenceEngineParams,
- )
+ from unitxt.inference import WMLInferenceEngineGeneration
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.metrics import (
RandomForestMetricsEnsemble,
)

platform = "ibm_gen_ai"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)
platform = "wml"

config_filepath = "prepare/metrics/llm_as_judge/ensemble_grounded_v1.json"

@@ -25,8 +21,8 @@

inference_model_lst = []
for model_name in model_lst:
- inference_model = IbmGenAiInferenceEngine(
- model_name=model_name, parameters=gen_params
+ inference_model = WMLInferenceEngineGeneration(
+ model_name=model_name, max_new_tokens=256
)
inference_model_lst.append(inference_model)

@@ -60,6 +56,6 @@
# ensemble_metric.load_weights(config_filepath)
add_to_catalog(
ensemble_metric,
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_ibmgenai_judges",
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_wml_judges",
overwrite=True,
)
13 changes: 4 additions & 9 deletions prepare/metrics/llm_as_judge/conversation_idk.py
@@ -1,19 +1,14 @@
from unitxt import add_to_catalog
- from unitxt.inference import (
- GenericInferenceEngine,
- IbmGenAiInferenceEngine,
- IbmGenAiInferenceEngineParams,
- )
+ from unitxt.inference import GenericInferenceEngine, WMLInferenceEngineGeneration
from unitxt.llm_as_judge import LLMAsJudge

template_name = "templates.response_assessment.judges.idk.v1"

inference_models = {
"llama3_v1_ibmgenai": {
"llama3_v1_wml": {
"model_name": "llama370binstruct",
"inference_model": IbmGenAiInferenceEngine(
model_name="meta-llama/llama-3-70b-instruct",
parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256),
"inference_model": WMLInferenceEngineGeneration(
model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=256
),
},
"generic_inference_engine": {
12 changes: 4 additions & 8 deletions prepare/metrics/llm_as_judge/conversation_topicality.py
@@ -1,17 +1,13 @@
import json

from unitxt import add_to_catalog
- from unitxt.inference import (
- IbmGenAiInferenceEngine,
- IbmGenAiInferenceEngineParams,
- )
+ from unitxt.inference import WMLInferenceEngineGeneration
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.metrics import (
RandomForestMetricsEnsemble,
)

platform = "ibm_gen_ai"
- gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)

config_filepath = "prepare/metrics/llm_as_judge/ensemble_topicality_v1.json"

@@ -25,8 +21,8 @@

inference_model_lst = []
for model_name in model_lst:
- inference_model = IbmGenAiInferenceEngine(
- model_name=model_name, parameters=gen_params
+ inference_model = WMLInferenceEngineGeneration(
+ model_name=model_name, max_new_tokens=256
)
inference_model_lst.append(inference_model)

@@ -60,6 +56,6 @@
# ensemble_metric.load_weights(config_filepath)
add_to_catalog(
ensemble_metric,
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_ibmgenai_judges",
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_wml_judges",
overwrite=True,
)
@@ -2,7 +2,6 @@
from unitxt.inference import (
CrossProviderInferenceEngine,
GenericInferenceEngine,
- IbmGenAiInferenceEngine,
WMLInferenceEngine,
)
from unitxt.llm_as_judge import LLMAsJudge
@@ -16,7 +15,6 @@

inference_engines = [
("ibm_wml", WMLInferenceEngine),
("ibm_genai", IbmGenAiInferenceEngine),
("generic_engine", GenericInferenceEngine),
]

@@ -10,9 +10,7 @@
inference_model = CrossProviderInferenceEngine(
model=model_id, max_tokens=252, seed=get_seed()
)
- model_label = (
- model_id.replace("-", "_").replace(".", ",").lower() + "_cross_provider"
- )
+ model_label = model_id.replace("-", "_").replace(".", ",").lower()
template_label = template.split(".")[-1]
metric = LLMAsJudge(
inference_model=inference_model,
@@ -24,6 +22,6 @@
)
add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
f"metrics.llm_as_judge.rating.{model_label}.{template_label}",
overwrite=True,
)
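The net effect of the renaming above is a dot-separated catalog id in place of the underscore-joined one. An illustrative helper (not part of this PR) that reproduces the id scheme built in the loop above:

def rating_metric_id(model_id: str, template: str) -> str:
    # Mirrors the model_label / template_label logic shown in the hunk above.
    model_label = model_id.replace("-", "_").replace(".", ",").lower()
    template_label = template.split(".")[-1]
    return f"metrics.llm_as_judge.rating.{model_label}.{template_label}"

# rating_metric_id("llama-3-1-70b-instruct",
#                  "templates.response_assessment.rating.table2text_single_turn_with_reference")
# -> "metrics.llm_as_judge.rating.llama_3_1_70b_instruct.table2text_single_turn_with_reference"
# which is the id now referenced by prepare/cards/scigen.py.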
37 changes: 37 additions & 0 deletions prepare/metrics/llm_as_judge/rating/llama_3_generic_template.py
@@ -0,0 +1,37 @@
from unitxt import add_to_catalog
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge_from_template import LLMAsJudge

inference_model = CrossProviderInferenceEngine(
model="llama-3-70b-instruct", max_tokens=252
)

metric = LLMAsJudge(
inference_model=inference_model,
template="templates.response_assessment.rating.generic_single_turn",
task="rating.single_turn",
format="formats.chat_api",
main_score="llama_3_70b_instruct_template_generic_single_turn",
prediction_type=str,
)

add_to_catalog(
metric,
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
overwrite=True,
)

metric = LLMAsJudge(
inference_model=inference_model,
template="templates.response_assessment.rating.generic_single_turn_with_reference",
task="rating.single_turn_with_reference",
format="formats.chat_api",
single_reference_per_prediction=True,
main_score="llama_3_70b_instruct_template_generic_single_turn_with_reference",
)

add_to_catalog(
metric,
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn_with_reference",
overwrite=True,
)

This file was deleted.
