Remove IBM GenAI support and move legacy GenAI metrics to use CrossProviderInferenceEngine #1508

Open · wants to merge 11 commits into main
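The gist of the migration, as applied across the files below: judge metrics that previously constructed an IbmGenAiInferenceEngine now construct either a CrossProviderInferenceEngine (rating judges) or a WMLInferenceEngineGeneration (the WML ensemble judges). A minimal before/after sketch of the rating-judge pattern, using only names and arguments that appear in the diffs of this PR:

from unitxt.inference import CrossProviderInferenceEngine
from unitxt.random_utils import get_seed

# Before (removed in this PR):
#   from unitxt.inference import IbmGenAiInferenceEngine
#   inference_model = IbmGenAiInferenceEngine(
#       model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=252, random_seed=get_seed()
#   )

# After: a provider-agnostic model id, with max_tokens/seed replacing
# max_new_tokens/random_seed.
inference_model = CrossProviderInferenceEngine(
    model="llama-3-70b-instruct", max_tokens=252, seed=get_seed()
)

The catalog names of the affected metrics drop the engine-specific suffixes accordingly, as shown in the rating diffs below.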
2 changes: 1 addition & 1 deletion examples/evaluate_ensemble_judge.py
@@ -29,7 +29,7 @@
reference_fields={},
prediction_type="str",
metrics=[
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_ibmgenai_judges"
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_wml_judges"
],
),
templates=TemplatesDict(
2 changes: 1 addition & 1 deletion examples/evaluate_grounded_ensemble_judge.py
@@ -34,7 +34,7 @@
reference_fields={},
prediction_type="str",
metrics=[
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_ibmgenai_judges"
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_wml_judges"
],
),
templates=TemplatesDict(
4 changes: 1 addition & 3 deletions examples/evaluate_idk_judge.py
@@ -33,9 +33,7 @@
input_fields={"inquiry": "str"},
reference_fields={},
prediction_type="str",
metrics=[
"metrics.llm_as_judge.conversation_answer_idk.llama3_v1_ibmgenai_judges"
],
metrics=["metrics.llm_as_judge.conversation_answer_idk.llama3_v1_wml_judges"],
),
templates=TemplatesDict(
{
4 changes: 2 additions & 2 deletions examples/evaluate_llm_as_judge_from_template.py
@@ -22,8 +22,8 @@

# List of metrics to evaluate
metrics_to_check = [
"metrics.llm_as_judge.rating.llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn",
"metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn",
"metrics.llm_as_judge.rating.llama_3_8b_instruct.mt_bench_single_turn",
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
]

for metric_to_check in metrics_to_check:
2 changes: 1 addition & 1 deletion examples/evaluate_using_metrics_ensemble.py
@@ -11,7 +11,7 @@
ensemble_metric = MetricsEnsemble(
metrics=[
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
"metrics.llm_as_judge.rating.llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn",
"metrics.llm_as_judge.rating.llama_3_8b_instruct.mt_bench_single_turn",
],
weights=[0.75, 0.25],
)
2 changes: 1 addition & 1 deletion prepare/cards/scigen.py
@@ -28,7 +28,7 @@
}
),
],
task="tasks.generation.from_pair[metrics=[metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference]]",
task="tasks.generation.from_pair[metrics=[metrics.llm_as_judge.rating.llama_3_1_70b_instruct.table2text_single_turn_with_reference]]",
templates=[
"templates.generation.from_pair.default[postprocessors=[processors.lower_case]]"
],
11 changes: 0 additions & 11 deletions prepare/engines/ibm_genai/llama3.py

This file was deleted.

14 changes: 5 additions & 9 deletions prepare/metrics/llm_as_judge/conversation_groundedness.py
@@ -1,17 +1,13 @@
import json

from unitxt import add_to_catalog
from unitxt.inference import (
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.inference import WMLInferenceEngineGeneration
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.metrics import (
RandomForestMetricsEnsemble,
)

platform = "ibm_gen_ai"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)
platform = "wml"

config_filepath = "prepare/metrics/llm_as_judge/ensemble_grounded_v1.json"

@@ -25,8 +21,8 @@

inference_model_lst = []
for model_name in model_lst:
inference_model = IbmGenAiInferenceEngine(
model_name=model_name, parameters=gen_params
inference_model = WMLInferenceEngineGeneration(
model_name=model_name, max_new_tokens=256
)
inference_model_lst.append(inference_model)

@@ -60,6 +56,6 @@
# ensemble_metric.load_weights(config_filepath)
add_to_catalog(
ensemble_metric,
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_ibmgenai_judges",
"metrics.llm_as_judge.conversation_answer_groundedness.ensemble_v1_wml_judges",
overwrite=True,
)
13 changes: 4 additions & 9 deletions prepare/metrics/llm_as_judge/conversation_idk.py
@@ -1,19 +1,14 @@
from unitxt import add_to_catalog
from unitxt.inference import (
GenericInferenceEngine,
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.inference import GenericInferenceEngine, WMLInferenceEngineGeneration
from unitxt.llm_as_judge import LLMAsJudge

template_name = "templates.response_assessment.judges.idk.v1"

inference_models = {
"llama3_v1_ibmgenai": {
"llama3_v1_wml": {
"model_name": "llama370binstruct",
"inference_model": IbmGenAiInferenceEngine(
model_name="meta-llama/llama-3-70b-instruct",
parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256),
"inference_model": WMLInferenceEngineGeneration(
model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=256
),
},
"generic_inference_engine": {
12 changes: 4 additions & 8 deletions prepare/metrics/llm_as_judge/conversation_topicality.py
@@ -1,17 +1,13 @@
import json

from unitxt import add_to_catalog
from unitxt.inference import (
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.inference import WMLInferenceEngineGeneration
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.metrics import (
RandomForestMetricsEnsemble,
)

platform = "ibm_gen_ai"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)

config_filepath = "prepare/metrics/llm_as_judge/ensemble_topicality_v1.json"

@@ -25,8 +21,8 @@

inference_model_lst = []
for model_name in model_lst:
inference_model = IbmGenAiInferenceEngine(
model_name=model_name, parameters=gen_params
inference_model = WMLInferenceEngineGeneration(
model_name=model_name, max_new_tokens=256
)
inference_model_lst.append(inference_model)

@@ -60,6 +56,6 @@
# ensemble_metric.load_weights(config_filepath)
add_to_catalog(
ensemble_metric,
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_ibmgenai_judges",
"metrics.llm_as_judge.conversation_answer_topicality.ensemble_v1_wml_judges",
overwrite=True,
)
@@ -2,7 +2,6 @@
from unitxt.inference import (
CrossProviderInferenceEngine,
GenericInferenceEngine,
IbmGenAiInferenceEngine,
WMLInferenceEngine,
)
from unitxt.llm_as_judge import LLMAsJudge
@@ -16,7 +15,6 @@

inference_engines = [
("ibm_wml", WMLInferenceEngine),
("ibm_genai", IbmGenAiInferenceEngine),
("generic_engine", GenericInferenceEngine),
]

@@ -10,9 +10,7 @@
inference_model = CrossProviderInferenceEngine(
model=model_id, max_tokens=252, seed=get_seed()
)
model_label = (
model_id.replace("-", "_").replace(".", ",").lower() + "_cross_provider"
)
model_label = model_id.replace("-", "_").replace(".", ",").lower()
template_label = template.split(".")[-1]
metric = LLMAsJudge(
inference_model=inference_model,
@@ -24,6 +22,6 @@
)
add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
f"metrics.llm_as_judge.rating.{model_label}.{template_label}",
overwrite=True,
)
37 changes: 37 additions & 0 deletions prepare/metrics/llm_as_judge/rating/llama_3_generic_template.py
@@ -0,0 +1,37 @@
from unitxt import add_to_catalog
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge_from_template import LLMAsJudge

inference_model = CrossProviderInferenceEngine(
model="llama-3-70b-instruct", max_tokens=252
)

metric = LLMAsJudge(
inference_model=inference_model,
template="templates.response_assessment.rating.generic_single_turn",
task="rating.single_turn",
format="formats.chat_api",
main_score="llama_3_70b_instruct_template_generic_single_turn",
prediction_type=str,
)

add_to_catalog(
metric,
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
overwrite=True,
)

metric = LLMAsJudge(
inference_model=inference_model,
template="templates.response_assessment.rating.generic_single_turn_with_reference",
task="rating.single_turn_with_reference",
format="formats.chat_api",
single_reference_per_prediction=True,
main_score="llama_3_70b_instruct_template_generic_single_turn_with_reference",
)

add_to_catalog(
metric,
"metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn_with_reference",
overwrite=True,
)

This file was deleted.

@@ -1,32 +1,32 @@
from unitxt import add_to_catalog
from unitxt.inference import IbmGenAiInferenceEngine
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge_from_template import LLMAsJudge
from unitxt.random_utils import get_seed

model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"]
format = "formats.llama3_instruct"
model_list = ["llama-3-70b-instruct", "llama-3-8b-instruct"]
format = "formats.chat_api"
template = "templates.response_assessment.rating.mt_bench_single_turn"
task = "rating.single_turn"


for model_id in model_list:
inference_model = IbmGenAiInferenceEngine(
model_name=model_id, max_new_tokens=252, random_seed=get_seed()
inference_model = CrossProviderInferenceEngine(
model=model_id, max_tokens=252, seed=get_seed()
)
model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_genai"
model_label = model_id.replace("-", "_").replace(".", ",").lower()
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
format=format,
format="formats.chat_api",
main_score=metric_label,
prediction_type=str,
)

add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
f"metrics.llm_as_judge.rating.{model_label}.{template_label}",
overwrite=True,
)
@@ -7,13 +7,11 @@
format = "formats.llama3_instruct"
template = "templates.response_assessment.rating.table2text_single_turn_with_reference"
task = "rating.single_turn_with_reference"

for model_id in model_list:
inference_model = CrossProviderInferenceEngine(
model=model_id, max_tokens=252, seed=get_seed()
)
model_label = model_id.replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
@@ -27,6 +25,6 @@

add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
f"metrics.llm_as_judge.rating.{model_label}.{template_label}",
Member Author:
@elronbandel @ShirApp - Please see the change in metric name I made, so that the table2text rating metric is consistent with the other rating engines. As far as I saw, the only use was in scigen.py.

Collaborator:
It looks fine. Yes, we used it only for scigen.

overwrite=True,
)
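On the rename discussed in the comment above, the change in the catalog identifier scheme can be summarized as follows; the format strings and the example name are taken verbatim from the rating and scigen.py diffs in this PR:

# Old scheme (removed):
#   f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}"
#   e.g. metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference
# New scheme:
#   f"metrics.llm_as_judge.rating.{model_label}.{template_label}"
#   e.g. metrics.llm_as_judge.rating.llama_3_1_70b_instruct.table2text_single_turn_with_reference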