
Commit 52d3d33

Parametrizing the sampling evals from the CLI (#926)
This PR does several things.

Constrains `Metric` object creation. We go from

```python
f1_score_macro = CorpusLevelMetric(
    metric_name="f1",
    sample_level_fn=GenerativePreparator().prepare,
    category=SamplingMethod.GENERATIVE,
    corpus_level_fn=CorpusLevelF1Score(average="macro").compute,
    higher_is_better=True,
)
```

to

```python
f1_score_macro = CorpusLevelMetric(
    metric_name="f1",
    sample_level_fn=GenerativePreparator(),
    category=SamplingMethod.GENERATIVE,
    corpus_level_fn=CorpusLevelF1Score(average="macro"),
    higher_is_better=True,
)
```

`sample_level_fn` must derive from either a `SampleLevelComputation` or a `Preparator` class; the former must implement a `compute` method, the latter a `prepare` one. `corpus_level_fn` is either a plain function (`np.mean` and so on) or a `CorpusLevelComputation`, which must implement a `compute_corpus` method.

All metrics with a parametrizable `sample_level_fn` can now be parametrized at CLI call time, for example `"lighteval|math_500@k=1|0|0"` (users can also pass normalization function names, provided they are defined in the normalization file). Corpus-level parametrization is not supported, but probably could be if we pick another separator symbol. This parametrization of `Metrics.MyMetric` relies on a trick: making the enum callable.

The metric list has been simplified to remove duplicate metrics that differed only in some parameters; all tasks have therefore been changed to use the new metric names. The test suite has been updated.
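A minimal sketch of the callable-enum trick mentioned above, using a simplified stand-in `Metric` dataclass (the real lighteval classes carry more fields and validation; the `strip_strings` parameter below is made up for illustration):

```python
# Illustrative sketch only, not lighteval's actual code.
from dataclasses import dataclass, replace
from enum import Enum
from typing import Optional


@dataclass(frozen=True)
class Metric:
    # Simplified stand-in for lighteval's Metric class.
    metric_name: str
    sample_params: Optional[dict] = None


class Metrics(Enum):
    # Each member wraps a default Metric instance.
    exact_match = Metric(metric_name="em")

    def __call__(self, sample_params: dict) -> Metric:
        # Defining __call__ on the Enum makes each member callable:
        # Metrics.exact_match(...) returns a parametrized copy of the
        # wrapped Metric instead of mutating the shared default.
        return replace(self.value, sample_params=sample_params)


# "strip_strings" is a hypothetical parameter for illustration.
em_variant = Metrics.exact_match(sample_params={"strip_strings": True})
print(em_variant)
```

With this shape, `Metrics.exact_match` stays usable as the default metric in task configs, while calling it yields a configured variant, matching the diffs below.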
1 parent 50756ef commit 52d3d33


50 files changed: +4896, -2798 lines

community_tasks/aimo_evals.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -26,6 +26,7 @@
 """

 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import math_normalizer
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc

@@ -49,7 +50,9 @@ def aimo_prompt(line, task_name: str = None):
     evaluation_splits=["train"],
     few_shots_split="train",
     few_shots_select="sequential",
-    metrics=[Metrics.quasi_exact_match_math],
+    metrics=[
+        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
+    ],
     generation_size=2048,
     stop_sequence=None,
 )
```
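In the diff above, the dedicated `quasi_exact_match_math` metric becomes `exact_match` parametrized with math normalizers. Roughly, and this is an illustrative sketch rather than lighteval's actual comparison code, the configured metric behaves like:

```python
from lighteval.metrics.normalizations import math_normalizer


def normalized_exact_match(gold: str, pred: str) -> float:
    # Both sides are normalized before comparison, so equivalent answer
    # forms (e.g. a value wrapped in \boxed{} vs. the bare value) can
    # match exactly. Illustrative helper, not lighteval API.
    return float(math_normalizer(gold) == math_normalizer(pred))
```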

community_tasks/arabic_evals.py

Lines changed: 23 additions & 21 deletions

```diff
@@ -31,8 +31,10 @@
 import re
 from typing import Any, Dict, List, Optional, Union

-from lighteval.metrics.llm_as_judge import JudgeLM
-from lighteval.metrics.metrics import Metric, Metrics
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.metrics.utils.llm_as_judge import JudgeLM
+from lighteval.metrics.utils.metric_utils import Metric
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc, SamplingMethod
@@ -103,7 +105,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_pfn,
             hf_repo="MBZUAI/ArabicMMLU",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -164,7 +166,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_ht_pfn,
             hf_repo="MBZUAI/human_translated_arabic_mmlu",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=None,
@@ -228,7 +230,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_mt_pfn,
             hf_repo="OALL/Arabic_MMLU",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split="dev",
@@ -283,7 +285,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=acva_pfn,
             hf_repo="OALL/ACVA",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -339,7 +341,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=aratrust_pfn,
             hf_repo="asas-ai/AraTrust-categorized",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["train"],
             evaluation_splits=["train"],
             few_shots_split=None,
@@ -387,7 +389,7 @@ def arabic_exams_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -437,7 +439,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=alghafa_pfn,
             hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -463,7 +465,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -479,7 +481,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -495,7 +497,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -511,7 +513,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -527,7 +529,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -543,7 +545,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -580,7 +582,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -614,7 +616,7 @@ def copa_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -657,7 +659,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -693,7 +695,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -743,7 +745,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metrics=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
     version=0,
 )

@@ -800,7 +802,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=madinah_qa_pfn,
             hf_repo="MBZUAI/MadinahQA",
-            metrics=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
```

community_tasks/french_evals.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -33,6 +33,7 @@
 import random

 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import math_normalizer
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.extended.ifeval.main import ifeval_metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
@@ -136,7 +137,10 @@ def prompt_bac_fr(line, task_name: str = None):
     few_shots_split=None,
     few_shots_select="random_sampling",
     generation_size=1,
-    metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
+    metrics=[
+        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
+        Metrics.exact_match,
+    ],
     stop_sequence=["\n"],
     version=0,
 )
```
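For the CLI parametrization mentioned in the commit message, here is a toy parser showing the shape of a spec like `"lighteval|math_500@k=1|0|0"`. Lighteval's own parsing is the source of truth; the field order follows the usual `suite|task|few_shot|truncate_few_shots` format, and the `,` separator for multiple parameters is an assumption:

```python
def parse_task_spec(spec: str):
    # Assumed shape: suite|task[@key=value[,key=value...]]|few_shot|truncate
    suite, task_with_params, few_shot, truncate = spec.split("|")
    task, _, raw_params = task_with_params.partition("@")
    params = dict(kv.split("=", 1) for kv in raw_params.split(",")) if raw_params else {}
    return suite, task, params, int(few_shot), int(truncate)


print(parse_task_spec("lighteval|math_500@k=1|0|0"))
# -> ('lighteval', 'math_500', {'k': '1'}, 0, 0)
```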
