@@ -31,8 +31,10 @@
import re
from typing import Any, Dict, List, Optional, Union

-from lighteval.metrics.llm_as_judge import JudgeLM
-from lighteval.metrics.metrics import Metric, Metrics
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.metrics.utils.llm_as_judge import JudgeLM
+from lighteval.metrics.utils.metric_utils import Metric
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod
@@ -103,7 +105,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_pfn,
hf_repo="MBZUAI/ArabicMMLU",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=["dev"],
@@ -164,7 +166,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_ht_pfn,
hf_repo="MBZUAI/human_translated_arabic_mmlu",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=None,
@@ -228,7 +230,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_mt_pfn,
hf_repo="OALL/Arabic_MMLU",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split="dev",
@@ -283,7 +285,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=acva_pfn,
hf_repo="OALL/ACVA",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -339,7 +341,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=aratrust_pfn,
hf_repo="asas-ai/AraTrust-categorized",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split=None,
@@ -387,7 +389,7 @@ def arabic_exams_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -437,7 +439,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=alghafa_pfn,
hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -463,7 +465,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -479,7 +481,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -495,7 +497,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -511,7 +513,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -527,7 +529,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -543,7 +545,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -580,7 +582,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -614,7 +616,7 @@ def copa_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -657,7 +659,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -693,7 +695,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -743,7 +745,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
version=0,
)
@@ -800,7 +802,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=madinah_qa_pfn,
hf_repo="MBZUAI/MadinahQA",
-metrics=[Metrics.loglikelihood_acc_norm],
+metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=["dev"],
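
Taken together, these hunks apply one mechanical migration: every use of the Metrics.loglikelihood_acc_norm shortcut becomes Metrics.loglikelihood_acc parametrized with an explicit character-length normalization. A minimal sketch of the new construction follows, assuming only the import paths introduced in this diff; the char_norm_acc variable name is illustrative and not part of the patch.

# Sketch of the migrated metric construction, using the refactored import
# paths shown above. The variable name below is illustrative only.
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import LogProbCharNorm

# The former Metrics.loglikelihood_acc_norm shortcut is expressed as an
# explicit character-length normalization passed through sample_params.
char_norm_acc = Metrics.loglikelihood_acc(
    sample_params={"logprob_normalization": LogProbCharNorm()}
)

# Each task config in this file then lists the parametrized metric:
#     metrics=[char_norm_acc]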