Skip to content

Commit e53587e

Browse files
authored
Upgrade to datasets 4.0 (#924)
* remove trust remote code for datasets and bump datasets to 4.0
* remove trust remote code for datasets in extended tasks
* remove trust remote code for datasets in multilingual tasks
* fixing tasks for datasets 4.0
* fixing custom task loading to be compatible with datasets 4.0
* fix lcb
* revert main_tasks.py
* revert registry.py
* raise instead of assert
1 parent 63424f4 commit e53587e

File tree

17 files changed

+216
-1895
lines changed

17 files changed

+216
-1895
lines changed

community_tasks/arabic_evals.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ def __init__(
111111
suite=["community"],
112112
generation_size=-1,
113113
stop_sequence=None,
114-
trust_dataset=True,
115114
version=0,
116115
)
117116

@@ -173,7 +172,6 @@ def __init__(
173172
suite=["community"],
174173
generation_size=-1,
175174
stop_sequence=None,
176-
trust_dataset=True,
177175
version=0,
178176
)
179177

@@ -238,7 +236,6 @@ def __init__(
238236
suite=["community"],
239237
generation_size=-1,
240238
stop_sequence=None,
241-
trust_dataset=True,
242239
version=0,
243240
)
244241

@@ -294,7 +291,6 @@ def __init__(
294291
suite=["community"],
295292
generation_size=-1,
296293
stop_sequence=None,
297-
trust_dataset=True,
298294
version=0,
299295
)
300296

@@ -351,7 +347,6 @@ def __init__(
351347
suite=["community"],
352348
generation_size=-1,
353349
stop_sequence=None,
354-
trust_dataset=True,
355350
version=0,
356351
)
357352

@@ -393,7 +388,6 @@ def arabic_exams_pfn(line, task_name: str = None):
393388
few_shots_split="validation",
394389
few_shots_select="sequential",
395390
metrics=[Metrics.loglikelihood_acc_norm],
396-
trust_dataset=True,
397391
version=0,
398392
)
399393

@@ -451,7 +445,6 @@ def __init__(
451445
suite=["community"],
452446
generation_size=-1,
453447
stop_sequence=None,
454-
trust_dataset=True,
455448
version=0,
456449
)
457450

@@ -471,7 +464,6 @@ def __init__(
471464
few_shots_split="validation",
472465
few_shots_select="sequential",
473466
metrics=[Metrics.loglikelihood_acc_norm],
474-
trust_dataset=True,
475467
version=0,
476468
)
477469

@@ -488,7 +480,6 @@ def __init__(
488480
few_shots_split="validation",
489481
few_shots_select="sequential",
490482
metrics=[Metrics.loglikelihood_acc_norm],
491-
trust_dataset=True,
492483
version=0,
493484
)
494485

@@ -505,7 +496,6 @@ def __init__(
505496
few_shots_split="validation",
506497
few_shots_select="sequential",
507498
metrics=[Metrics.loglikelihood_acc_norm],
508-
trust_dataset=True,
509499
version=0,
510500
)
511501

@@ -522,7 +512,6 @@ def __init__(
522512
few_shots_split="validation",
523513
few_shots_select="sequential",
524514
metrics=[Metrics.loglikelihood_acc_norm],
525-
trust_dataset=True,
526515
version=0,
527516
)
528517

@@ -539,7 +528,6 @@ def __init__(
539528
few_shots_split="validation",
540529
few_shots_select="sequential",
541530
metrics=[Metrics.loglikelihood_acc_norm],
542-
trust_dataset=True,
543531
version=0,
544532
)
545533

@@ -556,7 +544,6 @@ def __init__(
556544
few_shots_split="validation",
557545
few_shots_select="sequential",
558546
metrics=[Metrics.loglikelihood_acc_norm],
559-
trust_dataset=True,
560547
version=0,
561548
)
562549

@@ -594,7 +581,6 @@ def boolq_arabic_pfn(line, task_name: str = None):
594581
few_shots_split="validation",
595582
few_shots_select="sequential",
596583
metrics=[Metrics.loglikelihood_acc_norm],
597-
trust_dataset=True,
598584
version=0,
599585
)
600586

@@ -629,7 +615,6 @@ def copa_arabic_pfn(line, task_name: str = None):
629615
few_shots_split="validation",
630616
few_shots_select="sequential",
631617
metrics=[Metrics.loglikelihood_acc_norm],
632-
trust_dataset=True,
633618
version=0,
634619
)
635620

@@ -673,7 +658,6 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
673658
few_shots_split="validation",
674659
few_shots_select="sequential",
675660
metrics=[Metrics.loglikelihood_acc_norm],
676-
trust_dataset=True,
677661
version=0,
678662
)
679663

@@ -710,7 +694,6 @@ def toxigen_arabic_pfn(line, task_name: str = None):
710694
few_shots_split="validation",
711695
few_shots_select="sequential",
712696
metrics=[Metrics.loglikelihood_acc_norm],
713-
trust_dataset=True,
714697
version=0,
715698
)
716699

@@ -761,7 +744,6 @@ def sciq_arabic_pfn(line, task_name: str = None):
761744
few_shots_split="validation",
762745
few_shots_select="sequential",
763746
metrics=[Metrics.loglikelihood_acc_norm],
764-
trust_dataset=True,
765747
version=0,
766748
)
767749

@@ -826,7 +808,6 @@ def __init__(
826808
suite=["community"],
827809
generation_size=-1,
828810
stop_sequence=None,
829-
trust_dataset=True,
830811
version=0,
831812
)
832813

@@ -1038,7 +1019,6 @@ def process_judge_response(response) -> float:
10381019
hf_avail_splits=["train"],
10391020
evaluation_splits=["train"],
10401021
metrics=[wrapped_judge],
1041-
trust_dataset=True,
10421022
generation_size=200,
10431023
stop_sequence=[],
10441024
version=0,

community_tasks/french_evals.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ def prompt_bac_fr(line, task_name: str = None):
121121
generation_size=1,
122122
metrics=[Metrics.loglikelihood_acc],
123123
stop_sequence=["\n"],
124-
trust_dataset=True,
125124
version=0,
126125
)
127126

@@ -139,7 +138,6 @@ def prompt_bac_fr(line, task_name: str = None):
139138
generation_size=1,
140139
metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
141140
stop_sequence=["\n"],
142-
trust_dataset=True,
143141
version=0,
144142
)
145143

community_tasks/serbian_eval.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,10 +283,7 @@ def create_task_config(
283283
few_shots_select="sequential",
284284
metric=metric,
285285
generation_size=generation_size,
286-
# Since we use trust_dataset, we have to be careful about what is inside the dataset
287-
# script. We thus lock the revision to ensure that the script doesn't change
288286
hf_revision=HFSubsets.HF_REVISION.value,
289-
trust_dataset=True,
290287
version=0,
291288
)
292289

community_tasks/turkic_evals.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,6 @@ def __init__(
136136
suite=["community"],
137137
generation_size=-1,
138138
stop_sequence=None,
139-
trust_dataset=False,
140139
version=0,
141140
)
142141

docs/source/saving-and-reading-results.mdx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ The detail file contains the following columns:
182182
],
183183
"original_num_docs": 1319,
184184
"effective_num_docs": 1,
185-
"trust_dataset": true,
186185
"must_remove_duplicate_docs": null,
187186
"version": 0
188187
}

examples/custom_tasks_templates/custom_yourbench_task.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,6 @@ def yourbench_prompt(line, task_name: str = ""):
258258
generation_size=8192,
259259
metrics=[Metrics.yourbench_metrics],
260260
stop_sequence=[],
261-
trust_dataset=True,
262261
version=0,
263262
)
264263

examples/custom_tasks_templates/custom_yourbench_task_mcq.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,7 @@ def yourbench_prompt(line, task_name: str = ""):
9494
few_shots_split=None,
9595
few_shots_select=None,
9696
generation_size=8192,
97-
metric=[Metrics.yourbench_metrics],
98-
trust_dataset=True,
97+
metrics=[Metrics.yourbench_metrics],
9998
version=0,
10099
)
101100

examples/custom_tasks_tests.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
generation_size=512,
3939
metrics=[Metrics.expr_gold_metric],
4040
stop_sequence=None,
41-
trust_dataset=True,
4241
version=0,
4342
)
4443

@@ -55,7 +54,6 @@
5554
generation_size=2048,
5655
metrics=[Metrics.gpqa_instruct_pass_at_1_1n],
5756
stop_sequence=[], # no stop sequence, will use eos token
58-
trust_dataset=True,
5957
version=0,
6058
)
6159

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ def preprocess(text):
9090
hf_repo="hellaswag",
9191
hf_subset="default",
9292
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
93-
trust_dataset=True,
9493
stop_sequence=["\n"],
9594
),
9695
LightevalTaskConfig(
@@ -99,7 +98,6 @@ def preprocess(text):
9998
hf_repo="winogrande",
10099
hf_subset="winogrande_xl",
101100
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
102-
trust_dataset=True,
103101
stop_sequence=["\n"],
104102
),
105103
LightevalTaskConfig(
@@ -108,7 +106,6 @@ def preprocess(text):
108106
hf_repo="piqa",
109107
hf_subset="plain_text",
110108
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
111-
trust_dataset=True,
112109
stop_sequence=["\n"],
113110
),
114111
LightevalTaskConfig(
@@ -118,7 +115,6 @@ def preprocess(text):
118115
hf_subset="default",
119116
hf_avail_splits=["train", "validation"],
120117
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
121-
trust_dataset=True,
122118
stop_sequence=["\n"],
123119
),
124120
LightevalTaskConfig(
@@ -127,7 +123,6 @@ def preprocess(text):
127123
hf_repo="openbookqa",
128124
hf_subset="main",
129125
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
130-
trust_dataset=True,
131126
stop_sequence=["\n"],
132127
),
133128
LightevalTaskConfig(
@@ -138,7 +133,6 @@ def preprocess(text):
138133
evaluation_splits=["test"],
139134
generation_size=1,
140135
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
141-
trust_dataset=True,
142136
stop_sequence=["\n"],
143137
),
144138
LightevalTaskConfig(
@@ -149,7 +143,6 @@ def preprocess(text):
149143
evaluation_splits=["test"],
150144
generation_size=1,
151145
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
152-
trust_dataset=True,
153146
stop_sequence=["\n"],
154147
),
155148
LightevalTaskConfig(
@@ -158,7 +151,6 @@ def preprocess(text):
158151
hf_repo="commonsense_qa",
159152
hf_subset="default",
160153
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
161-
trust_dataset=True,
162154
stop_sequence=["\n"],
163155
),
164156
]
@@ -189,7 +181,6 @@ def natural_questions_prompt(line, task_name: str = None):
189181
hf_subset="rc.nocontext",
190182
metric=[Metrics.quasi_exact_match],
191183
generation_size=20,
192-
trust_dataset=True,
193184
stop_sequence=["\n", ".", ","],
194185
),
195186
LightevalTaskConfig(
@@ -199,7 +190,6 @@ def natural_questions_prompt(line, task_name: str = None):
199190
hf_subset="default",
200191
metric=[Metrics.quasi_exact_match],
201192
generation_size=20,
202-
trust_dataset=True,
203193
stop_sequence=["\n", ".", ","],
204194
),
205195
]
@@ -228,7 +218,6 @@ def boolq_prompt(line, task_name: str = None):
228218
hf_repo="super_glue",
229219
hf_subset="boolq",
230220
metric=[Metrics.target_perplexity],
231-
trust_dataset=True,
232221
stop_sequence=["\n"],
233222
),
234223
LightevalTaskConfig(
@@ -238,7 +227,6 @@ def boolq_prompt(line, task_name: str = None):
238227
hf_subset="deault",
239228
metric=[Metrics.quasi_exact_match],
240229
generation_size=20,
241-
trust_dataset=True,
242230
stop_sequence=["\n", ".", ","],
243231
),
244232
]
@@ -266,7 +254,6 @@ def __init__(
266254
few_shots_select=None,
267255
suite=["custom"],
268256
generation_size=40,
269-
trust_dataset=True,
270257
stop_sequence=None,
271258
):
272259
super().__init__(
@@ -281,7 +268,6 @@ def __init__(
281268
few_shots_select=few_shots_select,
282269
suite=suite,
283270
generation_size=generation_size,
284-
trust_dataset=trust_dataset,
285271
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
286272
)
287273

@@ -365,7 +351,6 @@ def __init__(
365351
few_shots_select=None,
366352
suite=None,
367353
generation_size=-1,
368-
trust_dataset=True,
369354
stop_sequence=None,
370355
):
371356
super().__init__(
@@ -380,7 +365,6 @@ def __init__(
380365
few_shots_select=few_shots_select,
381366
suite=suite,
382367
generation_size=generation_size,
383-
trust_dataset=trust_dataset,
384368
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
385369
)
386370

@@ -478,7 +462,6 @@ def __init__(
478462
few_shots_select=None,
479463
suite=None,
480464
generation_size=4,
481-
trust_dataset=True,
482465
stop_sequence=None,
483466
):
484467
super().__init__(
@@ -493,7 +476,6 @@ def __init__(
493476
few_shots_select=few_shots_select,
494477
suite=suite,
495478
generation_size=generation_size,
496-
trust_dataset=trust_dataset,
497479
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
498480
)
499481

@@ -610,7 +592,6 @@ def __init__(
610592
few_shots_select=None,
611593
suite=None,
612594
generation_size=-1,
613-
trust_dataset=True,
614595
stop_sequence=None,
615596
):
616597
super().__init__(
@@ -625,7 +606,6 @@ def __init__(
625606
few_shots_select=few_shots_select,
626607
suite=suite,
627608
generation_size=generation_size,
628-
trust_dataset=trust_dataset,
629609
stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
630610
)
631611

0 commit comments

Comments (0)