scandeval_benchmark_results.jsonl
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"mcc": 0.0, "macro_f1": 0.0900900900900901}, {"mcc": 0.0, "macro_f1": 0.08468314862176755}, {"mcc": 0.0, "macro_f1": 0.08468314862176755}, {"mcc": 0.03518123650734, "macro_f1": 0.08479555807121102}, {"mcc": 0.0, "macro_f1": 0.09443978765018161}, {"mcc": 0.0, "macro_f1": 0.08319088319088319}, {"mcc": 0.0, "macro_f1": 0.08616780045351474}, {"mcc": -0.0036060368195856957, "macro_f1": 0.16469493037508243}, {"mcc": 0.02257912970438084, "macro_f1": 0.16724245677399105}, {"mcc": 0.0, "macro_f1": 0.0900900900900901}]}, "total": {"test_mcc": 0.5415432939213515, "test_mcc_se": 0.7913889159308428, "test_macro_f1": 10.300778939385795, "test_macro_f1_se": 2.0678908978199537}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"mcc": 0.14210479515747024, "macro_f1": 0.41229256161252475}, {"mcc": 0.1411657773349638, "macro_f1": 0.4203779170963826}, {"mcc": 0.1168786150940225, "macro_f1": 0.38798330839283945}, {"mcc": 0.17231594799019773, "macro_f1": 0.4170092115595995}, {"mcc": 0.13322227188512387, "macro_f1": 0.40890647570276695}, {"mcc": 0.15725822891085028, "macro_f1": 0.42459544991433734}, {"mcc": 0.10950528992097042, "macro_f1": 0.36070389137586406}, {"mcc": 0.11427096701070942, "macro_f1": 0.4091052151197558}, {"mcc": 0.1393425457205052, "macro_f1": 0.4189395150470052}, {"mcc": 0.14875817207534298, "macro_f1": 0.44680110313625293}]}, "total": {"test_mcc": 13.748226111001566, "test_mcc_se": 1.228223297752718, "test_macro_f1": 41.06714648957329, "test_macro_f1_se": 1.4196556221998375}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"mcc": 0.13051125391335713, "macro_f1": 0.4318957238334333}, {"mcc": 0.16879211695530077, "macro_f1": 0.4567244831655859}, {"mcc": 0.06697798885332815, "macro_f1": 0.3977228975635459}, {"mcc": 0.12390404510055979, "macro_f1": 0.42362076733844295}, {"mcc": 0.1739802073624287, "macro_f1": 0.4592975154229794}, {"mcc": 0.12290606526999256, "macro_f1": 0.4067269670579232}, {"mcc": 0.20389257240932573, "macro_f1": 0.4711471814870132}, {"mcc": 0.13892825279916451, "macro_f1": 0.4041740490781322}, {"mcc": 0.04921286105485208, "macro_f1": 0.3527187131888094}, {"mcc": 0.12594692683194095, "macro_f1": 0.42554278885927016}]}, "total": {"test_mcc": 13.050522905502504, "test_mcc_se": 2.890870396101428, "test_macro_f1": 42.29571086995135, "test_macro_f1_se": 2.172427875155743}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"mcc": 0.05580777909173518, "macro_f1": 0.2549640006935328}, {"mcc": 0.06792531671192947, "macro_f1": 0.24714520909138848}, {"mcc": 0.08370201059338638, "macro_f1": 0.2773499933920241}, {"mcc": 0.07222815824762209, "macro_f1": 0.2726373609786899}, {"mcc": 0.1386969846173855, "macro_f1": 0.3460211038847634}, {"mcc": 0.07420907011215988, "macro_f1": 0.2800700498934714}, {"mcc": 0.10878535256451544, "macro_f1": 0.26377377720357686}, {"mcc": 0.09020644119650512, "macro_f1": 0.24905576748971434}, {"mcc": 0.07432872763453222, "macro_f1": 0.2627340075608145}, {"mcc": 0.07881762769989893, "macro_f1": 0.2988187293445112}]}, "total": {"test_mcc": 8.447074684696702, "test_mcc_se": 1.4682556296300564, "test_macro_f1": 27.52569999532487, "test_macro_f1_se": 1.821663338171289}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.1893687707641196, "micro_f1": 0.23060921248142646}, {"micro_f1_no_misc": 0.35580524344569286, "micro_f1": 0.2897669706180344}, {"micro_f1_no_misc": 0.3421828908554572, "micro_f1": 0.33247022268254794}, {"micro_f1_no_misc": 0.32555970149253727, "micro_f1": 0.2976889933411672}, {"micro_f1_no_misc": 0.302158273381295, "micro_f1": 0.24080499653018736}, {"micro_f1_no_misc": 0.1818181818181818, "micro_f1": 0.12893982808022922}, {"micro_f1_no_misc": 0.23813354786806112, "micro_f1": 0.25408203902827564}, {"micro_f1_no_misc": 0.322257053291536, "micro_f1": 0.2682803832576904}, {"micro_f1_no_misc": 0.37789395070948467, "micro_f1": 0.29402084476138235}, {"micro_f1_no_misc": 0.3398791540785499, "micro_f1": 0.2976835879743716}]}, "total": {"test_micro_f1_no_misc": 29.75056767704915, "test_micro_f1_no_misc_se": 4.317748536357651, "test_micro_f1": 26.343470787553123, "test_micro_f1_se": 3.489716666226831}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.0014124293785310734, "micro_f1": 0.006028636021100225}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0006682258603407951}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0}, {"micro_f1_no_misc": 0.01658374792703151, "micro_f1": 0.008838891120932101}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0}, {"micro_f1_no_misc": 0.001949317738791423, "micro_f1": 0.0035714285714285713}, {"micro_f1_no_misc": 0.003924133420536297, "micro_f1": 0.008224006265909536}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0016934801016088058}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0038180082723512565}]}, "total": {"test_micro_f1_no_misc": 0.23869628464890302, "test_micro_f1_no_misc_se": 0.3195584160958439, "test_micro_f1": 0.3284267621367129, "test_micro_f1_se": 0.21159053024134472}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"mcc": 0.0010984626312021111, "macro_f1": 0.4645627388190158}, {"mcc": -0.014146776093001008, "macro_f1": 0.4500997137263932}, {"mcc": 0.016653106912578226, "macro_f1": 0.35468560058543547}, {"mcc": -0.04256608051658233, "macro_f1": 0.3494180700953121}, {"mcc": -0.0023797529917253643, "macro_f1": 0.4813303988113953}, {"mcc": 0.005797215400694153, "macro_f1": 0.49380203515263643}, {"mcc": -0.004786392620691229, "macro_f1": 0.41499965767531233}, {"mcc": 0.0442712618886001, "macro_f1": 0.5216562286255795}, {"mcc": 0.008705376669072331, "macro_f1": 0.4828107237906938}, {"mcc": -0.0032328381390087334, "macro_f1": 0.4951204031532848}]}, "total": {"test_mcc": 0.09413583141138258, "test_mcc_se": 1.3688905701314587, "test_macro_f1": 45.08485570435059, "test_macro_f1_se": 3.683009141099775}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3522123893805309, "micro_f1": 0.24994869690129284}, {"micro_f1_no_misc": 0.36029648727038355, "micro_f1": 0.27732027598486536}, {"micro_f1_no_misc": 0.4241364232619152, "micro_f1": 0.26405867970660146}, {"micro_f1_no_misc": 0.3668215945656059, "micro_f1": 0.27756906077348065}, {"micro_f1_no_misc": 0.4521815777875716, "micro_f1": 0.28144280968201235}, {"micro_f1_no_misc": 0.3844427823485415, "micro_f1": 0.2823529411764706}, {"micro_f1_no_misc": 0.34620786516853935, "micro_f1": 0.250992501102779}, {"micro_f1_no_misc": 0.36009933774834435, "micro_f1": 0.2466920834267773}, {"micro_f1_no_misc": 0.41119221411192214, "micro_f1": 0.24740843123704212}, {"micro_f1_no_misc": 0.3379962192816635, "micro_f1": 0.24788334901222953}]}, "total": {"test_micro_f1_no_misc": 37.95586890925018, "test_micro_f1_no_misc_se": 2.336090283399193, "test_micro_f1": 26.25668829003552, "test_micro_f1_se": 0.965369616651723}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"mcc": 0.03449756127928086, "macro_f1": 0.42245919163699086}, {"mcc": 0.051246017510790325, "macro_f1": 0.41087129435264425}, {"mcc": -0.02392848408706393, "macro_f1": 0.4138442391344006}, {"mcc": 0.1100882206078603, "macro_f1": 0.5124660938489776}, {"mcc": 0.10488021690660541, "macro_f1": 0.5517372658545335}, {"mcc": -0.03410333151804544, "macro_f1": 0.4077699549513097}, {"mcc": 0.018663848329591087, "macro_f1": 0.36480029929460606}, {"mcc": 0.024977609157634494, "macro_f1": 0.35315395846369296}, {"mcc": 0.10692899943927427, "macro_f1": 0.43439273762641667}, {"mcc": 0.04098580128983545, "macro_f1": 0.5196353883861102}]}, "total": {"test_mcc": 4.342364589157628, "test_mcc_se": 3.1940489543703725, "test_macro_f1": 43.911304235496814, "test_macro_f1_se": 4.141237318164313}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.5105234754452239, "micro_f1": 0.3318824809575626}, {"micro_f1_no_misc": 0.5191183085919928, "micro_f1": 0.4321236559139785}, {"micro_f1_no_misc": 0.5301897642323175, "micro_f1": 0.3750414868901427}, {"micro_f1_no_misc": 0.5306504445484325, "micro_f1": 0.39655172413793105}, {"micro_f1_no_misc": 0.5393380358111775, "micro_f1": 0.32949512843224094}, {"micro_f1_no_misc": 0.5267620561738208, "micro_f1": 0.38326778691455127}, {"micro_f1_no_misc": 0.45857260049220677, "micro_f1": 0.33506422519814155}, {"micro_f1_no_misc": 0.5229455709711845, "micro_f1": 0.3310988050131157}, {"micro_f1_no_misc": 0.4613908872901679, "micro_f1": 0.2894812267125251}, {"micro_f1_no_misc": 0.5047923322683705, "micro_f1": 0.40144927536231884}]}, "total": {"test_micro_f1_no_misc": 51.042834758248944, "test_micro_f1_no_misc_se": 1.7603263703490695, "test_micro_f1": 36.05455795532508, "test_micro_f1_se": 2.707328808769377}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"em": 0.0, "f1": 1.2752742255770138}, {"em": 0.0, "f1": 1.624113420996427}, {"em": 0.0, "f1": 2.079259704920521}, {"em": 0.0, "f1": 1.518124970364732}, {"em": 0.0, "f1": 1.7297819319761019}, {"em": 0.0, "f1": 1.6752812969875823}, {"em": 0.0, "f1": 1.7184780776276534}, {"em": 0.0, "f1": 1.7815550064160042}, {"em": 0.0, "f1": 1.5208408532916533}, {"em": 0.0, "f1": 2.0250606527530595}]}, "total": {"test_em": 0.0, "test_em_se": 0.0, "test_f1": 1.6947770140910747, "test_f1_se": 0.14740559233930667}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"mcc": 0.1698272035808607, "macro_f1": 0.4911758820783032}, {"mcc": 0.1251342045847769, "macro_f1": 0.4405690973720621}, {"mcc": 0.21063616235021748, "macro_f1": 0.500770105472447}, {"mcc": 0.167715600142541, "macro_f1": 0.46130665228635936}, {"mcc": 0.20525439125372533, "macro_f1": 0.5197141668607492}, {"mcc": 0.18525643056313829, "macro_f1": 0.5027775974923498}, {"mcc": 0.16209594620555912, "macro_f1": 0.4751015425265005}, {"mcc": 0.23146017098538363, "macro_f1": 0.5886050079201292}, {"mcc": 0.21136948466424393, "macro_f1": 0.5069933500505635}, {"mcc": 0.21101776725430893, "macro_f1": 0.5461068086975793}]}, "total": {"test_mcc": 18.797673615847554, "test_mcc_se": 1.9759638850254158, "test_macro_f1": 50.33120210757043, "test_macro_f1_se": 2.616586494241302}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"em": 58.48179705654531, "f1": 67.95173669576833}, {"em": 58.83720930232558, "f1": 68.86658101595874}, {"em": 56.414219474497685, "f1": 68.34556993531962}, {"em": 57.320872274143305, "f1": 66.38732887776843}, {"em": 55.67567567567568, "f1": 67.71684218645726}, {"em": 57.59444872783346, "f1": 67.30342319595377}, {"em": 55.50493545937737, "f1": 68.06101862428484}, {"em": 57.09852598913887, "f1": 66.76381058498012}, {"em": 55.76470588235294, "f1": 67.59322367353178}, {"em": 58.22981366459627, "f1": 69.40066797367687}]}, "total": {"test_em": 57.092220350648645, "test_em_se": 0.7541016393777683, "test_f1": 67.83902027636996, "test_f1_se": 0.5634937281208867}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"mcc": 0.21834846602288802, "macro_f1": 0.5908190943120557}, {"mcc": 0.19930821516603095, "macro_f1": 0.5954372034472741}, {"mcc": 0.18628631436380622, "macro_f1": 0.5906362278398487}, {"mcc": 0.21444753101490416, "macro_f1": 0.5966507274038504}, {"mcc": 0.21871538821569983, "macro_f1": 0.6016802569804793}, {"mcc": 0.12037756594290688, "macro_f1": 0.5512728656318463}, {"mcc": 0.226842502624602, "macro_f1": 0.5681564132268357}, {"mcc": 0.2214852984312921, "macro_f1": 0.6101996100219348}, {"mcc": 0.22754758812504697, "macro_f1": 0.6113598724161198}, {"mcc": 0.1962253380198746, "macro_f1": 0.5767820438632378}]}, "total": {"test_mcc": 20.295842079270518, "test_mcc_se": 1.991506609160339, "test_macro_f1": 58.92994315143483, "test_macro_f1_se": 1.172784739416614}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"bertscore": 0.34054765962355305, "rouge_l": 0.0388706043724462}, {"bertscore": 0.33886752387479646, "rouge_l": 0.03809230774474156}, {"bertscore": 0.34334398923965637, "rouge_l": 0.03824475540409212}, {"bertscore": 0.35355640923808096, "rouge_l": 0.03988905353730709}, {"bertscore": 0.33517017901613144, "rouge_l": 0.03782594702053496}, {"bertscore": 0.338666006384301, "rouge_l": 0.03796456567110065}, {"bertscore": 0.3379762795884744, "rouge_l": 0.03927520318085238}, {"bertscore": 0.33997942375572165, "rouge_l": 0.03803733443435173}, {"bertscore": 0.3350590055415523, "rouge_l": 0.038586995471017266}, {"bertscore": 0.33743464593135286, "rouge_l": 0.03855868603030231}]}, "total": {"test_bertscore": 34.006011221936205, "test_bertscore_se": 0.33101624412126907, "test_rouge_l": 3.8534545286674624, "test_rouge_l_se": 0.0405556125416091}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"mcc": 0.0, "accuracy": 0.24267578125}, {"mcc": 0.019612742233523005, "accuracy": 0.255859375}, {"mcc": 0.012253350259811323, "accuracy": 0.24169921875}, {"mcc": -0.00883157321773685, "accuracy": 0.2451171875}, {"mcc": 0.010830153400196978, "accuracy": 0.23876953125}, {"mcc": 0.00015896865270103092, "accuracy": 0.2412109375}, {"mcc": -0.019902207133244764, "accuracy": 0.25537109375}, {"mcc": 0.005226174584226169, "accuracy": 0.25244140625}, {"mcc": 0.029585027289311356, "accuracy": 0.24951171875}, {"mcc": 0.02434294347442651, "accuracy": 0.2529296875}]}, "total": {"test_mcc": 0.7327557954321476, "test_mcc_se": 0.9432583259168265, "test_accuracy": 24.755859375, "test_accuracy_se": 0.3963953144705439}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"em": 54.841208365608054, "f1": 66.66056750471118}, {"em": 54.34108527131783, "f1": 66.49173017048473}, {"em": 50.07727975270479, "f1": 65.309633363045}, {"em": 54.12772585669782, "f1": 66.24297402858551}, {"em": 54.826254826254825, "f1": 66.82088268598653}, {"em": 55.66692367000771, "f1": 67.83710011971938}, {"em": 52.54365983295368, "f1": 67.21316656382268}, {"em": 54.38324282389449, "f1": 66.22679199593114}, {"em": 51.529411764705884, "f1": 65.6025983062041}, {"em": 51.940993788819874, "f1": 66.27173065978809}]}, "total": {"test_em": 53.4277785952965, "test_em_se": 1.1130869635127556, "test_f1": 66.46771753982783, "test_f1_se": 0.45405266457404836}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"accuracy": 0.23486328125, "mcc": 0.017760572539733073}, {"accuracy": 0.236328125, "mcc": 0.0}, {"accuracy": 0.2275390625, "mcc": 0.0}, {"accuracy": 0.2451171875, "mcc": 0.0}, {"accuracy": 0.2333984375, "mcc": 0.0}, {"accuracy": 0.2392578125, "mcc": 0.0}, {"accuracy": 0.23193359375, "mcc": 0.0}, {"accuracy": 0.24365234375, "mcc": 0.0}, {"accuracy": 0.232421875, "mcc": 0.0}, {"accuracy": 0.236328125, "mcc": 0.0}]}, "total": {"test_accuracy": 23.6083984375, "test_accuracy_se": 0.3340482357014146, "test_mcc": 0.17760572539733074, "test_mcc_se": 0.3481072217787682}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "yhavinga/gpt2-medium-dutch", "results": {"raw": {"test": [{"test_speed": 5001.2699999999995, "test_speed_short": 597.3}, {"test_speed": 9084.73, "test_speed_short": 1149.56}, {"test_speed": 12187.949999999999, "test_speed_short": 2224.1400000000003}, {"test_speed": 15068.46, "test_speed_short": 2685.22}, {"test_speed": 16662.77, "test_speed_short": 3151.7999999999997}, {"test_speed": 18661.79, "test_speed_short": 4175.35}, {"test_speed": 19904.489999999998, "test_speed_short": 4662.17}, {"test_speed": 20470.230000000003, "test_speed_short": 4988.58}, {"test_speed": 20408.85, "test_speed_short": 5460.65}, {"test_speed": 20378.160000000003, "test_speed_short": 5920.070000000001}]}, "total": {"test_speed": 15782.87, "test_speed_se": 3363.7448341428812, "test_speed_short": 3501.4840000000004, "test_speed_short_se": 1133.23931900599}}, "num_model_parameters": 379988992, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"mcc": 0.0036877862718103912, "macro_f1": 0.20006399488040957}, {"mcc": 0.036843868693738686, "macro_f1": 0.21729680110478247}, {"mcc": -0.0024147356512068644, "macro_f1": 0.19498228935342374}, {"mcc": -0.003128771026850043, "macro_f1": 0.1955089028148771}, {"mcc": 0.029718305671185084, "macro_f1": 0.21170906428408878}, {"mcc": 0.007418951217455495, "macro_f1": 0.20340296255517834}, {"mcc": 0.012093666405210318, "macro_f1": 0.20893509563722332}, {"mcc": 0.008311064042481007, "macro_f1": 0.19339113081383674}, {"mcc": 0.003749916003188481, "macro_f1": 0.20485099163423467}, {"mcc": 0.02300091132741035, "macro_f1": 0.21312483812483815}]}, "total": {"test_mcc": 1.1928096295442292, "test_mcc_se": 0.8419002103188777, "test_macro_f1": 20.43266071202893, "test_macro_f1_se": 0.5166393889289187}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"mcc": 0.005632482876553763, "macro_f1": 0.4951750854715814}, {"mcc": -0.019887954783119947, "macro_f1": 0.4840108242450406}, {"mcc": 0.00971351981583027, "macro_f1": 0.49422668378210566}, {"mcc": 0.002355030410635424, "macro_f1": 0.4916115627469222}, {"mcc": -0.0004766674952165763, "macro_f1": 0.4907256116836162}, {"mcc": 0.025139100561017164, "macro_f1": 0.500959217773377}, {"mcc": 0.004518914693420824, "macro_f1": 0.490904160357958}, {"mcc": -0.021077281987841208, "macro_f1": 0.48179949724729293}, {"mcc": 0.015149181850107534, "macro_f1": 0.5003090455737429}, {"mcc": -0.012485104256956805, "macro_f1": 0.48620632289636495}]}, "total": {"test_mcc": 0.08581221684430444, "test_mcc_se": 0.9244239662267806, "test_macro_f1": 49.15928011778002, "test_macro_f1_se": 0.39585136963309053}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"em": 56.390395042602634, "f1": 67.67993973471589}, {"em": 56.89922480620155, "f1": 67.83917352393539}, {"em": 54.01854714064915, "f1": 67.73724266761488}, {"em": 54.283489096573206, "f1": 66.05339775138516}, {"em": 53.745173745173744, "f1": 65.21436078214148}, {"em": 55.35851966075559, "f1": 66.5833468988403}, {"em": 53.83447228549734, "f1": 67.47458908096995}, {"em": 54.30566330488751, "f1": 65.93425865068257}, {"em": 52.94117647058823, "f1": 65.79772077211513}, {"em": 53.1055900621118, "f1": 66.88785304887917}]}, "total": {"test_em": 54.488225161504076, "test_em_se": 0.8204207908859052, "test_f1": 66.72018829112798, "test_f1_se": 0.5845185265261333}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"em": 0.0, "f1": 6.840735025571101}, {"em": 0.0, "f1": 6.778534585529612}, {"em": 0.0, "f1": 7.914395314319883}, {"em": 0.0, "f1": 6.502245978921603}, {"em": 0.0, "f1": 6.8915907726407255}, {"em": 0.0, "f1": 6.006298593617691}, {"em": 0.0, "f1": 8.49910955058404}, {"em": 0.0, "f1": 6.444120978564206}, {"em": 0.0, "f1": 7.543988873447243}, {"em": 0.0, "f1": 7.491357224120327}]}, "total": {"test_em": 0.0, "test_em_se": 0.0, "test_f1": 7.091237689731644, "test_f1_se": 0.4696702754677793}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"mcc": 0.00045232589036133447, "accuracy": 0.26171875}, {"mcc": 0.002147058904407007, "accuracy": 0.2548828125}, {"mcc": -0.03383304381189186, "accuracy": 0.234375}, {"mcc": -0.002896169944308919, "accuracy": 0.2568359375}, {"mcc": 0.00582093522490666, "accuracy": 0.25634765625}, {"mcc": -8.074670064350477e-06, "accuracy": 0.2529296875}, {"mcc": 0.027781957046776058, "accuracy": 0.27197265625}, {"mcc": 0.019506587210523937, "accuracy": 0.25830078125}, {"mcc": -0.012724865986013698, "accuracy": 0.248046875}, {"mcc": 0.008386046451964956, "accuracy": 0.25439453125}]}, "total": {"test_mcc": 0.14632756316661127, "test_mcc_se": 1.0442998988905334, "test_accuracy": 25.498046875000004, "test_accuracy_se": 0.5943875500476878}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"bertscore": 0.6107027929319884, "rouge_l": 0.17040447601473083}, {"bertscore": 0.6220110561844194, "rouge_l": 0.17425752066262618}, {"bertscore": 0.6174783017340815, "rouge_l": 0.1677386301214981}, {"bertscore": 0.6312416284054052, "rouge_l": 0.18405213048135524}, {"bertscore": 0.6053530646458967, "rouge_l": 0.15498276105816525}, {"bertscore": 0.6079172861936968, "rouge_l": 0.1642050912087077}, {"bertscore": 0.6056068968609907, "rouge_l": 0.15112618046245768}, {"bertscore": 0.5995554115943378, "rouge_l": 0.15504044210443219}, {"bertscore": 0.5953657484715222, "rouge_l": 0.14754452845712557}, {"bertscore": 0.6153296866541496, "rouge_l": 0.1535819079330772}]}, "total": {"test_bertscore": 61.105618736764875, "test_bertscore_se": 0.6655101070470905, "test_rouge_l": 16.229336685041755, "test_rouge_l_se": 0.7274784146831054}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"accuracy": 0.2392578125, "mcc": -0.01152820269032297}, {"accuracy": 0.2451171875, "mcc": 0.004463747233222901}, {"accuracy": 0.2421875, "mcc": -0.0023616195079204393}, {"accuracy": 0.24658203125, "mcc": 0.0050580769027641906}, {"accuracy": 0.24609375, "mcc": -0.004645874593754573}, {"accuracy": 0.2548828125, "mcc": 0.014950292623699762}, {"accuracy": 0.25634765625, "mcc": 0.008967787696398645}, {"accuracy": 0.24609375, "mcc": 0.017647829928974654}, {"accuracy": 0.2392578125, "mcc": 0.009985931647144175}, {"accuracy": 0.24462890625, "mcc": 0.002092627863133643}]}, "total": {"test_accuracy": 24.6044921875, "test_accuracy_se": 0.35450368964734136, "test_mcc": 0.4463059710333999, "test_mcc_se": 0.5566220964561331}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"test_speed": 1167.65, "test_speed_short": 134.81}, {"test_speed": 2190.69, "test_speed_short": 258.25}, {"test_speed": 3082.9399999999996, "test_speed_short": 495.39}, {"test_speed": 3828.7599999999998, "test_speed_short": 616.7099999999999}, {"test_speed": 4021.57, "test_speed_short": 725.62}, {"test_speed": 4011.35, "test_speed_short": 945.75}, {"test_speed": 4021.57, "test_speed_short": 1060.57}, {"test_speed": 4011.35, "test_speed_short": 1171.28}, {"test_speed": 4011.35, "test_speed_short": 1290.1}, {"test_speed": 4011.35, "test_speed_short": 1403.6}]}, "total": {"test_speed": 3435.8579999999993, "test_speed_se": 619.2902257885153, "test_speed_short": 810.208, "test_speed_short_se": 269.2465238611504}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"mcc": 0.20806091757488201, "accuracy": 0.4033203125}, {"mcc": 0.2359136192225932, "accuracy": 0.41357421875}, {"mcc": 0.23494720232324753, "accuracy": 0.42431640625}, {"mcc": 0.22472021208652523, "accuracy": 0.408203125}, {"mcc": 0.22462959082834383, "accuracy": 0.40673828125}, {"mcc": 0.248668759630351, "accuracy": 0.42724609375}, {"mcc": 0.24153422281595816, "accuracy": 0.41943359375}, {"mcc": 0.24823434318423107, "accuracy": 0.4228515625}, {"mcc": 0.21678426449078567, "accuracy": 0.4111328125}, {"mcc": 0.25283678664241177, "accuracy": 0.43017578125}]}, "total": {"test_mcc": 23.3632991879933, "test_mcc_se": 0.9183289305973856, "test_accuracy": 41.669921875, "test_accuracy_se": 0.5797218936337779}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"mcc": 0.08203995890567481, "macro_f1": 0.3052014910290955}, {"mcc": -0.032211250693467336, "macro_f1": 0.2387669605304347}, {"mcc": 0.021891740330394416, "macro_f1": 0.218581703285218}, {"mcc": 0.09279387365805307, "macro_f1": 0.4029329992866712}, {"mcc": 0.045034002388852745, "macro_f1": 0.2993757373820362}, {"mcc": 0.05204694050137779, "macro_f1": 0.35139593114241}, {"mcc": 0.044886003459791816, "macro_f1": 0.233445385354862}, {"mcc": 0.10507850409082183, "macro_f1": 0.34590163264766954}, {"mcc": 0.008903260521321626, "macro_f1": 0.2690436353182088}, {"mcc": -0.03858878668577801, "macro_f1": 0.1463731377397303}]}, "total": {"test_mcc": 3.818742464770427, "test_mcc_se": 3.0390065300396047, "test_macro_f1": 28.11018613716336, "test_macro_f1_se": 4.673375259677534}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"bertscore": 0.6827032353758113, "rouge_l": 0.2114771969752503}, {"bertscore": 0.6778731114027323, "rouge_l": 0.20577893494347244}, {"bertscore": 0.6804254593444057, "rouge_l": 0.2101878359518688}, {"bertscore": 0.6891312246152665, "rouge_l": 0.21835270485719677}, {"bertscore": 0.6797829088172875, "rouge_l": 0.1892084582459262}, {"bertscore": 0.6959275831031846, "rouge_l": 0.21990579647146652}, {"bertscore": 0.6709579722955823, "rouge_l": 0.19069577392730308}, {"bertscore": 0.682601383261499, "rouge_l": 0.21129469287300606}, {"bertscore": 0.6866932107805042, "rouge_l": 0.21559637704612478}, {"bertscore": 0.6857721621636301, "rouge_l": 0.207475729415879}]}, "total": {"test_bertscore": 68.31868251159904, "test_bertscore_se": 0.42047991791935135, "test_rouge_l": 20.79973500707494, "test_rouge_l_se": 0.6507574405218374}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.2909545644018341, "micro_f1": 0.2400698384984723}, {"micro_f1_no_misc": 0.4280381584404811, "micro_f1": 0.3366392745820346}, {"micro_f1_no_misc": 0.42082429501084595, "micro_f1": 0.27786582144743793}, {"micro_f1_no_misc": 0.3312741312741313, "micro_f1": 0.30790880945679705}, {"micro_f1_no_misc": 0.5105932203389831, "micro_f1": 0.32175102599179206}, {"micro_f1_no_misc": 0.4580703336339044, "micro_f1": 0.35397160978556325}, {"micro_f1_no_misc": 0.35203094777562866, "micro_f1": 0.28}, {"micro_f1_no_misc": 0.3447432762836186, "micro_f1": 0.3094158442251267}, {"micro_f1_no_misc": 0.40768841679312084, "micro_f1": 0.2979683972911964}, {"micro_f1_no_misc": 0.3494029190623618, "micro_f1": 0.2780401416765053}]}, "total": {"test_micro_f1_no_misc": 38.936202630149104, "test_micro_f1_no_misc_se": 4.146418625749279, "test_micro_f1": 30.03630762954926, "test_micro_f1_se": 2.0500800792986844}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"accuracy": 0.2744140625, "mcc": 0.058539542986352236}, {"accuracy": 0.263671875, "mcc": 0.03551441468873122}, {"accuracy": 0.2939453125, "mcc": 0.06663194520785531}, {"accuracy": 0.24609375, "mcc": 0.002793102274219177}, {"accuracy": 0.2841796875, "mcc": 0.05735431911727611}, {"accuracy": 0.2705078125, "mcc": 0.03146906083054729}, {"accuracy": 0.2802734375, "mcc": 0.043666182246918775}, {"accuracy": 0.25439453125, "mcc": 0.03559573823721227}, {"accuracy": 0.255859375, "mcc": 0.029562858579123965}, {"accuracy": 0.279296875, "mcc": 0.059456432440672426}]}, "total": {"test_accuracy": 27.0263671875, "test_accuracy_se": 0.9332622342049633, "test_mcc": 4.205835966089088, "test_mcc_se": 1.1885199741996149}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-1800", "results": {"raw": {"test": [{"test_speed": 1290.0700000000002, "test_speed_short": 160.24}, {"test_speed": 2165.7599999999998, "test_speed_short": 300.45000000000005}, {"test_speed": 3131.24, "test_speed_short": 574.78}, {"test_speed": 4085.74, "test_speed_short": 728.28}, {"test_speed": 4987.71, "test_speed_short": 864.3000000000001}, {"test_speed": 5637.1900000000005, "test_speed_short": 1140.0}, {"test_speed": 6432.099999999999, "test_speed_short": 1264.0}, {"test_speed": 6692.73, "test_speed_short": 1358.94}, {"test_speed": 7319.599999999999, "test_speed_short": 1481.2199999999998}, {"test_speed": 7395.55, "test_speed_short": 1606.4999999999998}]}, "total": {"test_speed": 4913.769, "test_speed_se": 1346.7525897145536, "test_speed_short": 947.8709999999999, "test_speed_short_se": 310.4536534920324}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"mcc": 0.2825152024532061, "accuracy": 0.45849609375}, {"mcc": 0.24980479917754547, "accuracy": 0.43017578125}, {"mcc": 0.27283793369866877, "accuracy": 0.451171875}, {"mcc": 0.25273083508825817, "accuracy": 0.43408203125}, {"mcc": 0.248274195479924, "accuracy": 0.4267578125}, {"mcc": 0.2937006090531417, "accuracy": 0.46484375}, {"mcc": 0.280163590220334, "accuracy": 0.44580078125}, {"mcc": 0.2800712025510012, "accuracy": 0.45556640625}, {"mcc": 0.2849606756270155, "accuracy": 0.45068359375}, {"mcc": 0.25888108331354137, "accuracy": 0.43408203125}]}, "total": {"test_mcc": 27.039401266626363, "test_mcc_se": 1.0236945624310403, "test_accuracy": 44.5166015625, "test_accuracy_se": 0.8132012912524624}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"bertscore": 0.6735337146383245, "rouge_l": 0.20767988197613113}, {"bertscore": 0.6728916375141125, "rouge_l": 0.2031965419790805}, {"bertscore": 0.6805166472040582, "rouge_l": 0.2140032648140755}, {"bertscore": 0.6613044391560834, "rouge_l": 0.20370686122133563}, {"bertscore": 0.6685160191555042, "rouge_l": 0.18662695262986362}, {"bertscore": 0.6633559162582969, "rouge_l": 0.20897509977603795}, {"bertscore": 0.6673091858974658, "rouge_l": 0.19310708069647026}, {"bertscore": 0.6727880279504461, "rouge_l": 0.20154266043473412}, {"bertscore": 0.6734737599035725, "rouge_l": 0.2162700822399044}, {"bertscore": 0.6524381520866882, "rouge_l": 0.18195308487710704}]}, "total": {"test_bertscore": 66.86127499764552, "test_bertscore_se": 0.49310952383152756, "test_rouge_l": 20.170615106447404, "test_rouge_l_se": 0.7005033998167418}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"mcc": 0.2005071908823581, "macro_f1": 0.46539064658420654}, {"mcc": 0.14626832821618962, "macro_f1": 0.41106037922534683}, {"mcc": 0.14162066958890712, "macro_f1": 0.38909284106539416}, {"mcc": 0.1784734242140364, "macro_f1": 0.4528270454983638}, {"mcc": 0.17825977384405614, "macro_f1": 0.4035185029554304}, {"mcc": 0.12889306713005988, "macro_f1": 0.40737858115476255}, {"mcc": 0.1491918389852125, "macro_f1": 0.39550011222520737}, {"mcc": 0.09140625461214956, "macro_f1": 0.3525360020269819}, {"mcc": 0.11683086368350903, "macro_f1": 0.4052705871935726}, {"mcc": 0.13461103354581094, "macro_f1": 0.4456672283518445}]}, "total": {"test_mcc": 14.66062444702289, "test_mcc_se": 1.9972865312371662, "test_macro_f1": 41.2824192628111, "test_macro_f1_se": 2.078022218651918}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"mcc": 0.3537602048165111, "macro_f1": 0.6740583554376658}, {"mcc": 0.37572235473551635, "macro_f1": 0.6697665741854308}, {"mcc": 0.3302513125344394, "macro_f1": 0.6497143715447193}, {"mcc": 0.3944122633560364, "macro_f1": 0.6962912787070948}, {"mcc": 0.22876506658836157, "macro_f1": 0.6029295143910063}, {"mcc": 0.2223428516439942, "macro_f1": 0.6043227342415896}, {"mcc": 0.36403671785619285, "macro_f1": 0.6658001646426804}, {"mcc": 0.1712099043714696, "macro_f1": 0.5357278834020736}, {"mcc": 0.3205717057534322, "macro_f1": 0.6254463504944459}, {"mcc": 0.31267303517056355, "macro_f1": 0.6514505475329138}]}, "total": {"test_mcc": 30.73745416826517, "test_mcc_se": 4.632932775400265, "test_macro_f1": 63.75507774579621, "test_macro_f1_se": 2.905493538572061}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.2303899596593456, "micro_f1": 0.24600456621004568}, {"micro_f1_no_misc": 0.3326904532304725, "micro_f1": 0.2983023443815683}, {"micro_f1_no_misc": 0.32903225806451614, "micro_f1": 0.3113245298119248}, {"micro_f1_no_misc": 0.2867557715674362, "micro_f1": 0.270326615705351}, {"micro_f1_no_misc": 0.4073139974779319, "micro_f1": 0.2919275123558484}, {"micro_f1_no_misc": 0.3945480631276901, "micro_f1": 0.3125}, {"micro_f1_no_misc": 0.3515731874145007, "micro_f1": 0.3180154534363563}, {"micro_f1_no_misc": 0.27093821510297483, "micro_f1": 0.24562767197823554}, {"micro_f1_no_misc": 0.37484586929716396, "micro_f1": 0.3038099950519545}, {"micro_f1_no_misc": 0.3721236028928337, "micro_f1": 0.32688172043010755}]}, "total": {"test_micro_f1_no_misc": 33.50211377834866, "test_micro_f1_no_misc_se": 3.5443820270892137, "test_micro_f1": 29.247204093613917, "test_micro_f1_se": 1.8011046324482143}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"mcc": 0.05500983542310893, "macro_f1": 0.37329587805230474}, {"mcc": 0.054079993496644725, "macro_f1": 0.4021798761985678}, {"mcc": 0.020179648674724094, "macro_f1": 0.3760908911707256}, {"mcc": 0.09600150798925998, "macro_f1": 0.4504174300904714}, {"mcc": 0.048088020852890635, "macro_f1": 0.45056696052575956}, {"mcc": -0.021362059790985634, "macro_f1": 0.4276258447546264}, {"mcc": 0.07622945281604071, "macro_f1": 0.37839406720853475}, {"mcc": 0.08156608810178637, "macro_f1": 0.37201856672712397}, {"mcc": 0.1066518107435261, "macro_f1": 0.42980167864034785}, {"mcc": 0.09121102009164977, "macro_f1": 0.4804512870733748}]}, "total": {"test_mcc": 6.076553183986457, "test_mcc_se": 2.407295951789148, "test_macro_f1": 41.40842480441837, "test_macro_f1_se": 2.4251162784662363}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"mcc": 0.22605359110121193, "accuracy": 0.4169921875}, {"mcc": 0.23394453492215464, "accuracy": 0.41943359375}, {"mcc": 0.2830665655292639, "accuracy": 0.46240234375}, {"mcc": 0.2305253177518992, "accuracy": 0.41357421875}, {"mcc": 0.2548074951896862, "accuracy": 0.41552734375}, {"mcc": 0.2504422100114556, "accuracy": 0.42431640625}, {"mcc": 0.30053655618309844, "accuracy": 0.44775390625}, {"mcc": 0.2383503179150569, "accuracy": 0.41015625}, {"mcc": 0.2467214071173486, "accuracy": 0.42431640625}, {"mcc": 0.20901020910700274, "accuracy": 0.39013671875}]}, "total": {"test_mcc": 24.734582048281784, "test_mcc_se": 1.6831800515368451, "test_accuracy": 42.24609375, "test_accuracy_se": 1.2403558216456938}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"accuracy": 0.44189453125, "mcc": 0.27202953702868543}, {"accuracy": 0.42138671875, "mcc": 0.2305214256768402}, {"accuracy": 0.43408203125, "mcc": 0.24945434781360093}, {"accuracy": 0.4443359375, "mcc": 0.2651933502192802}, {"accuracy": 0.43896484375, "mcc": 0.2578026976776768}, {"accuracy": 0.42822265625, "mcc": 0.24761908357957568}, {"accuracy": 0.435546875, "mcc": 0.2544332153490529}, {"accuracy": 0.4306640625, "mcc": 0.24745628339487022}, {"accuracy": 0.4638671875, "mcc": 0.29121884972581125}, {"accuracy": 0.439453125, "mcc": 0.2621536685590586}]}, "total": {"test_accuracy": 43.7841796875, "test_accuracy_se": 0.7078941060513327, "test_mcc": 25.77882459024452, "test_mcc_se": 1.0181657056295055}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "BramVanroy/GEITje-7B-ultra", "results": {"raw": {"test": [{"test_speed": 1553.88, "test_speed_short": 200.31}, {"test_speed": 2631.7200000000003, "test_speed_short": 361.2}, {"test_speed": 3201.44, "test_speed_short": 678.3000000000001}, {"test_speed": 4043.54, "test_speed_short": 834.25}, {"test_speed": 4280.4400000000005, "test_speed_short": 987.28}, {"test_speed": 4314.32, "test_speed_short": 1261.7}, {"test_speed": 4853.76, "test_speed_short": 1406.0200000000002}, {"test_speed": 4981.8, "test_speed_short": 1555.72}, {"test_speed": 5156.2, "test_speed_short": 1705.89}, {"test_speed": 5087.28, "test_speed_short": 1849.1}]}, "total": {"test_speed": 4010.4379999999996, "test_speed_se": 741.5680072988354, "test_speed_short": 1083.977, "test_speed_short_se": 349.9811764725288}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"em": 58.79163439194423, "f1": 68.33662152237036}, {"em": 59.689922480620154, "f1": 69.36648485044991}, {"em": 57.650695517774345, "f1": 68.53144935396904}, {"em": 58.099688473520246, "f1": 67.32636916148188}, {"em": 57.83783783783784, "f1": 69.42464835056597}, {"em": 57.90285273708558, "f1": 67.50791636503115}, {"em": 56.18830675778284, "f1": 68.7920517861717}, {"em": 56.710628394103956, "f1": 66.57471530928639}, {"em": 56.86274509803921, "f1": 67.7030573118785}, {"em": 59.3944099378882, "f1": 69.56000832231524}]}, "total": {"test_em": 57.91287216265967, "test_em_se": 0.708399403419668, "test_f1": 68.31233223335202, "test_f1_se": 0.6264390247463079}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"mcc": 0.0747939648850808, "macro_f1": 0.2450082908125144}, {"mcc": -0.018480306483022086, "macro_f1": 0.2856969640626034}, {"mcc": 0.03990044271564474, "macro_f1": 0.27000711650442477}, {"mcc": 0.11021107218047, "macro_f1": 0.4291369578947824}, {"mcc": 0.025421558611206943, "macro_f1": 0.30395556457343137}, {"mcc": 0.08032187730090651, "macro_f1": 0.38117392713619447}, {"mcc": 0.11008263362653399, "macro_f1": 0.36097749716268573}, {"mcc": 0.14045639226863302, "macro_f1": 0.39793059329930996}, {"mcc": 0.02020963961677602, "macro_f1": 0.29180322124543273}, {"mcc": -0.01992348113449598, "macro_f1": 0.18056013795019366}]}, "total": {"test_mcc": 5.62993793587734, "test_mcc_se": 3.440245274852063, "test_macro_f1": 31.462502706415734, "test_macro_f1_se": 4.752399601965406}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3014311270125224, "micro_f1": 0.27092511013215853}, {"micro_f1_no_misc": 0.39469578783151327, "micro_f1": 0.34905951279679315}, {"micro_f1_no_misc": 0.474437627811861, "micro_f1": 0.3329473074696005}, {"micro_f1_no_misc": 0.35855546001719696, "micro_f1": 0.32219570405727926}, {"micro_f1_no_misc": 0.46950629235237173, "micro_f1": 0.34614202274823236}, {"micro_f1_no_misc": 0.453839516824849, "micro_f1": 0.3594351732991014}, {"micro_f1_no_misc": 0.38955656858682136, "micro_f1": 0.34112444725205304}, {"micro_f1_no_misc": 0.36153846153846153, "micro_f1": 0.3013100436681223}, {"micro_f1_no_misc": 0.43506493506493504, "micro_f1": 0.3128317202622541}, {"micro_f1_no_misc": 0.38680412371134026, "micro_f1": 0.33396704689480355}]}, "total": {"test_micro_f1_no_misc": 40.254299007518725, "test_micro_f1_no_misc_se": 3.4360460315625154, "test_micro_f1": 32.69938088580398, "test_micro_f1_se": 1.6289734373755718}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"mcc": 0.1475355545684518, "macro_f1": 0.48117661731430506}, {"mcc": 0.10489655093150344, "macro_f1": 0.4401746632477065}, {"mcc": 0.15686607298199365, "macro_f1": 0.46939184317763616}, {"mcc": 0.07163449261307771, "macro_f1": 0.3878710528859841}, {"mcc": 0.19410343645760525, "macro_f1": 0.560409368775328}, {"mcc": 0.0848543779429547, "macro_f1": 0.5317055098236717}, {"mcc": 0.12512698250566184, "macro_f1": 0.41732008734391374}, {"mcc": 0.16274804979571708, "macro_f1": 0.5809391893632183}, {"mcc": 0.17621315854431047, "macro_f1": 0.5741712140914885}, {"mcc": 0.14716852053240922, "macro_f1": 0.5364315372603881}]}, "total": {"test_mcc": 13.711471968736848, "test_mcc_se": 2.4648750376130386, "test_macro_f1": 49.795910832836405, "test_macro_f1_se": 4.2477608960967705}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"accuracy": 0.32568359375, "mcc": 0.1186323485930925}, {"accuracy": 0.29443359375, "mcc": 0.061209637565078134}, {"accuracy": 0.32470703125, "mcc": 0.10272231433552964}, {"accuracy": 0.314453125, "mcc": 0.11657385624160505}, {"accuracy": 0.3251953125, "mcc": 0.10562952472511489}, {"accuracy": 0.29248046875, "mcc": 0.08909996509724616}, {"accuracy": 0.27783203125, "mcc": 0.07277238814842102}, {"accuracy": 0.28564453125, "mcc": 0.07451347181229398}, {"accuracy": 0.357421875, "mcc": 0.15908953336588405}, {"accuracy": 0.28759765625, "mcc": 0.07691953540286772}]}, "total": {"test_accuracy": 30.8544921875, "test_accuracy_se": 1.5487914661859814, "test_mcc": 9.771625752871332, "test_mcc_se": 1.8067347870993424}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "Rijgersberg/GEITje-7B-chat", "results": {"raw": {"test": [{"test_speed": 1606.3200000000002, "test_speed_short": 238.04000000000002}, {"test_speed": 2447.9, "test_speed_short": 421.59999999999997}, {"test_speed": 2124.3199999999997, "test_speed_short": 777.8599999999999}, {"test_speed": 2765.68, "test_speed_short": 916.5}, {"test_speed": 3118.8, "test_speed_short": 1072.96}, {"test_speed": 3306.2, "test_speed_short": 1268.3600000000001}, {"test_speed": 3804.64, "test_speed_short": 1464.1200000000001}, {"test_speed": 3667.76, "test_speed_short": 1618.28}, {"test_speed": 3605.28, "test_speed_short": 1762.4499999999998}, {"test_speed": 3490.7400000000002, "test_speed_short": 1920.6000000000001}]}, "total": {"test_speed": 2993.764, "test_speed_se": 456.25411846196096, "test_speed_short": 1146.077, "test_speed_short_se": 349.5128949066091}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"em": 55.38342370255616, "f1": 65.80979301677556}, {"em": 55.968992248062015, "f1": 66.57681656934983}, {"em": 53.168469860896444, "f1": 65.97201122007135}, {"em": 53.504672897196265, "f1": 64.28147287661315}, {"em": 51.58301158301158, "f1": 61.684756711062796}, {"em": 56.74633770239013, "f1": 66.89387216355851}, {"em": 54.97342444950645, "f1": 67.90563208119632}, {"em": 55.314197051978276, "f1": 65.59575654694024}, {"em": 52.94117647058823, "f1": 64.36814670520727}, {"em": 52.63975155279503, "f1": 65.85618845476381}]}, "total": {"test_em": 54.22234575189806, "test_em_se": 1.038439628866657, "test_f1": 65.49444463455387, "test_f1_se": 1.0668746902346904}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"mcc": 0.14227852420242929, "macro_f1": 0.4313136137407068}, {"mcc": 0.09963128950892121, "macro_f1": 0.39508226347707903}, {"mcc": 0.06496252830010472, "macro_f1": 0.3782678053597537}, {"mcc": 0.09021447946875566, "macro_f1": 0.3958452713836807}, {"mcc": 0.1495116366263112, "macro_f1": 0.43874676786156136}, {"mcc": 0.12249370834420827, "macro_f1": 0.41958766785588625}, {"mcc": 0.13201257358844964, "macro_f1": 0.4239245272721643}, {"mcc": 0.10274835701869842, "macro_f1": 0.38346946940514554}, {"mcc": 0.03498323055665479, "macro_f1": 0.35719294155315734}, {"mcc": 0.13521146025990616, "macro_f1": 0.3892030545848188}]}, "total": {"test_mcc": 10.740477878744393, "test_mcc_se": 2.2650452856180125, "test_macro_f1": 40.12633382493954, "test_macro_f1_se": 1.62126280421035}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"bertscore": 0.5973540123377461, "rouge_l": 0.16276164924589992}, {"bertscore": 0.6135887393029407, "rouge_l": 0.17001230707066028}, {"bertscore": 0.6039789856949938, "rouge_l": 0.15722888057735618}, {"bertscore": 0.6314515530830249, "rouge_l": 0.1809975045093589}, {"bertscore": 0.6092557169613428, "rouge_l": 0.15442487156960327}, {"bertscore": 0.6087029981135856, "rouge_l": 0.1699581614380427}, {"bertscore": 0.613555540854577, "rouge_l": 0.16121687433966397}, {"bertscore": 0.5952621073956834, "rouge_l": 0.15093946487729604}, {"bertscore": 0.5937636492890306, "rouge_l": 0.14918130114698802}, {"bertscore": 0.6186223341792356, "rouge_l": 0.1514770095796163}]}, "total": {"test_bertscore": 60.855356372121605, "test_bertscore_se": 0.7211412921080596, "test_rouge_l": 16.081980243544855, "test_rouge_l_se": 0.6378732813902452}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3114241001564945, "micro_f1": 0.26144721233689205}, {"micro_f1_no_misc": 0.44950331125827814, "micro_f1": 0.3389539868533867}, {"micro_f1_no_misc": 0.456630109670987, "micro_f1": 0.3010464180305876}, {"micro_f1_no_misc": 0.40840336134453786, "micro_f1": 0.3427138478684871}, {"micro_f1_no_misc": 0.4606240713224368, "micro_f1": 0.3161309348274845}, {"micro_f1_no_misc": 0.4674329501915709, "micro_f1": 0.3228615863141524}, {"micro_f1_no_misc": 0.39039268013724737, "micro_f1": 0.307857334406007}, {"micro_f1_no_misc": 0.3403590944574551, "micro_f1": 0.273784355179704}, {"micro_f1_no_misc": 0.5198147195059186, "micro_f1": 0.33741753063147967}, {"micro_f1_no_misc": 0.41211025756891095, "micro_f1": 0.3136531365313653}]}, "total": {"test_micro_f1_no_misc": 42.16694655613837, "test_micro_f1_no_misc_se": 3.8790750276749186, "test_micro_f1": 31.158663429795464, "test_micro_f1_se": 1.6790964141027902}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"mcc": 0.22033848003863027, "accuracy": 0.4130859375}, {"mcc": 0.23283719810566642, "accuracy": 0.412109375}, {"mcc": 0.23505565265343661, "accuracy": 0.421875}, {"mcc": 0.23003710863341512, "accuracy": 0.41162109375}, {"mcc": 0.2212344049245831, "accuracy": 0.404296875}, {"mcc": 0.23411661135580963, "accuracy": 0.4130859375}, {"mcc": 0.24811487512627395, "accuracy": 0.423828125}, {"mcc": 0.22767308648209023, "accuracy": 0.4052734375}, {"mcc": 0.21074622266143705, "accuracy": 0.404296875}, {"mcc": 0.2602240638590798, "accuracy": 0.43212890625}]}, "total": {"test_mcc": 23.203777038404223, "test_mcc_se": 0.8758025159884694, "test_accuracy": 41.416015625, "test_accuracy_se": 0.5704491429247908}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"mcc": 0.2475844388879707, "macro_f1": 0.5596170707419353}, {"mcc": 0.0938946837811155, "macro_f1": 0.3870687838597057}, {"mcc": 0.25411063826678504, "macro_f1": 0.5148514608014387}, {"mcc": 0.18442468398809292, "macro_f1": 0.4462604865874259}, {"mcc": 0.2643493358856912, "macro_f1": 0.5535956808235944}, {"mcc": 0.11688339241383923, "macro_f1": 0.4501791942618104}, {"mcc": 0.16040431888841833, "macro_f1": 0.41905928072655413}, {"mcc": 0.222955364412007, "macro_f1": 0.5814220690264704}, {"mcc": 0.16615542411771184, "macro_f1": 0.4774042123034159}, {"mcc": 0.2297603959126537, "macro_f1": 0.569945071623303}]}, "total": {"test_mcc": 19.405226765542853, "test_mcc_se": 3.667669494847196, "test_macro_f1": 49.59403310755654, "test_macro_f1_se": 4.29363749992002}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"accuracy": 0.26953125, "mcc": 0.056319085610799964}, {"accuracy": 0.26708984375, "mcc": 0.04117333461789348}, {"accuracy": 0.28125, "mcc": 0.05741561732816478}, {"accuracy": 0.248046875, "mcc": 0.007332149833257048}, {"accuracy": 0.275390625, "mcc": 0.048280170806987335}, {"accuracy": 0.267578125, "mcc": 0.030772185939849758}, {"accuracy": 0.26953125, "mcc": 0.029716994202131485}, {"accuracy": 0.24755859375, "mcc": 0.021012040812478157}, {"accuracy": 0.248046875, "mcc": 0.022132652785782553}, {"accuracy": 0.2822265625, "mcc": 0.06888829313214286}]}, "total": {"test_accuracy": 26.5625, "test_accuracy_se": 0.8253681735653965, "test_mcc": 3.8304252506948746, "test_mcc_se": 1.201706814745528}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-2700", "results": {"raw": {"test": [{"test_speed": 1282.26, "test_speed_short": 158.72}, {"test_speed": 2153.07, "test_speed_short": 298.05}, {"test_speed": 3097.48, "test_speed_short": 571.59}, {"test_speed": 4007.06, "test_speed_short": 716.76}, {"test_speed": 4882.41, "test_speed_short": 856.13}, {"test_speed": 5628.7699999999995, "test_speed_short": 1139.4299999999998}, {"test_speed": 6378.09, "test_speed_short": 1258.88}, {"test_speed": 6675.900000000001, "test_speed_short": 1358.94}, {"test_speed": 7306.9800000000005, "test_speed_short": 1477.3200000000002}, {"test_speed": 7381.53, "test_speed_short": 1614.1499999999999}]}, "total": {"test_speed": 4879.3550000000005, "test_speed_se": 1346.8392587154506, "test_speed_short": 944.997, "test_speed_short_se": 311.62036179814004}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"mcc": 0.1372903661643976, "macro_f1": 0.42733771474161775}, {"mcc": 0.1498290093367251, "macro_f1": 0.44624055134736196}, {"mcc": 0.13660317571190425, "macro_f1": 0.44248384638553895}, {"mcc": 0.180295693079314, "macro_f1": 0.41317824504700873}, {"mcc": 0.10382377132069759, "macro_f1": 0.41707398521510813}, {"mcc": 0.13440890293581206, "macro_f1": 0.4218324901131569}, {"mcc": 0.17239528111762029, "macro_f1": 0.4538996133801456}, {"mcc": 0.11748944848841955, "macro_f1": 0.42053837926030185}, {"mcc": 0.13505321922530764, "macro_f1": 0.43113311391984954}, {"mcc": 0.10645044795195857, "macro_f1": 0.38872864956171643}]}, "total": {"test_mcc": 13.736393153321567, "test_mcc_se": 1.5619923071853208, "test_macro_f1": 42.62446588971806, "test_macro_f1_se": 1.162813568968262}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"em": 60.26336173508908, "f1": 70.85808993513099}, {"em": 61.007751937984494, "f1": 71.58803142851002}, {"em": 59.27357032457496, "f1": 70.89373541803059}, {"em": 57.78816199376947, "f1": 68.06773979062882}, {"em": 58.61003861003861, "f1": 68.9207278526605}, {"em": 62.451811873554355, "f1": 72.37161006127724}, {"em": 58.46621108580106, "f1": 71.0850501515412}, {"em": 58.80527540729248, "f1": 69.04643075482012}, {"em": 58.27450980392157, "f1": 69.27345006239968}, {"em": 57.7639751552795, "f1": 69.77840047596172}]}, "total": {"test_em": 59.27046679273055, "test_em_se": 0.944129663614347, "test_f1": 70.18832659309608, "test_f1_se": 0.849053297664004}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.22258326563769293, "micro_f1": 0.2121354656632173}, {"micro_f1_no_misc": 0.28442776735459663, "micro_f1": 0.28297055057618437}, {"micro_f1_no_misc": 0.3043884220354809, "micro_f1": 0.2789937928781444}, {"micro_f1_no_misc": 0.2989690721649484, "micro_f1": 0.29027113237639557}, {"micro_f1_no_misc": 0.3578947368421053, "micro_f1": 0.2690200852099818}, {"micro_f1_no_misc": 0.3780332056194125, "micro_f1": 0.33591481122942884}, {"micro_f1_no_misc": 0.394484412470024, "micro_f1": 0.3331094694425789}, {"micro_f1_no_misc": 0.25480203841630733, "micro_f1": 0.24772190347620654}, {"micro_f1_no_misc": 0.35424764019988897, "micro_f1": 0.30802792321116934}, {"micro_f1_no_misc": 0.3780290791599354, "micro_f1": 0.31853417899929526}]}, "total": {"test_micro_f1_no_misc": 32.27859639900392, "test_micro_f1_no_misc_se": 3.6085445933332063, "test_micro_f1": 28.766993130626027, "test_micro_f1_se": 2.4016210464425285}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"bertscore": 0.6763415914174402, "rouge_l": 0.2034085628589033}, {"bertscore": 0.6712864198707393, "rouge_l": 0.1953795401335934}, {"bertscore": 0.6716822235757718, "rouge_l": 0.19883690223081565}, {"bertscore": 0.6748899523809087, "rouge_l": 0.20213328100557462}, {"bertscore": 0.6767000503314193, "rouge_l": 0.19179252560641535}, {"bertscore": 0.6894124801037833, "rouge_l": 0.2228927626993159}, {"bertscore": 0.6710337304248242, "rouge_l": 0.19658421493920988}, {"bertscore": 0.6790860080946004, "rouge_l": 0.20832355592002907}, {"bertscore": 0.683158915839158, "rouge_l": 0.21114986068096714}, {"bertscore": 0.6673220152151771, "rouge_l": 0.19057302289144315}]}, "total": {"test_bertscore": 67.60913387253822, "test_bertscore_se": 0.40446344482273466, "test_rouge_l": 20.21074228966268, "test_rouge_l_se": 0.6128935297464607}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"mcc": 0.10348718783310298, "macro_f1": 0.4158863535410913}, {"mcc": 0.04964096246734264, "macro_f1": 0.40162478617246594}, {"mcc": 0.06334813469985784, "macro_f1": 0.41384432965804857}, {"mcc": 0.10876326298786379, "macro_f1": 0.4518589301197997}, {"mcc": 0.10266018963979184, "macro_f1": 0.5085480471934656}, {"mcc": -0.051658624876024316, "macro_f1": 0.39510774556091777}, {"mcc": 0.05139982158688047, "macro_f1": 0.3619533915884973}, {"mcc": 0.043727842348620115, "macro_f1": 0.37176446577185}, {"mcc": 0.08040243125453195, "macro_f1": 0.38850831411579073}, {"mcc": 0.06721607303217722, "macro_f1": 0.472861180715646}]}, "total": {"test_mcc": 6.189872809741446, "test_mcc_se": 2.881050121088761, "test_macro_f1": 41.81957544437573, "test_macro_f1_se": 2.871501340359755}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"em": 60.41828040278853, "f1": 69.42631703055733}, {"em": 60.85271317829457, "f1": 70.53276959296585}, {"em": 59.041731066460585, "f1": 69.79449970159645}, {"em": 59.50155763239876, "f1": 68.93187788419625}, {"em": 59.22779922779923, "f1": 69.95088613478696}, {"em": 58.519660755589825, "f1": 68.3600437919585}, {"em": 57.4791192103265, "f1": 70.19056635206927}, {"em": 57.48642358417378, "f1": 67.54485477244368}, {"em": 57.411764705882355, "f1": 69.01350898099003}, {"em": 59.47204968944099, "f1": 69.8303573319377}]}, "total": {"test_em": 58.94110994531551, "test_em_se": 0.75330832235958, "test_f1": 69.35756815735019, "test_f1_se": 0.5630334562221877}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"mcc": 0.2544830034012489, "accuracy": 0.4345703125}, {"mcc": 0.25139961308328673, "accuracy": 0.42431640625}, {"mcc": 0.2607570207268902, "accuracy": 0.4375}, {"mcc": 0.24711839214844006, "accuracy": 0.427734375}, {"mcc": 0.2404121996215651, "accuracy": 0.40966796875}, {"mcc": 0.260916679276987, "accuracy": 0.43359375}, {"mcc": 0.27096981478031745, "accuracy": 0.4306640625}, {"mcc": 0.24492624648212227, "accuracy": 0.42919921875}, {"mcc": 0.2461531587525746, "accuracy": 0.4111328125}, {"mcc": 0.22862291813825794, "accuracy": 0.40771484375}]}, "total": {"test_mcc": 25.057590464116902, "test_mcc_se": 0.7423633387104204, "test_accuracy": 42.4609375, "test_accuracy_se": 0.686458234540964}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"accuracy": 0.33154296875, "mcc": 0.11856858045966155}, {"accuracy": 0.298828125, "mcc": 0.06983066100219991}, {"accuracy": 0.326171875, "mcc": 0.10243111056143027}, {"accuracy": 0.28955078125, "mcc": 0.06522567895689584}, {"accuracy": 0.3125, "mcc": 0.09232275288950588}, {"accuracy": 0.30908203125, "mcc": 0.1132983086395471}, {"accuracy": 0.2705078125, "mcc": 0.06787542091826686}, {"accuracy": 0.3330078125, "mcc": 0.10838419121670101}, {"accuracy": 0.35498046875, "mcc": 0.150230967463744}, {"accuracy": 0.27685546875, "mcc": 0.06080878940122765}]}, "total": {"test_accuracy": 31.0302734375, "test_accuracy_se": 1.6638312380565856, "test_mcc": 9.489764615091802, "test_mcc_se": 1.8019419193532287}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "BramVanroy/GEITje-7B-ultra-v2", "results": {"raw": {"test": [{"test_speed": 1564.0, "test_speed_short": 203.28}, {"test_speed": 2633.54, "test_speed_short": 364.20000000000005}, {"test_speed": 3215.04, "test_speed_short": 684.76}, {"test_speed": 4058.0200000000004, "test_speed_short": 837.0699999999999}, {"test_speed": 4284.96, "test_speed_short": 999.04}, {"test_speed": 4330.58, "test_speed_short": 1272.0600000000002}, {"test_speed": 4885.360000000001, "test_speed_short": 1420.1299999999999}, {"test_speed": 4989.02, "test_speed_short": 1568.6000000000001}, {"test_speed": 5148.08, "test_speed_short": 1712.96}, {"test_speed": 5078.26, "test_speed_short": 1865.6000000000001}]}, "total": {"test_speed": 4018.6860000000006, "test_speed_se": 740.621454095178, "test_speed_short": 1092.77, "test_speed_short_se": 352.4533316591158}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"mcc": 0.14859711159511932, "macro_f1": 0.4152906109844556}, {"mcc": 0.06616407056550021, "macro_f1": 0.3870855185279534}, {"mcc": 0.1360161914658445, "macro_f1": 0.41344463494741523}, {"mcc": 0.16014909025531396, "macro_f1": 0.43664629029988616}, {"mcc": 0.10998647846580681, "macro_f1": 0.40285477524764596}, {"mcc": 0.14439313040018592, "macro_f1": 0.4294064871709555}, {"mcc": 0.13797531736617694, "macro_f1": 0.4219570906573565}, {"mcc": 0.11969013460559345, "macro_f1": 0.42072883335935857}, {"mcc": 0.13165232210144645, "macro_f1": 0.411709300358204}, {"mcc": 0.11635964747992436, "macro_f1": 0.43294225214403254}]}, "total": {"test_mcc": 12.70983494300912, "test_mcc_se": 1.6331028906940777, "test_macro_f1": 41.720657936972636, "test_macro_f1_se": 0.9171415435500646}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"bertscore": 0.6590270061860792, "rouge_l": 0.19773095419304182}, {"bertscore": 0.6545469993434381, "rouge_l": 0.18725059058885035}, {"bertscore": 0.6636224667890929, "rouge_l": 0.20489052461373236}, {"bertscore": 0.6525345801346703, "rouge_l": 0.192117856142705}, {"bertscore": 0.661011894204421, "rouge_l": 0.18329141955448158}, {"bertscore": 0.6465463010827079, "rouge_l": 0.2011916165480339}, {"bertscore": 0.6594854092545575, "rouge_l": 0.1854828580973722}, {"bertscore": 0.6580145664192969, "rouge_l": 0.19428318323256885}, {"bertscore": 0.6638431621249765, "rouge_l": 0.20335022724255103}, {"bertscore": 0.6378955039544962, "rouge_l": 0.1697928521976756}]}, "total": {"test_bertscore": 65.56527889493736, "test_bertscore_se": 0.5061396518562076, "test_rouge_l": 19.193820824110126, "test_rouge_l_se": 0.6699576977689399}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.15779092702169625, "micro_f1": 0.23782826792564177}, {"micro_f1_no_misc": 0.24572823433685928, "micro_f1": 0.1848844472204872}, {"micro_f1_no_misc": 0.19981325863678803, "micro_f1": 0.20268096514745307}, {"micro_f1_no_misc": 0.23903312444046554, "micro_f1": 0.21566401816118044}, {"micro_f1_no_misc": 0.09716599190283401, "micro_f1": 0.14756049186830625}, {"micro_f1_no_misc": 0.03991130820399113, "micro_f1": 0.026356589147286825}, {"micro_f1_no_misc": 0.10516605166051661, "micro_f1": 0.11758180809761508}, {"micro_f1_no_misc": 0.2859315589353612, "micro_f1": 0.21139166177334118}, {"micro_f1_no_misc": 0.2275229357798165, "micro_f1": 0.16700889801505817}, {"micro_f1_no_misc": 0.19083255378858746, "micro_f1": 0.18319719953325556}]}, "total": {"test_micro_f1_no_misc": 17.888959447069162, "test_micro_f1_no_misc_se": 4.8170781861523855, "test_micro_f1": 16.941543468896253, "test_micro_f1_se": 3.795695843160492}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"bertscore": 0.6096998074062867, "rouge_l": 0.17051365799875073}, {"bertscore": 0.6230758576857625, "rouge_l": 0.17000590994914166}, {"bertscore": 0.6271028974078945, "rouge_l": 0.17252448980145932}, {"bertscore": 0.6328081720712362, "rouge_l": 0.1796037234623199}, {"bertscore": 0.6114707586384611, "rouge_l": 0.15812190148443497}, {"bertscore": 0.6161704916230519, "rouge_l": 0.17512464601241276}, {"bertscore": 0.6137811582593713, "rouge_l": 0.16117160009951628}, {"bertscore": 0.6073349489743123, "rouge_l": 0.1586194975007031}, {"bertscore": 0.6085726244928082, "rouge_l": 0.15516768718275614}, {"bertscore": 0.6213556662260089, "rouge_l": 0.1583250960588336}]}, "total": {"test_bertscore": 61.71372382785194, "test_bertscore_se": 0.533583612725616, "test_rouge_l": 16.591782095503284, "test_rouge_l_se": 0.5321098846904595}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"mcc": 0.07597630975370497, "macro_f1": 0.5239462166667528}, {"mcc": 0.029274866085765128, "macro_f1": 0.45075450106889026}, {"mcc": -0.004461583455616358, "macro_f1": 0.4702916035919775}, {"mcc": 0.11547922801740093, "macro_f1": 0.5557449618563866}, {"mcc": 0.0920377410606509, "macro_f1": 0.5405401241700174}, {"mcc": -0.0367187965387759, "macro_f1": 0.4148110625364487}, {"mcc": 0.03048722731203961, "macro_f1": 0.3798274141334152}, {"mcc": 0.09727050936670531, "macro_f1": 0.43739090122299706}, {"mcc": 0.08693730876549825, "macro_f1": 0.5261911174646584}, {"mcc": 0.016939547400699254, "macro_f1": 0.4191726873086253}]}, "total": {"test_mcc": 5.032223577680721, "test_mcc_se": 3.1139256555538757, "test_macro_f1": 47.186705900201694, "test_macro_f1_se": 3.7845453087549523}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"mcc": 0.23008704160632634, "accuracy": 0.4228515625}, {"mcc": 0.23271422234396202, "accuracy": 0.416015625}, {"mcc": 0.24559874980759866, "accuracy": 0.4345703125}, {"mcc": 0.22732385524964757, "accuracy": 0.416015625}, {"mcc": 0.22888551835107512, "accuracy": 0.41259765625}, {"mcc": 0.2514659496838467, "accuracy": 0.4306640625}, {"mcc": 0.26291478923646816, "accuracy": 0.43994140625}, {"mcc": 0.24784782136642314, "accuracy": 0.42333984375}, {"mcc": 0.22589509547793452, "accuracy": 0.41796875}, {"mcc": 0.27383785306998704, "accuracy": 0.44921875}]}, "total": {"test_mcc": 24.265708961932692, "test_mcc_se": 1.0243806021407864, "test_accuracy": 42.6318359375, "test_accuracy_se": 0.7415122482192514}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"em": 58.01704105344694, "f1": 67.4869624345754}, {"em": 55.968992248062015, "f1": 67.32547369968461}, {"em": 55.255023183925815, "f1": 67.24259680557134}, {"em": 58.41121495327103, "f1": 67.33367723775378}, {"em": 57.2972972972973, "f1": 67.69726418200534}, {"em": 59.059367771781034, "f1": 68.2949029100065}, {"em": 53.83447228549734, "f1": 67.55098518677882}, {"em": 57.64158262218774, "f1": 66.65284494153795}, {"em": 55.372549019607845, "f1": 66.88333009622188}, {"em": 57.68633540372671, "f1": 68.47398413614772}]}, "total": {"test_em": 56.854387583880374, "test_em_se": 1.0289329994070893, "test_f1": 67.49420216302833, "test_f1_se": 0.3484291514329093}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"mcc": 0.2732839104676013, "accuracy": 0.45458984375}, {"mcc": 0.2655415185110307, "accuracy": 0.44384765625}, {"mcc": 0.2897969532027968, "accuracy": 0.46337890625}, {"mcc": 0.2718397388206594, "accuracy": 0.4521484375}, {"mcc": 0.26758441739103095, "accuracy": 0.4375}, {"mcc": 0.2769952498611348, "accuracy": 0.44921875}, {"mcc": 0.3006709196095718, "accuracy": 0.4619140625}, {"mcc": 0.2729279597350104, "accuracy": 0.45068359375}, {"mcc": 0.2819377096457762, "accuracy": 0.447265625}, {"mcc": 0.2580681214019661, "accuracy": 0.431640625}]}, "total": {"test_mcc": 27.58646498646578, "test_mcc_se": 0.7649722610750741, "test_accuracy": 44.921875, "test_accuracy_se": 0.6129650469784342}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"accuracy": 0.28466796875, "mcc": 0.061890027006239565}, {"accuracy": 0.2734375, "mcc": 0.056718274720610966}, {"accuracy": 0.30517578125, "mcc": 0.09618802980793491}, {"accuracy": 0.26220703125, "mcc": 0.02630554960357324}, {"accuracy": 0.26953125, "mcc": 0.045364672355247215}, {"accuracy": 0.25537109375, "mcc": 0.012142572918302709}, {"accuracy": 0.27587890625, "mcc": 0.05049808623200838}, {"accuracy": 0.29248046875, "mcc": 0.06189977954154437}, {"accuracy": 0.2587890625, "mcc": 0.03138371371955428}, {"accuracy": 0.2890625, "mcc": 0.07568491145308323}]}, "total": {"test_accuracy": 27.666015625, "test_accuracy_se": 0.9961301395858472, "test_mcc": 5.180756173580988, "test_mcc_se": 1.525418040426755}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-3600", "results": {"raw": {"test": [{"test_speed": 1285.81, "test_speed_short": 160.08}, {"test_speed": 2148.84, "test_speed_short": 298.5}, {"test_speed": 3091.15, "test_speed_short": 573.91}, {"test_speed": 3984.58, "test_speed_short": 720.36}, {"test_speed": 4812.21, "test_speed_short": 856.13}, {"test_speed": 5481.42, "test_speed_short": 1122.3300000000002}, {"test_speed": 6176.78, "test_speed_short": 1228.8}, {"test_speed": 6485.16, "test_speed_short": 1324.86}, {"test_speed": 7054.58, "test_speed_short": 1446.12}, {"test_speed": 7115.150000000001, "test_speed_short": 1569.9499999999998}]}, "total": {"test_speed": 4763.568, "test_speed_se": 1283.476718685699, "test_speed_short": 930.1039999999999, "test_speed_short_se": 301.2339472994145}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"mcc": 0.18954387562000655, "macro_f1": 0.4364461305939152}, {"mcc": 0.1281111891222261, "macro_f1": 0.427375389843265}, {"mcc": 0.11859328067826722, "macro_f1": 0.43224502730628506}, {"mcc": 0.16545984260983898, "macro_f1": 0.4158238661503742}, {"mcc": 0.12004715625854946, "macro_f1": 0.4280409240721681}, {"mcc": 0.1498310093454956, "macro_f1": 0.4320959580115942}, {"mcc": 0.17901955281851767, "macro_f1": 0.4596447465207496}, {"mcc": 0.1270078597551107, "macro_f1": 0.4328188734265675}, {"mcc": 0.14149202498600846, "macro_f1": 0.42078488603497227}, {"mcc": 0.09245811307968006, "macro_f1": 0.35199984423001496}]}, "total": {"test_mcc": 14.115639042737008, "test_mcc_se": 1.8637255299676085, "test_macro_f1": 42.37275646189905, "test_macro_f1_se": 1.7192650485641416}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.24521072796934865, "micro_f1": 0.23127463863337713}, {"micro_f1_no_misc": 0.285453716370269, "micro_f1": 0.2639556377079483}, {"micro_f1_no_misc": 0.31414356787491116, "micro_f1": 0.29095163806552254}, {"micro_f1_no_misc": 0.2920550377274745, "micro_f1": 0.2607062359128475}, {"micro_f1_no_misc": 0.20893007582139847, "micro_f1": 0.2106979778212655}, {"micro_f1_no_misc": 0.24844720496894404, "micro_f1": 0.18251928020565553}, {"micro_f1_no_misc": 0.19966996699669967, "micro_f1": 0.2303218301667313}, {"micro_f1_no_misc": 0.28898128898128894, "micro_f1": 0.2508333333333333}, {"micro_f1_no_misc": 0.28334714167357083, "micro_f1": 0.23350253807106602}, {"micro_f1_no_misc": 0.3203592814371257, "micro_f1": 0.2838360402165506}]}, "total": {"test_micro_f1_no_misc": 26.86598009821031, "test_micro_f1_no_misc_se": 2.5709565877904, "test_micro_f1": 24.385991501342975, "test_micro_f1_se": 2.0518534144431313}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"mcc": 0.09797020934312271, "macro_f1": 0.4187354149803412}, {"mcc": 0.08290555128923374, "macro_f1": 0.43085611292204384}, {"mcc": 0.04307390737003227, "macro_f1": 0.36686541878125023}, {"mcc": 0.09480287171906102, "macro_f1": 0.4151274472256299}, {"mcc": 0.07397092365686066, "macro_f1": 0.45760583827049084}, {"mcc": -0.04874203794268628, "macro_f1": 0.37881667812512637}, {"mcc": 0.09501564714238517, "macro_f1": 0.3689791877998461}, {"mcc": 0.07483322603513552, "macro_f1": 0.3758193161178236}, {"mcc": 0.10464919269777243, "macro_f1": 0.39707387622898865}, {"mcc": 0.05991226850193064, "macro_f1": 0.4187645581216019}]}, "total": {"test_mcc": 6.783917598128478, "test_mcc_se": 2.795572225777418, "test_macro_f1": 40.28643848573143, "test_macro_f1_se": 1.8734430379604015}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"accuracy": 0.4033203125, "mcc": 0.2164372342322308}, {"accuracy": 0.326171875, "mcc": 0.13025218534559863}, {"accuracy": 0.38037109375, "mcc": 0.18192062344483031}, {"accuracy": 0.38916015625, "mcc": 0.19831743511184446}, {"accuracy": 0.38525390625, "mcc": 0.1854171735403732}, {"accuracy": 0.36328125, "mcc": 0.1707945668666941}, {"accuracy": 0.34375, "mcc": 0.1862625724382601}, {"accuracy": 0.34130859375, "mcc": 0.14247048615759922}, {"accuracy": 0.4033203125, "mcc": 0.21728592429366914}, {"accuracy": 0.33984375, "mcc": 0.16013634214968187}]}, "total": {"test_accuracy": 36.7578125, "test_accuracy_se": 1.7599894602296222, "test_mcc": 17.892945435807817, "test_mcc_se": 1.7864633568153359}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "Rijgersberg/GEITje-7B-chat-v2", "results": {"raw": {"test": [{"test_speed": 1621.0400000000002, "test_speed_short": 237.92999999999998}, {"test_speed": 2449.7200000000003, "test_speed_short": 421.4}, {"test_speed": 2132.48, "test_speed_short": 777.48}, {"test_speed": 2772.92, "test_speed_short": 914.62}, {"test_speed": 3118.8, "test_speed_short": 1076.88}, {"test_speed": 3311.6200000000003, "test_speed_short": 1268.3600000000001}, {"test_speed": 3785.6800000000003, "test_speed_short": 1461.6299999999999}, {"test_speed": 3660.5400000000004, "test_speed_short": 1614.6000000000001}, {"test_speed": 3580.92, "test_speed_short": 1761.44}, {"test_speed": 3481.72, "test_speed_short": 1917.3}]}, "total": {"test_speed": 2991.5440000000003, "test_speed_se": 449.90569477725757, "test_speed_short": 1145.164, "test_speed_short_se": 348.8825004738857}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"em": 60.10844306738962, "f1": 69.6527076035784}, {"em": 60.07751937984496, "f1": 70.42393450884575}, {"em": 58.11437403400309, "f1": 69.44222099727854}, {"em": 58.25545171339564, "f1": 68.49000622168586}, {"em": 57.99227799227799, "f1": 69.52317073169704}, {"em": 59.59907478797224, "f1": 69.55797969136671}, {"em": 56.71981776765376, "f1": 69.83230827250203}, {"em": 56.9433669511249, "f1": 67.26746274515915}, {"em": 56.470588235294116, "f1": 68.31051990487231}, {"em": 59.70496894409938, "f1": 70.68221815182316}]}, "total": {"test_em": 58.39858828730557, "test_em_se": 0.871892583397773, "test_f1": 69.31825288288088, "test_f1_se": 0.6361579116423033}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"mcc": 0.08843912286512644, "macro_f1": 0.30687504005134625}, {"mcc": 0.05195520670506025, "macro_f1": 0.36096654358738967}, {"mcc": 0.012896811863094555, "macro_f1": 0.29763069033595735}, {"mcc": 0.12051605111749154, "macro_f1": 0.4241528358793339}, {"mcc": 0.07083195816699714, "macro_f1": 0.3665540319819785}, {"mcc": 0.08272022679681354, "macro_f1": 0.38404344664612705}, {"mcc": 0.16084485993060793, "macro_f1": 0.4307121986656625}, {"mcc": 0.11315809192492698, "macro_f1": 0.38169276294617305}, {"mcc": 0.018371969110414665, "macro_f1": 0.3077229258678174}, {"mcc": 0.010151538065133844, "macro_f1": 0.2450229228667503}]}, "total": {"test_mcc": 7.2988583654566686, "test_mcc_se": 3.12742298773194, "test_macro_f1": 35.053733988285366, "test_macro_f1_se": 3.69452621838194}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"bertscore": 0.6143268333544256, "rouge_l": 0.16676887418310687}, {"bertscore": 0.6239423259830801, "rouge_l": 0.16381785092879697}, {"bertscore": 0.6344478130195057, "rouge_l": 0.1672705144773223}, {"bertscore": 0.6389003713848069, "rouge_l": 0.17876318436325728}, {"bertscore": 0.6127806148433592, "rouge_l": 0.1483552976648052}, {"bertscore": 0.6156971960008377, "rouge_l": 0.15914891278464113}, {"bertscore": 0.6217124850663822, "rouge_l": 0.15945154190111377}, {"bertscore": 0.6103478866280057, "rouge_l": 0.15657364184831718}, {"bertscore": 0.6190101564134238, "rouge_l": 0.15522334253325978}, {"bertscore": 0.624296627327567, "rouge_l": 0.15849749404992786}]}, "total": {"test_bertscore": 62.15462310021393, "test_bertscore_se": 0.575509214023403, "test_rouge_l": 16.138706547345482, "test_rouge_l_se": 0.5142317582851833}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3054935238945958, "micro_f1": 0.2571428571428572}, {"micro_f1_no_misc": 0.3876257920238539, "micro_f1": 0.3186514410005438}, {"micro_f1_no_misc": 0.45631552670993514, "micro_f1": 0.3020693361999463}, {"micro_f1_no_misc": 0.3698296836982968, "micro_f1": 0.3189944134078212}, {"micro_f1_no_misc": 0.45633287608596246, "micro_f1": 0.3395362152877183}, {"micro_f1_no_misc": 0.43514297554910897, "micro_f1": 0.33985330073349634}, {"micro_f1_no_misc": 0.36983553951062975, "micro_f1": 0.3137821438890494}, {"micro_f1_no_misc": 0.3833034111310592, "micro_f1": 0.28579264947888094}, {"micro_f1_no_misc": 0.418323249783924, "micro_f1": 0.28292682926829266}, {"micro_f1_no_misc": 0.3689627870150436, "micro_f1": 0.3127379209370425}]}, "total": {"test_micro_f1_no_misc": 39.5116536540241, "test_micro_f1_no_misc_se": 2.9089069208793457, "test_micro_f1": 30.714871073456486, "test_micro_f1_se": 1.6050715621083997}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"mcc": 0.2189629939975879, "accuracy": 0.41552734375}, {"mcc": 0.20943408085225015, "accuracy": 0.4052734375}, {"mcc": 0.22668562637054973, "accuracy": 0.42138671875}, {"mcc": 0.21627397998475073, "accuracy": 0.41259765625}, {"mcc": 0.21745850348658777, "accuracy": 0.4111328125}, {"mcc": 0.2314602257913511, "accuracy": 0.42138671875}, {"mcc": 0.23132395450828694, "accuracy": 0.4208984375}, {"mcc": 0.23403878876123488, "accuracy": 0.42333984375}, {"mcc": 0.21755790209517223, "accuracy": 0.412109375}, {"mcc": 0.23586077388185672, "accuracy": 0.42431640625}]}, "total": {"test_mcc": 22.39056829729628, "test_mcc_se": 0.5617887036883767, "test_accuracy": 41.6796875, "test_accuracy_se": 0.39438771539883016}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"mcc": 0.18912822940106255, "macro_f1": 0.5275617797172214}, {"mcc": 0.10393461561886026, "macro_f1": 0.4772727272727273}, {"mcc": 0.19195323413068818, "macro_f1": 0.503541599909385}, {"mcc": 0.10289442380709371, "macro_f1": 0.4065536802378908}, {"mcc": 0.187449865112707, "macro_f1": 0.5383779617690165}, {"mcc": 0.11728643582893827, "macro_f1": 0.5406321578044346}, {"mcc": 0.14875911901224836, "macro_f1": 0.4752191050402347}, {"mcc": 0.18474812071888708, "macro_f1": 0.5914692169298534}, {"mcc": 0.21620546009966962, "macro_f1": 0.5907301342901998}, {"mcc": 0.20071546900631815, "macro_f1": 0.5806516707222981}]}, "total": {"test_mcc": 16.43074972736473, "test_mcc_se": 2.6313899649986467, "test_macro_f1": 52.32010033693262, "test_macro_f1_se": 3.6656176272345977}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"accuracy": 0.30078125, "mcc": 0.08770687766165769}, {"accuracy": 0.287109375, "mcc": 0.055988422275982525}, {"accuracy": 0.2919921875, "mcc": 0.0703805196436575}, {"accuracy": 0.2490234375, "mcc": 0.007030982662726711}, {"accuracy": 0.29443359375, "mcc": 0.07901391523583898}, {"accuracy": 0.30224609375, "mcc": 0.08299444534247924}, {"accuracy": 0.31103515625, "mcc": 0.08521703965461563}, {"accuracy": 0.31103515625, "mcc": 0.0886814587970904}, {"accuracy": 0.27880859375, "mcc": 0.04856212070509997}, {"accuracy": 0.30029296875, "mcc": 0.08322168313835418}]}, "total": {"test_accuracy": 29.267578125, "test_accuracy_se": 1.135712926634746, "test_mcc": 6.88797465117503, "test_mcc_se": 1.5906496080366948}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-900", "results": {"raw": {"test": [{"test_speed": 1281.55, "test_speed_short": 158.8}, {"test_speed": 2124.87, "test_speed_short": 297.6}, {"test_speed": 3091.15, "test_speed_short": 570.14}, {"test_speed": 3993.01, "test_speed_short": 718.92}, {"test_speed": 4840.29, "test_speed_short": 853.5500000000001}, {"test_speed": 5531.9400000000005, "test_speed_short": 1133.73}, {"test_speed": 6294.62, "test_speed_short": 1241.6}, {"test_speed": 6530.04, "test_speed_short": 1339.77}, {"test_speed": 7180.780000000001, "test_speed_short": 1457.04}, {"test_speed": 7248.34, "test_speed_short": 1592.8999999999999}]}, "total": {"test_speed": 4811.659, "test_speed_se": 1315.3373866487495, "test_speed_short": 936.405, "test_speed_short_se": 306.2933030531545}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"em": 54.76374903175833, "f1": 66.18398622704156}, {"em": 53.02325581395349, "f1": 65.32236631973164}, {"em": 50.54095826893354, "f1": 65.27541630048562}, {"em": 53.11526479750779, "f1": 65.26547605622173}, {"em": 52.277992277992276, "f1": 63.39770249160082}, {"em": 55.66692367000771, "f1": 67.28103179166709}, {"em": 52.923310554290055, "f1": 66.97633555179381}, {"em": 54.538401861908454, "f1": 66.0288934512726}, {"em": 51.05882352941177, "f1": 64.52123405776646}, {"em": 52.40683229813665, "f1": 66.45130390326761}]}, "total": {"test_em": 53.03155121039, "test_em_se": 0.9970115299371407, "test_f1": 65.6703746150849, "test_f1_se": 0.7224738602120578}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"bertscore": 0.6077664309050306, "rouge_l": 0.1685562886941672}, {"bertscore": 0.6224563488503918, "rouge_l": 0.17078342316979522}, {"bertscore": 0.6190725111810025, "rouge_l": 0.16823850733223905}, {"bertscore": 0.632636707086931, "rouge_l": 0.17909001489183018}, {"bertscore": 0.6145369321748149, "rouge_l": 0.15669562105291224}, {"bertscore": 0.6203545656098868, "rouge_l": 0.17791824799542993}, {"bertscore": 0.6169971513591008, "rouge_l": 0.15732238756666317}, {"bertscore": 0.5947135597671149, "rouge_l": 0.14513287028414085}, {"bertscore": 0.5959822648146655, "rouge_l": 0.1411437212565507}, {"bertscore": 0.6231603576161433, "rouge_l": 0.1530256182301642}]}, "total": {"test_bertscore": 61.47676829365082, "test_bertscore_se": 0.7471175888327066, "test_rouge_l": 16.17906700473893, "test_rouge_l_se": 0.8144721522170877}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"mcc": 0.23816141283170747, "accuracy": 0.42626953125}, {"mcc": 0.24385896147030964, "accuracy": 0.41552734375}, {"mcc": 0.24841633689704704, "accuracy": 0.4326171875}, {"mcc": 0.21491828960459314, "accuracy": 0.40283203125}, {"mcc": 0.2246196076385846, "accuracy": 0.4033203125}, {"mcc": 0.2502685685049747, "accuracy": 0.419921875}, {"mcc": 0.2580205305031247, "accuracy": 0.43017578125}, {"mcc": 0.25520702914466925, "accuracy": 0.4189453125}, {"mcc": 0.22740296808313795, "accuracy": 0.416015625}, {"mcc": 0.26247948649764075, "accuracy": 0.4306640625}]}, "total": {"test_mcc": 24.23353191175789, "test_mcc_se": 0.974686518760717, "test_accuracy": 41.962890625, "test_accuracy_se": 0.6612371087775719}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"accuracy": 0.27490234375, "mcc": 0.047693883699039916}, {"accuracy": 0.27001953125, "mcc": 0.05319188682859652}, {"accuracy": 0.283203125, "mcc": 0.06373597194200009}, {"accuracy": 0.2509765625, "mcc": 0.010820134901303001}, {"accuracy": 0.26708984375, "mcc": 0.03775824957091982}, {"accuracy": 0.26171875, "mcc": 0.02483774072987727}, {"accuracy": 0.275390625, "mcc": 0.041677524920052965}, {"accuracy": 0.27734375, "mcc": 0.05773252631840721}, {"accuracy": 0.255859375, "mcc": 0.032408862381724746}, {"accuracy": 0.28564453125, "mcc": 0.07173914740672163}]}, "total": {"test_accuracy": 27.021484375, "test_accuracy_se": 0.7050346727915761, "test_mcc": 4.415959286986432, "test_mcc_se": 1.1488705452421164}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-4500", "results": {"raw": {"test": [{"test_speed": 1326.28, "test_speed_short": 165.6}, {"test_speed": 2264.46, "test_speed_short": 312.9}, {"test_speed": 3291.6, "test_speed_short": 604.0699999999999}, {"test_speed": 4268.389999999999, "test_speed_short": 763.92}, {"test_speed": 5135.13, "test_speed_short": 906.4399999999999}, {"test_speed": 5797.17, "test_speed_short": 1187.31}, {"test_speed": 6594.13, "test_speed_short": 1311.36}, {"test_speed": 6861.030000000001, "test_speed_short": 1411.48}, {"test_speed": 7515.21, "test_speed_short": 1541.2800000000002}, {"test_speed": 7521.7300000000005, "test_speed_short": 1678.75}]}, "total": {"test_speed": 5057.513000000001, "test_speed_se": 1366.4514820982934, "test_speed_short": 988.311, "test_speed_short_se": 322.93723592413954}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"bertscore": 0.6752474271488609, "rouge_l": 0.20154981904077873}, {"bertscore": 0.6711763876082841, "rouge_l": 0.1977603540540637}, {"bertscore": 0.6728898839792237, "rouge_l": 0.20262764430688857}, {"bertscore": 0.6784554057230707, "rouge_l": 0.206636987622372}, {"bertscore": 0.6782961289718514, "rouge_l": 0.1907513221997016}, {"bertscore": 0.6929577430710196, "rouge_l": 0.22360110559901547}, {"bertscore": 0.6719314547372051, "rouge_l": 0.1969283348924722}, {"bertscore": 0.6759496703016339, "rouge_l": 0.20357057374176246}, {"bertscore": 0.6824767799407709, "rouge_l": 0.20902728447866095}, {"bertscore": 0.6714381384808803, "rouge_l": 0.19536006284736163}]}, "total": {"test_bertscore": 67.708190199628, "test_bertscore_se": 0.41319520733132653, "test_rouge_l": 20.278134887830774, "test_rouge_l_se": 0.5653384685491469}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "yhavinga/ul2-large-dutch", "results": {"raw": {"test": [{"bertscore": 0.42951872348203324, "rouge_l": 0.06907700444647805}, {"bertscore": 0.4289258516510017, "rouge_l": 0.06946987162328766}, {"bertscore": 0.4232820498145884, "rouge_l": 0.06638042010421016}, {"bertscore": 0.43233148389845155, "rouge_l": 0.070438594445}, {"bertscore": 0.42800485551560996, "rouge_l": 0.06908888141041157}, {"bertscore": 0.42411723209079355, "rouge_l": 0.07016853764334026}, {"bertscore": 0.43049364126636647, "rouge_l": 0.0710393088714113}, {"bertscore": 0.43089127422717866, "rouge_l": 0.07139898273029366}, {"bertscore": 0.42940187008935027, "rouge_l": 0.06922900083193384}, {"bertscore": 0.4352196870459011, "rouge_l": 0.07270539337785878}]}, "total": {"test_bertscore": 42.92186669081275, "test_bertscore_se": 0.2196947195963208, "test_rouge_l": 6.989959954842254, "test_rouge_l_se": 0.10562064139243339}}, "num_model_parameters": 783150080, "max_sequence_length": 512, "vocabulary_size": 32128, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"mcc": 0.26515714050397754, "accuracy": 0.44091796875}, {"mcc": 0.2525100375698397, "accuracy": 0.4228515625}, {"mcc": 0.26815415869078, "accuracy": 0.44091796875}, {"mcc": 0.25270491218138486, "accuracy": 0.43017578125}, {"mcc": 0.2508004646758814, "accuracy": 0.4169921875}, {"mcc": 0.27381708775579566, "accuracy": 0.44287109375}, {"mcc": 0.2749890494794267, "accuracy": 0.431640625}, {"mcc": 0.2607220206473327, "accuracy": 0.43896484375}, {"mcc": 0.24980843904517439, "accuracy": 0.4130859375}, {"mcc": 0.2417717144260144, "accuracy": 0.4140625}]}, "total": {"test_mcc": 25.90435024975607, "test_mcc_se": 0.6945427964965085, "test_accuracy": 42.9248046875, "test_accuracy_se": 0.7280779429061506}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"mcc": 0.08107342300161269, "macro_f1": 0.31760519096501355}, {"mcc": 0.10062433943794953, "macro_f1": 0.348458264168294}, {"mcc": 0.07931314273369021, "macro_f1": 0.35670675086758913}, {"mcc": 0.08144300017356242, "macro_f1": 0.33884120969855364}, {"mcc": 0.16283114357497178, "macro_f1": 0.3964669492060602}, {"mcc": 0.08801300498146651, "macro_f1": 0.32555723132174236}, {"mcc": 0.17989429106308752, "macro_f1": 0.3729917128299587}, {"mcc": 0.10818079250378404, "macro_f1": 0.34873924484037966}, {"mcc": 0.06701389036154243, "macro_f1": 0.2942970155384459}, {"mcc": 0.07436667472671316, "macro_f1": 0.33880002526887526}]}, "total": {"test_mcc": 10.227537025583802, "test_mcc_se": 2.388534742662227, "test_macro_f1": 34.38463594704913, "test_macro_f1_se": 1.7721747506182612}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"em": 57.6297443841983, "f1": 67.41310259284774}, {"em": 56.27906976744186, "f1": 67.19841359790557}, {"em": 55.87326120556414, "f1": 69.17626365287516}, {"em": 57.00934579439252, "f1": 66.40479064288832}, {"em": 56.21621621621622, "f1": 67.18124933200014}, {"em": 58.36545875096376, "f1": 68.10502813235355}, {"em": 56.112376613515565, "f1": 68.40646729039833}, {"em": 57.331264546159815, "f1": 67.39416467326714}, {"em": 54.509803921568626, "f1": 65.9303067976018}, {"em": 56.36645962732919, "f1": 68.18136534439493}]}, "total": {"test_em": 56.56930008273499, "test_em_se": 0.6633755652389435, "test_f1": 67.53911520565327, "test_f1_se": 0.595977867129097}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.30909846281678444, "micro_f1": 0.22483289447032612}, {"micro_f1_no_misc": 0.372859450726979, "micro_f1": 0.2758178319435536}, {"micro_f1_no_misc": 0.41537781705700394, "micro_f1": 0.26819081116728954}, {"micro_f1_no_misc": 0.3409247757073844, "micro_f1": 0.26187961985216474}, {"micro_f1_no_misc": 0.40952778938570833, "micro_f1": 0.26977829638273043}, {"micro_f1_no_misc": 0.3764705882352941, "micro_f1": 0.2688411168766988}, {"micro_f1_no_misc": 0.3239108746258729, "micro_f1": 0.2477104230266027}, {"micro_f1_no_misc": 0.3687057844361215, "micro_f1": 0.25456977262594743}, {"micro_f1_no_misc": 0.3713956170703575, "micro_f1": 0.2440126525079078}, {"micro_f1_no_misc": 0.3641597697013314, "micro_f1": 0.26266852626685266}]}, "total": {"test_micro_f1_no_misc": 36.52430929762838, "test_micro_f1_no_misc_se": 2.0826541193094106, "test_micro_f1": 25.783019451200744, "test_micro_f1_se": 0.9529463495197862}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"mcc": 0.17503156120036759, "macro_f1": 0.5142103940871678}, {"mcc": 0.17288273178991317, "macro_f1": 0.5427041499330656}, {"mcc": 0.2297150170892174, "macro_f1": 0.5817043353807622}, {"mcc": 0.2271029072794246, "macro_f1": 0.5281904049153935}, {"mcc": 0.22996574345065543, "macro_f1": 0.5846929030659747}, {"mcc": 0.1637894754185942, "macro_f1": 0.5689343146127422}, {"mcc": 0.19187686660867234, "macro_f1": 0.5717599810203909}, {"mcc": 0.16431908118522848, "macro_f1": 0.5718466898954704}, {"mcc": 0.19990998665975052, "macro_f1": 0.5357142857142857}, {"mcc": 0.15279521406471358, "macro_f1": 0.5741389612562442}]}, "total": {"test_mcc": 19.073885847465373, "test_mcc_se": 1.8357611797750326, "test_macro_f1": 55.73896419881497, "test_macro_f1_se": 1.5417188376167066}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"em": 53.29202168861348, "f1": 66.77165009730601}, {"em": 50.93023255813954, "f1": 64.9835982679466}, {"em": 47.75888717156105, "f1": 64.85406220942149}, {"em": 52.258566978193144, "f1": 65.23454736224674}, {"em": 49.96138996138996, "f1": 63.73056411402261}, {"em": 54.66461063993832, "f1": 68.36171771859264}, {"em": 50.26575550493546, "f1": 66.16908167018984}, {"em": 53.06439100077579, "f1": 66.26000486093531}, {"em": 49.88235294117647, "f1": 65.06964615739278}, {"em": 50.0, "f1": 66.14810313284298}]}, "total": {"test_em": 51.20782084447232, "test_em_se": 1.2814859985224214, "test_f1": 65.7582975590897, "test_f1_se": 0.7909149980917256}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"accuracy": 0.3525390625, "mcc": 0.15193799280529732}, {"accuracy": 0.31689453125, "mcc": 0.10675797049381743}, {"accuracy": 0.36767578125, "mcc": 0.1556077097396105}, {"accuracy": 0.314453125, "mcc": 0.11304402703036505}, {"accuracy": 0.32421875, "mcc": 0.10764781788304986}, {"accuracy": 0.3154296875, "mcc": 0.11043034356687041}, {"accuracy": 0.2802734375, "mcc": 0.08009063872179548}, {"accuracy": 0.3583984375, "mcc": 0.1423530839520787}, {"accuracy": 0.384765625, "mcc": 0.19399211663751795}, {"accuracy": 0.29443359375, "mcc": 0.09679374135517856}]}, "total": {"test_accuracy": 33.0908203125, "test_accuracy_se": 2.078570137758041, "test_mcc": 12.586554421855812, "test_mcc_se": 2.1188125968931764}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "BramVanroy/GEITje-7B-ultra-sft", "results": {"raw": {"test": [{"test_speed": 1591.6000000000001, "test_speed_short": 232.32000000000002}, {"test_speed": 2422.42, "test_speed_short": 413.0}, {"test_speed": 2113.44, "test_speed_short": 764.5600000000001}, {"test_speed": 2762.06, "test_speed_short": 900.52}, {"test_speed": 3136.88, "test_speed_short": 1065.12}, {"test_speed": 3327.8799999999997, "test_speed_short": 1255.78}, {"test_speed": 3817.28, "test_speed_short": 1451.6699999999998}, {"test_speed": 3667.76, "test_speed_short": 1545.6000000000001}, {"test_speed": 3564.68, "test_speed_short": 1690.7399999999998}, {"test_speed": 3472.7000000000003, "test_speed_short": 1854.6}]}, "total": {"test_speed": 2987.67, "test_speed_se": 459.22749537431366, "test_speed_short": 1117.391, "test_speed_short_se": 335.7037141288037}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"bertscore": 0.6596271338494262, "rouge_l": 0.19914075494341554}, {"bertscore": 0.6624133917939616, "rouge_l": 0.18938163926062934}, {"bertscore": 0.6690317133034114, "rouge_l": 0.2054643423945404}, {"bertscore": 0.6659272600954864, "rouge_l": 0.20139964291935436}, {"bertscore": 0.6598340878044837, "rouge_l": 0.18336985809308304}, {"bertscore": 0.6652313955419231, "rouge_l": 0.20579588923144726}, {"bertscore": 0.6434349174232921, "rouge_l": 0.16186660684913828}, {"bertscore": 0.6703049515199382, "rouge_l": 0.20581802633977583}, {"bertscore": 0.674042243801523, "rouge_l": 0.2139173846601239}, {"bertscore": 0.646994666865794, "rouge_l": 0.1741434787095743}]}, "total": {"test_bertscore": 66.1684176199924, "test_bertscore_se": 0.6089037103665542, "test_rouge_l": 19.402976234010822, "test_rouge_l_se": 1.0224783331599256}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"bertscore": 0.6914058617549017, "rouge_l": 0.21069935738233359}, {"bertscore": 0.6819326501863543, "rouge_l": 0.20533208809377457}, {"bertscore": 0.687918806070229, "rouge_l": 0.2153081105708059}, {"bertscore": 0.6976850195496809, "rouge_l": 0.22120208104182942}, {"bertscore": 0.6831016197247664, "rouge_l": 0.18590440556078866}, {"bertscore": 0.7072350798116531, "rouge_l": 0.22371164287132458}, {"bertscore": 0.6765348128683399, "rouge_l": 0.19313995044723317}, {"bertscore": 0.6863568927365122, "rouge_l": 0.20742376930543238}, {"bertscore": 0.6981546930037439, "rouge_l": 0.2217768088081195}, {"bertscore": 0.6869847095222212, "rouge_l": 0.2047123139872695}]}, "total": {"test_bertscore": 68.97310145228403, "test_bertscore_se": 0.5637384856178375, "test_rouge_l": 20.892105280689112, "test_rouge_l_se": 0.7703954732901472}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"mcc": 0.26139104077412834, "accuracy": 0.43115234375}, {"mcc": 0.2725054621863196, "accuracy": 0.43310546875}, {"mcc": 0.29051040107872966, "accuracy": 0.458984375}, {"mcc": 0.28077335463235165, "accuracy": 0.44970703125}, {"mcc": 0.25228556577498795, "accuracy": 0.41650390625}, {"mcc": 0.29003925269743647, "accuracy": 0.45263671875}, {"mcc": 0.31235528808418545, "accuracy": 0.46484375}, {"mcc": 0.2799979061567499, "accuracy": 0.45703125}, {"mcc": 0.2751494989438221, "accuracy": 0.4375}, {"mcc": 0.28500598166660407, "accuracy": 0.4423828125}]}, "total": {"test_mcc": 28.00013751995315, "test_mcc_se": 1.028425610148388, "test_accuracy": 44.4384765625, "test_accuracy_se": 0.928364161693487}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"mcc": 0.28682365843724006, "accuracy": 0.46484375}, {"mcc": 0.26325014170487404, "accuracy": 0.4462890625}, {"mcc": 0.27243566212251696, "accuracy": 0.4521484375}, {"mcc": 0.26282742715180574, "accuracy": 0.4443359375}, {"mcc": 0.2352191703179647, "accuracy": 0.423828125}, {"mcc": 0.30349030788703474, "accuracy": 0.47705078125}, {"mcc": 0.29052579079057145, "accuracy": 0.4619140625}, {"mcc": 0.28106321744756724, "accuracy": 0.458984375}, {"mcc": 0.2662113434518852, "accuracy": 0.4453125}, {"mcc": 0.26411207730399966, "accuracy": 0.447265625}]}, "total": {"test_mcc": 27.259587966154598, "test_mcc_se": 1.1764753246260904, "test_accuracy": 45.2197265625, "test_accuracy_se": 0.8985057364825358}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"accuracy": 0.47412109375, "mcc": 0.31452849767426416}, {"accuracy": 0.451171875, "mcc": 0.2663188611033735}, {"accuracy": 0.46630859375, "mcc": 0.288646196639059}, {"accuracy": 0.4677734375, "mcc": 0.2949886741565925}, {"accuracy": 0.46630859375, "mcc": 0.296084311679752}, {"accuracy": 0.4638671875, "mcc": 0.2853214293478178}, {"accuracy": 0.46728515625, "mcc": 0.2927091195950374}, {"accuracy": 0.45458984375, "mcc": 0.2738394538889012}, {"accuracy": 0.49853515625, "mcc": 0.33703877101113877}, {"accuracy": 0.47314453125, "mcc": 0.3026085392368166}]}, "total": {"test_accuracy": 46.8310546875, "test_accuracy_se": 0.7952834354894757, "test_mcc": 29.52083854332753, "test_mcc_se": 1.2425228778235684}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter1", "results": {"raw": {"test": [{"test_speed": 1575.9599999999998, "test_speed_short": 203.72}, {"test_speed": 2664.48, "test_speed_short": 368.0}, {"test_speed": 3244.96, "test_speed_short": 683.24}, {"test_speed": 4101.46, "test_speed_short": 848.8199999999999}, {"test_speed": 4330.16, "test_speed_short": 1002.96}, {"test_speed": 4357.679999999999, "test_speed_short": 1280.9399999999998}, {"test_speed": 4916.96, "test_speed_short": 1433.4099999999999}, {"test_speed": 5025.12, "test_speed_short": 1583.3200000000002}, {"test_speed": 5188.679999999999, "test_speed_short": 1733.16}, {"test_speed": 5114.34, "test_speed_short": 1870.0}]}, "total": {"test_speed": 4051.9800000000005, "test_speed_se": 744.8081655674022, "test_speed_short": 1100.757, "test_speed_short_se": 355.24903001405045}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"mcc": 0.16687312516871028, "macro_f1": 0.4450963671774029}, {"mcc": 0.14104577252896291, "macro_f1": 0.44433732874583143}, {"mcc": 0.1192428365155031, "macro_f1": 0.4337642271490992}, {"mcc": 0.1355696882589698, "macro_f1": 0.38675441190969145}, {"mcc": 0.13880513077254103, "macro_f1": 0.43682068136667745}, {"mcc": 0.12762921783417094, "macro_f1": 0.4166275817442064}, {"mcc": 0.14919220994760066, "macro_f1": 0.4422013582724886}, {"mcc": 0.09970951989843654, "macro_f1": 0.40878167445343405}, {"mcc": 0.15010881947254764, "macro_f1": 0.42786449979107233}, {"mcc": 0.07478425336732483, "macro_f1": 0.35125501563254363}]}, "total": {"test_mcc": 13.029605737647678, "test_mcc_se": 1.6592153823984872, "test_macro_f1": 41.93503146242447, "test_macro_f1_se": 1.8723920427771743}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"accuracy": 0.3408203125, "mcc": 0.1289329772693837}, {"accuracy": 0.2890625, "mcc": 0.07011628021415209}, {"accuracy": 0.30224609375, "mcc": 0.07145463045358026}, {"accuracy": 0.3515625, "mcc": 0.1593493139435821}, {"accuracy": 0.34326171875, "mcc": 0.13469639994549376}, {"accuracy": 0.31591796875, "mcc": 0.11959644137130584}, {"accuracy": 0.29736328125, "mcc": 0.11961664906050626}, {"accuracy": 0.3203125, "mcc": 0.10671175633351461}, {"accuracy": 0.34521484375, "mcc": 0.15559161612056585}, {"accuracy": 0.2744140625, "mcc": 0.06417295403169723}]}, "total": {"test_accuracy": 31.801757812499996, "test_accuracy_se": 1.6588572340716832, "test_mcc": 11.302390187437819, "test_mcc_se": 2.14337539844732}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "Rijgersberg/GEITje-7B", "results": {"raw": {"test": [{"test_speed": 1585.16, "test_speed_short": 235.07000000000002}, {"test_speed": 2387.8399999999997, "test_speed_short": 418.0}, {"test_speed": 2091.6800000000003, "test_speed_short": 767.6}, {"test_speed": 2707.76, "test_speed_short": 899.58}, {"test_speed": 3041.96, "test_speed_short": 1059.52}, {"test_speed": 3214.06, "test_speed_short": 1249.8600000000001}, {"test_speed": 3659.28, "test_speed_short": 1435.9}, {"test_speed": 3537.8, "test_speed_short": 1586.08}, {"test_speed": 3475.36, "test_speed_short": 1735.18}, {"test_speed": 3373.48, "test_speed_short": 1883.2}]}, "total": {"test_speed": 2907.438, "test_speed_se": 430.31808884499196, "test_speed_short": 1126.9990000000003, "test_speed_short_se": 342.4022378871854}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.23174464363795363, "micro_f1": 0.20810024614007605}, {"micro_f1_no_misc": 0.26921634795650545, "micro_f1": 0.2576470588235294}, {"micro_f1_no_misc": 0.3484626647144949, "micro_f1": 0.3213030980517406}, {"micro_f1_no_misc": 0.29588431590656283, "micro_f1": 0.292945270484024}, {"micro_f1_no_misc": 0.3616734143049932, "micro_f1": 0.2667423382519864}, {"micro_f1_no_misc": 0.3773161145423919, "micro_f1": 0.3417879417879418}, {"micro_f1_no_misc": 0.40763546798029554, "micro_f1": 0.3092406221408966}, {"micro_f1_no_misc": 0.27071369975389664, "micro_f1": 0.2447457627118644}, {"micro_f1_no_misc": 0.35512094698919194, "micro_f1": 0.3149847094801223}, {"micro_f1_no_misc": 0.33928571428571425, "micro_f1": 0.2943777637397347}]}, "total": {"test_micro_f1_no_misc": 32.57053330072001, "test_micro_f1_no_misc_se": 3.4676255692751727, "test_micro_f1": 28.51874811611916, "test_micro_f1_se": 2.5130579809417783}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"mcc": 0.10164819614436256, "macro_f1": 0.37397300053779015}, {"mcc": 0.0906570758257912, "macro_f1": 0.3668355773438854}, {"mcc": 0.05885923942113474, "macro_f1": 0.3755545157370039}, {"mcc": 0.0964464128259356, "macro_f1": 0.38463925588278275}, {"mcc": 0.16505952336465735, "macro_f1": 0.43601291895129773}, {"mcc": 0.1161478875154707, "macro_f1": 0.38817560835430226}, {"mcc": 0.19353115467677462, "macro_f1": 0.4230615939860452}, {"mcc": 0.11127220081473961, "macro_f1": 0.37593314125039073}, {"mcc": 0.06576582758184528, "macro_f1": 0.3453127075674229}, {"mcc": 0.122479157989883, "macro_f1": 0.39640164488183766}]}, "total": {"test_mcc": 11.218666761605947, "test_mcc_se": 2.5601977386551953, "test_macro_f1": 38.65899964492759, "test_macro_f1_se": 1.6481596023668572}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"mcc": 0.06756974353939382, "macro_f1": 0.39029176706754104}, {"mcc": 0.09125029972164607, "macro_f1": 0.4423326440534776}, {"mcc": 0.08153978999460075, "macro_f1": 0.3900966183574879}, {"mcc": 0.1107819342414481, "macro_f1": 0.42915251145716954}, {"mcc": 0.05906270428885315, "macro_f1": 0.46297461715963917}, {"mcc": -0.028444827093079373, "macro_f1": 0.3987379143589456}, {"mcc": 0.11097252134793924, "macro_f1": 0.4091652714971049}, {"mcc": 0.035325284034294475, "macro_f1": 0.35629420368255493}, {"mcc": 0.152377928667706, "macro_f1": 0.42812085367343045}, {"mcc": 0.100954135216375, "macro_f1": 0.5050236508648966}]}, "total": {"test_mcc": 7.8138951395917715, "test_mcc_se": 3.066337886475756, "test_macro_f1": 42.12190052172248, "test_macro_f1_se": 2.6226374170001665}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3458522492777549, "micro_f1": 0.23090586145648315}, {"micro_f1_no_misc": 0.3835354179961711, "micro_f1": 0.2807308970099668}, {"micro_f1_no_misc": 0.4477234401349072, "micro_f1": 0.2670078908082747}, {"micro_f1_no_misc": 0.37352415026833635, "micro_f1": 0.2811418685121107}, {"micro_f1_no_misc": 0.45911413969335607, "micro_f1": 0.2805674397658185}, {"micro_f1_no_misc": 0.374605401613469, "micro_f1": 0.271518544436669}, {"micro_f1_no_misc": 0.3615591397849462, "micro_f1": 0.2572387746537978}, {"micro_f1_no_misc": 0.37809330628803245, "micro_f1": 0.2535760728218466}, {"micro_f1_no_misc": 0.37944358578052545, "micro_f1": 0.24016190690353048}, {"micro_f1_no_misc": 0.3447341686420589, "micro_f1": 0.2527659574468086}]}, "total": {"test_micro_f1_no_misc": 38.481849994795574, "test_micro_f1_no_misc_se": 2.39610636154942, "test_micro_f1": 26.156152138153065, "test_micro_f1_se": 1.0929260251850603}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"em": 59.95352439969016, "f1": 69.73240192291823}, {"em": 60.310077519379846, "f1": 70.44300161660415}, {"em": 58.11437403400309, "f1": 69.67183802854262}, {"em": 58.25545171339564, "f1": 68.17819694165738}, {"em": 59.07335907335907, "f1": 70.70881801810278}, {"em": 58.905165767154976, "f1": 69.15487750866956}, {"em": 57.55504935459378, "f1": 70.2495475186726}, {"em": 57.253685027152834, "f1": 67.52189068686477}, {"em": 56.15686274509804, "f1": 68.446636291972}, {"em": 60.32608695652174, "f1": 70.88326189804101}]}, "total": {"test_em": 58.590363659034914, "test_em_se": 0.8588286614223051, "test_f1": 69.49904704320451, "test_f1_se": 0.7096421828721051}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"mcc": 0.12724948905716982, "macro_f1": 0.4385478114256936}, {"mcc": 0.11771658338170034, "macro_f1": 0.4609064330423076}, {"mcc": 0.21003084980389325, "macro_f1": 0.510522576060978}, {"mcc": 0.19306833758935418, "macro_f1": 0.4621944721350429}, {"mcc": 0.2555943579934756, "macro_f1": 0.5267731987664704}, {"mcc": 0.21331281452836906, "macro_f1": 0.5485516272032176}, {"mcc": 0.21771426678119465, "macro_f1": 0.5042632020945512}, {"mcc": 0.2030062943444798, "macro_f1": 0.5824555341090285}, {"mcc": 0.22076607193795958, "macro_f1": 0.530391544262773}, {"mcc": 0.18027361382824791, "macro_f1": 0.5275550593996602}]}, "total": {"test_mcc": 19.387326792458442, "test_mcc_se": 2.6334805610671266, "test_macro_f1": 50.92161458499722, "test_macro_f1_se": 2.7385010061625654}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"em": 54.6862896979086, "f1": 66.46370554034101}, {"em": 54.10852713178294, "f1": 66.12824305197759}, {"em": 52.85935085007728, "f1": 67.73069110093775}, {"em": 54.04984423676012, "f1": 65.86601988221231}, {"em": 54.9034749034749, "f1": 66.73479978521982}, {"em": 56.90053970701619, "f1": 68.33514377306784}, {"em": 54.593773728170085, "f1": 69.10973719428506}, {"em": 55.00387897595035, "f1": 66.25483785112645}, {"em": 53.333333333333336, "f1": 66.49025788621479}, {"em": 53.649068322981364, "f1": 67.82285809073792}]}, "total": {"test_em": 54.408808088745516, "test_em_se": 0.6936551504185836, "test_f1": 67.09362941561204, "test_f1_se": 0.6718813995328693}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"bertscore": 0.6146155541355256, "rouge_l": 0.17071251306370083}, {"bertscore": 0.6262586458324222, "rouge_l": 0.17003547743926434}, {"bertscore": 0.6192706694564549, "rouge_l": 0.16526649797970677}, {"bertscore": 0.6340971038880525, "rouge_l": 0.17917147987104587}, {"bertscore": 0.6149103429197567, "rouge_l": 0.15426182443423608}, {"bertscore": 0.6195160402567126, "rouge_l": 0.17373660909959043}, {"bertscore": 0.6101879180641845, "rouge_l": 0.15236130880812476}, {"bertscore": 0.5946033373911632, "rouge_l": 0.14536979630730495}, {"bertscore": 0.5893511534522986, "rouge_l": 0.13812555795404552}, {"bertscore": 0.6268100457236869, "rouge_l": 0.15881477989205084}]}, "total": {"test_bertscore": 61.49620811120258, "test_bertscore_se": 0.8684055120544817, "test_rouge_l": 16.078558448490703, "test_rouge_l_se": 0.8208343519279312}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"mcc": 0.24521203714737572, "accuracy": 0.43212890625}, {"mcc": 0.24160970410229168, "accuracy": 0.416015625}, {"mcc": 0.2557786466553418, "accuracy": 0.439453125}, {"mcc": 0.24360304115164905, "accuracy": 0.42529296875}, {"mcc": 0.2415051539923984, "accuracy": 0.416015625}, {"mcc": 0.25450394227642587, "accuracy": 0.4267578125}, {"mcc": 0.2678013173841738, "accuracy": 0.43994140625}, {"mcc": 0.2642355918785504, "accuracy": 0.4287109375}, {"mcc": 0.2459465276297679, "accuracy": 0.43017578125}, {"mcc": 0.2588090217494369, "accuracy": 0.43408203125}]}, "total": {"test_mcc": 25.19004983967411, "test_mcc_se": 0.5979099268103715, "test_accuracy": 42.8857421875, "test_accuracy_se": 0.5151894771643861}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"accuracy": 0.27392578125, "mcc": 0.04551176841335784}, {"accuracy": 0.2587890625, "mcc": 0.036592975983670624}, {"accuracy": 0.29833984375, "mcc": 0.08824867954277711}, {"accuracy": 0.25146484375, "mcc": 0.011705259753881956}, {"accuracy": 0.25830078125, "mcc": 0.025246963348721355}, {"accuracy": 0.248046875, "mcc": 0.0037337413113448987}, {"accuracy": 0.26904296875, "mcc": 0.02948750454755639}, {"accuracy": 0.27294921875, "mcc": 0.0454223639655541}, {"accuracy": 0.24853515625, "mcc": 0.020950314667270854}, {"accuracy": 0.26904296875, "mcc": 0.04731221710426758}]}, "total": {"test_accuracy": 26.484375, "test_accuracy_se": 0.9494727562237885, "test_mcc": 3.5421178863840277, "test_mcc_se": 1.4681878442602694}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-5400", "results": {"raw": {"test": [{"test_speed": 1321.31, "test_speed_short": 163.44}, {"test_speed": 2232.03, "test_speed_short": 306.15}, {"test_speed": 3274.72, "test_speed_short": 586.9599999999999}, {"test_speed": 4268.389999999999, "test_speed_short": 748.8000000000001}, {"test_speed": 5163.21, "test_speed_short": 890.5300000000001}, {"test_speed": 5788.75, "test_speed_short": 1170.78}, {"test_speed": 6569.580000000001, "test_speed_short": 1297.92}, {"test_speed": 6889.08, "test_speed_short": 1395.1499999999999}, {"test_speed": 7540.45, "test_speed_short": 1521.7800000000002}, {"test_speed": 7535.75, "test_speed_short": 1653.25}]}, "total": {"test_speed": 5058.326999999999, "test_speed_se": 1374.1313980136224, "test_speed_short": 973.476, "test_speed_short_se": 319.5974168294172}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"bertscore": 0.6834496051305905, "rouge_l": 0.21444480562086082}, {"bertscore": 0.6743342054542154, "rouge_l": 0.19994823946572787}, {"bertscore": 0.6817006946366746, "rouge_l": 0.21233010926797918}, {"bertscore": 0.6884179233165924, "rouge_l": 0.2165617500757095}, {"bertscore": 0.680984252190683, "rouge_l": 0.19059668864336976}, {"bertscore": 0.6984751700656489, "rouge_l": 0.21782267758317977}, {"bertscore": 0.6731714438501513, "rouge_l": 0.195885199065355}, {"bertscore": 0.6825971499056323, "rouge_l": 0.20985606032163637}, {"bertscore": 0.6896058777347207, "rouge_l": 0.21828421178985719}, {"bertscore": 0.6788547940959688, "rouge_l": 0.2004427656244498}]}, "total": {"test_bertscore": 68.31591116380878, "test_bertscore_se": 0.46529735234307557, "test_rouge_l": 20.76172507458125, "test_rouge_l_se": 0.6230365666657962}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"mcc": 0.2827151599223707, "accuracy": 0.4619140625}, {"mcc": 0.26906074859026896, "accuracy": 0.4482421875}, {"mcc": 0.27266050758861693, "accuracy": 0.45361328125}, {"mcc": 0.27795478373717797, "accuracy": 0.45556640625}, {"mcc": 0.2555459680427083, "accuracy": 0.43505859375}, {"mcc": 0.27935758131080674, "accuracy": 0.45654296875}, {"mcc": 0.30104113465739635, "accuracy": 0.4658203125}, {"mcc": 0.2820817890838234, "accuracy": 0.4580078125}, {"mcc": 0.29347451705665917, "accuracy": 0.46240234375}, {"mcc": 0.25947677217969245, "accuracy": 0.4375}]}, "total": {"test_mcc": 27.733689621695213, "test_mcc_se": 0.8673378875621289, "test_accuracy": 45.3466796875, "test_accuracy_se": 0.6399399847130661}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"accuracy": 0.43310546875, "mcc": 0.2686048424201564}, {"accuracy": 0.4326171875, "mcc": 0.24461722387496374}, {"accuracy": 0.42431640625, "mcc": 0.24249669276578734}, {"accuracy": 0.44677734375, "mcc": 0.26972769476115854}, {"accuracy": 0.43115234375, "mcc": 0.25472528591600935}, {"accuracy": 0.4130859375, "mcc": 0.22747739046606297}, {"accuracy": 0.44189453125, "mcc": 0.2643175167085566}, {"accuracy": 0.421875, "mcc": 0.2407268478109315}, {"accuracy": 0.453125, "mcc": 0.2846458168817161}, {"accuracy": 0.431640625, "mcc": 0.2522801307783558}]}, "total": {"test_accuracy": 43.2958984375, "test_accuracy_se": 0.7377421956629265, "test_mcc": 25.496194423836982, "test_mcc_se": 1.053569154082634}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter0", "results": {"raw": {"test": [{"test_speed": 1603.56, "test_speed_short": 235.17999999999998}, {"test_speed": 2422.42, "test_speed_short": 416.2}, {"test_speed": 2121.6, "test_speed_short": 769.8800000000001}, {"test_speed": 2762.06, "test_speed_short": 907.1}, {"test_speed": 3100.7200000000003, "test_speed_short": 1066.24}, {"test_speed": 3273.68, "test_speed_short": 1260.96}, {"test_speed": 3741.44, "test_speed_short": 1453.3300000000002}, {"test_speed": 3610.0, "test_speed_short": 1606.3200000000002}, {"test_speed": 3540.32, "test_speed_short": 1753.36}, {"test_speed": 3436.62, "test_speed_short": 1897.5}]}, "total": {"test_speed": 2961.2419999999997, "test_speed_se": 442.4511469480124, "test_speed_short": 1136.6070000000002, "test_speed_short_se": 346.9083359220456}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"mcc": 0.11226164428893577, "macro_f1": 0.4005256231698704}, {"mcc": 0.12280224472485023, "macro_f1": 0.4191541618391886}, {"mcc": 0.10717003004031968, "macro_f1": 0.42790443850137416}, {"mcc": 0.11424973890467448, "macro_f1": 0.35656824108641677}, {"mcc": 0.14743457595850573, "macro_f1": 0.4381381499094897}, {"mcc": 0.14379933764352865, "macro_f1": 0.4053871387237719}, {"mcc": 0.17212167639277529, "macro_f1": 0.4434664564125639}, {"mcc": 0.13268204357459237, "macro_f1": 0.40795372423997595}, {"mcc": 0.14724868723399856, "macro_f1": 0.40398117636923603}, {"mcc": 0.09094506695381811, "macro_f1": 0.3371219686152746}]}, "total": {"test_mcc": 12.907150457159988, "test_mcc_se": 1.4935111837355304, "test_macro_f1": 40.402010788671625, "test_macro_f1_se": 2.0943539105892293}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.24967263203841114, "micro_f1": 0.21370967741935487}, {"micro_f1_no_misc": 0.27734095782701934, "micro_f1": 0.264933628318584}, {"micro_f1_no_misc": 0.35016211208893006, "micro_f1": 0.2827442827442827}, {"micro_f1_no_misc": 0.2774126599792459, "micro_f1": 0.27792592592592597}, {"micro_f1_no_misc": 0.4161922833649589, "micro_f1": 0.2839540744889387}, {"micro_f1_no_misc": 0.3871935967983992, "micro_f1": 0.3490388239728609}, {"micro_f1_no_misc": 0.42434394193188163, "micro_f1": 0.32201405152224827}, {"micro_f1_no_misc": 0.2674865488086088, "micro_f1": 0.24341682723185612}, {"micro_f1_no_misc": 0.36039603960396044, "micro_f1": 0.31065918653576435}, {"micro_f1_no_misc": 0.30462519936204147, "micro_f1": 0.26747544771808207}]}, "total": {"test_micro_f1_no_misc": 33.14825971803457, "test_micro_f1_no_misc_se": 3.998279731112401, "test_micro_f1": 28.158719258778987, "test_micro_f1_se": 2.4085234656978183}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "yhavinga/gpt-neo-1.3B-dutch", "results": {"raw": {"test": [{"mcc": 0.026588749590855817, "macro_f1": 0.31547089947089946}, {"mcc": 0.03880012208677462, "macro_f1": 0.23815687434422986}, {"mcc": 0.0018391772781599417, "macro_f1": 0.2082043666842018}, {"mcc": -0.006657462666447971, "macro_f1": 0.20958916432479083}, {"mcc": 0.030010511469109845, "macro_f1": 0.22773791849576672}, {"mcc": -0.03424757410115613, "macro_f1": 0.2533553523829576}, {"mcc": 0.00723066672848631, "macro_f1": 0.23408693951795143}, {"mcc": 0.007914594906118946, "macro_f1": 0.20453672112528057}, {"mcc": -0.00023294442563354017, "macro_f1": 0.20874092552647264}, {"mcc": 0.001996880106249656, "macro_f1": 0.3123744169232377}]}, "total": {"test_mcc": 0.7324272097251749, "test_mcc_se": 1.2928999719689895, "test_macro_f1": 24.122535787957887, "test_macro_f1_se": 2.568567221542315}}, "num_model_parameters": 711596032, "max_sequence_length": 1024, "vocabulary_size": 50257, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"mcc": 0.11143377058034988, "macro_f1": 0.40938750395852075}, {"mcc": 0.0700303882570001, "macro_f1": 0.39649245975843517}, {"mcc": 0.03841761490078014, "macro_f1": 0.3649303582455063}, {"mcc": 0.09260204365985088, "macro_f1": 0.4080778710369879}, {"mcc": 0.07799752132095317, "macro_f1": 0.4531224776867664}, {"mcc": -0.021618270144939816, "macro_f1": 0.4138417444006618}, {"mcc": 0.09543558336873606, "macro_f1": 0.3926112460391815}, {"mcc": 0.07943998707974706, "macro_f1": 0.3778313472770139}, {"mcc": 0.13936070603083273, "macro_f1": 0.42874396135265697}, {"mcc": 0.07635654717538501, "macro_f1": 0.47612625918838275}]}, "total": {"test_mcc": 7.594558922286951, "test_mcc_se": 2.6871641017269376, "test_macro_f1": 41.21165228944113, "test_macro_f1_se": 2.077200934891809}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"em": 60.573199070487995, "f1": 70.04244464923653}, {"em": 60.3875968992248, "f1": 70.45968341987967}, {"em": 58.57805255023184, "f1": 69.46837332728268}, {"em": 58.48909657320872, "f1": 68.13457685508787}, {"em": 58.76447876447877, "f1": 70.07647952450162}, {"em": 59.29067077872012, "f1": 69.57286848643652}, {"em": 56.79574791192103, "f1": 69.95902840008708}, {"em": 57.64158262218774, "f1": 67.86453898921978}, {"em": 56.23529411764706, "f1": 68.2781391315244}, {"em": 59.93788819875776, "f1": 70.57823824674406}]}, "total": {"test_em": 58.66936074868657, "test_em_se": 0.9032421815337621, "test_f1": 69.44343710300002, "test_f1_se": 0.6172961288506454}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"bertscore": 0.6063149212859571, "rouge_l": 0.16494740462249852}, {"bertscore": 0.6233181888965191, "rouge_l": 0.16929894319375272}, {"bertscore": 0.6220384876942262, "rouge_l": 0.16736182247860726}, {"bertscore": 0.6350570787690231, "rouge_l": 0.18257665821587188}, {"bertscore": 0.6123150615603663, "rouge_l": 0.15507340915161427}, {"bertscore": 0.6153232628566911, "rouge_l": 0.17123877405928456}, {"bertscore": 0.609564057434909, "rouge_l": 0.15375467672351611}, {"bertscore": 0.5895975337989512, "rouge_l": 0.14263681338229217}, {"bertscore": 0.5888463330775267, "rouge_l": 0.14288805812953825}, {"bertscore": 0.6220449115789961, "rouge_l": 0.15589638481830498}]}, "total": {"test_bertscore": 61.24419836953165, "test_bertscore_se": 0.9115834134295108, "test_rouge_l": 16.056729447752808, "test_rouge_l_se": 0.7927587472729816}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"mcc": 0.2406924970424992, "accuracy": 0.4306640625}, {"mcc": 0.23868424588757467, "accuracy": 0.41943359375}, {"mcc": 0.2523094485774121, "accuracy": 0.4384765625}, {"mcc": 0.24966249849638697, "accuracy": 0.43212890625}, {"mcc": 0.23229927965454297, "accuracy": 0.41357421875}, {"mcc": 0.24644798204339508, "accuracy": 0.4228515625}, {"mcc": 0.26230822036024437, "accuracy": 0.43994140625}, {"mcc": 0.2561679192756392, "accuracy": 0.4267578125}, {"mcc": 0.23227455095868413, "accuracy": 0.42236328125}, {"mcc": 0.260269960940067, "accuracy": 0.4375}]}, "total": {"test_mcc": 24.71116603236446, "test_mcc_se": 0.6763908574776579, "test_accuracy": 42.8369140625, "test_accuracy_se": 0.5492262551410018}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"accuracy": 0.2841796875, "mcc": 0.059102102741612345}, {"accuracy": 0.265625, "mcc": 0.04884263065285848}, {"accuracy": 0.298828125, "mcc": 0.09149513231463403}, {"accuracy": 0.2509765625, "mcc": 0.010816015611646428}, {"accuracy": 0.26953125, "mcc": 0.04391222580639544}, {"accuracy": 0.25439453125, "mcc": 0.014532671461339896}, {"accuracy": 0.2802734375, "mcc": 0.049760189743756726}, {"accuracy": 0.2783203125, "mcc": 0.05431615595588067}, {"accuracy": 0.2509765625, "mcc": 0.026396389668665324}, {"accuracy": 0.27734375, "mcc": 0.06051381920154018}]}, "total": {"test_accuracy": 27.1044921875, "test_accuracy_se": 0.9770732366694984, "test_mcc": 4.5968733315832955, "test_mcc_se": 1.4841250372824213}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-ckpt-6300", "results": {"raw": {"test": [{"test_speed": 1681.99, "test_speed_short": 214.0}, {"test_speed": 2739.63, "test_speed_short": 403.2}, {"test_speed": 3977.3500000000004, "test_speed_short": 775.4599999999999}, {"test_speed": 3908.71, "test_speed_short": 938.16}, {"test_speed": 4914.0, "test_speed_short": 1114.13}, {"test_speed": 5557.2, "test_speed_short": 1454.6399999999999}, {"test_speed": 6383.0, "test_speed_short": 1633.92}, {"test_speed": 6232.71, "test_speed_short": 1716.78}, {"test_speed": 6316.3099999999995, "test_speed_short": 1899.3000000000002}, {"test_speed": 6498.2699999999995, "test_speed_short": 2063.8}]}, "total": {"test_speed": 4820.9169999999995, "test_speed_se": 1048.63761927231, "test_speed_short": 1221.339, "test_speed_short_se": 392.76481679171235}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"mcc": -0.003058957655177578, "macro_f1": 0.20694356077227413}, {"mcc": 0.024604516739242813, "macro_f1": 0.297622091319294}, {"mcc": 0.1143698331616566, "macro_f1": 0.334878240970143}, {"mcc": 0.07106127888292284, "macro_f1": 0.343744826472602}, {"mcc": 0.04989235376862756, "macro_f1": 0.36447396808606514}, {"mcc": 0.010052380593821573, "macro_f1": 0.3122041684164684}, {"mcc": 0.04791800589664631, "macro_f1": 0.17170394089857724}, {"mcc": 0.08837753900581118, "macro_f1": 0.3543952606409879}, {"mcc": 0.09834986991978441, "macro_f1": 0.25882930745896054}, {"mcc": 0.0013746012170633016, "macro_f1": 0.28541706646580134}]}, "total": {"test_mcc": 5.02941421530399, "test_mcc_se": 2.5987153302977504, "test_macro_f1": 29.302124315011735, "test_macro_f1_se": 3.970873345828048}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"mcc": 0.09402597647778105, "macro_f1": 0.2832090036417387}, {"mcc": 0.07837340133727555, "macro_f1": 0.32010984284673866}, {"mcc": 0.11244187719044256, "macro_f1": 0.29983843849302033}, {"mcc": 0.11953977787650856, "macro_f1": 0.2819994108476815}, {"mcc": 0.12804289387462042, "macro_f1": 0.33589220323654884}, {"mcc": 0.06474168399449758, "macro_f1": 0.2323323908974787}, {"mcc": 0.14463436203482788, "macro_f1": 0.31090505889026326}, {"mcc": 0.07174043775197718, "macro_f1": 0.3049930529952344}, {"mcc": 0.06781941466551539, "macro_f1": 0.27716799021861904}, {"mcc": 0.09928865915345371, "macro_f1": 0.2768113160728808}]}, "total": {"test_mcc": 9.806484843568999, "test_mcc_se": 1.7110638841019383, "test_macro_f1": 29.232587081402038, "test_macro_f1_se": 1.7882116757011068}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.35809312638580937, "micro_f1": 0.2876382876382877}, {"micro_f1_no_misc": 0.34285714285714286, "micro_f1": 0.2826729745712596}, {"micro_f1_no_misc": 0.35927505330490406, "micro_f1": 0.3243410852713178}, {"micro_f1_no_misc": 0.3086369218174376, "micro_f1": 0.30171277997364954}, {"micro_f1_no_misc": 0.3673938002296211, "micro_f1": 0.3238815374921235}, {"micro_f1_no_misc": 0.37834474175482263, "micro_f1": 0.32945091514143093}, {"micro_f1_no_misc": 0.37624584717607973, "micro_f1": 0.3131313131313131}, {"micro_f1_no_misc": 0.3445026178010472, "micro_f1": 0.3557257222415593}, {"micro_f1_no_misc": 0.398635477582846, "micro_f1": 0.35947942314456566}, {"micro_f1_no_misc": 0.2737430167597765, "micro_f1": 0.27367055771725035}]}, "total": {"test_micro_f1_no_misc": 35.07727745669487, "test_micro_f1_no_misc_se": 2.2549986872917676, "test_micro_f1": 31.517045963227574, "test_micro_f1_se": 1.8121662943178887}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"mcc": 0.0, "macro_f1": 0.32962356792144026}, {"mcc": -0.008196249705001029, "macro_f1": 0.3924050632911392}, {"mcc": -0.0007538554744919827, "macro_f1": 0.35231849737891396}, {"mcc": -0.03752375506647475, "macro_f1": 0.3372168284789644}, {"mcc": 0.03062553325831971, "macro_f1": 0.4531550566302496}, {"mcc": -0.008861113586081269, "macro_f1": 0.4951152615175687}, {"mcc": 0.0026225643276197763, "macro_f1": 0.4593738450979357}, {"mcc": 0.009363316290442873, "macro_f1": 0.3337092227144583}, {"mcc": 0.0, "macro_f1": 0.33159268929503916}, {"mcc": -0.00822604007269173, "macro_f1": 0.3336601573519732}]}, "total": {"test_mcc": -0.20949600028358412, "test_mcc_se": 1.0556035718487051, "test_macro_f1": 38.18170189677683, "test_macro_f1_se": 3.962896083017162}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"em": 32.14562354763749, "f1": 38.033373104675235}, {"em": 26.434108527131784, "f1": 34.72648363303193}, {"em": 32.22565687789799, "f1": 40.58873323046332}, {"em": 30.373831775700936, "f1": 36.55530032147945}, {"em": 26.94980694980695, "f1": 32.90532733079317}, {"em": 32.07401696222051, "f1": 37.70830300514187}, {"em": 29.00531511009871, "f1": 37.45203188595973}, {"em": 33.591931730023276, "f1": 39.83329287128999}, {"em": 27.607843137254903, "f1": 34.64745893480976}, {"em": 26.009316770186334, "f1": 32.97208671508227}]}, "total": {"test_em": 29.641745138795887, "test_em_se": 1.731933289748133, "test_f1": 36.54223910327268, "test_f1_se": 1.6589586337043085}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.49204665959703076, "micro_f1": 0.3697688944409744}, {"micro_f1_no_misc": 0.5384983261597321, "micro_f1": 0.4482366325369738}, {"micro_f1_no_misc": 0.5089647194910354, "micro_f1": 0.42358803986710963}, {"micro_f1_no_misc": 0.5067197610751618, "micro_f1": 0.42392444910807975}, {"micro_f1_no_misc": 0.5315126050420167, "micro_f1": 0.4610951008645533}, {"micro_f1_no_misc": 0.5155993431855501, "micro_f1": 0.43619122876333466}, {"micro_f1_no_misc": 0.4733840304182509, "micro_f1": 0.4085221508285425}, {"micro_f1_no_misc": 0.5169851380042463, "micro_f1": 0.459307010475423}, {"micro_f1_no_misc": 0.5635036496350365, "micro_f1": 0.4758364312267658}, {"micro_f1_no_misc": 0.42659279778393344, "micro_f1": 0.36984687868080096}]}, "total": {"test_micro_f1_no_misc": 50.73807030391995, "test_micro_f1_no_misc_se": 2.335679892280937, "test_micro_f1": 42.76316816792558, "test_micro_f1_se": 2.2657680055155134}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"mcc": 0.10694512014140484, "macro_f1": 0.48547198736649966}, {"mcc": 0.0862914581733232, "macro_f1": 0.4247189297600682}, {"mcc": 0.08401618063807076, "macro_f1": 0.4696890586861934}, {"mcc": 0.13702705005673219, "macro_f1": 0.5624855979721944}, {"mcc": 0.14014349091798453, "macro_f1": 0.47875807707038637}, {"mcc": 0.09622767562238979, "macro_f1": 0.5013008984790824}, {"mcc": 0.11991500890379357, "macro_f1": 0.48453237092566653}, {"mcc": 0.13736015703135016, "macro_f1": 0.5427688124407442}, {"mcc": 0.09758054468233138, "macro_f1": 0.5460574797347089}, {"mcc": 0.12909844646643193, "macro_f1": 0.5306952828079589}]}, "total": {"test_mcc": 11.346051326338124, "test_mcc_se": 1.3581607535893452, "test_macro_f1": 50.26478495243503, "test_macro_f1_se": 2.630931804346084}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"bertscore": 0.5993258193193469, "rouge_l": 0.11782111090016575}, {"bertscore": 0.5896826635871548, "rouge_l": 0.13146812156752624}, {"bertscore": 0.581314715615008, "rouge_l": 0.11234909680794633}, {"bertscore": 0.6094972882419825, "rouge_l": 0.1301372042676247}, {"bertscore": 0.5815443077881355, "rouge_l": 0.11852646780304332}, {"bertscore": 0.5995744414394721, "rouge_l": 0.12518522025341589}, {"bertscore": 0.5693353886308614, "rouge_l": 0.11645037460821897}, {"bertscore": 0.6116992678580573, "rouge_l": 0.11839317417647502}, {"bertscore": 0.5756924668530701, "rouge_l": 0.11229814380612319}, {"bertscore": 0.6081704432144761, "rouge_l": 0.12252455498403425}]}, "total": {"test_bertscore": 59.25836802547565, "test_bertscore_se": 0.9403960137626357, "test_rouge_l": 12.051534691745736, "test_rouge_l_se": 0.4156967928710337}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"mcc": 0.12389356621656701, "accuracy": 0.3310546875}, {"mcc": 0.09389907261636839, "accuracy": 0.31787109375}, {"mcc": 0.10188179566823541, "accuracy": 0.31396484375}, {"mcc": 0.10334439453540263, "accuracy": 0.32275390625}, {"mcc": 0.08918580076717257, "accuracy": 0.306640625}, {"mcc": 0.0853661506811363, "accuracy": 0.30517578125}, {"mcc": 0.11673050930020581, "accuracy": 0.337890625}, {"mcc": 0.11050385141426554, "accuracy": 0.32568359375}, {"mcc": 0.09492097304938993, "accuracy": 0.31494140625}, {"mcc": 0.09869424347527493, "accuracy": 0.32177734375}]}, "total": {"test_mcc": 10.184203577240185, "test_mcc_se": 0.7556550025431692, "test_accuracy": 31.9775390625, "test_accuracy_se": 0.63665560314958}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"em": 38.57474825716499, "f1": 48.29764517486485}, {"em": 39.14728682170543, "f1": 48.874758322487274}, {"em": 41.57650695517774, "f1": 52.06571648091637}, {"em": 37.92834890965732, "f1": 46.11392438638001}, {"em": 36.833976833976834, "f1": 46.64990861741815}, {"em": 42.09714726291442, "f1": 51.19750110602108}, {"em": 37.43356112376613, "f1": 48.79522532839742}, {"em": 37.62606671838635, "f1": 47.724502965890366}, {"em": 36.07843137254902, "f1": 46.78820151838177}, {"em": 36.56832298136646, "f1": 47.44160138490776}]}, "total": {"test_em": 38.38643972366647, "test_em_se": 1.261776152637526, "test_f1": 48.39489852856651, "test_f1_se": 1.2032155437132042}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"accuracy": 0.24658203125, "mcc": 0.007558184635756412}, {"accuracy": 0.263671875, "mcc": 0.01710927721432579}, {"accuracy": 0.2470703125, "mcc": 0.010463610207333855}, {"accuracy": 0.263671875, "mcc": 0.039097806980007976}, {"accuracy": 0.26220703125, "mcc": 0.019030422854535323}, {"accuracy": 0.255859375, "mcc": 0.01439360725581211}, {"accuracy": 0.24755859375, "mcc": 0.004161367942926189}, {"accuracy": 0.24853515625, "mcc": 0.016654190702664134}, {"accuracy": 0.263671875, "mcc": 0.020130954782215695}, {"accuracy": 0.27197265625, "mcc": 0.03098514981988833}]}, "total": {"test_accuracy": 25.7080078125, "test_accuracy_se": 0.5673095982715873, "test_mcc": 1.7958457239546581, "test_mcc_se": 0.6508013767401515}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "microsoft/phi-2", "results": {"raw": {"test": [{"test_speed": 1691.9299999999998, "test_speed_short": 215.76}, {"test_speed": 2851.02, "test_speed_short": 405.59999999999997}, {"test_speed": 4002.6699999999996, "test_speed_short": 785.3199999999999}, {"test_speed": 4245.91, "test_speed_short": 973.8000000000001}, {"test_speed": 5328.18, "test_speed_short": 1153.6899999999998}, {"test_speed": 5704.55, "test_speed_short": 1515.06}, {"test_speed": 6515.57, "test_speed_short": 1703.04}, {"test_speed": 6434.67, "test_speed_short": 1735.95}, {"test_speed": 6890.5199999999995, "test_speed_short": 1909.44}, {"test_speed": 6848.7699999999995, "test_speed_short": 2083.35}]}, "total": {"test_speed": 5051.378999999999, "test_speed_se": 1113.5395899448959, "test_speed_short": 1248.101, "test_speed_short_se": 398.75283668411026}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"mcc": 0.07129077832157317, "macro_f1": 0.2837601641530198}, {"mcc": 0.07011987016016401, "macro_f1": 0.2526802527295839}, {"mcc": 0.07492611319607832, "macro_f1": 0.298498567212003}, {"mcc": 0.05924844014214266, "macro_f1": 0.24020226756163296}, {"mcc": 0.07012412022387805, "macro_f1": 0.25811539549216894}, {"mcc": 0.05610681834722177, "macro_f1": 0.27741435014170907}, {"mcc": 0.1053695222544253, "macro_f1": 0.28535194763527866}, {"mcc": 0.06718459273622983, "macro_f1": 0.2366432169182975}, {"mcc": 0.057842998954479974, "macro_f1": 0.25587547508311076}, {"mcc": 0.11882216231374734, "macro_f1": 0.3154883980377539}]}, "total": {"test_mcc": 7.510354166499404, "test_mcc_se": 1.2843349344901622, "test_macro_f1": 27.040300349645584, "test_macro_f1_se": 1.600471332442667}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.48484848484848486, "micro_f1": 0.36060100166944914}, {"micro_f1_no_misc": 0.48441674087266257, "micro_f1": 0.43719639139486466}, {"micro_f1_no_misc": 0.5399449035812672, "micro_f1": 0.4641093809342955}, {"micro_f1_no_misc": 0.47122479094933595, "micro_f1": 0.44336084021005256}, {"micro_f1_no_misc": 0.5163277880468269, "micro_f1": 0.42109038737446197}, {"micro_f1_no_misc": 0.5370902817711327, "micro_f1": 0.4359464627151052}, {"micro_f1_no_misc": 0.5193260654112983, "micro_f1": 0.37756801776790677}, {"micro_f1_no_misc": 0.5002718868950516, "micro_f1": 0.4577677224736048}, {"micro_f1_no_misc": 0.5961305925030229, "micro_f1": 0.49671484888304857}, {"micro_f1_no_misc": 0.4740502942750134, "micro_f1": 0.4008714596949892}]}, "total": {"test_micro_f1_no_misc": 51.23631829154096, "test_micro_f1_no_misc_se": 2.3824609481538785, "test_micro_f1": 42.95226513117779, "test_micro_f1_se": 2.542292522959876}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"mcc": 0.13352119174970026, "macro_f1": 0.4874483903131044}, {"mcc": 0.13148118366202852, "macro_f1": 0.5656270614661356}, {"mcc": 0.14053310143860795, "macro_f1": 0.5678003907271525}, {"mcc": 0.18259099178144492, "macro_f1": 0.5889647198687089}, {"mcc": 0.09328391674978674, "macro_f1": 0.47251516515116504}, {"mcc": 0.1166288860297902, "macro_f1": 0.5562796269787089}, {"mcc": 0.11897689067650638, "macro_f1": 0.502436151123395}, {"mcc": 0.09134806993846868, "macro_f1": 0.5208113383713634}, {"mcc": 0.17160460555705734, "macro_f1": 0.5852117761312703}, {"mcc": 0.10945628065725779, "macro_f1": 0.503443360966119}]}, "total": {"test_mcc": 12.894251182406485, "test_mcc_se": 1.8680643169545967, "test_macro_f1": 53.505379810971235, "test_macro_f1_se": 2.639591473842929}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"em": 61.1154144074361, "f1": 70.64511985047393}, {"em": 59.06976744186046, "f1": 69.98274201907704}, {"em": 58.34621329211747, "f1": 69.50461958875334}, {"em": 57.242990654205606, "f1": 66.88586262817446}, {"em": 59.536679536679536, "f1": 70.39125845008186}, {"em": 62.68311488049345, "f1": 71.49152375402035}, {"em": 58.84586180713743, "f1": 71.03252543134202}, {"em": 60.74476338246703, "f1": 69.38871039735398}, {"em": 56.31372549019608, "f1": 67.08140518915296}, {"em": 58.92857142857143, "f1": 70.27884258108529}]}, "total": {"test_em": 59.28271023211645, "test_em_se": 1.156558792144744, "test_f1": 69.66826098895153, "test_f1_se": 0.9618141133698744}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"bertscore": 0.6466401446377859, "rouge_l": 0.19389598641467537}, {"bertscore": 0.6573735526762903, "rouge_l": 0.18302915773728043}, {"bertscore": 0.6580803448596271, "rouge_l": 0.1926933823866479}, {"bertscore": 0.6538364247244317, "rouge_l": 0.19543238445388006}, {"bertscore": 0.6370324455056107, "rouge_l": 0.16398664900820242}, {"bertscore": 0.6424304547108477, "rouge_l": 0.1865397366274945}, {"bertscore": 0.628050649058423, "rouge_l": 0.15722284689686467}, {"bertscore": 0.659212700498756, "rouge_l": 0.18944418311388822}, {"bertscore": 0.6554613220941974, "rouge_l": 0.19021392293384773}, {"bertscore": 0.6283001979609253, "rouge_l": 0.1694493633309457}]}, "total": {"test_bertscore": 64.66418236726895, "test_bertscore_se": 0.7522432854271583, "test_rouge_l": 18.21907612903727, "test_rouge_l_se": 0.8459058027765887}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"mcc": 0.2685360474995128, "accuracy": 0.4443359375}, {"mcc": 0.2649242386522397, "accuracy": 0.4453125}, {"mcc": 0.25675293484981965, "accuracy": 0.4345703125}, {"mcc": 0.27545130445607807, "accuracy": 0.4541015625}, {"mcc": 0.25307364572944274, "accuracy": 0.419921875}, {"mcc": 0.2972752232628593, "accuracy": 0.462890625}, {"mcc": 0.2946136735946587, "accuracy": 0.45654296875}, {"mcc": 0.2747192743257746, "accuracy": 0.4404296875}, {"mcc": 0.25866652255588246, "accuracy": 0.443359375}, {"mcc": 0.26668300964823605, "accuracy": 0.43359375}]}, "total": {"test_mcc": 27.106958745745036, "test_mcc_se": 0.9284833694797693, "test_accuracy": 44.3505859375, "test_accuracy_se": 0.7746174650998288}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"accuracy": 0.35546875, "mcc": 0.15522317458648452}, {"accuracy": 0.38232421875, "mcc": 0.18420079266543024}, {"accuracy": 0.3251953125, "mcc": 0.10982670158193625}, {"accuracy": 0.32470703125, "mcc": 0.11090283554309346}, {"accuracy": 0.31591796875, "mcc": 0.12285379614852658}, {"accuracy": 0.34326171875, "mcc": 0.13261225080004369}, {"accuracy": 0.3427734375, "mcc": 0.1344323964724283}, {"accuracy": 0.29443359375, "mcc": 0.09195767502727896}, {"accuracy": 0.3994140625, "mcc": 0.20185689200300952}, {"accuracy": 0.36865234375, "mcc": 0.16465785549169434}]}, "total": {"test_accuracy": 34.521484375, "test_accuracy_se": 1.9856700819548694, "test_mcc": 14.08524370319926, "test_mcc_se": 2.172486179756659}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "occiglot/occiglot-7b-eu5", "results": {"raw": {"test": [{"test_speed": 1617.36, "test_speed_short": 134.2}, {"test_speed": 1554.2799999999997, "test_speed_short": 353.2}, {"test_speed": 2301.1200000000003, "test_speed_short": 771.02}, {"test_speed": 2762.06, "test_speed_short": 930.13}, {"test_speed": 3087.16, "test_speed_short": 1095.36}, {"test_speed": 3317.04, "test_speed_short": 1286.12}, {"test_speed": 2856.64, "test_speed_short": 856.5600000000001}, {"test_speed": 2924.1, "test_speed_short": 919.08}, {"test_speed": 3394.16, "test_speed_short": 1755.3799999999999}, {"test_speed": 3635.0600000000004, "test_speed_short": 1906.2999999999997}]}, "total": {"test_speed": 2744.898, "test_speed_se": 442.59008190166645, "test_speed_short": 1000.735, "test_speed_short_se": 341.695708900665}}, "num_model_parameters": 7241732096, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"mcc": 0.0812268485149229, "macro_f1": 0.2596046337008948}, {"mcc": 0.042864417515292505, "macro_f1": 0.22176873987499227}, {"mcc": 0.08339181383386295, "macro_f1": 0.26344480287102484}, {"mcc": 0.07098421315944603, "macro_f1": 0.2296157846229047}, {"mcc": 0.11005730214917625, "macro_f1": 0.25537319091638394}, {"mcc": 0.05942650908845859, "macro_f1": 0.21769558982500734}, {"mcc": 0.09997859037171855, "macro_f1": 0.24630347162339114}, {"mcc": 0.07724805163370119, "macro_f1": 0.21302241976951578}, {"mcc": 0.05740371731497854, "macro_f1": 0.22795844445328983}, {"mcc": 0.11265759295826576, "macro_f1": 0.2904967200157477}]}, "total": {"test_mcc": 7.952390565398233, "test_mcc_se": 1.4286379515323795, "test_macro_f1": 24.252837976731524, "test_macro_f1_se": 1.5331141870491358}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.46492575524833585, "micro_f1": 0.35668073136427564}, {"micro_f1_no_misc": 0.5456919060052219, "micro_f1": 0.45883134130146086}, {"micro_f1_no_misc": 0.5534274193548386, "micro_f1": 0.4340983606557377}, {"micro_f1_no_misc": 0.5150789851603639, "micro_f1": 0.4340195747553156}, {"micro_f1_no_misc": 0.5368115942028986, "micro_f1": 0.41825368307581745}, {"micro_f1_no_misc": 0.5489361702127659, "micro_f1": 0.39973873285434364}, {"micro_f1_no_misc": 0.5533114138093, "micro_f1": 0.3778677462887989}, {"micro_f1_no_misc": 0.5307730182176268, "micro_f1": 0.42524705132291996}, {"micro_f1_no_misc": 0.5746352413019079, "micro_f1": 0.44930417495029823}, {"micro_f1_no_misc": 0.4907730673316708, "micro_f1": 0.3757377049180327}]}, "total": {"test_micro_f1_no_misc": 53.143645708449306, "test_micro_f1_no_misc_se": 2.0360316265067633, "test_micro_f1": 41.29779101487, "test_micro_f1_se": 2.112352054596076}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"mcc": 0.183668487855436, "macro_f1": 0.5917777987595932}, {"mcc": 0.10487226372089598, "macro_f1": 0.4789474922725756}, {"mcc": 0.1892712995541878, "macro_f1": 0.5942374105779398}, {"mcc": 0.19133468370846113, "macro_f1": 0.5947768994773024}, {"mcc": 0.1252746819895536, "macro_f1": 0.5475361202797724}, {"mcc": 0.17026784073852913, "macro_f1": 0.5839178627702462}, {"mcc": 0.16231965361897513, "macro_f1": 0.4900570297761758}, {"mcc": 0.13648747032915357, "macro_f1": 0.5662148070907195}, {"mcc": 0.20004306925095316, "macro_f1": 0.5499750367460651}, {"mcc": 0.19791628623324, "macro_f1": 0.5561709708988131}]}, "total": {"test_mcc": 16.614557369993854, "test_mcc_se": 2.0630073093927717, "test_macro_f1": 55.53611428649202, "test_macro_f1_se": 2.573751692729951}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"em": 63.206816421378775, "f1": 73.97363757976399}, {"em": 65.11627906976744, "f1": 75.15347578950959}, {"em": 62.210200927357036, "f1": 73.33314468460055}, {"em": 62.07165109034268, "f1": 72.56917643301156}, {"em": 62.084942084942085, "f1": 73.06467389537102}, {"em": 66.9236700077101, "f1": 76.17115933893162}, {"em": 62.56643887623387, "f1": 74.17214497790943}, {"em": 63.847944142746314, "f1": 73.39609429684708}, {"em": 61.01960784313726, "f1": 72.4047087790655}, {"em": 62.18944099378882, "f1": 73.6999708181226}]}, "total": {"test_em": 63.12369914574043, "test_em_se": 1.0848574022446953, "test_f1": 73.79381865931329, "test_f1_se": 0.7161713942734751}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"bertscore": 0.6655820088926703, "rouge_l": 0.2033786193305577}, {"bertscore": 0.6685484604095109, "rouge_l": 0.19611305060188364}, {"bertscore": 0.6787087291450007, "rouge_l": 0.21273576120233512}, {"bertscore": 0.6767334476462565, "rouge_l": 0.21184356344121985}, {"bertscore": 0.6581592989095952, "rouge_l": 0.16967960248100022}, {"bertscore": 0.6650797319598496, "rouge_l": 0.2001436016691814}, {"bertscore": 0.6395407491945662, "rouge_l": 0.1681729066456817}, {"bertscore": 0.6696054682251997, "rouge_l": 0.19526396988544387}, {"bertscore": 0.6662659775029169, "rouge_l": 0.19795164840403257}, {"bertscore": 0.6581067494407762, "rouge_l": 0.19425104194712445}]}, "total": {"test_bertscore": 66.46330621326342, "test_bertscore_se": 0.6859974675924159, "test_rouge_l": 19.495337656084605, "test_rouge_l_se": 0.9391416610427565}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"mcc": 0.24894428554464132, "accuracy": 0.4228515625}, {"mcc": 0.27063392251735036, "accuracy": 0.451171875}, {"mcc": 0.2775289251819686, "accuracy": 0.44775390625}, {"mcc": 0.2528714382309833, "accuracy": 0.4306640625}, {"mcc": 0.28942775213165867, "accuracy": 0.4609375}, {"mcc": 0.3181778988254797, "accuracy": 0.486328125}, {"mcc": 0.28264323084314896, "accuracy": 0.462890625}, {"mcc": 0.29573028278215907, "accuracy": 0.470703125}, {"mcc": 0.2695436200344988, "accuracy": 0.44970703125}, {"mcc": 0.29206713950058194, "accuracy": 0.4658203125}]}, "total": {"test_mcc": 27.975684955924713, "test_mcc_se": 1.281844217885408, "test_accuracy": 45.48828125, "test_accuracy_se": 1.1620214427516877}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"accuracy": 0.37939453125, "mcc": 0.18138042065250426}, {"accuracy": 0.365234375, "mcc": 0.15658214876851356}, {"accuracy": 0.3330078125, "mcc": 0.11974932859813135}, {"accuracy": 0.33447265625, "mcc": 0.12324170066808655}, {"accuracy": 0.34619140625, "mcc": 0.1387033592458597}, {"accuracy": 0.359375, "mcc": 0.15248293682821898}, {"accuracy": 0.35400390625, "mcc": 0.1503923236767439}, {"accuracy": 0.33056640625, "mcc": 0.1260222658022612}, {"accuracy": 0.38916015625, "mcc": 0.18821484429238822}, {"accuracy": 0.3779296875, "mcc": 0.17197704934138097}]}, "total": {"test_accuracy": 35.693359375, "test_accuracy_se": 1.299277380270759, "test_mcc": 15.087463778740887, "test_mcc_se": 1.5046175912262731}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "occiglot/occiglot-7b-eu5-instruct", "results": {"raw": {"test": [{"test_speed": 1591.6000000000001, "test_speed_short": 232.42999999999998}, {"test_speed": 2406.04, "test_speed_short": 413.2}, {"test_speed": 2118.88, "test_speed_short": 761.9}, {"test_speed": 2758.44, "test_speed_short": 898.17}, {"test_speed": 3091.68, "test_speed_short": 1053.92}, {"test_speed": 3262.8399999999997, "test_speed_short": 1248.38}, {"test_speed": 3741.44, "test_speed_short": 1441.71}, {"test_speed": 3602.78, "test_speed_short": 1594.36}, {"test_speed": 3540.32, "test_speed_short": 1741.2399999999998}, {"test_speed": 3436.62, "test_speed_short": 1887.6}]}, "total": {"test_speed": 2955.0639999999994, "test_speed_se": 444.2695540754797, "test_speed_short": 1127.291, "test_speed_short_se": 345.0175911716216}}, "num_model_parameters": 7241748480, "max_sequence_length": 32768, "vocabulary_size": 32002, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"bertscore": 0.5573944420029875, "rouge_l": 0.13038814731923923}, {"bertscore": 0.5852949356049066, "rouge_l": 0.14679689504552043}, {"bertscore": 0.5900949654023862, "rouge_l": 0.13766955201192554}, {"bertscore": 0.5695770575548522, "rouge_l": 0.1497479616743002}, {"bertscore": 0.5690834621054819, "rouge_l": 0.13349357160437503}, {"bertscore": 0.563401580300706, "rouge_l": 0.14386363905376298}, {"bertscore": 0.5558790890645469, "rouge_l": 0.12918769332092245}, {"bertscore": 0.5541941010014853, "rouge_l": 0.12495357292070247}, {"bertscore": 0.5995471808419097, "rouge_l": 0.1457595714009951}, {"bertscore": 0.5723435426480137, "rouge_l": 0.13360077573293952}]}, "total": {"test_bertscore": 57.16810356527275, "test_bertscore_se": 0.955899833322896, "test_rouge_l": 13.754613800846831, "test_rouge_l_se": 0.5281543784830132}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"mcc": 0.20974627165647453, "accuracy": 0.40478515625}, {"mcc": 0.20421531019420716, "accuracy": 0.4013671875}, {"mcc": 0.21478830908931165, "accuracy": 0.41064453125}, {"mcc": 0.24880191477968272, "accuracy": 0.43212890625}, {"mcc": 0.18301435877436684, "accuracy": 0.38525390625}, {"mcc": 0.2133742390319255, "accuracy": 0.408203125}, {"mcc": 0.1949690421792669, "accuracy": 0.3935546875}, {"mcc": 0.2196608208371329, "accuracy": 0.41357421875}, {"mcc": 0.1804210537669653, "accuracy": 0.384765625}, {"mcc": 0.1958866635897574, "accuracy": 0.39404296875}]}, "total": {"test_mcc": 20.648779838990908, "test_mcc_se": 1.2347238974177794, "test_accuracy": 40.283203125, "test_accuracy_se": 0.8930012730115564}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"accuracy": 0.34228515625, "mcc": 0.12137675011686362}, {"accuracy": 0.3564453125, "mcc": 0.14035963635147486}, {"accuracy": 0.3212890625, "mcc": 0.09207003642564429}, {"accuracy": 0.3330078125, "mcc": 0.10980273695097768}, {"accuracy": 0.3583984375, "mcc": 0.14159366601815704}, {"accuracy": 0.345703125, "mcc": 0.12858775990210772}, {"accuracy": 0.36474609375, "mcc": 0.15360536313367917}, {"accuracy": 0.33837890625, "mcc": 0.12080258344897482}, {"accuracy": 0.349609375, "mcc": 0.13091100198278063}, {"accuracy": 0.359375, "mcc": 0.14503659559578685}]}, "total": {"test_accuracy": 34.6923828125, "test_accuracy_se": 0.8389731361353224, "test_mcc": 12.841461299264466, "test_mcc_se": 1.1331711821668944}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"test_speed": 814.1999999999999, "test_speed_short": 103.18}, {"test_speed": 1501.5, "test_speed_short": 187.2}, {"test_speed": 1977.4399999999998, "test_speed_short": 356.05999999999995}, {"test_speed": 2476.08, "test_speed_short": 437.1}, {"test_speed": 2684.88, "test_speed_short": 518.0}, {"test_speed": 3056.8799999999997, "test_speed_short": 664.52}, {"test_speed": 3494.96, "test_speed_short": 742.8499999999999}, {"test_speed": 3689.42, "test_speed_short": 822.4799999999999}, {"test_speed": 3857.0, "test_speed_short": 899.91}, {"test_speed": 3977.82, "test_speed_short": 977.9000000000001}]}, "total": {"test_speed": 2753.018, "test_speed_se": 661.3445905556677, "test_speed_short": 570.9200000000001, "test_speed_short_se": 185.7542074655957}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "microsoft/Phi-3-mini-4k-instruct", "results": {"raw": {"test": [{"bertscore": 0.5568059768411331, "rouge_l": 0.13121439031832283}, {"bertscore": 0.5841299545718357, "rouge_l": 0.1467121597009773}, {"bertscore": 0.5889606589626055, "rouge_l": 0.13727474962326747}, {"bertscore": 0.5726260959345382, "rouge_l": 0.1515582136448907}, {"bertscore": 0.572063229381456, "rouge_l": 0.13418994131619835}, {"bertscore": 0.5658455364900874, "rouge_l": 0.14310484773917034}, {"bertscore": 0.555968249864236, "rouge_l": 0.1292660024310103}, {"bertscore": 0.5522098019864643, "rouge_l": 0.12321026888567918}, {"bertscore": 0.5994343057827791, "rouge_l": 0.1444637410349471}, {"bertscore": 0.5709360985347303, "rouge_l": 0.13182059666964308}]}, "total": {"test_bertscore": 57.18979908349866, "test_bertscore_se": 0.9481821549402464, "test_rouge_l": 13.72814911364107, "test_rouge_l_se": 0.5530977663781691}}, "num_model_parameters": 3821079552, "max_sequence_length": 4096, "vocabulary_size": 32064, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"mcc": 0.13749216385150242, "macro_f1": 0.41412722066695346}, {"mcc": 0.11215834270556364, "macro_f1": 0.40631941529584986}, {"mcc": 0.11499023343310169, "macro_f1": 0.3735317739250547}, {"mcc": 0.15418212056603336, "macro_f1": 0.41838433425552296}, {"mcc": 0.1463217648218729, "macro_f1": 0.4016082206418919}, {"mcc": 0.12461811785026543, "macro_f1": 0.3828926914663255}, {"mcc": 0.13732790634982112, "macro_f1": 0.3534857271884739}, {"mcc": 0.1416277116822788, "macro_f1": 0.4191917632551078}, {"mcc": 0.10709354223220453, "macro_f1": 0.3905063374877617}, {"mcc": 0.15075891251883528, "macro_f1": 0.4327024441667968}]}, "total": {"test_mcc": 13.265708160114793, "test_mcc_se": 1.0440727188122032, "test_macro_f1": 39.92749928349738, "test_macro_f1_se": 1.498333501758683}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.33992805755395683, "micro_f1": 0.2671563002065641}, {"micro_f1_no_misc": 0.3373584905660378, "micro_f1": 0.31242532855436084}, {"micro_f1_no_misc": 0.30967206309672063, "micro_f1": 0.27280064568200163}, {"micro_f1_no_misc": 0.28038628038628044, "micro_f1": 0.27243417517463725}, {"micro_f1_no_misc": 0.3835360149672591, "micro_f1": 0.2921875}, {"micro_f1_no_misc": 0.30604982206405695, "micro_f1": 0.2935779816513761}, {"micro_f1_no_misc": 0.3452428743476515, "micro_f1": 0.27056424201223656}, {"micro_f1_no_misc": 0.27181467181467184, "micro_f1": 0.24645527197731373}, {"micro_f1_no_misc": 0.33126807104502276, "micro_f1": 0.2665289256198347}, {"micro_f1_no_misc": 0.3008521674694331, "micro_f1": 0.26294820717131473}]}, "total": {"test_micro_f1_no_misc": 32.061085133110915, "test_micro_f1_no_misc_se": 2.0716639041711766, "test_micro_f1": 27.570785780496397, "test_micro_f1_se": 1.1615051525151834}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"mcc": 0.0786963396428459, "macro_f1": 0.46345120723752703}, {"mcc": 0.05890330443937126, "macro_f1": 0.46491901178459494}, {"mcc": 0.016259680814245704, "macro_f1": 0.452329345531316}, {"mcc": 0.039115454485404405, "macro_f1": 0.46669709875038257}, {"mcc": 0.06040745348545034, "macro_f1": 0.4797147915719424}, {"mcc": 1.7818000428905007e-05, "macro_f1": 0.4601627284372667}, {"mcc": 0.09066450549876778, "macro_f1": 0.5009746588693957}, {"mcc": 0.08388223379332521, "macro_f1": 0.4626707723383117}, {"mcc": 0.06725798087276781, "macro_f1": 0.45568628310582354}, {"mcc": 0.021160101244648975, "macro_f1": 0.4268699487317259}]}, "total": {"test_mcc": 5.163648722772562, "test_mcc_se": 1.9237942604027582, "test_macro_f1": 46.334758463582865, "test_macro_f1_se": 1.172127703131431}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"em": 61.03795507358637, "f1": 71.31739940971863}, {"em": 60.3875968992248, "f1": 71.03582201351456}, {"em": 60.51004636785162, "f1": 71.98396118230048}, {"em": 59.65732087227414, "f1": 69.78009256415544}, {"em": 60.61776061776062, "f1": 71.34581121423214}, {"em": 61.37239784117194, "f1": 71.10949816223469}, {"em": 60.36446469248292, "f1": 72.39983112377278}, {"em": 59.42591155934833, "f1": 69.52531551570328}, {"em": 58.588235294117645, "f1": 69.82886307821283}, {"em": 60.9472049689441, "f1": 71.17337786890737}]}, "total": {"test_em": 60.29088941867625, "test_em_se": 0.5217178488833257, "test_f1": 70.94999721327522, "test_f1_se": 0.5916471703546302}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"mcc": 0.1625343630466955, "macro_f1": 0.43848046026697957}, {"mcc": 0.14819184050488635, "macro_f1": 0.44131592703311157}, {"mcc": 0.07753271825655852, "macro_f1": 0.39801920768307325}, {"mcc": 0.1733247267560933, "macro_f1": 0.43293352396277934}, {"mcc": 0.1881913025546264, "macro_f1": 0.4385278536992589}, {"mcc": 0.15124017178777044, "macro_f1": 0.393891882728465}, {"mcc": 0.1440886483116655, "macro_f1": 0.4122499130822764}, {"mcc": 0.14445366134189158, "macro_f1": 0.40827559093812965}, {"mcc": 0.0992423757115138, "macro_f1": 0.40317949996364505}, {"mcc": 0.06320456107159544, "macro_f1": 0.33616381451861077}]}, "total": {"test_mcc": 13.520043693432967, "test_mcc_se": 2.563970274042817, "test_macro_f1": 41.030376738763294, "test_macro_f1_se": 1.9643544364377037}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"bertscore": 0.6619193271762924, "rouge_l": 0.1818565429119086}, {"bertscore": 0.6631970972957788, "rouge_l": 0.18479451574726852}, {"bertscore": 0.6602360262186266, "rouge_l": 0.1753726448212931}, {"bertscore": 0.6755109137593536, "rouge_l": 0.1995416178330502}, {"bertscore": 0.6596141565969447, "rouge_l": 0.16755662680090078}, {"bertscore": 0.6693271613912657, "rouge_l": 0.18947321571086667}, {"bertscore": 0.6581783014262328, "rouge_l": 0.1702993892506838}, {"bertscore": 0.6604171007638797, "rouge_l": 0.17459477612821547}, {"bertscore": 0.6554536877811188, "rouge_l": 0.1695221862765732}, {"bertscore": 0.6658122418884886, "rouge_l": 0.1709504973238979}]}, "total": {"test_bertscore": 66.29666014297982, "test_bertscore_se": 0.36471606734852785, "test_rouge_l": 17.839620128046583, "test_rouge_l_se": 0.6413242593965581}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.36355051935788474, "micro_f1": 0.2700069589422408}, {"micro_f1_no_misc": 0.3560528992878942, "micro_f1": 0.3092369477911646}, {"micro_f1_no_misc": 0.40301467671558905, "micro_f1": 0.31570338058887676}, {"micro_f1_no_misc": 0.2744186046511628, "micro_f1": 0.2624457492979321}, {"micro_f1_no_misc": 0.3756119270137962, "micro_f1": 0.2821997105643994}, {"micro_f1_no_misc": 0.3138977635782748, "micro_f1": 0.2957658779576588}, {"micro_f1_no_misc": 0.3577364463791056, "micro_f1": 0.26984834968777877}, {"micro_f1_no_misc": 0.30082644628099175, "micro_f1": 0.25840220385674934}, {"micro_f1_no_misc": 0.42388561816652653, "micro_f1": 0.32223701731025295}, {"micro_f1_no_misc": 0.29315068493150687, "micro_f1": 0.24007651841224292}]}, "total": {"test_micro_f1_no_misc": 34.621455863627325, "test_micro_f1_no_misc_se": 3.045418165444872, "test_micro_f1": 28.259227144092968, "test_micro_f1_se": 1.6874623420119437}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"mcc": 0.07297151598973677, "macro_f1": 0.4364743376437777}, {"mcc": 0.049226495774578334, "macro_f1": 0.430020941313641}, {"mcc": -0.01356928885376083, "macro_f1": 0.4067697485877542}, {"mcc": 0.07506191680160226, "macro_f1": 0.45916098362989893}, {"mcc": 0.0732703267926722, "macro_f1": 0.4422532023326791}, {"mcc": -0.006217820489205119, "macro_f1": 0.4352080936697291}, {"mcc": 0.06459223384895954, "macro_f1": 0.4416026267507881}, {"mcc": 0.07083138147935299, "macro_f1": 0.41997580327745043}, {"mcc": 0.08029792273853117, "macro_f1": 0.4199943358821863}, {"mcc": 0.04662396173947149, "macro_f1": 0.436364415195436}]}, "total": {"test_mcc": 5.130886458219388, "test_mcc_se": 2.1128546159175623, "test_macro_f1": 43.278244882833405, "test_macro_f1_se": 0.9023322380562249}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"mcc": 0.247423659842776, "accuracy": 0.43603515625}, {"mcc": 0.2626416404481749, "accuracy": 0.44677734375}, {"mcc": 0.2570376818515962, "accuracy": 0.44384765625}, {"mcc": 0.2516392727237481, "accuracy": 0.4384765625}, {"mcc": 0.2589555805338578, "accuracy": 0.44287109375}, {"mcc": 0.2689232230373374, "accuracy": 0.44970703125}, {"mcc": 0.2800655118749655, "accuracy": 0.4580078125}, {"mcc": 0.25752322785109166, "accuracy": 0.44287109375}, {"mcc": 0.24359216931614222, "accuracy": 0.4287109375}, {"mcc": 0.26018055833678644, "accuracy": 0.443359375}]}, "total": {"test_mcc": 25.879825258164757, "test_mcc_se": 0.6494347320705439, "test_accuracy": 44.306640625, "test_accuracy_se": 0.48824229011236486}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"em": 61.89000774593338, "f1": 71.64444560613195}, {"em": 62.093023255813954, "f1": 72.31324852831075}, {"em": 60.43276661514683, "f1": 71.97590848079639}, {"em": 59.73520249221184, "f1": 70.26121909072819}, {"em": 60.15444015444015, "f1": 71.88381245043426}, {"em": 62.60601387818042, "f1": 71.98460173746071}, {"em": 60.06074411541382, "f1": 72.52476049002055}, {"em": 59.891388673390225, "f1": 69.73459135017875}, {"em": 58.90196078431372, "f1": 70.18827016350227}, {"em": 61.41304347826087, "f1": 72.03661194332157}]}, "total": {"test_em": 60.717859119310525, "test_em_se": 0.747491326460262, "test_f1": 71.45474698408853, "test_f1_se": 0.6191299989921587}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"accuracy": 0.34228515625, "mcc": 0.13040178626136112}, {"accuracy": 0.33203125, "mcc": 0.12686533947973255}, {"accuracy": 0.33740234375, "mcc": 0.12155042353444669}, {"accuracy": 0.306640625, "mcc": 0.08200323848090652}, {"accuracy": 0.33837890625, "mcc": 0.12631340666987406}, {"accuracy": 0.32470703125, "mcc": 0.10688365522427766}, {"accuracy": 0.30517578125, "mcc": 0.07052321672921913}, {"accuracy": 0.3701171875, "mcc": 0.1598439644629954}, {"accuracy": 0.3310546875, "mcc": 0.114161782273389}, {"accuracy": 0.33056640625, "mcc": 0.1285910917150734}]}, "total": {"test_accuracy": 33.18359375, "test_accuracy_se": 1.1404522007498146, "test_mcc": 11.671379048312755, "test_mcc_se": 1.580037581063218}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-178", "results": {"raw": {"test": [{"test_speed": 1647.2, "test_speed_short": 213.6}, {"test_speed": 2647.98, "test_speed_short": 399.75}, {"test_speed": 3755.8, "test_speed_short": 755.74}, {"test_speed": 3726.06, "test_speed_short": 918.72}, {"test_speed": 4633.2, "test_speed_short": 1090.05}, {"test_speed": 5211.9800000000005, "test_speed_short": 1422.1499999999999}, {"test_speed": 5955.830000000001, "test_speed_short": 1583.36}, {"test_speed": 5840.01, "test_speed_short": 1666.37}, {"test_speed": 5893.54, "test_speed_short": 1843.92}, {"test_speed": 6063.650000000001, "test_speed_short": 2010.2499999999998}]}, "total": {"test_speed": 4537.525, "test_speed_se": 958.7288133498555, "test_speed_short": 1190.391, "test_speed_short_se": 380.16986684764044}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"bertscore": 0.6545730889774859, "rouge_l": 0.18092965123552446}, {"bertscore": 0.6558189640054479, "rouge_l": 0.17798088728344255}, {"bertscore": 0.6608585941721685, "rouge_l": 0.1794710612915058}, {"bertscore": 0.672997338231653, "rouge_l": 0.1960921972523403}, {"bertscore": 0.6525349816220114, "rouge_l": 0.1626974579454027}, {"bertscore": 0.6620874459622428, "rouge_l": 0.182439633913502}, {"bertscore": 0.6529629864235176, "rouge_l": 0.16643202842920615}, {"bertscore": 0.6528595700801816, "rouge_l": 0.16938315820923483}, {"bertscore": 0.6551638017554069, "rouge_l": 0.17112688264371678}, {"bertscore": 0.656845113800955, "rouge_l": 0.16293740020513037}]}, "total": {"test_bertscore": 65.7670188503107, "test_bertscore_se": 0.38995383371836995, "test_rouge_l": 17.494903584090057, "test_rouge_l_se": 0.6471162770660752}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"mcc": 0.2499810425237319, "accuracy": 0.4384765625}, {"mcc": 0.258410748444724, "accuracy": 0.44287109375}, {"mcc": 0.2614345727869977, "accuracy": 0.44775390625}, {"mcc": 0.24447820356041283, "accuracy": 0.43310546875}, {"mcc": 0.24807126905307494, "accuracy": 0.4345703125}, {"mcc": 0.2765757950468143, "accuracy": 0.45458984375}, {"mcc": 0.26419016353917196, "accuracy": 0.44580078125}, {"mcc": 0.23311249803896358, "accuracy": 0.42333984375}, {"mcc": 0.2547001810351889, "accuracy": 0.4384765625}, {"mcc": 0.27863170442995877, "accuracy": 0.45703125}]}, "total": {"test_mcc": 25.695861784590385, "test_mcc_se": 0.8731634809448819, "test_accuracy": 44.16015625, "test_accuracy_se": 0.6329282435517157}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"accuracy": 0.35546875, "mcc": 0.1484688255812365}, {"accuracy": 0.32958984375, "mcc": 0.11544199610101967}, {"accuracy": 0.34716796875, "mcc": 0.13307173941166406}, {"accuracy": 0.3095703125, "mcc": 0.09169681686514757}, {"accuracy": 0.33837890625, "mcc": 0.12297736498920044}, {"accuracy": 0.3173828125, "mcc": 0.09370849623445655}, {"accuracy": 0.31298828125, "mcc": 0.084850814818567}, {"accuracy": 0.36279296875, "mcc": 0.1513548885424795}, {"accuracy": 0.3310546875, "mcc": 0.11387204235001233}, {"accuracy": 0.3330078125, "mcc": 0.1290858227661207}]}, "total": {"test_accuracy": 33.3740234375, "test_accuracy_se": 1.0981791694590541, "test_mcc": 11.845288076599044, "test_mcc_se": 1.4340719296881332}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-356", "results": {"raw": {"test": [{"test_speed": 1632.29, "test_speed_short": 207.12}, {"test_speed": 2656.44, "test_speed_short": 389.25}, {"test_speed": 3823.32, "test_speed_short": 756.61}, {"test_speed": 3779.45, "test_speed_short": 910.4399999999999}, {"test_speed": 4752.54, "test_speed_short": 1074.57}, {"test_speed": 5376.17, "test_speed_short": 1407.3300000000002}, {"test_speed": 6171.87, "test_speed_short": 1578.24}, {"test_speed": 6047.58, "test_speed_short": 1661.3999999999999}, {"test_speed": 6145.9400000000005, "test_speed_short": 1838.46}, {"test_speed": 6301.99, "test_speed_short": 2000.8999999999999}]}, "total": {"test_speed": 4668.759, "test_speed_se": 1018.2919680716357, "test_speed_short": 1182.432, "test_speed_short_se": 380.0707607547335}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"mcc": 0.1654566155177382, "macro_f1": 0.45039809472296094}, {"mcc": 0.14099652065422918, "macro_f1": 0.4281579284416462}, {"mcc": 0.08673932042463457, "macro_f1": 0.3978929870818833}, {"mcc": 0.16497750555149224, "macro_f1": 0.42821580515044894}, {"mcc": 0.20661385653878692, "macro_f1": 0.43736547629280015}, {"mcc": 0.14383191184172608, "macro_f1": 0.38605819074342307}, {"mcc": 0.1361238795431242, "macro_f1": 0.3872130032831886}, {"mcc": 0.1307971053992041, "macro_f1": 0.40515169512055665}, {"mcc": 0.10296448013349942, "macro_f1": 0.40114790696839414}, {"mcc": 0.09142310029181351, "macro_f1": 0.35550271739130435}]}, "total": {"test_mcc": 13.699242958962484, "test_mcc_se": 2.2915432880089113, "test_macro_f1": 40.771038051966066, "test_macro_f1_se": 1.7653103339286085}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3828085957021489, "micro_f1": 0.27187646233036966}, {"micro_f1_no_misc": 0.40963855421686746, "micro_f1": 0.3222821896684657}, {"micro_f1_no_misc": 0.4013840830449827, "micro_f1": 0.30131121220230134}, {"micro_f1_no_misc": 0.3081252198381991, "micro_f1": 0.29354838709677417}, {"micro_f1_no_misc": 0.39471199244570354, "micro_f1": 0.28770532603285215}, {"micro_f1_no_misc": 0.3321523472099203, "micro_f1": 0.3113975576662144}, {"micro_f1_no_misc": 0.38786843229389145, "micro_f1": 0.27254419333184154}, {"micro_f1_no_misc": 0.3216348289649045, "micro_f1": 0.2778409090909091}, {"micro_f1_no_misc": 0.4104609929078014, "micro_f1": 0.29247764334560755}, {"micro_f1_no_misc": 0.30164670658682635, "micro_f1": 0.24181360201511334}]}, "total": {"test_micro_f1_no_misc": 36.50431753211246, "test_micro_f1_no_misc_se": 2.7185172354378317, "test_micro_f1": 28.72797482780449, "test_micro_f1_se": 1.4135084639742306}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"mcc": 0.08375136329448662, "macro_f1": 0.4413529732678669}, {"mcc": 0.05524563471409866, "macro_f1": 0.4263043640991092}, {"mcc": -0.02104235208749127, "macro_f1": 0.40320927853951777}, {"mcc": 0.06034005527848655, "macro_f1": 0.4496858275790643}, {"mcc": 0.06236004676863409, "macro_f1": 0.45442782661643266}, {"mcc": -0.0013283088454780598, "macro_f1": 0.44441430199121046}, {"mcc": 0.0971515383670183, "macro_f1": 0.4561618214145519}, {"mcc": 0.052611741705993524, "macro_f1": 0.40555160343286706}, {"mcc": 0.04384238776682977, "macro_f1": 0.3927169187128674}, {"mcc": 0.04769447286225972, "macro_f1": 0.445195657306652}]}, "total": {"test_mcc": 4.806265798248378, "test_mcc_se": 2.2004589891763473, "test_macro_f1": 43.1902057296014, "test_macro_f1_se": 1.4490509208011901}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"em": 61.270333075135554, "f1": 71.39357125456047}, {"em": 61.24031007751938, "f1": 71.77902841951166}, {"em": 60.58732612055641, "f1": 71.99653475012472}, {"em": 60.124610591900314, "f1": 70.44395747302677}, {"em": 60.15444015444015, "f1": 72.02028062067853}, {"em": 62.22050886661527, "f1": 72.2193560730813}, {"em": 59.757023538344725, "f1": 72.5972787267755}, {"em": 60.27928626842514, "f1": 70.4279455679249}, {"em": 58.90196078431372, "f1": 70.34533270880014}, {"em": 61.80124223602485, "f1": 73.00763272723056}]}, "total": {"test_em": 60.63370417132755, "test_em_se": 0.6211306588788033, "test_f1": 71.62309183217145, "test_f1_se": 0.5860782340269844}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"bertscore": 0.6596110718382988, "rouge_l": 0.18573096176998927}, {"bertscore": 0.6584489357337588, "rouge_l": 0.18371671893192965}, {"bertscore": 0.6593526038923301, "rouge_l": 0.18094493689650507}, {"bertscore": 0.6767200428876095, "rouge_l": 0.1999753267755175}, {"bertscore": 0.654124743421562, "rouge_l": 0.1667975606936409}, {"bertscore": 0.6663553498219699, "rouge_l": 0.18865314840350345}, {"bertscore": 0.6539007799292449, "rouge_l": 0.17274759801202}, {"bertscore": 0.6570177439571125, "rouge_l": 0.1735177349278476}, {"bertscore": 0.6585980618692702, "rouge_l": 0.17875776294059526}, {"bertscore": 0.6570519697415875, "rouge_l": 0.1659544347901047}]}, "total": {"test_bertscore": 66.01181303092744, "test_bertscore_se": 0.4204017166162691, "test_rouge_l": 17.967961841416532, "test_rouge_l_se": 0.6494532930068725}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"mcc": 0.27570894159375015, "accuracy": 0.45751953125}, {"mcc": 0.2637801189053716, "accuracy": 0.447265625}, {"mcc": 0.27357398558493795, "accuracy": 0.45654296875}, {"mcc": 0.2434987740715526, "accuracy": 0.43310546875}, {"mcc": 0.2620776238009569, "accuracy": 0.4443359375}, {"mcc": 0.2865488563960343, "accuracy": 0.46240234375}, {"mcc": 0.2918421326209906, "accuracy": 0.46533203125}, {"mcc": 0.251324779980394, "accuracy": 0.4375}, {"mcc": 0.2499066005327495, "accuracy": 0.43359375}, {"mcc": 0.27814212568587626, "accuracy": 0.45654296875}]}, "total": {"test_mcc": 26.76403939172614, "test_mcc_se": 1.004349951732635, "test_accuracy": 44.94140625, "test_accuracy_se": 0.7385900685134079}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"accuracy": 0.3603515625, "mcc": 0.15650325486079494}, {"accuracy": 0.32763671875, "mcc": 0.11652719029186816}, {"accuracy": 0.34716796875, "mcc": 0.1367087988835828}, {"accuracy": 0.314453125, "mcc": 0.09607285367655319}, {"accuracy": 0.3564453125, "mcc": 0.14788251178307096}, {"accuracy": 0.33251953125, "mcc": 0.11538055046274646}, {"accuracy": 0.3154296875, "mcc": 0.0880961174802214}, {"accuracy": 0.37109375, "mcc": 0.1608669220363457}, {"accuracy": 0.35546875, "mcc": 0.14502707552684113}, {"accuracy": 0.3310546875, "mcc": 0.1295911454532897}]}, "total": {"test_accuracy": 34.1162109375, "test_accuracy_se": 1.2176882926829866, "test_mcc": 12.926564204553143, "test_mcc_se": 1.5381526559367207}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "../fietje-test/fietje-2b-sft/fietje-2b-sft-ckpt-534", "results": {"raw": {"test": [{"test_speed": 1606.73, "test_speed_short": 205.76}, {"test_speed": 2607.0899999999997, "test_speed_short": 386.70000000000005}, {"test_speed": 3743.14, "test_speed_short": 745.3}, {"test_speed": 3714.82, "test_speed_short": 901.4399999999999}, {"test_speed": 4636.71, "test_speed_short": 1066.83}, {"test_speed": 5233.03, "test_speed_short": 1386.24}, {"test_speed": 5985.29, "test_speed_short": 1555.2}, {"test_speed": 5862.45, "test_speed_short": 1640.1000000000001}, {"test_speed": 5912.469999999999, "test_speed_short": 1808.8200000000002}, {"test_speed": 6126.74, "test_speed_short": 1970.3}]}, "total": {"test_speed": 4542.847, "test_speed_se": 977.488834602835, "test_speed_short": 1166.6689999999999, "test_speed_short_se": 373.66150138494754}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"mcc": 0.07933394829872478, "macro_f1": 0.3124075949040343}, {"mcc": 0.10059155549239204, "macro_f1": 0.3421129984540376}, {"mcc": 0.07620186744751482, "macro_f1": 0.35365884438217393}, {"mcc": 0.0836136008988403, "macro_f1": 0.341723767893877}, {"mcc": 0.17192739099293655, "macro_f1": 0.4020047142558137}, {"mcc": 0.089823497395216, "macro_f1": 0.33016759744433183}, {"mcc": 0.17478804375031282, "macro_f1": 0.3677374988846825}, {"mcc": 0.12410858431691343, "macro_f1": 0.3581260892555446}, {"mcc": 0.06647063976687699, "macro_f1": 0.30089831885989704}, {"mcc": 0.0861647145261422, "macro_f1": 0.34313595454966445}]}, "total": {"test_mcc": 10.5302384288587, "test_mcc_se": 2.422480997509129, "test_macro_f1": 34.519733788840576, "test_macro_f1_se": 1.760751895750864}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3003327787021631, "micro_f1": 0.22608340147179068}, {"micro_f1_no_misc": 0.3762820512820513, "micro_f1": 0.27993182786536003}, {"micro_f1_no_misc": 0.4233318603623509, "micro_f1": 0.2723273207215134}, {"micro_f1_no_misc": 0.35149384885764506, "micro_f1": 0.2620778110303548}, {"micro_f1_no_misc": 0.4063280599500417, "micro_f1": 0.26594746716697937}, {"micro_f1_no_misc": 0.38404726735598227, "micro_f1": 0.2758791480931154}, {"micro_f1_no_misc": 0.3209054593874833, "micro_f1": 0.24066029539530842}, {"micro_f1_no_misc": 0.3739495798319328, "micro_f1": 0.2562150055991041}, {"micro_f1_no_misc": 0.35903337169159943, "micro_f1": 0.2351889112452493}, {"micro_f1_no_misc": 0.3685152057245081, "micro_f1": 0.2634146341463415}]}, "total": {"test_micro_f1_no_misc": 36.64219483145757, "test_micro_f1_no_misc_se": 2.264827340686107, "test_micro_f1": 25.77725822735116, "test_micro_f1_se": 1.1235433137983617}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"mcc": 0.17103359376284447, "macro_f1": 0.5280095603040649}, {"mcc": 0.17223689661317332, "macro_f1": 0.5364120758069966}, {"mcc": 0.2462142978918859, "macro_f1": 0.5835897435897436}, {"mcc": 0.24477168603730182, "macro_f1": 0.5398728274692667}, {"mcc": 0.23900303859866837, "macro_f1": 0.5859168044696326}, {"mcc": 0.16909960931518264, "macro_f1": 0.5666251556662516}, {"mcc": 0.19614059127069322, "macro_f1": 0.5723837787504045}, {"mcc": 0.16759388766437985, "macro_f1": 0.57165664904484}, {"mcc": 0.2177702396148213, "macro_f1": 0.5369712921262639}, {"mcc": 0.17011046282598266, "macro_f1": 0.5849609375}]}, "total": {"test_mcc": 19.939743035949338, "test_mcc_se": 2.116715817516503, "test_macro_f1": 56.063988247274644, "test_macro_f1_se": 1.4157058958517625}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"em": 52.98218435321456, "f1": 66.98147522759862}, {"em": 50.62015503875969, "f1": 64.74262432094629}, {"em": 47.21792890262751, "f1": 64.99244778057364}, {"em": 52.10280373831776, "f1": 65.3438273635564}, {"em": 49.034749034749034, "f1": 63.1485331034202}, {"em": 54.20200462606014, "f1": 67.97714170178838}, {"em": 50.18982536066819, "f1": 65.92831870364634}, {"em": 52.676493405740885, "f1": 66.35134546210824}, {"em": 49.1764705882353, "f1": 64.73882512657174}, {"em": 49.68944099378882, "f1": 66.2017483747646}]}, "total": {"test_em": 50.789205604216185, "test_em_se": 1.3353295000183327, "test_f1": 65.64062871649745, "test_f1_se": 0.8391794701543068}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"bertscore": 0.693077197909588, "rouge_l": 0.21351360781146683}, {"bertscore": 0.6825540671707131, "rouge_l": 0.2050116934398628}, {"bertscore": 0.6895608811755665, "rouge_l": 0.21784124457719897}, {"bertscore": 0.6983623866108246, "rouge_l": 0.22262260238182158}, {"bertscore": 0.6840656989515992, "rouge_l": 0.18668270500400475}, {"bertscore": 0.7076790555438492, "rouge_l": 0.225191738306719}, {"bertscore": 0.6778783589979867, "rouge_l": 0.1954055908401876}, {"bertscore": 0.6876710127398837, "rouge_l": 0.20987363741841394}, {"bertscore": 0.6989307102339808, "rouge_l": 0.2231836021339252}, {"bertscore": 0.6904400557104964, "rouge_l": 0.20950202077066282}]}, "total": {"test_bertscore": 69.10219425044488, "test_bertscore_se": 0.548435981717268, "test_rouge_l": 21.088284426842634, "test_rouge_l_se": 0.7762861917029735}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"mcc": 0.27886047922684337, "accuracy": 0.458984375}, {"mcc": 0.26076001954930633, "accuracy": 0.4443359375}, {"mcc": 0.27777073684379117, "accuracy": 0.4560546875}, {"mcc": 0.2565401623395135, "accuracy": 0.439453125}, {"mcc": 0.23670491048212558, "accuracy": 0.4248046875}, {"mcc": 0.2979626529499345, "accuracy": 0.47265625}, {"mcc": 0.2924790028014591, "accuracy": 0.46337890625}, {"mcc": 0.2781534790913672, "accuracy": 0.45654296875}, {"mcc": 0.27068764123095357, "accuracy": 0.44775390625}, {"mcc": 0.2575598822082086, "accuracy": 0.4423828125}]}, "total": {"test_mcc": 27.074789667235024, "test_mcc_se": 1.1348703780615899, "test_accuracy": 45.0634765625, "test_accuracy_se": 0.8492450353256282}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"accuracy": 0.47607421875, "mcc": 0.3160624455291645}, {"accuracy": 0.4609375, "mcc": 0.27978493802019233}, {"accuracy": 0.46484375, "mcc": 0.28629480840429944}, {"accuracy": 0.4716796875, "mcc": 0.30001428432867644}, {"accuracy": 0.4814453125, "mcc": 0.31569920167144816}, {"accuracy": 0.47607421875, "mcc": 0.30174508217484053}, {"accuracy": 0.46484375, "mcc": 0.28982957537561416}, {"accuracy": 0.45556640625, "mcc": 0.27504071666334723}, {"accuracy": 0.4970703125, "mcc": 0.33455972066205725}, {"accuracy": 0.4755859375, "mcc": 0.3056381426834647}]}, "total": {"test_accuracy": 47.2412109375, "test_accuracy_se": 0.7325093263763628, "test_mcc": 30.046689155131045, "test_mcc_se": 1.1418166019007867}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "davidberenstein1957/ultra-feedback-dutch-cleaned-hq-spin-geitje-7b-ultra-sft_iter2", "results": {"raw": {"test": [{"test_speed": 1601.72, "test_speed_short": 235.73}, {"test_speed": 2420.6, "test_speed_short": 417.40000000000003}, {"test_speed": 2113.44, "test_speed_short": 768.3599999999999}, {"test_speed": 2740.34, "test_speed_short": 904.28}, {"test_speed": 3091.68, "test_speed_short": 1063.4399999999998}, {"test_speed": 3279.1, "test_speed_short": 1256.52}, {"test_speed": 3754.0800000000004, "test_speed_short": 1444.1999999999998}, {"test_speed": 3617.22, "test_speed_short": 1595.28}, {"test_speed": 3556.56, "test_speed_short": 1743.2600000000002}, {"test_speed": 3445.64, "test_speed_short": 1894.1999999999998}]}, "total": {"test_speed": 2962.038, "test_speed_se": 446.6691157331378, "test_speed_short": 1132.2669999999998, "test_speed_short_se": 344.7801217410439}}, "num_model_parameters": 7241732096, "max_sequence_length": 8192, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"mcc": 0.17489110765713348, "macro_f1": 0.4559457147408715}, {"mcc": 0.1493714342108084, "macro_f1": 0.41908642446094535}, {"mcc": 0.10915380678477897, "macro_f1": 0.3970860085317917}, {"mcc": 0.15553062559009534, "macro_f1": 0.4171539459398896}, {"mcc": 0.17083601716314836, "macro_f1": 0.4076526160904613}, {"mcc": 0.11294744562257225, "macro_f1": 0.3635635990467545}, {"mcc": 0.16249275671430843, "macro_f1": 0.38816058155680794}, {"mcc": 0.11352295326354792, "macro_f1": 0.3840470036326396}, {"mcc": 0.07504011165589441, "macro_f1": 0.37994743151204885}, {"mcc": 0.10151066717429776, "macro_f1": 0.3788710877311652}]}, "total": {"test_mcc": 13.252969258365855, "test_mcc_se": 2.1201367108180684, "test_macro_f1": 39.91514413243375, "test_macro_f1_se": 1.6552872070171711}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"mcc": 0.1654566155177382, "macro_f1": 0.45039809472296094}, {"mcc": 0.14099652065422918, "macro_f1": 0.4281579284416462}, {"mcc": 0.08673932042463457, "macro_f1": 0.3978929870818833}, {"mcc": 0.16497750555149224, "macro_f1": 0.42821580515044894}, {"mcc": 0.20661385653878692, "macro_f1": 0.43736547629280015}, {"mcc": 0.14383191184172608, "macro_f1": 0.38605819074342307}, {"mcc": 0.1361238795431242, "macro_f1": 0.3872130032831886}, {"mcc": 0.1307971053992041, "macro_f1": 0.40515169512055665}, {"mcc": 0.10296448013349942, "macro_f1": 0.40114790696839414}, {"mcc": 0.09142310029181351, "macro_f1": 0.35550271739130435}]}, "total": {"test_mcc": 13.699242958962484, "test_mcc_se": 2.2915432880089113, "test_macro_f1": 40.771038051966066, "test_macro_f1_se": 1.7653103339286085}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.43282801881860955, "micro_f1": 0.31045834388452775}, {"micro_f1_no_misc": 0.4417935006170301, "micro_f1": 0.34341129492298916}, {"micro_f1_no_misc": 0.4433827042522695, "micro_f1": 0.3467455621301775}, {"micro_f1_no_misc": 0.34552529182879377, "micro_f1": 0.32276119402985076}, {"micro_f1_no_misc": 0.4202483285577842, "micro_f1": 0.2997205994411989}, {"micro_f1_no_misc": 0.3560885608856089, "micro_f1": 0.3466955579631636}, {"micro_f1_no_misc": 0.41666666666666663, "micro_f1": 0.2924281984334204}, {"micro_f1_no_misc": 0.35721107927411655, "micro_f1": 0.3194263363754889}, {"micro_f1_no_misc": 0.41655044165504423, "micro_f1": 0.3277006638503319}, {"micro_f1_no_misc": 0.3265139116202946, "micro_f1": 0.27163198247535597}]}, "total": {"test_micro_f1_no_misc": 39.56808504176218, "test_micro_f1_no_misc_se": 2.741539482171855, "test_micro_f1": 31.80979733506505, "test_micro_f1_se": 1.5473258644987866}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3828085957021489, "micro_f1": 0.27187646233036966}, {"micro_f1_no_misc": 0.40963855421686746, "micro_f1": 0.3222821896684657}, {"micro_f1_no_misc": 0.4013840830449827, "micro_f1": 0.30131121220230134}, {"micro_f1_no_misc": 0.3081252198381991, "micro_f1": 0.29354838709677417}, {"micro_f1_no_misc": 0.39471199244570354, "micro_f1": 0.28770532603285215}, {"micro_f1_no_misc": 0.3321523472099203, "micro_f1": 0.3113975576662144}, {"micro_f1_no_misc": 0.38786843229389145, "micro_f1": 0.27254419333184154}, {"micro_f1_no_misc": 0.3216348289649045, "micro_f1": 0.2778409090909091}, {"micro_f1_no_misc": 0.4104609929078014, "micro_f1": 0.29247764334560755}, {"micro_f1_no_misc": 0.30164670658682635, "micro_f1": 0.24181360201511334}]}, "total": {"test_micro_f1_no_misc": 36.50431753211246, "test_micro_f1_no_misc_se": 2.7185172354378317, "test_micro_f1": 28.72797482780449, "test_micro_f1_se": 1.4135084639742306}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"mcc": 0.1071357524736556, "macro_f1": 0.48813209494324045}, {"mcc": 0.033990700831060885, "macro_f1": 0.47613212909745467}, {"mcc": 0.05790089872057995, "macro_f1": 0.5080942009978869}, {"mcc": 0.12621881537582413, "macro_f1": 0.5467856011308523}, {"mcc": 0.13334940227450945, "macro_f1": 0.5580769986269252}, {"mcc": 0.07842197418969146, "macro_f1": 0.5304983936945746}, {"mcc": 0.10465735343702301, "macro_f1": 0.5332427592290788}, {"mcc": 0.09257583302149465, "macro_f1": 0.44997374787782457}, {"mcc": 0.0815827514807167, "macro_f1": 0.45024547988369495}, {"mcc": 0.11566378207599817, "macro_f1": 0.5575120461606153}]}, "total": {"test_mcc": 9.31497263880554, "test_mcc_se": 1.9201550918523473, "test_macro_f1": 50.98693451642149, "test_macro_f1_se": 2.5874669051304644}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"mcc": 0.08375136329448662, "macro_f1": 0.4413529732678669}, {"mcc": 0.05524563471409866, "macro_f1": 0.4263043640991092}, {"mcc": -0.02104235208749127, "macro_f1": 0.40320927853951777}, {"mcc": 0.06034005527848655, "macro_f1": 0.4496858275790643}, {"mcc": 0.06236004676863409, "macro_f1": 0.45442782661643266}, {"mcc": -0.0013283088454780598, "macro_f1": 0.44441430199121046}, {"mcc": 0.0971515383670183, "macro_f1": 0.4561618214145519}, {"mcc": 0.052611741705993524, "macro_f1": 0.40555160343286706}, {"mcc": 0.04384238776682977, "macro_f1": 0.3927169187128674}, {"mcc": 0.04769447286225972, "macro_f1": 0.445195657306652}]}, "total": {"test_mcc": 4.806265798248378, "test_mcc_se": 2.2004589891763473, "test_macro_f1": 43.1902057296014, "test_macro_f1_se": 1.4490509208011901}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"em": 61.58017041053447, "f1": 71.63990011334803}, {"em": 61.007751937984494, "f1": 71.23140499058486}, {"em": 60.89644513137558, "f1": 71.52975126497833}, {"em": 59.73520249221184, "f1": 69.81261596709638}, {"em": 59.84555984555985, "f1": 71.62980524192365}, {"em": 61.218195836545874, "f1": 70.98425602485929}, {"em": 59.5292331055429, "f1": 72.27073160430682}, {"em": 59.1155934833204, "f1": 69.18106749986761}, {"em": 58.666666666666664, "f1": 69.8909659388648}, {"em": 61.024844720496894, "f1": 72.11334154567228}]}, "total": {"test_em": 60.26196636302389, "test_em_se": 0.6211720730489843, "test_f1": 71.02838401915021, "test_f1_se": 0.6512112529321165}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"em": 61.270333075135554, "f1": 71.39357125456047}, {"em": 61.24031007751938, "f1": 71.77902841951166}, {"em": 60.58732612055641, "f1": 71.99653475012472}, {"em": 60.124610591900314, "f1": 70.44395747302677}, {"em": 60.15444015444015, "f1": 72.02028062067853}, {"em": 62.22050886661527, "f1": 72.2193560730813}, {"em": 59.757023538344725, "f1": 72.5972787267755}, {"em": 60.27928626842514, "f1": 70.4279455679249}, {"em": 58.90196078431372, "f1": 70.34533270880014}, {"em": 61.80124223602485, "f1": 73.00763272723056}]}, "total": {"test_em": 60.63370417132755, "test_em_se": 0.6211306588788033, "test_f1": 71.62309183217145, "test_f1_se": 0.5860782340269844}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"bertscore": 0.6547282645042287, "rouge_l": 0.1836752030760273}, {"bertscore": 0.6532087284867885, "rouge_l": 0.1801832368060824}, {"bertscore": 0.6570256479099044, "rouge_l": 0.18027128291393213}, {"bertscore": 0.672940323522198, "rouge_l": 0.20069115663860257}, {"bertscore": 0.6412307269347366, "rouge_l": 0.15973338337159312}, {"bertscore": 0.6578760721749859, "rouge_l": 0.1838214901890423}, {"bertscore": 0.6488708046090323, "rouge_l": 0.1659335561110982}, {"bertscore": 0.6470002171263332, "rouge_l": 0.1709654987133084}, {"bertscore": 0.6489156382303918, "rouge_l": 0.17206325853256516}, {"bertscore": 0.6555914714408573, "rouge_l": 0.16568951547422808}]}, "total": {"test_bertscore": 65.37387894939457, "test_bertscore_se": 0.5267876388075372, "test_rouge_l": 17.630275818264803, "test_rouge_l_se": 0.7384836532552508}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"bertscore": 0.6596110733953537, "rouge_l": 0.18573096176998927}, {"bertscore": 0.6584489361121086, "rouge_l": 0.18371671893192965}, {"bertscore": 0.6593526023207232, "rouge_l": 0.18094493689650507}, {"bertscore": 0.6767200423637405, "rouge_l": 0.1999753267755175}, {"bertscore": 0.654124744964065, "rouge_l": 0.1667975606936409}, {"bertscore": 0.6663553497346584, "rouge_l": 0.18865314840350345}, {"bertscore": 0.6539007825631415, "rouge_l": 0.17274759801202}, {"bertscore": 0.6570177437533857, "rouge_l": 0.1735177349278476}, {"bertscore": 0.6585980637610191, "rouge_l": 0.17875776294059526}, {"bertscore": 0.6570519689994399, "rouge_l": 0.1659544347901047}]}, "total": {"test_bertscore": 66.01181307967636, "test_bertscore_se": 0.4204016810303004, "test_rouge_l": 17.967961841416532, "test_rouge_l_se": 0.6494532930068725}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"mcc": 0.25973900591654564, "accuracy": 0.44287109375}, {"mcc": 0.25607464337545127, "accuracy": 0.44189453125}, {"mcc": 0.26399794417031486, "accuracy": 0.44677734375}, {"mcc": 0.2421874954880099, "accuracy": 0.43115234375}, {"mcc": 0.2631794761810005, "accuracy": 0.4462890625}, {"mcc": 0.2610466054780947, "accuracy": 0.4443359375}, {"mcc": 0.3191880511040354, "accuracy": 0.48876953125}, {"mcc": 0.2578455855974695, "accuracy": 0.443359375}, {"mcc": 0.2547713847017065, "accuracy": 0.43798828125}, {"mcc": 0.2847466554532462, "accuracy": 0.46240234375}]}, "total": {"test_mcc": 26.62776847465874, "test_mcc_se": 1.3250687040426687, "test_accuracy": 44.8583984375, "test_accuracy_se": 1.0022259436926175}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"mcc": 0.27570894159375015, "accuracy": 0.45751953125}, {"mcc": 0.2637801189053716, "accuracy": 0.447265625}, {"mcc": 0.27357398558493795, "accuracy": 0.45654296875}, {"mcc": 0.2434987740715526, "accuracy": 0.43310546875}, {"mcc": 0.2620776238009569, "accuracy": 0.4443359375}, {"mcc": 0.2865488563960343, "accuracy": 0.46240234375}, {"mcc": 0.2918421326209906, "accuracy": 0.46533203125}, {"mcc": 0.251324779980394, "accuracy": 0.4375}, {"mcc": 0.2499066005327495, "accuracy": 0.43359375}, {"mcc": 0.27814212568587626, "accuracy": 0.45654296875}]}, "total": {"test_mcc": 26.76403939172614, "test_mcc_se": 1.004349951732635, "test_accuracy": 44.94140625, "test_accuracy_se": 0.7385900685134079}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"accuracy": 0.33349609375, "mcc": 0.13478515122105533}, {"accuracy": 0.3115234375, "mcc": 0.1018409256443926}, {"accuracy": 0.33642578125, "mcc": 0.12967987176862364}, {"accuracy": 0.3046875, "mcc": 0.08005363706427299}, {"accuracy": 0.33056640625, "mcc": 0.12636584969514797}, {"accuracy": 0.31298828125, "mcc": 0.09692305269579281}, {"accuracy": 0.32177734375, "mcc": 0.0982460663234001}, {"accuracy": 0.3525390625, "mcc": 0.13765250109145613}, {"accuracy": 0.34912109375, "mcc": 0.1392851008419882}, {"accuracy": 0.3173828125, "mcc": 0.11723349673343308}]}, "total": {"test_accuracy": 32.705078125, "test_accuracy_se": 0.9991495882154364, "test_mcc": 11.62065653079563, "test_mcc_se": 1.2788231344711753}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "BramVanroy/fietje-2b-chat", "results": {"raw": {"test": [{"test_speed": 1643.6499999999999, "test_speed_short": 210.72}, {"test_speed": 2700.1499999999996, "test_speed_short": 396.90000000000003}, {"test_speed": 3895.0600000000004, "test_speed_short": 763.28}, {"test_speed": 3844.08, "test_speed_short": 924.1200000000001}, {"test_speed": 4780.62, "test_speed_short": 1090.05}, {"test_speed": 5435.11, "test_speed_short": 1423.8600000000001}, {"test_speed": 6216.06, "test_speed_short": 1597.44}, {"test_speed": 6086.849999999999, "test_speed_short": 1672.05}, {"test_speed": 6127.01, "test_speed_short": 1765.92}, {"test_speed": 6309.0, "test_speed_short": 2006.0000000000002}]}, "total": {"test_speed": 4703.759, "test_speed_se": 1014.5611876993948, "test_speed_short": 1185.034, "test_speed_short_se": 375.3758124577493}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"accuracy": 0.3603515625, "mcc": 0.15650325486079494}, {"accuracy": 0.32763671875, "mcc": 0.11652719029186816}, {"accuracy": 0.34716796875, "mcc": 0.1367087988835828}, {"accuracy": 0.314453125, "mcc": 0.09607285367655319}, {"accuracy": 0.3564453125, "mcc": 0.14788251178307096}, {"accuracy": 0.33251953125, "mcc": 0.11538055046274646}, {"accuracy": 0.3154296875, "mcc": 0.0880961174802214}, {"accuracy": 0.37109375, "mcc": 0.1608669220363457}, {"accuracy": 0.35546875, "mcc": 0.14502707552684113}, {"accuracy": 0.3310546875, "mcc": 0.1295911454532897}]}, "total": {"test_accuracy": 34.1162109375, "test_accuracy_se": 1.2176882926829866, "test_mcc": 12.926564204553143, "test_mcc_se": 1.5381526559367207}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "BramVanroy/fietje-2b-instruct", "results": {"raw": {"test": [{"test_speed": 1601.76, "test_speed_short": 204.16}, {"test_speed": 2653.62, "test_speed_short": 392.55}, {"test_speed": 3846.53, "test_speed_short": 753.13}, {"test_speed": 3818.79, "test_speed_short": 910.8000000000001}, {"test_speed": 4801.68, "test_speed_short": 1081.45}, {"test_speed": 5439.32, "test_speed_short": 1413.03}, {"test_speed": 6245.52, "test_speed_short": 1589.76}, {"test_speed": 6120.51, "test_speed_short": 1672.05}, {"test_speed": 6183.8, "test_speed_short": 1846.2600000000002}, {"test_speed": 6393.12, "test_speed_short": 2011.95}]}, "total": {"test_speed": 4710.465, "test_speed_se": 1039.648997376535, "test_speed_short": 1187.5140000000001, "test_speed_short_se": 382.9970161047127}}, "num_model_parameters": 2775059577, "max_sequence_length": 2048, "vocabulary_size": 50297, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"mcc": 0.1318319831347061, "macro_f1": 0.41090040855873955}, {"mcc": 0.1065947989802212, "macro_f1": 0.4117794157241909}, {"mcc": 0.11337700515832627, "macro_f1": 0.432333126756082}, {"mcc": 0.15686076491495576, "macro_f1": 0.39064812292615475}, {"mcc": 0.1587794416541121, "macro_f1": 0.44865392608528953}, {"mcc": 0.1541834724290231, "macro_f1": 0.4169547739386459}, {"mcc": 0.16824203814707525, "macro_f1": 0.44028896227539854}, {"mcc": 0.1174924343791787, "macro_f1": 0.40334195401835243}, {"mcc": 0.14237842320133995, "macro_f1": 0.40776056866233984}, {"mcc": 0.08892795438806649, "macro_f1": 0.34070504173521554}]}, "total": {"test_mcc": 13.38668316387005, "test_mcc_se": 1.6361517270120356, "test_macro_f1": 41.0336630068041, "test_macro_f1_se": 1.869396578215213}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.27189163038219644, "micro_f1": 0.2197750702905342}, {"micro_f1_no_misc": 0.3186119873817035, "micro_f1": 0.2988084326306141}, {"micro_f1_no_misc": 0.3762586115527292, "micro_f1": 0.31403901503038056}, {"micro_f1_no_misc": 0.28798842257597684, "micro_f1": 0.2911353032659409}, {"micro_f1_no_misc": 0.37566844919786097, "micro_f1": 0.2672340425531915}, {"micro_f1_no_misc": 0.4057142857142857, "micro_f1": 0.35313948588284866}, {"micro_f1_no_misc": 0.40176879343019584, "micro_f1": 0.3038449515473586}, {"micro_f1_no_misc": 0.2691511387163561, "micro_f1": 0.24547008547008548}, {"micro_f1_no_misc": 0.3872035300606729, "micro_f1": 0.31288094270621697}, {"micro_f1_no_misc": 0.29755671902268765, "micro_f1": 0.2567607726597325}]}, "total": {"test_micro_f1_no_misc": 33.91813568034666, "test_micro_f1_no_misc_se": 3.430127079020133, "test_micro_f1": 28.63088102036903, "test_micro_f1_se": 2.4234974272701635}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"mcc": 0.08672090025183518, "macro_f1": 0.4116482679862961}, {"mcc": 0.05086256175190577, "macro_f1": 0.39815532891431693}, {"mcc": 0.03057577011288058, "macro_f1": 0.3549508435489744}, {"mcc": 0.07981587962212158, "macro_f1": 0.40300994587776123}, {"mcc": 0.07869027645451038, "macro_f1": 0.45820741862308256}, {"mcc": -0.02990681688115148, "macro_f1": 0.40830924611395114}, {"mcc": 0.09684347618708675, "macro_f1": 0.40026621705563936}, {"mcc": 0.0847726631887017, "macro_f1": 0.3864673887650578}, {"mcc": 0.11256453023278074, "macro_f1": 0.41315694334721426}, {"mcc": 0.08430937276089524, "macro_f1": 0.4940817153624715}]}, "total": {"test_mcc": 6.752486136815664, "test_mcc_se": 2.551129259590871, "test_macro_f1": 41.282533155947654, "test_macro_f1_se": 2.3734534330078594}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "squad-nl", "task": "question-answering", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"em": 60.72811773818745, "f1": 70.36958541427433}, {"em": 60.310077519379846, "f1": 70.46938937486605}, {"em": 58.809891808346215, "f1": 69.83095947300819}, {"em": 58.25545171339564, "f1": 67.90203600452742}, {"em": 58.996138996138995, "f1": 70.19947640690663}, {"em": 59.21356977640709, "f1": 69.53446818203774}, {"em": 56.87167805618831, "f1": 69.76701174697344}, {"em": 57.02094647013188, "f1": 67.31260772937542}, {"em": 55.529411764705884, "f1": 67.72881068219596}, {"em": 59.93788819875776, "f1": 70.814380425821}]}, "total": {"test_em": 58.5673172041639, "test_em_se": 1.0296519732090643, "test_f1": 69.3928725439986, "test_f1_se": 0.7855038855025985}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"bertscore": 0.607362448994536, "rouge_l": 0.16725284538527724}, {"bertscore": 0.6246193337574368, "rouge_l": 0.16815883530770076}, {"bertscore": 0.626352557723294, "rouge_l": 0.1705474270910149}, {"bertscore": 0.6390205500792945, "rouge_l": 0.18229977424965524}, {"bertscore": 0.6126786644308595, "rouge_l": 0.15520543721669133}, {"bertscore": 0.6176775256826659, "rouge_l": 0.17208166907033778}, {"bertscore": 0.6108438945375383, "rouge_l": 0.15361090822446166}, {"bertscore": 0.5940791981338407, "rouge_l": 0.14516459660995287}, {"bertscore": 0.5930090312904213, "rouge_l": 0.14103125589307736}, {"bertscore": 0.6238230813323753, "rouge_l": 0.15717633705926365}]}, "total": {"test_bertscore": 61.494662859622615, "test_bertscore_se": 0.8971537198509846, "test_rouge_l": 16.125290861074326, "test_rouge_l_se": 0.8027367332658722}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"mcc": 0.23988775432449935, "accuracy": 0.42919921875}, {"mcc": 0.2514165091867351, "accuracy": 0.42578125}, {"mcc": 0.25360621086072305, "accuracy": 0.4384765625}, {"mcc": 0.24558077736139935, "accuracy": 0.4287109375}, {"mcc": 0.22821453805049605, "accuracy": 0.4091796875}, {"mcc": 0.2481329955409325, "accuracy": 0.42333984375}, {"mcc": 0.26277450859837365, "accuracy": 0.43896484375}, {"mcc": 0.2522912335208919, "accuracy": 0.4228515625}, {"mcc": 0.23048083626327331, "accuracy": 0.419921875}, {"mcc": 0.25384429161318195, "accuracy": 0.43115234375}]}, "total": {"test_mcc": 24.66229655320506, "test_mcc_se": 0.6742516764504672, "test_accuracy": 42.67578125, "test_accuracy_se": 0.5462442108631228}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"accuracy": 0.29248046875, "mcc": 0.07447233272521797}, {"accuracy": 0.2607421875, "mcc": 0.040604248249629354}, {"accuracy": 0.306640625, "mcc": 0.1049056521455218}, {"accuracy": 0.25732421875, "mcc": 0.021186776056937574}, {"accuracy": 0.26513671875, "mcc": 0.03833268565234727}, {"accuracy": 0.25048828125, "mcc": 0.009810667999378938}, {"accuracy": 0.27392578125, "mcc": 0.04029386390391874}, {"accuracy": 0.287109375, "mcc": 0.06554781605953726}, {"accuracy": 0.2470703125, "mcc": 0.020341467836855854}, {"accuracy": 0.27783203125, "mcc": 0.06289643453294055}]}, "total": {"test_accuracy": 27.187499999999996, "test_accuracy_se": 1.197356208056011, "test_mcc": 4.783919451622854, "test_mcc_se": 1.8033083855372847}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "BramVanroy/fietje-2b", "results": {"raw": {"test": [{"test_speed": 1679.86, "test_speed_short": 213.68}, {"test_speed": 2741.04, "test_speed_short": 402.6}, {"test_speed": 3947.8100000000004, "test_speed_short": 774.88}, {"test_speed": 3894.66, "test_speed_short": 936.36}, {"test_speed": 4885.92, "test_speed_short": 1112.41}, {"test_speed": 5536.150000000001, "test_speed_short": 1455.21}, {"test_speed": 6368.27, "test_speed_short": 1629.44}, {"test_speed": 6227.099999999999, "test_speed_short": 1717.49}, {"test_speed": 6272.139999999999, "test_speed_short": 1893.8400000000001}, {"test_speed": 6484.25, "test_speed_short": 2059.55}]}, "total": {"test_speed": 4803.719999999999, "test_speed_se": 1044.7124484421565, "test_speed_short": 1219.5459999999998, "test_speed_short_se": 392.0188057075321}}, "num_model_parameters": 2779683840, "max_sequence_length": 2048, "vocabulary_size": 51200, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.6.1"}