# leaderboard_wrapper.py (forked from GEM-benchmark/NL-Augmenter)
import sys

# Make sibling packages importable before the repo-local imports below.
sys.path.append("..")
sys.path.append("../..")

import pandas as pd

from evaluation.evaluation_engine import execute_model
from tasks.TaskTypes import TaskType
from TestRunner import OperationRuns, get_implementation
"""This is a dict for default models to be included in the leaderboard.
Each entry is a combination of (MODEL_NAME, DATA_NAME) used in
Huggingface: https://huggingface.co/
"""
DEFAULT_LEADERBOARD_MODELS = {
    "QUESTION_ANSWERING": [
        ("deepset/roberta-base-squad2", "squad"),
        ("bert-large-uncased-whole-word-masking-finetuned-squad", "squad"),
        # ("distilbert-base-cased-distilled-squad", "squad")
    ],
    "TEXT_CLASSIFICATION": [
        # sentiment analysis
        ("textattack/roberta-base-SST-2", "sst2"),
        ("textattack/bert-base-uncased-QQP", "qqp"),
        ("roberta-large-mnli", "multi_nli"),
        ("textattack/roberta-base-imdb", "imdb"),
    ],
    "TEXT_TAGGING": [],
    "DIALOGUE_ACT_TO_TEXT": [],
    "TABLE_TO_TEXT": [],
    "RDF_TO_TEXT": [],
    "RDF_TO_RDF": [],
    "QUESTION_GENERATION": [],
    "AMR_TO_TEXT": [],
    "E2E_TASK": [],
}
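
# Illustrative only: to benchmark an additional model, append a
# (MODEL_NAME, DATASET_NAME) pair under the task it was fine-tuned for, e.g.
#
#   DEFAULT_LEADERBOARD_MODELS["TEXT_CLASSIFICATION"].append(
#       ("textattack/albert-base-v2-SST-2", "sst2")  # hypothetical entry
#   )
#
# The model name should be a Hugging Face model id and the dataset name one
# that evaluation.evaluation_engine.execute_model knows how to load.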


def create_leaderboard_for_task(
    task_type, trans_names_to_run=None, percentage_of_examples=20
):
    """Given a task type, run a list of operations and return the
    leaderboard results as a dict keyed by transformation name.

    Args:
        task_type (TaskType): Task as specified in tasks.TaskTypes.
        trans_names_to_run (List[str], optional):
            Restrict the run to a subset of transformations by passing
            a list of transformation names. Defaults to None (run all).
        percentage_of_examples (int, optional):
            The percentage of examples to perturb.
            Defaults to 20.

    Raises:
        ValueError: If task_type is not a valid TaskType.
    """
    if task_type not in TaskType:
        # TODO: this might be more useful somewhere else.
        raise ValueError(f"{task_type} does not exist.")
    task_name = TaskType(task_type).name
    all_trans = list(OperationRuns.get_all_operations_for_task(task_type))
    all_trans_names = {t.name(): i for i, t in enumerate(all_trans)}
    transformations = []
    if trans_names_to_run is not None:
        for name in trans_names_to_run:
            if name in all_trans_names:
                transformations.append(all_trans[all_trans_names[name]])
            else:
                print(f"WARNING: {name} is not supported.")
    else:
        transformations = all_trans
    # filtered transformations
    print(
        f"""
        Creating leaderboard for task: [{task_name}].
        Transformations being run:
        \t{", ".join([t.name() for t in transformations])}
        """
    )
    result_dict = {
        t.name(): {"Transformation": t.name()} for t in transformations
    }
    for model_name, dataset_name in DEFAULT_LEADERBOARD_MODELS[task_name]:
        # TODO: should we try to allow passing in models, rather than model names?
        # in this leaderboard case the default implementation will cause unnecessary
        # multiple inputs.
        print(f"---- Evaluating {model_name} on {dataset_name} -----")
        for trans in transformations:
            print(f"| Transformation: {trans.name()}")
            try:
                result = execute_model(
                    implementation=get_implementation(trans.__name__),
                    task_type=task_name,
                    model_name=model_name,
                    dataset=dataset_name,
                    percentage_of_examples=percentage_of_examples,
                )
                # Pick whichever supported metric the evaluation returned.
                if "bleu" in result:
                    key, pt_key = "bleu", "pt_bleu"
                elif "accuracy" in result:
                    key, pt_key = "accuracy", "pt_accuracy"
                else:
                    print(f"\t No supported metric in result for {trans.name()}.")
                    continue
                # Report "original -> perturbed (delta)" in this model's column.
                delta = result[pt_key] - result[key]
                result_dict[trans.name()][model_name.split("/")[-1]] = (
                    f"{result[key]}->{result[pt_key]} ({delta})"
                )
            except Exception as e:
                print(f"\t Error on {trans.name()}: {e}")
    df_result = pd.DataFrame(list(result_dict.values()))
    print("Finished! The leaderboard:")
    print(df_result.to_markdown(index=False))
    filename = f"leaderboard_{task_name}.csv"
    df_result.to_csv(filename)
    print(f"Saved the result to {filename}")
    return result_dict
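
# Example usage (illustrative sketch): build the leaderboard for a single task
# with a restricted set of transformations. "ButterFingersPerturbation" is a
# placeholder name; substitute any name returned by
# OperationRuns.get_all_operations_for_task(task_type).
#
#   results = create_leaderboard_for_task(
#       TaskType.TEXT_CLASSIFICATION,
#       trans_names_to_run=["ButterFingersPerturbation"],
#       percentage_of_examples=5,
#   )
#   print(results["ButterFingersPerturbation"])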


if __name__ == "__main__":
    for task_type in TaskType:
        create_leaderboard_for_task(task_type)