forked from GEM-benchmark/NL-Augmenter
evaluate_question_answering.py
import numpy as np
from datasets import load_dataset
from transformers import pipeline

from dataset import KeyValueDataset
from tasks.TaskTypes import TaskType


def evaluate(
    operation,
    evaluate_filter,
    model_name,
    dataset_name,
    split="validation[:20%]",
):
    # (1) Load the model (default: a tiny BERT fine-tuned on SQuAD v2).
    if model_name is None:
        model_name = "mrm8488/bert-tiny-5-finetuned-squadv2"
    # (2) Load the evaluation dataset (default: SQuAD).
    if dataset_name is None:
        dataset_name = "squad"
    print(
        f"Loading <{dataset_name}> dataset to evaluate <{model_name}> model."
    )
    hf_dataset = load_dataset(dataset_name, split=split)
    qa_pipeline = pipeline(
        "question-answering", model=model_name, tokenizer=model_name
    )
    dataset = KeyValueDataset.from_huggingface(
        hf_dataset,
        TaskType.QUESTION_ANSWERING,
        ["context", "question", "answers"],
    )
    print(
        f"Here is the performance of the model {model_name} on the {split} split of the {dataset_name} dataset"
    )
    if evaluate_filter:
        # (3) Apply the filter and evaluate on the examples it retains.
        filtered_dataset = dataset.apply_filter(operation)
        print("Starting evaluation on the filtered dataset.")
        performance = evaluate_on_dataset(filtered_dataset, qa_pipeline)
    else:
        # (3) Evaluate on the original dataset.
        print("Starting evaluation on the original dataset.")
        performance = evaluate_on_dataset(dataset, qa_pipeline)
        # (4) Apply the perturbation and evaluate on the transformed dataset.
        print("Starting evaluation on the transformed dataset.")
        pt_dataset = dataset.apply_transformation(operation)
        pt_performance = evaluate_on_dataset(pt_dataset, qa_pipeline)
        performance["pt_accuracy"] = pt_performance["accuracy"]
    performance["model_name"] = model_name
    performance["split"] = split
    performance["dataset_name"] = dataset_name
    return performance


def evaluate_on_dataset(dataset, qa_pipeline):
    accuracy = 0
    total = 0
    for example in dataset:
        context, question, answers = example
        prediction = qa_pipeline(
            {"context": context, "question": question}, truncation=True
        )["answer"]
        # Exact-match scoring: count a hit if the predicted span is among the gold answers.
        if prediction in answers:
            accuracy += 1
        total += 1
    print(f"The number of examples = {total}")
    print(f"The accuracy of exact matching = {100 * accuracy / total}")
    return {
        "accuracy": np.round(100 * accuracy / total, 1),
        "no_of_examples": total,
    }
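

# Illustrative usage sketch (an assumption, not part of the original file):
# `IdentityTransformation` is a hypothetical placeholder for an NL-Augmenter
# question-answering operation. It assumes `apply_transformation` calls
# `operation.generate(context, question, answers)` and expects a list of
# (context, question, answers) tuples back.
if __name__ == "__main__":
    class IdentityTransformation:
        def generate(self, context, question, answers):
            # Return the example unchanged, just to exercise the evaluation loop.
            return [(context, question, answers)]

    results = evaluate(
        operation=IdentityTransformation(),
        evaluate_filter=False,
        model_name=None,    # falls back to "mrm8488/bert-tiny-5-finetuned-squadv2"
        dataset_name=None,  # falls back to "squad"
        split="validation[:1%]",
    )
    print(results)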