# evaluate_text_generation.py
import numpy as np
from datasets import load_dataset
from sacrebleu import corpus_bleu
from transformers import pipeline

from dataset import KeyValueDataset
from tasks.TaskTypes import TaskType


def sacrebleu_score(hypotheses, references):
    return corpus_bleu(hypotheses, [references]).score
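
# sacrebleu's corpus_bleu takes a list of hypothesis strings and a list of
# reference streams; wrapping `references` in an outer list treats the gold
# summaries as a single reference stream aligned with the hypotheses.
# Illustrative call (strings are hypothetical, shown only for shape):
#   sacrebleu_score(["a cat sat on the mat"], ["the cat sat on the mat"])  # -> BLEU score as a float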


def evaluate(
    operation, evaluate_filter, model_name, dataset_name, split="test[:20%]"
):
    # load model
    if model_name is None:
        model_name = "sshleifer/distilbart-xsum-12-6"
    # load test set
    if dataset_name is None:
        dataset_name = "xsum"
    print(
        f"Loading <{dataset_name}> dataset to evaluate <{model_name}> model."
    )
    # "3.0.0" is the cnn_dailymail config name; xsum (the default) and other
    # datasets are loaded without an explicit config.
    hf_dataset = (
        load_dataset(dataset_name, "3.0.0", split=split)
        if dataset_name == "cnn_dailymail"
        else load_dataset(dataset_name, split=split)
    )
    dataset = KeyValueDataset.from_huggingface(
        hf_dataset, TaskType.TEXT_TO_TEXT_GENERATION, ["document", "summary"]
    )
    summarization_pipeline = pipeline(
        "summarization", model=model_name, tokenizer=model_name
    )
    print(
        f"Here is the performance of the model {model_name} on the {split} split of the {dataset_name} dataset"
    )
    if evaluate_filter:
        performance = filter_performance(
            dataset, summarization_pipeline, filter=operation
        )
    else:
        performance = transformation_performance(
            dataset, summarization_pipeline, transformation=operation
        )
    performance["model_name"] = model_name
    performance["split"] = split
    performance["dataset_name"] = dataset_name
    return performance
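
# Example of the dictionary `evaluate` returns when run with a transformation
# (BLEU numbers taken from the inline comments further down, rounded to one decimal):
#   {"bleu": 16.0, "pt_bleu": 11.8,
#    "model_name": "sshleifer/distilbart-xsum-12-6",
#    "split": "test[:20%]", "dataset_name": "xsum"}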


def filter_performance(dataset, summarization_pipeline, filter):
    print("Here is the performance of the model on the filtered set")
    filtered_dataset = dataset.apply_filter(filter, subfields=["document"])
    return performance_on_dataset(filtered_dataset, summarization_pipeline)
"""
Evaluates performance on the original set
and on the perturbed set.
"""
def transformation_performance(
dataset, summarization_pipeline, transformation
):
performance = performance_on_dataset(
dataset, summarization_pipeline
) # 15.989 BLEU
pt_dataset = dataset.apply_transformation(
transformation, subfields=["document"]
)
print("Here is the performance of the model on the transformed set")
pt_performance = performance_on_dataset(
pt_dataset, summarization_pipeline
) # 11.830 BLEU
return {"bleu": performance["bleu"], "pt_bleu": pt_performance["bleu"]}


def performance_on_dataset(dataset, summarization_pipeline):
    references = []
    raw_hypotheses = []
    print(f"Length of Evaluation dataset is {len(dataset)}")
    for example in dataset:
        article, gold_summary = example
        # Approximate max length to keep the generated summary close to the
        # length of the gold summary.
        max_len = len(gold_summary.split(" ")) + 10
        predicted_summary = summarization_pipeline(
            article, truncation=True, max_length=max_len
        )[0]["summary_text"]
        references.append(gold_summary)
        raw_hypotheses.append(predicted_summary)
    predicted_summary_score = sacrebleu_score(
        raw_hypotheses, references
    )  # 15.989 BLEU
    print(f"Predicted BLEU score = {predicted_summary_score}")
    return {
        "bleu": np.round(predicted_summary_score, 1),
    }
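

# A minimal usage sketch, assuming this module is run directly from the repository
# root. The transformation import path below is an assumption for illustration;
# any NL-Augmenter operation that perturbs the "document" field would work here.
if __name__ == "__main__":
    from transformations.butter_fingers_perturbation.transformation import (
        ButterFingersPerturbation,  # hypothetical choice of transformation
    )

    results = evaluate(
        operation=ButterFingersPerturbation(),
        evaluate_filter=False,
        model_name="sshleifer/distilbart-xsum-12-6",
        dataset_name="xsum",
        split="test[:20%]",
    )
    print(results)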