Skip to content

Commit

Permalink
feat(experiments): add manual evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
AmitMY committed May 21, 2019
1 parent 2b7fd01 commit 225e1d9
Show file tree
Hide file tree
Showing 15 changed files with 3,700 additions and 92 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.idea/
libs/
!libs/__init__.py
cache/
20 changes: 19 additions & 1 deletion data/WebNLG/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,19 @@
from utils.dbpedia import get_dbpedia_entity, pronouns
from utils.relex import RepresentsInt

# Hand-picked item ids from the test corpus selected for manual evaluation.
# NOTE(review): these appear to be 1-based ids — membership is checked as
# `i + 1 in FOR_MANUAL_EVAL` against the 0-based entry index; confirm.
FOR_MANUAL_EVAL = {18, 27, 37, 40, 41, 42, 55, 66, 69, 87, 90, 97, 101, 119, 130, 131, 133, 135, 142, 143, 144, 149,
                   150, 155, 169, 184, 188, 202, 209, 213, 223, 224, 225, 235, 239, 243, 257, 262, 274, 294, 301, 305,
                   310, 311, 325, 330, 334, 337, 355, 356, 359, 362, 366, 369, 373, 374, 375, 376, 383, 396, 400, 402,
                   403, 414, 419, 424, 438, 449, 451, 471, 472, 476, 483, 502, 511, 513, 518, 525, 536, 537, 538, 561,
                   569, 578, 581, 584, 585, 586, 591, 593, 602, 603, 619, 621, 623, 624, 632, 633, 648, 666, 669, 672,
                   691, 695, 696, 700, 701, 702, 706, 707, 717, 724, 729, 730, 734, 737, 740, 762, 768, 782, 786, 788,
                   793, 797, 805, 820, 825, 826, 827, 828, 833, 835, 836, 837, 840, 842, 857, 869, 871, 873, 876, 881,
                   889, 891, 899, 908, 913, 916, 993, 1010, 1020, 1038, 1042, 1075, 1080, 1091, 1107, 1131, 1139, 1173,
                   1175, 1181, 1183, 1205, 1208, 1224, 1261, 1265, 1276, 1288, 1298, 1309, 1325, 1329, 1341, 1345, 1363,
                   1368, 1393, 1399, 1405, 1436, 1440, 1445, 1463, 1465, 1466, 1504, 1505, 1523, 1530, 1537, 1542, 1577,
                   1579, 1582, 1606, 1613, 1614, 1620, 1639, 1648, 1668, 1673, 1692, 1704, 1721, 1733, 1752, 1755, 1763,
                   1772, 1774, 1776, 1782, 1784, 1794, 1796, 1807, 1810, 1852, 1859, 1861}


class RDFFileReader:
def __init__(self, file_name):
Expand All @@ -27,7 +40,12 @@ def __init__(self, file_name):
sentences = list(self.extract_sentences(entry["lex"]))

for s in sentences:
self.data.append(Datum(rdfs=triplets, text=s, info={"seen": not is_test_file or i <= 970}))
info = {
"id": i,
"seen": not is_test_file or i <= 970,
"manual": is_test_file and i + 1 in FOR_MANUAL_EVAL and i <= 970
}
self.data.append(Datum(rdfs=triplets, text=s, info=info))

def extract_sentences(self, lex):
sentences = lex
Expand Down
53 changes: 47 additions & 6 deletions data/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from multiprocessing.pool import Pool
from typing import List, Tuple, Dict, Callable

import numpy as np
from tqdm import tqdm
import time

Expand Down Expand Up @@ -68,6 +69,10 @@ def set_hyp(self, hyp: str):

def set_plan(self, plan: str):
    """Attach *plan* to this datum and count the assignment (fluent API).

    After the first call `plan_changes` is 2, so `plan_changes - 1` gives
    the number of plan assignments made on this datum.
    """
    self.plan = plan

    if not hasattr(self, "plan_changes"):  # TODO remove after EMNLP
        # Backfill the counter for data created before this field existed.
        self.plan_changes = 1

    self.plan_changes += 1
    return self

Expand Down Expand Up @@ -217,7 +222,13 @@ def create_plans(self, planner):
self.timing[g_size].append(time.time() - start)

graph_plan = {g.unique_key(): p for g, p in zip(unique_graphs, plans)}
self.data = [d.set_plan(graph_plan[d.graph.unique_key()]) for d in self.data]
for d in self.data:
plans = graph_plan[d.graph.unique_key()]
if isinstance(plans, list):
d.set_plan(plans[0])
d.set_plans(plans[1:])
else:
d.set_plan(plans)
return self

def tokenize_plans(self):
Expand Down Expand Up @@ -250,7 +261,7 @@ def translate_plans(self, model: Model, planner, opts=None):

for d, p, t in zip(data, plans, translations):
is_covered_ent, is_covered_order = self.single_coverage(p, t)
if (not planner.re_plan) or is_covered_order:
if is_covered_order:
d.set_hyp(t)

graph_key = d.graph.unique_key()
Expand All @@ -261,10 +272,17 @@ def translate_plans(self, model: Model, planner, opts=None):
if len(data) == 0:
break

unique_graphs = {d.graph.unique_key(): d.graph for d in data}
graph_plans = {k: planner.plan_random(g, 1)[0] for k, g in unique_graphs.items()}
for d in data:
d.set_plan(graph_plans[d.graph.unique_key()])
if planner.re_plan == "PREMADE":
for d in data:
plans = d.plans
if len(plans) > 0:
d.set_plans(plans[1:])
d.set_plan(plans[0])
else:
unique_graphs = {d.graph.unique_key(): d.graph for d in data}
graph_plans = {k: planner.plan_random(g, 1)[0] for k, g in unique_graphs.items()}
for d in data:
d.set_plan(graph_plans[d.graph.unique_key()])

self.coverage()

Expand Down Expand Up @@ -346,5 +364,28 @@ def coverage(self):

return coverage

def retries(self):
    """Average plan re-assignments per unique plan, split into seen/unseen.

    Data lacking a `plan_changes` counter contribute a single retry.
    Prints and returns {"seen": avg, "unseen": avg}.
    """
    buckets = {"seen": {}, "unseen": {}}
    for datum in self.data:
        bucket = "seen" if datum.info["seen"] else "unseen"
        # getattr default 2 makes the fallback value 2 - 1 == 1,
        # matching the behavior for data without a recorded counter.
        buckets[bucket][datum.plan] = getattr(datum, "plan_changes", 2) - 1

    averages = {name: np.average(list(counts.values()))
                for name, counts in buckets.items()}
    print("sums", averages)
    return averages

def for_manual_evaluation(self):
    """Collect one entry per unique graph that is flagged for manual evaluation.

    Returns a list of dicts, one per unique graph key, each holding the
    datum id, the hypothesis sentence, the graph's RDF triplets (with a
    trailing None slot for the annotator's verdict), and a hallucination
    counter initialised to 0.
    """
    graphs = {}
    for d in self.data:
        if d.info.get("manual"):
            graphs[d.graph.unique_key()] = {
                # BUG FIX: d.info is a dict, so hasattr(d.info, "id") was
                # always False and "id" was always None; use dict.get.
                "id": d.info.get("id"),
                "sen": d.hyp,
                # Renamed triplet vars so the outer loop's `d` is not shadowed.
                "rdf": [(s, p, o, None) for s, p, o in d.graph.as_rdf()],
                "hal": 0
            }

    return list(graphs.values())

def describe_entities(self):
    """No-op pipeline stage: returns the reader itself, unchanged (fluent)."""
    return self
99 changes: 62 additions & 37 deletions experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@
lambda f, x: x["translate"].copy().post_process(x[x["reg-name"] + "-reg"]))
PostProcessPipeline.enqueue("ents-reg-map", "Ents REG map",
lambda f, x: f["post-process"].ents_reg_map
if hasattr(f["post-process"], "ents_reg_map") else {})
if hasattr(f["post-process"], "ents_reg_map") else {})
PostProcessPipeline.enqueue("for-manual-eval", "Build manual eval file",
lambda f, x: json.dumps(f["post-process"].for_manual_evaluation()), ext="json")
PostProcessPipeline.enqueue("bleu", "Get BLEU score",
lambda f, x: f["post-process"].evaluate())

Expand All @@ -62,6 +64,8 @@
x["test-config"]))
TranslatePipeline.enqueue("coverage", "Coverage of translation",
lambda f, x: f["translate"].coverage())
TranslatePipeline.enqueue("retries", "Retries of translation",
lambda f, x: f["translate"].retries())
TranslatePipeline.enqueue("eval-naive-reg", "Evaluate naive REG", PostProcessPipeline.mutate({"reg-name": "naive"}))
TranslatePipeline.enqueue("eval-bert-reg", "Evaluate BERT REG", PostProcessPipeline.mutate({"reg-name": "bert"}))

Expand All @@ -70,7 +74,7 @@
PlannerTranslatePipeline.enqueue("translate-best", "Translate best out",
TranslatePipeline.mutate({"test-config": best_out_config}))
verify_out_config = {"beam_size": 5, "find_best": True}
PlannerTranslatePipeline.enqueue("translate-verify", "Translate best out",
PlannerTranslatePipeline.enqueue("translate-verify", "Translate verified out",
TranslatePipeline.mutate({"test-config": verify_out_config}))


Expand Down Expand Up @@ -100,8 +104,27 @@ def model_pipeline(train_config):
if __name__ == "__main__":
config = Config(reader=WebNLGDataReader)

res = ExperimentsPipeline.mutate({"config": config}) \
.execute("WebNLG Experiments", cache_name="WebNLG_Exp")
all_res = [
ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp1"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp2"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp3"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp4")
]

for model_name in ["model", "model-feats"]:
print(model_name)
for decoding_method in ["best", "verify"]:
# print("\t", decoding_method)
all_retries = {"seen": 0, "unseen": 0}
for res in all_res:
retries = res[model_name]["translate-neural"]["translate-" + decoding_method]["retries"]
all_retries["seen"] += retries["seen"]
all_retries["unseen"] += retries["unseen"]
# print("\t\t", retries)
all_retries["seen"] = all_retries["seen"] / len(all_res)
all_retries["unseen"] = all_retries["unseen"] / len(all_res)
print("\t", decoding_method, all_retries)

# print(res["naive-planner"]["test-corpus"].data[100].plan)
# print(res["model"]["translate-naive"]["translate-best"]["translate"].data[100].plan)
Expand All @@ -113,40 +136,42 @@ def model_pipeline(train_config):
# print(res["model"]["translate-neural"]["translate-best"]["translate"].data[100].plan)
# print(res["model"]["translate-neural"]["translate-best"]["translate"].data[100].hyp)

# Coverage
table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for planner_name in ["naive", "neural"]:
print("\t", planner_name)
translation = model["translate-" + planner_name]

for i, res in enumerate(all_res):
print("\n\n\n", i)
# Coverage
table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for planner_name in ["naive", "neural"]:
print("\t", planner_name)
translation = model["translate-" + planner_name]

for decoding_method in ["best", "verify"]:
cov = translation["translate-" + decoding_method]["coverage"]
# print("\t\t", decoding_method, "\t", translation["translate-" + decoding_method]["coverage"])
tabbed = "\t".join([str(round(a * 100, 1)) for a in
[cov["seen"]["entities"], cov["seen"]["order"], cov["unseen"]["entities"],
cov["unseen"]["order"]]])
table.append(tabbed)
print("\t\t", decoding_method, "\t", tabbed)
print("\n".join(table))

# BLEU
bleu_table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for decoding_method in ["best", "verify"]:
cov = translation["translate-" + decoding_method]["coverage"]
# print("\t\t", decoding_method, "\t", translation["translate-" + decoding_method]["coverage"])
tabbed = "\t".join([str(round(a * 100, 1)) for a in
[cov["seen"]["entities"], cov["seen"]["order"], cov["unseen"]["entities"],
cov["unseen"]["order"]]])
table.append(tabbed)
print("\t\t", decoding_method, "\t", tabbed)
print("\n".join(table))

# BLEU
bleu_table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for decoding_method in ["best", "verify"]:
bleus = []
bleus = []

for planner_name in ["naive", "neural"]:
translation = model["translate-" + planner_name]["translate-" + decoding_method]
for reg_name in ["naive", "bert"]:
bleus.append(translation["eval-" + reg_name + "-reg"]["bleu"][0])
for planner_name in ["naive", "neural"]:
translation = model["translate-" + planner_name]["translate-" + decoding_method]
for reg_name in ["naive", "bert"]:
bleus.append(translation["eval-" + reg_name + "-reg"]["bleu"][0])

tabbed = "\t".join([str(round(a, 2)) for a in bleus])
bleu_table.append(tabbed)
tabbed = "\t".join([str(round(a, 2)) for a in bleus])
bleu_table.append(tabbed)

print("\t", decoding_method, "\t", tabbed)
print("\n".join(bleu_table))
print("\t", decoding_method, "\t", tabbed)
print("\n".join(bleu_table))
43 changes: 15 additions & 28 deletions manual-evaluation/anal.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
from itertools import chain
from json import load

from amt.manual.create import files, load_delex, ids, seen_limit, corpus
import numpy as np

samples = load(open("samples.json"))
if __name__ == "__main__":
samples = load(open("samples.json"))
rdfs = list(chain.from_iterable([s["rdf"] for s in samples]))
hal = np.sum([s["hal"] for s in samples])

mapper = {(s["id"], s["sen"]): s for s in samples}
total = len(rdfs)
exists = len([r for s, r, o, res in rdfs if res == "yes"])
doesnt = len([r for s, r, o, res in rdfs if res == "no"])
wrong = len([r for s, r, o, res in rdfs if res == "no-lex"])
wrong_reg = len([r for s, r, o, res in rdfs if res == "no-reg"])

# if __name__ == "__main__":
# for f in files:
# s = load_delex(f)
#
# scores = []
#
# for id in ids:
# if id <= seen_limit:
# id = id - 1
# sen = s[id].lower().replace("ent_", "<b>ent_").replace("_ent", "_ent</b>")
#
# key = (id, relex(sen))
# if key in mapper:
# scores.append(mapper[(id, relex(sen))])
#
# rdfs = hal = exists = doesnt = wrong = 0
# for sc in scores:
# hal += sc["hal"]
# rdfs += len(sc["rdf"])
# exists += len([r for s, r, o, res in sc["rdf"] if res == "yes"])
# doesnt += len([r for s, r, o, res in sc["rdf"] if res == "no"])
# wrong += len([r for s, r, o, res in sc["rdf"] if res == "no-lex"])
#
# print(f, "rdfs", rdfs, "hallucinations", hal, "exists", exists, "doesn't", doesnt, "wrong-lex", wrong)
# print("verify", exists, "+", doesnt, "+", wrong, "=", exists+doesnt+wrong)
print([(s,r,o) for s, r, o, res in rdfs if res == "no"])

print("rdfs", total, "hallucinations", hal, "exists", exists, "doesn't", doesnt, "wrong-lex", wrong, "wrong-reg", wrong_reg)
print("verify", exists, "+", doesnt, "+", wrong, "+", wrong_reg, "=", total)
1 change: 1 addition & 0 deletions manual-evaluation/analysed/EMNLP-naive.json

Large diffs are not rendered by default.

Loading

0 comments on commit 225e1d9

Please sign in to comment.