Skip to content

Commit

Permalink
feat(experiments): add manual evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
AmitMY committed May 21, 2019
1 parent 2b7fd01 commit 225e1d9
Show file tree
Hide file tree
Showing 15 changed files with 3,700 additions and 92 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.idea/
libs/
!libs/__init__.py
cache/
20 changes: 19 additions & 1 deletion data/WebNLG/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,19 @@
from utils.dbpedia import get_dbpedia_entity, pronouns
from utils.relex import RepresentsInt

# Hand-picked item ids from the test corpus selected for manual evaluation.
# NOTE(review): these appear to be 1-based ids — membership is checked as
# `i + 1 in FOR_MANUAL_EVAL` against the 0-based entry index; confirm.
FOR_MANUAL_EVAL = {18, 27, 37, 40, 41, 42, 55, 66, 69, 87, 90, 97, 101, 119, 130, 131, 133, 135, 142, 143, 144, 149,
                   150, 155, 169, 184, 188, 202, 209, 213, 223, 224, 225, 235, 239, 243, 257, 262, 274, 294, 301, 305,
                   310, 311, 325, 330, 334, 337, 355, 356, 359, 362, 366, 369, 373, 374, 375, 376, 383, 396, 400, 402,
                   403, 414, 419, 424, 438, 449, 451, 471, 472, 476, 483, 502, 511, 513, 518, 525, 536, 537, 538, 561,
                   569, 578, 581, 584, 585, 586, 591, 593, 602, 603, 619, 621, 623, 624, 632, 633, 648, 666, 669, 672,
                   691, 695, 696, 700, 701, 702, 706, 707, 717, 724, 729, 730, 734, 737, 740, 762, 768, 782, 786, 788,
                   793, 797, 805, 820, 825, 826, 827, 828, 833, 835, 836, 837, 840, 842, 857, 869, 871, 873, 876, 881,
                   889, 891, 899, 908, 913, 916, 993, 1010, 1020, 1038, 1042, 1075, 1080, 1091, 1107, 1131, 1139, 1173,
                   1175, 1181, 1183, 1205, 1208, 1224, 1261, 1265, 1276, 1288, 1298, 1309, 1325, 1329, 1341, 1345, 1363,
                   1368, 1393, 1399, 1405, 1436, 1440, 1445, 1463, 1465, 1466, 1504, 1505, 1523, 1530, 1537, 1542, 1577,
                   1579, 1582, 1606, 1613, 1614, 1620, 1639, 1648, 1668, 1673, 1692, 1704, 1721, 1733, 1752, 1755, 1763,
                   1772, 1774, 1776, 1782, 1784, 1794, 1796, 1807, 1810, 1852, 1859, 1861}


class RDFFileReader:
def __init__(self, file_name):
Expand All @@ -27,7 +40,12 @@ def __init__(self, file_name):
sentences = list(self.extract_sentences(entry["lex"]))

for s in sentences:
self.data.append(Datum(rdfs=triplets, text=s, info={"seen": not is_test_file or i <= 970}))
info = {
"id": i,
"seen": not is_test_file or i <= 970,
"manual": is_test_file and i + 1 in FOR_MANUAL_EVAL and i <= 970
}
self.data.append(Datum(rdfs=triplets, text=s, info=info))

def extract_sentences(self, lex):
sentences = lex
Expand Down
53 changes: 47 additions & 6 deletions data/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from multiprocessing.pool import Pool
from typing import List, Tuple, Dict, Callable

import numpy as np
from tqdm import tqdm
import time

Expand Down Expand Up @@ -68,6 +69,10 @@ def set_hyp(self, hyp: str):

def set_plan(self, plan: str):
    """Attach *plan* to this datum and count the assignment (fluent API).

    After the first call `plan_changes` is 2, so `plan_changes - 1` gives
    the number of plan assignments made on this datum.
    """
    self.plan = plan

    if not hasattr(self, "plan_changes"):  # TODO remove after EMNLP
        # Backfill the counter for data created before this field existed.
        self.plan_changes = 1

    self.plan_changes += 1
    return self

Expand Down Expand Up @@ -217,7 +222,13 @@ def create_plans(self, planner):
self.timing[g_size].append(time.time() - start)

graph_plan = {g.unique_key(): p for g, p in zip(unique_graphs, plans)}
self.data = [d.set_plan(graph_plan[d.graph.unique_key()]) for d in self.data]
for d in self.data:
plans = graph_plan[d.graph.unique_key()]
if isinstance(plans, list):
d.set_plan(plans[0])
d.set_plans(plans[1:])
else:
d.set_plan(plans)
return self

def tokenize_plans(self):
Expand Down Expand Up @@ -250,7 +261,7 @@ def translate_plans(self, model: Model, planner, opts=None):

for d, p, t in zip(data, plans, translations):
is_covered_ent, is_covered_order = self.single_coverage(p, t)
if (not planner.re_plan) or is_covered_order:
if is_covered_order:
d.set_hyp(t)

graph_key = d.graph.unique_key()
Expand All @@ -261,10 +272,17 @@ def translate_plans(self, model: Model, planner, opts=None):
if len(data) == 0:
break

unique_graphs = {d.graph.unique_key(): d.graph for d in data}
graph_plans = {k: planner.plan_random(g, 1)[0] for k, g in unique_graphs.items()}
for d in data:
d.set_plan(graph_plans[d.graph.unique_key()])
if planner.re_plan == "PREMADE":
for d in data:
plans = d.plans
if len(plans) > 0:
d.set_plans(plans[1:])
d.set_plan(plans[0])
else:
unique_graphs = {d.graph.unique_key(): d.graph for d in data}
graph_plans = {k: planner.plan_random(g, 1)[0] for k, g in unique_graphs.items()}
for d in data:
d.set_plan(graph_plans[d.graph.unique_key()])

self.coverage()

Expand Down Expand Up @@ -346,5 +364,28 @@ def coverage(self):

return coverage

def retries(self):
    """Average plan re-assignments per unique plan, split into seen/unseen.

    Data lacking a `plan_changes` counter contribute a single retry.
    Prints and returns {"seen": avg, "unseen": avg}.
    """
    buckets = {"seen": {}, "unseen": {}}
    for datum in self.data:
        bucket = "seen" if datum.info["seen"] else "unseen"
        # getattr default 2 makes the fallback value 2 - 1 == 1,
        # matching the behavior for data without a recorded counter.
        buckets[bucket][datum.plan] = getattr(datum, "plan_changes", 2) - 1

    averages = {name: np.average(list(counts.values()))
                for name, counts in buckets.items()}
    print("sums", averages)
    return averages

def for_manual_evaluation(self):
    """Collect one entry per unique graph that is flagged for manual evaluation.

    Returns a list of dicts, one per unique graph key, each holding the
    datum id, the hypothesis sentence, the graph's RDF triplets (with a
    trailing None slot for the annotator's verdict), and a hallucination
    counter initialised to 0.
    """
    graphs = {}
    for d in self.data:
        if d.info.get("manual"):
            graphs[d.graph.unique_key()] = {
                # BUG FIX: d.info is a dict, so hasattr(d.info, "id") was
                # always False and "id" was always None; use dict.get.
                "id": d.info.get("id"),
                "sen": d.hyp,
                # Renamed triplet vars so the outer loop's `d` is not shadowed.
                "rdf": [(s, p, o, None) for s, p, o in d.graph.as_rdf()],
                "hal": 0
            }

    return list(graphs.values())

def describe_entities(self):
    """No-op pipeline stage: returns the reader itself, unchanged (fluent)."""
    return self
99 changes: 62 additions & 37 deletions experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@
lambda f, x: x["translate"].copy().post_process(x[x["reg-name"] + "-reg"]))
PostProcessPipeline.enqueue("ents-reg-map", "Ents REG map",
lambda f, x: f["post-process"].ents_reg_map
if hasattr(f["post-process"], "ents_reg_map") else {})
if hasattr(f["post-process"], "ents_reg_map") else {})
PostProcessPipeline.enqueue("for-manual-eval", "Build manual eval file",
lambda f, x: json.dumps(f["post-process"].for_manual_evaluation()), ext="json")
PostProcessPipeline.enqueue("bleu", "Get BLEU score",
lambda f, x: f["post-process"].evaluate())

Expand All @@ -62,6 +64,8 @@
x["test-config"]))
TranslatePipeline.enqueue("coverage", "Coverage of translation",
lambda f, x: f["translate"].coverage())
TranslatePipeline.enqueue("retries", "Retries of translation",
lambda f, x: f["translate"].retries())
TranslatePipeline.enqueue("eval-naive-reg", "Evaluate naive REG", PostProcessPipeline.mutate({"reg-name": "naive"}))
TranslatePipeline.enqueue("eval-bert-reg", "Evaluate BERT REG", PostProcessPipeline.mutate({"reg-name": "bert"}))

Expand All @@ -70,7 +74,7 @@
PlannerTranslatePipeline.enqueue("translate-best", "Translate best out",
TranslatePipeline.mutate({"test-config": best_out_config}))
verify_out_config = {"beam_size": 5, "find_best": True}
PlannerTranslatePipeline.enqueue("translate-verify", "Translate best out",
PlannerTranslatePipeline.enqueue("translate-verify", "Translate verified out",
TranslatePipeline.mutate({"test-config": verify_out_config}))


Expand Down Expand Up @@ -100,8 +104,27 @@ def model_pipeline(train_config):
if __name__ == "__main__":
config = Config(reader=WebNLGDataReader)

res = ExperimentsPipeline.mutate({"config": config}) \
.execute("WebNLG Experiments", cache_name="WebNLG_Exp")
all_res = [
ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp1"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp2"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp3"),
# ExperimentsPipeline.mutate({"config": config}).execute("WebNLG Experiments", cache_name="WebNLG_Exp4")
]

for model_name in ["model", "model-feats"]:
print(model_name)
for decoding_method in ["best", "verify"]:
# print("\t", decoding_method)
all_retries = {"seen": 0, "unseen": 0}
for res in all_res:
retries = res[model_name]["translate-neural"]["translate-" + decoding_method]["retries"]
all_retries["seen"] += retries["seen"]
all_retries["unseen"] += retries["unseen"]
# print("\t\t", retries)
all_retries["seen"] = all_retries["seen"] / len(all_res)
all_retries["unseen"] = all_retries["unseen"] / len(all_res)
print("\t", decoding_method, all_retries)

# print(res["naive-planner"]["test-corpus"].data[100].plan)
# print(res["model"]["translate-naive"]["translate-best"]["translate"].data[100].plan)
Expand All @@ -113,40 +136,42 @@ def model_pipeline(train_config):
# print(res["model"]["translate-neural"]["translate-best"]["translate"].data[100].plan)
# print(res["model"]["translate-neural"]["translate-best"]["translate"].data[100].hyp)

# Coverage
table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for planner_name in ["naive", "neural"]:
print("\t", planner_name)
translation = model["translate-" + planner_name]

for i, res in enumerate(all_res):
print("\n\n\n", i)
# Coverage
table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for planner_name in ["naive", "neural"]:
print("\t", planner_name)
translation = model["translate-" + planner_name]

for decoding_method in ["best", "verify"]:
cov = translation["translate-" + decoding_method]["coverage"]
# print("\t\t", decoding_method, "\t", translation["translate-" + decoding_method]["coverage"])
tabbed = "\t".join([str(round(a * 100, 1)) for a in
[cov["seen"]["entities"], cov["seen"]["order"], cov["unseen"]["entities"],
cov["unseen"]["order"]]])
table.append(tabbed)
print("\t\t", decoding_method, "\t", tabbed)
print("\n".join(table))

# BLEU
bleu_table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for decoding_method in ["best", "verify"]:
cov = translation["translate-" + decoding_method]["coverage"]
# print("\t\t", decoding_method, "\t", translation["translate-" + decoding_method]["coverage"])
tabbed = "\t".join([str(round(a * 100, 1)) for a in
[cov["seen"]["entities"], cov["seen"]["order"], cov["unseen"]["entities"],
cov["unseen"]["order"]]])
table.append(tabbed)
print("\t\t", decoding_method, "\t", tabbed)
print("\n".join(table))

# BLEU
bleu_table = []
for model_name in ["model", "model-feats"]:
model = res[model_name]
print(model_name)
for decoding_method in ["best", "verify"]:
bleus = []
bleus = []

for planner_name in ["naive", "neural"]:
translation = model["translate-" + planner_name]["translate-" + decoding_method]
for reg_name in ["naive", "bert"]:
bleus.append(translation["eval-" + reg_name + "-reg"]["bleu"][0])
for planner_name in ["naive", "neural"]:
translation = model["translate-" + planner_name]["translate-" + decoding_method]
for reg_name in ["naive", "bert"]:
bleus.append(translation["eval-" + reg_name + "-reg"]["bleu"][0])

tabbed = "\t".join([str(round(a, 2)) for a in bleus])
bleu_table.append(tabbed)
tabbed = "\t".join([str(round(a, 2)) for a in bleus])
bleu_table.append(tabbed)

print("\t", decoding_method, "\t", tabbed)
print("\n".join(bleu_table))
print("\t", decoding_method, "\t", tabbed)
print("\n".join(bleu_table))
43 changes: 15 additions & 28 deletions manual-evaluation/anal.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
from itertools import chain
from json import load

from amt.manual.create import files, load_delex, ids, seen_limit, corpus
import numpy as np

samples = load(open("samples.json"))
if __name__ == "__main__":
samples = load(open("samples.json"))
rdfs = list(chain.from_iterable([s["rdf"] for s in samples]))
hal = np.sum([s["hal"] for s in samples])

mapper = {(s["id"], s["sen"]): s for s in samples}
total = len(rdfs)
exists = len([r for s, r, o, res in rdfs if res == "yes"])
doesnt = len([r for s, r, o, res in rdfs if res == "no"])
wrong = len([r for s, r, o, res in rdfs if res == "no-lex"])
wrong_reg = len([r for s, r, o, res in rdfs if res == "no-reg"])

# if __name__ == "__main__":
# for f in files:
# s = load_delex(f)
#
# scores = []
#
# for id in ids:
# if id <= seen_limit:
# id = id - 1
# sen = s[id].lower().replace("ent_", "<b>ent_").replace("_ent", "_ent</b>")
#
# key = (id, relex(sen))
# if key in mapper:
# scores.append(mapper[(id, relex(sen))])
#
# rdfs = hal = exists = doesnt = wrong = 0
# for sc in scores:
# hal += sc["hal"]
# rdfs += len(sc["rdf"])
# exists += len([r for s, r, o, res in sc["rdf"] if res == "yes"])
# doesnt += len([r for s, r, o, res in sc["rdf"] if res == "no"])
# wrong += len([r for s, r, o, res in sc["rdf"] if res == "no-lex"])
#
# print(f, "rdfs", rdfs, "hallucinations", hal, "exists", exists, "doesn't", doesnt, "wrong-lex", wrong)
# print("verify", exists, "+", doesnt, "+", wrong, "=", exists+doesnt+wrong)
print([(s,r,o) for s, r, o, res in rdfs if res == "no"])

print("rdfs", total, "hallucinations", hal, "exists", exists, "doesn't", doesnt, "wrong-lex", wrong, "wrong-reg", wrong_reg)
print("verify", exists, "+", doesnt, "+", wrong, "+", wrong_reg, "=", total)
1 change: 1 addition & 0 deletions manual-evaluation/analysed/EMNLP-naive.json

Large diffs are not rendered by default.

Loading

0 comments on commit 225e1d9

Please sign in to comment.