Added support for subword unit view and corresponding subword unit em…

…beddings. Squashed merge commit of the following: commit e9367f8f94ffd09fe76b0c12d9ace54a79533a4f Author: Darlene Stewart <[email protected]> Date: Mon May 27 18:18:33 2019 -0400 Fixed YiSi SRL test reference files yet again. commit a7145d50093c360642b8da68c2501c5333973ab5 Merge: ea751e4 d1fe9d8 Author: Darlene Stewart <[email protected]> Date: Mon May 27 17:54:35 2019 -0400 Merge branch 'dev.merge.NRC-private' Merge NRC-private commits db8a070 through f35fd82: git cherry-pick -e -x db8a070 git cherry-pick -e -x db8a070..96a8a7d git cherry-pick -e -x f35fd82 Also, fixed code formatting and YiSi SRL test reference files. commit d1fe9d816c06940cd91a85beb0ca63680bae9114 Author: Darlene Stewart <[email protected]> Date: Mon May 27 17:39:21 2019 -0400 Fixed the YiSi SRL test reference files again. commit 7f3931c178255a4a1ecac013691cbee915f3a8e6 Author: Darlene Stewart <[email protected]> Date: Mon May 27 17:43:36 2019 -0400 Code formatting fixes commit d0022bcd5b53b5787f2ca4e8c628d345d4e2c8f7 Author: Darlene Stewart <[email protected]> Date: Mon May 27 14:27:02 2019 -0400 Fixed copyright year. commit 748df7c0dcef368720dc7b535d1838d6019bb9ad Author: Darlene Stewart <[email protected]> Date: Thu May 23 17:14:16 2019 -0400 Update YiSi test reference files to match output on NRC-private branch. (cherry picked from NRC-private branch commit f35fd82e646f1d60143b4976ff8d58c93760544b) commit 986b61974cfc8ed08e7e06d434b8195916f457e0 Author: Darlene Stewart <[email protected]> Date: Wed May 22 16:59:44 2019 -0400 Fixed an int-size_t comparison causing a g++ warning in yisi::read_sent(). (cherry picked from NRC-private branch commit 96a8a7d652888f05bc5fa95c60c243591cafc543) commit 08b31bfa966af2cb4e965ecb97dfc4cf8047c7ba Author: Darlene Stewart <[email protected]> Date: Wed May 22 16:20:00 2019 -0400 Update YiSi test reference files to match the current state of the NRC-private branch. 
(cherry picked from NRC-private branch commit 5b5f4a25e69625cd1b8a0f7a8d8974e0ce2b0e60) commit 5f60ca583f0b127431c04b87dcab6d606ca6214b Author: Darlene Stewart <[email protected]> Date: Wed May 22 14:39:23 2019 -0400 Fixed read_con1109 to set the sentence tokens for 'word' type. (cherry picked from NRC-private branch commit 2bd0c3f9e1c9029494619d2175c187a9605ca635) commit 745004781aeabbdd1cbdfbed198de473f1706ab2 Author: Jackie Lo <[email protected]> Date: Tue May 14 18:27:46 2019 -0400 Bug fix for reading srl parse in conll09 format. (cherry picked from NRC-private branch commit 4b8740d2ef1c23390bcef3d0fe4a6acdcc446dfe) commit 875171ae255dbe5a5ae7ce275e2aed5d13819373 Author: Jackie Lo <[email protected]> Date: Wed May 8 07:24:43 2019 -0400 Rewriting confusing progress message in main function. (cherry picked from NRC-private branch commit f29066dceba8ec5ac92b33e5142d7075f78aeafc) commit 783ee3611d90a46503528c21b6b1b3e1efd96f69 Author: Jackie Lo <[email protected]> Date: Wed May 8 07:15:35 2019 -0400 Another bug fix in reading conll09 formatted srl. (cherry picked from NRC-private branch commit 675e73d35dd231d39cc415a0e7311f56052af981) commit 7d038c0d2d49b6d96182326d3174899bfa2fef03 Author: Jackie Lo <[email protected]> Date: Wed May 8 00:01:41 2019 -0400 Bug fix in reading conll09 srl format. (cherry picked from NRC-private branch commit c3119343e2cceda62c37f3aed559c5f3014f7573) commit fb6e0b69589294f9bda98efd25ee143ba6341a98 Author: Jackie Lo <[email protected]> Date: Tue May 7 16:00:15 2019 -0400 Added data structure for sentence. (cherry picked from NRC-private branch commit d5cdb883271ef8a30aedb5727613b57fb799821c) commit efa1ea1014ed88910e42572720cf626423bc1713 Author: Jackie Lo <[email protected]> Date: Tue May 7 15:59:09 2019 -0400 Added some handy tools for general w2v embeddings analysis. 
(cherry picked from NRC-private branch commit 7bd1fca5447cfc176da26a48fdebb90f9d602da7) commit 2d81344c61d7e1dbeac55ab5943715c293c0ddd1 Author: Jackie Lo <[email protected]> Date: Tue May 7 12:38:12 2019 -0400 Redesign of data structure for sentences to support additional subword unit view and corresponding subword unit embeddings. (cherry picked from NRC-private branch commit 1df49fec9be3196adfbd5ab990dd7a323ec9c7f6)
nrc-cnrc · May 28, 2019 · fd07bfc · fd07bfc
1 parent 6333066
commit fd07bfc
Show file tree

Hide file tree

Showing 48 changed files with 2,112 additions and 1,201 deletions.
diff --git a/src/Makefile b/src/Makefile
@@ -12,7 +12,7 @@
 # Override the value of MATEPLUS_HOME with a command line definition, or
 # consider defining MATEPLUS_HOME in your .profile, for example:
 #   export MATEPLUS_HOME=~/u/sandboxes/mateplus
-MATEPLUS_HOME ?= ~/u/tools/MATE/mateplus-master/src
+MATEPLUS_HOME ?= /home/loc982/u/tools/mateplus
 
 MATETOOLS_HOME ?= $(MATEPLUS_HOME)
 
@@ -47,6 +47,7 @@ LIBRARIES += -Wl,-Bstatic -lcmdlp -Wl,-Bdynamic
 PROG_NAMES := yisi
 TEST_NAMES := srlgraph_test maxmatching_test lexsim_test w2v_test biw2v_test \
 	      lexweight_test phrasesim_test srl_test srlutil_test util_test \
+	      emap_test oov_test ngram_test overlapvocab_test \
 	      yisiscorer_test testbin
 CMDLP_TEST_NAMES := cmdlp_test
 

diff --git a/src/emap_test.cpp b/src/emap_test.cpp
@@ -0,0 +1,74 @@
+/**
+ * @file emap_test.cpp
+ * @brief Maps each input-document token to its most similar hypothesis-vocabulary word by embedding dot product.
+ *
+ * @author Jackie Lo
+ *
+ * Multilingual Text Processing / Traitement multilingue de textes
+ * Digital Technologies Research Centre / Centre de recherche en technologies numériques
+ * National Research Council Canada / Conseil national de recherches Canada
+ * Copyright 2019, Her Majesty in Right of Canada /
+ * Copyright 2019, Sa Majeste la Reine du Chef du Canada
+ */
+
+#include "lexsim.h"
+
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace yisi;
+
+/**
+ * For every distinct token of the input document that has an input
+ * embedding, write the hypothesis-vocabulary word whose embedding has the
+ * largest positive dot product with it.
+ *
+ * Usage: emap_test <inp-emb> <hyp-emb> <out-map> <inp-doc>
+ *
+ * Each output line is "<input token>\t<best hypothesis word>". The second
+ * field is empty when no hypothesis word scores above 0.
+ *
+ * @return 0 on success, 1 on a usage error or when the two embedding models
+ *         report different dimensions.
+ */
+int main(int argc, char* argv[])
+{
+   if (argc < 5) {
+      cerr << "Usage: " << argv[0]
+           << " <inp-emb> <hyp-emb> <out-map> <inp-doc>" << endl;
+      return 1;
+   }
+   string inpembpath = argv[1];
+   string hypembpath = argv[2];
+   string inpmappath = argv[3];
+   string inpdocpath = argv[4];
+
+   ofstream INPMAP;
+   open_ofstream(INPMAP, inpmappath);
+
+   map<string, vector<double> > inpemb;
+   map<string, vector<double> > hypemb;
+   // One dimension per model, so the second read cannot silently overwrite
+   // the first.
+   int inpdim = 0;
+   int hypdim = 0;
+   read_binw2v(inpembpath, inpemb, inpdim);
+   read_binw2v(hypembpath, hypemb, hypdim);
+   if (inpdim != hypdim) {
+      cerr << "ERROR: embedding dimensions differ (" << inpdim << " vs "
+           << hypdim << "). Exiting..." << endl;
+      return 1;
+   }
+
+   // Collect the distinct tokens of the input document.
+   vector<string> inpsents = read_file(inpdocpath);
+   set<string> tokens;
+   for (auto& line : inpsents) {
+      auto sent = tokenize(line);
+      tokens.insert(sent.begin(), sent.end());
+   }
+
+   // Tokens are visited in sorted order; those without an input embedding
+   // are skipped.
+   for (const auto& tok : tokens) {
+      const auto found = inpemb.find(tok);
+      if (found == inpemb.end()) {
+         continue;
+      }
+      const vector<double>& inp_v = found->second;
+
+      // Reset the best match for every token so a token with no positive
+      // score does not inherit the previous token's match.
+      string maxsim_str;
+      double maxsim_scr = 0.0;
+      for (const auto& entry : hypemb) {
+         const vector<double>& hyp_v = entry.second;
+         // Bound the loop by the shorter vector to stay inside both.
+         const size_t len =
+            inp_v.size() < hyp_v.size() ? inp_v.size() : hyp_v.size();
+         double sim = 0.0;
+         for (size_t i = 0; i < len; i++) {
+            sim += inp_v[i] * hyp_v[i];
+         }
+         if (sim > maxsim_scr) {
+            maxsim_str = entry.first;
+            maxsim_scr = sim;
+         }
+      }
+      INPMAP << tok << "\t" << maxsim_str << "\n";
+   }
+   return 0;
+}
+
diff --git a/src/lexsim.cpp b/src/lexsim.cpp
@@ -45,15 +45,15 @@ double lexsimexact_t::get_sim(string ref, string hyp, int mode) {
 double lexsimlcs_t::get_sim(string ref, string hyp, int mode) {
    if (mode == yisi::INP_MODE) {
       cerr << "ERROR: longest common subsequence lex sim model is not defined "
-           << "in crosslingual settings. Exiting..." << endl;
+         << "in crosslingual settings. Exiting..." << endl;
       exit(1);
    }
    double lcs_n = 0.0;
    size_t ref_n = ref.length();
    size_t hyp_n = hyp.length();
    // find the length of the longest common character subsequence
-   for (size_t i = 0; i < ref_n - lcs_n - 1; i++) {
-      //cerr << "Current ref pos: " << i << endl;
+   for (size_t i = 0; i < ref_n - lcs_n; i++) {
+      // cerr << "Current ref pos: " << i << endl;
       size_t j;
       for (j = lcs_n + 1; j <= ref_n - i; j++) {
          //cerr << "Previous common length: " << lcs_n << endl;
@@ -214,51 +214,51 @@ void lexsimw2v_t::write_txtw2v(std::string path) {
    cerr << "Done." << endl;
 }
 
-lexsimemapw2v_t::lexsimemapw2v_t(string emap_path, string outw2v_path)
-: lexsimw2v_t(outw2v_path) {
-  cerr << "Reading emap model from " << emap_path << endl;
-  ifstream EMAP(emap_path.c_str());
-  if (!EMAP) {
-    cerr << "ERROR: fail to open ibm model. Exiting..." << endl;
-    exit(1);
-  }
-  while (!EMAP.eof()) {
-    string inp;
-    string hyp;
-    EMAP >> inp >> hyp;
-    emap_m[inp]=hyp;
-  }
-  EMAP.close();
-  cerr << "Finished reading." << endl;
+/**
+ * Load the output w2v model through the base-class constructor, then read
+ * the embedding map: whitespace-separated "<input word> <hypothesis word>"
+ * pairs stored in emap_m.
+ *
+ * Exits the process with status 1 if the map file cannot be opened.
+ *
+ * @param emap_path    path of the embedding-map file
+ * @param outw2v_path  path passed on to lexsimw2v_t
+ */
+lexsimemapw2v_t::lexsimemapw2v_t(string emap_path, string outw2v_path) :
+   lexsimw2v_t(outw2v_path) {
+   cerr << "Reading emap model from " << emap_path << endl;
+   ifstream EMAP(emap_path.c_str());
+   if (!EMAP) {
+      cerr << "ERROR: fail to open emap model. Exiting..." << endl;
+      exit(1);
+   }
+   // Loop on the extraction itself: a trailing newline or a dangling
+   // unpaired token ends the loop instead of storing an entry built from a
+   // failed read.
+   string inp;
+   string hyp;
+   while (EMAP >> inp >> hyp) {
+      emap_m[inp] = hyp;
+   }
+   EMAP.close();
+   cerr << "Finished reading." << endl;
 }
 
 vector<double>& lexsimemapw2v_t::get_wv(string word, int mode) {
-  if (mode == yisi::INP_MODE){
-    if (emap_m.find(word) != emap_m.end()) {
-      word = emap_m[word];
-    } else if (emap_m.find(lowercase(word)) != emap_m.end()){
-      word = emap_m[lowercase(word)];
-    }
-  }
-  return yisi::get_wv(outembeddings_m, word);
+   if (mode == yisi::INP_MODE) {
+      if (emap_m.find(word) != emap_m.end()) {
+         word = emap_m[word];
+      } else if (emap_m.find(lowercase(word)) != emap_m.end()) {
+         word = emap_m[lowercase(word)];
+      }
+   }
+   return yisi::get_wv(outembeddings_m, word);
 }
 
 double lexsimemapw2v_t::get_sim(string s1, string hyp, int mode) {
-  if (lowercase(s1) == lowercase(hyp)){
-    return 1.0;
-  } else {
-    double result = this->get_sim(this->get_wv(s1, mode), this->get_wv(hyp, yisi::HYP_MODE));
-    //cerr << "(" << s1 << "," << hyp << "," << mode << "," << result << ")" << endl;
-    return result;
-  }
+   if (lowercase(s1) == lowercase(hyp)) {
+      return 1.0;
+   } else {
+      double result = this->get_sim(this->get_wv(s1, mode), this->get_wv(hyp, yisi::HYP_MODE));
+      //cerr << "(" << s1 << "," << hyp << "," << mode << "," << result << ")" << endl;
+      return result;
+   }
 }
 
 double lexsimemapw2v_t::get_sim(vector<double>& s1, vector<double>& hyp) {
-  if ((int)s1.size() == dimension_m && (int)hyp.size() == dimension_m) {
-    return yisi::get_sim(s1, hyp, func_m);
-  } else {
-    return 0.0;
-  }
+   if ((int)s1.size() == dimension_m && (int)hyp.size() == dimension_m) {
+      return yisi::get_sim(s1, hyp, func_m);
+   } else {
+      return 0.0;
+   }
 }
 
 lexsimbiw2v_t::lexsimbiw2v_t(string inpw2v_path, string outw2v_path)
@@ -307,6 +307,15 @@ double lexsimbiw2v_t::get_sim(vector<double>& s1, vector<double>& hyp) {
    }
 }
 
+/**
+ * String-based similarity is not available for this model: report the
+ * error on stderr and exit the process with status 1.
+ */
+double lexsimemb_t::get_sim(string s1, string hyp, int mode) {
+   cerr << "ERROR: lexsim model is a contextual embedding model, "
+           "cannot compute lexsim without providing the embedding. Exiting..."
+        << endl;
+   exit(1);
+}
+
+/**
+ * Similarity of two caller-supplied embedding vectors, delegated to
+ * yisi::get_sim with this model's func_m.
+ */
+double lexsimemb_t::get_sim(vector<double>& s1, vector<double>& hyp) {
+   const double score = yisi::get_sim(s1, hyp, func_m);
+   return score;
+}
+
 lexsim_t::lexsim_t() {
    lexsim_p = new lexsimexact_t();
 }
@@ -324,6 +333,8 @@ lexsim_t::lexsim_t(string name, string out_path, string inp_path) {
      lexsim_p = new lexsimbiw2v_t(inp_path, out_path);
    } else if (name == "lcs") {
      lexsim_p = new lexsimlcs_t();
+   } else if (name == "emb"){
+     lexsim_p = new lexsimemb_t();
    } else {
      cerr << "ERROR: Unknown lexsim model type " << name << endl;
    }
@@ -345,6 +356,8 @@ lexsim_t::lexsim_t(lexsim_t& rhs) {
       lexsim_p = new lexsimbiw2v_t(rhs.inplexsim_path_m, rhs.outlexsim_path_m);
    } else if (rhs.lexsim_name_m == "lcs") {
       lexsim_p = new lexsimlcs_t();
+   } else if (rhs.lexsim_name_m == "emb") {
+     lexsim_p = new lexsimemb_t();
    }
    lexsim_name_m = rhs.lexsim_name_m;
    outlexsim_path_m = rhs.outlexsim_path_m;
@@ -404,7 +417,7 @@ void yisi::read_binw2v(string path, map<string, vector<double> >& model, int& di
    long long d = 0;
    char tmp;
 
-   cerr << "Reading w2v model from " << path << endl;
+   cerr << "Reading w2v binary model from " << path << endl;
    ifstream W2V(path.c_str(), ios::in | ios::binary);
    if (!W2V) {
       cerr << "ERROR: Failed to open w2v model. Exiting..." << endl;

diff --git a/src/lexsim.h b/src/lexsim.h
@@ -97,16 +97,28 @@ namespace yisi {
       int dimension_m;
    }; // class lexsimw2v_t
 
+   class lexsimemb_t:public lexsimmodel_t {
+   public:
+      lexsimemb_t() {
+         func_m = "cosine";
+      }
+      virtual ~lexsimemb_t() {}
+      virtual double get_sim(std::string ref, std::string hyp, int mode);
+      virtual double get_sim(std::vector<double>& ref, std::vector<double>& hyp);
+   protected:
+      std::string func_m;
+   }; // class lexsimemb_t
+
    class lexsimemapw2v_t:public lexsimw2v_t {
    public:
-     lexsimemapw2v_t() {}
-     lexsimemapw2v_t(std::string emap_path, std::string outw2v_func);
-     virtual ~lexsimemapw2v_t() {}
-     std::vector<double>& get_wv(std::string word, int mode);
-     virtual double get_sim(std::string s1, std::string hyp, int mode);
-     virtual double get_sim(std::vector<double>& s1, std::vector<double>& hyp);
+      lexsimemapw2v_t() {}
+      lexsimemapw2v_t(std::string emap_path, std::string outw2v_func);
+      virtual ~lexsimemapw2v_t() {}
+      std::vector<double>& get_wv(std::string word, int mode);
+      virtual double get_sim(std::string s1, std::string hyp, int mode);
+      virtual double get_sim(std::vector<double>& s1, std::vector<double>& hyp);
    private:
-     std::map<std::string, std::string> emap_m;
+      std::map<std::string, std::string> emap_m;
    }; // class lexsimemapw2v_t
 
 //   class lexsimibm_t:public lexsimmodel_t {

diff --git a/src/ngram_test.cpp b/src/ngram_test.cpp
@@ -0,0 +1,43 @@
+/**
+ * @file ngram_test.cpp
+ * @brief Prints the set of distinct n-grams collected from standard input.
+ *
+ * @author Jackie Lo
+ *
+ * Multilingual Text Processing / Traitement multilingue de textes
+ * Digital Technologies Research Centre / Centre de recherche en technologies numériques
+ * National Research Council Canada / Conseil national de recherches Canada
+ * Copyright 2019, Her Majesty in Right of Canada /
+ * Copyright 2019, Sa Majeste la Reine du Chef du Canada
+ */
+
+#include "util.h"
+
+#include <iostream>
+#include <vector>
+#include <set>
+#include <string>
+
+using namespace std;
+using namespace yisi;
+
+/**
+ * Print, one per line, the distinct n-grams found in standard input.
+ *
+ * Usage: ngram_test <n> < text
+ *
+ * Input is read a line at a time so that n-grams can span the tokens of a
+ * line; each line is tokenized and its n-grams are joined and added to a
+ * set, which is then printed in sorted order.
+ *
+ * @return 0 on success, 1 when the n-gram order argument is missing.
+ */
+int main(int argc, char* argv[])
+{
+   if (argc < 2) {
+      cerr << "Usage: " << argv[0] << " <n>" << endl;
+      return 1;
+   }
+   const int order = atoi(argv[1]);
+
+   set<string> result;
+   string line;
+   // Loop on getline itself: it stops on EOF or any stream failure without
+   // an extra pass over an empty read.
+   while (getline(cin, line)) {
+      auto tokens = tokenize(line);
+      auto ngrams = collect_ngram(order, tokens);
+      for (auto& gram : ngrams) {
+         result.insert(join(gram));
+      }
+   }
+   for (const auto& ngram : result) {
+      cout << ngram << endl;
+   }
+
+   return 0;
+}
+
diff --git a/src/oov_test.cpp b/src/oov_test.cpp
@@ -0,0 +1,39 @@
+/**
+ * @file oov_test.cpp
+ * @brief Prints the tokens from standard input for which the w2v model returns an empty vector.
+ *
+ * @author Jackie Lo
+ *
+ * Multilingual Text Processing / Traitement multilingue de textes
+ * Digital Technologies Research Centre / Centre de recherche en technologies numériques
+ * National Research Council Canada / Conseil national de recherches Canada
+ * Copyright 2019, Her Majesty in Right of Canada /
+ * Copyright 2019, Sa Majeste la Reine du Chef du Canada
+ */
+
+#include "lexsim.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace yisi;
+
+/**
+ * Print, one per line, every token read from standard input for which the
+ * w2v model returns an empty vector.
+ *
+ * Usage: oov_test <w2v-model> < text
+ *
+ * @return 0 on success, 1 when the model path argument is missing.
+ */
+int main(int argc, char* argv[])
+{
+   if (argc < 2) {
+      cerr << "Usage: " << argv[0] << " <w2v-model>" << endl;
+      return 1;
+   }
+   lexsim_t w2vtxt("w2v", argv[1], "cosine");
+   string sent;
+
+   // Loop on getline itself so the loop ends on EOF or any stream failure.
+   while (getline(cin, sent)) {
+      auto tokens = tokenize(sent);
+      for (auto it = tokens.begin(); it != tokens.end(); it++) {
+         if ((w2vtxt.get_wv(*it, HYP_MODE)).size() == 0) {
+            cout << *it << endl;
+         }
+      }
+   }
+
+   return 0;
+}
+