-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added support for subword unit view and corresponding subword unit em…
…beddings. Squashed merge commit of the following: commit e9367f8f94ffd09fe76b0c12d9ace54a79533a4f Author: Darlene Stewart <[email protected]> Date: Mon May 27 18:18:33 2019 -0400 Fixed YiSi SRL test reference files yet again. commit a7145d50093c360642b8da68c2501c5333973ab5 Merge: ea751e4 d1fe9d8 Author: Darlene Stewart <[email protected]> Date: Mon May 27 17:54:35 2019 -0400 Merge branch 'dev.merge.NRC-private' Merge NRC-private commits db8a070 through f35fd82: git cherry-pick -e -x db8a070 git cherry-pick -e -x db8a070..96a8a7d git cherry-pick -e -x f35fd82 Also, fixed code formatting and YiSi SRL test reference files. commit d1fe9d816c06940cd91a85beb0ca63680bae9114 Author: Darlene Stewart <[email protected]> Date: Mon May 27 17:39:21 2019 -0400 Fixed the YiSi SRL test reference files again. commit 7f3931c178255a4a1ecac013691cbee915f3a8e6 Author: Darlene Stewart <[email protected]> Date: Mon May 27 17:43:36 2019 -0400 Code formatting fixes commit d0022bcd5b53b5787f2ca4e8c628d345d4e2c8f7 Author: Darlene Stewart <[email protected]> Date: Mon May 27 14:27:02 2019 -0400 Fixed copyright year. commit 748df7c0dcef368720dc7b535d1838d6019bb9ad Author: Darlene Stewart <[email protected]> Date: Thu May 23 17:14:16 2019 -0400 Update YiSi test reference files to match output on NRC-private branch. (cherry picked from NRC-private branch commit f35fd82e646f1d60143b4976ff8d58c93760544b) commit 986b61974cfc8ed08e7e06d434b8195916f457e0 Author: Darlene Stewart <[email protected]> Date: Wed May 22 16:59:44 2019 -0400 Fixed an int-size_t comparison causing a g++ warning in yisi::read_sent(). (cherry picked from NRC-private branch commit 96a8a7d652888f05bc5fa95c60c243591cafc543) commit 08b31bfa966af2cb4e965ecb97dfc4cf8047c7ba Author: Darlene Stewart <[email protected]> Date: Wed May 22 16:20:00 2019 -0400 Update YiSi test reference files to match the current state of the NRC-private branch. (cherry picked from NRC-private branch commit 5b5f4a25e69625cd1b8a0f7a8d8974e0ce2b0e60) commit 5f60ca583f0b127431c04b87dcab6d606ca6214b Author: Darlene Stewart <[email protected]> Date: Wed May 22 14:39:23 2019 -0400 Fixed read_con1109 to set the sentence tokens for 'word' type. (cherry picked from NRC-private branch commit 2bd0c3f9e1c9029494619d2175c187a9605ca635) commit 745004781aeabbdd1cbdfbed198de473f1706ab2 Author: Jackie Lo <[email protected]> Date: Tue May 14 18:27:46 2019 -0400 Bug fix for reading srl parse in conll09 format. (cherry picked from NRC-private branch commit 4b8740d2ef1c23390bcef3d0fe4a6acdcc446dfe) commit 875171ae255dbe5a5ae7ce275e2aed5d13819373 Author: Jackie Lo <[email protected]> Date: Wed May 8 07:24:43 2019 -0400 Rewriting confusing progress message in main function. (cherry picked from NRC-private branch commit f29066dceba8ec5ac92b33e5142d7075f78aeafc) commit 783ee3611d90a46503528c21b6b1b3e1efd96f69 Author: Jackie Lo <[email protected]> Date: Wed May 8 07:15:35 2019 -0400 Another bug fix in reading conll09 formatted srl. (cherry picked from NRC-private branch commit 675e73d35dd231d39cc415a0e7311f56052af981) commit 7d038c0d2d49b6d96182326d3174899bfa2fef03 Author: Jackie Lo <[email protected]> Date: Wed May 8 00:01:41 2019 -0400 Bug fix in reading conll09 srl format. (cherry picked from NRC-private branch commit c3119343e2cceda62c37f3aed559c5f3014f7573) commit fb6e0b69589294f9bda98efd25ee143ba6341a98 Author: Jackie Lo <[email protected]> Date: Tue May 7 16:00:15 2019 -0400 Added data structure for sentence. (cherry picked from NRC-private branch commit d5cdb883271ef8a30aedb5727613b57fb799821c) commit efa1ea1014ed88910e42572720cf626423bc1713 Author: Jackie Lo <[email protected]> Date: Tue May 7 15:59:09 2019 -0400 Added some handy tools for general w2v embeddings analysis. (cherry picked from NRC-private branch commit 7bd1fca5447cfc176da26a48fdebb90f9d602da7) commit 2d81344c61d7e1dbeac55ab5943715c293c0ddd1 Author: Jackie Lo <[email protected]> Date: Tue May 7 12:38:12 2019 -0400 Redesign of data structure for sentences to support additional subword unit view and corresponding subword unit embeddings. (cherry picked from NRC-private branch commit 1df49fec9be3196adfbd5ab990dd7a323ec9c7f6)
- Loading branch information
1 parent
6333066
commit fd07bfc
Showing
48 changed files
with
2,112 additions
and
1,201 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/** | ||
* @file lexsim_test.cpp | ||
* @brief Unit test for lexsim. | ||
* | ||
* @author Jackie Lo | ||
* | ||
* Multilingual Text Processing / Traitement multilingue de textes | ||
* Digital Technologies Research Centre / Centre de recherche en technologies numériques | ||
* National Research Council Canada / Conseil national de recherches Canada | ||
* Copyright 2019, Her Majesty in Right of Canada / | ||
* Copyright 2019, Sa Majeste la Reine du Chef du Canada | ||
*/ | ||
|
||
#include "lexsim.h" | ||
|
||
#include <iostream> | ||
#include <set> | ||
|
||
using namespace std; | ||
using namespace yisi; | ||
|
||
int main(int argc, char* argv[]) | ||
{ | ||
string inpembpath = argv[1]; | ||
string hypembpath = argv[2]; | ||
string inpmappath = argv[3]; | ||
string inpdocpath = argv[4]; | ||
ofstream INPMAP; | ||
ofstream INPDOC; | ||
open_ofstream(INPMAP, inpmappath); | ||
|
||
map<string, vector<double> > inpemb; | ||
map<string, vector<double> > hypemb; | ||
map<string, vector<double> > inpfilemb; | ||
int dim; | ||
read_binw2v(inpembpath, inpemb, dim); | ||
read_binw2v(hypembpath, hypemb, dim); | ||
vector<string> inpsents = read_file(inpdocpath); | ||
//filter the inp emb according to the inp doc | ||
set<string> tokens; | ||
for (auto it = inpsents.begin(); it != inpsents.end(); it++) { | ||
auto sent = tokenize(*it); | ||
tokens.insert(sent.begin(), sent.end()); | ||
} | ||
for (auto it = tokens.begin(); it != tokens.end(); it++) { | ||
auto jt = inpemb.find(*it); | ||
if (jt != inpemb.end()) { | ||
inpfilemb[*it] = jt->second; | ||
} | ||
} | ||
|
||
string maxsim_str; | ||
double maxsim_scr=0.0; | ||
for (auto it = inpfilemb.begin(); it != inpfilemb.end(); it++) { | ||
auto inp_s = it->first; | ||
auto inp_v = it->second; | ||
for (auto jt = hypemb.begin(); jt != hypemb.end(); jt++) { | ||
auto hyp_s = jt->first; | ||
auto hyp_v = jt->second; | ||
double sim = 0.0; | ||
for (int i = 0; i < dim; i++) { | ||
sim += inp_v[i] * hyp_v[i]; | ||
} | ||
if (sim > maxsim_scr) { | ||
maxsim_str = hyp_s; | ||
maxsim_scr = sim; | ||
} | ||
} | ||
INPMAP << inp_s << "\t" << maxsim_str << endl; | ||
maxsim_scr = 0.0; | ||
} | ||
return 0; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
/** | ||
* @file w2v_test.cpp | ||
* @brief Unit test for w2v lexsim. | ||
* | ||
* @author Jackie Lo | ||
* | ||
* Multilingual Text Processing / Traitement multilingue de textes | ||
* Digital Technologies Research Centre / Centre de recherche en technologies numériques | ||
* National Research Council Canada / Conseil national de recherches Canada | ||
* Copyright 2019, Her Majesty in Right of Canada / | ||
* Copyright 2019, Sa Majeste la Reine du Chef du Canada | ||
*/ | ||
|
||
#include "util.h" | ||
|
||
#include <iostream> | ||
#include <vector> | ||
#include <set> | ||
#include <string> | ||
|
||
using namespace std; | ||
using namespace yisi; | ||
|
||
int main(int argc, char* argv[]) | ||
{ | ||
set<string> result; | ||
while (!cin.eof()) { | ||
string line; | ||
cin >> line; | ||
auto tokens = tokenize(line); | ||
auto ngrams = collect_ngram(atoi(argv[1]), tokens); | ||
for (auto it = ngrams.begin(); it != ngrams.end(); it++) { | ||
auto ngram = join(*it); | ||
result.insert(ngram); | ||
} | ||
} | ||
for (auto it = result.begin(); it != result.end(); it++) { | ||
cout << *it << endl; | ||
} | ||
|
||
return 0; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
/** | ||
* @file w2v_test.cpp | ||
* @brief Unit test for w2v lexsim. | ||
* | ||
* @author Jackie Lo | ||
* | ||
* Multilingual Text Processing / Traitement multilingue de textes | ||
* Digital Technologies Research Centre / Centre de recherche en technologies numériques | ||
* National Research Council Canada / Conseil national de recherches Canada | ||
* Copyright 2019, Her Majesty in Right of Canada / | ||
* Copyright 2019, Sa Majeste la Reine du Chef du Canada | ||
*/ | ||
|
||
#include "lexsim.h" | ||
|
||
#include <iostream> | ||
|
||
using namespace std; | ||
using namespace yisi; | ||
|
||
int main(int argc, char* argv[]) | ||
{ | ||
lexsim_t w2vtxt("w2v", argv[1], "cosine"); | ||
string sent; | ||
|
||
while(!cin.eof()){ | ||
getline(cin, sent); | ||
//cout << sent << endl; | ||
auto tokens = tokenize(sent); | ||
for (auto it = tokens.begin(); it != tokens.end(); it++){ | ||
if ((w2vtxt.get_wv(*it,HYP_MODE)).size() == 0){ | ||
cout << *it << endl; | ||
} | ||
} | ||
} | ||
|
||
return 0; | ||
} | ||
|
Oops, something went wrong.