File tree Expand file tree Collapse file tree 1 file changed +36
-0
lines changed Expand file tree Collapse file tree 1 file changed +36
-0
lines changed Original file line number Diff line number Diff line change 1+ #!/usr/bin/env python
2+
3+ from argparse import ArgumentParser
4+
5+ from nltk .tree import Tree
6+
7+
def read_train_words(path):
    """Return the set of distinct leaf tokens found in a treebank file.

    Each non-blank line of *path* is parsed with ``Tree.fromstring`` and the
    tree's leaves are collected. Blank lines (for example a trailing empty
    line) are skipped instead of being handed to the parser.

    Args:
        path: Path to the training data, one parsed sentence per line.

    Returns:
        A set with every distinct leaf string seen in the file.
    """
    words = set()
    # Explicit encoding so the vocabulary does not depend on the locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            words.update(Tree.fromstring(line).leaves())
    return words


def read_pretrained_words(path):
    """Return the set of words listed in a pretrained embedding file.

    The first line is always skipped (presumably a header line; the file
    format is not verified here). For every following non-blank line, the
    first whitespace-separated field is taken as the word.

    Args:
        path: Path to the pretrained embedding file.

    Returns:
        A set of the words found; empty if the file has no entries.
    """
    words = set()
    with open(path, encoding='utf-8') as f:
        # The default avoids StopIteration on a completely empty file.
        next(f, None)  # skip first line
        for line in f:
            fields = line.split()
            if fields:  # ignore blank lines instead of raising IndexError
                words.add(fields[0])
    return words


def coverage(train_words, pretrained_words):
    """Compute which training words are covered by the pretrained vocabulary.

    Args:
        train_words: Set of words seen in the training data.
        pretrained_words: Set of words present in the embedding file.

    Returns:
        A ``(common, rate)`` tuple where ``common`` is the intersection of
        the two sets and ``rate`` is ``len(common) / len(train_words)``.
        ``rate`` is ``0.0`` when ``train_words`` is empty, so an empty
        training file no longer causes a ``ZeroDivisionError``.
    """
    common = train_words.intersection(pretrained_words)
    rate = len(common) / len(train_words) if train_words else 0.0
    return common, rate


def main():
    """Parse the command line and print the coverage statistics."""
    parser = ArgumentParser(
        description=('Find the percentage of words in training data that '
                     'also exist in pretrained embedding'))
    parser.add_argument(
        'train', help='path to training data, one parsed sentence per line')
    parser.add_argument('pretrained', help='path to pretrained embedding file')
    args = parser.parse_args()

    train_words = read_train_words(args.train)
    pretrained_words = read_pretrained_words(args.pretrained)
    pre_words_in_training, pre_words_rate = coverage(
        train_words, pretrained_words)

    print(f'Number of pretrained words: {len(pretrained_words)} ')
    print(f'Number of pretrained words in training: {len(pre_words_in_training)} ')
    print(f'Number of word in training: {len(train_words)} ')
    print(f'Percentage of pretrained words in training: {pre_words_rate:.2%} ')


if __name__ == '__main__':
    main()
You can’t perform that action at this time.
0 commit comments