File tree 1 file changed +36
-0
lines changed
1 file changed +36
-0
lines changed Original file line number Diff line number Diff line change
1
+ #!/usr/bin/env python
2
+
3
+ from argparse import ArgumentParser
4
+
5
+ from nltk .tree import Tree
6
+
7
+
8
def load_training_words(path):
    """Return the set of distinct leaf tokens in the parsed sentences at *path*.

    Every non-blank line is parsed with ``Tree.fromstring`` and its leaves are
    collected. Blank lines (e.g. a trailing newline at the end of the file)
    are skipped instead of being handed to the parser.
    """
    words = set()
    # Explicit encoding so the result does not depend on the platform locale.
    # NOTE(review): assumes the training file is UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            words.update(Tree.fromstring(line).leaves())
    return words


def load_pretrained_words(path):
    """Return the set of words listed in the embedding file at *path*.

    The first line is always skipped (presumably a header line -- confirm for
    the embedding format in use). For every following line the first
    whitespace-separated field is taken as the word. Blank lines are ignored,
    and an empty file yields an empty set.
    """
    words = set()
    # NOTE(review): assumes the embedding file is UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        next(f, None)  # skip first line; the default avoids StopIteration on an empty file
        for line in f:
            fields = line.split()
            if fields:  # a blank line has no first field to read
                words.add(fields[0])
    return words


def compute_coverage(train_words, pretrained_words):
    """Return ``(shared, rate)`` for the two vocabularies.

    ``shared`` is the set of words present in both arguments and ``rate`` is
    ``len(shared) / len(train_words)``. When *train_words* is empty the rate
    is reported as ``0.0`` rather than dividing by zero.
    """
    shared = train_words.intersection(pretrained_words)
    rate = len(shared) / len(train_words) if train_words else 0.0
    return shared, rate


def main():
    """Parse the command line and print vocabulary coverage statistics."""
    parser = ArgumentParser(
        description=('Find the percentage of words in training data that '
                     'also exist in pretrained embedding'))
    parser.add_argument('train', help='path to training data, one parsed sentence per line')
    parser.add_argument('pretrained', help='path to pretrained embedding file')
    args = parser.parse_args()

    train_words = load_training_words(args.train)
    pretrained_words = load_pretrained_words(args.pretrained)
    pre_words_in_training, pre_words_rate = compute_coverage(train_words, pretrained_words)

    print(f'Number of pretrained words: {len(pretrained_words)}')
    print(f'Number of pretrained words in training: {len(pre_words_in_training)}')
    print(f'Number of word in training: {len(train_words)}')
    # The rate is the share of training words that also occur in the embedding file.
    print(f'Percentage of pretrained words in training: {pre_words_rate:.2%}')


if __name__ == '__main__':
    main()
You can’t perform that action at this time.
0 commit comments