Skip to content

Commit 39f68cd

Browse files
committed
Write script to compute pretrained word percentage
1 parent 20544a4 commit 39f68cd

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

scripts/word_percentage.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/usr/bin/env python
2+
3+
from argparse import ArgumentParser
4+
5+
from nltk.tree import Tree
6+
7+
8+
def read_training_words(path):
    """Return the set of distinct leaf tokens in a file of bracketed trees.

    Every non-blank line of *path* is parsed with ``Tree.fromstring`` and its
    leaves are collected. Blank lines hold no tree to parse, so they are
    skipped instead of being handed to the parser.
    """
    words = set()
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            words.update(Tree.fromstring(line).leaves())
    return words


def read_pretrained_words(path):
    """Return the set of words listed in a text embedding file.

    The first line of *path* is skipped; for each remaining line the first
    whitespace-separated token is taken as the word. Blank lines are ignored
    and an empty file yields an empty set.
    """
    words = set()
    with open(path) as f:
        # Skip first line; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.split()
            if fields:  # a blank line has no first token to index
                words.add(fields[0])
    return words


def compute_coverage(train_words, pretrained_words):
    """Return ``(shared, rate)`` for two word sets.

    *shared* is the intersection of both sets and *rate* is
    ``len(shared) / len(train_words)``. With no training words the rate is
    reported as ``0.0`` rather than dividing by zero.
    """
    shared = train_words.intersection(pretrained_words)
    rate = len(shared) / len(train_words) if train_words else 0.0
    return shared, rate


if __name__ == '__main__':
    parser = ArgumentParser(
        description=('Find the percentage of words in training data that '
                     'also exist in pretrained embedding'))
    parser.add_argument('train', help='path to training data, one parsed sentence per line')
    parser.add_argument('pretrained', help='path to pretrained embedding file')
    args = parser.parse_args()

    train_words = read_training_words(args.train)
    pretrained_words = read_pretrained_words(args.pretrained)
    pre_words_in_training, pre_words_rate = compute_coverage(
        train_words, pretrained_words)

    print(f'Number of pretrained words: {len(pretrained_words)}')
    print(f'Number of pretrained words in training: {len(pre_words_in_training)}')
    print(f'Number of word in training: {len(train_words)}')
    print(f'Percentage of pretrained words in training: {pre_words_rate:.2%}')

0 commit comments

Comments
 (0)