some major refactor - curating the ArcHybrid object
1 parent c5f3447 · commit acbd1a3
Showing 7 changed files with 421 additions and 325 deletions.
@@ -0,0 +1,58 @@
import dynet as dy


class BiLSTM(object):
    def __init__(self, in_dim, out_dim, model, dropout_rate=None):
        self.dropout_rate = dropout_rate
        # one forward and one backward single-layer LSTM builder
        self.surfaceBuilders = [dy.VanillaLSTMBuilder(1, in_dim, out_dim, model),
                                dy.VanillaLSTMBuilder(1, in_dim, out_dim, model)]

    def set_token_vecs(self, sequence, dropout):
        """
        Get the forward and backward vectors of the tokens in a sequence
        and concatenate them.
        Each token object's .vec attribute is updated in place.
        @param sequence: a list of objects with a .vec attribute holding a vector
        @param dropout: a boolean; dropout is applied only when True
        """
        if dropout and self.dropout_rate is not None:
            self.surfaceBuilders[0].set_dropout(self.dropout_rate)
            self.surfaceBuilders[1].set_dropout(self.dropout_rate)
        else:
            self.surfaceBuilders[0].set_dropout(0)
            self.surfaceBuilders[1].set_dropout(0)

        forward = self.surfaceBuilders[0].initial_state()
        backward = self.surfaceBuilders[1].initial_state()

        for ftoken, rtoken in zip(sequence, reversed(sequence)):
            forward = forward.add_input(ftoken.vec)
            backward = backward.add_input(rtoken.vec)
            ftoken.fvec = forward.output()
            rtoken.bvec = backward.output()

        for token in sequence:
            token.vec = dy.concatenate([token.fvec, token.bvec])

    def get_sequence_vector(self, sequence, dropout):
        """
        Pass a sequence of vectors through the BiLSTM and return a single
        sequence vector: the concatenated final forward and backward states.
        @param sequence: a list of vectors
        @param dropout: a boolean; dropout is applied only when True
        """
        if dropout and self.dropout_rate is not None:
            self.surfaceBuilders[0].set_dropout(self.dropout_rate)
            self.surfaceBuilders[1].set_dropout(self.dropout_rate)
        else:
            self.surfaceBuilders[0].set_dropout(0)
            self.surfaceBuilders[1].set_dropout(0)

        forward = self.surfaceBuilders[0].initial_state()
        backward = self.surfaceBuilders[1].initial_state()

        for ftoken, rtoken in zip(sequence, reversed(sequence)):
            forward = forward.add_input(ftoken)
            backward = backward.add_input(rtoken)

        return dy.concatenate([forward.output(), backward.output()])
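A minimal usage sketch (not part of the commit), assuming the class above lives in bilstm.py; the Token class, dimensions, and values below are made up for illustration, and only the .vec attribute is required by set_token_vecs:

import dynet as dy
from bilstm import BiLSTM

class Token(object):
    # hypothetical stand-in for the parser's token objects: only .vec is needed
    def __init__(self, vec):
        self.vec = vec

model = dy.Model()
bilstm = BiLSTM(in_dim=4, out_dim=3, model=model, dropout_rate=0.33)

dy.renew_cg()
sentence = [Token(dy.inputVector([0.1] * 4)) for _ in range(5)]
bilstm.set_token_vecs(sentence, dropout=False)
# each token.vec is now the concatenation of forward and backward states (2 * out_dim = 6)
print sentence[0].vec.dim()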
@@ -0,0 +1,129 @@
from bilstm import BiLSTM
import codecs
import dynet as dy
import random


class FeatureExtractor(object):
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()}  # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1 for ind, char in enumerate(ch)}  # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs
        if langs:
            self.langs = {lang: ind + 1 for ind, lang in enumerate(langs)}  # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)

        lstm_input_size = (self.word_emb_size +
                           (self.edim if self.external_embedding is not None else 0) +
                           (self.lang_emb_size if self.multiling else 0) +
                           2 * self.char_lstm_output_size)

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, self.model,
                                  dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                                  self.lstm_output_size, self.model,
                                  dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)

        self.char_bilstm = BiLSTM(self.char_emb_size,
                                  self.char_lstm_output_size, self.model,
                                  dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))

        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters((self.char_lstm_output_size * 2))

    def Init(self):
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                  dy.concatenate(filter(None, [paddingWordVec,
                                                               evec,
                                                               self.chPadding.expr(),
                                                               paddingLangVec])) +
                                  self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train):
        for root in sentence:
            wordcount = float(self.wordsCount.get(root.norm, 0))
            noDropFlag = not train or (random.random() < (wordcount / (0.25 + wordcount)))
            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            self.get_char_vector(root, train)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            if self.multiling:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            root.vec = dy.concatenate(filter(None, [root.wordvec,
                                                    root.evec,
                                                    root.chVec,
                                                    root.langvec]))
        if not self.disableBilstm:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train):
        if root.form == "*root*":  # no point running a character analysis over this placeholder token
            root.chVec = self.chPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.form:
                char_vecs.append(self.clookup[self.chars.get(char, 0)])
            root.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def get_external_embeddings(self, external_embedding_file):
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')
        external_embedding_fp.readline()  # skip the header line of the embedding file
        self.external_embedding = {}
        for line in external_embedding_fp:
            line = line.strip().split()
            self.external_embedding[line[0]] = [float(f) for f in line[1:]]

        external_embedding_fp.close()

        self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)]  # ???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print 'Load external embedding. Vector dimensions', self.edim
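A minimal construction sketch (not part of the commit), assuming the class above lives in feature_extractor.py. The options object normally comes from the parser's command-line flags; argparse.Namespace is used as a stand-in here and every value below is illustrative:

import argparse
import dynet as dy
from feature_extractor import FeatureExtractor

options = argparse.Namespace(
    disable_bilstm=False, use_lembed=False, multiling=False,
    lstm_output_size=125, char_lstm_output_size=100,
    word_emb_size=100, char_emb_size=24, lang_emb_size=12,
    external_embedding=None)

words = {'the': 5, 'cat': 2}   # word frequency counts
w2i = {'the': 0, 'cat': 1}     # word-to-index map
rels = ['det', 'nsubj']
ch = set('thecat')             # character inventory

model = dy.Model()
extractor = FeatureExtractor(model, options, words, rels, None, w2i, ch, nnvecs=2)

dy.renew_cg()
extractor.Init()               # builds the padding vectors for the current computation graph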
@@ -0,0 +1,53 @@
import dynet as dy


class MLP(object):
    def __init__(self, model, name, in_dim, hid_dim, hid2_dim, out_dim, activation=dy.tanh):
        self.name = name
        self._W1 = model.add_parameters((hid_dim, in_dim), name='W1' + self.name)
        self._b1 = model.add_parameters(hid_dim, name='b1' + self.name)
        self.has_2_layers = False
        if hid2_dim > 0:
            self.has_2_layers = True
            self._W12 = model.add_parameters((hid_dim, hid_dim), name='W12' + self.name)
            self._b12 = model.add_parameters((hid_dim), name='b12' + self.name)
        self._W2 = model.add_parameters((out_dim, hid_dim), name='W2' + self.name)
        self._b2 = model.add_parameters(out_dim, name='b2' + self.name)
        # TODO: I think I've tried using it but maybe try again
        self.useDropout = False
        self.activation = activation

    def hid_layer(self, x, dropout):
        if dropout:
            W = dy.dropout(dy.parameter(self._W1), 0.3)
            b = dy.dropout(dy.parameter(self._b1), 0.3)
        else:
            W = dy.parameter(self._W1)
            b = dy.parameter(self._b1)
        return self.activation(W * x + b)

    def hid_2_layer(self, x, dropout):
        if dropout:
            W = dy.dropout(dy.parameter(self._W12), 0.3)
            b = dy.dropout(dy.parameter(self._b12), 0.3)
        else:
            W = dy.parameter(self._W12)
            b = dy.parameter(self._b12)
        return self.activation(W * x + b)

    def out_layer(self, x, dropout):
        if dropout:
            W = dy.dropout(dy.parameter(self._W2), 0.3)
            b = dy.dropout(dy.parameter(self._b2), 0.3)
        else:
            W = dy.parameter(self._W2)
            b = dy.parameter(self._b2)
        return W * x + b

    def __call__(self, x):
        h = self.hid_layer(x, self.useDropout)
        if self.has_2_layers:
            h2 = self.hid_2_layer(h, self.useDropout)
            h = h2
        return self.out_layer(h, self.useDropout)
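A minimal usage sketch (not part of the commit), assuming the class above lives in mlp.py; the dimensions and the 'action' name are illustrative only:

import dynet as dy
from mlp import MLP

model = dy.Model()
# hid2_dim=0 means the optional second hidden layer is skipped
mlp = MLP(model, 'action', in_dim=100, hid_dim=50, hid2_dim=0, out_dim=3)

dy.renew_cg()
x = dy.inputVector([0.0] * 100)
scores = mlp(x)
print scores.npvalue().shape   # (3,)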