some major refactor - curating the ArcHybrid object
mdelhoneux committed Mar 8, 2018
1 parent c5f3447 commit acbd1a3
Showing 7 changed files with 421 additions and 325 deletions.
263 changes: 38 additions & 225 deletions barchybrid/src/arc_hybrid.py

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions barchybrid/src/bilstm.py
@@ -0,0 +1,58 @@
import dynet as dy

class BiLSTM(object):
def __init__(self,in_dim,out_dim,model,dropout_rate=None):
self.dropout_rate = dropout_rate
self.surfaceBuilders = [dy.VanillaLSTMBuilder(1, in_dim, out_dim, model),
dy.VanillaLSTMBuilder(1, in_dim, out_dim, model)]

def set_token_vecs(self,sequence,dropout):
"""
Get the forward and backward vectors of tokens in a sequence
and concatenate them
The token objects have a .vec attribute which gets updated
@param: sequence is a list of objects that have a .vec attribute which
is a vector
"""
if dropout and self.dropout_rate is not None:
self.surfaceBuilders[0].set_dropout(self.dropout_rate)
self.surfaceBuilders[1].set_dropout(self.dropout_rate)
else:
self.surfaceBuilders[0].set_dropout(0)
self.surfaceBuilders[1].set_dropout(0)


forward = self.surfaceBuilders[0].initial_state()
backward = self.surfaceBuilders[1].initial_state()

for ftoken, rtoken in zip(sequence, reversed(sequence)):
forward = forward.add_input( ftoken.vec )
backward = backward.add_input( rtoken.vec )
ftoken.fvec = forward.output()
rtoken.bvec = backward.output()

for token in sequence:
token.vec = dy.concatenate( [token.fvec, token.bvec] )

def get_sequence_vector(self,sequence,dropout):
"""
Pass a sequence of vectors through the BiLSTM. Return the sequence
vector.
@param: sequence is a list of vectors
dropout is a boolean
"""
        if dropout and self.dropout_rate is not None: # guard as in set_token_vecs; set_dropout(None) would fail
self.surfaceBuilders[0].set_dropout(self.dropout_rate)
self.surfaceBuilders[1].set_dropout(self.dropout_rate)
else:
self.surfaceBuilders[0].set_dropout(0)
self.surfaceBuilders[1].set_dropout(0)
forward = self.surfaceBuilders[0].initial_state()
backward = self.surfaceBuilders[1].initial_state()

for ftoken, rtoken in zip(sequence, reversed(sequence)):
forward = forward.add_input( ftoken )
backward = backward.add_input( rtoken )

return dy.concatenate([forward.output(), backward.output()])
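
A minimal usage sketch of both entry points (illustration only, not part of the commit; assumes a recent DyNet where dy.ParameterCollection is available, that this is run from barchybrid/src, and uses made-up 100-dimensional inputs — the Token class is a hypothetical stand-in for the parser's token objects):

import dynet as dy
from bilstm import BiLSTM

class Token(object): # hypothetical stand-in for the parser's token objects
    def __init__(self, vec):
        self.vec = vec

model = dy.ParameterCollection()
bilstm = BiLSTM(100, 125, model, dropout_rate=0.33)

dy.renew_cg()
# token level: each token.vec becomes [forward; backward], i.e. 2 * out_dim = 250
sentence = [Token(dy.inputVector([0.1] * 100)) for _ in range(5)]
bilstm.set_token_vecs(sentence, dropout=False)
assert sentence[0].vec.dim()[0][0] == 250

# sequence level: one vector for a whole sequence (used for the characters of a word)
char_vecs = [dy.inputVector([0.1] * 100) for _ in range(4)]
word_vec = bilstm.get_sequence_vector(char_vecs, dropout=False)
assert word_vec.dim()[0][0] == 250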

129 changes: 129 additions & 0 deletions barchybrid/src/feature_extractor.py
@@ -0,0 +1,129 @@
from bilstm import BiLSTM
import codecs # needed by get_external_embeddings below
import dynet as dy
import random

class FeatureExtractor(object):
def __init__(self,model,options,words,rels,langs,w2i,ch,nnvecs):
self.model = model
self.disableBilstm = options.disable_bilstm
self.multiling = options.use_lembed and options.multiling
self.lstm_output_size = options.lstm_output_size
self.char_lstm_output_size = options.char_lstm_output_size
self.word_emb_size = options.word_emb_size
self.char_emb_size = options.char_emb_size
self.lang_emb_size = options.lang_emb_size
self.wordsCount = words
self.vocab = {word: ind+2 for word, ind in w2i.iteritems()} # +2 for MLP padding vector and OOV vector
self.chars = {char: ind+1 for ind, char in enumerate(ch)} # +1 for OOV vector
self.rels = {word: ind for ind, word in enumerate(rels)}
self.nnvecs = nnvecs
if langs:
self.langs = {lang: ind+1 for ind, lang in enumerate(langs)} # +1 for padding vector
else:
self.langs = None
self.irels = rels
self.external_embedding = None
if options.external_embedding is not None:
self.get_external_embeddings(options.external_embedding)

        lstm_input_size = (self.word_emb_size
                           + (self.edim if self.external_embedding is not None else 0)
                           + (self.lang_emb_size if self.multiling else 0)
                           + 2 * self.char_lstm_output_size)

if not self.disableBilstm:
self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, self.model,
dropout_rate=0.33)
self.bilstm2 = BiLSTM(2* self.lstm_output_size,
self.lstm_output_size, self.model,
dropout_rate=0.33)
else:
self.lstm_output_size = int(lstm_input_size * 0.5)

self.char_bilstm = BiLSTM(self.char_emb_size,
self.char_lstm_output_size, self.model,
dropout_rate=0.33)

self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
if self.multiling and self.lang_emb_size > 0:
self.langslookup = self.model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))

#used in the PaddingVec
self.word2lstm = self.model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
self.word2lstmbias = self.model.add_parameters((self.lstm_output_size *2))
self.chPadding = self.model.add_parameters((self.char_lstm_output_size *2))

def Init(self):
evec = self.elookup[1] if self.external_embedding is not None else None
paddingWordVec = self.wlookup[1]
paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                  dy.concatenate(filter(None, [paddingWordVec,
                                                               evec,
                                                               self.chPadding.expr(),
                                                               paddingLangVec])) +
                                  self.word2lstmbias.expr())
self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

def getWordEmbeddings(self, sentence, train):
for root in sentence:
wordcount = float(self.wordsCount.get(root.norm, 0))
noDropFlag = not train or (random.random() < (wordcount/(0.25+wordcount)))
root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
self.get_char_vector(root,train)

if self.external_embedding is not None:
if not noDropFlag and random.random() < 0.5:
root.evec = self.elookup[0]
elif root.form in self.external_embedding:
root.evec = self.elookup[self.extrnd[root.form]]
elif root.norm in self.external_embedding:
root.evec = self.elookup[self.extrnd[root.norm]]
else:
root.evec = self.elookup[0]
else:
root.evec = None

if self.multiling:
root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
else:
root.langvec = None

root.vec = dy.concatenate(filter(None, [root.wordvec,
root.evec,
root.chVec,
root.langvec]))
if not self.disableBilstm:
self.bilstm1.set_token_vecs(sentence,train)
self.bilstm2.set_token_vecs(sentence,train)

def get_char_vector(self,root,train):
if root.form == "*root*": # no point running a character analysis over this placeholder token
root.chVec = self.chPadding.expr() # use the padding vector if it's the root token
else:
char_vecs = []
for char in root.form:
char_vecs.append(self.clookup[self.chars.get(char,0)])
root.chVec = self.char_bilstm.get_sequence_vector(char_vecs,train)


def get_external_embeddings(self,external_embedding_file):
external_embedding_fp = codecs.open(external_embedding_file,'r',encoding='utf-8')
external_embedding_fp.readline()
self.external_embedding = {}
for line in external_embedding_fp:
line = line.strip().split()
self.external_embedding[line[0]] = [float(f) for f in line[1:]]

external_embedding_fp.close()

self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)] # zero-vector fallback; appears unused here
self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
for word, i in self.extrnd.iteritems():
self.elookup.init_row(i, self.external_embedding[word])
self.extrnd['*PAD*'] = 1
self.extrnd['*INITIAL*'] = 2

        print 'Loaded external embeddings. Vector dimension:', self.edim
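
The word dropout in getWordEmbeddings keeps a word's own embedding with probability count/(0.25+count) and otherwise substitutes the OOV vector, so rare words are frequently dropped during training while frequent ones almost never are. A standalone sketch of the keep probabilities (illustration only; keep_prob is my own helper name, not part of the commit):

def keep_prob(count, alpha=0.25):
    # chance that a word seen `count` times in training keeps its own embedding
    return count / (alpha + count)

for c in [1.0, 2.0, 10.0, 100.0]:
    print c, round(keep_prob(c), 3)
# prints: 1.0 0.8 / 2.0 0.889 / 10.0 0.976 / 100.0 0.998
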
53 changes: 53 additions & 0 deletions barchybrid/src/multilayer_perceptron.py
@@ -0,0 +1,53 @@
import dynet as dy

class MLP(object):
def __init__(self, model, name, in_dim, hid_dim, hid2_dim, out_dim, activation=dy.tanh):
self.name=name
self._W1 = model.add_parameters((hid_dim, in_dim), name='W1'+self.name)
self._b1 = model.add_parameters(hid_dim,name='b1'+self.name)
self.has_2_layers = False
if hid2_dim > 0:
self.has_2_layers = True
            # note: the second layer is hid_dim x hid_dim; hid2_dim only toggles its presence
            self._W12 = model.add_parameters((hid_dim, hid_dim), name='W12' + self.name)
            self._b12 = model.add_parameters(hid_dim, name='b12' + self.name)
self._W2 = model.add_parameters((out_dim, hid_dim),name='W2'+self.name)
self._b2 = model.add_parameters(out_dim,name='b2'+self.name)
        #TODO: dropout has been tried here before; maybe try it again
self.useDropout=False
self.activation = activation

def hid_layer(self,x,dropout):
if dropout:
W = dy.dropout(dy.parameter(self._W1),0.3)
b = dy.dropout(dy.parameter(self._b1),0.3)
else:
W = dy.parameter(self._W1)
b = dy.parameter(self._b1)
return self.activation(W*x+b)

def hid_2_layer(self,x,dropout):
if dropout:
W = dy.dropout(dy.parameter(self._W12),0.3)
b = dy.dropout(dy.parameter(self._b12),0.3)
else:
W = dy.parameter(self._W12)
b = dy.parameter(self._b12)
return self.activation(W*x+b)

def out_layer(self,x,dropout):
if dropout:
W = dy.dropout(dy.parameter(self._W2),0.3)
b = dy.dropout(dy.parameter(self._b2),0.3)
else:
W = dy.parameter(self._W2)
b = dy.parameter(self._b2)
return (W*x+b)

def __call__(self,x):
h = self.hid_layer(x,self.useDropout)
if self.has_2_layers:
h2 = self.hid_2_layer(h,self.useDropout)
h = h2
return self.out_layer(h,self.useDropout)
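
A minimal usage sketch (illustration only; the 'scorer' name and all dimensions are made up, and it assumes a dy.ParameterCollection as the model argument):

import dynet as dy
from multilayer_perceptron import MLP

model = dy.ParameterCollection()
# hid2_dim > 0 enables the second hidden layer; its actual size is hid_dim x hid_dim
mlp = MLP(model, 'scorer', 250, 100, 100, 3)

dy.renew_cg()
x = dy.inputVector([0.0] * 250)
scores = mlp(x) # hid_layer -> hid_2_layer -> out_layer, no dropout by default
assert scores.dim()[0][0] == 3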

24 changes: 13 additions & 11 deletions barchybrid/src/options_manager.py
@@ -18,7 +18,7 @@ def __init__(self,options):
         if options.include and not options.datadir:
             raise Exception("You need to specify the data dir to include UD languages")
 
-        if not options.predictFlag:
+        if not options.predict:
             if not options.include and not options.trainfile:
                 raise Exception("If not using the --include option, you must specify your training data with --trainfile")
         else:
@@ -33,7 +33,7 @@ def __init__(self,options):
                 print "Creating output directory " + options.outdir
                 os.mkdir(options.outdir)
 
-        if not options.predictFlag and not (options.rlFlag or options.rlMostFlag or options.headFlag):
+        if not options.predict and not (options.rlFlag or options.rlMostFlag or options.headFlag):
             raise Exception("Must include either head, rl or rlmost (For example, if you specified --disable-head and --disable-rlmost, you must specify --userl)")
 
         if options.rlFlag and options.rlMostFlag:
@@ -47,6 +47,7 @@ def __init__(self,options):
 
         options.multi_monoling = False # set default
         self.iterations = 1 # set default
+        self.conllu = True # default
 
         if not options.include: # must specify train explicitly
             treebank = utils.Treebank(options.trainfile, \
@@ -55,7 +56,7 @@ def __init__(self,options):
             treebank.outdir = options.outdir
             treebank.modeldir = options.modeldir
             # just one model specified by train/dev and/or test
-            if options.predictFlag:
+            if options.predict:
                 if not options.testfile:
                     raise Exception("--testfile must be specified")
                 elif not os.path.exists(options.testfile):
@@ -90,7 +91,7 @@ def __init__(self,options):
                     print "Warning: skipping invalid language code " + lang
 
         if options.multiling:
-            if options.predictFlag:
+            if options.predict:
                 model = "%s/%s"%(options.modeldir,options.model)
                 if not os.path.exists(model): # in the multilingual case the model must be found in the first language specified
                     raise Exception("Model not found. Path tried: %s"%model)
@@ -112,13 +113,13 @@ def __init__(self,options):
                     print ("Warning: language-specific subdirectory " + language.outdir
                            + " already exists, contents may be overwritten")
 
-                if not options.predictFlag:
+                if not options.predict:
                     self.prepareDev(language,options)
 
                 if options.debug: # it is important that prepareDev be called before createDebugData
                     self.createDebugData(language,options)
 
-            if options.predictFlag and options.multi_monoling:
+            if options.predict and options.multi_monoling:
                 language.modeldir = "%s/%s"%(options.modeldir,language.iso_id)
                 model = "%s/%s"%(language.modeldir,options.model)
                 if not os.path.exists(model): # in the multilingual case the model must be found in the first language specified
@@ -173,21 +174,22 @@ def prepareDev(self,treebank,options):
     # if the debug option is set, we read in the training, dev and test files as appropriate,
     # cap the number of sentences and store new files with these smaller data sets
     def createDebugData(self,treebank,options):
+        ext = '.conllu' if self.conllu else '.conll'
         print 'Creating smaller data sets for debugging'
-        if not options.predictFlag:
+        if not options.predict:
             traindata = list(utils.read_conll(treebank.trainfile,treebank.iso_id,maxSize=options.debug_train_sents,hard_lim=True))
-            train_file = os.path.join(treebank.outdir,'train-debug' + '.conllu') # location for the new train file
+            train_file = os.path.join(treebank.outdir,'train-debug' + ext) # location for the new train file
             utils.write_conll(train_file,traindata) # write the new train data to file
             treebank.trainfile = train_file
-            if treebank.devfile and os.path.exists(treebank.devfile):
+            if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
                 devdata = list(utils.read_conll(treebank.devfile,treebank.iso_id,maxSize=options.debug_dev_sents,hard_lim=True))
-                dev_file = os.path.join(treebank.outdir,'dev-debug' + '.conllu') # location for the new dev file
+                dev_file = os.path.join(treebank.outdir,'dev-debug' + ext) # location for the new dev file
                 utils.write_conll(dev_file,devdata) # write the new dev data to file
                 treebank.dev_gold = dev_file
                 treebank.devfile = dev_file
         else:
             testdata = list(utils.read_conll(treebank.testfile,treebank.iso_id,maxSize=options.debug_test_sents,hard_lim=True))
-            test_file = os.path.join(treebank.outdir,'test-debug' + '.conllu') # location for the new test file
+            test_file = os.path.join(treebank.outdir,'test-debug' + ext) # location for the new test file
             utils.write_conll(test_file,testdata) # write the new test data to file
             treebank.test_gold = test_file
             treebank.testfile = test_file
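
The read-cap-rewrite pattern in createDebugData, in isolation (illustration only; cap_data is a hypothetical helper, and the utils.read_conll and utils.write_conll signatures are taken from the diff above):

import os
import utils

def cap_data(infile, outdir, name, iso_id, max_sents, ext='.conllu'):
    # read at most max_sents sentences, then write them to a smaller debug file
    data = list(utils.read_conll(infile, iso_id, maxSize=max_sents, hard_lim=True))
    outfile = os.path.join(outdir, name + '-debug' + ext)
    utils.write_conll(outfile, data)
    return outfile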