Merge pull request #22 from LoicGrobol/logger
Use a proper logging mechanism
mdelhoneux authored Mar 30, 2020
2 parents 7cbe047 + 9f6d67a commit dc480e6
Showing 11 changed files with 284 additions and 143 deletions.
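Every file below follows the same conversions: %-style strings become f-strings, bare print calls become loguru calls at an appropriate level (info for user-facing messages, debug for diagnostics), and tqdm progress bars are added around the long training and parsing loops. As a minimal sketch of the resulting pattern (the function and variable names here are illustrative, not taken from this commit):

import sys

from loguru import logger
import tqdm

# loguru ships a preconfigured logger object, so modules only need the
# import; configuring the sink once at startup controls verbosity.
logger.remove()
logger.add(sys.stderr, level="INFO")

def process(sentences):
    # tqdm wraps any iterable and draws a progress bar on stderr.
    for sentence in tqdm.tqdm(sentences, desc="Processing", unit="sentences"):
        logger.debug(f"handling {sentence!r}")  # suppressed at INFO level

Because the default level filters debug messages out, the OOV counts and per-100-sentence loss reports below become opt-in diagnostics rather than unconditional output.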
15 changes: 7 additions & 8 deletions scripts/analysis_multimono.py
@@ -1,4 +1,3 @@
from __future__ import print_function
import numpy as np
import sys
from optparse import OptionParser
@@ -18,7 +17,7 @@ def main(options,args):
bas_runs = int(args[1]) # number of baseline experiments
exp_name = args[2] # name of new experiments
exp_runs = int(args[3]) # number of new experiments
print("Results for experiment: " + exp_name)
print(f"Results for experiment: {exp_name}")

langs = args[4:]
bas_means = np.zeros((len(langs),))
@@ -35,23 +34,23 @@ def main(options,args):

def get_lang_mean(exp_name, lang, no_runs, options):
if options.final_epochs:
print("%s: mean of last %i epochs from %i runs for %s: " %(exp_name[1], options.no_epochs, no_runs, lang),end='')
print(f"{exp_name[1]}: mean of last {options.no_epochs:d} epochs from {no_runs:d} runs for {lang}: ",end='')
else:
print("%s: mean of best %i epochs from %i runs for %s: " %(exp_name[1], options.no_epochs, no_runs, lang),end='')
print(f"{exp_name[1]}: mean of best {options.no_epochs:d} epochs from {no_runs:d} runs for {lang}: ",end='')
lang_means = np.zeros((no_runs,))
for ind in range(1,no_runs+1): # loop over other baseline experiments
if ind==1:
scores_file = "./%s/%s/%s_scores.txt"%(exp_name[0],lang,lang)
scores_file = f"./{exp_name[0]}/{lang}/{lang}_scores.txt"
else:
scores_file = "./%s-%i/%s/%s_scores.txt"%(exp_name[0],ind,lang,lang)
scores_file = f"./{exp_name[0]}-{ind:d}/{lang}/{lang}_scores.txt"
scores = np.loadtxt(scores_file)
if not options.final_epochs:
scores = np.sort(scores)
run_mean = np.mean(scores[-options.no_epochs:])
lang_means[ind-1] = run_mean
print("%.2f "%run_mean,end='')
print(f"{run_mean:.2f} ",end='')
lang_mean = np.mean(lang_means)
print("(%.2f)"%lang_mean)
print(f"({lang_mean:.2f})")
return lang_mean

if __name__ == "__main__":
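The changes in this file are purely mechanical %-formatting-to-f-string conversions; each format spec carries over unchanged. A quick illustrative check (values invented):

run_mean = 87.456
# Old %-style and new f-string formatting produce identical output:
assert "%.2f " % run_mean == f"{run_mean:.2f} "
assert "%i runs" % 5 == f"{5:d} runs"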
22 changes: 11 additions & 11 deletions scripts/pick_model.py
@@ -13,18 +13,18 @@
include_file = sys.argv[1]
trained_models_dir = sys.argv[2].strip("/")
#make sure there are no annoying spaces
print 'Removing leading and trailing spaces from ' + include_file
os.system("sed -i 's/\s*//g' %s"%include_file)
print 'Finding best iteration for each language and storing in best_epochs.txt'
cmd = './scripts/best_res.sh %s %s >best_epochs.txt'%(include_file, trained_models_dir)
print(f'Removing leading and trailing spaces from {include_file}')
os.system(f"sed -i 's/\\s*//g' {include_file}")
print('Finding best iteration for each language and storing in best_epochs.txt')
cmd = f'./scripts/best_res.sh {include_file} {trained_models_dir} >best_epochs.txt'
os.system(cmd)
d = {}
outdir = trained_models_dir
if len(sys.argv) == 4:
outdir = sys.argv[3]

if not os.path.exists(outdir):
print 'Creating directory ' + outdir
print(f'Creating directory {outdir}')
os.mkdir(outdir)
for line in open('best_epochs.txt','r'):
try:
@@ -35,20 +35,20 @@
except:
IndexError
lang = line.strip()
cmd = './scripts/get_last_epoch.sh %s %s'%(lang,trained_models_dir)
cmd = f'./scripts/get_last_epoch.sh {lang} {trained_models_dir}'
lastEpoch = os.popen(cmd)

for lang in d:
lpath = outdir + '/' + lang + '/'
if not os.path.exists(lpath):
print 'Creating directory ' + lpath
print(f'Creating directory {lpath}')
os.mkdir(lpath)
infile = trained_models_dir + '/' + lang + '/barchybrid.model' + str(d[lang])
outfile = lpath + 'barchybrid.model'
if os.path.exists(infile):
print 'Copying ' + infile + ' to ' + outfile
os.system('cp %s %s'%(infile,outfile))
print(f'Copying {infile} to {outfile}')
os.system(f'cp {infile} {outfile}')
if outdir != trained_models_dir:
paramfile = trained_models_dir + '/' + lang + '/params.pickle'
print 'Copying ' + paramfile + ' to ' + lpath
os.system('cp %s %s'%(paramfile,lpath))
print(f'Copying {paramfile} to {lpath}')
os.system(f'cp {paramfile} {lpath}')
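Beyond porting the Python 2 print statements, the sed call's pattern changes from '\s*' to '\\s*'. Both currently pass the same characters to the shell, because '\s' is not a recognised Python escape and is left intact, but unrecognised escapes are deprecated and will eventually become errors; doubling the backslash (or using a raw string) makes the intent explicit. A small sketch:

# Both literals contain the same two characters, backslash then 's':
explicit = "\\s*"   # as in the new code
raw = r"\s*"        # equivalent raw-string spelling
assert explicit == raw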
2 changes: 2 additions & 0 deletions setup.cfg
@@ -19,7 +19,7 @@ python_requires = >= 3.6
install_requires =
dynet
h5py
loguru
numpy
tqdm

[options.entry_points]
console_scripts =
78 changes: 54 additions & 24 deletions uuparser/arc_hybrid.py
@@ -6,6 +6,9 @@
from copy import deepcopy
from collections import defaultdict
import json
from loguru import logger

import tqdm

from uuparser import utils

@@ -118,11 +121,11 @@ def __evaluate(self, stack, buf, train):


def Save(self, filename):
print('Saving model to ' + filename)
logger.info(f'Saving model to {filename}')
self.model.save(filename)

def Load(self, filename):
print('Loading model from ' + filename)
logger.info(f'Loading model from {filename}')
self.model.populate(filename)


@@ -222,8 +225,7 @@ def Predict(self, treebanks, datasplit, options):
new_test_words = \
set(test_words) - self.feature_extractor.words.keys()

print("Number of OOV word types at test time: %i (out of %i)" %
(len(new_test_words), len(test_words)))
logger.debug(f"Number of OOV word types at test time: {len(new_test_words)} (out of {len(test_words)})")

if len(new_test_words) > 0:
# no point loading embeddings if there are no words to look for
@@ -236,15 +238,19 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["words"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["words"]:
print("External embeddings found for %i words "\
"(out of %i)" % \
(len(test_embeddings["words"]), len(new_test_words)))
logger.debug(
"External embeddings found for {0} words (out of {1})".format(
len(test_embeddings["words"]),
len(new_test_words),
),
)

if options.char_emb_size > 0:
new_test_chars = \
set(test_chars) - self.feature_extractor.chars.keys()
print("Number of OOV char types at test time: %i (out of %i)" %
(len(new_test_chars), len(test_chars)))
logger.debug(
f"Number of OOV char types at test time: {len(new_test_chars)} (out of {len(test_chars)})"
)

if len(new_test_chars) > 0:
for lang in test_langs:
@@ -257,12 +263,25 @@
)
test_embeddings["chars"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["chars"]:
print("External embeddings found for %i chars "\
"(out of %i)" % \
(len(test_embeddings["chars"]), len(new_test_chars)))
logger.debug(
"External embeddings found for {0} chars (out of {1})".format(
len(test_embeddings["chars"]),
len(new_test_chars),
),
)

data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
for iSentence, osentence in enumerate(data,1):

pbar = tqdm.tqdm(
data,
desc="Parsing",
unit="sentences",
mininterval=1.0,
leave=False,
disable=options.quiet,
)

for iSentence, osentence in enumerate(pbar,1):
sentence = deepcopy(osentence)
reached_swap_for_i_sentence = False
max_swap = 2*len(sentence)
@@ -288,7 +307,7 @@ def Predict(self, treebanks, datasplit, options):
if iSwap == max_swap and not reached_swap_for_i_sentence:
reached_max_swap += 1
reached_swap_for_i_sentence = True
print("reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence))
logger.debug(f"reached max swap in {reached_max_swap:d} out of {iSentence:d} sentences")
self.apply_transition(best,stack,buf,hoffset)
if best[1] == SWAP:
iSwap += 1
@@ -317,20 +336,31 @@ def Train(self, trainData, options):
start = time.time()

random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data
print("Length of training data: ", len(trainData))
logger.info(f"Length of training data: {len(trainData)}")

errs = []

self.feature_extractor.Init(options)

for iSentence, sentence in enumerate(trainData,1):
pbar = tqdm.tqdm(
trainData,
desc="Training",
unit="sentences",
mininterval=1.0,
leave=False,
disable=options.quiet,
)

for iSentence, sentence in enumerate(pbar,1):
if iSentence % 100 == 0:
loss_message = 'Processing sentence number: %d'%iSentence + \
' Loss: %.3f'%(eloss / etotal)+ \
' Errors: %.3f'%((float(eerrors)) / etotal)+\
' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
' Time: %.2gs'%(time.time()-start)
print(loss_message)
loss_message = (
f'Processing sentence number: {iSentence}'
f' Loss: {eloss / etotal:.3f}'
f' Errors: {eerrors / etotal:.3f}'
f' Labeled Errors: {lerrors / etotal:.3f}'
f' Time: {time.time()-start:.3f}s'
)
logger.debug(loss_message)
start = time.time()
eerrors = 0
eloss = 0.0
@@ -434,5 +464,5 @@ def Train(self, trainData, options):
dy.renew_cg()

self.trainer.update()
print("Loss: ", mloss/iSentence)
print("Total Training Time: %.2gs" % (time.time()-beg))
logger.info(f"Loss: {mloss/iSentence}")
logger.info(f"Total Training Time: {time.time()-beg:.2g}s")
4 changes: 2 additions & 2 deletions uuparser/chuliu_edmonds.py
@@ -158,7 +158,7 @@ def set_root(scores, root):
assert best_tree is not None
except:
with open('debug.log', 'w') as f:
f.write('{}: {}, {}\n'.format(tree, scores, roots_to_try))
f.write('{}: {}, {}, {}\n'.format(_tree, _scores, tree_probs, tree_score))
f.write(f'{tree}: {scores}, {roots_to_try}\n')
f.write(f'{_tree}: {_scores}, {tree_probs}, {tree_score}\n')
raise
return best_tree
9 changes: 5 additions & 4 deletions uuparser/elmo.py
@@ -4,11 +4,13 @@
import h5py
import numpy as np

from loguru import logger


class ELMo(object):

def __init__(self, elmo_file, gamma=1.0, learn_gamma=False):
print("Reading ELMo embeddings from '%s'" % elmo_file)
logger.info(f"Reading ELMo embeddings from '{elmo_file}'")
self.sentence_data = h5py.File(elmo_file, 'r')
self.weights = []

@@ -28,8 +30,7 @@ def get_sentence_representation(self, sentence):
sentence_index = self.sentence_to_index.get(sentence)
if not sentence_index:
raise ValueError(
"The sentence '%s' could not be found in the ELMo data."
% sentence
f"The sentence '{sentence}' could not be found in the ELMo data."
)

return ELMo.Sentence(self.sentence_data[sentence_index], self)
@@ -40,7 +41,7 @@ def init_weights(self, model):
scale=1.0)

if self.gamma is None:
print("ELMo: Learning gamma factor...")
logger.debug("ELMo: Learning gamma factor")
self.gamma = model.add_parameters(1, name="elmo-gamma", init=1.0)

class Sentence(object):
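For context, this class reads precomputed sentence representations from an HDF5 file. A sketch of inspecting such a file with h5py, assuming the layout produced by AllenNLP's ELMo embedder (numbered datasets per sentence plus a sentence_to_index dataset holding a JSON mapping; this layout is an assumption, not confirmed by the diff):

import json
import h5py

with h5py.File("elmo_embeddings.hdf5", "r") as f:
    # Assumed layout: f["sentence_to_index"] holds one JSON string that
    # maps each raw sentence to the name of its embedding dataset.
    sentence_to_index = json.loads(f["sentence_to_index"][0])
    index = sentence_to_index["A sample sentence ."]
    layers = f[index][...]  # typically shaped (n_layers, n_tokens, dim)
    print(layers.shape)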
18 changes: 11 additions & 7 deletions uuparser/feature_extractor.py
@@ -5,6 +5,8 @@
from collections import defaultdict
import re, os

from loguru import logger

from uuparser import utils

class FeatureExtractor(object):
@@ -106,7 +108,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
2 * (options.char_lstm_output_size
if options.char_emb_size > 0 else 0)
)
print("Word-level LSTM input size: " + str(self.lstm_input_size))
logger.debug(f"Word-level LSTM input size: {self.lstm_input_size}")

self.bilstms = []
if options.no_bilstms > 0:
@@ -226,19 +228,21 @@ def get_char_vector(self,root,train,test_embeddings_chars={}):
def init_lookups(self,options):

if self.external_embedding["words"]:
print('Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"]))
n_words = len(self.external_embedding["words"])
logger.info(f'Initialising {n_words} word vectors with external embeddings')
for word in self.external_embedding["words"]:
if len(self.external_embedding["words"][word]) != options.word_emb_size:
raise Exception("Size of external embedding does not match specified word embedding size of %s"%(options.word_emb_size))
raise Exception(f"Size of external embedding does not match specified word embedding size of {options.word_emb_size}")
self.word_lookup.init_row(self.words[word],self.external_embedding["words"][word])
elif options.word_emb_size > 0:
print('No word external embeddings found: all vectors initialised randomly')
logger.info('No word external embeddings found: all vectors initialised randomly')

if self.external_embedding["chars"]:
print('Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"]))
n_chars = len(self.external_embedding["chars"])
logger.info(f'Initialising {n_chars} char vectors with external embeddings')
for char in self.external_embedding["chars"]:
if len(self.external_embedding["chars"][char]) != options.char_emb_size:
raise Exception("Size of external embedding does not match specified char embedding size of %s"%(options.char_emb_size))
raise Exception(f"Size of external embedding does not match specified char embedding size of {options.char_emb_size}")
self.char_lookup.init_row(self.chars[char],self.external_embedding["chars"][char])
elif options.char_emb_size > 0:
print('No character external embeddings found: all vectors initialised randomly')
logger.info('No character external embeddings found: all vectors initialised randomly')
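The messages rewritten in init_lookups describe a common DyNet idiom: a LookupParameters table is created with random initialisation, and init_row then overwrites exactly those rows for which a pretrained vector exists. A minimal sketch with toy sizes and an invented vector:

import dynet as dy
import numpy as np

model = dy.ParameterCollection()
vocab = {"<unk>": 0, "parser": 1, "treebank": 2}
word_emb_size = 4

# All rows start out randomly initialised.
word_lookup = model.add_lookup_parameters((len(vocab), word_emb_size))

# Rows with an external embedding are overwritten in place.
external = {"parser": np.array([0.1, 0.2, 0.3, 0.4])}
for word, vec in external.items():
    assert len(vec) == word_emb_size  # mirrors the size check in the diff
    word_lookup.init_row(vocab[word], vec)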