
Commit

Add POS NER WordNet Hypernyms features
Diego999 committed May 14, 2017
1 parent e4e7819 commit 4b27769
Showing 146 changed files with 29,249 additions and 16 deletions.
12 binary files not shown.
300 changes: 292 additions & 8 deletions src/dataset.py

Large diffs are not rendered by default.
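The dataset.py diff (292 added lines) is not rendered, so the feature-extraction code itself is not visible here. Going only by the commit title and the fact that the pipeline already relies on spaCy for tokenization, the new per-token features are presumably produced along the following lines. This is an illustrative sketch only: the helper name extract_extra_features, the use of spaCy for POS and NER tags, and the use of NLTK for WordNet hypernyms are assumptions, not the code in this commit.

import spacy
from nltk.corpus import wordnet as wn  # requires nltk.download('wordnet') once

nlp = spacy.load('en')  # 2017-era spaCy English model, matching the spacylanguage setting used below

def extract_extra_features(sentence):
    """Return parallel lists of POS tags, NER tags and WordNet hypernyms for a raw sentence (hypothetical helper)."""
    doc = nlp(sentence)
    pos_tags = [token.tag_ for token in doc]              # fine-grained POS tag, e.g. 'NN'
    ner_tags = [token.ent_type_ or 'O' for token in doc]  # entity type, 'O' when the token is not part of an entity
    hypernyms = []
    for token in doc:
        synsets = wn.synsets(token.text)
        if synsets and synsets[0].hypernyms():
            hypernyms.append(synsets[0].hypernyms()[0].name())  # e.g. 'canine.n.02' for 'dog'
        else:
            hypernyms.append('O')
    return pos_tags, ner_tags, hypernyms

Whatever the actual implementation, dataset.py must also build vocabularies for the new features and expose pos_size, ner_size and wn_size, since the entity_lstm.py changes below read those attributes when creating the embedding matrices.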

63 changes: 62 additions & 1 deletion src/entity_lstm.py
@@ -64,10 +64,28 @@ def __init__(self, dataset, parameters):

# Placeholders for input, output and dropout
self.input_token_indices = tf.placeholder(tf.int32, [None], name="input_token_indices")
# POS
if parameters['use_pos']:
self.input_pos_indices = tf.placeholder(tf.int32, [None], name="input_pos_indices")
# NER
if parameters['use_ner']:
self.input_ner_indices = tf.placeholder(tf.int32, [None], name="input_ner_indices")
# WN
if parameters['use_wn']:
self.input_wn_indices = tf.placeholder(tf.int32, [None], name="input_wn_indices")
self.input_label_indices_vector = tf.placeholder(tf.float32, [None, dataset.number_of_classes], name="input_label_indices_vector")
self.input_label_indices_flat = tf.placeholder(tf.int32, [None], name="input_label_indices_flat")
self.input_token_character_indices = tf.placeholder(tf.int32, [None, None], name="input_token_indices")
self.input_token_lengths = tf.placeholder(tf.int32, [None], name="input_token_lengths")
# POS
if parameters['use_pos']:
self.input_pos_lengths = tf.placeholder(tf.int32, [None], name="input_pos_lengths")
# NER
if parameters['use_ner']:
self.input_ner_lengths = tf.placeholder(tf.int32, [None], name="input_ner_lengths")
# WN
if parameters['use_wn']:
self.input_wn_lengths = tf.placeholder(tf.int32, [None], name="input_wn_lengths")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

# Internal parameters
@@ -104,11 +122,54 @@ def __init__(self, dataset, parameters):
embedded_tokens = tf.nn.embedding_lookup(self.token_embedding_weights, self.input_token_indices)
utils_tf.variable_summaries(self.token_embedding_weights)

# POS embedding layer
if parameters['use_pos']:
with tf.variable_scope("pos_embedding"):
self.pos_embedding_weights = tf.get_variable(
"pos_embedding_weights",
shape=[dataset.pos_size, parameters['pos_embedding_dimension']],
initializer=initializer,
trainable=not parameters['freeze_pos_embeddings'])
embedded_poss = tf.nn.embedding_lookup(self.pos_embedding_weights, self.input_pos_indices)
utils_tf.variable_summaries(self.pos_embedding_weights)

# NER embedding layer
if parameters['use_ner']:
with tf.variable_scope("ner_embedding"):
self.ner_embedding_weights = tf.get_variable(
"ner_embedding_weights",
shape=[dataset.ner_size, parameters['ner_embedding_dimension']],
initializer=initializer,
trainable=not parameters['freeze_ner_embeddings'])
embedded_ners = tf.nn.embedding_lookup(self.ner_embedding_weights, self.input_ner_indices)
utils_tf.variable_summaries(self.ner_embedding_weights)

# WN embedding layer
if parameters['use_wn']:
with tf.variable_scope("wn_embedding"):
self.wn_embedding_weights = tf.get_variable(
"wn_embedding_weights",
shape=[dataset.wn_size, parameters['wn_embedding_dimension']],
initializer=initializer,
trainable=not parameters['freeze_wn_embeddings'])
embedded_wns = tf.nn.embedding_lookup(self.wn_embedding_weights, self.input_wn_indices)
utils_tf.variable_summaries(self.wn_embedding_weights)

# Concatenate character LSTM outputs and token embeddings
if parameters['use_character_lstm']:
with tf.variable_scope("concatenate_token_and_character_vectors"):
if self.verbose: print('embedded_tokens: {0}'.format(embedded_tokens))
token_lstm_input = tf.concat([character_lstm_output, embedded_tokens], axis=1, name='token_lstm_input')
temp = []
# POS
if parameters['use_pos']:
temp.append(embedded_poss)
# NER
if parameters['use_ner']:
temp.append(embedded_ners)
# WN
if parameters['use_wn']:
temp.append(embedded_wns)
token_lstm_input = tf.concat([character_lstm_output, embedded_tokens] + temp, axis=1, name='token_lstm_input')
if self.verbose: print("token_lstm_input: {0}".format(token_lstm_input))
else:
token_lstm_input = embedded_tokens
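The new placeholders and embedding lookups above only take effect if the corresponding index sequences are fed at run time; that feed step is not part of this diff. A minimal sketch of what the per-sentence feed_dict would presumably look like, assuming dataset.py exposes pos_indices, ner_indices and wn_indices alongside NeuroNER's usual token structures:

# Hypothetical feed step; the actual training-loop changes are not shown in this commit view.
feed_dict = {
    model.input_token_indices: dataset.token_indices['train'][sequence_number],
    model.input_token_character_indices: dataset.character_indices_padded['train'][sequence_number],
    model.input_token_lengths: dataset.token_lengths['train'][sequence_number],
    model.input_label_indices_vector: dataset.label_vector_indices['train'][sequence_number],
    model.input_label_indices_flat: dataset.label_indices['train'][sequence_number],
    model.dropout_keep_prob: 1 - parameters['dropout_rate'],
}
if parameters['use_pos']:
    feed_dict[model.input_pos_indices] = dataset.pos_indices['train'][sequence_number]  # assumed attribute
if parameters['use_ner']:
    feed_dict[model.input_ner_indices] = dataset.ner_indices['train'][sequence_number]  # assumed attribute
if parameters['use_wn']:
    feed_dict[model.input_wn_indices] = dataset.wn_indices['train'][sequence_number]    # assumed attribute

With the default dimensions in the configuration below (a 25-unit character LSTM per direction, 100-dimensional token embeddings, and 25 dimensions each for POS, NER and WordNet) and assuming NeuroNER's usual bidirectional character LSTM, the concatenated token_lstm_input grows from 150 to 225 features per token when all three options are enabled.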
5 changes: 2 additions & 3 deletions src/main.py
@@ -40,7 +40,6 @@
import warnings
warnings.filterwarnings('ignore')


def load_parameters(parameters_filepath=os.path.join('.','parameters.ini'), verbose=True):
'''
Load parameters from the ini file, and ensure that each parameter is cast to the correct type
@@ -57,12 +56,12 @@ def load_parameters(parameters_filepath=os.path.join('.','parameters.ini'), verbose=True):
v = random.choice(v.split(','))
parameters[k] = v
# Ensure that each parameter is cast to the correct type
if k in ['character_embedding_dimension','character_lstm_hidden_state_dimension','token_embedding_dimension',
if k in ['pos_embedding_dimension', 'ner_embedding_dimension', 'wn_embedding_dimension', 'character_embedding_dimension','character_lstm_hidden_state_dimension','token_embedding_dimension',
'token_lstm_hidden_state_dimension','patience','maximum_number_of_epochs','maximum_training_time','number_of_cpu_threads','number_of_gpus']:
parameters[k] = int(v)
elif k in ['dropout_rate', 'learning_rate', 'gradient_clipping_value']:
parameters[k] = float(v)
elif k in ['evaluate_aspect', 'remap_unknown_tokens_to_unk', 'use_character_lstm', 'use_crf', 'train_model', 'use_pretrained_model', 'debug', 'verbose',
elif k in ['use_pos', 'use_ner', 'use_wn', 'evaluate_aspect', 'remap_unknown_tokens_to_unk', 'remap_unknown_pos_to_unk', 'remap_unknown_ner_to_unk', 'remap_unknown_wn_to_unk', 'use_character_lstm', 'use_crf', 'train_model', 'use_pretrained_model', 'debug', 'verbose',
'reload_character_embeddings', 'reload_character_lstm', 'reload_token_embeddings', 'reload_token_lstm', 'reload_feedforward', 'reload_crf',
'check_for_lowercase', 'check_for_digits_replaced_with_zeros', 'freeze_token_embeddings', 'load_only_pretrained_token_embeddings']:
parameters[k] = distutils.util.strtobool(v)
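The main.py change simply extends the two casting lists so the new options get the right types: the *_embedding_dimension values are cast to int, and the use_* and remap_*_to_unk flags go through distutils.util.strtobool. A small reminder of what that casting does (illustrative values, not taken from the repository):

import distutils.util

raw = {'use_pos': 'True', 'pos_embedding_dimension': '25', 'dropout_rate': '0.5'}  # strings from configparser
use_pos = distutils.util.strtobool(raw['use_pos'])    # -> 1 (an int, not a bool, but truthy)
pos_dim = int(raw['pos_embedding_dimension'])         # -> 25
dropout_rate = float(raw['dropout_rate'])             # -> 0.5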
15 changes: 15 additions & 0 deletions src/parameters.ini
@@ -38,6 +38,15 @@ use_character_lstm = True
character_embedding_dimension = 25
character_lstm_hidden_state_dimension = 25

use_pos = True
pos_embedding_dimension = 25

use_ner = True
ner_embedding_dimension = 25

use_wn = True
wn_embedding_dimension = 25

# In order to use random initialization instead, set token_pretrained_embedding_filepath to empty string, as below:
# token_pretrained_embedding_filepath =
token_pretrained_embedding_filepath = ../data/word_vectors/glove.6B.100d.txt
@@ -89,6 +98,9 @@ spacylanguage = en

# If remap_unknown_tokens_to_unk is set to True, any token that appears in neither the training set nor the pre-trained token embeddings is mapped to UNK.
remap_unknown_tokens_to_unk = True
remap_unknown_pos_to_unk = True
remap_unknown_ner_to_unk = True
remap_unknown_wn_to_unk = True

# If load_only_pretrained_token_embeddings is set to True, then token embeddings will only be loaded if they exist in token_pretrained_embedding_filepath
# or in pretrained_model_checkpoint_filepath, even for the training set.
@@ -107,6 +119,9 @@ check_for_digits_replaced_with_zeros = True

# If freeze_token_embeddings is set to True, token embeddings will remain frozen (not be trained).
freeze_token_embeddings = False
freeze_pos_embeddings = False
freeze_ner_embeddings = False
freeze_wn_embeddings = False

# If debug is set to True, only 200 lines will be loaded for each split of the dataset.
debug = False
139 changes: 139 additions & 0 deletions src/parameters_Laptop.ini
@@ -0,0 +1,139 @@

#----- Possible modes of operation -----------------------------------------------------------------------------------------------------------------#
# training mode (from scratch): set train_model to True, and use_pretrained_model to False (if training from scratch).                    #
# Must have train and valid sets in the dataset_text_folder, and test and deployment sets are optional. #
# training mode (from pretrained model): set train_model to True, and use_pretrained_model to True (if training from a pretrained model). #
# Must have train and valid sets in the dataset_text_folder, and test and deployment sets are optional. #
# prediction mode (using pretrained model): set train_model to False, and use_pretrained_model to True.                                   #
# Must have either a test set or a deployment set. #
# NOTE: Whenever use_pretrained_model is set to True, pretrained_model_folder must be set to the folder containing the pretrained model to use, and #
# model.ckpt, dataset.pickle and parameters.ini must exist in the same folder as the checkpoint file. #
#---------------------------------------------------------------------------------------------------------------------------------------------------#

[mode]
# At least one of use_pretrained_model and train_model must be set to True.
train_model = True
use_pretrained_model = False
pretrained_model_folder = ../trained_models/conll_2003_en

[dataset]
dataset_text_folder = ../data/SemEval-2014/Laptop
ref_file = ../data/SemEval-2014/Scorer/ref_Laptop_train.xml
eval_type = Laptop
scorer = ../data/SemEval-2014/Scorer/eval.jar
#../data/SemEval-2014/Laptop
#../data/conll2003/en

# main_evaluation_mode should be either 'conll', 'bio', 'token', or 'binary'. ('conll' is entity-based)
# It determines which metric to use for early stopping, displaying during training, and plotting F1-score vs. epoch.
main_evaluation_mode = binary
evaluate_aspect = True

#---------------------------------------------------------------------------------------------------------------------#
# The parameters below are for advanced users. Their default values should yield good performance in most cases. #
#---------------------------------------------------------------------------------------------------------------------#

[ann]
use_character_lstm = True
character_embedding_dimension = 25
character_lstm_hidden_state_dimension = 25

use_pos = True
pos_embedding_dimension = 25

use_ner = True
ner_embedding_dimension = 25

use_wn = True
wn_embedding_dimension = 25

# In order to use random initialization instead, set token_pretrained_embedding_filepath to empty string, as below:
# token_pretrained_embedding_filepath =
token_pretrained_embedding_filepath = ../data/word_vectors/glove.6B.100d.txt
token_embedding_dimension = 100
token_lstm_hidden_state_dimension = 100

use_crf = True

[training]
patience = 10
maximum_number_of_epochs = 100

# optimizer should be either 'sgd', 'adam', or 'adadelta'
optimizer = sgd
learning_rate = 0.005
# gradients will be clipped above |gradient_clipping_value| and below -|gradient_clipping_value|, if gradient_clipping_value is non-zero
# (set to 0 to disable gradient clipping)
gradient_clipping_value = 5.0

# dropout_rate should be between 0 and 1
dropout_rate = 0.5

# Upper bound on the number of CPU threads NeuroNER will use
number_of_cpu_threads = 4

# Upper bound on the number of GPUs NeuroNER will use
# If number_of_gpus > 0, you need to have installed tensorflow-gpu
number_of_gpus = 0

[advanced]
experiment_name = test

# tagging_format should be either 'bioes' or 'bio'
tagging_format = bioes

# tokenizer should be either 'spacy' or 'stanford'. The tokenizer is only used when the original data is provided only in BRAT format.
# - 'spacy' refers to spaCy (https://spacy.io). To install spaCy: pip install -U spacy
# - 'stanford' refers to Stanford CoreNLP (https://stanfordnlp.github.io/CoreNLP/). Stanford CoreNLP is written in Java: to use it one has to start a
# Stanford CoreNLP server, which can tokenize sentences provided on the fly. Stanford CoreNLP is portable, which means that it can be run
# without any installation.
# To download Stanford CoreNLP: https://stanfordnlp.github.io/CoreNLP/download.html
# To run Stanford CoreNLP, execute in the terminal: `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 50000`
# By default Stanford CoreNLP is in English. To use it in other languages, see: https://stanfordnlp.github.io/CoreNLP/human-languages.html
# Stanford CoreNLP 3.6.0 and higher requires Java 8. We have tested NeuroNER with Stanford CoreNLP 3.6.0.
tokenizer = spacy
# spacylanguage should be either 'de' (German), 'en' (English) or 'fr' (French). (https://spacy.io/docs/api/language-models)
# To install the spaCy language: `python -m spacy.de.download`; or `python -m spacy.en.download`; or `python -m spacy.fr.download`
spacylanguage = en

# If remap_unknown_tokens_to_unk is set to True, any token that appears in neither the training set nor the pre-trained token embeddings is mapped to UNK.
remap_unknown_tokens_to_unk = True
remap_unknown_pos_to_unk = True
remap_unknown_ner_to_unk = True
remap_unknown_wn_to_unk = True

# If load_only_pretrained_token_embeddings is set to True, then token embeddings will only be loaded if they exist in token_pretrained_embedding_filepath
# or in pretrained_model_checkpoint_filepath, even for the training set.
load_only_pretrained_token_embeddings = False

# If check_for_lowercase is set to True, the lowercased version of each token will also be checked when loading the pretrained embeddings.
# For example, if the token 'Boston' does not exist in the pretrained embeddings, then it is mapped to the embedding of its lowercased version 'boston',
# if it exists among the pretrained embeddings.
check_for_lowercase = True

# If check_for_digits_replaced_with_zeros is set to True, each token with digits replaced with zeros will also be checked when loading pretrained embeddings.
# For example, if the token '123-456-7890' does not exist in the pretrained embeddings, then it is mapped to the embedding of '000-000-0000',
# if it exists among the pretrained embeddings.
# If both check_for_lowercase and check_for_digits_replaced_with_zeros are set to True, then the lowercased version is checked before the digit-zeroed version.
check_for_digits_replaced_with_zeros = True

# If freeze_token_embeddings is set to True, token embeddings will remain frozen (not be trained).
freeze_token_embeddings = False
freeze_pos_embeddings = False
freeze_ner_embeddings = False
freeze_wn_embeddings = False

# If debug is set to True, only 200 lines will be loaded for each split of the dataset.
debug = False
verbose = False

# plot_format specifies the format of the plots generated by NeuroNER. It should be either 'png' or 'pdf'.
plot_format = pdf

# specify which layers to reload from the pretrained model
reload_character_embeddings = True
reload_character_lstm = True
reload_token_embeddings = True
reload_token_lstm = True
reload_feedforward = True
reload_crf = True
23 changes: 19 additions & 4 deletions src/random_search.py
@@ -9,14 +9,17 @@ def draw_val(array):
file_exists = True

while file_exists:
embedding_set = [('glove.6B.100d.txt', 100), ('santos.txt', 400), ('matteo.txt', 600)]
embedding_set = [('glove.6B.100d.txt', 100)]#, ('santos.txt', 400), ('matteo.txt', 600)]
token_pretrained_embedding_filepath, token_embedding_dimension = draw_val(embedding_set)

type_review_set = ['Restaurant']#, 'Laptop']
type_review_set = ['Restaurant', 'Laptop']
type_review = draw_val(type_review_set)

char_emb_dim = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
character_embedding_dimension = draw_val(char_emb_dim)
pos_embedding_dimension = draw_val(char_emb_dim)
ner_embedding_dimension = draw_val(char_emb_dim)
wn_embedding_dimension = draw_val(char_emb_dim)

char_hidden_dim = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400]
character_lstm_hidden_state_dimension = draw_val(char_hidden_dim)
@@ -30,10 +33,10 @@ def draw_val(array):
learning_rate_set = [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005]
learning_rate = draw_val(learning_rate_set)

dropout_rate_set = [0.5, 0.6, 0.7]
dropout_rate_set = [0.5, 0.6, 0.7, 0.8]
dropout_rate = draw_val(dropout_rate_set)

parameters = [token_pretrained_embedding_filepath, token_embedding_dimension, type_review, character_embedding_dimension, character_lstm_hidden_state_dimension, token_lstm_hidden_state_dimension, learning_rate, dropout_rate]
parameters = [token_pretrained_embedding_filepath, token_embedding_dimension, type_review, character_embedding_dimension, pos_embedding_dimension, ner_embedding_dimension, wn_embedding_dimension, character_lstm_hidden_state_dimension, token_lstm_hidden_state_dimension, learning_rate, dropout_rate]
filename = 'parameters_' + '_'.join([str(x) for x in parameters]) + '.ini'
file_exists = os.path.isfile(filename)

@@ -78,6 +81,12 @@ def draw_val(array):
fp.write("[ann]\n")
fp.write("use_character_lstm = True\n")
fp.write("character_embedding_dimension = " + str(character_embedding_dimension) + "\n")
fp.write("use_pos = True\n")
fp.write("pos_embedding_dimension = " + str(pos_embedding_dimension) + "\n")
fp.write("use_ner = True\n")
fp.write("ner_embedding_dimension = " + str(ner_embedding_dimension) + "\n")
fp.write("use_wn = True\n")
fp.write("wn_embedding_dimension = " + str(wn_embedding_dimension) + "\n")
fp.write("character_lstm_hidden_state_dimension = " + str(character_lstm_hidden_state_dimension) + "\n")
fp.write("\n")
fp.write("# In order to use random initialization instead, set token_pretrained_embedding_filepath to empty string, as below:\n")
@@ -131,6 +140,9 @@ def draw_val(array):
fp.write("\n")
fp.write("# If remap_unknown_tokens is set to True, map to UNK any token that hasn't been seen in neither the training set nor the pre-trained token embeddings.\n")
fp.write("remap_unknown_tokens_to_unk = True\n")
fp.write("remap_unknown_pos_to_unk = True\n")
fp.write("remap_unknown_ner_to_unk = True\n")
fp.write("remap_unknown_wn_to_unk = True\n")
fp.write("\n")
fp.write("# If load_only_pretrained_token_embeddings is set to True, then token embeddings will only be loaded if it exists in token_pretrained_embedding_filepath \n")
fp.write("# or in pretrained_model_checkpoint_filepath, even for the training set.\n")
@@ -149,6 +161,9 @@ def draw_val(array):
fp.write("\n")
fp.write("# If freeze_token_embeddings is set to True, token embedding will remain frozen (not be trained).\n")
fp.write("freeze_token_embeddings = False\n")
fp.write("freeze_pos_embeddings = False\n")
fp.write("freeze_ner_embeddings = False\n")
fp.write("freeze_wn_embeddings = False\n")
fp.write("\n")
fp.write("# If debug is set to True, only 200 lines will be loaded for each split of the dataset.\n")
fp.write("debug = False\n")
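random_search.py now samples the three new embedding dimensions from the same grid as character_embedding_dimension, writes them into the generated .ini file, and encodes every sampled value in the file name, so an already-existing file means that configuration has been tried before. draw_val itself is defined just above the shown hunk; judging from how it is used, it is presumably just a uniform random choice, roughly:

import random

def draw_val(array):
    # Presumed one-liner: pick one element of the candidate list uniformly at random.
    return random.choice(array)

# e.g. pos_embedding_dimension = draw_val([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])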
6 changes: 6 additions & 0 deletions src/run_lap.sh
@@ -0,0 +1,6 @@
#! /bin/bash

MAX=10000
for (( i = 0; i<MAX; i++)) ; {
python3 main.py parameters_Laptop.ini
}
6 changes: 6 additions & 0 deletions src/run_res.sh
@@ -0,0 +1,6 @@
#! /bin/bash

MAX=10000
for (( i = 0; i<MAX; i++)) ; {
python3 main.py parameters.ini
}
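run_lap.sh and run_res.sh simply rerun main.py up to 10,000 times against parameters_Laptop.ini and parameters.ini respectively. Since load_parameters draws one element at random from any comma-separated value (see the main.py excerpt above), restarting main.py in a loop presumably serves as a simple random hyperparameter search: a value such as pos_embedding_dimension = 10,25,50 in the .ini file would be resampled on every run. A minimal illustration of that sampling step (example values, not from the repository):

import random

v = '10,25,50'                    # value as read from the .ini file
v = random.choice(v.split(','))   # e.g. '25' on this particular run
pos_embedding_dimension = int(v)  # then cast by the block shown in main.py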