diff --git a/code/dropout.py b/code/dropout.py
new file mode 100644
index 00000000..a4ff9c58
--- /dev/null
+++ b/code/dropout.py
@@ -0,0 +1,337 @@
+from __future__ import print_function
+
+import numpy as np
+import os
+import sys
+import timeit
+import six.moves.cPickle as pickle
+import theano
+import theano.tensor as T
+import theano.tensor.shared_randomstreams
+import gzip
+from collections import OrderedDict
+from logistic_sgd import LogisticRegression, load_data
+from mlp import HiddenLayer
+
+
+def _dropsout(rng, layer, p):
+    """Drop each unit of `layer` with probability `p` (inverted dropout)."""
+    srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(1000))
+    # each mask entry is 1 with probability 1 - p and 0 with probability p
+    mask = srng.binomial(n=1, p=1 - p, size=layer.shape)
+    output = layer * T.cast(mask, theano.config.floatX)
+    # rescale the surviving units by 1 / (1 - p) so that the expected
+    # activation is unchanged and no rescaling is needed at test time
+    return output / (1 - p)
+
+
+class DropoutMLP(object):
+    """Multi-Layer Perceptron class with dropout
+
+    An implementation of a multilayer perceptron in which, during training,
+    units are dropped with the probabilities given in ``dropout_rates``.
+    """
+
+    def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out):
+        """Initialize the parameters for the multilayer perceptron
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.TensorType
+        :param input: symbolic variable that describes the input of the
+                      architecture (one minibatch)
+
+        :type n_in: int
+        :param n_in: number of input units, the dimension of the space in
+                     which the datapoints lie
+
+        :type n_hidden: int
+        :param n_hidden: number of hidden units
+
+        :type dropout_rates: list
+        :param dropout_rates: probabilities of dropping a unit, for the input
+                              layer and the hidden layer respectively
+
+        :type n_out: int
+        :param n_out: number of output units, the dimension of the space in
+                      which the labels lie
+
+        """
+
+        # drop out the input layer
+        inp_dropout_layer = _dropsout(rng, input, p=dropout_rates[0])
+
+        self.drop_layer = HiddenLayer(rng=rng,
+                                      input=inp_dropout_layer,
+                                      n_in=n_in, n_out=n_hidden,
+                                      activation=T.tanh)
+        self.drop_layer.output = _dropsout(rng, self.drop_layer.output,
+                                           p=dropout_rates[1])
+
+        # Since we are dealing with a one hidden layer MLP, this will translate
+        # into a HiddenLayer with a tanh activation function connected to the
+        # LogisticRegression layer; the activation function can be replaced by
+        # sigmoid or any other nonlinear function.
+        # This layer shares W and b with the dropout layer, so the dropout-free
+        # path used for validation and testing reuses the trained weights
+        # directly (no rescaling is needed because _dropsout already scales
+        # the activations during training).
+        self.hiddenLayer = HiddenLayer(
+            rng=rng,
+            input=input,
+            n_in=n_in,
+            n_out=n_hidden,
+            W=self.drop_layer.W,
+            b=self.drop_layer.b,
+            activation=T.tanh
+        )
+
+        self.drop_output_layer = LogisticRegression(
+            input=self.drop_layer.output,
+            n_in=n_hidden,
+            n_out=n_out)
+
+        # The logistic regression layer gets as input the hidden units
+        # of the hidden layer; it shares W and b with the dropout output layer
+        self.logRegressionLayer = LogisticRegression(
+            input=self.hiddenLayer.output,
+            n_in=n_hidden,
+            n_out=n_out,
+            W=self.drop_output_layer.W,
+            b=self.drop_output_layer.b,
+        )
+
+        self.drop_negative_log_likelihood = self.drop_output_layer.negative_log_likelihood
+        self.dropout_errors = self.drop_output_layer.errors
+
+        # negative log likelihood of the MLP is given by the negative
+        # log likelihood of the output of the model, computed in the
+        # logistic regression layer
+        self.negative_log_likelihood = (
+            self.logRegressionLayer.negative_log_likelihood
+        )
+        # same holds for the function computing the number of errors
+        self.errors = self.logRegressionLayer.errors
+
+        # the parameters of the model are the parameters of the two layers it
+        # is made out of
+        self.params = self.drop_layer.params + self.drop_output_layer.params
+        # end-snippet-3
+
+        # keep track of model input
+        self.input = input
+
+
+def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates=[0.2, 0.5],
+             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
+    """
+    Demonstrate stochastic gradient descent optimization for a multilayer
+    perceptron with dropout
+
+    This is demonstrated on MNIST.
+
+    :type learning_rate: float
+    :param learning_rate: learning rate used (factor for the stochastic
+                          gradient)
+
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+
+    :type dropout_rates: list
+    :param dropout_rates: probabilities of dropping a unit, for the input
+                          layer and the hidden layer respectively
+
+    """
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x, test_set_y = datasets[2]
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
+    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
+    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print('... building the model')
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()  # index to a [mini]batch
+    x = T.matrix('x')    # the data is presented as rasterized images
+    y = T.ivector('y')   # the labels are presented as 1D vector of
+                         # [int] labels
+
+    rng = np.random.RandomState(1234)
+
+    # construct the dropout MLP class
+    classifier = DropoutMLP(
+        rng=rng,
+        input=x,
+        n_in=28 * 28,
+        n_hidden=n_hidden,
+        dropout_rates=dropout_rates,
+        n_out=10
+    )
+
+    # start-snippet-4
+    # `cost` is the negative log likelihood of the dropout-free model and is
+    # only reported during training; `dropout_cost` is the negative log
+    # likelihood of the dropout model and is the quantity we actually
+    # minimize. Both are expressed here symbolically.
+    cost = (
+        classifier.negative_log_likelihood(y)
+    )
+    dropout_cost = classifier.drop_negative_log_likelihood(y)
+    # end-snippet-4
+
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(
+        inputs=[index],
+        outputs=classifier.errors(y),
+        givens={
+            x: test_set_x[index * batch_size:(index + 1) * batch_size],
+            y: test_set_y[index * batch_size:(index + 1) * batch_size]
+        }
+    )
+
+    validate_model = theano.function(
+        inputs=[index],
+        outputs=classifier.errors(y),
+        givens={
+            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
+            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
+        }
+    )
+
+    # start-snippet-5
+    # compute the gradient of the dropout cost with respect to theta (stored
+    # in params); the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        # the gradients are taken with respect to the dropout cost
+        gparam = T.grad(dropout_cost, param)
+        gparams.append(gparam)
+
+    # specify how to update the parameters of the model as a list of
+    # (variable, update expression) pairs
+
+    # given two lists of the same length, A = [a1, a2, a3, a4] and
+    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
+    # element is a pair formed from the two lists :
+    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
+    # Stochastic Gradient Descent (SGD) updates
+    updates = OrderedDict()
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate * gparam
+
+    # compiling a Theano function `train_model` that returns the cost, but
+    # at the same time updates the parameters of the model based on the rules
+    # defined in `updates`
+    train_model = theano.function(
+        inputs=[index],
+        outputs=cost,
+        updates=updates,
+        givens={
+            x: train_set_x[index * batch_size: (index + 1) * batch_size],
+            y: train_set_y[index * batch_size: (index + 1) * batch_size]
+        }
+    )
+    # end-snippet-5
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print('... training')
+
+    # early-stopping parameters
+    patience = 10000  # look at this many examples regardless
+    patience_increase = 2  # wait this much longer when a new best is
+                           # found
+    improvement_threshold = 0.995  # a relative improvement of this much is
+                                   # considered significant
+    validation_frequency = min(n_train_batches, patience // 2)
+                                  # go through this many
+                                  # minibatches before checking the network
+                                  # on the validation set; in this case we
+                                  # check every epoch
+
+    best_validation_loss = np.inf
+    best_iter = 0
+    test_score = 0.
+    start_time = timeit.default_timer()
+
+    epoch = 0
+    done_looping = False
+
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in range(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = (epoch - 1) * n_train_batches + minibatch_index
+
+            if (iter + 1) % validation_frequency == 0:
+                # compute zero-one loss on validation set
+                validation_losses = [validate_model(i) for i
+                                     in range(n_valid_batches)]
+                this_validation_loss = np.mean(validation_losses)
+
+                print(
+                    'epoch %i, minibatch %i/%i, validation error %f %%' %
+                    (
+                        epoch,
+                        minibatch_index + 1,
+                        n_train_batches,
+                        this_validation_loss * 100.
+                    )
+                )
+
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
+                    # improve patience if loss improvement is good enough
+                    if (
+                        this_validation_loss < best_validation_loss *
+                        improvement_threshold
+                    ):
+                        patience = max(patience, iter * patience_increase)
+
+                    best_validation_loss = this_validation_loss
+                    best_iter = iter
+
+                    # test it on the test set
+                    test_losses = [test_model(i) for i
+                                   in range(n_test_batches)]
+                    test_score = np.mean(test_losses)
+
+                    print(('     epoch %i, minibatch %i/%i, test error of '
+                           'best model %f %%') %
+                          (epoch, minibatch_index + 1, n_train_batches,
+                           test_score * 100.))
+
+            if patience <= iter:
+                done_looping = True
+                break
+
+    end_time = timeit.default_timer()
+    print(('Optimization complete. Best validation score of %f %% '
+           'obtained at iteration %i, with test performance %f %%') %
+          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
+    print(('The code for file ' +
+           os.path.split(__file__)[1] +
+           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
+
+
+if __name__ == '__main__':
+    test_mlp()
diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py
index 9f4427e7..0159798b 100644
--- a/code/logistic_sgd.py
+++ b/code/logistic_sgd.py
@@ -58,8 +58,10 @@ class LogisticRegression(object):
     determine a class membership probability.
     """
 
-    def __init__(self, input, n_in, n_out):
+    def __init__(self, input, n_in, n_out, W=None, b=None):
         """ Initialize the parameters of the logistic regression
+        Weight matrix W is of shape (n_in, n_out)
+        and the bias vector b is of shape (n_out,).
 
         :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
@@ -75,24 +77,23 @@ def __init__(self, input, n_in, n_out):
         """
         # start-snippet-1
-        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
-        self.W = theano.shared(
-            value=numpy.zeros(
-                (n_in, n_out),
-                dtype=theano.config.floatX
-            ),
-            name='W',
-            borrow=True
-        )
-        # initialize the biases b as a vector of n_out 0s
-        self.b = theano.shared(
-            value=numpy.zeros(
-                (n_out,),
-                dtype=theano.config.floatX
-            ),
-            name='b',
-            borrow=True
-        )
+        # initialize the weights W with zeros, as a matrix of shape
+        # (n_in, n_out), if the parameter W is None
+        if W is None:
+            self.W = theano.shared(
+                value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
+                name='W')
+        else:
+            self.W = W
+
+        # initialize the biases b as a vector of n_out 0s if the parameter b
+        # is None
+        if b is None:
+            self.b = theano.shared(
+                value=numpy.zeros((n_out,), dtype=theano.config.floatX),
+                name='b')
+        else:
+            self.b = b
 
         # symbolic expression for computing the matrix of class-membership
         # probabilities
diff --git a/doc/dropout.txt b/doc/dropout.txt
new file mode 100644
index 00000000..f8defb7f
--- /dev/null
+++ b/doc/dropout.txt
@@ -0,0 +1,41 @@
+.. index:: Dropout
+
+.. _dropout:
+
+Dropout
+=====================
+
+.. note::
+    This section assumes the reader has already read through :doc:`mlp`;
+    the model described here is based on the MLP model. Dropout reduces
+    overfitting by preventing complex co-adaptations of the units on the
+    training data.
+    Additionally, this section uses the following new Theano functions and
+    concepts: `T.cast`_.
+    If you intend to run the code on GPU also read `GPU`_.
+
+
+.. note::
+    The code for this section is available for download `here`_.
+
+.. _here: http://deeplearning.net/tutorial/code/dropout.py
+
+.. _T.cast: http://deeplearning.net/software/theano/library/tensor/basic.html#casting
+
+.. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
+
+
+The next architecture we are going to present using Theano is the
+single-hidden-layer Multi-Layer Perceptron (MLP) with 20% dropout applied
+to the input data and 50% dropout applied to the hidden layer.
+
+The Model
++++++++++
+
+This model is the same as the MLP, except that units are randomly dropped
+during training: before the first hidden layer, each input unit is dropped
+with a probability of 20%; each unit of the hidden layer's output, which
+serves as the input of the Logistic Regression layer, is then dropped with a
+probability of 50%. Dropping is done via the ``_dropsout`` method, which
+takes a random state, a layer and the probability ``p`` of dropping a unit.
+It multiplies the layer's output by a binary mask and scales the result by
+1/(1 - p), so the weights need no rescaling at test time.
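+
+As a minimal, self-contained sketch of this idea (the helper name
+``apply_dropout`` is used here only for illustration; the tutorial code
+implements the same logic in ``_dropsout``), the inverted-dropout mask can
+be built as follows:
+
+.. code-block:: python
+
+    import numpy
+    import theano
+    import theano.tensor as T
+    from theano.tensor.shared_randomstreams import RandomStreams
+
+    def apply_dropout(rng, layer, p):
+        """Drop each unit of `layer` with probability `p` (inverted dropout)."""
+        srng = RandomStreams(rng.randint(1000))
+        # each mask entry is 1 with probability 1 - p and 0 with probability p
+        mask = srng.binomial(n=1, p=1 - p, size=layer.shape)
+        # zero out the dropped units and rescale the survivors by 1 / (1 - p)
+        # so that the expected activation stays the same and the weights can
+        # be used unchanged at test time
+        return layer * T.cast(mask, theano.config.floatX) / (1 - p)
+
+    # usage sketch: drop 20% of the input units
+    rng = numpy.random.RandomState(1234)
+    x = T.matrix('x')
+    dropped_input = apply_dropout(rng, x, p=0.2)
+
+Because the surviving activations are already scaled by 1/(1 - p) during
+training, the dropout-free layers used for validation and testing can share
+the trained weights directly, as done in ``DropoutMLP``.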