diff --git a/code/dropout.py b/code/dropout.py
new file mode 100644
index 00000000..a4ff9c58
--- /dev/null
+++ b/code/dropout.py
@@ -0,0 +1,337 @@
+from __future__ import print_function
+
+import numpy as np
+import os
+import sys
+import timeit
+import six.moves.cPickle as pickle
+import theano
+import theano.tensor as T
+import theano.tensor.shared_randomstreams
+import gzip
+from collections import OrderedDict
+from logistic_sgd import LogisticRegression, load_data
+from mlp import HiddenLayer
+
+
+def _dropsout(rng, layer, p):
+    """Drop each unit of `layer` with probability `p` (inverted dropout)."""
+    srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(1000))
+    # each mask entry is 1 with probability 1 - p and 0 with probability p
+    mask = srng.binomial(n=1, p=1 - p, size=layer.shape)
+    output = layer * T.cast(mask, theano.config.floatX)
+    # rescale the surviving units by 1 / (1 - p) so that the expected
+    # activation is unchanged and no rescaling is needed at test time
+    return output / (1 - p)
+
+
+class DropoutMLP(object):
+    """Multi-Layer Perceptron class with dropout
+
+    An implementation of a multilayer perceptron in which, during training,
+    units are dropped with the probabilities given in ``dropout_rates``.
+    """
+
+    def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out):
+        """Initialize the parameters for the multilayer perceptron
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.TensorType
+        :param input: symbolic variable that describes the input of the
+                      architecture (one minibatch)
+
+        :type n_in: int
+        :param n_in: number of input units, the dimension of the space in
+                     which the datapoints lie
+
+        :type n_hidden: int
+        :param n_hidden: number of hidden units
+
+        :type dropout_rates: list
+        :param dropout_rates: probabilities of dropping a unit, for the input
+                              layer and the hidden layer respectively
+
+        :type n_out: int
+        :param n_out: number of output units, the dimension of the space in
+                      which the labels lie
+
+        """
+
+        # drop out the input layer
+        inp_dropout_layer = _dropsout(rng, input, p=dropout_rates[0])
+
+        self.drop_layer = HiddenLayer(rng=rng,
+                                      input=inp_dropout_layer,
+                                      n_in=n_in, n_out=n_hidden,
+                                      activation=T.tanh)
+        self.drop_layer.output = _dropsout(rng, self.drop_layer.output,
+                                           p=dropout_rates[1])
+
+        # Since we are dealing with a one hidden layer MLP, this will translate
+        # into a HiddenLayer with a tanh activation function connected to the
+        # LogisticRegression layer; the activation function can be replaced by
+        # sigmoid or any other nonlinear function.
+        # This layer shares W and b with the dropout layer, so the dropout-free
+        # path used for validation and testing reuses the trained weights
+        # directly (no rescaling is needed because _dropsout already scales
+        # the activations during training).
+        self.hiddenLayer = HiddenLayer(
+            rng=rng,
+            input=input,
+            n_in=n_in,
+            n_out=n_hidden,
+            W=self.drop_layer.W,
+            b=self.drop_layer.b,
+            activation=T.tanh
+        )
+
+        self.drop_output_layer = LogisticRegression(
+            input=self.drop_layer.output,
+            n_in=n_hidden,
+            n_out=n_out)
+
+        # The logistic regression layer gets as input the hidden units
+        # of the hidden layer; it shares W and b with the dropout output layer
+        self.logRegressionLayer = LogisticRegression(
+            input=self.hiddenLayer.output,
+            n_in=n_hidden,
+            n_out=n_out,
+            W=self.drop_output_layer.W,
+            b=self.drop_output_layer.b,
+        )
+
+        self.drop_negative_log_likelihood = self.drop_output_layer.negative_log_likelihood
+        self.dropout_errors = self.drop_output_layer.errors
+
+        # negative log likelihood of the MLP is given by the negative
+        # log likelihood of the output of the model, computed in the
+        # logistic regression layer
+        self.negative_log_likelihood = (
+            self.logRegressionLayer.negative_log_likelihood
+        )
+        # same holds for the function computing the number of errors
+        self.errors = self.logRegressionLayer.errors
+
+        # the parameters of the model are the parameters of the two layers it
+        # is made out of
+        self.params = self.drop_layer.params + self.drop_output_layer.params
+        # end-snippet-3
+
+        # keep track of model input
+        self.input = input
+
+
+def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates=[0.2, 0.5],
+             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
+    """
+    Demonstrate stochastic gradient descent optimization for a multilayer
+    perceptron with dropout
+
+    This is demonstrated on MNIST.
+
+    :type learning_rate: float
+    :param learning_rate: learning rate used (factor for the stochastic
+                          gradient)
+
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+
+    :type dropout_rates: list
+    :param dropout_rates: probabilities of dropping a unit, for the input
+                          layer and the hidden layer respectively
+
+    """
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x, test_set_y = datasets[2]
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
+    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
+    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print('... building the model')
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()  # index to a [mini]batch
+    x = T.matrix('x')    # the data is presented as rasterized images
+    y = T.ivector('y')   # the labels are presented as 1D vector of
+                         # [int] labels
+
+    rng = np.random.RandomState(1234)
+
+    # construct the dropout MLP class
+    classifier = DropoutMLP(
+        rng=rng,
+        input=x,
+        n_in=28 * 28,
+        n_hidden=n_hidden,
+        dropout_rates=dropout_rates,
+        n_out=10
+    )
+
+    # start-snippet-4
+    # `cost` is the negative log likelihood of the dropout-free model and is
+    # only reported during training; `dropout_cost` is the negative log
+    # likelihood of the dropout model and is the quantity we actually
+    # minimize. Both are expressed here symbolically.
+    cost = (
+        classifier.negative_log_likelihood(y)
+    )
+    dropout_cost = classifier.drop_negative_log_likelihood(y)
+    # end-snippet-4
+
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(
+        inputs=[index],
+        outputs=classifier.errors(y),
+        givens={
+            x: test_set_x[index * batch_size:(index + 1) * batch_size],
+            y: test_set_y[index * batch_size:(index + 1) * batch_size]
+        }
+    )
+
+    validate_model = theano.function(
+        inputs=[index],
+        outputs=classifier.errors(y),
+        givens={
+            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
+            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
+        }
+    )
+
+    # start-snippet-5
+    # compute the gradient of the dropout cost with respect to theta (stored
+    # in params); the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        # the gradients are taken with respect to the dropout cost
+        gparam = T.grad(dropout_cost, param)
+        gparams.append(gparam)
+
+    # specify how to update the parameters of the model as a list of
+    # (variable, update expression) pairs
+
+    # given two lists of the same length, A = [a1, a2, a3, a4] and
+    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
+    # element is a pair formed from the two lists :
+    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
+    # Stochastic Gradient Descent (SGD) updates
+    updates = OrderedDict()
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate * gparam
+
+    # compiling a Theano function `train_model` that returns the cost, but
+    # at the same time updates the parameters of the model based on the rules
+    # defined in `updates`
+    train_model = theano.function(
+        inputs=[index],
+        outputs=cost,
+        updates=updates,
+        givens={
+            x: train_set_x[index * batch_size: (index + 1) * batch_size],
+            y: train_set_y[index * batch_size: (index + 1) * batch_size]
+        }
+    )
+    # end-snippet-5
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print('... training')
+
+    # early-stopping parameters
+    patience = 10000  # look at this many examples regardless
+    patience_increase = 2  # wait this much longer when a new best is
+                           # found
+    improvement_threshold = 0.995  # a relative improvement of this much is
+                                   # considered significant
+    validation_frequency = min(n_train_batches, patience // 2)
+                                  # go through this many
+                                  # minibatches before checking the network
+                                  # on the validation set; in this case we
+                                  # check every epoch
+
+    best_validation_loss = np.inf
+    best_iter = 0
+    test_score = 0.
+    start_time = timeit.default_timer()
+
+    epoch = 0
+    done_looping = False
+
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in range(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = (epoch - 1) * n_train_batches + minibatch_index
+
+            if (iter + 1) % validation_frequency == 0:
+                # compute zero-one loss on validation set
+                validation_losses = [validate_model(i) for i
+                                     in range(n_valid_batches)]
+                this_validation_loss = np.mean(validation_losses)
+
+                print(
+                    'epoch %i, minibatch %i/%i, validation error %f %%' %
+                    (
+                        epoch,
+                        minibatch_index + 1,
+                        n_train_batches,
+                        this_validation_loss * 100.
+                    )
+                )
+
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
+                    # improve patience if loss improvement is good enough
+                    if (
+                        this_validation_loss < best_validation_loss *
+                        improvement_threshold
+                    ):
+                        patience = max(patience, iter * patience_increase)
+
+                    best_validation_loss = this_validation_loss
+                    best_iter = iter
+
+                    # test it on the test set
+                    test_losses = [test_model(i) for i
+                                   in range(n_test_batches)]
+                    test_score = np.mean(test_losses)
+
+                    print(('     epoch %i, minibatch %i/%i, test error of '
+                           'best model %f %%') %
+                          (epoch, minibatch_index + 1, n_train_batches,
+                           test_score * 100.))
+
+            if patience <= iter:
+                done_looping = True
+                break
+
+    end_time = timeit.default_timer()
+    print(('Optimization complete. Best validation score of %f %% '
+           'obtained at iteration %i, with test performance %f %%') %
+          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
+    print(('The code for file ' +
+           os.path.split(__file__)[1] +
+           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
+
+
+if __name__ == '__main__':
+    test_mlp()
diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py
index 9f4427e7..0159798b 100644
--- a/code/logistic_sgd.py
+++ b/code/logistic_sgd.py
@@ -58,8 +58,10 @@ class LogisticRegression(object):
     determine a class membership probability.
     """
 
-    def __init__(self, input, n_in, n_out):
+    def __init__(self, input, n_in, n_out, W=None, b=None):
         """ Initialize the parameters of the logistic regression
+        Weight matrix W is of shape (n_in, n_out)
+        and the bias vector b is of shape (n_out,).
 
         :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
@@ -75,24 +77,23 @@ def __init__(self, input, n_in, n_out):
         """
         # start-snippet-1
-        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
-        self.W = theano.shared(
-            value=numpy.zeros(
-                (n_in, n_out),
-                dtype=theano.config.floatX
-            ),
-            name='W',
-            borrow=True
-        )
-        # initialize the biases b as a vector of n_out 0s
-        self.b = theano.shared(
-            value=numpy.zeros(
-                (n_out,),
-                dtype=theano.config.floatX
-            ),
-            name='b',
-            borrow=True
-        )
+        # initialize the weights W with zeros, as a matrix of shape
+        # (n_in, n_out), if the parameter W is None
+        if W is None:
+            self.W = theano.shared(
+                value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
+                name='W')
+        else:
+            self.W = W
+
+        # initialize the biases b as a vector of n_out 0s if the parameter b
+        # is None
+        if b is None:
+            self.b = theano.shared(
+                value=numpy.zeros((n_out,), dtype=theano.config.floatX),
+                name='b')
+        else:
+            self.b = b
 
         # symbolic expression for computing the matrix of class-membership
         # probabilities
diff --git a/doc/dropout.txt b/doc/dropout.txt
new file mode 100644
index 00000000..f8defb7f
--- /dev/null
+++ b/doc/dropout.txt
@@ -0,0 +1,41 @@
+.. index:: Dropout
+
+.. _dropout:
+
+Dropout
+=====================
+
+.. note::
+    This section assumes the reader has already read through :doc:`mlp`;
+    the model described here is based on the MLP model. Dropout reduces
+    overfitting by preventing complex co-adaptations of the units on the
+    training data.
+    Additionally, this section uses the following new Theano functions and
+    concepts: `T.cast`_.
+    If you intend to run the code on GPU also read `GPU`_.
+
+
+.. note::
+    The code for this section is available for download `here`_.
+
+.. _here: http://deeplearning.net/tutorial/code/dropout.py
+
+.. _T.cast: http://deeplearning.net/software/theano/library/tensor/basic.html#casting
+
+.. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
+
+
+The next architecture we are going to present using Theano is the
+single-hidden-layer Multi-Layer Perceptron (MLP) with 20% dropout applied
+to the input data and 50% dropout applied to the hidden layer.
+
+The Model
++++++++++
+
+This model is the same as the MLP, except that units are randomly dropped
+during training: before the first hidden layer, each input unit is dropped
+with a probability of 20%; each unit of the hidden layer's output, which
+serves as the input of the Logistic Regression layer, is then dropped with a
+probability of 50%. Dropping is done via the ``_dropsout`` method, which
+takes a random state, a layer and the probability ``p`` of dropping a unit.
+It multiplies the layer's output by a binary mask and scales the result by
+1/(1 - p), so the weights need no rescaling at test time.
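+
+As a minimal, self-contained sketch of this idea (the helper name
+``apply_dropout`` is used here only for illustration; the tutorial code
+implements the same logic in ``_dropsout``), the inverted-dropout mask can
+be built as follows:
+
+.. code-block:: python
+
+    import numpy
+    import theano
+    import theano.tensor as T
+    from theano.tensor.shared_randomstreams import RandomStreams
+
+    def apply_dropout(rng, layer, p):
+        """Drop each unit of `layer` with probability `p` (inverted dropout)."""
+        srng = RandomStreams(rng.randint(1000))
+        # each mask entry is 1 with probability 1 - p and 0 with probability p
+        mask = srng.binomial(n=1, p=1 - p, size=layer.shape)
+        # zero out the dropped units and rescale the survivors by 1 / (1 - p)
+        # so that the expected activation stays the same and the weights can
+        # be used unchanged at test time
+        return layer * T.cast(mask, theano.config.floatX) / (1 - p)
+
+    # usage sketch: drop 20% of the input units
+    rng = numpy.random.RandomState(1234)
+    x = T.matrix('x')
+    dropped_input = apply_dropout(rng, x, p=0.2)
+
+Because the surviving activations are already scaled by 1/(1 - p) during
+training, the dropout-free layers used for validation and testing can share
+the trained weights directly, as done in ``DropoutMLP``.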