From 6d8858d34c885917c91bada1f243388eb3e8515c Mon Sep 17 00:00:00 2001 From: Nicholas Leonard Date: Wed, 24 May 2017 14:27:31 -0400 Subject: [PATCH] move dpnn modules --- AbstractRecurrent.lua | 4 + AbstractSequencerCriterion.lua | 6 + BatchNormalization.lua | 10 - BinaryLogisticRegression.lua | 91 --- CAddTensorTable.lua | 43 -- CMakeLists.txt | 43 +- Clip.lua | 35 - Collapse.lua | 26 - Constant.lua | 36 - Convert.lua | 244 ------ Criterion.lua | 4 + Dictionary.lua | 6 - FireModule.lua | 47 -- Inception.lua | 192 ----- Kmeans.lua | 207 ------ LinearNoBias.lua | 65 -- MaskZeroCriterion.lua | 8 +- Module.lua | 128 ---- ModuleCriterion.lua | 44 -- NCECriterion.lua | 61 +- NCEModule.lua | 85 +-- NormStabilizer.lua | 1 - OneHot.lua | 65 -- PCAColorTransform.lua | 117 --- Padding.lua | 52 -- README.md | 554 +------------- RecGRU.lua | 1 - RecLSTM.lua | 1 - Recurrence.lua | 1 - Recursor.lua | 2 - Sequencer.lua | 25 +- Serial.lua | 52 -- SimpleColorTransform.lua | 90 --- SpatialBatchNormalization.lua | 12 - SpatialBinaryConvolution.lua | 173 ----- SpatialBinaryLogisticRegression.lua | 80 -- SpatialConvolution.lua | 9 - SpatialConvolutionMM.lua | 3 - SpatialFeatNormalization.lua | 73 -- SpatialMaxPooling.lua | 6 - SpatialRegionDropout.lua | 80 -- SpatialUniformCrop.lua | 121 --- WhiteNoise.lua | 38 - ZeroGrad.lua | 34 - ZipTable.lua | 34 - ZipTableOneToMany.lua | 37 - deprecated/FastLSTM.lua | 10 +- deprecated/GRU.lua | 9 +- deprecated/LSTM.lua | 5 +- examples/README.md | 1 - examples/multigpu-nce-rnnlm.lua | 3 +- examples/noise-contrastive-estimate.lua | 10 +- examples/recurrent-language-model.lua | 22 +- examples/recurrent-visual-attention.lua | 3 +- init.lua | 41 +- scripts/evaluate-rnnlm.lua | 8 +- test/bigtest.lua | 80 +- test/test.lua | 700 +----------------- test/test_firemodule.lua | 40 - tutorials/ladder.md | 107 --- tutorials/ladder_network/ladder.lua | 444 ----------- .../ladder_network/ladder_help_funcs.lua | 220 ------ tutorials/lena.jpg | Bin 6600 -> 0 bytes tutorials/srd1.jpg | Bin 6504 -> 0 bytes tutorials/srd2.jpg | Bin 6478 -> 0 bytes utils.lua | 2 +- 66 files changed, 155 insertions(+), 4596 deletions(-) delete mode 100644 BinaryLogisticRegression.lua delete mode 100644 CAddTensorTable.lua delete mode 100644 Clip.lua delete mode 100644 Collapse.lua delete mode 100644 Constant.lua delete mode 100644 Convert.lua delete mode 100644 Dictionary.lua delete mode 100644 FireModule.lua delete mode 100644 Inception.lua delete mode 100644 Kmeans.lua delete mode 100644 LinearNoBias.lua delete mode 100644 ModuleCriterion.lua delete mode 100644 OneHot.lua delete mode 100644 PCAColorTransform.lua delete mode 100644 Padding.lua delete mode 100644 Serial.lua delete mode 100644 SimpleColorTransform.lua delete mode 100644 SpatialBatchNormalization.lua delete mode 100644 SpatialBinaryConvolution.lua delete mode 100644 SpatialBinaryLogisticRegression.lua delete mode 100644 SpatialConvolution.lua delete mode 100644 SpatialConvolutionMM.lua delete mode 100644 SpatialFeatNormalization.lua delete mode 100644 SpatialMaxPooling.lua delete mode 100644 SpatialRegionDropout.lua delete mode 100644 SpatialUniformCrop.lua delete mode 100644 WhiteNoise.lua delete mode 100644 ZeroGrad.lua delete mode 100644 ZipTable.lua delete mode 100644 ZipTableOneToMany.lua delete mode 100644 test/test_firemodule.lua delete mode 100644 tutorials/ladder.md delete mode 100644 tutorials/ladder_network/ladder.lua delete mode 100644 tutorials/ladder_network/ladder_help_funcs.lua delete mode 100644 tutorials/lena.jpg delete mode 
100644 tutorials/srd1.jpg delete mode 100644 tutorials/srd2.jpg diff --git a/AbstractRecurrent.lua b/AbstractRecurrent.lua index 3338185..3bbac87 100644 --- a/AbstractRecurrent.lua +++ b/AbstractRecurrent.lua @@ -33,6 +33,9 @@ function AbstractRecurrent:getStepModule(step) end function AbstractRecurrent:updateOutput(input) + if self.train ~= false then + self:recycle() + end if self.zeroMask then -- where zeroMask = 1, the past is forgotten, that is, the output/gradOutput is zeroed local stepmodule = (self.train==false) and self.modules[1] or self:getStepModule(self.step) @@ -189,6 +192,7 @@ end function AbstractRecurrent:maskZero(v1) if not self.maskzero then + assert(not torch.isTypeOf(self.modules[1], 'nn.AbstractRecurrent'), "Doesn't support zero-masking on nested AbstractRecurrent instances") self.maskzero = true local stepmodule = nn.MaskZero(self.modules[1], v1) self.sharedClones = {stepmodule} diff --git a/AbstractSequencerCriterion.lua b/AbstractSequencerCriterion.lua index ebac701..b62272a 100644 --- a/AbstractSequencerCriterion.lua +++ b/AbstractSequencerCriterion.lua @@ -46,3 +46,9 @@ function AbstractSequencerCriterion:setZeroMask(zeroMask) end end +function AbstractSequencerCriterion:type(type, typecache) + for key, clone in pairs(self.clones) do + clone:type(type, typecache) + end + return parent.type(self, type, typecache) +end diff --git a/BatchNormalization.lua b/BatchNormalization.lua index 2ffad94..82f5783 100644 --- a/BatchNormalization.lua +++ b/BatchNormalization.lua @@ -1,16 +1,6 @@ local _ = require 'moses' local BN, parent = nn.BatchNormalization, nn.Module -local empty = _.clone(parent.dpnn_mediumEmpty) -table.insert(empty, 'buffer') -table.insert(empty, 'buffer2') -table.insert(empty, 'centered') -table.insert(empty, 'std') -table.insert(empty, 'normalized') -table.insert(empty, 'output') -table.insert(empty, 'gradInput') -BN.dpnn_mediumEmpty = empty - -- for sharedClone local params = _.clone(parent.dpnn_parameters) table.insert(params, 'running_mean') diff --git a/BinaryLogisticRegression.lua b/BinaryLogisticRegression.lua deleted file mode 100644 index 02ccaab..0000000 --- a/BinaryLogisticRegression.lua +++ /dev/null @@ -1,91 +0,0 @@ ------------------------------------------------------------------------- ---[[ BinaryLogisticRegression ]]-- --- Takes an image of size batchSize x 1 or just batchSize as input. --- Computes Binary Logistic Regression Cost. --- Useful for 2 class classification. 
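The cost described above is sum_i log(1 + exp(-y_i * x_i)) over targets y in {-1, 1}. A minimal migration sketch, assuming stock nn's `SoftMarginCriterion` (which computes this same size-averaged cost), for code that loses `nn.BinaryLogisticRegression` with this commit:

```lua
-- Hedged sketch: nn.SoftMarginCriterion computes the same size-averaged
-- cost, sum_i log(1 + exp(-y_i * x_i)) / n, over targets in {-1, 1}.
require 'nn'

local crit = nn.SoftMarginCriterion()   -- sizeAverage = true by default
local input = torch.randn(8)            -- batchSize scores
local target = torch.Tensor(8):random(0, 1):mul(2):add(-1) -- targets in {-1, 1}

local loss = crit:forward(input, target)
local gradInput = crit:backward(input, target)
print(loss, gradInput:size())
```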
------------------------------------------------------------------------- - -local BinaryLogisticRegression, parent = torch.class('nn.BinaryLogisticRegression', 'nn.Criterion') - -function BinaryLogisticRegression:__init(sizeAverage) - parent.__init(self) - if sizeAverage ~= nil then - self.sizeAverage = sizeAverage - else - self.sizeAverage = true - end -end - -function BinaryLogisticRegression:updateOutput(input, target) - local inputDim = input:nDimension() - local targetDim = target:nDimension() - - -- Check dimensions of input and target - assert(inputDim == 1 or inputDim == 2, - "Input:Expecting batchSize or batchSize x 1") - assert(targetDim == 1 or targetDim == 2, - "Target:Expecting batchSize or batchSize x 1") - if inputDim == 2 then - assert(input:size(1)==1 or input:size(2)==1, - "Input: Expecting batchSize x 1.") - end - if targetDim == 2 then - assert(target:size(1)==1 or target:size(2)==1, - "Target: Expecting batchSize x 1.") - end - - local inputElements = input:nElement() - local targetElements = target:nElement() - - assert(inputElements == targetElements, - "No of input and target elements should be same.") - - self._k = inputElements - local input = input:view(-1) - local target = target:view(-1) - - self._baseExponents = self._baseExponents or input.new() - self._coeff = self._coeff or input.new() - self._logCoeff = self._logCoeff or input.new() - - --Compute exponent = -target*input - self._baseExponents:resize(input:size()):copy(input) - self._baseExponents:cmul(target) - self._baseExponents:mul(-1) - -- Compute exp(exponent) - self._baseExponents:exp() - - self._coeff:resize(input:size()):copy(self._baseExponents) - self._coeff:add(1) - - self._logCoeff:resize(input:size()):copy(self._coeff) - self._logCoeff:log() - - if self.sizeAverage then - return self._logCoeff:sum()/(self._k) - else - return self._logCoeff:sum() - end -end - -function BinaryLogisticRegression:updateGradInput(input, target) - self.gradInput = self.gradInput or input.new() - local gradInput = self.gradInput - gradInput:resize(input:size()):copy(target) - gradInput:mul(-1) - gradInput:cmul(self._baseExponents) - gradInput:cdiv(self._coeff) - if self.sizeAverage then - gradInput:div(self._k) - end - return gradInput -end - -function BinaryLogisticRegression:type(type, tensorCache) - if type then - self._baseExponents = nil - self._coeff = nil - self._logCoeff = nil - end - return parent.type(self, type, tensorCache) -end diff --git a/CAddTensorTable.lua b/CAddTensorTable.lua deleted file mode 100644 index 16efe44..0000000 --- a/CAddTensorTable.lua +++ /dev/null @@ -1,43 +0,0 @@ - -local CAddTensorTable, parent = torch.class('nn.CAddTensorTable', 'nn.Module') - -function CAddTensorTable:__init() - parent.__init(self) - self.gradInput = {} -end - --- input is a table with 2 entries. input[1] is the vector to be added. 
--- input[2] is the table to which we add the vector -function CAddTensorTable:updateOutput(input) - local currentOutput = {} - for i=1,#input[2] do - currentOutput[i] = currentOutput[i] or input[1].new() - currentOutput[i]:resizeAs(input[1]) - currentOutput[i]:copy(input[2][i]) - currentOutput[i]:add(input[1]) - end - for i = #input[2]+1, #currentOutput do - currentOutput[i] = nil - end - self.output = currentOutput - return self.output -end - -function CAddTensorTable:updateGradInput(input, gradOutput) - self.gradInput[1] = self.gradInput[1] or input[1].new() - self.gradInput[1]:resizeAs(input[1]) - self.gradInput[1]:copy(gradOutput[1]) - for i=2, #input[2] do - self.gradInput[1]:add(gradOutput[i]) - end - self.gradInput[2] = self.gradInput[2] or {} - for i=1,#input[2] do - self.gradInput[2][i] = self.gradInput[2][i] or input[1].new() - self.gradInput[2][i]:resizeAs(input[1]) - self.gradInput[2][i]:copy(gradOutput[i]) - end - for i=#input[2]+1, #self.gradInput[2] do - self.gradInput[2][i] = nil - end - return self.gradInput -end \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ec6dbbb..74efbc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,21 +17,15 @@ SET(luasrc AbstractSequencer.lua AbstractSequencerCriterion.lua BiSequencer.lua - deprecated/BiSequencerLM.lua CopyGrad.lua Dropout.lua ExpandAs.lua - deprecated/FastLSTM.lua - deprecated/GRU.lua - LinearNoBias.lua LookupTableMaskZero.lua - deprecated/LSTM.lua MaskZero.lua MaskZeroCriterion.lua Module.lua Mufuru.lua NormStabilizer.lua - Padding.lua Recurrence.lua RecurrentAttention.lua Recursor.lua @@ -42,11 +36,8 @@ SET(luasrc SeqBLSTM.lua SeqGRU.lua SeqLSTM.lua - deprecated/SeqLSTMP.lua - deprecated/SeqReverseSequence.lua Sequencer.lua SequencerCriterion.lua - ZeroGrad.lua test/bigtest.lua test/test.lua VariableLength.lua @@ -60,27 +51,13 @@ SET(luasrc ArgMax.lua BatchNormalization.lua BinaryClassReward.lua - BinaryLogisticRegression.lua - CAddTensorTable.lua CategoricalEntropy.lua - Clip.lua - Collapse.lua - Constant.lua Container.lua - Convert.lua Criterion.lua - Dictionary.lua - FireModule.lua - Inception.lua - Kmeans.lua LookupTable.lua - ModuleCriterion.lua NCECriterion.lua NCEModule.lua - OneHot.lua - PCAColorTransform.lua ParallelTable.lua - PrintSize.lua Reinforce.lua ReinforceBernoulli.lua ReinforceCategorical.lua @@ -88,24 +65,16 @@ SET(luasrc ReinforceNormal.lua ReverseSequence.lua Sequential.lua - Serial.lua - SimpleColorTransform.lua - SpatialBatchNormalization.lua - SpatialBinaryConvolution.lua - SpatialBinaryLogisticRegression.lua - SpatialConvolution.lua - SpatialConvolutionMM.lua - SpatialFeatNormalization.lua SpatialGlimpse.lua - SpatialMaxPooling.lua - SpatialRegionDropout.lua - SpatialUniformCrop.lua TotalDropout.lua VRClassReward.lua - WhiteNoise.lua - ZipTable.lua - ZipTableOneToMany.lua ReverseUnreverse.lua + deprecated/SeqLSTMP.lua + deprecated/SeqReverseSequence.lua + deprecated/BiSequencerLM.lua + deprecated/FastLSTM.lua + deprecated/GRU.lua + deprecated/LSTM.lua ) ADD_TORCH_PACKAGE(rnn "${src}" "${luasrc}" "An RNN library for Torch") diff --git a/Clip.lua b/Clip.lua deleted file mode 100644 index fdd04de..0000000 --- a/Clip.lua +++ /dev/null @@ -1,35 +0,0 @@ ------------------------------------------------------------------------- ---[[ Clip ]]-- --- clips values within minval and maxval ------------------------------------------------------------------------- -local Clip, parent = torch.class("nn.Clip", "nn.Module") - -function Clip:__init(minval, maxval) - 
assert(torch.type(minval) == 'number') - assert(torch.type(maxval) == 'number') - self.minval = minval - self.maxval = maxval - parent.__init(self) -end - -function Clip:updateOutput(input) - -- bound results within height and width - self._mask = self._mask or input.new() - self._byte = self._byte or torch.ByteTensor() - self.output:resizeAs(input):copy(input) - self._mask:gt(self.output, self.maxval) - local byte = torch.type(self.output) == 'torch.CudaTensor' and self._mask - or self._byte:resize(self._mask:size()):copy(self._mask) - self.output[byte] = self.maxval - self._mask:lt(self.output, self.minval) - byte = torch.type(self.output) == 'torch.CudaTensor' and self._mask - or self._byte:resize(self._mask:size()):copy(self._mask) - self.output[byte] = self.minval - return self.output -end - -function Clip:updateGradInput(input, gradOutput) - self.gradInput:set(gradOutput) - return self.gradInput -end - diff --git a/Collapse.lua b/Collapse.lua deleted file mode 100644 index 95fb98e..0000000 --- a/Collapse.lua +++ /dev/null @@ -1,26 +0,0 @@ -local Collapse, parent = torch.class('nn.Collapse', 'nn.Module') - --- collapses non-batch dims -function Collapse:__init(nInputDim) - parent.__init(self) - self.nInputDim = nInputDim -end - -function Collapse:updateOutput(input) - if not input:isContiguous() then - self._input = self._input or input.new() - self._input:resize(input:size()):copy(input) - input = self._input - end - if input:dim() > self.nInputDim then - self.output:view(input,input:size(1),-1) - else - self.output:view(input,-1) - end - return self.output -end - -function Collapse:updateGradInput(input, gradOutput) - self.gradInput:view(gradOutput, input:size()) - return self.gradInput -end diff --git a/Constant.lua b/Constant.lua deleted file mode 100644 index fdfdff4..0000000 --- a/Constant.lua +++ /dev/null @@ -1,36 +0,0 @@ ------------------------------------------------------------------------- ---[[ Constant ]]-- --- Outputs a constant value given an input. --- If nInputDim is specified, uses the input to determine the size of --- the batch. The value is then replicated over the batch. --- You can use this with nn.ConcatTable() to append constant inputs to --- an input : nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity()) . 
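A minimal usage sketch of the ConcatTable pattern mentioned in the header above, assuming the `nn.Constant` API as documented in this deleted file:

```lua
-- Sketch: append a constant tensor alongside the unchanged input,
-- per the pattern documented in the nn.Constant header above.
require 'nn'

local v = torch.Tensor{1, 2, 3}
local concat = nn.ConcatTable()
concat:add(nn.Constant(v))  -- always outputs v (replicated over a batch)
concat:add(nn.Identity())   -- passes the input through unchanged

local output = concat:forward(torch.randn(3)) -- {v, input}
```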
------------------------------------------------------------------------- -local Constant, parent = torch.class("nn.Constant", "nn.Module") - -function Constant:__init(value, nInputDim) - self.value = value - if torch.type(self.value) == 'number' then - self.value = torch.Tensor{self.value} - end - assert(torch.isTensor(self.value), "Expecting number or tensor at arg 1") - self.nInputDim = nInputDim - parent.__init(self) -end - -function Constant:updateOutput(input) - if self.nInputDim and input:dim() > self.nInputDim then - local vsize = self.value:size():totable() - self.output:resize(input:size(1), table.unpack(vsize)) - local value = self.value:view(1, table.unpack(vsize)) - self.output:copy(value:expand(self.output:size())) - else - self.output:resize(self.value:size()):copy(self.value) - end - return self.output -end - -function Constant:updateGradInput(input, gradOutput) - self.gradInput:resizeAs(input):zero() - return self.gradInput -end diff --git a/Convert.lua b/Convert.lua deleted file mode 100644 index 76d20ef..0000000 --- a/Convert.lua +++ /dev/null @@ -1,244 +0,0 @@ ------------------------------------------------------------------------- ---[ nn.Convert ]-- --- Module to convert between different data formats --- nn.Convert('bchw', 'bf') or nn.Convert('chw', 'f') --- Automatically converts input to same type as self.output --- Simplest use is for automatic input type converions : nn.Convert() ------------------------------------------------------------------------- -local _ = require 'moses' -local Convert, parent = torch.class("nn.Convert", "nn.Container") - -function Convert:__init(inputShape, outputShape) - if outputShape and not inputShape then - error"Expecting non-nil arg 1 when arg 2 is provided" - end - inputShape = inputShape or 'b*' - outputShape = outputShape or inputShape - self.inputShape = inputShape:find('b') and inputShape or ('b'..inputShape) - self.outputShape = outputShape:find('b') and outputShape or ('b'..outputShape) - self.inputBatchDim = self.inputShape:find('b') - self.outputBatchDim = self.outputShape:find('b') - if self.inputShape == 'b*' or self.outputShape == 'b*' then - assert(self.inputShape == 'b*' and self.outputShape == 'b*', 'Both or neither shapes must be b*') - self.nInputDim = -1 - self.nOutputDim = -1 - self.transposition = true - else - -- number of dims in batch mode - self.nInputDim = #self.inputShape - self.nOutputDim = #self.outputShape - -- is the outputShape just a transposition of the inputShape? 
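A minimal sketch of the shape conversions described in the header above, assuming the `nn.Convert` API as documented (the module is removed by this commit):

```lua
-- Sketch: collapse a bchw batch into a bf feature matrix, per the
-- nn.Convert header above. Sizes here are arbitrary illustrations.
require 'nn'

local convert = nn.Convert('bchw', 'bf')
local input = torch.randn(8, 3, 4, 4)   -- batch x channels x height x width
local output = convert:forward(input)   -- 8 x 48 (3*4*4 features per sample)

-- With no arguments, nn.Convert() only casts the input to the module's type.
local cast = nn.Convert()
```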
- if self.nInputDim == self.nOutputDim then - self.transposition = true - for i=1,self.nInputDim do - if not self.outputShape:find(self.inputShape:sub(i,i)) then - self.transposition = false - break - end - end - end - end - parent.__init(self) -end - --- post-initialization -function Convert:buildConverter(input) - if self.transposition then - self.converter = self:transpose(self.outputShape) - else - if (torch.type(self[self.outputShape]) ~= 'function') then - error(string.format("Unrecognized conversion of shape %s to %s", self.inputShape, self.outputShape)) - end - self.converter = self[self.outputShape](self, input) - end - assert(torch.isTensor(self.output), "Expecting Tensor output") - - self.converter:type(torch.type(self.output)) - self.converter:serialMode(self.dpnn_serialEmpty, self.dpnn_serialType) - - self.modules[1] = self.converter -end - -function Convert:updateOutput(input) - assert(torch.isTensor(input), "expecting Tensor") - if not torch.isTypeOf(input, torch.type(self.output)) then - -- handle different input type - self._input = self._input or self.output.new() - self._input:resize(input:size()):copy(input) - input = self._input - end - self.batchMode = true - if input:dim() < self.nInputDim then - -- handle non-batch mode - local inputSize = input:size():totable() - table.insert(inputSize, self.inputBatchDim, 1) - self.__input = self.__input or input.new() - self.__input:set(input):resize(unpack(inputSize)) - input = self.__input - self.batchMode = false - end - if not self.converter then - self:buildConverter(input) - end - - self.output = self.converter:updateOutput(input) - - if not self.batchMode then - local outputSize = self.output:size():totable() - table.remove(outputSize, self.outputBatchDim) - self.__output = self.__output or self.output.new() - self.__output:set(self.output):resize(unpack(outputSize)) - self.output = self.__output - end - return self.output -end - -function Convert:updateGradInput(input, gradOutput) - local input_ = input - input = self._input or input - if not self.batchMode then - input = self.__input - self.__gradOutput = self.__gradOutput or gradOutput.new() - self.__gradOutput:set(gradOutput):resize(self.converter.output:size()) - gradOutput = self.__gradOutput - end - - local gradInput = self.converter:updateGradInput(input, gradOutput) - - if not self.batchMode then - self.__gradInput = self.__gradInput or gradInput.new() - self.__gradInput:set(gradInput):resize(input_:size()) - gradInput = self.__gradInput - end - if self._input then - self._gradInput = self._gradInput or input.new() - self._gradInput:resize(input:size()):copy(gradInput) - self.gradInput = self._gradInput - else - self.gradInput = gradInput - end - - return self.gradInput -end - -function Convert:accGradParameters(input, gradOutput, scale) - input = self.batchMode and self.__input or self._input or input - gradOutput = self.batchMode and self.__gradOutput or gradOutput - self.converter:accGradParameters(input, gradOutput, scale) -end - -function Convert:accUpdateGradParameters(input, gradOutput, lr) - input = self.batchMode and self.__input or self._input or input - gradOutput = self.batchMode and self.__gradOutput or gradOutput - self.converter:accUpdateGradParameters(input, gradOutput, lr) -end - --- batch feature -function Convert:bf(input) - local b_pos = self:findAxis('b', self.inputShape) - local dim = #self.inputShape - if self.inputShape == 'bt' then - error"Conversion of shape bt to bf not supported: open an issue on github" - end - -- was b - if dim == 1 
then - return nn.Reshape(1) - end - -- was b... - local modula - if b_pos ~= 1 then - modula = nn.Transpose({1, b_pos}) - end - if dim > 2 then - local transpose = modula - local sampleSize = input:select(self:findAxis('b'),1):nElement() - local reshape = nn.Reshape(sampleSize) - if transpose then - modula = nn.Sequential() - modula:add(transpose) - modula:add(reshape) - else - modula = reshape - end - end - return modula or nn.Identity() -end - --- each example is a scalar; batch is a vector -function Convert:b(input) - local b_pos = self:findAxis('b') - if self.inputShape == 'bt' or self.inputShape == 'tb' then - local t_pos = self:findAxis('t') - -- select first set of classes - return nn.Select(t_pos, 1) - elseif self.inputShape == 'bf' or self.inputShape == 'fb' then - -- this wont work as expected with size(f) > 1 - local f_pos = self:findAxis('f') - if input:size(f_pos) > 1 then - error("Cannot convert shape "..self.inputShape.." to b when feature > 1") - end - return nn.Select(f_pos, 1) - else - error("Cannot convert shape "..self.inputShape.." to shape b") - end -end - --- returns the current shape of the data -function Convert:default() - return nn.Identity() -end - --- multi-class (batch target) -function Convert:bt() - local b_pos = self:findAxis('b') - local modula - if self.inputShape == 'b' then - modula = nn.Reshape(1) - else - error("cannot convert shape '"..self.inputShape.."' to bt") - end - return modula -end - --- a generic function for transposing shape axes -function Convert:transpose(newShape) - if newShape == self.inputShape then - return nn.Identity() - end - local inputShape = {} - for i=1,#self.inputShape do - table.insert(inputShape, self.inputShape:sub(i,i)) - end - local transpositions = {} - for i=1,#newShape do - local j = _.indexOf(inputShape, newShape:sub(i,i)) - if i ~= j then - local char = inputShape[i] - inputShape[i] = inputShape[j] - inputShape[j] = char - table.insert(transpositions, {j, i}) - end - end - return nn.Transpose(unpack(transpositions)) -end - -function Convert:findAxis(axis_char, shape, silent) - shape = shape or self.inputShape - local axis_pos = shape:find(axis_char) - if (not silent) and (not axis_pos) then - error("Provided shape '"..shape.."' has no axis '"..axis_char.."'", 2) - end - return axis_pos -end - -function Convert:type(type) - if not torch.isTypeOf(self.output, type) then - self._input = nil - self._gradInput = nil - self.__input = nil - self.__output = nil - self.__gradInput = nil - self.__gradOutput = nil - end - return parent.type(self, type) -end diff --git a/Criterion.lua b/Criterion.lua index 7f21a8a..11735d5 100644 --- a/Criterion.lua +++ b/Criterion.lua @@ -13,4 +13,8 @@ function Criterion:setZeroMask(zeroMask) if self.criterion then self.criterion:setZeroMask(zeroMask) end +end + +function Criterion:clearState() + return nn.utils.clear(self, 'gradInput') end \ No newline at end of file diff --git a/Dictionary.lua b/Dictionary.lua deleted file mode 100644 index 238283c..0000000 --- a/Dictionary.lua +++ /dev/null @@ -1,6 +0,0 @@ -local Dictionary, parent = torch.class("nn.Dictionary", "nn.LookupTable") - --- don't use this with optim (useless), use nn.LookupTable instead -function Dictionary:__init(dictSize, embeddingSize, accUpdate) - error"DEPRECATED Jan 14, 2016" -end diff --git a/FireModule.lua b/FireModule.lua deleted file mode 100644 index f4e583e..0000000 --- a/FireModule.lua +++ /dev/null @@ -1,47 +0,0 @@ ---[[ - Fire module as explained in SqueezeNet http://arxiv.org/pdf/1602.07360v1.pdf. 
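For reference, a hedged construction sketch of the module defined below, using the fire2 sizes from the SqueezeNet paper (squeeze s1x1=16, expand e1x1=e3x3=64):

```lua
-- Sketch, assuming the (deleted) nn.FireModule constructor below:
-- FireModule(nInputPlane, s1x1, e1x1, e3x3 [, activation]).
require 'nn'

local fire = nn.FireModule(96, 16, 64, 64)  -- fire2-style sizes
local input = torch.randn(2, 96, 55, 55)    -- batch x channels x h x w
local output = fire:forward(input)          -- 2 x (64+64) x 55 x 55
```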
---]] ---FIXME works only for batches. - -local FireModule, Parent = torch.class('nn.FireModule', 'nn.Decorator') - -function FireModule:__init(nInputPlane, s1x1, e1x1, e3x3, activation) - self.nInputPlane = nInputPlane - self.s1x1 = s1x1 - self.e1x1 = e1x1 - self.e3x3 = e3x3 - self.activation = activation or 'ReLU' - - if self.s1x1 > (self.e1x1 + self.e3x3) then - print('Warning: s1x1 is recommended to be smaller'.. - ' than e1x1+e3x3') - end - - self.module = nn.Sequential() - self.squeeze = nn.SpatialConvolution(nInputPlane, s1x1, 1, 1) - self.expand = nn.Concat(2) - self.expand:add(nn.SpatialConvolution(s1x1, e1x1, 1, 1)) - self.expand:add(nn.SpatialConvolution(s1x1, e3x3, 3, 3, 1, 1, 1, 1)) - - -- Fire Module - self.module:add(self.squeeze) - self.module:add(nn[self.activation]()) - self.module:add(self.expand) - self.module:add(nn[self.activation]()) - - Parent.__init(self, self.module) -end - ---[[ -function FireModule:type(type, tensorCache) - assert(type, 'Module: must provide a type to convert to') - self.module = nn.utils.recursiveType(self.module, type, tensorCache) -end ---]] - -function FireModule:__tostring__() - return string.format('%s inputPlanes: %d -> Squeeze Planes: %d -> '.. - 'Expand: %d(1x1) + %d(3x3), activation: %s', - torch.type(self), self.nInputPlane, self.s1x1, - self.e1x1, self.e3x3, self.activation) -end diff --git a/Inception.lua b/Inception.lua deleted file mode 100644 index 7d57c25..0000000 --- a/Inception.lua +++ /dev/null @@ -1,192 +0,0 @@ ------------------------------------------------------------------------- --- [[ Inception ]]-- --- Uses n+2 parallel "columns". The original paper uses 2+2 where --- the first two are (but there could be more than two): --- 1x1 conv (reduce) -> relu -> 5x5 conv -> relu --- 1x1 conv (reduce) -> relu -> 3x3 conv -> relu --- and where the other two are : --- 3x3 maxpool -> 1x1 conv (reduce/project) -> relu --- 1x1 conv (reduce) -> relu. --- This model allows the first group of columns to be of any --- number while the last group consists of exactly two columns. --- The 1x1 convs are used to reduce the number of input channels --- (or filters) such that the capacity of the network doesn't --- explode. We refer to these here as "reduce". Since each --- column seems to have one and only one reduce, their initial --- configuration options are specified in lists of n+2 elements. ------------------------------------------------------------------------- -local Inception, parent = torch.class("nn.Inception", "nn.Decorator") - -function Inception:__init(config) - --[[ Required Arguments ]]-- - -- Number of input channels or colors - self.inputSize = config.inputSize - -- Number of filters in the non-1x1 convolution kernel sizes, e.g. {32,48} - self.outputSize = config.outputSize - -- Number of filters in the 1x1 convolutions (reduction) - -- used in each column, e.g. {48,64,32,32}. The last 2 are - -- used respectively for the max pooling (projection) column - -- (the last column in the paper) and the column that has - -- nothing but a 1x1 conv (the first column in the paper). - -- This table should have two elements more than the outputSize - self.reduceSize = config.reduceSize - - --[[ Optional Arguments ]]-- - -- The strides of the 1x1 (reduction) convolutions. Defaults to {1,1,...} - self.reduceStride = config.reduceStride or {} - -- A transfer function like nn.Tanh, nn.Sigmoid, nn.ReLU, nn.Identity, etc. 
- -- It is used after each reduction (1x1 convolution) and convolution - self.transfer = config.transfer or nn.ReLU() - -- batch normalization can be awesome - self.batchNorm = config.batchNorm - -- Adding padding to the input of the convolutions such that - -- input width and height are same as that of output. - self.padding = true - if config.padding ~= nil then - self.padding = config.padding - end - -- The size (height=width) of the non-1x1 convolution kernels. - self.kernelSize = config.kernelSize or {5,3} - -- The stride (height=width) of the convolution. - self.kernelStride = config.kernelStride or {1,1} - -- The size (height=width) of the spatial max pooling used - -- in the next-to-last column. - self.poolSize = config.poolSize or 3 - -- The stride (height=width) of the spatial max pooling. - self.poolStride = config.poolStride or 1 - -- The pooling layer. - self.pool = config.pool or nn.SpatialMaxPooling(self.poolSize, self.poolSize, self.poolStride, self.poolStride) - - - -- Variables checking that all of the output sizes are the same for a sample input. - local iWidth, iHeight = 100, 200 - local oWidth, oHeight - - -- [[ Module Construction ]]-- - local depthConcat = nn.DepthConcat(2) -- concat on 'c' dimension - -- 1x1 conv (reduce) -> 3x3 conv - -- 1x1 conv (reduce) -> 5x5 conv - -- ... - for i=1,#self.kernelSize do - local mlp = nn.Sequential() - -- 1x1 conv - local reduce = nn.SpatialConvolution( - self.inputSize, self.reduceSize[i], 1, 1, - self.reduceStride[i] or 1, self.reduceStride[i] or 1 - ) - mlp:add(reduce) - if self.batchNorm then - mlp:add(nn.SpatialBatchNormalization(self.reduceSize[i])) - end - mlp:add(self.transfer:clone()) - - -- nxn conv - local pad = self.padding and math.floor(self.kernelSize[i]/2) or 0 - local conv = nn.SpatialConvolution( - self.reduceSize[i], self.outputSize[i], - self.kernelSize[i], self.kernelSize[i], - self.kernelStride[i], self.kernelStride[i], - pad - ) - mlp:add(conv) - if self.batchNorm then - mlp:add(nn.SpatialBatchNormalization(self.outputSize[i])) - end - mlp:add(self.transfer:clone()) - depthConcat:add(mlp) - - -- Check the output sizes. - local oWidth_i = torch.floor( - (iWidth + 2*pad - self.kernelSize[i])/self.kernelStride[i] + 1) - local oHeight_i = torch.floor( - (iHeight + 2*pad - self.kernelSize[i])/self.kernelStride[i] + 1) - if oWidth == nil then - oWidth = oWidth_i - oHeight = oHeight_i - else - if oWidth ~= oWidth_i or oHeight ~= oHeight_i then - print("dpnn.Inception: Warning: Inconsistent output sizes.") - end - end - end - - -- pool -> 1x1 conv - local mlp = nn.Sequential() - mlp:add(self.pool) - -- not sure if transfer should go here? mlp:add(transfer:clone()) - local i = #(self.kernelSize) + 1 - if self.reduceSize[i] then - local reduce = nn.SpatialConvolution( - self.inputSize, self.reduceSize[i], 1, 1, - self.reduceStride[i] or 1, self.reduceStride[i] or 1 - ) - mlp:add(reduce) - if self.batchNorm then - mlp:add(nn.SpatialBatchNormalization(self.reduceSize[i])) - end - mlp:add(self.transfer:clone()) - end - depthConcat:add(mlp) - - -- Check the output sizes. Infer the operation of the pooling layer. 
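Putting the required arguments above together, a hedged sketch assuming the `nn.Inception` constructor documented here, where the two extra `reduceSize` entries feed the pool-projection and 1x1-only columns:

```lua
-- Minimal sketch, assuming the dpnn-era nn.Inception constructor:
-- two conv columns (5x5 and 3x3 by default) plus the pool and 1x1 columns.
require 'nn'

local inception = nn.Inception{
   inputSize  = 128,             -- input channels
   outputSize = {32, 48},        -- filters for the 5x5 and 3x3 columns
   reduceSize = {48, 64, 32, 32} -- 1x1 reductions; the last two feed the
                                 -- pool-projection and 1x1-only columns
}

local input = torch.randn(2, 128, 32, 32) -- batch x channels x height x width
local output = inception:forward(input)   -- 2 x (32+48+32+32) x 32 x 32
print(output:size())
```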
- if self.pool.kW ~= nil and self.pool.dW ~= nil and self.pool.padW ~= nil then - assert(oWidth ~= nil) - assert(oHeight ~= nil) - local oWidth_pool = torch.floor( - (iWidth + 2*self.pool.padW - self.pool.kW)/self.pool.dW + 1) - local oHeight_pool = torch.floor( - (iHeight + 2*self.pool.padH - self.pool.kH)/self.pool.dH + 1) - if oWidth ~= oWidth_pool or oHeight ~= oHeight_pool then - print("dpnn.Inception: Warning: Inconsistent output sizes in pooling.") - end - end - - -- reduce: 1x1 conv (channel-wise pooling) - i = i + 1 - if self.reduceSize[i] then - local mlp = nn.Sequential() - local reduce = nn.SpatialConvolution( - self.inputSize, self.reduceSize[i], 1, 1, - self.reduceStride[i] or 1, self.reduceStride[i] or 1 - ) - mlp:add(reduce) - if self.batchNorm then - mlp:add(nn.SpatialBatchNormalization(self.reduceSize[i])) - end - mlp:add(self.transfer:clone()) - depthConcat:add(mlp) - - -- Check the output sizes. - local oWidth_conv = torch.floor((iWidth - 1)/(self.reduceStride[i] or 1) + 1) - local oHeight_conv = torch.floor((iHeight - 1)/(self.reduceStride[i] or 1) + 1) - if oWidth ~= oWidth_conv or oHeight ~= oHeight_conv then - print("dpnn.Inception: Warning: Inconsistent output sizes in 1x1 conv.") - end - end - - parent.__init(self, depthConcat) -end - -function Inception:updateOutput(input) - local input = self:toBatch(input, 3) - local output = self.modules[1]:updateOutput(input) - self.output = self:fromBatch(output, 3) - return self.output -end - -function Inception:updateGradInput(input, gradOutput) - local input, gradOutput = self:toBatch(input, 3), self:toBatch(gradOutput, 3) - local gradInput = self.modules[1]:updateGradInput(input, gradOutput) - self.gradInput = self:fromBatch(gradInput, 3) - return self.gradInput -end - -function Inception:accGradParameters(input, gradOutput, scale) - local input, gradOutput = self:toBatch(input, 3), self:toBatch(gradOutput, 3) - self.modules[1]:accGradParameters(input, gradOutput, scale) -end - -function Inception:accUpdateGradParameters(input, gradOutput, lr) - local input, gradOutput = self:toBatch(input, 3), self:toBatch(gradOutput, 3) - self.modules[1]:accUpdateGradParameters(input, gradOutput, lr) -end diff --git a/Kmeans.lua b/Kmeans.lua deleted file mode 100644 index b67401f..0000000 --- a/Kmeans.lua +++ /dev/null @@ -1,207 +0,0 @@ --- Online (Hard) Kmeans layer. -local Kmeans, parent = torch.class('nn.Kmeans', 'nn.Module') - -function Kmeans:__init(k, dim, scale) - parent.__init(self) - self.k = k - self.dim = dim - - -- scale for online kmean update - self.scale = scale - - assert(k > 0, "Clusters cannot be 0 or negative.") - assert(dim > 0, "Dimensionality cannot be 0 or negative.") - - -- Kmeans centers -> self.weight - self.weight = torch.Tensor(self.k, self.dim) - - self.gradWeight = torch.Tensor(self.weight:size()) - self.loss = 0 -- within cluster error of the last forward - - self.clusterSampleCount = torch.Tensor(self.k) - - self:reset() -end - --- Reset -function Kmeans:reset(stdev) - stdev = stdev or 1 - self.weight:uniform(-stdev, stdev) -end - --- Initialize Kmeans weight with random samples from input. -function Kmeans:initRandom(input) - local inputDim = input:nDimension() - assert(inputDim == 2, "Incorrect input dimensionality. 
Expecting 2D.") - - local noOfSamples = input:size(1) - local dim = input:size(2) - assert(dim == self.dim, "Dimensionality of input and weight don't match.") - assert(noOfSamples >= self.k, "Need atleast k samples for initialization.") - - local indices = torch.zeros(self.k) - indices:random(1, noOfSamples) - - for i=1, self.k do - self.weight[i]:copy(input[indices[i]]) - end -end - --- Initialize using Kmeans++ -function Kmeans:initKmeansPlus(input, p) - self.p = p or self.p or 0.95 - assert(self.p>=0 and self.p<=1, "P value should be between 0-1.") - - local inputDim = input:nDimension() - assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.") - local noOfSamples = input:size(1) - - local pcount = math.ceil((1-self.p)*noOfSamples) - if pcount <= 0 then pcount = 1 end - - local initializedK = 1 - self.weight[initializedK]:copy(input[torch.random(noOfSamples)]) - initializedK = initializedK + 1 - - local clusters = self.weight.new() - local clusterDistances = self.weight.new() - local temp = self.weight.new() - local expandedSample = self.weight.new() - local distances = self.weight.new() - distances:resize(noOfSamples):fill(math.huge) - local maxScores = self.weight.new() - local maxIndx = self.weight.new() - - for k=initializedK, self.k do - clusters = self.weight[{{initializedK-1, initializedK-1}}] - for i=1, noOfSamples do - temp:expand(input[{{i}}], 1, self.dim) - expandedSample:resize(temp:size()):copy(temp) - - -- Squared Euclidean distance - expandedSample:add(-1, clusters) - clusterDistances:norm(expandedSample, 2, 2) - clusterDistances:pow(2) - distances[i] = math.min(clusterDistances:min(), distances[i]) - end - maxScores, maxIndx = distances:sort(true) - local tempIndx = torch.random(pcount) - local indx = maxIndx[tempIndx] - self.weight[initializedK]:copy(input[indx]) - initializedK = initializedK + 1 - end -end - --- Kmeans updateOutput (forward) -function Kmeans:updateOutput(input) - local inputDim = input:nDimension() - assert(inputDim == 2, "Incorrect input dimensionality. 
Expecting 2D.") - - local batchSize = input:size(1) - local dim = input:size(2) - assert(dim == self.dim, "Dimensionality of input and weight don't match.") - - assert(input:isContiguous(), "Input is not contiguous.") - - -- a sample copied k times to compute distance between sample and weight - self._expandedSamples = self._expandedSamples or self.weight.new() - - -- distance between a sample and weight - self._clusterDistances = self._clusterDistances or self.weight.new() - - self._temp = self._temp or input.new() - self._tempExpanded = self._tempExpanded or input.new() - - -- Expanding inputs - self._temp:view(input, 1, batchSize, self.dim) - self._tempExpanded:expand(self._temp, self.k, batchSize, self.dim) - self._expandedSamples:resize(self.k, batchSize, self.dim) - :copy(self._tempExpanded) - - -- Expanding weights - self._tempWeight = self._tempWeight or self.weight.new() - self._tempWeightExp = self._tempWeightExp or self.weight.new() - self._expandedWeight = self._expanedWeight or self.weight.new() - self._tempWeight:view(self.weight, self.k, 1, self.dim) - self._tempWeightExp:expand(self._tempWeight, self._expandedSamples:size()) - self._expandedWeight:resize(self.k, batchSize, self.dim) - :copy(self._tempWeightExp) - - -- x-c - self._expandedSamples:add(-1, self._expandedWeight) - -- Squared Euclidean distance - self._clusterDistances:norm(self._expandedSamples, 2, 3) - self._clusterDistances:pow(2) - self._clusterDistances:resize(self.k, batchSize) - - self._minScore = self._minScore or self.weight.new() - self._minIndx = self._minIndx or (torch.isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor()) - self._minScore:min(self._minIndx, self._clusterDistances, 1) - self._minIndx:resize(batchSize) - - self.output:resize(batchSize):copy(self._minIndx) - self.loss = self._minScore:sum() - - return self.output -end - --- Kmeans has its own criterion hence gradInput are zeros -function Kmeans:updateGradInput(input, gradOuput) - self.gradInput:resize(input:size()):zero() - - return self.gradInput -end - --- We define kmeans update rule as c -> c + scale * 1/n * sum_i (x-c). --- n is no. of x's belonging to c. --- With this update rule and gradient descent will be negative the gradWeights. 
-function Kmeans:accGradParameters(input, gradOutput, scale) - local scale = self.scale or scale or 1 - assert(scale > 0 , " Scale has to be positive.") - - -- Update cluster sample count - local batchSize = input:size(1) - self._cscAdder = self._cscAdder or self.weight.new() - self._cscAdder:resize(batchSize):fill(1) - self.clusterSampleCount:zero() - self.clusterSampleCount:indexAdd(1, self._minIndx, self._cscAdder) - - -- scale * (x[k]-c[k]) where k is nearest cluster to x - self._gradWeight = self._gradWeight or self.gradWeight.new() - self._gradWeight:index(self.weight, 1, self._minIndx) - self._gradWeight:mul(-1) - self._gradWeight:add(input) - self._gradWeight:mul(-scale) - - self._gradWeight2 = self._gradWeight2 or self.gradWeight.new() - self._gradWeight2:resizeAs(self.gradWeight):zero() - self._gradWeight2:indexAdd(1, self._minIndx, self._gradWeight) - - -- scale/n * sum_i (x-c) - self._ccounts = self._ccounts or self.clusterSampleCount.new() - self._ccounts:resize(self.k):copy(self.clusterSampleCount) - self._ccounts:add(0.0000001) -- prevent division by zero errors - - self._gradWeight2:cdiv(self._ccounts:view(self.k,1):expandAs(self.gradWeight)) - - self.gradWeight:add(self._gradWeight2) -end - -function Kmeans:clearState() - -- prevent premature memory allocations - self._expandedSamples = nil - self._clusterDistances = nil - self._temp = nil - self._tempExpanded = nil - self._tempWeight = nil - self._tempWeightExp = nil - self._expandedWeight = nil - self._minScore = nil - self._minIndx = nil - self._cscAdder = nil -end - -function Kmeans:type(type, tensorCache) - self:clearState() - return parent.type(self, type, tensorCache) -end diff --git a/LinearNoBias.lua b/LinearNoBias.lua deleted file mode 100644 index 3b4fd27..0000000 --- a/LinearNoBias.lua +++ /dev/null @@ -1,65 +0,0 @@ ------------------------------------------------------------------------- ---[[ LinearNoBias ]]-- --- Subclass of nn.Linear with no bias term ------------------------------------------------------------------------- -nn = require 'nn' -local LinearNoBias, Linear = torch.class('nn.LinearNoBias', 'nn.Linear') - -function LinearNoBias:__init(inputSize, outputSize) - nn.Module.__init(self) - - self.weight = torch.Tensor(outputSize, inputSize) - self.gradWeight = torch.Tensor(outputSize, inputSize) - - self:reset() -end - -function LinearNoBias:reset(stdv) - if stdv then - stdv = stdv * math.sqrt(3) - else - stdv = 1./math.sqrt(self.weight:size(2)) - end - if nn.oldSeed then - for i=1,self.weight:size(1) do - self.weight:select(1, i):apply(function() - return torch.uniform(-stdv, stdv) - end) - end - else - self.weight:uniform(-stdv, stdv) - end - - return self -end - -function LinearNoBias:updateOutput(input) - if input:dim() == 1 then - self.output:resize(self.weight:size(1)) - self.output:mv(self.weight, input) - elseif input:dim() == 2 then - local nframe = input:size(1) - local nElement = self.output:nElement() - self.output:resize(nframe, self.weight:size(1)) - if self.output:nElement() ~= nElement then - self.output:zero() - end - if not self.addBuffer or self.addBuffer:nElement() ~= nframe then - self.addBuffer = input.new(nframe):fill(1) - end - self.output:addmm(0, self.output, 1, input, self.weight:t()) - else - error('input must be vector or matrix') - end - - return self.output -end - -function LinearNoBias:accGradParameters(input, gradOutput, scale) - scale = scale or 1 - if input:dim() == 1 then - self.gradWeight:addr(scale, gradOutput, input) - elseif input:dim() == 2 then - 
self.gradWeight:addmm(scale, gradOutput:t(), input) - end -end diff --git a/MaskZeroCriterion.lua b/MaskZeroCriterion.lua index bc81f71..617d940 100644 --- a/MaskZeroCriterion.lua +++ b/MaskZeroCriterion.lua @@ -7,7 +7,7 @@ local MaskZeroCriterion, parent = torch.class("nn.MaskZeroCriterion", "nn.Criter function MaskZeroCriterion:__init(criterion, v1) parent.__init(self) - self.criterion = criterion + self.criterion = assert(criterion) assert(torch.isTypeOf(criterion, 'nn.Criterion')) self.v2 = not v1 end @@ -28,11 +28,12 @@ function MaskZeroCriterion:updateOutput(input, target) if self.isEmptyBatch then self.output = 0 else + local first = nn.utils.recursiveGetFirst(input) -- e.g. 0,1,0 -> 1,0,1 self._oneMask = self._oneMask or self.zeroMask.new() self._oneMask:lt(self.zeroMask, 1) -- 1,0,1 -> 1,3 - self._indices = self._indices or torch.isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor() + self._indices = self._indices or torch.isCudaTensor(first) and torch.CudaLongTensor() or torch.LongTensor() self._range = self._range or self._indices.new() self._range:range(1,self._oneMask:nElement()) self._indices:maskedSelect(self._range, self._oneMask) @@ -75,10 +76,13 @@ function MaskZeroCriterion:clearState() self.output = nil self.gradInput = nil self._gradInput = nil + self.criterion:clearState() + return parent.clearState(self) end function MaskZeroCriterion:type(type, ...) self:clearState() + self.criterion:type(type, ...) return parent.type(self, type, ...) end diff --git a/Module.lua b/Module.lua index 58d43ff..c613959 100644 --- a/Module.lua +++ b/Module.lua @@ -266,134 +266,6 @@ function Module:getParameters() return Module.flatten(parameters), Module.flatten(gradParameters) end ------------------ serialization (see nn.Serial) ------------------- - -Module.dpnn_mediumEmpty = {'output', 'gradInput', 'momGradParams', 'dpnn_input'} -Module.dpnn_lightEmpty = Module.dpnn_gradParameters --- defaults to heavy serialization -Module.dpnn_serialEmpty = {} - --- sets the serialization behavior of the entire module structure -function Module:serialMode(empty) - assert(torch.type(empty) == 'table', "Expecting table at arg 1") - self.dpnn_serialEmpty = empty - -- set the serial of all encapsulated modules - local function recursiveSerial(tbl) - for k,v in pairs(tbl) do - if torch.isTypeOf(v, 'nn.Module') then - v:serialMode(empty) - elseif torch.type(v) == 'table' then - recursiveSerial(v) - end - end - end - recursiveSerial(self) - return self -end - --- serialMode : serialize everything -function Module:heavySerial() - return self:serialMode({}) -end - --- serialMode : serialize everything except dpnn_mediumEmpty attributes -function Module:mediumSerial() - - self.dpnn_serialEmpty = self.dpnn_mediumEmpty - - -- set the serial of all encapsulated modules - local function recursiveSerial(tbl) - for k,v in pairs(tbl) do - if torch.isTypeOf(v, 'nn.Module') then - v:mediumSerial() - elseif torch.type(v) == 'table' then - recursiveSerial(v) - end - end - end - recursiveSerial(self) - return self -end - --- serialMode : serialize everything except dpnn_mediumEmpty and dpnn_lightEmpty attributes -function Module:lightSerial() - - self.dpnn_serialEmpty = _.clone(self.dpnn_mediumEmpty) - for k,v in ipairs(self.dpnn_lightEmpty) do - table.insert(self.dpnn_serialEmpty, v) - end - - -- set the serial of all encapsulated modules - local function recursiveSerial(tbl) - for k,v in pairs(tbl) do - if torch.isTypeOf(v, 'nn.Module') then - v:lightSerial() - elseif torch.type(v) == 'table' then - 
recursiveSerial(v) - end - end - end - recursiveSerial(self) - - return self -end - -function Module:getSerialState(states) - states = states or {} - - -- dont get the serial state of the same module twice (reuse existing) - if states[self] then - return states[self] - end - - local _ = require 'moses' - -- returns the object structure as tables (i.e. without metatables) - local function recursiveState(tbl) - local state = _.map(tbl, - function(k,v) - if torch.isTypeOf(tbl, 'nn.Module') and _.contains(tbl.dpnn_serialEmpty, k) then - -- "empties" module attributes found in empty - if torch.type(v) == 'table' then - -- empty table - return {} - elseif torch.isTensor(v) then - -- empty tensor - return v.new() - else - -- not table nor tensor? then serialize as is - return v - end - elseif torch.isTypeOf(v, 'nn.Module') then - -- recursive, yet can be overwritten - return v:getSerialState(states) - elseif torch.type(v) == 'table' then - -- in case it is a table of modules - if not states[v] then - states[v] = recursiveState(v) - end - return states[v] - else - return v - end - end - ) - return state - end - - local state = recursiveState(self) - - -- include typename so that module can be reconstructed from the state - state.dpnn_typename = torch.type(self) - states[self] = state - - return state -end - --- decorates self with nn.Serial -function Module:Serial(tensortype) - return nn.Serial(self, tensortype) -end - ----------------------- for training ----------------------------- -- useful to get the output size diff --git a/ModuleCriterion.lua b/ModuleCriterion.lua deleted file mode 100644 index bfc79ef..0000000 --- a/ModuleCriterion.lua +++ /dev/null @@ -1,44 +0,0 @@ -local ModuleCriterion, parent = torch.class("nn.ModuleCriterion", "nn.Criterion") - -function ModuleCriterion:__init(criterion, inputModule, targetModule, castTarget) - self.inputModule = inputModule - self.targetModule = targetModule - self.castTarget = (castTarget == nil) and true or castTarget - if self.inputModule then - local params = self.inputModule:parameters() - if params and #params > 0 then - print"Warning: nn.ModuleCriterion doesn't support parameter updates" - end - end - self.criterion = criterion -end - -function ModuleCriterion:updateOutput(input, target) - if self.inputModule then - self.input = self.inputModule:forward(input) - end - if self.targetModule then - self.target = self.targetModule:forward(target) - end - self.output = self.criterion:forward(self.input or input, self.target or target) - return self.output -end - -function ModuleCriterion:updateGradInput(input, target) - self.gradInput = self.criterion:backward(self.input or input, self.target or target) - if self.inputModule then - self.gradInput = self.inputModule:backward(input, self.gradInput) - end - return self.gradInput -end - -function ModuleCriterion:type(type, typecache) - if self.inputModule then - self.inputModule:type(type, typecache) - end - if self.castTarget and self.targetModule then - self.targetModule:type(type, typecache) - end - self.criterion:type(type, typecache) - return parent.type(self, type, typecache) -end diff --git a/NCECriterion.lua b/NCECriterion.lua index 1a6b935..d4cde60 100644 --- a/NCECriterion.lua +++ b/NCECriterion.lua @@ -7,32 +7,31 @@ local NCECriterion, parent = torch.class("nn.NCECriterion", "nn.Criterion") local eps = 0.0000001 function NCECriterion:__init() - parent.__init(self) + parent.__init(self) self.sizeAverage = true - - self.gradInput = {torch.Tensor(), torch.Tensor(), torch.Tensor(), 
torch.Tensor()} + self.gradInput = {torch.Tensor(), torch.Tensor(), torch.Tensor(), torch.Tensor()} end function NCECriterion:updateOutput(inputTable, target) -- P_model(target), P_model(sample), P_noise(target), P_noise(sample) local Pmt, Pms, Pnt, Pns = unpack(inputTable) local k = Pms:size(2) - + assert(Pmt:dim() == 1) assert(Pms:dim() == 2) assert(Pnt:dim() == 1) assert(Pns:dim() == 2) - + -- equation 5 in ref. A - - -- eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt) + + -- eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt) self._Pom = self._Pom or Pmt.new() self._Pom:resizeAs(Pmt):copy(Pmt) self._Pomdiv = self._Pomdiv or Pmt.new() self._Pomdiv:resizeAs(Pmt):copy(Pmt) self._Pomdiv:add(k, Pnt):add(eps) self._Pom:cdiv(self._Pomdiv) - + -- eq 5.2 : P(origin=noise) = k*Pns / (Pms + k*Pns) self._Pon = self._Pon or Pns.new() self._Pon:resizeAs(Pns):copy(Pns):mul(k) @@ -40,34 +39,35 @@ function NCECriterion:updateOutput(inputTable, target) self._Pondiv:resizeAs(Pms):copy(Pms) self._Pondiv:add(k, Pns):add(eps) self._Pon:cdiv(self._Pondiv) - + -- equation 6 in ref. A - + self._lnPom = self._lnPom or self._Pom.new() self._lnPom:log(self._Pom) - + self._lnPon = self._lnPon or self._Pon.new() self._lnPon:log(self._Pon) - + local lnPomsum = self._lnPom:sum() local lnPonsum = self._lnPon:sum() - + self.output = - (lnPomsum + lnPonsum) - + if self.sizeAverage then self.output = self.output / Pmt:size(1) end - + return self.output end function NCECriterion:updateGradInput(inputTable, target) + self.gradInput = self.gradInput or nn.utils.recursiveNew(inputTable) assert(#self.gradInput == 4) local Pmt, Pms, Pnt, Pns = unpack(inputTable) local k = Pms:size(2) - + -- equation 7 in ref. A - + -- d ln(Pom) / d input = -k*Pnt / ( Pmt * (Pmt + k*Pnt) ) local dlnPom = self.gradInput[1] dlnPom = dlnPom or Pnt.new() @@ -76,7 +76,7 @@ function NCECriterion:updateGradInput(inputTable, target) Pmt:add(eps) dlnPom:cdiv(Pmt) -- d ln(Pmt) / d Pmt = 1 / d Pmt Pmt:add(-eps) - + -- d ln(Pon) / d input = Pms / ( Pms * (Pms + k*Pns) ) local dlnPon = self.gradInput[2] dlnPon = dlnPon or Pms.new() @@ -85,18 +85,35 @@ function NCECriterion:updateGradInput(inputTable, target) Pms:add(eps) dlnPon:cdiv(Pms) -- d ln(Pms) / d Pms = 1 / d Pms Pms:add(-eps) - + if self.gradInput[3]:nElement() ~= Pnt:nElement() then self.gradInput[3]:resizeAs(Pnt):zero() end if self.gradInput[4]:nElement() ~= Pns:nElement() then self.gradInput[4]:resizeAs(Pns):zero() end - + if self.sizeAverage then dlnPom:div(Pmt:size(1)) dlnPon:div(Pmt:size(1)) end - - return self.gradInput + + return self.gradInput +end + +function NCECriterion:clearState() + self._Pom = nil + self._Pomdiv = nil + self._Pon = nil + self._Pondiv = nil + self._lnPon = nil + self._lnPom = nil + self.gradInput = nil + parent.clearState(self) + return self +end + +function NCECriterion:type(...) + self:clearState() + return parent.type(self, ...) 
end diff --git a/NCEModule.lua b/NCEModule.lua index a3df10c..3172a15 100644 --- a/NCEModule.lua +++ b/NCEModule.lua @@ -4,19 +4,7 @@ ------------------------------------------------------------------------ local _ = require 'moses' local NCEModule, parent = torch.class("nn.NCEModule", "nn.Linear") -NCEModule.version = 6 -- better bias init - --- for efficient serialization using nn.Serial -local empty = _.clone(parent.dpnn_mediumEmpty) -table.insert(empty, 'sampleidx') -table.insert(empty, 'sampleprob') -table.insert(empty, '_noiseidx') -table.insert(empty, '_noiseprob') -table.insert(empty, '_weight') -table.insert(empty, '_gradWeight') -table.insert(empty, '_gradOutput') -table.insert(empty, '_tgradOutput') -NCEModule.dpnn_mediumEmpty = empty +NCEModule.version = 7 -- remove support for nn.Serial; use clearState() -- for sharedClone local params = _.clone(parent.dpnn_parameters) @@ -333,44 +321,6 @@ function NCEModule:accGradParameters(inputTable, gradOutput, scale) end end -function NCEModule:type(type, cache) - if type then - self.sampleidx = nil - self.sampleprob = nil - self._noiseidx = nil - self._noiseprob = nil - self._metaidx = nil - self._gradOutput = nil - self._tgradOutput = nil - self._gradWeight = nil - self._weight = nil - end - local unigrams = self.unigrams - self.unigrams = nil - local am = self.aliasmultinomial - - local rtn - if type and torch.type(self.weight) == 'torch.MultiCudaTensor' then - assert(type == 'torch.CudaTensor', "Cannot convert a multicuda NCEModule to anything other than cuda") - local weight = self.weight - local gradWeight = self.gradWeight - self.weight = nil - self.gradWeight = nil - - rtn = parent.type(self, type, cache) - - assert(torch.type(self.aliasmultinomial.J) ~= 'torch.CudaTensor') - self.weight = weight - self.gradWeight = gradWeight - else - rtn = parent.type(self, type, cache) - end - - self.unigrams = unigrams - self.aliasmultinomial = am - return rtn -end - function NCEModule:noiseProb(sampleprob, sampleidx) assert(sampleprob) assert(sampleidx) @@ -404,6 +354,9 @@ function NCEModule:clearState() self._noiseprob = nil self._tgradOutput = nil self._gradOutput = nil + self._gradWeight = nil + self._weight = nil + self._metaidx = nil if torch.isTensor(self.output) then self.output:set() else @@ -416,6 +369,36 @@ function NCEModule:clearState() end end +function NCEModule:type(type, cache) + if type then + self:clearState() + end + local unigrams = self.unigrams + self.unigrams = nil + local am = self.aliasmultinomial + + local rtn + if type and torch.type(self.weight) == 'torch.MultiCudaTensor' then + assert(type == 'torch.CudaTensor', "Cannot convert a multicuda NCEModule to anything other than cuda") + local weight = self.weight + local gradWeight = self.gradWeight + self.weight = nil + self.gradWeight = nil + + rtn = parent.type(self, type, cache) + + assert(torch.type(self.aliasmultinomial.J) ~= 'torch.CudaTensor') + self.weight = weight + self.gradWeight = gradWeight + else + rtn = parent.type(self, type, cache) + end + + self.unigrams = unigrams + self.aliasmultinomial = am + return rtn +end + function NCEModule:multicuda(device1, device2) assert(device1 and device2, "specify two devices as arguments") require 'torchx' diff --git a/NormStabilizer.lua b/NormStabilizer.lua index 3e11f8f..7b56a60 100644 --- a/NormStabilizer.lua +++ b/NormStabilizer.lua @@ -21,7 +21,6 @@ function NS:_updateOutput(input) assert(input:dim() == 2) local output if self.train ~= false then - self:recycle() local rm = self:getStepModule(self.step) output = 
rm:updateOutput(input) -- in training mode, we also calculate norm of hidden state diff --git a/OneHot.lua b/OneHot.lua deleted file mode 100644 index 702e162..0000000 --- a/OneHot.lua +++ /dev/null @@ -1,65 +0,0 @@ -local OneHot, parent = torch.class('nn.OneHot', 'nn.Module') - --- adapted from https://github.com/karpathy/char-rnn --- and https://github.com/hughperkins/char-lstm - -function OneHot:__init(outputSize) - parent.__init(self) - self.outputSize = outputSize -end - -function OneHot:updateOutput(input) - local size - if type(input) == 'number' then - if self:type() == 'torch.CudaTensor' then - self._single = self._single or torch.CudaTensor():resize(1); - else - self._single = self._single or torch.LongTensor():resize(1); - end - self._single[1] = input - input = self._single; - size = {} - else - size = input:size():totable() - end - table.insert(size, self.outputSize) - - self.output:resize(unpack(size)):zero() - - size[#size] = 1 - local input_ = input:view(unpack(size)) - - if torch.type(input) == 'torch.CudaTensor' or torch.type(input) == 'torch.ClTensor' then - self.output:scatter(self.output:dim(), input_, 1) - else - if torch.type(self.output) == 'torch.CudaTensor' then - -- input is not cuda, module is, cast input to cuda - self._input = self._input or torch.CudaTensor() - self._input:resize(input_:size()):copy(input_) - input_ = self._input - elseif torch.type(input) ~= 'torch.LongTensor' then - -- input is not long, module isnot cuda, cast input to long - self._input = self._input or torch.LongTensor() - self._input:resize(input_:size()):copy(input_) - input_ = self._input - end - self.output:scatter(self.output:dim(), input_, 1) - end - - return self.output -end - -function OneHot:updateGradInput(input, gradOutput) - if type(input) == 'number' then - return 0 - else - self.gradInput:resize(input:size()):zero() - return self.gradInput - end -end - -function OneHot:type(type, typecache) - self._single = nil - self._input = nil - return parent.type(self, type, typecache) -end diff --git a/PCAColorTransform.lua b/PCAColorTransform.lua deleted file mode 100644 index 69f16d1..0000000 --- a/PCAColorTransform.lua +++ /dev/null @@ -1,117 +0,0 @@ ---[[ - Color transformation module: Commonly used data augmentation technique. - Random color noise is added to input image/images based on the Principal - Component Analysis (PCA) of pixel values. - - Arguments - -> eigenVectors: Each row represent an eigen vector. - -> eigenValues: Corresponding eigen values. - -> std: std of gaussian distribution for augmentation (default 0.1). 
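A hedged construction sketch for the arguments listed above, with placeholder eigen pairs (in practice they come from a PCA over training-set pixel values):

```lua
-- Hedged sketch, assuming the (deleted) nn.PCAColorTransform API.
-- The eigen pairs here are placeholders, not real PCA output.
require 'nn'

local eigenVectors = torch.eye(3)             -- 3 x 3, one eigenvector per row
local eigenValues  = torch.Tensor{0.2, 0.1, 0.05}
local pca = nn.PCAColorTransform(3, eigenVectors, eigenValues, 0.1)

local images = torch.rand(4, 3, 32, 32)  -- batch x channels x height x width
local augmented = pca:forward(images)    -- adds PCA-aligned color noise (train mode)
```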
---]] - -local PCAColorTransform, Parent = torch.class('nn.PCAColorTransform', 'nn.Module') - -function PCAColorTransform:__init(inputChannels, eigenVectors, eigenValues, std) - Parent.__init(self) - - self.train = true - self.inputChannels = inputChannels - assert(inputChannels == eigenVectors:size(1), - "Number of input channels do not match number of eigen vectors.") - assert(eigenVectors:size(2) == eigenVectors:size(1), - "Invalid dimensionality: eigen vectors.") - assert(inputChannels == eigenValues:nElement(), - "Number of input channels do not match number of eigen values.") - - self.eigenVectors = eigenVectors - self.eigenValues = eigenValues - self.std = std or 0.1 -end - -function PCAColorTransform:updateOutput(input) - self.output:resizeAs(input):copy(input) - if self.train then - self.noise = self.noise or self.output.new() - self.alphas = self.alphas or self.output.new() - self._tempNoise = self._tempNoise or self.output.new() - self._tempNoiseExpanded = self._tempNoiseExpanded or self.output.new() - self._tempNoiseSamples = self._tempNoiseSamples or self.output.new() - self._tempLambda = self._tempLambda or self.output.new() - self._tempLambdaExpanded = self._tempLambdaExpanded or self.output.new() - - if self.output:nDimension() == 4 then - local batchSize = self.output:size(1) - local channels = self.output:size(2) - local height = self.output:size(3) - local width = self.output:size(4) - assert(channels == self.inputChannels) - - -- Randomly sample noise for each channel and scale by eigen values - self.alphas:resize(channels, batchSize) - self.alphas:normal(0, self.std) - self._tempLambda = self.eigenValues:view(self.inputChannels, 1) - self._tempLambdaExpanded = self._tempLambda:expand(channels, batchSize) - self.alphas:cmul(self._tempLambdaExpanded) - - -- Scale by eigen vectors - self.noise:resize(batchSize, self.inputChannels):zero() - self.noise:t():addmm(self.eigenVectors, self.alphas) - - -- Add noise to the input - self._tempNoise = self.noise:view(batchSize, self.inputChannels, 1, 1) - self._tempNoiseExpanded:expand(self._tempNoise, batchSize, - channels, height, width) - self.output:add(self._tempNoiseExpanded) - - elseif self.output:nDimension() == 3 then - local channels = self.output:size(1) - local height = self.output:size(2) - local width = self.output:size(3) - assert(channels == self.inputChannels) - - -- Randomly sample noise for each channel and scale by eigen values - self.alphas:resize(channels, 1) - self.alphas:normal(0, self.std) - self._tempLambda = self.eigenValues:view(self.inputChannels, 1) - self._tempLambdaExpanded = self._tempLambda:expand(channels, 1) - self.alphas:cmul(self._tempLambdaExpanded) - - -- Scale by eigen vectors - self.noise:resize(1, self.inputChannels):zero() - self.noise:t():addmm(self.eigenVectors, self.alphas) - - -- Add noise to the input - self._tempNoise = self.noise:view(self.inputChannels, 1, 1) - self._tempNoiseExpanded:expand(self._tempNoise, channels, - height, width) - self.output:add(self._tempNoiseExpanded) - else - error("Invalid input dimensionality.") - end - end - return self.output -end - -function PCAColorTransform:updateGradInput(input, gradOutput) - if self.train then - self.gradInput:resizeAs(gradOutput):copy(gradOutput) - else - error('backprop only defined while training') - end - return self.gradInput -end - -function PCAColorTransform:type(type, tensorCache) - self.noise = nil - self.alphas = nil - self._tempLambda = nil - self._tempLambdaExpanded = nil - self._tempNoise = nil - 
self._tempNoiseExpanded = nil
-   Parent.type(self, type, tensorCache)
-end
-
-function PCAColorTransform:__tostring__()
-   return string.format('%s channels: %d, std: %f', torch.type(self),
-                        self.inputChannels, self.std)
-end
diff --git a/Padding.lua b/Padding.lua
deleted file mode 100644
index b3b8f03..0000000
--- a/Padding.lua
+++ /dev/null
@@ -1,52 +0,0 @@
-local Padding, parent
-if nn.Padding then -- prevent name conflicts with nnx
-   Padding, parent = nn.Padding, nn.Module
-else
-   Padding, parent = torch.class('nn.Padding', 'nn.Module')
-end
-
--- pad can be positive (right) or negative (left)
-function Padding:__init(dim, pad, nInputDim, value)
-   self.dim = dim
-   self.pad = pad
-   self.nInputDim = nInputDim
-   self.value = value or 0
-   self.outputSize = torch.LongStorage()
-   parent.__init(self)
-end
-
-function Padding:updateOutput(input)
-   self.outputSize:resize(input:dim())
-   self.outputSize:copy(input:size())
-   local dim = self.dim
-   if self.nInputDim and input:dim() ~= self.nInputDim then
-      dim = dim + 1
-   end
-   self.outputSize[dim] = self.outputSize[dim] + math.abs(self.pad)
-   self.output:resize(self.outputSize)
-   self.output:fill(self.value)
-   local outputWindow
-   if self.pad > 0 then
-      outputWindow = self.output:narrow(dim, 1, input:size(dim))
-   else
-      outputWindow = self.output:narrow(dim, 1 - self.pad, input:size(dim))
-   end
-   outputWindow:copy(input)
-   return self.output
-end
-
-function Padding:updateGradInput(input, gradOutput)
-   self.gradInput:resizeAs(input)
-   local dim = self.dim
-   if self.nInputDim and input:dim() ~= self.nInputDim then
-      dim = dim + 1
-   end
-   local gradOutputWindow
-   if self.pad > 0 then
-      gradOutputWindow = gradOutput:narrow(dim, 1, input:size(dim))
-   else
-      gradOutputWindow = gradOutput:narrow(dim, 1 - self.pad, input:size(dim))
-   end
-   -- the gradient w.r.t. the input is simply the matching window of gradOutput
-   self.gradInput:copy(gradOutputWindow)
-   return self.gradInput
-end
diff --git a/README.md b/README.md
index 63123fb..bea88cf 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@ This library includes documentation for the following objects:
 Modules that consider successive calls to `forward` as different time-steps in a sequence :
  * [AbstractRecurrent](#rnn.AbstractRecurrent) : an abstract class inherited by `Recurrence` and `RecLSTM`;
  * [Recurrence](#rnn.Recurrence) : decorates a module that outputs `output(t)` given `{input(t), output(t-1)}`;
- * [LookupRNN](#rnn.LookupRNN): implements a simple RNN where the input layer is a `LookupTable`;
- * [LinearRNN](#rnn.LinearRNN): implements a simple RNN where the input layer is a `Linear`;
+ * [LookupRNN](#rnn.LookupRNN): implements a simple RNN where the input layer is a `LookupTable`;
+ * [LinearRNN](#rnn.LinearRNN): implements a simple RNN where the input layer is a `Linear`;
  * [RecLSTM](#rnn.RecLSTM) : an LSTM that can be used for real-time RNNs;
  * [RecGRU](#rnn.RecGRU) : a GRU that can be used for real-time RNNs;
  * [Recursor](#rnn.Recursor) : decorates a module to make it conform to the [AbstractRecurrent](#rnn.AbstractRecurrent) interface;
@@ -21,8 +21,8 @@ Modules that `forward` entire sequences through a decorated `AbstractRecurrent`
  * [SeqLSTM](#rnn.SeqLSTM) : a faster version of `nn.Sequencer(nn.RecLSTM)` where the `input` and `output` are tensors;
  * [SeqGRU](#rnn.SeqGRU) : a faster version of `nn.Sequencer(nn.RecGRU)` where the `input` and `output` are tensors;
  * [BiSequencer](#rnn.BiSequencer) : used for implementing Bidirectional RNNs;
- * [SeqBLSTM](#rnn.SeqBLSTM) : bidirectional LSTM that uses two `SeqLSTMs` internally;
- * [SeqBGRU](#rnn.SeqBGRU) : bidirectional GRU that uses two `SeqGRUs` internally;
+ * [SeqBLSTM](#rnn.SeqBLSTM) : bidirectional LSTM that uses two `SeqLSTMs` internally;
+ * [SeqBGRU](#rnn.SeqBGRU) : bidirectional GRU that uses two `SeqGRUs` internally;
  * [Repeater](#rnn.Repeater) : repeatedly applies the same input to an `AbstractRecurrent` instance;
  * [RecurrentAttention](#rnn.RecurrentAttention) : a generalized attention model for [REINFORCE modules](https://github.com/nicholas-leonard/dpnn#nn.Reinforce);
@@ -34,6 +34,11 @@ Miscellaneous modules and criterions :
  * [MaskZeroCriterion](#rnn.MaskZeroCriterion) : zeros the `gradInput` and `loss` rows of the decorated criterion for commensurate
    * `input` rows which are tensors of zeros (version 1);
    * `zeroMask` elements which are 1 (version 2);
+ * [ReverseSequence](#nn.ReverseSequence) : reverses the order of elements in a sequence (table or tensor);
+ * [ReverseUnreverse](#nn.ReverseUnreverse) : used internally by `nn.BiSequencer` for decorating the `bwd` RNN;
+ * [SpatialGlimpse](#nn.SpatialGlimpse) : takes a foveated glimpse of an image at a given location;
+ * [NCEModule](#nn.NCEModule) : optimized placeholder for a `Linear` + `SoftMax` using [noise-contrastive estimation](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf) (a minimal usage sketch follows these lists);
+ * [NCECriterion](#nn.NCECriterion) : criterion exclusively used with [NCEModule](#nn.NCEModule);
  * [VariableLength](#rnn.VariableLength): decorates a `Sequencer` to accept and produce a table of variable length inputs and outputs;
 
 Criterions used for handling sequential inputs and targets :
@@ -41,39 +46,6 @@ Criterions used for handling sequential inputs and targets :
  * [SequencerCriterion](#rnn.SequencerCriterion) : sequentially applies the same criterion to a sequence of inputs and targets;
  * [RepeaterCriterion](#rnn.RepeaterCriterion) : repeatedly applies the same criterion with the same target on a sequence.
-
-This package also provides many useful features that aren't part of the main nn package.
-These include [sharedClone](#nn.Module.sharedClone), which allows you to clone a module and share
-parameters or gradParameters with the original module, without incurring any memory overhead.
-We also redefined [type](#nn.Module.type) such that the type-cast preserves Tensor sharing within a structure of modules.
-
-The package provides the following Modules:
-
- * [Serial](#nn.Serial) : decorates a module to make its serialized output more compact;
- * [Inception](#nn.Inception) : implements the Inception module of the GoogleLeNet article;
- * [Collapse](#nn.Collapse) : just like `nn.View(-1)`;
- * [Convert](#nn.Convert) : convert between different tensor types or shapes;
- * [ZipTable](#nn.ZipTable) : zip a table of tables into a table of tables;
- * [ZipTableOneToMany](#nn.ZipTableOneToMany) : zip a table of element `el` and table of elements into a table of pairs of element `el` and table elements;
- * [CAddTensorTable](#nn.CAddTensorTable) : adds a tensor to a table of tensors of the same size;
- * [ReverseSequence](#nn.ReverseSequence) : reverse the order of elements in a sequence (table or tensor);
- * [ReverseUnreverse](#nn.ReverseUnreverse) : used internally by `nn.BiSequencer` for decorating `bwd` RNN.
- * [PrintSize](#nn.PrintSize) : prints the size of inputs and gradOutputs (useful for debugging);
- * [Clip](#nn.Clip) : clips the inputs to a min and max value;
- * [Constant](#nn.Constant) : outputs a constant value given an input (which is ignored);
- * [SpatialUniformCrop](#nn.SpatialUniformCrop) : uniformly crops patches from an input;
- * [SpatialGlimpse](#nn.SpatialGlimpse) : takes a foveated glimpse of an image at a given location;
- * [WhiteNoise](#nn.WhiteNoise) : adds isotropic Gaussian noise to the signal when in training mode;
- * [OneHot](#nn.OneHot) : transforms a tensor of indices into [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding;
- * [Kmeans](#nn.Kmeans) : [Kmeans](https://en.wikipedia.org/wiki/K-means_clustering) clustering layer. Forward computes distances with respect to centroids and returns index of closest centroid. Centroids can be updated using gradient descent. Centroids could be initialized randomly or by using the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B) algorithm;
- * [SpatialRegionDropout](#nn.SpatialRegionDropout) : randomly drops out a region (top, bottom, leftmost, rightmost) of the input image. Works with batch and any number of channels;
- * [FireModule](#nn.FireModule) : FireModule as mentioned in the [SqueezeNet](http://arxiv.org/pdf/1602.07360v1.pdf);
- * [NCEModule](#nn.NCEModule) : optimized placeholder for a `Linear` + `SoftMax` using [noise-contrastive estimation](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf).
- * [SpatialFeatNormalization](#nn.SpatialFeatNormalization) : Module for widely used preprocessing step of mean zeroing and standardization for images.
- * [SpatialBinaryConvolution](#nn.SpatialBinaryConvolution) : Module for binary spatial convolution (binary weights) as mentioned in [XNOR-Net](http://arxiv.org/pdf/1603.05279v2.pdf).
- * [SimpleColorTransform](#nn.SimpleColorTransform) : Module for adding independent random noise to input image channels.
- * [PCAColorTransform](#nn.PCAColorTransform) : Module for adding noise to input image using Principal Components Analysis.
-
 The following modules and criterions can be used to implement the REINFORCE algorithm :
 
 * [Reinforce](#nn.Reinforce) : abstract class for REINFORCE modules;
@@ -84,14 +56,6 @@ The following modules and criterions can be used to implement the REINFORCE algo
 * [VRClassReward](#nn.VRClassReward) : criterion for variance-reduced classification-based reward;
 * [BinaryClassReward](#nn.BinaryClassReward) : criterion for variance-reduced binary classification reward (like `VRClassReward`, but for binary classes);
-
-Additional differentiable criterions
- * [BinaryLogisticRegression](#nn.BLR) : criterion for binary logistic regression;
- * [SpatialBinaryLogisticRegression](#nn.SpatialBLR) : criterion for pixel-wise binary logistic regression;
- * [NCECriterion](#nn.NCECriterion) : criterion exclusively used with [NCEModule](#nn.NCEModule);
- * [ModuleCriterion](#nn.ModuleCriterion) : adds an optional `inputModule` and `targetModule` before a decorated criterion;
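+As a quick orientation for the [NCEModule](#nn.NCEModule) and [NCECriterion](#nn.NCECriterion)
+pair listed above, here is a minimal, illustrative sketch. The sizes, the uniform
+`unigrams` tensor and the number of noise samples `k` are assumptions made for the
+example, not values taken from this patch:
+
+```lua
+local inputsize, nclass, k = 200, 10000, 25          -- assumed sizes
+local unigrams = torch.Tensor(nclass):fill(1/nclass) -- assumed noise distribution
+local nce = nn.NCEModule(inputsize, nclass, k, unigrams)
+local crit = nn.NCECriterion()
+
+local input = torch.randn(8, inputsize)              -- e.g. an RNN's hidden state
+local target = torch.LongTensor(8):random(nclass)
+
+-- in training mode, NCEModule takes {input, target} and outputs the table of
+-- model/noise probabilities that NCECriterion consumes
+local output = nce:forward{input, target}
+local loss = crit:forward(output, target)
+```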
-
 ## Examples ##
 
@@ -122,7 +86,6 @@ If you are using CUDA :
 ```bash
 luarocks install cutorch
 luarocks install cunn
-luarocks install cunnx
 ```
 
 And don't forget to update this package :
@@ -130,7 +93,7 @@ And don't forget to update this package :
 luarocks install rnn
 ```
 
-If that doesn't fix it, open and issue on github.
+If that doesn't fix it, open an issue on github.
 
 ## AbstractRecurrent ##
@@ -1544,224 +1507,6 @@ i.e. each example in a batch has its own scalar reward.
 Refer to [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)
 for a complete training script making use of the REINFORCE interface.
-
-## Serial ##
-
-```lua
-dmodule = nn.Serial(module, [tensortype])
-dmodule:[light,medium,heavy]Serial()
-```
-
-This module is a decorator that can be used to control the serialization/deserialization
-behavior of the encapsulated module. Basically, it makes the resulting string or
-file heavy (the default), medium or light in terms of size.
-
-Furthermore, when specified, the `tensortype` attribute (e.g. *torch.FloatTensor*, *torch.DoubleTensor* and so on)
-determines what type the module will be cast to during serialization.
-Note that this will also be the type of the deserialized object.
-The default serialization `tensortype` is `nil`, i.e. the module is serialized as is.
-
-The `heavySerial()` has the serialization process serialize every attribute in the module graph,
-which is the default behavior of nn.
-
-The `mediumSerial()` has the serialization process serialize
-everything except the attributes specified in each module's `dpnn_mediumEmpty`
-table, which has a default value of `{'output', 'gradInput', 'momGradParams', 'dpnn_input'}`.
-During serialization, whether they be tables or Tensors, these attributes are emptied (no storage).
-Some modules overwrite the default `Module.dpnn_mediumEmpty` static attribute with their own.
-
-The `lightSerial()` has the serialization process empty
-everything a call to `mediumSerial(type)` would (so it uses `dpnn_mediumEmpty`).
-But it also empties all the parameter gradients specified by the
-attribute `dpnn_gradParameters`, which defaults to `{gradWeight, gradBias}`.
-
-We recommend using `mediumSerial()` for training, and `lightSerial()` for
-production (feed-forward-only models).
-
-
-## Inception ##
-References :
-
- * A. [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842)
- * B. [GoogleLeNet](http://image-net.org/challenges/LSVRC/2014/slides/GoogLeNet.pptx)
-
-```lua
-module = nn.Inception(config)
-```
-
-This module uses `n`+2 parallel "columns".
-The original paper uses 2+2 where the first two are (but there could be more than two):
-
- * 1x1 conv (reduce) -> relu -> 5x5 conv -> relu
- * 1x1 conv (reduce) -> relu -> 3x3 conv -> relu
-
-and where the other two are :
-
- * 3x3 maxpool -> 1x1 conv (reduce/project) -> relu
- * 1x1 conv (reduce) -> relu.
-
-This module allows the first group of columns to be of any
-number, while the last group consists of exactly two columns.
-The 1x1 convolutions are used to reduce the number of input channels
-(or filters) such that the capacity of the network doesn't explode.
-We refer to these here as *reduce*.
-Since each column has one and only one reduce, their initial
-configuration options are specified in lists of n+2 elements.
-
-The sole argument `config` is a table taking the following key-values :
-
- * Required Arguments :
-  * `inputSize` : number of input channels or colors, e.g. 3;
-  * `outputSize` : numbers of filters in the non-1x1 convolution kernel sizes, e.g. `{32,48}`
-  * `reduceSize` : numbers of filters in the 1x1 convolutions (reduction) used in each column, e.g. `{48,64,32,32}`. The last 2 are used respectively for the max pooling (projection) column (the last column in the paper) and the column that has nothing but a 1x1 conv (the first column in the paper). This table should have two more elements than `outputSize`
- * Optional Arguments :
-  * `reduceStride` : strides of the 1x1 (reduction) convolutions. Defaults to `{1,1,...}`.
-  * `transfer` : transfer function like `nn.Tanh`, `nn.Sigmoid`, `nn.ReLU`, `nn.Identity`, etc. It is used after each reduction (1x1 convolution) and convolution. Defaults to `nn.ReLU`.
-  * `batchNorm` : set this to `true` to use batch normalization. Defaults to `false`. Note that batch normalization can substantially improve convergence.
-  * `padding` : set this to `true` to add padding to the input of the convolutions such that output width and height are the same as that of the original non-padded `input`. Defaults to `true`.
-  * `kernelSize` : size (`height = width`) of the non-1x1 convolution kernels. Defaults to `{5,3}`.
-  * `kernelStride` : stride of the kernels (`height = width`) of the convolution. Defaults to `{1,1}`.
-  * `poolSize`: size (`height = width`) of the spatial max pooling used in the next-to-last column. Defaults to 3.
-  * `poolStride` : stride (`height = width`) of the spatial max pooling. Defaults to 1.
-
-
-For a complete example using this module, refer to the following :
- * [deep inception training script](https://github.com/nicholas-leonard/dp/blob/master/examples/deepinception.lua) ;
- * [openface facial recognition](https://github.com/cmusatyalab/openface) (the model definition is [here](https://github.com/cmusatyalab/openface/blob/master/models/openface/nn4.def.lua)).
-
-
-## Collapse ##
-
-```lua
-module = nn.Collapse(nInputDim)
-```
-
-This module is the equivalent of:
-```
-view = nn.View(-1)
-view:setNumInputDim(nInputDim)
-```
-It collapses all non-batch dimensions. This is useful for converting
-a spatial feature map to the single dimension required by a dense
-hidden layer like Linear.
-
-
-## Convert ##
-
-```lua
-module = nn.Convert([inputShape, outputShape])
-```
-Module to convert between different data formats.
-For example, we can flatten images by using :
-```lua
-module = nn.Convert('bchw', 'bf')
-```
-or equivalently
-```lua
-module = nn.Convert('chw', 'f')
-```
-Let's try it with an input:
-```lua
-print(module:forward(torch.randn(3,2,3,1)))
- 0.5692 -0.0190  0.5243  0.7530  0.4230  1.2483
--0.9142  0.6013  0.5608 -1.0417 -1.4014  1.0177
--1.5207 -0.1641 -0.4166  1.4810 -1.1725 -1.0037
-[torch.DoubleTensor of size 3x6]
-```
-You could also try:
-
-```lua
-module = nn.Convert('chw', 'hwc')
-input = torch.randn(1,2,3,2)
-input:select(2,1):fill(1)
-input:select(2,2):fill(2)
-print(input)
-(1,1,.,.) =
-  1  1
-  1  1
-  1  1
-(1,2,.,.) =
-  2  2
-  2  2
-  2  2
-[torch.DoubleTensor of size 1x2x3x2]
-print(module:forward(input))
-(1,1,.,.) =
-  1  2
-  1  2
-
-(1,2,.,.) =
-  1  2
-  1  2
-
-(1,3,.,.) =
-  1  2
-  1  2
-[torch.DoubleTensor of size 1x3x2x2]
-```
-
-
-Furthermore, it automatically converts the `input` to have the same type as `self.output`
-(i.e. the type of the module).
-So you can also just use it for automatic input type conversions:
-```lua
-module = nn.Convert()
-print(module.output) -- type of module
-[torch.DoubleTensor with no dimension]
-input = torch.FloatTensor{1,2,3}
-print(module:forward(input))
- 1
- 2
- 3
-[torch.DoubleTensor of size 3]
-```
-
-
-## ZipTable ##
-
-```lua
-module = nn.ZipTable()
-```
-
-Zips a table of tables into a table of tables.
-
-Example:
-```lua
-print(module:forward{ {'a1','a2'}, {'b1','b2'}, {'c1','c2'} })
-{ {'a1','b1','c1'}, {'a2','b2','c2'} }
-```
-
-
-## ZipTableOneToMany ##
-
-```lua
-module = nn.ZipTableOneToMany()
-```
-
-Zips a table of element `el` and a table of elements `tab` into a table of tables, where the i-th table contains the element `el` and the i-th element in table `tab`.
-
-Example:
-```lua
-print(module:forward{ 'el', {'a','b','c'} })
-{ {'el','a'}, {'el','b'}, {'el','c'} }
-```
-
-
-## CAddTensorTable ##
-
-```lua
-module = nn.CAddTensorTable()
-```
-
-Adds the first element `el` of the input table `tab` to each tensor contained in the second element of `tab`, which is itself a table.
-
-Example:
-```lua
-print(module:forward{ (0,1,1), {(0,0,0),(1,1,1)} })
-{ (0,1,1), (1,2,2) }
-```
-
-
 ## ReverseSequence ##
@@ -1805,67 +1550,6 @@ Then the `input` sequences are forwarded (in reverse order) through the `sequenc
 The resulting `sequencer.output` sequences are reversed with respect to the `input`.
 Before being returned to the caller, these are unreversed using another `ReverseSequence`.
-
-## PrintSize ##
-
-```lua
-module = nn.PrintSize(name)
-```
-
-This module is useful for debugging complicated module composites.
-It prints the size of the `input` and `gradOutput` during `forward`
-and `backward` propagation respectively.
-The `name` is a string used to identify the module alongside the printed size.
-
-
-## Clip ##
-
-```lua
-module = nn.Clip(minval, maxval)
-```
-
-This module clips `input` values such that the output is between `minval` and `maxval`.
-
-
-## Constant ##
-
-```lua
-module = nn.Constant(value, nInputDim)
-```
-
-This module outputs a constant value given an input.
-If `nInputDim` is specified, it uses the input to determine the size of the batch.
-The `value` is then replicated over the batch.
-Otherwise, the `value` Tensor is output as is.
-During `backward`, the returned `gradInput` is a zero Tensor of the same size as the `input`.
-This module has no trainable parameters.
-
-You can use this with nn.ConcatTable() to append constant inputs to an input :
-
-```lua
-nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity())
-```
-
-This is useful when you want to output a value that is independent of the
-input to the neural network (see [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)).
-
-
-## SpatialUniformCrop ##
-
-```lua
-module = nn.SpatialUniformCrop(oheight, owidth)
-```
-
-During training, this module will output a cropped patch of size `oheight, owidth`
-within the boundaries of the `input` image.
-For each example, a location is sampled from a uniform distribution
-such that each possible patch has an equal probability of being sampled.
-
-During evaluation, the center patch is cropped and output.
-
-This module is commonly used at the input layer to artificially
-augment the size of the dataset to prevent overfitting.
-
 ## SpatialGlimpse ##
 
 Ref. A. [Recurrent Model for Visual Attention](http://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf)
@@ -1892,187 +1576,6 @@ on a region of the input `image`.
 It is commonly used with the [RecurrentAttention](https://github.com/Element-Research/rnn#rnn.RecurrentAttention)
 module (see [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)).
-
-
-## WhiteNoise ##
-
-```lua
-module = nn.WhiteNoise([mean, stdev])
-```
-
-Useful in training [Denoising Autoencoders](http://arxiv.org/pdf/1507.02672v1.pdf).
-Takes `mean` and `stdev` of the normal distribution as input.
-Default values for mean and standard deviation are 0 and 0.1 respectively.
-With `module:training()`, noise is added during forward.
-During `backward`, gradients are passed through as-is.
-With `module:evaluate()`, the mean is added to the input.
-
-
-## SpatialRegionDropout ##
-
-```lua
-module = nn.SpatialRegionDropout(p)
-```
-Following is an example of `SpatialRegionDropout` outputs on the famous lena image.
-
-**Input**
-
-![Lena](tutorials/lena.jpg)
-
-**Outputs**
-
-![Lena](tutorials/srd1.jpg) ![Lena](tutorials/srd2.jpg)
-
-
-## FireModule ##
-Ref: http://arxiv.org/pdf/1602.07360v1.pdf
-```lua
-module = nn.FireModule(nInputPlane, s1x1, e1x1, e3x3, activation)
-```
-FireModule is composed of two submodules: 1) a *squeeze* convolution module comprised of `1x1` filters, followed by 2) an *expand* module that is comprised of a mix of `1x1` and `3x3` convolution filters.
-Arguments: `s1x1`: number of `1x1` filters in the squeeze submodule, `e1x1`: number of `1x1` filters in the expand submodule, `e3x3`: number of `3x3` filters in the expand submodule. It is recommended that `s1x1` be less than `(e1x1+e3x3)` if you want to limit the number of input channels to the `3x3` filters in the expand submodule.
-FireModule works only with batches; for a single sample, convert the sample to a batch of size 1.
-
-
-## SpatialFeatNormalization ##
-```lua
-module = nn.SpatialFeatNormalization(mean, std)
-```
-This module normalizes each feature channel of the input image based on its corresponding mean and standard deviation scalar values. This module does not learn the `mean` and `std`; they are provided as arguments.
-
-
-## SpatialBinaryConvolution ##
-
-```lua
-module = nn.SpatialBinaryConvolution(nInputPlane, nOutputPlane, kW, kH)
-```
-The functioning of SpatialBinaryConvolution is similar to nn/SpatialConvolution. The only difference is that binary weights are used for forward/backward, while floating point weights are used for weight updates. Check the **Binary-Weight-Network** section of [XNOR-net](http://arxiv.org/pdf/1603.05279v2.pdf).
-
-
-## SimpleColorTransform ##
-
-```lua
-range = torch.rand(inputChannels) -- Typically range is specified by the user.
-module = nn.SimpleColorTransform(inputChannels, range)
-```
-This module performs a simple data augmentation technique. The SimpleColorTransform module adds random noise to each color channel independently. In a more advanced data augmentation technique, noise is added using the principal components of the color channels. For that, please check **PCAColorTransform**.
-
-
-## PCAColorTransform ##
-
-```lua
-eigenVectors = torch.rand(inputChannels, inputChannels) -- Eigen Vectors
-eigenValues = torch.rand(inputChannels) -- Eigen Values
-std = 0.1 -- Std deviation of normal distribution with mean zero for noise.
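--- illustrative values: in practice, `eigenVectors` and `eigenValues` come from
--- a PCA of the training set's pixel values (as in the AlexNet paper cited below).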
-module = nn.PCAColorTransform(inputChannels, eigenVectors, eigenValues, std)
-```
-This module performs data augmentation using Principal Component Analysis of pixel values. When in training mode, multiples of principal components are added to input image pixels. The magnitude of the value added (noise) is dependent upon the corresponding eigen value and a random value sampled from a Gaussian distribution with mean zero and `std` (default 0.1) standard deviation. This technique was used in the famous [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) paper.
-
-
-## OneHot ##
-
-```lua
-module = nn.OneHot(outputSize)
-```
-
-Transforms a tensor of `input` indices having integer values between 1 and `outputSize` into
-a tensor of one-hot vectors of size `outputSize`.
-
-Forward an index to get a one-hot vector :
-
-```lua
-> module = nn.OneHot(5) -- 5 classes
-> module:forward(torch.LongTensor{3})
- 0  0  1  0  0
-[torch.DoubleTensor of size 1x5]
-```
-
-Forward a batch of 3 indices. Notice that these need not be stored as `torch.LongTensor` :
-
-```lua
-> module:forward(torch.Tensor{3,2,1})
- 0  0  1  0  0
- 0  1  0  0  0
- 1  0  0  0  0
-[torch.DoubleTensor of size 3x5]
-```
-
-Forward a batch of `2 x 3` indices :
-
-```lua
-oh:forward(torch.Tensor{{3,2,1},{1,2,3}})
-(1,.,.) =
-  0  0  1  0  0
-  0  1  0  0  0
-  1  0  0  0  0
-
-(2,.,.) =
-  1  0  0  0  0
-  0  1  0  0  0
-  0  0  1  0  0
-[torch.DoubleTensor of size 2x3x5]
-```
-
-
-## Kmeans ##
-
-```lua
-km = nn.Kmeans(k, dim)
-```
-
-`k` is the number of centroids and `dim` is the dimensionality of samples.
-You can initialize centroids either randomly from input samples or by using the *kmeans++* algorithm.
-
-```lua
-km:initRandom(samples) -- Randomly initialize centroids from input samples.
-km:initKmeansPlus(samples) -- Use Kmeans++ to initialize centroids.
-```
-
-Example showing how to use the Kmeans module to do standard Kmeans clustering.
-
-```lua
-attempts = 10
-iter = 100 -- Number of iterations
-bestKm = nil
-bestLoss = math.huge
-learningRate = 1
-for j=1, attempts do
-   local km = nn.Kmeans(k, dim)
-   km:initKmeansPlus(samples)
-   for i=1, iter do
-      km:zeroGradParameters()
-      km:forward(samples) -- sets km.loss
-      km:backward(samples, gradOutput) -- gradOutput is ignored
-
-      -- Gradient Descent weight/centroids update
-      km:updateParameters(learningRate)
-   end
-
-   if km.loss < bestLoss then
-      bestLoss = km.loss
-      bestKm = km:clone()
-   end
-end
-```
-The `nn.Kmeans()` module maintains the loss only for the latest forward. If you want to maintain the loss over the whole dataset, you would need to do it yourself by accumulating the module loss after every forward.
-
-You can also use `nn.Kmeans()` as an auxiliary layer in your network.
-A call to `forward` will generate an `output` containing the index of the nearest cluster for each sample in the batch.
-The `gradInput` generated by `updateGradInput` will be zero.
-
-
-## ModuleCriterion ##
-
-```lua
-criterion = nn.ModuleCriterion(criterion [, inputModule, targetModule, castTarget])
-```
-
-This criterion decorates a `criterion` by allowing the `input` and `target` to be
-fed through an optional `inputModule` and `targetModule` before being passed to the
-`criterion`. The `inputModule` must not contain parameters as these would not be updated.
-
-When `castTarget = true` (the default), the `targetModule` is cast along with the `inputModule` and
-`criterion`. Otherwise, the `targetModule` isn't.
-
 ## NCEModule
 
 Ref. A [RNNLM training with NCE for Speech Recognition](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf)
@@ -2350,40 +1853,3 @@ So basically, the `input` is still a table of two tensors.
 The first input tensor is of size `batchsize` containing Bernoulli probabilities.
 The second input tensor is the baseline prediction described in `VRClassReward`.
 The targets contain zeros and ones.
-
-
-## BinaryLogisticRegression ##
-Ref A. [Learning to Segment Object Candidates](http://arxiv.org/pdf/1506.06204v2.pdf)
-This criterion implements the score criterion mentioned in (ref. A).
-
-```lua
-criterion = nn.BinaryLogisticRegression()
-```
-
-BinaryLogisticRegression implements the following cost function for binary classification.
-
-```
- log( 1 + exp( -y_k * score(x_k) ) )
-
-```
-where `y_k` is the binary target and `score(x_k)` is the corresponding prediction. `y_k` has value `{-1, +1}` and `score(x_k)` has value in `[-1, +1]`.
-
-
-## SpatialBinaryLogisticRegression ##
-Ref A. [Learning to Segment Object Candidates](http://arxiv.org/pdf/1506.06204v2.pdf)
-
-This criterion implements the spatial component of the criterion mentioned in (ref. A).
-
-```lua
-criterion = nn.SpatialBinaryLogisticRegression()
-```
-
-SpatialBinaryLogisticRegression implements the following cost function for binary pixel classification.
-```
- ( 1 / (2*w*h) ) * sum_ij [ log( 1 + exp( -m_ij * f_ij ) ) ]
-```
-where `m_ij` is the target binary image and `f_ij` is the corresponding prediction. `m_ij` has value `{-1, +1}` and `f_ij` has value in `[-1, +1]`.
-
diff --git a/RecGRU.lua b/RecGRU.lua
index 76ba8b5..f25baa6 100644
--- a/RecGRU.lua
+++ b/RecGRU.lua
@@ -27,7 +27,6 @@ function RecGRU:_updateOutput(input)
    -- output(t) = gru{input(t), output(t-1)}
    local output
    if self.train ~= false then
-      self:recycle()
       local stepmodule = self:getStepModule(self.step)
       output = stepmodule:updateOutput({input, prevOutput})
    else
diff --git a/RecLSTM.lua b/RecLSTM.lua
index cf1303c..17cbc02 100644
--- a/RecLSTM.lua
+++ b/RecLSTM.lua
@@ -33,7 +33,6 @@ function RecLSTM:_updateOutput(input)
    -- output(t), cell(t) = lstm{input(t), output(t-1), cell(t-1)}
    local output, cell
    if self.train ~= false then
-      self:recycle()
       local stepmodule = self:getStepModule(self.step)
       -- the actual forward propagation
       output, cell = unpack(stepmodule:updateOutput{input, prevOutput, prevCell})
diff --git a/Recurrence.lua b/Recurrence.lua
index 71dc4fd..dbf0bd8 100644
--- a/Recurrence.lua
+++ b/Recurrence.lua
@@ -81,7 +81,6 @@ function Recurrence:_updateOutput(input)
    -- output(t) = stepmodule:forward{input(t), output(t-1)}
    local output
    if self.train ~= false then
-      self:recycle()
       local stepmodule = self:getStepModule(self.step)
       -- the actual forward propagation
       output = stepmodule:updateOutput{input, prevOutput}
diff --git a/Recursor.lua b/Recursor.lua
index ad59c7f..bfe8278 100644
--- a/Recursor.lua
+++ b/Recursor.lua
@@ -9,8 +9,6 @@ local Recursor, parent = torch.class('nn.Recursor', 'nn.AbstractRecurrent')
 function Recursor:_updateOutput(input)
    local output
    if self.train ~= false then -- if self.train or self.train == nil then
-      -- set/save the output states
-      self:recycle()
       local stepmodule = self:getStepModule(self.step)
       output = stepmodule:updateOutput(input)
    else
diff --git a/Sequencer.lua b/Sequencer.lua
index faacfd1..7bb93ad 100644
--- a/Sequencer.lua
+++ b/Sequencer.lua
@@ -18,9 +18,8 @@ function Sequencer:__init(module)
    end
 
    -- we can decorate the module with a Recursor to make it AbstractRecurrent
-   self.module = (not torch.isTypeOf(module, 'nn.AbstractRecurrent')) and 
nn.Recursor(module) or module -- backprop through time (BPTT) will be done online (in reverse order of forward) - self.modules = {self.module} + self.modules = {(not torch.isTypeOf(module, 'nn.AbstractRecurrent')) and nn.Recursor(module) or module} self.output = {} self.tableoutput = {} @@ -28,10 +27,6 @@ function Sequencer:__init(module) -- table of buffers used for evaluation self._output = {} - -- so that these buffers aren't serialized : - local _ = require 'moses' - self.dpnn_mediumEmpty = _.clone(self.dpnn_mediumEmpty) - table.insert(self.dpnn_mediumEmpty, '_output') -- default is to forget previous inputs before each forward() self._remember = 'neither' end @@ -46,16 +41,16 @@ function Sequencer:updateOutput(input) end -- Note that the Sequencer hijacks the seqlen attribute of the rnn - self.module:maxBPTTstep(nStep) + self.modules[1]:maxBPTTstep(nStep) if self.train ~= false then -- TRAINING if not (self._remember == 'train' or self._remember == 'both') then - self.module:forget() + self.modules[1]:forget() end self.tableoutput = {} for step=1,nStep do - self.tableoutput[step] = self.module:updateOutput(input[step]) + self.tableoutput[step] = self.modules[1]:updateOutput(input[step]) end if torch.isTensor(input) then @@ -70,13 +65,13 @@ function Sequencer:updateOutput(input) else -- EVALUATION if not (self._remember == 'eval' or self._remember == 'both') then - self.module:forget() + self.modules[1]:forget() end -- during evaluation, recurrent modules reuse memory (i.e. outputs) -- so we need to copy each output into our own table or tensor if torch.isTensor(input) then for step=1,nStep do - local output = self.module:updateOutput(input[step]) + local output = self.modules[1]:updateOutput(input[step]) if step == 1 then self.output = torch.isTensor(self.output) and self.output or output.new() self.output:resize(nStep, unpack(output:size():totable())) @@ -87,7 +82,7 @@ function Sequencer:updateOutput(input) for step=1,nStep do self.tableoutput[step] = nn.utils.recursiveCopy( self.tableoutput[step] or table.remove(self._output, 1), - self.module:updateOutput(input[step]) + self.modules[1]:updateOutput(input[step]) ) end -- remove extra output tensors (save for later) @@ -117,7 +112,7 @@ function Sequencer:updateGradInput(input, gradOutput) -- back-propagate through time self.tablegradinput = {} for step=nStep,1,-1 do - self.tablegradinput[step] = self.module:updateGradInput(input[step], gradOutput[step]) + self.tablegradinput[step] = self.modules[1]:updateGradInput(input[step], gradOutput[step]) end if torch.isTensor(input) then @@ -147,7 +142,7 @@ function Sequencer:accGradParameters(input, gradOutput, scale) -- back-propagate through time for step=nStep,1,-1 do - self.module:accGradParameters(input[step], gradOutput[step], scale) + self.modules[1]:accGradParameters(input[step], gradOutput[step], scale) end end @@ -189,7 +184,7 @@ function Sequencer:clearState() self._output = {} self.tableoutput = {} self.tablegradinput = {} - self.module:clearState() + self.modules[1]:clearState() end Sequencer.__tostring__ = nn.Decorator.__tostring__ diff --git a/Serial.lua b/Serial.lua deleted file mode 100644 index b597de9..0000000 --- a/Serial.lua +++ /dev/null @@ -1,52 +0,0 @@ ------------------------------------------------------------------------- ---[[ Serial ]]-- --- Decorator that modifies the serialization/deserialization --- behaviour of encapsulated module. 
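--- usage (from the README notes): dmodule = nn.Serial(module, [tensortype]);
--- dmodule:mediumSerial() then empties the buffers listed in each module's
--- dpnn_mediumEmpty table when the module is written to disk.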
------------------------------------------------------------------------
-local _ = require 'moses'
-local Serial, parent = torch.class("nn.Serial", "nn.Decorator")
-
-function Serial:__init(module, tensortype)
-   parent.__init(self, module)
-   self.tensortype = tensortype
-   if self.tensortype then
-      assert(tensortype:find('torch.*Tensor'), "Expecting tensortype (e.g. torch.LongTensor) at arg1")
-   end
-end
-
-function Serial:write(file)
-   local state = self:getSerialState()
-
-   local function recursiveSetMetaTable(state)
-      for k,v in pairs(state) do
-         if torch.type(v) == 'table' then
-            recursiveSetMetaTable(v)
-         end
-      end
-
-      if state.dpnn_typename then
-         torch.setmetatable(state, state.dpnn_typename)
-      end
-   end
-
-   -- typecast before serialization (useful for cuda)
-   recursiveSetMetaTable(state)
-
-   if self.tensortype then
-      state:type(self.tensortype)
-   end
-
-   -- removes self's metatable
-   state = _.map(state, function(k,v) return v end)
-
-   file:writeObject(state)
-end
-
-function Serial:read(file)
-   local state = file:readObject()
-   for k,v in pairs(state) do
-      self[k] = v
-   end
-end
-
-
diff --git a/SimpleColorTransform.lua b/SimpleColorTransform.lua
deleted file mode 100644
index 97b83ea..0000000
--- a/SimpleColorTransform.lua
+++ /dev/null
@@ -1,90 +0,0 @@
---[[
-   Simple Color transformation module: This module implements a simple data
-   augmentation technique of changing the pixel values of the input image by
-   adding independently sampled small quantities to each channel. Works only
-   in training mode.
---]]
-
-local SimpleColorTransform, Parent = torch.class('nn.SimpleColorTransform', 'nn.Module')
-
-function SimpleColorTransform:__init(inputChannels, range)
-   Parent.__init(self)
-
-   self.train = true
-   self.inputChannels = inputChannels
-   assert(inputChannels == range:nElement(),
-          "Number of input channels and number of range values don't match.")
-   self.range = range
-end
-
-function SimpleColorTransform:updateOutput(input)
-   self.output:resizeAs(input):copy(input)
-   if self.train then
-      self.noise = self.noise or self.output.new()
-      self._tempNoise = self._tempNoise or self.output.new()
-      self._tempNoiseExpanded = self._tempNoiseExpanded or self.output.new()
-      self._tempNoiseSamples = self._tempNoiseSamples or self.output.new()
-
-      if self.output:nDimension() == 4 then
-         local batchSize = self.output:size(1)
-         local channels = self.output:size(2)
-         local height = self.output:size(3)
-         local width = self.output:size(4)
-         assert(channels == self.inputChannels)
-
-         -- Randomly sample noise for each channel
-         self.noise:resize(batchSize, channels)
-         for i=1, channels do
-            self.noise[{{}, {i}}]:uniform(-self.range[i], self.range[i])
-         end
-         self._tempNoise = self.noise:view(batchSize, self.inputChannels, 1, 1)
-         self._tempNoiseExpanded:expand(self._tempNoise, batchSize,
-                                        channels, height, width)
-         self._tempNoiseSamples:resizeAs(self._tempNoiseExpanded)
-                               :copy(self._tempNoiseExpanded)
-         self.output:add(self._tempNoiseSamples)
-
-      elseif self.output:nDimension() == 3 then
-         local channels = self.output:size(1)
-         local height = self.output:size(2)
-         local width = self.output:size(3)
-         assert(channels == self.inputChannels)
-
-         -- Randomly sample noise for each channel
-         self.noise:resize(channels)
-         for i=1, channels do
-            self.noise[i] = torch.uniform(-self.range[i], self.range[i])
-         end
-         self._tempNoise = self.noise:view(self.inputChannels, 1, 1)
-         self._tempNoiseExpanded:expand(self._tempNoise, channels,
-                                        height, width)
-         self._tempNoiseSamples:resizeAs(self._tempNoiseExpanded)
-                               :copy(self._tempNoiseExpanded)
- 
self.output:add(self._tempNoiseSamples)
-      else
-         error("Invalid input dimensionality.")
-      end
-   end
-   return self.output
-end
-
-function SimpleColorTransform:updateGradInput(input, gradOutput)
-   if self.train then
-      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
-   else
-      error('backprop only defined while training')
-   end
-   return self.gradInput
-end
-
-function SimpleColorTransform:type(type, tensorCache)
-   self.noise = nil
-   self._tempNoise = nil
-   self._tempNoiseExpanded = nil
-   self._tempNoiseSamples = nil
-   Parent.type(self, type, tensorCache)
-end
-
-function SimpleColorTransform:__tostring__()
-   return string.format('%s', torch.type(self))
-end
diff --git a/SpatialBatchNormalization.lua b/SpatialBatchNormalization.lua
deleted file mode 100644
index 1b2fdf8..0000000
--- a/SpatialBatchNormalization.lua
+++ /dev/null
@@ -1,12 +0,0 @@
-local BN, parent = nn.SpatialBatchNormalization, nn.Module
-local _ = require 'moses'
-
-local empty = _.clone(parent.dpnn_mediumEmpty)
-table.insert(empty, 'buffer')
-table.insert(empty, 'buffer2')
-table.insert(empty, 'centered')
-table.insert(empty, 'std')
-table.insert(empty, 'normalized')
-table.insert(empty, 'output')
-table.insert(empty, 'gradInput')
-BN.dpnn_mediumEmpty = empty
diff --git a/SpatialBinaryConvolution.lua b/SpatialBinaryConvolution.lua
deleted file mode 100644
index 6365f8e..0000000
--- a/SpatialBinaryConvolution.lua
+++ /dev/null
@@ -1,173 +0,0 @@
--- Reference: http://arxiv.org/abs/1603.05279
--- We use floating point Matrix-Matrix multiplication as in SpatialConvolution.
--- Filters are made binary {-1, +1} using Sign.
--- Convolution output is scaled by the L1-norm of the filters.
-
--- Inherits from nn/SpatialConvolution.
-
-local SpatialBinaryConvolution, parent = torch.class('nn.SpatialBinaryConvolution', 'nn.SpatialConvolution')
-
-function SpatialBinaryConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
-   parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
-   parent.noBias(self)
-
-   self.iwh = self.nInputPlane * self.kW * self.kH
-   self.owh = self.nOutputPlane * self.kW * self.kH
-   self.train = true
-end
-
-function SpatialBinaryConvolution:training()
-   self.train = true
-end
-
-function SpatialBinaryConvolution:evaluate()
-   self.train = false
-end
-
--- Function to binarize weights and compute L1 norms
-function SpatialBinaryConvolution:binarizeWeight()
-   self.tempWeight = self.tempWeight or self.weight.new()
-
-   -- Grad Input alphas
-   self.gradInputAlphas = self.gradInputAlphas or self.weight.new()
-   self.gradInputAlphas:resize(self.nInputPlane)
-
-   local temp = self.weight:transpose(1,2)
-   self.tempWeight:resizeAs(temp):copy(temp)
-   self.gradInputAlphas:norm(self.tempWeight:view(self.nInputPlane, -1), 1, 2)
-   self.gradInputAlphas:div(self.owh) -- 1/owh
-
-   -- alphas
-   self.tempWeight:resizeAs(self.weight):copy(self.weight)
-   self.alphas = self.alphas or self.weight.new()
-   self.alphas:resize(self.nOutputPlane)
-   self.alphas:norm(self.weight:view(self.nOutputPlane, -1), 1, 2)
-   self.alphas:div(self.iwh) -- 1/iwh
-
-   -- Binarize weights
-   if not self.wmask then
-      if torch.type(self.weight) == 'torch.CudaTensor' then
-         self.wmask = torch.CudaTensor()
-      else
-         self.wmask = torch.ByteTensor()
-      end
-   end
-
-   -- Binarizing weights
-   self.weight.ge(self.wmask, self.weight, 0)
-   self.weight[self.wmask] = 1
-   self.weight.lt(self.wmask, self.weight, 0)
-   self.weight[self.wmask] = -1
-end
-
-function SpatialBinaryConvolution:updateOutput(input)
-   -- Binarize Weights
- 
self.binarizeWeight(self) - - -- Convolution - self.output = parent.updateOutput(self, input) - - -- Scale output by alphas - self._tempAlphas = self._tempAlphas or self.output.new() - self._tempAlphasExpanded = self._tempAlphasExpanded or self.output.new() - self._tempAlphasSamples = self._tempAlphasSamples or self.output.new() - if self.output:nDimension() == 4 then - local batchSize = self.output:size(1) - local height = self.output:size(3) - local width = self.output:size(4) - - self._tempAlphas = self.alphas:view(1, self.nOutputPlane, 1, 1) - self._tempAlphasExpanded:expand(self._tempAlphas, batchSize, - self.nOutputPlane, height, width) - self._tempAlphasSamples:resizeAs(self._tempAlphasExpanded) - :copy(self._tempAlphasExpanded) - self.output:cmul(self._tempAlphasSamples) - else - local height = self.output:size(2) - local width = self.output:size(3) - - self._tempAlphas = self.alphas:view(self.nOutputPlane, 1, 1) - self._tempAlphasExpanded:expand(self._tempAlphas, self.nOutputPlane, - height, width) - self._tempAlphasSamples:resizeAs(self._tempAlphasExpanded) - :copy(self._tempAlphasExpanded) - self.output:cmul(self._tempAlphasSamples) - end - - -- In evaluate mode. - if not self.train then self.weight:copy(self.tempWeight) end - - return self.output -end - -function SpatialBinaryConvolution:updateGradInput(input, gradOutput) - self.gradInput = parent.updateGradInput(self, input, gradOutput) - - -- Scale gradInput by gradAlphas - self._tempGradAlphas = self._temp or self.gradInput.new() - self._tempGradAlphasExpanded = self._temp or self.gradInput.new() - self._tempGradAlphasSamples = self._temp or self.gradInput.new() - if self.gradInput:nDimension() == 4 then - local batchSize = self.gradInput:size(1) - local height = self.gradInput:size(3) - local width = self.gradInput:size(4) - - self._tempGradAlphas = self.gradInputAlphas:view(1, self.nInputPlane, - 1, 1) - self._tempGradAlphasExpanded:expand(self._tempGradAlphas, - batchSize, self.nInputPlane, - height, width) - self._tempGradAlphasSamples:resizeAs(self._tempGradAlphasExpanded) - :copy(self._tempGradAlphasExpanded) - - self.gradInput:cmul(self._tempGradAlphasSamples) - else - local height = self.gradInput:size(2) - local width = self.gradInput:size(3) - - self._tempGradAlphas = self.gradInputAlphas:view(self.nInputPlane, - 1, 1) - self._tempGradAlphasExpanded:expand(self._tempGradAlphas, - self.nInputPlane, - height, width) - self._tempGradAlphasSamples:resizeAs(self._tempGradAlphasExpanded) - :copy(self._tempGradAlphasExpanded) - - self.gradInput:cmul(self._tempGradAlphasSamples) - end - return self.gradInput -end - -function SpatialBinaryConvolution:accGradParameters(input, gradOutput, scale) - - parent.accGradParameters(self, input, gradOutput, scale) - - --[[ - Copy back floating point weights for weight update. - This could be done individually after forward and backward, but to avoid - additional copy is done at the end of backward. 
- --]] - - self.weight:copy(self.tempWeight) -end - -function SpatialBinaryConvolution:type(type, tensorCache) - self.tempWeight = nil - self.alphas = nil - self.gradInputAlphas = nil - self.wmask = nil - - self._tempAlphas = nil - self._tempAlphasExpanded = nil - self._tempAlphasSamples = nil - - self._tempGradAlphas = nil - self._tempGradAlphasExpanded = nil - self._tempGradAlphasSamples = nil - - parent.type(self, type, tensorCache) -end - -function SpatialBinaryConvolution:__tostring__() - return "Binary Convolution: "..parent.__tostring__(self) -end diff --git a/SpatialBinaryLogisticRegression.lua b/SpatialBinaryLogisticRegression.lua deleted file mode 100644 index 85fba99..0000000 --- a/SpatialBinaryLogisticRegression.lua +++ /dev/null @@ -1,80 +0,0 @@ ------------------------------------------------------------------------- ---[[ SpatialBinaryLogisticRegression ]]-- --- Takes an image of size batchSize x nChannel x width x height as input. --- Computes Binary Logistic Regression Cost. --- Useful for 2 class pixel classification. ------------------------------------------------------------------------- - -local SpatialBinaryLogisticRegression, parent = torch.class('nn.SpatialBinaryLogisticRegression', 'nn.Criterion') - -function SpatialBinaryLogisticRegression:__init() - parent.__init(self) - self.sizeAverage = true -end - -function SpatialBinaryLogisticRegression:updateOutput(input, target) - local inputDim = input:nDimension() - local targetDim = target:nDimension() - - -- Check dimensions of input and target - assert(inputDim == targetDim, "nDimension of input and target don't match.") - assert(inputDim == 4 or inputDim == 3, "Expecting image or batch on images") - - for i=1,inputDim do - assert(input:size(i) == target:size(i), - "Input and target dimensions don't match.") - end - - -- Check batch or single image - if inputDim == 4 then - self._isBatch = true - assert(input:size(2) == 1, "No. of channels should be 1.") - self._k = input:size(1) - self._h = input:size(3) - self._w = input:size(4) - else - self._isBatch = false - assert(input:size(1) == 1, "No. 
of channels should be 1.") - self._k = 1 - self._h = input:size(2) - self._w = input:size(3) - end - - self._baseExponents = self._baseExponents or input.new() - self._coeff = self._coeff or input.new() - self._logCoeff = self._logCoeff or input.new() - - --Compute exponent = -target*input - self._baseExponents:resize(input:size()):copy(input) - self._baseExponents:cmul(target) - self._baseExponents:mul(-1) - -- Compute exp(exponent) - self._baseExponents:exp() - - self._coeff:resize(input:size()):copy(self._baseExponents) - self._coeff:add(1) - - self._logCoeff:resize(input:size()):copy(self._coeff) - self._logCoeff:log() - - if self.sizeAverage then - return self._logCoeff:sum()/(2 * self._k * self._h * self._w) - else - return self._logCoeff:sum()/(2 * self._h * self._w) - end -end - -function SpatialBinaryLogisticRegression:updateGradInput(input, target) - self.gradInput = self.gradInput or input.new() - local gradInput = self.gradInput - gradInput:resize(target:size()):copy(target) - gradInput:mul(-1) - gradInput:cmul(self._baseExponents) - gradInput:cdiv(self._coeff) - if self.sizeAverage then - gradInput:div(2 * self._k * self._h * self._w) - else - gradInput:div(2 * self._h * self._w) - end - return gradInput -end diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua deleted file mode 100644 index a3144eb..0000000 --- a/SpatialConvolution.lua +++ /dev/null @@ -1,9 +0,0 @@ -local SpatialConvolution, parent = nn.SpatialConvolution, nn.Module -local _ = require 'moses' - -local empty = _.clone(parent.dpnn_mediumEmpty) -table.insert(empty, 'finput') -table.insert(empty, 'fgradinput') -table.insert(empty, '_input') -table.insert(empty, '_gradOutput') -SpatialConvolution.dpnn_mediumEmpty = empty diff --git a/SpatialConvolutionMM.lua b/SpatialConvolutionMM.lua deleted file mode 100644 index 4b50658..0000000 --- a/SpatialConvolutionMM.lua +++ /dev/null @@ -1,3 +0,0 @@ -local SpatialConvolutionMM, parent = nn.SpatialConvolutionMM, nn.Module - -SpatialConvolutionMM.dpnn_mediumEmpty = nn.SpatialConvolution.dpnn_mediumEmpty diff --git a/SpatialFeatNormalization.lua b/SpatialFeatNormalization.lua deleted file mode 100644 index 1aca767..0000000 --- a/SpatialFeatNormalization.lua +++ /dev/null @@ -1,73 +0,0 @@ ---[[ - Color normalization (mean zeroing and dividing by standard deviation). - Basic preprocessing step widely used in training classifier with images. ---]] - -local SpatialFeatNormalization, Parent = torch.class('nn.SpatialFeatNormalization', 'nn.Module') - -function SpatialFeatNormalization:__init(mean, std) - Parent.__init(self) - if mean:dim() ~= 1 then - error(' Mean/Std should be 1D.') - end - self.mean = torch.Tensor() - self.mean:resizeAs(mean):copy(mean) - self.std = torch.Tensor() - self.std:resizeAs(mean) - if std ~= nil then self.std:copy(std) else self.std:fill(1) end - self.noOfFeats = mean:size(1) -end - -function SpatialFeatNormalization:updateOutput(input) - self.output:resizeAs(input):copy(input) - if input:dim() == 4 then - -- Batch of image/s - if input:size(2) ~= self.noOfFeats then - error(' No. of Feats dont match.') - else - for i=1, self.noOfFeats do - self.output[{{}, i, {}, {}}]:add(-self.mean[i]) - self.output[{{}, i, {}, {}}]:div(self.std[i]) - end - end - elseif input:dim() == 3 then - -- single image - if input:size(1) ~= self.noOfFeats then - error(' No. 
of Feats dont match.') - else - for i=1, self.noOfFeats do - self.output[{i, {}, {}}]:add(-self.mean[i]) - self.output[{i, {}, {}}]:div(self.std[i]) - end - end - else - error(' invalid input dims.') - end - return self.output -end - -function SpatialFeatNormalization:updateGradInput(input, gradOutput) - self.gradInput:resizeAs(gradOutput):copy(gradOutput) - if self.gradInput:dim() == 4 then - -- Batch of image/s - if self.gradInput:size(2) ~= self.noOfFeats then - error(' No. of Feats dont match.') - else - for i=1, self.noOfFeats do - self.gradInput[{{}, i, {}, {}}]:div(self.std[i]) - end - end - elseif self.gradInput:dim() == 3 then - -- single image - if self.gradInput:size(1) ~= self.noOfFeats then - error(' No. of Feats dont match.') - else - for i=1, self.noOfFeats do - self.gradInput[{i, {}, {}}]:div(self.std[i]) - end - end - else - error(' invalid self.gradInput dims.') - end - return self.gradInput -end diff --git a/SpatialMaxPooling.lua b/SpatialMaxPooling.lua deleted file mode 100644 index 1d6669c..0000000 --- a/SpatialMaxPooling.lua +++ /dev/null @@ -1,6 +0,0 @@ -local SpatialMaxPooling, parent = nn.SpatialMaxPooling, nn.Module -local _ = require 'moses' - -local empty = _.clone(parent.dpnn_mediumEmpty) -table.insert(empty, 'indices') -SpatialMaxPooling.dpnn_mediumEmpty = empty diff --git a/SpatialRegionDropout.lua b/SpatialRegionDropout.lua deleted file mode 100644 index 78c4a39..0000000 --- a/SpatialRegionDropout.lua +++ /dev/null @@ -1,80 +0,0 @@ ---[[ - Dropout edges rows or columns to simulate imperfect bounding boxes. ---]] - -local SpatialRegionDropout, Parent = torch.class('nn.SpatialRegionDropout', 'nn.Module') - -function SpatialRegionDropout:__init(p) - Parent.__init(self) - self.p = p or 0.2 -- ratio of total number of rows or cols - self.train = true - self.noise = torch.Tensor() - if self.p >= 1 or self.p < 0 then - error(' illegal percentage, must be 0 <= p < 1') - end -end - -function SpatialRegionDropout:setp(p) - self.p = p -end - --- Region Types --- 1: Dropout p ratio of top rows --- 2: Dropout p ratio of bottom rows --- 3: Dropout p ratio of leftmost cols --- 4: Dropout p ratio of rightmost cols -function SpatialRegionDropout:updateOutput(input) - self.output:resizeAs(input):copy(input) - if self.train then - self.noise:resizeAs(input):fill(1) - self.regionType = torch.random(4) - if input:dim() == 4 then - local height = input:size(3) - local width = input:size(4) - if self.regionType == 1 then - self.noise[{{}, {}, {1, math.floor(height*self.p)}}]:fill(0) - elseif self.regionType == 2 then - self.noise[{{}, {}, - {height-math.floor(height*self.p)+1, height}}]:fill(0) - elseif self.regionType == 3 then - self.noise[{{}, {}, {}, {1, math.floor(width*self.p)}}]:fill(0) - elseif self.regionType == 4 then - self.noise[{{}, {}, {}, - {width-math.floor(width*self.p)+1, width}}]:fill(0) - end - elseif input:dim() == 3 then - local height = input:size(2) - local width = input:size(3) - if self.regionType == 1 then - self.noise[{{}, {1, math.floor(height*self.p)}}]:fill(0) - elseif self.regionType == 2 then - self.noise[{{}, - {height-math.floor(height*self.p)+1, height}}]:fill(0) - elseif self.regionType == 3 then - self.noise[{{}, {}, {1, math.floor(width*self.p)}}]:fill(0) - elseif self.regionType == 4 then - self.noise[{{}, {}, - {width-math.floor(width*self.p)+1, width}}]:fill(0) - end - else - error('Input must be 4D (nbatch, nfeat, h, w) or 3D (nfeat, h, w)') - end - self.noise:div(1-self.p) - self.output:cmul(self.noise) - end - return self.output -end - 
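--- note: updateOutput divides the kept mask by (1-p) (inverted dropout), so the
--- expected magnitude of the output matches evaluation mode; updateGradInput
--- below reuses the same rescaled mask on the gradients.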
-function SpatialRegionDropout:updateGradInput(input, gradOutput)
-   if self.train then
-      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
-      self.gradInput:cmul(self.noise)
-   else
-      error('Backpropagation is only defined for training.')
-   end
-   return self.gradInput
-end
-
-function SpatialRegionDropout:__tostring__()
-   return string.format('%s p: %f', torch.type(self), self.p)
-end
diff --git a/SpatialUniformCrop.lua b/SpatialUniformCrop.lua
deleted file mode 100644
index ba81119..0000000
--- a/SpatialUniformCrop.lua
+++ /dev/null
@@ -1,121 +0,0 @@
-local SpatialUniformCrop, parent = torch.class("nn.SpatialUniformCrop", "nn.Module")
-
-function SpatialUniformCrop:__init(oheight, owidth, scale)
-   nn.require('nnx')
-   parent.__init(self)
-   self.scale = scale or nil
-   if self.scale ~= nil then
-      assert(torch.type(scale)=='table')
-      self.scaler = nn.SpatialReSampling{owidth=owidth, oheight=oheight}
-   end
-   self.oheight = oheight
-   self.owidth = owidth or oheight
-end
-
-function SpatialUniformCrop:updateOutput(input)
-   nn.require('nnx')
-   input = self:toBatch(input, 3)
-
-   self.output:resize(input:size(1), input:size(2), self.oheight, self.owidth)
-   self.coord = self.coord or torch.IntTensor()
-   self.coord:resize(input:size(1), 2)
-
-   if self.scale ~= nil then
-      self.scales = self.scales or torch.FloatTensor()
-      self.scales:resize(input:size(1))
-   end
-
-   local iH, iW = input:size(3), input:size(4)
-   if self.train ~= false then
-      if self.scale ~= nil then
-         for i=1,input:size(1) do
-            -- do random crop
-            local s = torch.uniform(self.scale['min'] or self.scale[1], self.scale['max'] or self.scale[2])
-            local soheight = math.ceil(s*self.oheight)
-            local sowidth = math.ceil(s*self.owidth)
-
-            local h = math.ceil(torch.uniform(1e-2, iH-soheight))
-            local w = math.ceil(torch.uniform(1e-2, iW-sowidth))
-
-            local ch = math.ceil(iH/2 - (iH-soheight)/2 + h)
-            local cw = math.ceil(iW/2 - (iW-sowidth)/2 + w)
-
-            local h1 = ch - math.ceil(soheight/2)
-            local w1 = cw - math.ceil(sowidth/2)
-            if h1 < 1 then h1 = 1 end
-            if w1 < 1 then w1 = 1 end
-
-            local crop = input[i]:narrow(2, h1, soheight):narrow(3, w1, sowidth)
-
-            self.output[i]:copy(self.scaler:forward(crop))
-            -- save crop coordinates and scale for backward
-            self.scales[i] = s
-            self.coord[{i,1}] = h
-            self.coord[{i,2}] = w
-         end
-      else
-         for i=1,input:size(1) do
-            -- do random crop
-            local h1 = math.ceil(torch.uniform(1e-2, iH-self.oheight))
-            local w1 = math.ceil(torch.uniform(1e-2, iW-self.owidth))
-            local crop = input[i]:narrow(2,h1,self.oheight):narrow(3,w1,self.owidth)
-            self.output[i]:copy(crop)
-            -- save crop coordinates for backward
-            self.coord[{i,1}] = h1
-            self.coord[{i,2}] = w1
-         end
-      end
-   else
-      -- use center crop
-      local h1 = math.ceil((iH-self.oheight)/2)
-      local w1 = math.ceil((iW-self.owidth)/2)
-      local crop = input:narrow(3,h1,self.oheight):narrow(4,w1,self.owidth)
-      self.output:copy(crop)
-   end
-
-   self.output = self:fromBatch(self.output, 1)
-   return self.output
-end
-
-function SpatialUniformCrop:updateGradInput(input, gradOutput)
-   input = self:toBatch(input, 3)
-   gradOutput = self:toBatch(gradOutput, 3)
-
-   self.gradInput:resizeAs(input):zero()
-   if self.scale ~= nil then
-      local iH, iW = input:size(3), input:size(4)
-      for i=1,input:size(1) do
-         local s = self.scales[i]
-         local soheight = math.ceil(s*self.oheight)
-         local sowidth = math.ceil(s*self.owidth)
-
-         local h, w = self.coord[{i,1}], self.coord[{i,2}]
-
-         local ch = math.ceil(iH/2 - (iH-soheight)/2 + h)
-         local cw = math.ceil(iW/2 - (iW-sowidth)/2 + w)
-
-         local h1 = ch - math.ceil(soheight/2)
-         local w1 = cw - math.ceil(sowidth/2)
-         if h1 < 1 then h1 = 1 end
-         if w1 < 1 then w1 = 1 end
-
-         local crop = input[i]:narrow(2, h1, soheight):narrow(3, w1, sowidth)
-         local samplerGradInput = self.scaler:updateGradInput(crop, gradOutput[i])
-
-         self.gradInput[i]:narrow(2, h1, soheight):narrow(3, w1, sowidth):copy(samplerGradInput)
-      end
-   else
-      for i=1,input:size(1) do
-         local h1, w1 = self.coord[{i,1}], self.coord[{i,2}]
-         self.gradInput[i]:narrow(2,h1,self.oheight):narrow(3,w1,self.owidth):copy(gradOutput[i])
-      end
-   end
-
-   self.gradInput = self:fromBatch(self.gradInput, 1)
-   return self.gradInput
-end
-
-function SpatialUniformCrop:type(type, cache)
-   self.coord = nil
-   return parent.type(self, type, cache)
-end
diff --git a/WhiteNoise.lua b/WhiteNoise.lua
deleted file mode 100644
index 518e749..0000000
--- a/WhiteNoise.lua
+++ /dev/null
@@ -1,38 +0,0 @@
-local WhiteNoise, Parent = torch.class('nn.WhiteNoise', 'nn.Module')
-
-function WhiteNoise:__init(mean, std)
-   Parent.__init(self)
-   -- the default std corresponds to 50% of the MNIST training data std.
-   self.mean = mean or 0
-   self.std = std or 0.1
-   self.noise = torch.Tensor()
-end
-
-function WhiteNoise:updateOutput(input)
-   self.output:resizeAs(input):copy(input)
-   if self.train ~= false then
-      self.noise:resizeAs(input)
-      self.noise:normal(self.mean, self.std)
-      self.output:add(self.noise)
-   else
-      if self.mean ~= 0 then
-         self.output:add(self.mean)
-      end
-   end
-   return self.output
-end
-
-function WhiteNoise:updateGradInput(input, gradOutput)
-   if self.train ~= false then
-      -- Simply return the gradients.
-      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
-   else
-      error('backprop only defined while training')
-   end
-   return self.gradInput
-end
-
-function WhiteNoise:__tostring__()
-   return string.format('%s mean: %f, std: %f',
-      torch.type(self), self.mean, self.std)
-end
diff --git a/ZeroGrad.lua b/ZeroGrad.lua
deleted file mode 100644
index 24286a4..0000000
--- a/ZeroGrad.lua
+++ /dev/null
@@ -1,34 +0,0 @@
-local ZeroGrad, parent
-if nn.ZeroGrad then -- prevent name conflicts with nnx
-   ZeroGrad, parent = nn.ZeroGrad, nn.Module
-else
-   ZeroGrad, parent = torch.class('nn.ZeroGrad', 'nn.Module')
-end
-
-local function recursiveZero(t1,t2)
-   if torch.type(t2) == 'table' then
-      t1 = (torch.type(t1) == 'table') and t1 or {t1}
-      for key,_ in pairs(t2) do
-         t1[key], t2[key] = recursiveZero(t1[key], t2[key])
-      end
-   elseif torch.isTensor(t2) then
-      t1 = torch.isTensor(t1) and t1 or t2.new()
-      t1:resizeAs(t2):zero()
-   else
-      error("expecting nested tensors or tables. Got "..
-         torch.type(t1).." and "..torch.type(t2).." instead")
-   end
-   return t1, t2
-end
-
-function ZeroGrad:updateOutput(input)
-   self.output:set(input)
-   return self.output
-end
-
--- the gradient is simply zeroed.
--- useful when you don't want to backpropagate through certain paths.
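-- For instance, a minimal usage sketch (hypothetical module names and sizes,
-- for illustration only): placing nn.ZeroGrad() after a pretrained encoder
-- lets the layers above it train while the encoder receives zero gradient,
-- so its parameters are left untouched by gradient descent:
--    model = nn.Sequential()
--       :add(pretrainedEncoder) -- assumed given
--       :add(nn.ZeroGrad())
--       :add(nn.Linear(512, 10))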
-function ZeroGrad:updateGradInput(input, gradOutput) - self.gradInput = recursiveZero(self.gradInput, gradOutput) - return self.gradInput -end diff --git a/ZipTable.lua b/ZipTable.lua deleted file mode 100644 index 55b261d..0000000 --- a/ZipTable.lua +++ /dev/null @@ -1,34 +0,0 @@ -local ZipTable, parent = torch.class('nn.ZipTable', 'nn.Container') - --- input : { {a1,a2}, {b1,b2}, {c1,c2} } --- output : { {a1,b1,c1}, {a2,b2,c2} } -function ZipTable:__init() - parent.__init(self) - self.output = {} - self.gradInput = {} -end - -function ZipTable:updateOutput(inputTable) - self.output = {} - for i,inTable in ipairs(inputTable) do - for j,input in ipairs(inTable) do - local output = self.output[j] or {} - output[i] = input - self.output[j] = output - end - end - return self.output -end - -function ZipTable:updateGradInput(inputTable, gradOutputTable) - self.gradInput = {} - for i,gradOutTable in ipairs(gradOutputTable) do - for j,gradOutput in ipairs(gradOutTable) do - local gradInput = self.gradInput[j] or {} - gradInput[i] = gradOutput - self.gradInput[j] = gradInput - end - end - return self.gradInput -end - diff --git a/ZipTableOneToMany.lua b/ZipTableOneToMany.lua deleted file mode 100644 index fe8b385..0000000 --- a/ZipTableOneToMany.lua +++ /dev/null @@ -1,37 +0,0 @@ -local ZipTableOneToMany, parent = torch.class('nn.ZipTableOneToMany', 'nn.Container') - --- based on ZipTable in dpnn - --- input : { v, {a, b, c} } --- output : { {v,a}, {v,b}, {v,c} } -function ZipTableOneToMany:__init() - parent.__init(self) - self.output = {} - self.gradInput = {} - -- make buffer to update during forward/backward - self.gradInputEl = torch.Tensor() -end - -function ZipTableOneToMany:updateOutput(input) - assert(#input == 2, "input must be table of element and table") - local inputEl, inputTable = input[1], input[2] - self.output = {} - for i,v in ipairs(inputTable) do - self.output[i] = {inputEl, v} - end - return self.output -end - -function ZipTableOneToMany:updateGradInput(input, gradOutput) - assert(#input == 2, "input must be table of element and table") - local inputEl, inputTable = input[1], input[2] - self.gradInputEl:resizeAs(inputEl):zero() - local gradInputTable = {} - for i,gradV in ipairs(gradOutput) do - self.gradInputEl:add(gradV[1]) - gradInputTable[i] = gradV[2] - end - self.gradInput = {self.gradInputEl, gradInputTable} - return self.gradInput -end - diff --git a/deprecated/FastLSTM.lua b/deprecated/FastLSTM.lua index 1995a34..18e1be9 100644 --- a/deprecated/FastLSTM.lua +++ b/deprecated/FastLSTM.lua @@ -60,14 +60,14 @@ function FastLSTM:buildModel() :add(nn.Dropout(self.p,false,false,true,self.mono)) :add(nn.Dropout(self.p,false,false,true,self.mono))) :add(nn.ParallelTable() - :add(nn.LinearNoBias(self.outputSize, self.outputSize)) - :add(nn.LinearNoBias(self.outputSize, self.outputSize)) - :add(nn.LinearNoBias(self.outputSize, self.outputSize)) - :add(nn.LinearNoBias(self.outputSize, self.outputSize))) + :add(nn.Linear(self.outputSize, self.outputSize):noBias()) + :add(nn.Linear(self.outputSize, self.outputSize):noBias()) + :add(nn.Linear(self.outputSize, self.outputSize):noBias()) + :add(nn.Linear(self.outputSize, self.outputSize):noBias())) :add(nn.JoinTable(2)) else self.i2g = nn.Linear(self.inputSize, 4*self.outputSize) - self.o2g = nn.LinearNoBias(self.outputSize, 4*self.outputSize) + self.o2g = nn.Linear(self.outputSize, 4*self.outputSize):noBias() end if self.usenngraph or self.bn then diff --git a/deprecated/GRU.lua b/deprecated/GRU.lua index 460b24d..318d2f5 100644 --- 
a/deprecated/GRU.lua
+++ b/deprecated/GRU.lua
@@ -54,12 +54,12 @@ function GRU:buildModel()
          :add(nn.Dropout(self.p,false,false,true,self.mono))
          :add(nn.Dropout(self.p,false,false,true,self.mono)))
       :add(nn.ParallelTable()
-         :add(nn.LinearNoBias(self.outputSize, self.outputSize))
-         :add(nn.LinearNoBias(self.outputSize, self.outputSize)))
+         :add(nn.Linear(self.outputSize, self.outputSize):noBias())
+         :add(nn.Linear(self.outputSize, self.outputSize):noBias()))
       :add(nn.JoinTable(2))
    else
       self.i2g = nn.Linear(self.inputSize, 2*self.outputSize)
-      self.o2g = nn.LinearNoBias(self.outputSize, 2*self.outputSize)
+      self.o2g = nn.Linear(self.outputSize, 2*self.outputSize):noBias()
    end
 
    local para = nn.ParallelTable():add(self.i2g):add(self.o2g)
@@ -97,7 +97,7 @@ function GRU:buildModel()
       t2:add(nn.Dropout(self.p,false,false,true,self.mono))
    end
    t1:add(nn.Linear(self.inputSize, self.outputSize))
-   t2:add(nn.LinearNoBias(self.outputSize, self.outputSize))
+   t2:add(nn.Linear(self.outputSize, self.outputSize):noBias())
    concat:add(t1):add(t2)
 
    hidden:add(concat):add(nn.CAddTable()):add(nn.Tanh())
@@ -132,7 +132,6 @@ function GRU:_updateOutput(input)
    -- output(t) = gru{input(t), output(t-1)}
    local output
    if self.train ~= false then
-      self:recycle()
       local stepmodule = self:getStepModule(self.step)
       -- the actual forward propagation
       output = stepmodule:updateOutput{input, prevOutput}
diff --git a/deprecated/LSTM.lua b/deprecated/LSTM.lua
index 5c4560c..13784b4 100644
--- a/deprecated/LSTM.lua
+++ b/deprecated/LSTM.lua
@@ -49,7 +49,7 @@ function LSTM:buildGate()
       :add(nn.Linear(self.inputSize, self.outputSize))
    local output2gate = nn.Sequential()
      :add(nn.Dropout(self.p,false,false,true,self.mono))
-     :add(nn.LinearNoBias(self.outputSize, self.outputSize))
+     :add(nn.Linear(self.outputSize, self.outputSize):noBias())
    local para = nn.ParallelTable()
    para:add(input2gate):add(output2gate)
    if self.cell2gate then
@@ -80,7 +80,7 @@ function LSTM:buildHidden()
      :add(nn.Linear(self.inputSize, self.outputSize))
    local output2hidden = nn.Sequential()
      :add(nn.Dropout(self.p,false,false,true,self.mono))
-     :add(nn.LinearNoBias(self.outputSize, self.outputSize))
+     :add(nn.Linear(self.outputSize, self.outputSize):noBias())
    local para = nn.ParallelTable()
    para:add(input2hidden):add(output2hidden)
    hidden:add(para)
@@ -190,7 +190,6 @@ function LSTM:updateOutput(input)
    -- output(t), cell(t) = lstm{input(t), output(t-1), cell(t-1)}
    local output, cell
    if self.train ~= false then
-      self:recycle()
      local stepmodule = self:getStepModule(self.step)
      -- the actual forward propagation
      output, cell = unpack(stepmodule:updateOutput{input, prevOutput, prevCell})
diff --git a/examples/README.md b/examples/README.md
index e68be30..294db77 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -30,4 +30,3 @@ These scripts showcase the fundamental principles of the package.
 * [rnn-benchmarks](https://github.com/glample/rnn-benchmarks) : benchmarks comparing Torch (using this library), Theano and TensorFlow.
 * [dataload](https://github.com/Element-Research/dataload) : a collection of torch dataset loaders;
 * A brief (1 hour) overview of Torch7, which includes some details about the __rnn__ package (at the end), is available via this [NVIDIA GTC Webinar video](http://on-demand.gputechconf.com/gtc/2015/webinar/torch7-applied-deep-learning-for-vision-natural-language.mp4).
In any case, this presentation gives a nice overview of Logistic Regression, Multi-Layer Perceptrons, Convolutional Neural Networks and Recurrent Neural Networks using Torch7; - * [Sagar Waghmare](https://github.com/sagarwaghmare69) wrote a nice [tutorial](tutorials/ladder.md) on how to use rnn with nngraph to reproduce the [Lateral Connections in Denoising Autoencoders Support Supervised Learning](http://arxiv.org/pdf/1504.08215.pdf). diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua index dad3371..055b1f5 100644 --- a/examples/multigpu-nce-rnnlm.lua +++ b/examples/multigpu-nce-rnnlm.lua @@ -189,8 +189,7 @@ if not xplog then xplog.dataset = 'GoogleBillionWords' xplog.vocab = trainset.vocab -- will only serialize params - xplog.model = nn.Serial(lm) - xplog.model:mediumSerial() + xplog.model = lm:sharedClone() xplog.criterion = criterion xplog.targetmodule = targetmodule -- keep a log of NLL for each epoch diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index b425214..bfbb012 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -129,7 +129,8 @@ if not lm then :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} -- encapsulate stepmodule into a Sequencer - lm:add(nn.Sequencer(nn.MaskZero(ncemodule))) + local nce = nn.Sequencer(nn.MaskZero(ncemodule)) + lm:add(nce) -- remember previous state between batches lm:remember() @@ -184,8 +185,7 @@ if not xplog then xplog.dataset = 'GoogleBillionWords' xplog.vocab = trainset.vocab -- will only serialize params - xplog.model = nn.Serial(lm) - xplog.model:mediumSerial() + xplog.model = lm:sharedClone() xplog.criterion = criterion xplog.targetmodule = targetmodule -- keep a log of NLL for each epoch @@ -217,7 +217,7 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do inputs = {inputs, targets} -- zero-mask zeroMask = nn.utils.getZeroMaskSequence(inputs[1], zeroMask) - nn.utils.setZeroMask({lm, criterion}, zeroMask, opt.cuda) + nn.utils.setZeroMask({criterion, lm}, zeroMask, opt.cuda) -- forward local outputs = lm:forward(inputs) local err = criterion:forward(outputs, targets) @@ -278,7 +278,7 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do targets = targetmodule:forward(targets) -- zero-mask zeroMask = nn.utils.getZeroMaskSequence(inputs, zeroMask) - nn.utils.setZeroMask({lm, criterion}, zeroMask, opt.cuda) + nn.utils.setZeroMask({criterion, lm}, zeroMask, opt.cuda) -- forward local outputs = lm:forward{inputs, targets} local err = criterion:forward(outputs, targets) diff --git a/examples/recurrent-language-model.lua b/examples/recurrent-language-model.lua index 95e2745..a8fefe0 100644 --- a/examples/recurrent-language-model.lua +++ b/examples/recurrent-language-model.lua @@ -82,7 +82,6 @@ lm:add(lookup) -- input is seqlen x batchsize if opt.dropout > 0 then lm:add(nn.Dropout(opt.dropout)) end -lm:add(nn.SplitTable(1)) -- tensor to table of tensors -- rnn layers local stepmodule = nn.Sequential() -- applied at each time-step @@ -140,17 +139,10 @@ end --[[ loss function ]]-- -local crit = nn.ClassNLLCriterion() - -- target is also seqlen x batchsize. 
-local targetmodule = nn.SplitTable(1) -if opt.cuda then - targetmodule = nn.Sequential() - :add(nn.Convert()) - :add(targetmodule) -end - -local criterion = nn.SequencerCriterion(crit) +local targetmodule = opt.cuda and nn.Convert() or nn.Identity() +-- NLL is applied to each time-step +local criterion = nn.SequencerCriterion(nn.ClassNLLCriterion()) --[[ CUDA ]]-- @@ -160,6 +152,9 @@ if opt.cuda then targetmodule:cuda() end +-- make sure to call getParameters before sharedClone +local params, grad_params = lm:getParameters() + --[[ experiment log ]]-- -- is saved to file every time a new validation minima is found @@ -168,8 +163,7 @@ xplog.opt = opt -- save all hyper-parameters and such xplog.dataset = 'PennTreeBank' xplog.vocab = trainset.vocab -- will only serialize params -xplog.model = nn.Serial(lm) -xplog.model:mediumSerial() +xplog.model = lm:sharedClone() xplog.criterion = criterion xplog.targetmodule = targetmodule -- keep a log of NLL for each epoch @@ -179,8 +173,6 @@ xplog.valppl = {} xplog.minvalppl = 99999999 xplog.epoch = 0 -local params, grad_params = lm:getParameters() - local adamconfig = { beta1 = opt.adamconfig[1], beta2 = opt.adamconfig[2], diff --git a/examples/recurrent-visual-attention.lua b/examples/recurrent-visual-attention.lua index b9bff24..15f5bc5 100644 --- a/examples/recurrent-visual-attention.lua +++ b/examples/recurrent-visual-attention.lua @@ -181,8 +181,7 @@ end local xplog = {} xplog.opt = opt -- save all hyper-parameters and such -- will only serialize params -xplog.model = nn.Serial(agent) -xplog.model:mediumSerial() +xplog.model = agent:sharedClone() xplog.criterion = criterion xplog.targetmodule = targetmodule -- keep a log of NLL for each epoch diff --git a/init.lua b/init.lua index e14443a..dd222d1 100644 --- a/init.lua +++ b/init.lua @@ -33,56 +33,24 @@ require('rnn.Sequential') require('rnn.ParallelTable') require('rnn.LookupTable') require('rnn.Dropout') - --- extensions to existing criterions -require('rnn.Criterion') - --- decorator modules -require('rnn.Serial') - --- extensions to make serialization more efficient -require('rnn.SpatialMaxPooling') -require('rnn.SpatialConvolution') -require('rnn.SpatialConvolutionMM') -require('rnn.SpatialBatchNormalization') require('rnn.BatchNormalization') +-- extensions to existing nn.Criterion +require('rnn.Criterion') -- modules require('rnn.LookupTableMaskZero') require('rnn.MaskZero') -require('rnn.PrintSize') -require('rnn.Convert') -require('rnn.Constant') -require('rnn.Collapse') -require('rnn.ZipTable') -require('rnn.ZipTableOneToMany') -require('rnn.CAddTensorTable') require('rnn.ReverseSequence') -require('rnn.Dictionary') -require('rnn.Inception') -require('rnn.Clip') -require('rnn.SpatialUniformCrop') require('rnn.SpatialGlimpse') -require('rnn.WhiteNoise') require('rnn.ArgMax') require('rnn.CategoricalEntropy') require('rnn.TotalDropout') -require('rnn.Kmeans') -require('rnn.OneHot') -require('rnn.SpatialRegionDropout') -require('rnn.FireModule') -require('rnn.SpatialFeatNormalization') -require('rnn.ZeroGrad') -require('rnn.LinearNoBias') require('rnn.SAdd') require('rnn.CopyGrad') require('rnn.VariableLength') require('rnn.StepLSTM') require('rnn.StepGRU') -require('rnn.SpatialBinaryConvolution') -require('rnn.SimpleColorTransform') -require('rnn.PCAColorTransform') require('rnn.ReverseUnreverse') -- Noise Contrastive Estimation @@ -100,11 +68,6 @@ require('rnn.ReinforceCategorical') require('rnn.VRClassReward') require('rnn.BinaryClassReward') --- criterions 
-require('rnn.ModuleCriterion') -require('rnn.BinaryLogisticRegression') -require('rnn.SpatialBinaryLogisticRegression') - -- for testing: require('rnn.test') require('rnn.bigtest') diff --git a/scripts/evaluate-rnnlm.lua b/scripts/evaluate-rnnlm.lua index 5c8dba7..865ea2e 100644 --- a/scripts/evaluate-rnnlm.lua +++ b/scripts/evaluate-rnnlm.lua @@ -44,14 +44,14 @@ if opt.dumpcsv then local csvfile = opt.xplogpath:match('([^/]+)[.]t7$')..'.csv' paths.mkdir('learningcurves') csvpath = paths.concat('learningcurves', csvfile) - + local file = io.open(csvpath, 'w') file:write("epoch,trainerr,validerr\n") for i=1,#trainerr do file:write(string.format('%d,%f,%f\n', i, trainerr[i], validerr[i])) end file:close() - + print("CSV file saved to "..csvpath) os.exit() end @@ -134,7 +134,7 @@ if opt.nsample > 0 then end else local sumErr, count = 0, 0 - + for i, inputs, targets in testset:subiter(xplog.opt.seqlen or 100) do inputs:apply(function(x) if x > 0 then @@ -147,7 +147,7 @@ else local err = criterion:forward(outputs, targets) sumErr = sumErr + err end - + if count ~= testset:size() then local meanseqlen = testset:size()/(testset:size() - count) print("mean sequence length : "..meanseqlen) diff --git a/test/bigtest.lua b/test/bigtest.lua index 5facc30..72fd913 100644 --- a/test/bigtest.lua +++ b/test/bigtest.lua @@ -121,7 +121,7 @@ function rnnbigtest.LSTM_char_rnn() end -- evaluate the input sums at once for efficiency local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x):annotate{name='i2h_'..L} - local h2h = nn.LinearNoBias(rnn_size, 4 * rnn_size)(prev_h):annotate{name='h2h_'..L} + local h2h = nn.Linear(rnn_size, 4 * rnn_size):noBias()(prev_h):annotate{name='h2h_'..L} local all_input_sums = nn.CAddTable()({i2h, h2h}) local reshaped = nn.Reshape(4, rnn_size)(all_input_sums) @@ -480,7 +480,7 @@ function rnnbigtest.Reinforce() mlp:add(nn.Linear(inputs:size(2),hiddenSize)) mlp:add(nn.Tanh()) mlp:add(nn.ReinforceNormal(stdev)) - mlp:add(nn.Clip(-1,1)) + mlp:add(nn.Clamp(-1,1)) mlp:add(nn.Linear(hiddenSize, inputs:size(2))) mlp:add(nn.SoftMax()) @@ -548,82 +548,6 @@ function rnnbigtest.Reinforce() train(concat, cost, N, 'ReinforceCategorical') end --- Unit Test Kmeans layer -function rnnbigtest.Kmeans() - local k = 10 - local dim = 5 - local batchSize = 1000 - local input = torch.Tensor(batchSize, dim) - for i=1, batchSize do - input[i]:fill(torch.random(1, k)) - end - - local verbose = false - - local attempts = 10 - local iter = 100 - local bestLoss = 100000000 - local bestKm = nil - local tempLoss = 0 - local learningRate = 1 - - local initTypes = {'random', 'kmeans++'} - local useCudas = {false} - if pcall(function() require 'cunn' end) then - useCudas[2] = true - end - for _, initType in pairs(initTypes) do - for _, useCuda in pairs(useCudas) do - - if useCuda then - input = input:cuda() - else - input = input:double() - end - - sys.tic() - for j=1, attempts do - local km = nn.Kmeans(k, dim) - if useCuda then km:cuda() end - - if initType == 'kmeans++' then - km:initKmeansPlus(input) - else - km:initRandom(input) - end - - for i=1, iter do - km:zeroGradParameters() - - km:forward(input) - km:backward(input, gradOutput) - - -- Gradient descent - km.weight:add(-learningRate, km.gradWeight) - tempLoss = km.loss - end - if verbose then print("Attempt Loss " .. j ..": " .. 
tempLoss) end - if tempLoss < bestLoss then - bestLoss = tempLoss - end - if (initType == 'kmeans++' and bestLoss < 0.00001) or (initType == 'random' and bestLoss < 500) then - break - end - end - if verbose then - print("InitType: " .. initType .. " useCuda: " .. tostring(useCuda)) - print("Best Loss: " .. bestLoss) - print("Total time: " .. sys.toc()) - end - if initType == 'kmeans++' then - mytester:assert(bestLoss < 0.00001, "Kmeans++ error ("..(useCuda and 'cuda' or 'double')..")") - else - mytester:assert(bestLoss < 500, "Kmeans error ("..(useCuda and 'cuda' or 'double')..")") - end - end - end -end - function rnnbigtest.NCE_benchmark() pcall(function() require 'cunn' end) -- make sure to import cunn before initializing large tensors, else weird segfault... diff --git a/test/test.lua b/test/test.lua index 734b1e3..660883e 100644 --- a/test/test.lua +++ b/test/test.lua @@ -148,7 +148,6 @@ function rnntest.RecurrentAttention() glimpseSensor:add(nn.ReLU()) local glimpse = nn.Sequential() - --glimpse:add(nn.PrintSize("preglimpse")) glimpse:add(nn.ConcatTable():add(locationSensor):add(glimpseSensor)) glimpse:add(nn.JoinTable(1,1)) glimpse:add(nn.Linear(opt.glimpseHiddenSize+opt.locatorHiddenSize, opt.imageHiddenSize)) @@ -2311,8 +2310,8 @@ function rnntest.issue129() output = model:forward(input):clone() end - mytester:assertTensorEq(model1:get(1).running_mean, model:get(2).module.sharedClones[1].modules[1].running_mean, 0.000001) - mytester:assertTensorEq(model:get(2).module.sharedClones[1].modules[1].running_mean, model:get(2).module.modules[1].modules[1].running_mean, 0.0000001) + mytester:assertTensorEq(model1:get(1).running_mean, model:get(2).modules[1].sharedClones[1].modules[1].running_mean, 0.000001) + mytester:assertTensorEq(model:get(2).modules[1].sharedClones[1].modules[1].running_mean, model:get(2).modules[1].modules[1].modules[1].running_mean, 0.0000001) model:evaluate() local output2 = model:forward(input):clone() @@ -4794,188 +4793,6 @@ function rnntest.Module_getParameters() end end -function rnntest.Serial() - function test(mlp, name) - local input = torch.randn(4,3) - local gradOutput = torch.randn(4,7) - local mlp2 = mlp:clone():Serial() - - local output = mlp:forward(input):clone() - local gradInput = mlp:backward(input, gradOutput):clone() - - local output2 = mlp2:forward(input) - local gradInput2 = mlp2:backward(input, gradOutput) - - mytester:assertTensorEq(output, output2, 0.000001, name.." serial forward error") - mytester:assertTensorEq(gradInput, gradInput2, 0.00001, name.." serial backward error") - - mlp2:mediumSerial() - mlp2.tensortype = 'torch.FloatTensor' - local mlp3 = mlp2:clone() - - mytester:assert(mlp3.modules[1].output:nElement() == 0, name.." serial medium empty err") - mytester:assert(torch.type(mlp3.modules[1].output) == 'torch.FloatTensor', name.." serial medium type err") - - mlp:zeroGradParameters() - local output = mlp:forward(input) - local gradInput = mlp:backward(input, gradOutput) - - mlp3:zeroGradParameters() - local output2 = mlp3:forward(input:float()) - local gradInput2 = mlp3:backward(input:float(), gradOutput:float()) - - mytester:assertTensorEq(output:float(), output2, 0.000001, name.." serial forward error") - mytester:assertTensorEq(gradInput:float(), gradInput2, 0.00001, name.." 
serial backward error") - - local params, gradParams = mlp:parameters() - local params2, gradParams2 = mlp3:parameters() - mytester:assert(#params == #params2) - for i,param in ipairs(params) do - mytester:assertTensorEq(param:float(), params2[i], 0.00001, name.." params err "..i) - mytester:assertTensorEq(gradParams[i]:float(), gradParams2[i], 0.00001, name.." gradParams err "..i) - end - end - - local mlp = nn.Sequential():extend( - nn.Linear(3,4), - nn.Tanh(), - nn.Linear(4,5), - nn.Sequential():extend( - nn.Linear(5,6), - nn.Tanh(), - nn.Linear(6,7) - ) - ) - - test(mlp, 'mlp') - - local seq = nn.Sequential() - seq:add(nn.Repeater(nn.LinearRNN(3,2), 3)) - seq:add(nn.Sequencer(nn.Linear(2,7))) - seq:add(nn.SelectTable(-1)) - test(seq, 'rnn2') -end - -function rnntest.Convert() - -- batch mode - local c = nn.Convert('bchw', 'chwb') - local input = torch.randn(8,3,5,5) - local output = c:forward(input) - local output2 = input:transpose(1,4):transpose(1,3):transpose(1,2) - mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->chwb") - local gradInput = c:backward(input, output) - mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd bchw->chwb") - local c = nn.Convert('bchw', 'bf') - local output = c:forward(input) - local output2 = input:view(8,-1) - mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->bf") - c:float() - local output = c:forward(input:float()) - mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type()") - local output = c:forward(input) - mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float") - -- non-batch mode - local c = nn.Convert('chw', 'hwc') - local input = torch.randn(3,5,5) - local output = c:forward(input) - local output2 = input:transpose(1,3):transpose(1,2) - mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->hwc non-batch") - local gradInput = c:backward(input, output) - mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd chw->hwc non-batch") - local c = nn.Convert('chw', 'f') - local output = c:forward(input) - local output2 = input:view(-1) - mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->bf non-batch") - c:float() - local output = c:forward(input:float()) - mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() non-batch") - local output = c:forward(input) - mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float non-batch") -end - -function rnntest.Collapse() - local c = nn.Collapse(3) - local input = torch.randn(8,3,4,5) - local output = c:forward(input) - mytester:assertTensorEq(input:view(8,-1), output, 0.000001, "Collapse:forward") - local gradInput = c:backward(input, output) - mytester:assertTensorEq(gradInput, input, 0.000001, "Collapse:backward") - mytester:assertTableEq(gradInput:size():totable(), input:size():totable(), 0.000001, "Collapse:backward size") - local input2 = input:transpose(1,4) - local output2 = c:forward(input2) - mytester:assertTensorEq(input2:contiguous():view(5,-1), output2, 0.000001, "Collapse:forward non-contiguous") - local gradInput2 = c:backward(input2, output2) - mytester:assertTensorEq(gradInput2, input2, 0.000001, "Collapse:backward non-contiguous") - mytester:assertTableEq(gradInput2:size():totable(), input2:size():totable(), 0.000001, "Collapse:backward size non-contiguous") -end - -function rnntest.ZipTable() - -- input : { {a1,a2}, {b1,b2}, {c1,c2} } - -- output : { {a1,b1,c1}, {a2,b2,c2} } - local z = 
nn.ZipTable() - local input = { - {torch.randn(3,4), torch.randn(3,4)}, - {torch.randn(3,4), torch.randn(3,4)}, - {torch.randn(3,4), torch.randn(3,4)} - } - local output = z:forward(input) - mytester:assert(#output == 2, "ZipTable #output") - mytester:assert(#(output[1]) == 3, "ZipTable #output[1]") - mytester:assertTensorEq(input[1][1], output[1][1], 0.000001, "ZipTable input11") - mytester:assertTensorEq(input[1][2], output[2][1], 0.000001, "ZipTable input12") - mytester:assertTensorEq(input[3][2], output[2][3], 0.000001, "ZipTable input32") - local gradInput = z:backward(input, output) - mytester:assert(#gradInput == 3, "ZipTable #gradInput") - mytester:assert(#(gradInput[1]) == 2, "ZipTable #gradInput[1]") - mytester:assertTensorEq(input[1][1], gradInput[1][1], 0.000001, "ZipTable gradInput11") - mytester:assertTensorEq(input[1][2], gradInput[1][2], 0.000001, "ZipTable gradInput12") - mytester:assertTensorEq(input[3][2], gradInput[3][2], 0.000001, "ZipTable gradInput32") -end - -function rnntest.ZipTableOneToMany() - -- input : { v, {a,b,c} } - -- output : { {v,a}, {v,b}, {v,c} } - local z = nn.ZipTableOneToMany() - local input = { torch.randn(3), { torch.randn(4), torch.rand(4), torch.rand(4) } } - local output = z:forward(input) - mytester:assert(#output == 3, "ZipTableOneToMany #output") - mytester:assert(#(output[1]) == 2, "ZipTableOneToMany #output[1]") - mytester:assert(#(output[2]) == 2, "ZipTableOneToMany #output[2]") - mytester:assert(#(output[3]) == 2, "ZipTableOneToMany #output[3]") - mytester:assertTensorEq(input[1], output[1][1], 0.000001, "ZipTableOneToMany input1 output11") - mytester:assertTensorEq(input[1], output[2][1], 0.000001, "ZipTableOneToMany input1 output21") - mytester:assertTensorEq(input[1], output[3][1], 0.000001, "ZipTableOneToMany input1 output31") - mytester:assertTensorEq(input[2][1], output[1][2], 0.000001, "ZipTableOneToMany input21") - mytester:assertTensorEq(input[2][2], output[2][2], 0.000001, "ZipTableOneToMany input22") - mytester:assertTensorEq(input[2][3], output[3][2], 0.000001, "ZipTableOneToMany input23") - local gradInput = z:backward(input, output) - mytester:assert(#gradInput == 2, "ZipTableOneToMany #gradInput") - mytester:assert(#(gradInput[2]) == 3, "ZipTableOneToMany #gradInput[2]") - mytester:assertTensorEq(input[2][1], gradInput[2][1], 0.000001, "ZipTableOneToMany gradInput21") - mytester:assertTensorEq(input[2][2], gradInput[2][2], 0.000001, "ZipTableOneToMany gradInput22") - mytester:assertTensorEq(input[2][3], gradInput[2][3], 0.000001, "ZipTableOneToMany gradInput32") - mytester:assertTensorEq(torch.mul(input[1], 3), gradInput[1], 0.000001, "ZipTableOneToMany gradInput21") -end - -function rnntest.CAddTensorTable() - -- input : { v, {a,b,c} } - -- output : { v+a, v+b, v+c } - local z = nn.CAddTensorTable() - local input = { torch.randn(3), { torch.randn(3), torch.rand(3), torch.rand(3) } } - local output = z:forward(input) - mytester:assert(#output == 3, "CAddTensorTable #output") - mytester:assertTensorEq(input[1]+input[2][1], output[1], 0.00001, "CAddTensorTable input21 output1") - mytester:assertTensorEq(input[1]+input[2][2], output[2], 0.00001, "CAddTensorTable input22 output2") - mytester:assertTensorEq(input[1]+input[2][3], output[3], 0.00001, "CAddTensorTable input23 output3") - local gradInput = z:backward(input, output) - mytester:assert(#gradInput == 2, "CAddTensorTable #gradInput") - mytester:assert(#(gradInput[2]) == 3, "CAddTensorTable #gradInput[2]") - mytester:assertTensorEq(output[1], gradInput[2][1], 0.000001, 
"CAddTensorTable gradInput21") - mytester:assertTensorEq(output[2], gradInput[2][2], 0.000001, "CAddTensorTable gradInput22") - mytester:assertTensorEq(output[3], gradInput[2][3], 0.000001, "CAddTensorTable gradInput23") - mytester:assertTensorEq(output[1]+output[2]+output[3], gradInput[1], 0.000001, "CAddTensorTable gradInput1") -end - function rnntest.ReverseSequence() -- test table @@ -5018,79 +4835,6 @@ function rnntest.ReverseSequence() end end -function rnntest.Inception() - local size = {8,3,32,32} - local outputSize = {8,16+24+8+12,32,32} - local input = torch.rand(unpack(size)) - local gradOutput = torch.randn(unpack(outputSize)) - local incep = nn.Inception{inputSize=3, outputSize={16,24}, reduceSize={14,16,8,12}} - for i, param in ipairs(incep:parameters()) do - mytester:assert(_.isFinite(param:sum()), 'inception init error') - end - local output = incep:forward(input) - mytester:assertTableEq(output:size():totable(), outputSize, 0.00001) - mytester:assert(_.isFinite(output:sum())) - incep:zeroGradParameters() - local gradInput = incep:backward(input, gradOutput) - mytester:assertTableEq(gradInput:size():totable(), size, 0.00001) - mytester:assert(_.isFinite(gradInput:sum())) - incep:updateParameters(0.1) - for i, param in ipairs(incep:parameters()) do - mytester:assert(_.isFinite(param:sum()), 'inception update error') - end - incep:maxParamNorm(1) - for i, param in ipairs(incep:parameters()) do - mytester:assert(_.isFinite(param:sum()), 'inception maxNorm error') - end -end - -function rnntest.SpatialUniformCrop() - if not pcall(function() require "nnx" end) then return end -- needs the nnx package - local input = torch.Tensor(8,3,10,10):copy(torch.range(1,8):view(8,1,1,1):expand(8,3,10,10)) - local gradOutput = torch.Tensor(8,3,4,4):copy(torch.range(1,8):view(8,1,1,1):expand(8,3,4,4)) - local sc = nn.SpatialUniformCrop(4) - local output, gradInput - for i=1,100 do - output = sc:forward(input) - gradInput = sc:backward(input, gradOutput) - end - for i=1,8 do - mytester:assert(math.abs(output[i]:mean() - i) < 0.0001, "SpatialUniformCrop output err "..i) - mytester:assert(math.abs(gradInput[i]:mean() - ((i*4*4)/(10*10))) < 0.0001, "SpatialUniformCrop gradInput err"..i) - end - - local input = torch.zeros(1, 1, 120, 120) - local temp = input[1]:narrow(2, 30, 60):narrow(3, 30, 60) - temp:fill(1) - local scale = {} - scale['min'] = 0.8 - scale['max'] = 1.2 - - local layer = nn.SpatialUniformCrop(100, 100, scale) - local o = layer:forward(input) - gradInput = layer:backward(input, o) - mytester:assert(gradInput:max() ~= nil, "SpatialUniformCrop scaling error.") -end - -function rnntest.ModuleCriterion() - local input = torch.randn(8,4) - local target = torch.randn(8,4) - local inputModule = nn.Tanh() - local criterion = nn.MSECriterion() - local mc = nn.ModuleCriterion(criterion, inputModule) - - local err = mc:forward(input, target) - local gradInput = mc:backward(input, target) - - local output = inputModule:forward(input) - local err2 = criterion:forward(output, target) - local gradOutput = criterion:backward(output, target) - local gradInput2 = inputModule:backward(input, gradOutput) - - mytester:assert(err == err2, "ModuleCriterion backward err") - mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "ModuleCriterion backward err") -end - function rnntest.ReinforceNormal() local input = torch.randn(500,1000) -- means local gradOutput = torch.Tensor() -- will be ignored @@ -5290,35 +5034,6 @@ function rnntest.BinaryClassReward() mytester:assertTensorEq(rf2.reward, rf.reward, 
0.0000001) end -function rnntest.Clip() - local input = torch.randn(200,300) - local gradOutput = torch.randn(200,300) - local minval, maxval = -0.05, 0.1 - local clip = nn.Clip(minval, maxval) - local output = clip:forward(input) - local output2 = input:clone() - local mask = input.new() - mask:gt(input, maxval) - output2[mask:type("torch.ByteTensor")] = maxval - mask:lt(input, minval) - output2[mask:type("torch.ByteTensor")] = minval - mytester:assertTensorEq(output, output2, 0.00001, "Clip forward err") - local gradInput = clip:backward(input, gradOutput) - mytester:assertTensorEq(gradInput, gradOutput, 0.00001, "Clip backward err") -end - -function rnntest.Constant() - local input = torch.randn(20,3,7) - local gradOutput = torch.randn(20,30,6) - local value = torch.randn(30,6) - local const = nn.Constant(value:clone(), 2) - local output = const:forward(input) - local gradInput = const:backward(input, output) - local output2 = value:view(1,30,6):expand(20,30,6) - mytester:assertTensorEq(output2, output, 0.000001, "Constant forward err") - mytester:assertTensorEq(gradInput, input:zero(), 0.000001, "Constant backward err") -end - function rnntest.SpatialGlimpse() if not pcall(function() require "image" end) then return end -- needs the image package if not pcall(function() require "nnx" end) then return end -- needs the nnx package @@ -6023,417 +5738,6 @@ function rnntest.TotalDropout() mytester:assert(nOne < 10 and nOne > 1, "TotalDropout bernoulli error") end - --- Unit Test WhiteNoise -function rnntest.WhiteNoise() - local input = torch.zeros(3, 28, 28) - local addNoise = nn.WhiteNoise() - local output = addNoise:forward(input) - local meanValue = output:mean() - local stdValue = output:std() - mytester:assert(meanValue > -0.01 and meanValue < 0.01) - mytester:assert(stdValue < 0.15 and stdValue >= 0) - - -- Evaluate - addNoise:evaluate() - output = addNoise:forward(input) - meanValue = output:mean() - stdValue = output:std() - mytester:assert(meanValue == 0) - mytester:assert(stdValue == 0) - - -- backprop - addNoise:training() - local gradOutput = torch.rand(3, 28, 28) - local gradInput = addNoise:updateGradInput(input, gradOutput) - mytester:assertTensorEq(gradOutput, gradInput, 0.000001, "WhiteNoise backward err") -end - --- Unit Test SpatialBinaryLogisticRegression criterion -function rnntest.SpatialBinaryLogisticRegression() - local crit = nn.SpatialBinaryLogisticRegression() - local k = 32 - local h = 28 - local w = 28 - - -- Working with batch of images - local input = torch.zeros(k, 1, h, w) - local target = torch.zeros(k, 1, h, w) - local inputs = {1, 0, -1} - local targets = {1, 0, -1} - for _,i in pairs(inputs) do - for _,t in pairs(targets) do - - input:fill(i) - target:fill(t) - -- Check forward - local loss = crit:updateOutput(input, target) - local myLoss = math.log(1+math.exp(-1*i*t))/2 - mytester:assert( loss >= myLoss-precision and loss <= myLoss+precision, - "SpatialBinaryLogisticRegression cost incorrect.") - - -- Check backward - local gradInput = crit:updateGradInput(input, target) - local g1 = gradInput[1][1][1][1] - local gi = (1/(1+math.exp(-1*i*t)))*math.exp(-1*i*t)*(-1*t)/(2*k*h*w) - mytester:assert( g1 >= gi-precision and g1 <= gi+precision, - "SpatialBinaryLogisticRegression gradInput error.") - end - end - - -- Working with single image - k = 1 - local input = torch.zeros(1, h, w) - local target = torch.zeros(1, h, w) - local inputs = {1, 0, -1} - local targets = {1, 0, -1} - for _,i in pairs(inputs) do - for _,t in pairs(targets) do - - input:fill(i) - 
target:fill(t) - -- Check forward - local loss = crit:updateOutput(input, target) - local myLoss = math.log(1+math.exp(-1*i*t))/2 - mytester:assert( loss >= myLoss-precision and loss <= myLoss+precision, - "SpatialBinaryLogisticRegression cost incorrect.") - - -- Check backward - local gradInput = crit:updateGradInput(input, target) - local g1 = gradInput[1][1][1] - local gi = (1/(1+math.exp(-1*i*t)))*math.exp(-1*i*t)*(-1*t)/(2*k*h*w) - mytester:assert( g1 >= gi-precision and g1 <= gi+precision, - "SpatialBinaryLogisticRegression gradInput error.") - end - end -end - --- Unit Test BinaryLogisticRegression criterion -function rnntest.BinaryLogisticRegression() - local crit = nn.BinaryLogisticRegression() - local k = 32 - - -- Working with batch of images - local input = torch.zeros(k, 1) - local target = torch.zeros(k, 1) - local inputs = {1, 0, -1} - local targets = {1, 0, -1} - for _,i in pairs(inputs) do - for _,t in pairs(targets) do - - input:fill(i) - target:fill(t) - -- Check forward - local loss = crit:updateOutput(input, target) - local myLoss = math.log(1+math.exp(-1*i*t)) - mytester:assert( loss >= myLoss-precision and loss <= myLoss+precision, - "BinaryLogisticRegression cost incorrect.") - - -- Check backward - local gradInput = crit:updateGradInput(input, target) - local g1 = gradInput[1][1] - local gi = (1/(1+math.exp(-1*i*t)))*math.exp(-1*i*t)*(-1*t)/(k) - mytester:assert( g1 >= gi-precision and g1 <= gi+precision, - "BinaryLogisticRegression gradInput error.") - end - end - - -- Working nElements not matching. - local input = torch.zeros(1, k) - local target = torch.zeros(k, 1) - local inputs = {1, 0, -1} - local targets = {1, 0, -1} - for _,i in pairs(inputs) do - for _,t in pairs(targets) do - - input:fill(i) - target:fill(t) - -- Check forward - local loss = crit:updateOutput(input, target) - local myLoss = math.log(1+math.exp(-1*i*t)) - mytester:assert( loss >= myLoss-precision and loss <= myLoss+precision, - "BinaryLogisticRegression cost incorrect.") - - -- Check backward - local gradInput = crit:updateGradInput(input, target) - local g1 = gradInput[1][1] - local gi = (1/(1+math.exp(-1*i*t)))*math.exp(-1*i*t)*(-1*t)/(k) - mytester:assert( g1 >= gi-precision and g1 <= gi+precision, - "BinaryLogisticRegression gradInput error.") - end - end -end - --- Unit Test SpatialRegionDropout -function rnntest.SpatialRegionDropout() - local hasCuda = pcall(function() require 'cunn' end) - local useCudas = {false, hasCuda} - local p = 0.2 - local value = 2 - local model = nn.SpatialRegionDropout(p) - local input = torch.zeros(3, 100, 100):fill(value) - - for _, useCuda in pairs(useCudas) do - if useCuda then - model:cuda() - input = input:cuda() - end - local output = model:forward(input) - mytester:assert( output:mean() >= value-precision and - output:mean() <= value+precision, - "SpatialRegionDropout forward mean value incorrect.") - - local gradInput = model:backward(input, input) - mytester:assert( gradInput:mean() >= value-precision and - gradInput:mean() <= value+precision, - "SpatialRegionDropout backward mean value incorrect.") - end -end - --- Unit Test SpatialBinaryConvolution -function rnntest.SpatialBinaryConvolution() - local hasCuda = pcall(function() require 'cunn' end) - local useCudas = {false, hasCuda} - local nInputPlane = 3 - local nOutputPlane = 16 - local kW = 3 - local kH = 3 - local height = 224 - local width = 224 - - local model = nn.SpatialBinaryConvolution(nInputPlane, nOutputPlane, - kW, kH) - local input = torch.rand(nInputPlane, height, width) - - for 
_, useCuda in pairs(useCudas) do - if useCuda then - model:cuda() - input = input:cuda() - end - model:zeroGradParameters() - local output = model:forward(input) - local gradInput = model:backward(input, output) - end -end - --- Unit Test SimpleColorTransform -function rnntest.SimpleColorTransform() - local hasCuda = pcall(function() require 'cunn' end) - local useCudas = {false, hasCuda} - local value = 10 - local rangeValue = 2 - local precision = rangeValue*0.1 - local range = torch.zeros(3):fill(rangeValue) - local model = nn.SimpleColorTransform(3, range) - local input = torch.zeros(32, 3, 100, 100):fill(value) - - for _, useCuda in pairs(useCudas) do - if useCuda then - model:cuda() - input = input:cuda() - end - local output = model:forward(input) - mytester:assert(output:std() <= rangeValue+precision, - "SimpleColorTransform output value incorrect.") - local gradInput = model:backward(input, input) - mytester:assert(gradInput:sum() == input:sum(), - "SimpleColorTransform gradInput value incorrect.") - end -end - --- Unit Test PCAColorTransform -function rnntest.PCAColorTransform() - local hasCuda = pcall(function() require 'cunn' end) - local useCudas = {false, hasCuda} - local std = 0.1 - local value = 145 - local rangeValue = 1800 - local precision = rangeValue * 3 * std - local eigenVectors = torch.Tensor({{ 0.58786434, 0.56388045, 0.58004685}, - {-0.65427388, -0.0902746 , 0.75085031}, - {-0.47575331, 0.82090763, -0.31586303}}) - local eigenValues = torch.Tensor({4491.21, 722.85, 68.07}) - local model = nn.PCAColorTransform(3, eigenVectors, eigenValues, std) - local input = torch.zeros(32, 3, 100, 100):fill(value) - - for _, useCuda in pairs(useCudas) do - if useCuda then - model:cuda() - input = input:cuda() - end - local output = model:forward(input) - mytester:assert(output:std() <= rangeValue+precision, - "PCAColorTransform output value incorrect.") - local gradInput = model:backward(input, input) - mytester:assert(gradInput:sum() == input:sum(), - "PCAColorTransform gradInput value incorrect.") - end -end - --- Unit Test FireModule -function rnntest.FireModule() - local hasCuda = pcall(function() require 'cunn' end) - local useCudas = {false, hasCuda} - local activations = {'ReLU', 'Tanh', 'Sigmoid'} - local nInputPlane = 3 - local width = 32 - local height = 32 - local s1x1 = 16 - local e1x1 = 16 - local e3x3 = 16 - for _, activation in pairs(activations) do - for _, useCuda in pairs(useCudas) do - local model = nn.FireModule(nInputPlane, s1x1, e1x1, e3x3) - local input = torch.rand(1, nInputPlane, height, width) - if useCuda then - model:cuda() - input = input:cuda() - end - local output = model:forward(input) - local gradInput = model:backward(input, output) - end - end -end - --- Unit Test SpatialFeatNormalization -function rnntest.SpatialFeatNormalization() - local hasCuda = pcall(function() require 'cunn' end) - local useCudas = {false, hasCuda} - local input = torch.zeros(3, 32, 32):fill(2) - local mean = torch.zeros(3):fill(1) - local std = torch.zeros(3):fill(0.5) - local outputValue = 2 - local gradValue = 4 - for _, useCuda in pairs(useCudas) do - local model = nn.SpatialFeatNormalization(mean, std) - if useCuda then - model:cuda() - input = input:cuda() - end - local output = model:forward(input) - local gradInput = model:backward(input, output) - mytester:assert( output:mean() == outputValue, - "SpatialFeatNormalization forward mean value incorrect.") - mytester:assert( gradInput:mean() == gradValue, - "SpatialFeatNormalization backward mean value incorrect.") 
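-- (sanity check on the constants above, assuming the usual per-channel
-- (x - mean) / std normalization: with x = 2, mean = 1, std = 0.5 the forward
-- output is (2 - 1) / 0.5 = 2 = outputValue, and the backward pass rescales
-- gradOutput (= output) by 1/std again, giving 2 / 0.5 = 4 = gradValue)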
- end -end - -function rnntest.OneHot() - local nClass = 10 - - -- batch mode - local batchSize = 3 - local input = torch.LongTensor(batchSize):random(1, nClass) - local gradOutput = torch.randn(batchSize, nClass) - - local oh = nn.OneHot(nClass) - - local output = oh:forward(input) - local output2 = torch.Tensor(batchSize, nClass):zero() - local eye = torch.eye(nClass) - output2:index(eye, 1, input) - mytester:assertTensorEq(output, output2, 0.000001, "OneHot forward batch err") - mytester:assert(output:dim() == 2) - - -- non-batch mode (number input) - local num = 3 - local output3 = torch.zeros(nClass) - output3[num] = 1.0 - mytester:assertTensorEq(oh:forward(num), output3, 0.000001, "OneHot forward number err") - - local gradInput = oh:backward(input, gradOutput) - mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot backward batch err") - - if pcall(function() require 'cunn' end) then - oh:cuda() - - -- test with long input - local output = oh:forward(input) - mytester:assert(torch.type(output) == 'torch.CudaTensor') - mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch long-cuda err") - - -- test with cuda input - local input = input:cuda() - gradOutput = gradOutput:cuda() - - local output = oh:forward(input) - mytester:assert(torch.type(output) == 'torch.CudaTensor') - mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch cuda err") - - local gradInput2 = oh:backward(input, gradOutput) - mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot backward batch err") - cutorch.synchronize() - - -- non-batch mode (number input) - mytester:assertTensorEq(oh:forward(num), output3:cuda(), 0.000001, "OneHot forward number err") - end - - -- multi-dimensional input - local inputSize = 2 - local input = torch.LongTensor(batchSize, inputSize):random(1, nClass) - local gradOutput = torch.randn(batchSize, inputSize, nClass) - - local oh = nn.OneHot(nClass, 2) - - local output = oh:forward(input) - local output2 = torch.Tensor(batchSize*inputSize, nClass):zero() - local eye = torch.eye(nClass) - output2:index(eye, 1, input:view(-1)) - output2:resize(batchSize, inputSize, nClass) - mytester:assertTensorEq(output, output2, 0.000001, "OneHot 2d forward batch err") - mytester:assert(output:dim() == 3) - - local gradInput = oh:backward(input, gradOutput) - mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot 2d backward batch err") - - if pcall(function() require 'cunn' end) then - oh:cuda() - - -- test with long input - local output = oh:forward(input) - mytester:assert(torch.type(output) == 'torch.CudaTensor') - mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch long-cuda err") - - -- test with cuda input - local input = input:cuda() - gradOutput = gradOutput:cuda() - - local output = oh:forward(input) - mytester:assert(torch.type(output) == 'torch.CudaTensor') - mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch cuda err") - - local gradInput2 = oh:backward(input, gradOutput) - mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot 2d backward batch err") - - local benchmark = false - if benchmark then - local input = torch.FloatTensor(50, 50):random(1,65):cuda() - - local oh = nn.OneHot(65):cuda() - - oh:forward(input) - cutorch.synchronize() - local a = torch.Timer() - for i=1,10 do - oh:forward(input) - end - cutorch.synchronize() - local gputime = a:time().real - - oh:float() - input = 
input:float()
-         oh:forward(input)
-         a = torch.Timer()
-         for i=1,10 do
-            oh:forward(input)
-         end
-         local cputime = a:time().real
-         print("Onehot GPU vs CPU time", gputime, cputime)
-      end
-   end
-end
-
 function rnntest.NCE_main()
    local batchsize = 4
    local k = 10
diff --git a/test/test_firemodule.lua b/test/test_firemodule.lua
deleted file mode 100644
index 9e36edb..0000000
--- a/test/test_firemodule.lua
+++ /dev/null
@@ -1,40 +0,0 @@
-require 'nn'
-require 'rnn'
-require 'cunn'
-require 'cutorch'
-
---torch.setdefaulttensortype('torch.FloatTensor')
-
--- FireModule issue 45
---[[
-m = nn.Sequential()
-m:add(nn.FireModule(1,1,1,1))
-_, p = m:getParameters()
-print(p:sum())
-
-m = m:cuda()
-_, p = m:getParameters()
-print(p:sum())
-
-m:zeroGradParameters()
-print(p:sum())--]]
-
-
--- Testing FireModule
-input = torch.rand(1, 3, 6, 6)
-model = nn.FireModule(3, 1, 1, 1, 'Tanh')
-print(model)
-print(model.module)
-parameters, gradParameters = model:getParameters()
-output = model:forward(input)
-grads = torch.rand(output:size())
-gi = model:backward(input, grads)
-print(gi:mean(), gi:std(), gi:min(), gi:max())
-
-cutorch.setDevice(1)
-model:cuda()
-print(model.module.modules[1].finput)
-cinput = input:cuda()
-output = model:forward(cinput)
-gi = model:backward(input:cuda(), grads:cuda())
-print(gi:mean(), gi:std(), gi:min(), gi:max())
diff --git a/tutorials/ladder.md b/tutorials/ladder.md
deleted file mode 100644
index 591a21b..0000000
--- a/tutorials/ladder.md
+++ /dev/null
@@ -1,107 +0,0 @@
-# Lateral Connections in Denoising Autoencoders Support Supervised Learning
-
-In this tutorial we will see how to implement the ladder network described in [[1](http://arxiv.org/pdf/1504.08215.pdf)]. In this paper the authors show how unsupervised learning, using a denoising autoencoder with lateral connections, helps improve classification accuracy in supervised learning.
-
-To reproduce the results mentioned in the paper, please run the following command (the best test error we got was **`0.6%`**). To run this script you will need the following torch packages: [`nn`](https://github.com/torch/nn), [`nngraph`](https://github.com/torch/nngraph), [`dp`](https://github.com/nicholas-leonard/dp), [`dpnn`](https://github.com/Element-Research/dpnn), [`optim`](https://github.com/torch/optim), and [`cunn`](https://github.com/torch/cunn) & [`cutorch`](https://github.com/torch/cutorch) if using cuda (```--useCuda``` flag).
-```
-  th tutorials/ladder.lua --verbose --eta 500 --epochs 100 --learningRate 0.002 --linearDecay --endLearningRate 0 --startEpoch 50 --useCuda --deviceId 1 --noiseSigma 0.3 --useBatchNorm --batchSize 100 --adam --noValidation --attempts 10
-```
-
-The unsupervised learning (denoising) task supplements the supervised learning task (classification in this case). As in autoencoders, this network has an encoder and a decoder. The output of the encoder is also used for classification. The output of the encoder is **`N`** dimensional, where **`N`** is the number of classes. This **`N`** dimensional vector is used for computing the classification cost and also feeds into the decoder.
-
-## Classification
-Encoder/classifier units are defined as
-```lua
-   Z = nn.BatchNormalization(hidden_units)(nn.Linear(inputDims, hidden_units)(previous_H))
-```
-where
-```lua
-   H = nn.ReLU()(nn.CMul()(nn.Add()(Z)))
-```
-For the first layer, **`previous_H`** is the corrupted input.
-```lua
-   input = nn.WhiteNoise(mean, sigma)
-```
-
-**`H`** for the last encoder unit is defined as
-```lua
-   H = nn.LogSoftMax()(nn.CMul()(nn.Add()(Z)))
-```
-The last **`H`** feeds into the negative log likelihood criterion.
-
-## Denoising
-Typically, in a denoising autoencoder the input samples are corrupted using Dropout [```nn.Dropout```](https://github.com/torch/nn/blob/master/Dropout.lua), but in this paper the authors use isotropic Gaussian noise [```nn.WhiteNoise```](https://github.com/Element-Research/dpnn/blob/master/WhiteNoise.lua) with zero mean.
-
-### Lateral Connections in Autoencoder
-**`Z`** units in the encoder are laterally connected to the corresponding units in the decoder. The output of the decoder unit for neuron `i` is defined by
-```
-   z^_i = a_i1 * z_i + a_i2 * sigmoid(a_i3 * z_i + a_i4) + a_i5
-```
-where
-```
-   a_ij = c_ij * u_i + d_ij
-```
-**`U`** is the output of the decoder unit's ```nn.Linear()```. For the topmost layer **`U`** is zero. **`Z`** is the output of the corresponding encoder unit (this is the lateral connection: the decoder takes output from its previous unit through **`U`** as well as from the corresponding encoder unit). For the lowest layer of the decoder, **`Z`** is the corrupted input signal. **`c_j`** and **`d_j`** are trainable weight vectors. This forms the crux of the ladder network. It can be easily implemented using **`nngraph`** as follows.
-
-For the topmost layer, **`U`**`= 0` and **`Z`** is the batch normalized output from the corresponding (in this case last) encoder/classifier unit. **`Z^`** for the topmost layer is defined as
-```lua
-   z_hat1 = nn.CMul(hidden_units)(Z)
-   z_hat2 = nn.CMul(hidden_units)(Z)
-   z_hat3 = nn.CMul(hidden_units)(Z)
-   z_hat34 = nn.Add(hidden_units)(z_hat3)
-   z_hatSigmoid34 = nn.Sigmoid()(z_hat34)
-   z_hat234 = nn.CMulTable()({z_hat2, z_hatSigmoid34})
-   z_hat5 = nn.CMul(hidden_units)(Z)
-
-   -- Z_hat = z^
-   Z_hat = nn.CAddTable()({z_hat1, z_hat234, z_hat5})
-```
-
-For lower decoder units, **`Z^`** is defined as
-```lua
-   u = nn.Linear()(previous_Z_hat)
-
-   cu1 = nn.CMul(hidden_units)(u)
-   du1 = nn.Add(hidden_units)(u)
-   a1 = nn.CAddTable()({cu1, du1})
-   cu2 = nn.CMul(hidden_units)(u)
-   du2 = nn.Add(hidden_units)(u)
-   a2 = nn.CAddTable()({cu2, du2})
-   cu3 = nn.CMul(hidden_units)(u)
-   du3 = nn.Add(hidden_units)(u)
-   a3 = nn.CAddTable()({cu3, du3})
-   cu4 = nn.CMul(hidden_units)(u)
-   du4 = nn.Add(hidden_units)(u)
-   a4 = nn.CAddTable()({cu4, du4})
-   cu5 = nn.CMul(hidden_units)(u)
-   du5 = nn.Add(hidden_units)(u)
-   a5 = nn.CAddTable()({cu5, du5})
-
-   z_hat1 = nn.CMulTable()({a1, z})
-   z_hat2 = nn.CMulTable()({a3, z})
-   z_hat3 = nn.Sigmoid()(nn.CAddTable()({z_hat2, a4}))
-   z_hat4 = nn.CMulTable()({a2, z_hat3})
-   Z_hat = nn.CAddTable()({z_hat1, z_hat4, a5})
-```
-`Z_hat` is `z^`. The final `Z_hat` is the output of the decoder and feeds into the mean squared error criterion.
-
-## Criterions
-The negative log likelihood criterion is used for the classification task.
-```lua
-   nll = nn.ClassNLLCriterion()
-```
-Mean squared error is used for the auxiliary task.
-```lua
-   mse = nn.MSECriterion()
-```
-These two training criterions are combined using `eta`, which determines the weight of the auxiliary task. If `eta` is zero then the model is trained for classification only.
-The combined criterion:
-```lua
-   criterions = nn.ParallelCriterion()
-   criterions:add(nll)
-   criterions:add(mse, eta)
-```
-
-## References
-[1] Rasmus, Antti, Harri Valpola, and Tapani Raiko. "Lateral Connections in Denoising Autoencoders Support Supervised Learning." arXiv preprint arXiv:1504.08215 (2015).
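For reference, the element-wise combinator that the deleted tutorial assembles from `nn.CMul`/`nn.Add` nodes can be written as one plain function. The sketch below is illustrative only (the function and argument names are not part of the patch); it assumes `z`, `u`, and the entries of `c` and `d` are tensors of equal size:

```lua
-- z^ = a1*z + a2*sigmoid(a3*z + a4) + a5, where a_j = c_j*u + d_j
-- (all products element-wise; c and d hold five trainable weight vectors)
local function combinator(z, u, c, d)
   local a = {}
   for j = 1, 5 do
      a[j] = torch.cmul(c[j], u) + d[j] -- a_j = c_j*u + d_j
   end
   local s = torch.sigmoid(torch.cmul(a[3], z) + a[4])
   return torch.cmul(a[1], z) + torch.cmul(a[2], s) + a[5]
end
```

For the topmost decoder unit `u` is zero, so each `a_j` degenerates to a plain trainable vector, which is why that unit is built from `nn.CMul` and `nn.Add` applied directly to `Z`.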
diff --git a/tutorials/ladder_network/ladder.lua b/tutorials/ladder_network/ladder.lua
deleted file mode 100644
index 5e556cf..0000000
--- a/tutorials/ladder_network/ladder.lua
+++ /dev/null
@@ -1,444 +0,0 @@
---[[!
-   Implementation of the ladder network as described in http://arxiv.org/pdf/1504.08215.pdf
---]]
-
-require 'nn'
-require 'dp'
-require 'dpnn'
-require 'math'
-require 'xlua'
-require 'optim'
-require 'nngraph'
-
--- Cuda
-require 'cutorch'
-require 'cunn'
-
--- Help functions
-require 'ladder_help_funcs'
-
-torch.setdefaulttensortype("torch.FloatTensor")
-op = xlua.OptionParser('%prog [options]')
-
--- Data
-op:option{'--noValidation', action='store_true', dest='noValidation',
-          help='Use validation data for training as well.', default=false}
-op:option{'--best', action='store_true', dest='best',
-          help='Use best training or validation model.', default=false}
-
--- Model parameters
-op:option{'--noOfClasses', action='store', dest='noOfClasses',
-          help='Number of classes.', default=10} -- MNIST data
-op:option{'--noiseSigma', action='store', dest='noiseSigma',
-          help='Stdev for noise for denoising autoencoder (Mean is zero).',
-          default=0}
-op:option{'--hiddens', action='store', dest='hiddens',
-          help='Hidden units.', default='{1000, 500, 250, 250, 250}'}
-op:option{'--useBatchNorm', action='store_true', dest='useBatchNorm',
-          help='Use batch normalization.', default=false}
-op:option{'--weightTied', action='store_true', dest='weightTied',
-          help='Tie weights of decoder with encoder.', default=false}
-
--- Criterion and learning
-op:option{'--attempts', action='store', dest='attempts',
-          help='Run attempts independent experiments.', default=1}
-op:option{'--eta', action='store', dest='eta',
-          help='If zero then only classifier cost is considered.', default=0}
-op:option{'--batchSize', action='store', dest='batchSize',
-          help='Batch Size.', default=32}
-op:option{'--epochs', action='store', dest='epochs',
-          help='Number of epochs.', default=100}
-op:option{'--maxTries', action='store', dest='maxTries',
-          help='Number of tries for early stopping.', default=0}
-op:option{'--learningRate', action='store', dest='learningRate',
-          help='Learning rate.', default=0.002}
-op:option{'--learningRateDecay', action='store', dest='learningRateDecay',
-          help='Learning rate decay.', default=1e-7}
-op:option{'--linearDecay', action='store_true', dest='linearDecay',
-          help='Linearly reduce learning rate.', default=false}
-op:option{'--startEpoch', action='store', dest='startEpoch',
-          help='Epoch number when to start linear decay.', default=1}
-op:option{'--endLearningRate', action='store', dest='endLearningRate',
-          help='Learning rate at last epoch.', default=0.0}
-op:option{'--momentum', action='store', dest='momentum',
-          help='Learning momentum.', default=0}
-op:option{'--loss', action='store_true', dest='loss',
-          help='If true use loss for early stopping else confusion matrix.',
-          default=false}
-op:option{'--adam', action='store_true', dest='adam',
-          help='Use adaptive moment estimation optimizer.', default=false}
-
--- Use Cuda
-op:option{'--useCuda', action='store_true', dest='useCuda', help='Use GPU',
-          default=false}
-op:option{'--deviceId', action='store', dest='deviceId', help='GPU device Id',
-          default=2}
-
--- Print debug messages
-op:option{'--verbose', action='store_true', dest='verbose',
-          help='Print appropriate debug messages.', default=false}
-
--- Command line arguments
-opt = op:parse()
-op:summarize()
-
--- Data
-noValidation = opt.noValidation
-best = opt.best
-verbose = opt.verbose
-
--- Cuda
-useCuda
-deviceId = tonumber(opt.deviceId)
-
--- MNIST Data source
-ds = dp.Mnist{}
-
-attempts = tonumber(opt.attempts)
-testAccus = torch.zeros(attempts)
-trData = {}
-tvData = {}
-tsData = {}
-for attempt=1,attempts do
-
-   local t1, t2
-
-   trData.data, t1, t2 = ds:get('train', 'input', 'bchw', 'float')
-   trData.labels, t1, t2 = ds:get('train', 'target')
-   trData.size = function() return trData.data:size()[1] end
-
-   tvData.data, t1, t2 = ds:get('valid', 'input', 'bchw', 'float')
-   tvData.labels, t1, t2 = ds:get('valid', 'target')
-   tvData.size = function() return tvData.data:size()[1] end
-
-   tsData.data, t1, t2 = ds:get('test', 'input', 'bchw', 'float')
-   tsData.labels, t1, t2 = ds:get('test', 'target')
-   tsData.size = function() return tsData.data:size()[1] end
-   collectgarbage()
-
-   local tempSample = trData.data[1]
-   local channels = tempSample:size(1)
-   local width = tempSample:size(2)
-   local height = tempSample:size(3)
-   local linFeats = channels * height * width
-
-   -- MNIST
-   local classes = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '10'}
-   local confusion = optim.ConfusionMatrix(classes)
-
-   -- Model
-   local noOfClasses = tonumber(opt.noOfClasses)
-   local noiseSigma = tonumber(opt.noiseSigma)
-   local inputHiddens = dp.returnString(opt.hiddens)
-   local useBatchNorm = opt.useBatchNorm
-   local weightTied = opt.weightTied
-
-   hiddens = {linFeats}
-   for i=1,#inputHiddens do
-      hiddens[#hiddens+1] = inputHiddens[i]
-   end
-   hiddens[#hiddens+1] = noOfClasses
-
-   -- encoder input
-   local input = nil
-   if noiseSigma ~= 0 then
-      if verbose then print("Add noise to the samples.") end
-      input = nn.WhiteNoise(0, noiseSigma)()
-   else
-      input = nn.Identity()()
-   end
-
-   -- encoder model
-   local encoderLayers = {}
-   local Zs = {}
-   Zs[1] = input
-   local Hs = {}
-   Hs[1] = input
-   for i=2,#hiddens do
-      -- Zs
-      encoderLayers[i] = nn.Linear(hiddens[i-1], hiddens[i])
-      if useBatchNorm then
-         Zs[i] = nn.BatchNormalization(hiddens[i])(encoderLayers[i](Hs[i-1]))
-      else
-         Zs[i] = encoderLayers[i](Hs[i-1])
-      end
-
-      -- Hs
-      if i==#hiddens then
-         Hs[i] = nn.CMul(hiddens[i])(nn.Add(hiddens[i])(Zs[i]))
-      else
-         Hs[i] = nn.ReLU()(nn.CMul(hiddens[i])(nn.Add(hiddens[i])(Zs[i])))
-      end
-   end
-
-   -- classifier
-   local classifier = nn.LogSoftMax()(Hs[#Hs])
-
-   -- Decoder
-   local decoderLayers = {}
-   local Z_hats = {}
-   for i=#hiddens,1,-1 do
-
-      -- u = 0 hence no cij
-      if i==#hiddens then
-         z_hat1 = nn.CMul(hiddens[i])(Zs[i])
-         z_hat2 = nn.CMul(hiddens[i])(Zs[i])
-         z_hat3 = nn.CMul(hiddens[i])(Zs[i])
-         z_hat34 = nn.Add(hiddens[i])(z_hat3)
-         z_hatSigmoid34 = nn.Sigmoid()(z_hat34)
-         z_hat234 = nn.CMulTable()({z_hat2, z_hatSigmoid34})
-         z_hat5 = nn.CMul(hiddens[i])(Zs[i])
-         Z_hats[i] = nn.CAddTable()({z_hat1, z_hat234, z_hat5})
-      else
-         decoderLayers[i] = nn.Linear(hiddens[i+1], hiddens[i])
-         if weightTied then
-            if verbose then print("Tying encoder-decoder weights.") end
-            decoderLayers[i].weight:set(encoderLayers[i+1].weight:t())
-            decoderLayers[i].gradWeight:set(encoderLayers[i+1].gradWeight:t())
-         end
-
-         u = decoderLayers[i](Z_hats[i+1])
-
-         cu1 = nn.CMul(hiddens[i])(u)
-         du1 = nn.Add(hiddens[i])(u)
-         a1 = nn.CAddTable()({cu1, du1})
-         cu2 = nn.CMul(hiddens[i])(u)
-         du2 = nn.Add(hiddens[i])(u)
-         a2 = nn.CAddTable()({cu2, du2})
-         cu3 = nn.CMul(hiddens[i])(u)
-         du3 = nn.Add(hiddens[i])(u)
-         a3 = nn.CAddTable()({cu3, du3})
-         cu4 = nn.CMul(hiddens[i])(u)
-         du4 = nn.Add(hiddens[i])(u)
-         a4 = nn.CAddTable()({cu4, du4})
-         cu5 = nn.CMul(hiddens[i])(u)
-         du5 = nn.Add(hiddens[i])(u)
-         a5 = nn.CAddTable()({cu5, du5})
-
-         z_hat1 = nn.CMulTable()({a1, Zs[i]})
-         z_hat2 = nn.CMulTable()({a3, Zs[i]})
-         z_hat3 = nn.Sigmoid()(nn.CAddTable()({z_hat2, a4}))
-         z_hat4 = nn.CMulTable()({a2, z_hat3})
-         Z_hats[i] = nn.CAddTable()({z_hat1, z_hat4, a5})
-      end
-   end
-   local model = nn.gModule({input}, {classifier, Z_hats[1]--[[Decoder--]]})
-   if verbose then print(model) end
-
-   -- Criterion and learning
-   -- Criterion
-   local eta = tonumber(opt.eta)
-   local criterions = nn.ParallelCriterion()
-   local nll = nn.ClassNLLCriterion()
-   local mse = nn.MSECriterion()
-   criterions:add(nll)
-   criterions:add(mse, eta)
-
-   -- Learning
-   local batchSize = tonumber(opt.batchSize)
-   local epochs = tonumber(opt.epochs)
-   local maxTries = tonumber(opt.maxTries)
-   local learningRate = tonumber(opt.learningRate)
-   local learningRateDecay = tonumber(opt.learningRateDecay)
-   local linearDecay = opt.linearDecay
-   local startEpoch = tonumber(opt.startEpoch)
-   local endLearningRate = tonumber(opt.endLearningRate)
-   assert(epochs > startEpoch, "startEpoch should be smaller than epochs.")
-
-   if linearDecay then
-      if verbose then print("Using linear decay.") end
-      learningRates = torch.zeros(startEpoch):fill(learningRate)
-      local temp = torch.range(learningRate, endLearningRate,
-                               -learningRate/(epochs-startEpoch))
-      learningRates = torch.cat(learningRates, temp)
-   end
-
-   local momentum = tonumber(opt.momentum)
-   local loss = opt.loss
-   local adam = opt.adam
-
-   -- Optimizer
-   local optimState = {
-      coefL1 = 0,
-      coefL2 = 0,
-      learningRate = learningRate,
-      weightDecay = 0.0,
-      momentum = momentum,
-      learningRateDecay = learningRateDecay
-   }
-
-   -- If true use Adaptive moment estimation else SGD.
-   if adam then
-      if verbose then print("Using Adaptive moment estimation optimizer.") end
-      optimMethod = optim.adam
-   else
-      if verbose then print("Using Stochastic gradient descent optimizer.") end
-      optimMethod = optim.sgd
-   end
-   if verbose then
-      print(optimMethod)
-      print(optimState)
-   end
-
-   if useCuda then
-      if verbose then print("Using GPU: "..deviceId) end
-      cutorch.setDevice(deviceId)
-      if verbose then print("GPU set") end
-      model:cuda()
-      if verbose then print("Model copied to GPU.") end
-      criterions:cuda()
-      if verbose then print("Criterion copied to GPU.") end
-   else
-      if verbose then print("Not using GPU.") end
-   end
-
-   -- Retrieve parameters and gradients
-   parameters, gradParameters = model:getParameters()
-
-   -- Reshape samples from images to vectors
-   trData.data = trData.data:reshape(trData.size(1), linFeats)
-   tvData.data = tvData.data:reshape(tvData.size(1), linFeats)
-   tsData.data = tsData.data:reshape(tsData.size(1), linFeats)
-   collectgarbage()
-
-   if noValidation then
-      trData.data = torch.cat(trData.data, tvData.data, 1)
-      trData.labels = torch.cat(trData.labels, tvData.labels, 1)
-      tvData.data = nil
-      tvData.labels = nil
-      collectgarbage()
-   end
-
-   if verbose then
-      print(trData)
-      print(tvData)
-      print(tsData)
-   end
-
-   -- Training
-   local displayProgress = verbose
-   local classifierIndx = 1
-   local trainAccu = 0
-   local validAccu = 0
-   local bestTrainAccu = 0
-   local bestValidAccu = 0
-   local trainLoss = 0
-   local validLoss = 0
-   local bestTrainLoss = math.huge
-   local bestValidLoss = math.huge
-   local bestTrainModel = nn.Sequential()
-   local bestValidModel = nn.Sequential()
-   local earlyStopCount = 0
-   for i=1, epochs do
-      if linearDecay then
-         optimState.learningRate = learningRates[i]
-      end
-      -- Training
-      trainLoss = model_train_multi_criterion(model, criterions,
-                                              parameters, gradParameters, trData,
-                                              optimMethod, optimState, batchSize,
-                                              i, confusion, trainLogger,
-                                              useCuda, displayProgress,
-                                              classifierIndx)
-      confusion:updateValids()
-      if loss then
-         if verbose then
-            print("Current train loss: ".. trainLoss
-                  ..", best train loss: " .. bestTrainLoss)
-         end
-         if trainLoss < bestTrainLoss then
-            bestTrainLoss = trainLoss
-            bestTrainModel = model:clone()
-            print(confusion)
-         end
-      else -- Using classification accuracy for saving best train model
-         trainAccu = confusion.totalValid * 100
-         if bestTrainAccu < trainAccu then
-            bestTrainAccu = trainAccu
-            bestTrainModel = model:clone()
-            bestTrainLoss = trainLoss
-         end
-         if verbose then
-            print("Current train accu: ".. trainAccu
-                  ..", best train accu: " .. bestTrainAccu
-                  ..", best train loss: " .. bestTrainLoss)
-         end
-      end
-
-      -- Validating
-      if not noValidation then
-         validLoss = model_test_multi_criterion(model, criterions,
-                                                tvData, confusion,
-                                                useCuda, classifierIndx)
-         confusion:updateValids()
-         if loss then
-            if verbose then
-               print("Current valid loss: ".. validLoss
-                     ..", best valid loss: " .. bestValidLoss)
-            end
-            if validLoss < bestValidLoss then
-               earlyStopCount = 0
-               bestValidLoss = validLoss
-               bestValidModel = model:clone()
-               print(confusion)
-            else
-               earlyStopCount = earlyStopCount + 1
-            end
-         else
-            validAccu = confusion.totalValid * 100
-            if bestValidAccu < validAccu then
-               earlyStopCount = 0
-               bestValidAccu = validAccu
-               bestValidModel = model:clone()
-               bestValidLoss = validLoss
-            else
-               earlyStopCount = earlyStopCount + 1
-            end
-            if verbose then
-               print("Current valid accu: ".. validAccu
-                     ..", best valid accu: " .. bestValidAccu
-                     ..", best valid loss: " .. bestValidLoss)
-            end
-         end
-         if verbose then
-            print(noiseSigma, weightTied, useBatchNorm, eta, earlyStopCount)
-         end
-      end
-
-      if maxTries ~= 0 then
-         if earlyStopCount >= maxTries then
-            if verbose then print("Early stopping at epoch: " .. i) end
-            break
-         end
-      end
-   end
-
-   -- Testing
-   if best then
-      if noValidation then
-         testLoss = model_test_multi_criterion(bestTrainModel, criterions,
-                                               tsData, confusion,
-                                               useCuda, classifierIndx)
-      else
-         testLoss = model_test_multi_criterion(bestValidModel, criterions,
-                                               tsData, confusion,
-                                               useCuda, classifierIndx)
-      end
-   else
-      testLoss = model_test_multi_criterion(model, criterions,
-                                            tsData, confusion,
-                                            useCuda, classifierIndx)
-   end
-   confusion:updateValids()
-   testAccu = confusion.totalValid * 100
-   testAccus[attempt] = testAccu
-   if verbose then
-      print("Attempt: " .. tostring(attempt) .. " Test Accu: " .. testAccu)
-   end
-end
-print("Test accuracies.")
-print(testAccus)
-print("Min Test Error is: " .. tostring(100 - testAccus:max()) .. "%")
diff --git a/tutorials/ladder_network/ladder_help_funcs.lua b/tutorials/ladder_network/ladder_help_funcs.lua
deleted file mode 100644
index e6fe25e..0000000
--- a/tutorials/ladder_network/ladder_help_funcs.lua
+++ /dev/null
@@ -1,220 +0,0 @@
-require 'csvigo'
-require 'string'
-require 'xlua'
-require 'lfs'
-
--- Training function
--- Processes a batch in one go.
--- Has useCuda option to run on GPU [model and criterion expected in CUDA]
-local conTargets, conOutputs
-function model_train_multi_criterion(model, criterions, parameters,
-                                     gradParameters, trainData,
-                                     optimMethod, optimState, batchSize,
-                                     epoch, confusion, trainLogger,
-                                     useCuda, displayProgress, classifierIndx)
-
-   model:training()
-   confusion:zero()
-   local displayProgress = displayProgress or false
-   local classifierIndx = classifierIndx or 1
-
-   -- epoch tracker
-   local epoch = epoch or 1
-
-   local totalLoss = 0
-
-   -- shuffle at each epoch
-   local shuffle = torch.randperm(trainData.size())
-
-   local sampleSize = trainData.data[1]:size()
-   local isScalar = false
-   local labelSize
-   if trainData.labels:size():size() == 1 then
-      isScalar = true
-   else
-      labelSize = trainData.labels[1]:size()
-   end
-
-   print("Doing epoch on training data:")
-   print("Online epoch # " .. epoch .. " [batchSize = " .. batchSize .. "]")
-
-   -- local variables
-   local time = sys.clock()
-   local inputs
-   local targets
-   if isScalar then
-      targets = torch.Tensor(batchSize)
-   else
-      targets = torch.Tensor(batchSize, labelSize[1])
-   end
-
-   -- Samples
-   sizeLen = sampleSize:size()
-   if sizeLen == 1 then
-      inputs = torch.Tensor(batchSize, sampleSize[1])
-   elseif sizeLen == 2 then
-      inputs = torch.Tensor(batchSize, sampleSize[1], sampleSize[2])
-   elseif sizeLen == 3 then
-      inputs = torch.Tensor(batchSize, sampleSize[1], sampleSize[2],
-                            sampleSize[3])
-   else
-      print("Invalid Sample Size")
-   end
-
-   local trainInputs = useCuda and torch.CudaTensor() or torch.FloatTensor()
-   local trainTargets = useCuda and torch.CudaTensor() or torch.FloatTensor()
-   local criterionTargets
-
-   t = 1
-   while t <= trainData.size() do
-      if displayProgress then xlua.progress(t, trainData.size()) end
-      noOfSamples = math.min(t + batchSize -1, trainData.size())
-      -- create mini batch
-      indx = 1
-      for i=t,math.min(t+batchSize-1, trainData.size()) do
-         -- Load new sample
-         inputs[indx] = trainData.data[shuffle[i]]
-         targets[indx] = trainData.labels[shuffle[i]]
-         indx = indx + 1
-      end
-      indx = indx - 1
-
-      local inputs_ = inputs[{{1,indx}}]
-      trainInputs:resize(inputs_:size()):copy(inputs_)
-
-      local targets_ = targets[{{1,indx}}]
-      trainTargets:resize(targets_:size()):copy(targets_)
-
-      criterionTargets = {trainTargets, trainInputs}
-
-      t = t + batchSize
-
-      -- create closure to evaluate F(X) and df/dX
-      local feval = function(x)
-         -- Get new parameters
-         if x ~= parameters then
-            parameters:copy(x)
-         end
-
-         -- reset gradients
-         gradParameters:zero()
-
-         -- evaluate function for complete mini batch
-         local outputs = model:forward(trainInputs)
-         local f = criterions:forward(outputs, criterionTargets)
-         -- Total Loss
-         totalLoss = totalLoss + f
-
-         local df_do = criterions:backward(outputs, criterionTargets)
-         model:backward(trainInputs, df_do)
-
-         if useCuda then
-            conOutputs = outputs[classifierIndx]:float()
-            conTargets = trainTargets:float()
-         else
-            conOutputs = outputs[classifierIndx]
-            conTargets = trainTargets
-         end
-
-         confusion:batchAdd(conOutputs, conTargets)
-
-         -- Normalize gradients
-         gradParameters:div(trainInputs:size()[1])
-         f = f/trainInputs:size()[1]
-
-         -- L1/L2 Regularization
-         if optimState.coefL1 ~= 0 or optimState.coefL2 ~= 0 then
-            -- locals
-            local norm, sign = torch.norm, torch.sign
-
-            -- Update loss with regularizer
-            f = f + optimState.coefL1 * norm(parameters, 1)
-            f = f + optimState.coefL2 * norm(parameters, 2)^2/2
-
-            -- Gradients
-            gradParameters:add(sign(parameters):mul(optimState.coefL1)
-                               + parameters:clone():mul(optimState.coefL2))
-         end
-
-         -- return f and df/dX
-         return f, gradParameters
-      end
-
-      -- optimize on current mini batch using SGD/adam
-      optimMethod(feval, parameters, optimState)
-   end
-
-   -- time taken
-   time = sys.clock() - time
-   time = time/trainData.size()
-   print("\n==> time to learn 1 sample = " .. (time*1000) .. "ms")
-
-   -- Total loss
-   totalLoss = totalLoss/trainData.size()
-
-   -- update logger
-   if trainLogger ~= nil then
-      trainLogger:add{["% mean class accuracy (train set)"] =
-                      confusion.totalValid * 100}
-   end
-   return totalLoss
-end
-
-function model_test_multi_criterion(model, criterions, testData, confusion,
-                                    useCuda, classifierIndx)
-   local time = sys.clock()
-   model:evaluate()
-   confusion:zero()
-   local classifierIndx = classifierIndx or 1
-   local totalLoss = 0
-   local criterionTargets
-
-   if useCuda then
-      local batchSize = 64
-      local inputs = torch.CudaTensor()
-      local testInputs
-      local cpu_targets
-      local gpu_targets = torch.CudaTensor()
-      local gpu_preds
-      local cpu_preds
-      local i = 1
-      local j = 0
-      while i <= testData.size() do
-         j = math.min(i + batchSize -1, testData.size())
-         -- Copy input and targets to cuda
-         testInputs = testData.data[{{i, j}}]
-         inputs:resize(testInputs:size()):copy(testInputs)
-         cpu_targets = testData.labels[{{i, j}}]
-         gpu_targets:resize(cpu_targets:size()):copy(cpu_targets)
-         criterionTargets = {gpu_targets, inputs}
-
-         gpu_preds = model:forward(inputs)
-         totalLoss = totalLoss + criterions:forward(gpu_preds,
-                                                    criterionTargets)
-         cpu_preds = gpu_preds[classifierIndx]:float()
-         confusion:batchAdd(cpu_preds, cpu_targets)
-         i = i + batchSize
-      end
-   else
-      local trainInputs = testData.data
-      local trainTargets = testData.labels
-      criterionTargets = {trainTargets, trainInputs}
-
-      local outputs = model:forward(trainInputs)
-      totalLoss = criterions:forward(outputs, criterionTargets)
-
-      local conOutputs = outputs[classifierIndx]
-      local conTargets = trainTargets
-      confusion:batchAdd(conOutputs, conTargets)
-   end
-
-   -- time taken
-   time = sys.clock() - time
-   time = time/testData.size()
-   print("\n==> time to test 1 sample = " .. (time*1000) .. "ms")
-
-   -- Total loss
-   totalLoss = totalLoss/testData.size()
-
-   return totalLoss
-end
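Note: both deleted helpers pair the model's two outputs {classPredictions, reconstruction} with the criterion targets {labels, inputs}, scored by the nn.ParallelCriterion built in ladder.lua (NLL plus eta-weighted MSE). A minimal sketch of that pairing; the sizes, the 0.1 weight standing in for eta, and the random tensors standing in for real model outputs are illustrative only:

    require 'nn'

    -- one criterion per model output: NLL for the classifier head,
    -- MSE (weighted by eta) for the reconstruction head
    local criterions = nn.ParallelCriterion()
    criterions:add(nn.ClassNLLCriterion())
    criterions:add(nn.MSECriterion(), 0.1) -- 0.1 stands in for eta

    local inputs = torch.randn(32, 784)               -- batch of flattened images
    local labels = torch.LongTensor(32):random(1, 10) -- class targets

    -- stand-in for model:forward(inputs), which returns {log-probs, reconstruction}
    local outputs = {nn.LogSoftMax():forward(torch.randn(32, 10)), torch.randn(32, 784)}

    local loss = criterions:forward(outputs, {labels, inputs})
    local gradOutputs = criterions:backward(outputs, {labels, inputs})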
diff --git a/tutorials/lena.jpg b/tutorials/lena.jpg
deleted file mode 100644
index 9181d48eb6490d195793ba29427a91f8cbb65e20..0000000000000000000000000000000000000000
Binary files a/tutorials/lena.jpg and /dev/null differ
diff --git a/tutorials/srd1.jpg b/tutorials/srd1.jpg
deleted file mode 100644
index 76971ffb23ba037234e739bbb3d97143a457b631..0000000000000000000000000000000000000000
Binary files a/tutorials/srd1.jpg and /dev/null differ
z!e21=0wP_x(%wPUqYojGOF%c^u3ij4s^5>@56otk+v7ATP+7*2b@!`Qq(5P z_h_z}ywh=Xiz2oqZvhb<(V=q3X)d_j@pfaj?g?b~igvTsQ+tXxJ=BUXqPRe%hnPg> zLNG12uA5^>>&8;x6LA#tR$so^+Q2-n4hs*xpm>?a4`2pkVMkId8X>S!F3(>UPxlue zBrhw5%O^efXzwIplX9Of`wi_$$P)keB`KB-GbnSQHp4MQVq{wRwzv~#DlbCf%1#2S zY-#aXK`if;TZvyAkuxF8=OTIG$gwbgdDB)Jo1L`?22DePN}+Rp?9zT(n#nv2-zmf^vh*o} zW)7_pf54aUitQoaine=Q7LXLal6(Q^Pvx14OiohQ@@8GdyO&n0_wNtGo^YAE|1c4B zyort8X3Qsm&oc<1rI2aeWzgha?r(u@TjpT`j>57OlDgB^f>8M+sW0P@wwEuI`!OYfr7rLbTAZspemhl&C8ih+Z zFAA1)N=pax5%gT;{Ns;hv=@CPEz&iaT6!klCKXzVQ^<&60443~70Laa#;E%9~82?K| zAmM_7aFbg&*sklVb0fc5=FjmxRgJ`mbhEYeA+?Hs4s^ zC_c6O@hngtQ=8tg7xl?3@~ers_n*qQoPVExF&QLKGWL&}9k_7UmBDg1Ojmbd*L4G4 znjkVJ7RXMHkcr)9=Ka>2s0{&&h}>_=K`IYj?2q9sSjqHDy@{II&qk{vG=m&5C>#vq z6HefB$({36nKo^1eQ$vJUUzv||K4nTBC8$XPbbzENIH6!?=gwXWg`1g$c+7Y0;}zx z(~!yF^{-OG-D?6j$=yWWN&Z$^IkiI!qtw*oYztF7yT-gMxV46SY_giNpC3~zFtM~L zU8awCzgjmE?|2Blpu4J8?R6$?J>}~!7L961;gon@9aA%+vt23_FKOPy5?ayqXgw`tCt#NGwmkHNU!v#=x84<&IM~B_GM*kT`8{ z48A~5^vEgr)jVd#X$e6?8kW%EmVU2~U8}Xr_RmP{rK>WTcy$k$)%LiA0Zo6$4%o!* zX~8>=9=bFPMR3_x+f%&{1JzasOLFp=ZDt&~u#|~ahEefRQB`x4; zHPY)Rkmn!WIZwTTb6rHT&wX{kF$fzalwJD^kOLqkRmZ!n<=|*;^fneV-ma1kuNh>) zhX?rw=zbPcdd#tJmG_Wy>Rseyc=l7}1?Glbm%(7JOpV5>qpgX!=k$cN_9Pm~)?AO4S47 z%PV*Btwd>a7CwHE>A>~0z@~S<+;cN!dM!}u%#vUPc?TA#xc2S4MB&SEuzj8It_7df zalCZf-&N2MNOipIqQjPpB={nW)ECRv`+wtg(FtE^LKg$KB#I}uDU)yVDceD%@r?Sq zS#RbPK2znTX1<@cJ3@X%Eo7_gCND?`Epf!iPgk;1oUt}-&!+4Ip!w&&;E*V^k@@3N zlM+u;?Im9ZGh7txC~n?a52aZ@TQn17U)Dq6XXwsp^X2sEQ${x`PiPe)dKC-Z;m9$k z*IVN(Y*Zf=yS2dY4_G#j1>f+!V-q&=c)Ywoq$>Gj!hQXxCcKQ&)WG>r=3ZDe@;`;5>mCG&l`v%an>}q`6UR6_? z%z-H1<&s^0LrWi9FlG%QmF!cjNxyuD4!)%snA@>P1Tq9X6%?zy+K8so3&3wm;eCP9weMyUH`js0QPx9m&T56O)vtosXv;P*Kwl;?%cUo!G zz@IryoxGqGO0n7luIFVnc9C1q3TVB5U&+PKY(XoFCmJt}>dxEz_Br97L76)gGw~ih z1b@t2rN`o|n}iz$?e}oKp*_{Ykv@j_o68ZI@heXk;G@mpRE(_aj?fVHzMFD3pAUS~ zpD|2L)Oo9@BX{z?$Ub5l2Y-hfDcRS{+u2dOM>|8^I*u3Lkr~Ir6tX(YrtFqcGCKOFy$DAQq-aXPf z;Kz5zlkZXxJ*hx(wE1{jDMs^R>O_H}*3q!05fWLaO} zBPAxdiHA1{KOp_rPHq*Z{T;N7tX=)_Q#*(zQC)SYueKw!PiU8mG78Af6N)2AkV#aB z;;Cb7v(!?Xw2(=|_gc~XT^di^@#x#gPp#4N3)2xVa9V?f1fEM^X6lL;s zxi;7oYwRWEas0{_s$a<0t}}Js7_yKIrA0X-qdui~!9(XRf5Xgoo{4ceRt4Rntg9bS zqzwp&8K+%qm|+j*l(t#DH~a|h0KEEweL(uv{>e4bvH))uCZ5oC5zW;(}=)$MNx@mjBUPu{3Wt8OAXqXg!*rQL?)`e)-~ByylwA{Eij{Ve1on>4?G_b ze!>HPmfgT9VDJ)I(($ZhxTBrFU9wlKwnYAxp|N+l$c6-CQ;3_NrPM@+K_+0X;cz(s zPC8Vowd5tf9bc|7V}5L(qLK0|O|5zP#8A_WETpHR2tU69m+hQC;UcL?(592qx7}t7 zb~BCD_aCQWf=5eD^RdC=V_tiarX1X*9TNR4mTkD|D~#7pAhn+bBv+XtX8@6hmzpT3 z*iI$D3y%qZ5LiP;+_P`hmE4Gg$i_au!l_5NTsK1haH5UzeSxJj{(Fbi|T=;TT`c65!K#CJ`-o{`}zgUGL)R3D%?+ zFMzvBl47dmWuS0 z+JQvzODoX8c6Vw6=reQ%Gbo==nU9{jItdGL5&`qM8bhd02NBqPNidEbwpgyuz#J1; zAZ=3*LNEphG8^qR24~qd8IzprQ*N)-SG+)#ZM>nH5WHv+ngle2GsKAlZ?%+Va}7fT zBBT(I-*Qc?(r7wbg$y3y^rTRke#kdgL(9>(H@ZB(r|#&XI`@ko>57y&;^4GnM*-_z jH@_~XzcAffy&tFagb#E?c$ph_8dG4`;gNL=CQkk@ofMo~ diff --git a/tutorials/srd2.jpg b/tutorials/srd2.jpg deleted file mode 100644 index ffacaffbe5eb458e982373e08fc8e4b3700deac3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6478 zcmbW4cQhRD+y95fM(?6;5=5d#iyA8tL_|c1l8ClS5QIbxHi#0$hA2^DKOs6b#-=?wOIC%&-a|)@1N(7=XtI<_jP8@Tr=nWntSeR-a{TE&jBp=wRN=t3JL(A zINt#BB%ldUQv8j-neuN?f&OM{Dk=~a4K)qTzXMErfd)(qrlGk&e}R_nZ=CNj(9<#e zJ@|W*|6EE*4FXZqfoZ`1mi(_0xe;K!0N4ZeAc`vhB`XDpm4b``Am=)%|CM(x_CG;E z38JE=0iVmDKQE|cIhPLtor|VESA1R_bbcP7Vx?xgEOVQN-Ov_%#gjueB>w9Kp*ugC zIF0&o!g6+Ap|o^d7rA+OMXrioyDlcLa8pr9Sw-`%mbT74-TTHSk4??YEuO&a9UPsW zJG;Dk?d{|1=N}OEE<7SK>V0&=r^KY>l+USYS=rxma`W;F3d_nXDyyo0*3>q)U|QSS zJ34<43=R#CjE;>@%r7i1En`MKeJ&CM~mp82Q;a_~GpY@tv_s 
z@o(=a`6R~qeNepyL?KR0@)n*U8_xCdUS}s6Kuny3!ucN_8a-J%WDQCcdaS6*QMH-S zsz<3!@5bkYNMs#B-&l~4E&4LFSACX%SQDhgXh+i8PM)28IXx11X~6FCSs!AMW+70@ ztE4+M4ia2R8>%Hb;PdW(@c2dyD(Ga@XxZ$4|2Zt~j#GPp&Z z{xnD4;pfQyVZVa;R`UcgPMCEnkNZ*oK`IT!#2mb=BiMyBd-@}oCeY|K{!9cB0bzHH z@Y|6X+0YzygLyg^G-}MiBx6;S$5w+Kl+=o{6E)21VGsOgyV_XxLS`Wk;9~o(AAGLE{Z(VdKKimw;!%R9nJ>N$H?UpJ;$!CH zGAepNbjGJ>5fQ~M4Hxy$(uRH8Y2rmeQ)RI>Zvzhg1Y$)Q#VQMRrNh^QtRU>%{W-Mq zHOY|!&~oPX6KJ?KXLiLfmhr30SjBMm$IxhI`pe+H42Yd;BN>>>CIjWu$W2$g*9xZA z{t9#CnFw?K^OQ-r?{hu(gh`>=WNNTcZ1$I2RE(cv%~dlVis~8=K+!Au7Qi;#Ap@aN zWT5Gu`lYiWLGFGs;HVFMtQ*KYS2RF4bV<({+D=rnGYx*lDSU?7&^+2^b*Wd^~P^`jKZw~?AXB+nB!l72m&p#-=Y((;x)s{a;q zLVi|UQZ7xkBj-9vS3BF-3BzF3p zhQ?u4I?pgmvH0-0-B)eOsJQbd^--On9`E$FK|;?-`lezgB(cp=jC*V7?V^xqK4B_o zV{<2*0Df9-d~3X_BFfHnM9YvZt}v4fXxyOjg2wE{Zo63Iyv{6WCj0(oJPVf)@huv*a`f)+uxG#h+gos5~>{SWKF+{W-5_1 zYIasoB@T)g>Tz|1RQMaFI??%FNWp@zZuvZ;2S{_GrvpBCyTzJ_ za^>rCqSq3rV`yE9Di!B=OYQr}mxcBby`ltn*jW8x$QdO`_2*EYCh|Qvch5T@la_VqF(!mN)GCM-JLY1JWr1oHb?`u$*!9;0*O23B1TOOw?jVt zQaDoJ=d<61Uk+pPMm=t(v`%L)_rX zkv-xa4cRIww*8s@0&HmfO5|5(M-OFpv`TU!aNEXpv_Ue^7BP4BxolktX^4)q7JnGM zRD|Jtik)n{L~&LU#s33HBTdRSJw{<3A3Z**f$t}xH4l4Ox?vIXyz1k-ZTtie$Bgv8 zojPayPRs0tO^D&Jcb|05&(d^T+BGZ2k(kFBo zOlw;wALQz?yhz{>mX&4vQ`ezk#Jx*_0^<=FC>wFb^yd)C(ChFDfpOlXl)i1jSUAI; zwOZ-Oa}xRR^b6FX9!~f~OwV&Z6C7rnd{7kycVCa9z3Tw0G!&4f*Y0N{ipjOBnw&ml zmugxz*D)>mQjK2n<0g@T8$B}7aa?QGdLHW}ggcR90Y2GIC?Y07TF(0cnsxidKGA`` zU>#`_#pj3P`R(5Ik}m$T9nJDl+cJ26bgn&8O&OQ1S=RR*B{Fo7KOe37#dH;`7hytY zZ^x;<;7gj+_6;oJz9O?YOR3KK99hmBasUO0##_4iB?IF*8xQ^{VD8*n+UlWl#kv`O zseuiLDC|R;khJ6LOvjmZpq|bOP4R8>p+yu#=1`eOjp<-7HB*|=jHgpyL@?QOGl&KR zii@BgF1dPqbNXyCoH$JesF{g8C$cVvMALO;Z#3uh-D=veYmM&FH`j#slqPv`UNkF> zqR#xI-O$SFt&4b9OeT#%)NJKp+b!vqU~t^r)bkl?9MwB3SFe!p5XY1ipUaKDJa>)8 z>3@Qtq>Hoj$wj#J!6sw{bLf(cn%8N>uK5sUBg<;BwyZiq{qvgj7f9vyuL8eciw^>F z+VZ?T9YiDLYbx?rzGOUYl1wyyVYv_ddI_|7zPHAw6fk1yC^eM zO6RqVbrZcfajbfj>kt?~oLoRpxGoeDdrkV?_7CEECiVMXGs^I79(yMW+28DU`m`tG z2xDLMb7!|F7wHVsU$vPGcaF=d=5NDNTpn`3wv$9T9=jq0h`~D3o)S^S7UAT&XmM;% zEIJ7ZJ4l1~Ah5e3qiNSrW4&0-^;x1Qw3XfIuF}8z z%Tnn>iv~k{3PQ{hMzx5rAVz-oi@_W}9>pM8TW$?@y(aWLpy$;V(Gl&ss|wR%ST9sx zv(urXI~pXY3@dyQZ&JS*B&9g6c5JW6_P**JgH*dX$zsW=Du90}NE|mJV76c#p3|px zKUpO>RDL6M`#`65LbH|c-n-?t!_#pOX-1(V2be&S)B`~#BaF$69=v5=y&PoC6Wci0 z)G}{2n7g%cm|-0js9NCH-WK5djs$VLMJRy#FPXP4^KjjLA-lv-R_m?Ac=)Q{3vNzQ zoSUURYetrfphD{y=YD$c2)b0goYMMM7me_)DCdhw?(L-4tv7M7Xghcml9AU2DOLZ3 zBLf5{9OOM2aFdROI5os*z<>DZ>Ftd%x)96#Wi;axau#Ll-nz#r`Lbx*oPth-Nyd0Z zOvxR1G?INMnG8G|l@`>0qN8!{+$CJ<-hE)9(&87(7XU*zF&42$Qb`M6>e8Wla~a9p z(RF1>c_O~k;*$o(pKys}psIaKIM4tuhZ(J<`B1W2MHo%S>-tNc|7_jd{o@MKx|DF! zWUA3$pPNp_z4Q2lU{73IjA{V;mA4%aXXc@v5%pTI^KMVH6wTuJCGKs8Q6*oJ3u)|Z zDuJ?D6X_q~f_i@Bu2q#<^1Z^!wiUJjP9MQX%7u{9s`}hnzw~x1uF6POmrRD*oA~<_ zW-{6i#=R%+%)08HF$7;tGc>e%_AK@AVE6a9qnVZEw)KME-7hoI)69Q7Ceefsr+L9r z&dDiI0g}F(d_e4>tj@fjltqdbYeUD_yN|g~B|x&`xHl{Yi6posAG8iQzFi46b$(XL;iYH; ztG5kQz*VNS>_nuQg%_Jh_>BB`$Gu|rk4Z0yin)8pY|oXqs>ma2&2)L&Trl?z)PhtcE)Mx*bbG2X$3jq&;$>hZzxH? 
zHEYICWzw{+@zW!8O%>C?LyhU!I1U&PKquZ5s1p)fvcoDNpMmN|qf$wq`n5Pu)<|&hD$Q;c%A z*;D(Yuvk$Gt^IPlhA~sg#)2+B>=#JMj&FT-QjrjX-$J#xiO!|1mOo)v_|s60(!Amy z5PB4|+{)G_C1K8PB0mH zosM8?^|gIjz&j=54cKt0{#=Hwx#YI`No&EldGzyc^YEh=wQJCl?~piYR02G*gvpj>@7iS}pfWp(+@E^oH^Ksr*seh^lEwyqnUS;>6U9<~K$F+a7@LxoRHO}W9gzL7O zZU&<>7H@P>i)|Yux#pQ9&&tiiCq@?%0xgKAo}A&GZgSc+Cz3xzy*k!s>ulqgq1V)- zl(F^J1YTS(NkUSEnbBI?F|yf8&CTX6n102=O}p1d`2D@LF5V+VV}O_CMl!x)Op?EN zDRUB166A3H=HW+QBPg69EOocPPe=U&8Oxb<+)QzuQ6ijzD}&Bd5}F;1a`dvwpzMuy z=n6S_vFVAC+p%QBmipp`q?HEW@0h6OE$|=+c=J*`>ok!Bm5wF@&+F+LpLpD8RTyCU zYGDTK^-&IEi}0#V6PrwFj`eQWB@+=nEeOr9KX0r_Y`^8cpA843M18l1j2}b7o%T9U zvMd}wz(BWpM#(^f>w>4ea4Dy9Mc?rYQy4GpAD00iDD&s~@NQD98{oom)SV%nRW>UvdiZMj7d#&DJ;Y##1)S3`}ag_{=fiS*; z0x7NZx#MkzW(nq#M`hQJF4?}q7XA=&c4$5+u-yzE9a7`V4t$7{O3e<+<$J1cY=Y3A zei_?+9J6K4;YOYM80Pj?h|?$bWy0^WKh<1%pE_cG-A^&MvKBo|N7r7hN>i?Rc)0Q#J$$0mw(xep*Kkm17F~V+xSHUL=R`#8 zzZ-mgV0o`G*}d3cAC)>EvC#Q7$p`mS#K5|n3}l$)k*?z9llR&o>HQHVHj98e!Q3*>mi{?{t3z!hDHm3nGOw%5e7W{Mn#v9%p2;g#JmC> zaJZU2EdL{Y&qqtf?(BpM8y%=JGT?47u`t_$F0~XCPhxrxjnVi~Hn$IG22^REOu-+Q0e*U5)Iq>k)p;hp^53 z$qjk*997U_TQJblEZ8j7DPCEiAZBRnQzEh^$=nrvEeov)1#Te-dUp$0L34YT2fwar<_6Sw-&5CepCYToeq2X((h zd)N>^EIJL~;^r2v9)^lfc zCy?43@>wRo_j9MwyL9K8CUE?aU^|r;1t)Ya3kf+BA=#dOv!3^e_I@4Ha>mdQ1l1VW zeALc=O?bloMS|5Gpi4%*(NN3Phy}?xfHkYUNe0+|f4we|T!NfX6Ez#&`8t$a9`P*x cDLRSn+PukY$S6K5T73OV5`wYEcZxjzU$iZ--v9sr diff --git a/utils.lua b/utils.lua index 3a1be38..4c8eac1 100644 --- a/utils.lua +++ b/utils.lua @@ -282,7 +282,7 @@ function nn.utils.recursiveMaskedCopy(dst, mask, src) end function nn.utils.setZeroMask(modules, zeroMask, cuda) - if cuda then + if cuda and not torch.isCudaTensor(zeroMask) then cuZeroMask = torch.getBuffer('setZeroMask', 'cuZeroMask', 'torch.CudaByteTensor') cuZeroMask:resize(zeroMask:size()):copy(zeroMask) zeroMask = cuZeroMask