diff --git a/ArgMax.lua b/ArgMax.lua
new file mode 100644
index 0000000..a715b49
--- /dev/null
+++ b/ArgMax.lua
@@ -0,0 +1,57 @@
+------------------------------------------------------------------------
+--[[ ArgMax ]]--
+-- Returns the index of the maxima for dimension dim.
+-- Cannot backpropagate through this module.
+-- Created for use with ReinforceCategorical.
+------------------------------------------------------------------------
+local ArgMax, parent = torch.class("nn.ArgMax", "nn.Module")
+
+-- dim: axis to reduce (default 1); nInputDim: inputs with more dims than this
+-- get dim shifted by one (default 9999); asLong: LongTensor output unless false.
+function ArgMax:__init(dim, nInputDim, asLong)
+   parent.__init(self)
+   self.dim, self.nInputDim = dim or 1, nInputDim or 9999
+   if asLong == nil then asLong = true end
+   self.asLong = asLong
+   if asLong then self.output = torch.LongTensor() end
+end
+
+-- Writes into self.output the argmax indices of input along the target axis.
+function ArgMax:updateOutput(input)
+   local cuda = torch.type(input) == 'torch.CudaTensor'
+   self._value = self._value or input.new()
+   self._indices = self._indices or (cuda and (torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor())
+      or torch.LongTensor())
+   -- inputs with more than nInputDim dims have the axis shifted by one
+   local dim = self.dim
+   if input:dim() > self.nInputDim then dim = dim + 1 end
+   torch.max(self._value, self._indices, input, dim)
+   -- for non-1D inputs, drop the reduced axis (select its first slice)
+   local idx = (input:dim() > 1) and self._indices:select(dim, 1) or self._indices
+   self.output:resize(idx:size()):copy(idx)
+   return self.output
+end
+
+-- Indices are not differentiable: gradInput is zeros shaped like input.
+function ArgMax:updateGradInput(input, gradOutput)
+   local zeros = self.gradInput:resizeAs(input)
+   return zeros:zero()
+end
+
+-- Casts the module; self._indices is kept as a LongTensor for non-CUDA types
+-- and self.output is reset to a LongTensor when asLong is set.
+function ArgMax:type(type)
+   if type ~= 'torch.CudaTensor' then
+      -- torch.max wants LongTensor indices: hide the buffer from the generic
+      -- cast (avoids a useless allocation), then restore it as a LongTensor.
+      local saved = self._indices
+      self._indices = nil
+      parent.type(self, type)
+      if saved then self._indices = saved:long() end
+   else
+      -- cutorch.max expects a CudaTensor, so the generic cast is enough
+      parent.type(self, type)
+   end
+   if self.asLong then self.output = torch.LongTensor() end
+   return self
+end
diff --git a/BatchNormalization.lua b/BatchNormalization.lua
new file mode 100644
index 0000000..2ffad94
--- /dev/null
+++ b/BatchNormalization.lua
@@ -0,0 +1,18 @@
+local _ = require 'moses'
+local BN, parent = nn.BatchNormalization, nn.Module
+
+-- Copy nn.Module's dpnn_mediumEmpty list and append this module's fields.
+local empty = _.clone(parent.dpnn_mediumEmpty)
+local buffers = {
+   'buffer', 'buffer2', 'centered', 'std', 'normalized', 'output', 'gradInput',
+}
+for i, name in ipairs(buffers) do
+   empty[#empty + 1] = name
+end
+BN.dpnn_mediumEmpty = empty
+
+-- for sharedClone: the running statistics are listed as parameters too
+local params = _.clone(parent.dpnn_parameters)
+params[#params + 1] = 'running_mean'
+params[#params + 1] = 'running_var'
+BN.dpnn_parameters = params
diff --git a/BinaryClassReward.lua b/BinaryClassReward.lua
new file mode 100644
index 0000000..505e868
--- /dev/null
+++ b/BinaryClassReward.lua
@@ -0,0 +1,82 @@
+------------------------------------------------------------------------
+--[[ BinaryClassReward ]]--
+-- Variance reduced binary classification reinforcement criterion.
+-- The binary class version of VRClassReward.
+-- input : {class prediction, baseline reward}
+-- Reward is 1 for success, Reward is 0 otherwise.
+-- reward = scale*(Reward - baseline) where baseline is 2nd input element
+-- Note : for RNNs with R = 1 for last step in sequence, encapsulate it
+-- in nn.ModuleCriterion(BinaryClassReward, nn.SelectTable(-1))
+------------------------------------------------------------------------
+local BinaryClassReward, parent = torch.class("nn.BinaryClassReward", "nn.Criterion")
+
+-- module: receives module:reinforce(reward); scale: reward scale (default 1);
+-- criterion: baseline criterion (default nn.MSECriterion()).
+function BinaryClassReward:__init(module, scale, criterion)
+   parent.__init(self)
+   self.module, self.scale = module, scale or 1
+   self.criterion = criterion or nn.MSECriterion()
+   self.sizeAverage, self.gradInput = true, {torch.Tensor()}
+end
+
+-- input: {class prediction (1D), baseline}; target: 1D tensor.
+-- Returns -sum(reward), divided by the batch size when sizeAverage is set.
+function BinaryClassReward:updateOutput(input, target)
+   assert(torch.type(input) == 'table')
+   local pred = input[1]
+   assert(pred:dim() == 1)
+   assert(target:dim() == 1)
+
+   -- threshold the prediction at 0.5 to get the predicted class
+   self._binary = self._binary or pred.new()
+   self._binary:gt(pred, 0.5)
+
+   -- bring the target to the same tensor type before comparing
+   if torch.type(target) ~= torch.type(self._binary) then
+      self._target = self._target or self._binary.new()
+      target = self._target:resize(target:size()):copy(target)
+   end
+
+   -- reward is scale where prediction matches target, 0 elsewhere
+   self._reward = self._reward or pred.new()
+   self._reward:eq(self._binary, target)
+   self.reward = self.reward or pred.new()
+   self.reward:resize(self._reward:size(1)):copy(self._reward):mul(self.scale)
+
+   -- loss = -sum(reward)
+   local loss = -self.reward:sum()
+   self.output = self.sizeAverage and loss/pred:size(1) or loss
+   return self.output
+end
+
+-- Broadcasts the baseline-subtracted reward to self.module and returns
+-- {zeros for the class prediction, baseline criterion gradient}.
+function BinaryClassReward:updateGradInput(inputTable, target)
+   local input, baseline = inputTable[1], inputTable[2]
+
+   -- variance-reduced reward = reward - baseline
+   self.vrReward = self.vrReward or self.reward.new()
+   self.vrReward:resizeAs(self.reward):copy(self.reward):add(-1, baseline)
+   if self.sizeAverage then
+      self.vrReward:div(input:size(1))
+   end
+   self.module:reinforce(self.vrReward) -- broadcast reward to modules
+
+   local gradInput = self.gradInput
+   -- no gradient flows to the class prediction
+   gradInput[1]:resizeAs(input):zero()
+   -- the baseline is trained to predict the reward
+   gradInput[2] = self.criterion:backward(baseline, self.reward)
+
+   return gradInput
+end
+
+-- Casts the criterion but not self.module; cached _binary/_target are dropped.
+function BinaryClassReward:type(type)
+   self._binary, self._target = nil, nil
+   local module = self.module
+   self.module = nil -- detached so parent.type does not see it
+   local ret = parent.type(self, type)
+   self.module = module
+   return ret
+end
diff --git a/BinaryLogisticRegression.lua b/BinaryLogisticRegression.lua
new file mode 100644
index 0000000..02ccaab
--- /dev/null
+++ b/BinaryLogisticRegression.lua
@@ -0,0 +1,91 @@
+------------------------------------------------------------------------
+--[[ BinaryLogisticRegression ]]--
+-- Takes an image of size batchSize x 1 or  just batchSize as input.
+-- Computes Binary Logistic Regression Cost.
+-- Useful for 2 class classification.
+------------------------------------------------------------------------
+
+local BinaryLogisticRegression, parent = torch.class('nn.BinaryLogisticRegression', 'nn.Criterion')
+
+-- sizeAverage: when true (the default, used if nil is passed), the loss and
+-- gradient are divided by the number of input elements.
+function BinaryLogisticRegression:__init(sizeAverage)
+   parent.__init(self)
+   -- an explicit false must survive, so `or true` cannot be used here
+   if sizeAverage == nil then sizeAverage = true end
+   self.sizeAverage = sizeAverage
+end
+
+-- Computes the sum (or mean) over the batch of log(1 + exp(-target * input)).
+-- input, target: tensors of size batchSize or batchSize x 1 (same nElement).
+-- Caches exp(-target*input) and 1 + exp(-target*input) for updateGradInput.
+-- Returns the loss, which is also stored in self.output.
+function BinaryLogisticRegression:updateOutput(input, target)
+   local inputDim = input:nDimension()
+   local targetDim = target:nDimension()
+
+   -- Check dimensions of input and target
+   assert(inputDim == 1 or inputDim == 2,
+                                  "Input:Expecting batchSize or batchSize x 1")
+   assert(targetDim == 1 or targetDim == 2,
+                                 "Target:Expecting batchSize or batchSize x 1")
+   if inputDim == 2 then
+      assert(input:size(1)==1 or input:size(2)==1,
+                                        "Input: Expecting batchSize x 1.")
+   end
+   if targetDim == 2 then
+      assert(target:size(1)==1 or target:size(2)==1,
+                                        "Target: Expecting batchSize x 1.")
+   end
+
+   local inputElements = input:nElement()
+   assert(inputElements == target:nElement(),
+                           "No of input and target elements should be same.")
+   self._k = inputElements
+
+   local input = input:view(-1) -- both are flattened to 1D from here on
+   local target = target:view(-1)
+
+   self._baseExponents = self._baseExponents or input.new()
+   self._coeff = self._coeff or input.new()
+   self._logCoeff = self._logCoeff or input.new()
+
+   -- exp(-target*input)
+   self._baseExponents:resize(input:size()):copy(input)
+   self._baseExponents:cmul(target):mul(-1):exp()
+
+   -- 1 + exp(-target*input)
+   self._coeff:resize(input:size()):copy(self._baseExponents):add(1)
+
+   -- log(1 + exp(-target*input))
+   self._logCoeff:resize(input:size()):copy(self._coeff):log()
+
+   -- keep self.output in sync with the returned loss (it was never set before)
+   self.output = self._logCoeff:sum()
+   if self.sizeAverage then
+      self.output = self.output/self._k
+   end
+   return self.output
+end
+
+-- gradInput = -target * exp(-target*input) / (1 + exp(-target*input)), from the
+-- buffers cached by updateOutput; divided by self._k when sizeAverage is set.
+function BinaryLogisticRegression:updateGradInput(input, target)
+   self.gradInput = self.gradInput or input.new()
+   local grad = self.gradInput
+   grad:resize(input:size()):copy(target):mul(-1)
+   grad:cmul(self._baseExponents):cdiv(self._coeff)
+   if self.sizeAverage then
+      grad:div(self._k)
+   end
+   return grad
+end
+
+-- Drops the cached buffers on an actual cast (type non-nil) so they are
+-- re-created from the next input, then defers to the parent.
+function BinaryLogisticRegression:type(type, tensorCache)
+   if type then
+      self._baseExponents, self._coeff, self._logCoeff = nil, nil, nil
+   end
+   return parent.type(self, type, tensorCache)
+end
diff --git a/CAddTensorTable.lua b/CAddTensorTable.lua
new file mode 100644
index 0000000..16efe44
--- /dev/null
+++ b/CAddTensorTable.lua
@@ -0,0 +1,43 @@
+
+local CAddTensorTable, parent = torch.class('nn.CAddTensorTable', 'nn.Module')
+
+function CAddTensorTable:__init()
+   parent.__init(self)
+   self.gradInput = {} -- filled as {tensor, table of tensors} by updateGradInput
+end
+
+-- input[1] is the tensor to add; input[2] is the table of tensors it is
+-- added to. Returns a table with output[i] = input[2][i] + input[1].
+function CAddTensorTable:updateOutput(input)
+  -- reuse the previous output table/tensors instead of reallocating them
+  local currentOutput = torch.type(self.output) == 'table' and self.output or {}
+  for i=1,#input[2] do
+    if torch.type(currentOutput[i]) ~= torch.type(input[1]) then
+      currentOutput[i] = input[1].new()
+    end
+    currentOutput[i]:resizeAs(input[1]):copy(input[2][i]):add(input[1])
+  end
+  -- trim leftovers of a previous, longer input
+  for i = #input[2]+1, #currentOutput do currentOutput[i] = nil end
+  self.output = currentOutput
+  return self.output
+end
+
+-- gradInput[1] is the sum of all gradOutput[i]; gradInput[2][i] is a copy of
+-- gradOutput[i]. Buffers are sized like input[1].
+function CAddTensorTable:updateGradInput(input, gradOutput)
+  local vec, n = input[1], #input[2]
+  local gradVec = self.gradInput[1] or vec.new()
+  gradVec:resizeAs(vec):copy(gradOutput[1])
+  for i = 2, n do
+    gradVec:add(gradOutput[i])
+  end
+  local gradTable = self.gradInput[2] or {}
+  for i = 1, n do
+    gradTable[i] = gradTable[i] or vec.new()
+    gradTable[i]:resizeAs(vec):copy(gradOutput[i])
+  end
+  for i = n + 1, #gradTable do gradTable[i] = nil end -- trim stale buffers
+  self.gradInput[1], self.gradInput[2] = gradVec, gradTable
+  return self.gradInput
+end
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7979513..c4a8a34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,58 @@ SET(luasrc
   utils.lua
   LinearRNN.lua
   LookupRNN.lua
+  ArgMax.lua
+  BatchNormalization.lua
+  BinaryClassReward.lua
+  BinaryLogisticRegression.lua
+  CAddTensorTable.lua
+  CategoricalEntropy.lua
+  Clip.lua
+  Collapse.lua
+  Constant.lua
+  Container.lua
+  Convert.lua
+  Criterion.lua
+  Decorator.lua
+  Dictionary.lua
+  DontCast.lua
+  FireModule.lua
+  Inception.lua
+  Kmeans.lua
+  LookupTable.lua
+  ModuleCriterion.lua
+  NCECriterion.lua
+  NCEModule.lua
+  NaN.lua
+  OneHot.lua
+  PCAColorTransform.lua
+  ParallelTable.lua
+  PrintSize.lua
+  Profile.lua
+  Reinforce.lua
+  ReinforceBernoulli.lua
+  ReinforceCategorical.lua
+  ReinforceGamma.lua
+  ReinforceNormal.lua
+  ReverseTable.lua
+  Sequential.lua
+  Serial.lua
+  SimpleColorTransform.lua
+  SpatialBatchNormalization.lua
+  SpatialBinaryConvolution.lua
+  SpatialBinaryLogisticRegression.lua
+  SpatialConvolution.lua
+  SpatialConvolutionMM.lua
+  SpatialFeatNormalization.lua
+  SpatialGlimpse.lua
+  SpatialMaxPooling.lua
+  SpatialRegionDropout.lua
+  SpatialUniformCrop.lua
+  TotalDropout.lua
+  VRClassReward.lua
+  WhiteNoise.lua
+  ZipTable.lua
+  ZipTableOneToMany.lua
 )
 
 ADD_TORCH_PACKAGE(rnn "${src}" "${luasrc}" "An RNN library for Torch")
@@ -64,4 +116,4 @@ TARGET_LINK_LIBRARIES(rnn luaT TH)
 
 SET_TARGET_PROPERTIES(rnn_static PROPERTIES COMPILE_FLAGS "-fPIC -DSTATIC_TH")
 
-INSTALL(FILES ${luasrc} DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/rnn")
\ No newline at end of file
+INSTALL(FILES ${luasrc} DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/rnn")
diff --git a/CategoricalEntropy.lua b/CategoricalEntropy.lua
new file mode 100644
index 0000000..610494c
--- /dev/null
+++ b/CategoricalEntropy.lua
@@ -0,0 +1,63 @@
+------------------------------------------------------------------------
+--[[ CategoricalEntropy ]]--
+-- Maximize the entropy of a categorical distribution (e.g. softmax ).
+-- H(X) = E(-log(p(X))) = -sum(p(X)log(p(X)))
+-- where X = 1,...,N and N is the number of categories.
+-- A batch with an entropy below minEntropy will be maximized.
+-- d H(X=x)     p(x)
+-- -------- = - ---- - log(p(x)) = -1 - log(p(x))
+--   d p        p(x)
+------------------------------------------------------------------------
+local CE, parent = torch.class("nn.CategoricalEntropy", "nn.Module")
+
+-- scale: multiplier of the entropy gradient (default 1).
+-- minEntropy: optional; when given, the entropy gradient is only added for
+-- batches whose entropy is below it (see updateGradInput).
+function CE:__init(scale, minEntropy)
+   parent.__init(self)
+   self.scale = scale or 1
+   self.minEntropy = minEntropy
+   -- log(p(x) + eps); the epsilon prevents log(0) = nan errors
+   local logp = nn.Sequential()
+   logp:add(nn.AddConstant(0.000001))
+   logp:add(nn.Log())
+   local concat = nn.ConcatTable()
+   concat:add(nn.Identity()) -- p(X)
+   concat:add(logp) -- log(p(X))
+   -- P(X) using the batch as a prior, then H(X) = -sum(p(x)log(p(x)))
+   self._mul = nn.MulConstant(1) -- constant is set in updateGradInput (sum to one)
+   self.module = nn.Sequential()
+   self.module:add(nn.Sum(1)) -- sum categorical probabilities over batch
+   self.module:add(self._mul)
+   self.module:add(concat)
+   self.module:add(nn.CMulTable()) -- p(x)log(p(x))
+   self.module:add(nn.Sum()) -- sum(p(x)log(p(x)))
+   self.module:add(nn.MulConstant(-1)) -- H(x)
+   self.modules = {self.module}
+
+   self.minusOne = torch.Tensor{-self.scale} -- gradient descent on maximization
+   self.sizeAverage = true
+end
+
+-- Forward is the identity: self.output is set to view input.
+function CE:updateOutput(input)
+   assert(input:dim() == 2, "CategoricalEntropy only works with batches")
+   return self.output:set(input)
+end
+
+-- Passes gradOutput through and, when minEntropy is unset or the batch
+-- entropy is below it, adds the gradient of -scale * H(X) w.r.t. input.
+function CE:updateGradInput(input, gradOutput, scale)
+   assert(input:dim() == 2, "CategoricalEntropy only works with batches")
+   self.gradInput:resizeAs(input):copy(gradOutput)
+
+   self._mul.constant_scalar = 1/input:sum() -- sum to one
+   self.entropy = self.module:updateOutput(input)[1]
+   local maximize = not self.minEntropy or self.entropy < self.minEntropy
+   if maximize then
+      local gradEntropy = self.module:updateGradInput(input, self.minusOne, scale)
+      if self.sizeAverage then gradEntropy:div(input:size(1)) end
+      self.gradInput:add(gradEntropy)
+   end
+   return self.gradInput
+end
diff --git a/Clip.lua b/Clip.lua
new file mode 100644
index 0000000..fdd04de
--- /dev/null
+++ b/Clip.lua
@@ -0,0 +1,35 @@
+------------------------------------------------------------------------
+--[[ Clip ]]--
+-- clips values within minval and maxval
+------------------------------------------------------------------------
+local Clip, parent = torch.class("nn.Clip", "nn.Module")
+
+-- minval, maxval: numeric lower and upper bounds applied by updateOutput.
+function Clip:__init(minval, maxval)
+   assert(torch.type(minval) == 'number')
+   assert(torch.type(maxval) == 'number')
+   self.minval, self.maxval = minval, maxval
+   parent.__init(self)
+end
+
+-- Copies input into self.output with every element limited to the range
+-- [self.minval, self.maxval].
+--
+-- input: tensor of any shape; self.output is resized to match it.
+-- Returns self.output.
+--
+-- NOTE(review): assumes minval <= maxval, which __init does not check.
+function Clip:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   -- One in-place clamp replaces the two mask passes (gt, then lt) of the
+   -- previous implementation, which also copied each mask into a
+   -- ByteTensor on every call for non-CUDA tensor types.
+   self.output:clamp(self.minval, self.maxval)
+   return self.output
+end
+
+function Clip:updateGradInput(input, gradOutput)
+   self.gradInput:set(gradOutput) -- gradient passes through unchanged, even for clipped elements
+   return self.gradInput
+end
+
diff --git a/Collapse.lua b/Collapse.lua
new file mode 100644
index 0000000..95fb98e
--- /dev/null
+++ b/Collapse.lua
@@ -0,0 +1,26 @@
+local Collapse, parent = torch.class('nn.Collapse', 'nn.Module')
+
+-- collapses non-batch dims
+function Collapse:__init(nInputDim)
+   parent.__init(self)
+   self.nInputDim = nInputDim -- inputs with more dims than this keep dim 1 (see updateOutput)
+end
+
+-- Views input as (size(1), -1) when it has more than nInputDim dims, else as
+-- a flat vector. Non-contiguous inputs are first copied into self._input.
+function Collapse:updateOutput(input)
+   local src = input
+   if not src:isContiguous() then
+      self._input = self._input or src.new()
+      src = self._input:resize(src:size()):copy(src)
+   end
+   if src:dim() > self.nInputDim then
+      return self.output:view(src, src:size(1), -1)
+   end
+   return self.output:view(src, -1)
+end
+
+function Collapse:updateGradInput(input, gradOutput)
+   self.gradInput:view(gradOutput, input:size()) -- gradOutput viewed with input's size
+   return self.gradInput
+end
diff --git a/Constant.lua b/Constant.lua
new file mode 100644
index 0000000..fdfdff4
--- /dev/null
+++ b/Constant.lua
@@ -0,0 +1,36 @@
+------------------------------------------------------------------------
+--[[ Constant ]]--
+-- Outputs a constant value given an input.
+-- If nInputDim is specified, uses the input to determine the size of 
+-- the batch. The value is then replicated over the batch.
+-- You can use this with nn.ConcatTable() to append constant inputs to
+-- an input : nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity()) .
+------------------------------------------------------------------------
+local Constant, parent = torch.class("nn.Constant", "nn.Module")
+
+-- value: number or tensor to output; nInputDim: optional, inputs with more
+-- dims than this get the value replicated over input:size(1) in updateOutput.
+function Constant:__init(value, nInputDim)
+   if torch.type(value) == 'number' then value = torch.Tensor{value} end
+   self.value = value
+   assert(torch.isTensor(self.value), "Expecting number or tensor at arg 1")
+   self.nInputDim = nInputDim
+   parent.__init(self)
+end
+
+-- Outputs self.value, replicated input:size(1) times along a new first dim
+-- when nInputDim is set and input has more dims than that.
+function Constant:updateOutput(input)
+   local unpack = unpack or table.unpack -- table.unpack is nil on plain Lua 5.1/LuaJIT
+   if not (self.nInputDim and input:dim() > self.nInputDim) then
+      return self.output:resize(self.value:size()):copy(self.value)
+   end
+   local vsize = self.value:size():totable()
+   self.output:resize(input:size(1), unpack(vsize))
+   return self.output:copy(self.value:view(1, unpack(vsize)):expand(self.output:size()))
+end
+
+function Constant:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input):zero() -- always zeros shaped like input
+   return self.gradInput
+end
diff --git a/Container.lua b/Container.lua
new file mode 100644
index 0000000..bbf9af0
--- /dev/null
+++ b/Container.lua
@@ -0,0 +1,51 @@
+local Container = nn.Container
+
+-- multi-add: appends each argument in order via self:add and returns self.
+-- Iteration uses ipairs, so it stops at the first nil argument.
+function Container:extend(...)
+   local modules = {...}
+   for _, module in ipairs(modules) do self:add(module) end
+   return self
+end
+
+-- Gathers the results of each child's sparseParameters() into flat tables.
+-- Child keys are offset by the running size so they do not collide.
+-- Returns params, gradParams, scales, size.
+function Container:sparseParameters()
+    local params, gradParams, scales, offset = {}, {}, {}, 0
+    for _, child in ipairs(self.modules) do
+        local mParams, mGradParams, mScales, mSize = child:sparseParameters()
+        if mParams then
+            for k, param in pairs(mParams) do
+                assert(torch.type(param) ~= 'table')
+                local key = offset + k
+                params[key], gradParams[key] = param, mGradParams[k]
+                scales[key] = mScales and mScales[k]
+            end
+            offset = offset + (mSize or #mParams)
+        end
+    end
+    return params, gradParams, scales, offset
+end
+
+-- Returns two flat lists: the parameters and gradient parameters of all
+-- child modules, with nested tables flattened recursively.
+function Container:parameters()
+    -- torch.type is used here so that this works with torch.MultiCudaTensor
+    local function flatten(dst, src)
+        if torch.type(src) ~= 'table' then
+            dst[#dst + 1] = src
+            return
+        end
+        for i = 1, #src do flatten(dst, src[i]) end
+    end
+    local w, gw = {}, {}
+    for _, child in ipairs(self.modules) do
+        local mw, mgw = child:parameters()
+        if mw then
+            flatten(w, mw)
+            flatten(gw, mgw)
+        end
+    end
+    return w, gw
+end
diff --git a/Convert.lua b/Convert.lua
new file mode 100644
index 0000000..76d20ef
--- /dev/null
+++ b/Convert.lua
@@ -0,0 +1,244 @@
+------------------------------------------------------------------------
+--[ nn.Convert ]--
+-- Module to convert between different data formats
+-- nn.Convert('bchw', 'bf') or nn.Convert('chw', 'f')
+-- Automatically converts input to same type as self.output
+-- Simplest use is for automatic input type conversions : nn.Convert()
+------------------------------------------------------------------------
+local _ = require 'moses'
+local Convert, parent = torch.class("nn.Convert", "nn.Container")
+
+-- inputShape, outputShape: axis strings such as 'bchw' or 'bf'. A missing 'b'
+-- is prepended. With no arguments both default to 'b*'.
+function Convert:__init(inputShape, outputShape)
+   if outputShape and not inputShape then
+      error"Expecting non-nil arg 1 when arg 2 is provided"
+   end
+   inputShape = inputShape or 'b*'
+   outputShape = outputShape or inputShape
+   self.inputShape = inputShape:find('b') and inputShape or ('b'..inputShape)
+   self.outputShape = outputShape:find('b') and outputShape or ('b'..outputShape)
+   self.inputBatchDim = self.inputShape:find('b')
+   self.outputBatchDim = self.outputShape:find('b')
+   local anyInput, anyOutput = self.inputShape == 'b*', self.outputShape == 'b*'
+   if anyInput or anyOutput then
+      assert(anyInput and anyOutput, 'Both or neither shapes must be b*')
+      self.nInputDim = -1
+      self.nOutputDim = -1
+      self.transposition = true
+   else
+      -- number of dims in batch mode
+      self.nInputDim, self.nOutputDim = #self.inputShape, #self.outputShape
+      -- a transposition needs equal rank and every input axis in the output
+      if self.nInputDim == self.nOutputDim then
+         local allFound = true
+         for i = 1, self.nInputDim do
+            allFound = allFound and (self.outputShape:find(self.inputShape:sub(i,i)) ~= nil)
+         end
+         self.transposition = allFound
+      end
+   end
+   parent.__init(self)
+end
+
+-- post-initialization: builds self.converter (updateOutput calls this when it
+-- is missing), casts it to self.output's type and stores it as modules[1].
+function Convert:buildConverter(input)
+   if self.transposition then
+      self.converter = self:transpose(self.outputShape)
+   else
+      local build = self[self.outputShape]
+      if torch.type(build) ~= 'function' then
+         error(string.format("Unrecognized conversion of shape %s to %s", self.inputShape, self.outputShape))
+      end
+      self.converter = build(self, input)
+   end
+   assert(torch.isTensor(self.output), "Expecting Tensor output")
+   self.converter:type(torch.type(self.output))
+   self.converter:serialMode(self.dpnn_serialEmpty, self.dpnn_serialType)
+   self.modules[1] = self.converter
+end
+
+-- Casts input to the type of self.output if needed, adds a singleton batch
+-- dim when input has fewer dims than nInputDim, runs the (lazily built)
+-- converter and removes the batch dim again for non-batch inputs.
+function Convert:updateOutput(input)
+   assert(torch.isTensor(input), "expecting Tensor")
+   if not torch.isTypeOf(input, torch.type(self.output)) then
+      -- handle different input type
+      self._input = self._input or self.output.new()
+      input = self._input:resize(input:size()):copy(input)
+   end
+   self.batchMode = input:dim() >= self.nInputDim
+   if not self.batchMode then
+      -- handle non-batch mode: view input with a size-1 batch dim
+      local inputSize = input:size():totable()
+      table.insert(inputSize, self.inputBatchDim, 1)
+      self.__input = self.__input or input.new()
+      input = self.__input:set(input):resize(unpack(inputSize))
+   end
+   if not self.converter then
+      self:buildConverter(input)
+   end
+
+   self.output = self.converter:updateOutput(input)
+
+   if not self.batchMode then
+      -- strip the batch dim that was added above
+      local outputSize = self.output:size():totable()
+      table.remove(outputSize, self.outputBatchDim)
+      self.__output = self.__output or self.output.new()
+      self.output = self.__output:set(self.output):resize(unpack(outputSize))
+   end
+   return self.output
+end
+
+-- Back-propagates through the converter, undoing updateOutput's batch-dim
+-- and type adaptations: gradInput has the caller input's size and type.
+function Convert:updateGradInput(input, gradOutput)
+   local input_ = input
+   -- use the cast buffer only if this input was cast (it may be stale)
+   local cast = self._input and not torch.isTypeOf(input, torch.type(self.output))
+   if cast then input = self._input end
+   if not self.batchMode then
+      input = self.__input
+      self.__gradOutput = self.__gradOutput or gradOutput.new()
+      self.__gradOutput:set(gradOutput):resize(self.converter.output:size())
+      gradOutput = self.__gradOutput
+   end
+   local gradInput = self.converter:updateGradInput(input, gradOutput)
+
+   if not self.batchMode then
+      self.__gradInput = self.__gradInput or gradInput.new()
+      gradInput = self.__gradInput:set(gradInput):resize(input_:size())
+   end
+   if cast then -- copy back to the caller's type/shape, not the buffer's
+      local sameType = torch.type(self._gradInput) == torch.type(input_)
+      self._gradInput = sameType and self._gradInput or input_.new()
+      gradInput = self._gradInput:resize(input_:size()):copy(gradInput)
+   end
+   self.gradInput = gradInput
+   return self.gradInput
+end
+
+function Convert:accGradParameters(input, gradOutput, scale)
+   input = not self.batchMode and self.__input or self._input or input -- as in updateGradInput:
+   gradOutput = not self.batchMode and self.__gradOutput or gradOutput -- batch-dim buffers apply in non-batch mode only
+   self.converter:accGradParameters(input, gradOutput, scale)
+end
+
+function Convert:accUpdateGradParameters(input, gradOutput, lr)
+   input = not self.batchMode and self.__input or self._input or input -- as in updateGradInput:
+   gradOutput = not self.batchMode and self.__gradOutput or gradOutput -- batch-dim buffers apply in non-batch mode only
+   self.converter:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+-- batch feature: builds the module converting inputShape to 'bf'.
+-- The batch axis is transposed to position 1 when it is elsewhere, and
+-- inputs with more than two axes are reshaped to one vector per sample.
+-- input: tensor with a batch axis, used only to size the reshape.
+-- Returns an nn module (nn.Identity when nothing has to change).
+function Convert:bf(input)
+   local b_pos = self:findAxis('b', self.inputShape)
+   local dim = #self.inputShape
+   if self.inputShape == 'bt' then
+      error"Conversion of shape bt to bf not supported: open an issue on github"
+   end
+   if dim == 1 then
+      return nn.Reshape(1) -- was b
+   end
+   -- was b...
+   local transpose = (b_pos ~= 1) and nn.Transpose({1, b_pos}) or nil
+   if dim <= 2 then
+      return transpose or nn.Identity()
+   end
+   -- more than one non-batch axis: flatten each sample to a vector
+   local sampleSize = input:select(self:findAxis('b'),1):nElement()
+   local reshape = nn.Reshape(sampleSize)
+   if not transpose then
+      return reshape
+   end
+   local modula = nn.Sequential()
+   modula:add(transpose)
+   modula:add(reshape)
+   return modula
+end
+
+-- each example is a scalar; batch is a vector. Builds the module that
+-- converts 'bt'/'tb'/'bf'/'fb' to 'b' by selecting index 1 of the other axis.
+function Convert:b(input)
+   self:findAxis('b') -- raises if inputShape has no batch axis
+   local shape = self.inputShape
+   if shape == 'bt' or shape == 'tb' then
+      return nn.Select(self:findAxis('t'), 1) -- select first set of classes
+   end
+   if shape ~= 'bf' and shape ~= 'fb' then
+      error("Cannot convert shape "..self.inputShape.." to shape b")
+   end
+   -- this wont work as expected with size(f) > 1, hence the check
+   local f_pos = self:findAxis('f')
+   if input:size(f_pos) > 1 then
+      error("Cannot convert shape "..self.inputShape.." to b when feature > 1")
+   end
+   return nn.Select(f_pos, 1)
+end
+
+-- default conversion: returns an nn.Identity module
+function Convert:default()
+   return nn.Identity()
+end
+
+-- multi-class (batch target)
+-- Builds the module converting inputShape to 'bt'. Only 'b' is supported
+-- (via nn.Reshape(1)); any other input shape raises an error.
+-- Returns an nn module.
+function Convert:bt()
+   self:findAxis('b') -- raises if inputShape has no batch axis
+   if self.inputShape ~= 'b' then
+      error("cannot convert shape '"..self.inputShape.."' to bt")
+   end
+   return nn.Reshape(1)
+end
+
+-- a generic function for transposing shape axes: returns an nn.Transpose
+-- whose swaps reorder inputShape into newShape (nn.Identity if equal).
+function Convert:transpose(newShape)
+   if newShape == self.inputShape then
+      return nn.Identity()
+   end
+   local axes = {}
+   for i = 1, #self.inputShape do
+      axes[i] = self.inputShape:sub(i,i)
+   end
+   local swaps = {}
+   for i = 1, #newShape do
+      local j = _.indexOf(axes, newShape:sub(i,i))
+      if i ~= j then
+         -- bring the wanted axis to position i and record the swap
+         axes[i], axes[j] = axes[j], axes[i]
+         swaps[#swaps + 1] = {j, i}
+      end
+   end
+   return nn.Transpose(unpack(swaps))
+end
+
+-- Position of axis_char in shape (default self.inputShape); errors at the
+-- caller's level when it is absent, unless silent is set (then returns nil).
+function Convert:findAxis(axis_char, shape, silent)
+   shape = shape or self.inputShape
+   local axis_pos = shape:find(axis_char)
+   if axis_pos or silent then return axis_pos end
+   error("Provided shape '"..shape.."' has no axis '"..axis_char.."'", 2)
+end
+
+-- Casts the module, first dropping the conversion buffers on a type change.
+function Convert:type(type)
+   if not torch.isTypeOf(self.output, type) then
+      local buffers = {'_input', '_gradInput', '__input', '__output',
+                       '__gradInput', '__gradOutput'}
+      for i, name in ipairs(buffers) do
+         self[name] = nil
+      end
+   end
+   return parent.type(self, type)
+end
diff --git a/Criterion.lua b/Criterion.lua
new file mode 100644
index 0000000..3ecc859
--- /dev/null
+++ b/Criterion.lua
@@ -0,0 +1,4 @@
+local Criterion = nn.Criterion
+-- reuse nn.Module's toBatch/fromBatch implementations for criteria
+Criterion.toBatch = nn.Module.toBatch
+Criterion.fromBatch = nn.Module.fromBatch
diff --git a/Decorator.lua b/Decorator.lua
new file mode 100644
index 0000000..05fb4db
--- /dev/null
+++ b/Decorator.lua
@@ -0,0 +1,47 @@
+local Decorator, parent = torch.class("nn.Decorator", "nn.Container")
+
+function Decorator:__init(module)
+   parent.__init(self)
+   -- the decorated module is stored as modules[1], so that the decorator can be handled like a Container
+   self.modules[1] = module
+end
+
+function Decorator:updateOutput(input)
+   self.output = self.modules[1]:updateOutput(input) -- forwarded to the decorated module
+   return self.output
+end
+
+function Decorator:updateGradInput(input, gradOutput)
+   self.gradInput = self.modules[1]:updateGradInput(input, gradOutput) -- forwarded to the decorated module
+   return self.gradInput
+end
+
+function Decorator:accGradParameters(input, gradOutput, scale)
+   self.modules[1]:accGradParameters(input, gradOutput, scale) -- forwarded to the decorated module
+end
+
+function Decorator:accUpdateGradParameters(input, gradOutput, lr)
+   self.modules[1]:accUpdateGradParameters(input, gradOutput, lr) -- forwarded to the decorated module
+end
+
+function Decorator:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   self.modules[1]:sharedAccUpdateGradParameters(input, gradOutput, lr) -- forwarded to the decorated module
+end
+
+-- '<own type> @ <child>', the child given by its __tostring__ or its type.
+function Decorator:__tostring__()
+   local child = self.modules[1]
+   local desc = torch.type(child)
+   if child.__tostring__ then desc = child:__tostring__() end
+   return torch.type(self) .. ' @ ' .. desc
+end
+
+-- useful for multiple-inheritance: copies Decorator's delegating methods
+-- onto class, overwriting any existing ones.
+function Decorator.decorate(class)
+   local methods = {
+      'updateOutput', 'updateGradInput', 'accGradParameters',
+      'accUpdateGradParameters', 'sharedAccUpdateGradParameters', '__tostring__',
+   }
+   for i, name in ipairs(methods) do class[name] = nn.Decorator[name] end
+end
diff --git a/Dictionary.lua b/Dictionary.lua
new file mode 100644
index 0000000..238283c
--- /dev/null
+++ b/Dictionary.lua
@@ -0,0 +1,6 @@
+local Dictionary, parent = torch.class("nn.Dictionary", "nn.LookupTable")
+
+-- Deprecated: the constructor always raises; use nn.LookupTable instead (this class was useless with optim)
+function Dictionary:__init(dictSize, embeddingSize, accUpdate)
+   error"DEPRECATED Jan 14, 2016"
+end
diff --git a/DontCast.lua b/DontCast.lua
new file mode 100644
index 0000000..b89f543
--- /dev/null
+++ b/DontCast.lua
@@ -0,0 +1,124 @@
+local DontCast, parent = torch.class("nn.DontCast", "nn.Decorator")
+
+-- utility functions
+
+-- Copies src (a tensor or nested table of tensors) into type_str tensors, reusing dst where it fits.
+local function recursiveTypeCopy(dst, src, type_str)
+   if torch.type(src) == 'table' then
+      if torch.type(dst) ~= 'table' then dst = {} end
+      for key, value in pairs(src) do
+         dst[key] = recursiveTypeCopy(dst[key], value, type_str)
+      end
+      return dst
+   end
+   if not torch.isTensor(src) then return dst end -- anything else: dst is passed through
+   if torch.type(dst) ~= type_str then dst = torch.getmetatable(type_str).new() end
+   dst:resize(src:size())
+   if src:nElement() > 0 then dst:copy(src) end
+   return dst
+end
+
+-- Returns (type name, true) for the first tensor found in src, searching
+-- tables recursively; otherwise the last type seen and a falsy flag.
+local function tableTensorType(src)
+   if type(src) ~= 'table' then
+      return torch.type(src), torch.isTensor(src)
+   end
+   local type_str, found
+   for _, item in pairs(src) do
+      type_str, found = tableTensorType(item)
+      -- the first tensor encountered decides the answer
+      if found then break end
+   end
+   return type_str, found
+end
+
+-- DontCast methods and constructor
+
+function DontCast:__init(module, castin, castout, moduleType)
+   parent.__init(self, module)
+   self.castin = castin
+   self.castout = (castout == nil) and castin or castout
+   self.moduleType = moduleType
+   -- the module type only matters when casting, and only needs inferring when not given
+   if self.moduleType or not (self.castin or self.castout) then
+      return
+   end
+   -- infer it from the module's output first, then from its parameters
+   local inferred, found = tableTensorType(module.output)
+   if not found then
+      inferred, found = tableTensorType(module:parameters())
+   end
+   if not found then
+      error"Cannot extrapolate moduleType. Provide constructor argument 4"
+   end
+   self.moduleType = inferred
+end
+
+function DontCast:updateOutput(input)
+   -- cast the input to the module's type when requested and needed
+   if self.castin and tableTensorType(input) ~= self.moduleType then
+      self._input = recursiveTypeCopy(self._input, input, self.moduleType)
+      input = self._input
+   end
+   local moduleOutput = self.modules[1]:updateOutput(input)
+   if not self.castout then
+      self.output = moduleOutput
+      return self.output
+   end
+   -- copy the result into self.output, keeping self.output's current type
+   self.output = recursiveTypeCopy(self.output, moduleOutput, tableTensorType(self.output))
+   return self.output
+end
+
+function DontCast:updateGradInput(input, gradOutput)
+   local moduleType = self.moduleType
+   if self.castin and tableTensorType(input) ~= moduleType then
+      input = self._input -- reuse the copy cast during updateOutput
+   end
+   if self.castout and tableTensorType(gradOutput) ~= moduleType then
+      -- cast gradOutput to the module's type; the copy is cached in self._gradOutput
+      self._gradOutput = recursiveTypeCopy(self._gradOutput, gradOutput, moduleType)
+      gradOutput = self._gradOutput
+   end
+   local moduleGradInput = self.modules[1]:updateGradInput(input, gradOutput)
+   if not self.castin then
+      self.gradInput = moduleGradInput
+      return self.gradInput
+   end
+   self.gradInput = recursiveTypeCopy(self.gradInput, moduleGradInput, tableTensorType(self.gradInput))
+   return self.gradInput
+end
+
+function DontCast:accGradParameters(input, gradOutput, scale)
+   -- wherever a cast was needed, hand the module the copies that were
+   -- cached by updateOutput (_input) and updateGradInput (_gradOutput)
+   local swapInput = self.castin and tableTensorType(input) ~= self.moduleType
+   local swapGrad = self.castout and tableTensorType(gradOutput) ~= self.moduleType
+   if swapInput then input = self._input end
+   if swapGrad then gradOutput = self._gradOutput end
+
+   self.modules[1]:accGradParameters(input, gradOutput, scale)
+end
+
+function DontCast:accUpdateGradParameters(input, gradOutput, lr)
+   -- wherever a cast was needed, hand the module the copies that were
+   -- cached by updateOutput (_input) and updateGradInput (_gradOutput)
+   local swapInput = self.castin and tableTensorType(input) ~= self.moduleType
+   local swapGrad = self.castout and tableTensorType(gradOutput) ~= self.moduleType
+   if swapInput then input = self._input end
+   if swapGrad then gradOutput = self._gradOutput end
+
+   self.modules[1]:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+-- dont cast (the essence thereof): only this decorator's own output/gradInput follow the type
+function DontCast:type(type)
+   local function recast(buffer) -- buffer itself when already of the type, else a fresh copy
+      if tableTensorType(buffer) == type then return buffer end
+      return recursiveTypeCopy(nil, buffer, type)
+   end
+   if self.castout then self.output = recast(self.output) end
+   if self.castin then self.gradInput = recast(self.gradInput) end
+   return self
+end
diff --git a/FireModule.lua b/FireModule.lua
new file mode 100644
index 0000000..c927c23
--- /dev/null
+++ b/FireModule.lua
@@ -0,0 +1,47 @@
+--[[
+  Fire module as explained in SqueezeNet http://arxiv.org/pdf/1602.07360v1.pdf.
+--]]
+--FIXME works only for batches.
+
+local FireModule, Parent = torch.class('nn.FireModule', 'nn.Decorator')
+
+function FireModule:__init(nInputPlane, s1x1, e1x1, e3x3, activation)
+   -- nInputPlane: input planes; s1x1: squeeze (1x1) planes;
+   -- e1x1, e3x3: expand planes for the 1x1 and 3x3 convolutions;
+   -- activation: name of the nn transfer module to use (default 'ReLU')
+   self.nInputPlane, self.s1x1 = nInputPlane, s1x1
+   self.e1x1, self.e3x3 = e1x1, e3x3
+   self.activation = activation or 'ReLU'
+
+   if s1x1 > (e1x1 + e3x3) then
+      print('Warning: <FireModule> s1x1 is recommended to be smaller'..
+            ' then e1x1+e3x3')
+   end
+
+   -- squeeze: 1x1 convolution from nInputPlane to s1x1 planes
+   local squeeze = nn.SpatialConvolution(nInputPlane, s1x1, 1, 1)
+   -- expand: 1x1 and (padded) 3x3 convolutions concatenated along dim 2
+   local expand = nn.Concat(2)
+   expand:add(nn.SpatialConvolution(s1x1, e1x1, 1, 1))
+   expand:add(nn.SpatialConvolution(s1x1, e3x3, 3, 3, 1, 1, 1, 1))
+   -- Fire Module: squeeze -> activation -> expand -> activation
+   local fire = nn.Sequential()
+   fire:add(squeeze); fire:add(nn[self.activation]())
+   fire:add(expand); fire:add(nn[self.activation]())
+   self.squeeze, self.expand, self.module = squeeze, expand, fire
+   Parent.__init(self, fire)
+end
+
+--[[
+function FireModule:type(type, tensorCache)
+   assert(type, 'Module: must provide a type to convert to')
+   self.module = nn.utils.recursiveType(self.module, type, tensorCache)
+end
+--]]
+
+function FireModule:__tostring__() -- one-line summary of the constructor arguments
+   local fmt = '%s inputPlanes: %d -> Squeeze Planes: %d -> '..
+               'Expand: %d(1x1) + %d(3x3), activation: %s'
+   return string.format(fmt, torch.type(self), self.nInputPlane, self.s1x1,
+                        self.e1x1, self.e3x3, self.activation)
+end
diff --git a/Inception.lua b/Inception.lua
new file mode 100644
index 0000000..7d57c25
--- /dev/null
+++ b/Inception.lua
@@ -0,0 +1,192 @@
+------------------------------------------------------------------------
+-- [[ Inception ]]--
+-- Uses n+2 parallel "columns". The original paper uses 2+2 where
+-- the first two are (but there could be more than two):
+-- 1x1 conv (reduce) -> relu -> 5x5 conv -> relu
+-- 1x1 conv (reduce) -> relu -> 3x3 conv -> relu
+-- and where the other two are :
+-- 3x3 maxpool -> 1x1 conv (reduce/project) -> relu
+-- 1x1 conv (reduce) -> relu.
+-- This Model allows the first group of columns to be of any
+-- number while the last group consist of exactly two columns.
+-- The 1x1 conv are used to reduce the number of input channels
+-- (or filters) such that the capacity of the network doesn't
+-- explode. We refer to these here as "reduce". Since each
+-- column seems to have one and only one reduce, their initial
+-- configuration options are specified in lists of n+2 elements.
+------------------------------------------------------------------------
+local Inception, parent = torch.class("nn.Inception", "nn.Decorator")
+
+function Inception:__init(config) -- config: table holding the options documented below
+   --[[ Required Arguments ]]--
+   -- Number of input channels or colors
+   self.inputSize = config.inputSize
+   -- Number of filters in the non-1x1 convolution kernel sizes, e.g. {32,48}
+   self.outputSize = config.outputSize
+   -- Number of filters in the 1x1 convolutions (reduction)
+   -- used in each column, e.g. {48,64,32,32}. The last 2 are
+   -- used respectively for the max pooling (projection) column
+   -- (the last column in the paper) and the column that has
+   -- nothing but a 1x1 conv (the first column in the paper).
+   -- This table should have two elements more than the outputSize
+   self.reduceSize = config.reduceSize
+
+   --[[ Optional Arguments ]]--
+   -- The strides of the 1x1 (reduction) convolutions. Defaults to {1,1,...}
+   self.reduceStride = config.reduceStride or {}
+   -- A transfer function like nn.Tanh, nn.Sigmoid, nn.ReLU, nn.Identity, etc.
+   -- It is used after each reduction (1x1 convolution) and convolution
+   -- (an instance is expected: it is cloned for every use below)
+   self.transfer = config.transfer or nn.ReLU()
+   -- batch normalization can be awesome
+   self.batchNorm = config.batchNorm
+   -- Adding padding to the input of the convolutions such that
+   -- input width and height are same as that of output.
+   self.padding = true -- default; only an explicit non-nil config.padding overrides it
+   if config.padding ~= nil then
+      self.padding = config.padding
+   end
+   -- The size (height=width) of the non-1x1 convolution kernels.
+   self.kernelSize = config.kernelSize or {5,3}
+   -- The stride (height=width) of the convolution.
+   self.kernelStride = config.kernelStride or {1,1}
+   -- The size (height=width) of the spatial max pooling used
+   -- in the next-to-last column.
+   self.poolSize = config.poolSize or 3
+   -- The stride (height=width) of the spatial max pooling.
+   self.poolStride = config.poolStride or 1
+   -- The pooling layer.
+   self.pool = config.pool or nn.SpatialMaxPooling(self.poolSize, self.poolSize, self.poolStride, self.poolStride)
+
+
+   -- Variables checking that all of the output sizes are the same for a sample input.
+   local iWidth, iHeight = 100, 200 -- sample size used only for these checks
+   local oWidth, oHeight
+
+   -- [[ Module Construction ]]--
+   local depthConcat = nn.DepthConcat(2) -- concat on 'c' dimension
+   -- one column per entry of kernelSize:
+   -- 1x1 conv (reduce) -> transfer -> nxn conv -> transfer
+   -- (batch normalization precedes each transfer when batchNorm is set)
+   for i=1,#self.kernelSize do
+      local mlp = nn.Sequential()
+      -- 1x1 conv
+      local reduce = nn.SpatialConvolution(
+         self.inputSize, self.reduceSize[i], 1, 1,
+         self.reduceStride[i] or 1, self.reduceStride[i] or 1
+      )
+      mlp:add(reduce)
+      if self.batchNorm then
+         mlp:add(nn.SpatialBatchNormalization(self.reduceSize[i]))
+      end
+      mlp:add(self.transfer:clone())
+
+      -- nxn conv
+      local pad = self.padding and math.floor(self.kernelSize[i]/2) or 0
+      local conv = nn.SpatialConvolution(
+         self.reduceSize[i], self.outputSize[i],
+         self.kernelSize[i], self.kernelSize[i],
+         self.kernelStride[i], self.kernelStride[i],
+         pad
+      )
+      mlp:add(conv)
+      if self.batchNorm then
+         mlp:add(nn.SpatialBatchNormalization(self.outputSize[i]))
+      end
+      mlp:add(self.transfer:clone())
+      depthConcat:add(mlp)
+
+      -- Check the output sizes (warn only). NOTE(review): reduceStride[i] is not accounted for here.
+      local oWidth_i = torch.floor(
+         (iWidth + 2*pad - self.kernelSize[i])/self.kernelStride[i] + 1)
+      local oHeight_i = torch.floor(
+         (iHeight + 2*pad - self.kernelSize[i])/self.kernelStride[i] + 1)
+      if oWidth == nil then
+         oWidth = oWidth_i
+         oHeight = oHeight_i
+      else
+         if oWidth ~= oWidth_i or oHeight ~= oHeight_i then
+            print("dpnn.Inception: Warning: Inconsistent output sizes.")
+         end
+      end
+   end
+
+   -- pool -> 1x1 conv
+   local mlp = nn.Sequential()
+   mlp:add(self.pool)
+   -- not sure if transfer should go here? mlp:add(transfer:clone())
+   local i = #(self.kernelSize) + 1
+   if self.reduceSize[i] then
+      local reduce = nn.SpatialConvolution(
+         self.inputSize, self.reduceSize[i], 1, 1,
+         self.reduceStride[i] or 1, self.reduceStride[i] or 1
+      )
+      mlp:add(reduce)
+      if self.batchNorm then
+         mlp:add(nn.SpatialBatchNormalization(self.reduceSize[i]))
+      end
+      mlp:add(self.transfer:clone())
+   end
+   depthConcat:add(mlp)
+
+   -- Check the output sizes. Infer the operation of the pooling layer.
+   if self.pool.kW ~= nil and self.pool.dW ~= nil and self.pool.padW ~= nil then
+      assert(oWidth ~= nil)
+      assert(oHeight ~= nil)
+      local oWidth_pool = torch.floor(
+         (iWidth + 2*self.pool.padW - self.pool.kW)/self.pool.dW + 1)
+      local oHeight_pool = torch.floor(
+         (iHeight + 2*self.pool.padH - self.pool.kH)/self.pool.dH + 1)
+      if oWidth ~= oWidth_pool or oHeight ~= oHeight_pool then
+         print("dpnn.Inception: Warning: Inconsistent output sizes in pooling.")
+      end
+   end
+
+   -- reduce: 1x1 conv (channel-wise pooling)
+   i = i + 1
+   if self.reduceSize[i] then
+      local mlp = nn.Sequential()
+      local reduce = nn.SpatialConvolution(
+          self.inputSize, self.reduceSize[i], 1, 1,
+          self.reduceStride[i] or 1, self.reduceStride[i] or 1
+      )
+      mlp:add(reduce)
+      if self.batchNorm then
+          mlp:add(nn.SpatialBatchNormalization(self.reduceSize[i]))
+      end
+      mlp:add(self.transfer:clone())
+      depthConcat:add(mlp)
+
+      -- Check the output sizes.
+      local oWidth_conv = torch.floor((iWidth - 1)/(self.reduceStride[i] or 1) + 1)
+      local oHeight_conv = torch.floor((iHeight - 1)/(self.reduceStride[i] or 1) + 1)
+      if oWidth ~= oWidth_conv or oHeight ~= oHeight_conv then
+         print("dpnn.Inception: Warning: Inconsistent output sizes in 1x1 conv.")
+      end
+   end
+
+   parent.__init(self, depthConcat)
+end
+
+function Inception:updateOutput(input)
+   -- toBatch/fromBatch (defined elsewhere) presumably add/strip a batch dim for 3D samples
+   local batched = self:toBatch(input, 3)
+   self.output = self:fromBatch(self.modules[1]:updateOutput(batched), 3)
+   return self.output
+end
+
+function Inception:updateGradInput(input, gradOutput)
+   local batchedInput = self:toBatch(input, 3)
+   local batchedGrad = self:toBatch(gradOutput, 3)
+   self.gradInput = self:fromBatch(self.modules[1]:updateGradInput(batchedInput, batchedGrad), 3)
+   return self.gradInput
+end
+
+function Inception:accGradParameters(input, gradOutput, scale)
+   local batchedInput, batchedGrad = self:toBatch(input, 3), self:toBatch(gradOutput, 3)
+   self.modules[1]:accGradParameters(batchedInput, batchedGrad, scale) -- delegate to the columns
+end
+
+function Inception:accUpdateGradParameters(input, gradOutput, lr)
+   local batchedInput, batchedGrad = self:toBatch(input, 3), self:toBatch(gradOutput, 3)
+   self.modules[1]:accUpdateGradParameters(batchedInput, batchedGrad, lr) -- delegate to the columns
+end
diff --git a/Kmeans.lua b/Kmeans.lua
new file mode 100644
index 0000000..fb0f3d8
--- /dev/null
+++ b/Kmeans.lua
@@ -0,0 +1,205 @@
+-- Online (Hard) Kmeans layer.
+local Kmeans, parent = torch.class('nn.Kmeans', 'nn.Module')
+
+function Kmeans:__init(k, dim, scale)
+   parent.__init(self)
+   -- k: number of clusters; dim: dimensionality of the samples/centers
+   self.k, self.dim = k, dim
+
+   -- scale for online kmean update
+   self.scale = scale
+
+   assert(k > 0, "Clusters cannot be 0 or negative.")
+   assert(dim > 0, "Dimensionality cannot be 0 or negative.")
+
+   -- Kmeans centers -> self.weight (k x dim), plus a same-sized gradient buffer
+   local centers = torch.Tensor(k, dim)
+   self.weight = centers
+   self.gradWeight = torch.Tensor(centers:size())
+   self.loss = 0 -- within cluster error of the last forward
+
+   -- per-cluster sample counts, filled in by accGradParameters
+   self.clusterSampleCount = torch.Tensor(k)
+   self:reset() -- draw the initial centers
+end
+
+-- Reset: redraws the centers uniformly from [-stdev, stdev] (stdev defaults to 1)
+function Kmeans:reset(stdev)
+   local bound = stdev or 1
+   self.weight:uniform(-bound, bound)
+end
+
+-- Initialize Kmeans weight with random samples from input.
+-- input: 2D tensor (samples x dim) holding at least k samples.
+function Kmeans:initRandom(input)
+   assert(input:nDimension() == 2, "Incorrect input dimensionality. Expecting 2D.")
+
+   local noOfSamples, dim = input:size(1), input:size(2)
+   assert(dim == self.dim, "Dimensionality of input and weight don't match.")
+   assert(noOfSamples >= self.k, "Need atleast k samples for initialization.")
+
+   -- one random sample index (1..noOfSamples) per center
+   local picks = torch.zeros(self.k)
+   picks:random(1, noOfSamples)
+   -- copy each picked sample into the corresponding center
+   for center = 1, self.k do
+      self.weight[center]:copy(input[picks[center]])
+   end
+end
+
+-- Initialize using Kmeans++ (input: 2D, samples x dim; p defaults to self.p, then 0.95)
+function Kmeans:initKmeansPlus(input, p)
+   self.p = p or self.p or 0.95
+   assert(self.p>=0 and self.p<=1, "P value should be between 0-1.")
+
+   local inputDim = input:nDimension()
+   assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+   local noOfSamples = input:size(1)
+   -- pcount: how many of the farthest samples each new center is drawn from
+   local pcount = math.ceil((1-self.p)*noOfSamples)
+   if pcount <= 0 then pcount = 1 end
+
+   local initializedK = 1 -- the first center is a uniformly random sample
+   self.weight[initializedK]:copy(input[torch.random(noOfSamples)])
+   initializedK = initializedK + 1
+
+   local clusters = self.weight.new()
+   local clusterDistances = self.weight.new()
+   local temp = self.weight.new()
+   local expandedSample = self.weight.new()
+   local distances = self.weight.new()
+   distances:resize(noOfSamples):fill(math.huge) -- per sample: squared distance to its nearest chosen center
+   local maxScores = self.weight.new()
+   local maxIndx = self.weight.new()
+   -- choose the remaining centers one at a time
+   for k=initializedK, self.k do
+      clusters = self.weight[{{initializedK-1, initializedK-1}}] -- the most recently chosen center
+      for i=1, noOfSamples do
+         temp:expand(input[{{i}}], 1, self.dim)
+         expandedSample:resize(temp:size()):copy(temp)
+      
+         -- Squared Euclidean distance
+         expandedSample:add(-1, clusters)
+         clusterDistances:norm(expandedSample, 2, 2)
+         clusterDistances:pow(2)
+         distances[i] = math.min(clusterDistances:min(), distances[i])
+      end
+      maxScores, maxIndx = distances:sort(true) -- descending: farthest samples first
+      local tempIndx = torch.random(pcount) -- random rank among the pcount farthest
+      local indx = maxIndx[tempIndx]
+      self.weight[initializedK]:copy(input[indx])
+      initializedK = initializedK + 1
+   end
+end
+
+-- Kmeans updateOutput (forward)
+-- input: contiguous 2D tensor (batchSize x dim). Returns, for every sample, the
+-- index of its nearest center; the summed squared distances go to self.loss.
+function Kmeans:updateOutput(input)
+   local inputDim = input:nDimension()
+   assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+
+   local batchSize = input:size(1)
+   local dim = input:size(2)
+   assert(dim == self.dim, "Dimensionality of input and weight don't match.")
+   assert(input:isContiguous(), "Input is not contiguous.")
+
+   -- a sample copied k times to compute distance between sample and weight
+   self._expandedSamples = self._expandedSamples or self.weight.new()
+
+   -- distance between a sample and weight
+   self._clusterDistances = self._clusterDistances or self.weight.new()
+   -- scratch views of the input, reused across calls
+   self._temp = self._temp or input.new()
+   self._tempExpanded = self._tempExpanded or input.new()
+
+   -- Expanding inputs
+   self._temp:view(input, 1, batchSize, self.dim)
+   self._tempExpanded:expand(self._temp, self.k, batchSize, self.dim)
+   self._expandedSamples:resize(self.k, batchSize, self.dim)
+                        :copy(self._tempExpanded)
+
+   -- Expanding weights
+   self._tempWeight = self._tempWeight or self.weight.new()
+   self._tempWeightExp = self._tempWeightExp or self.weight.new()
+   -- reuse the cached buffer (a misspelled field name used to allocate a new one per call)
+   self._expandedWeight = self._expandedWeight or self.weight.new()
+   self._tempWeight:view(self.weight, self.k, 1, self.dim)
+   self._tempWeightExp:expand(self._tempWeight, self._expandedSamples:size())
+   self._expandedWeight:resize(self.k, batchSize, self.dim)
+                       :copy(self._tempWeightExp)
+
+   -- x-c
+   self._expandedSamples:add(-1, self._expandedWeight)
+   -- Squared Euclidean distance
+   self._clusterDistances:norm(self._expandedSamples, 2, 3)
+   self._clusterDistances:pow(2)
+   self._clusterDistances:resize(self.k, batchSize)
+   -- nearest center (minimum over the k dimension) for every sample
+   self._minScore = self._minScore or self.weight.new()
+   self._minIndx = self._minIndx or torch.LongTensor()
+   self._minScore:min(self._minIndx, self._clusterDistances, 1)
+   self._minIndx:resize(batchSize)
+   self.output:resize(batchSize):copy(self._minIndx)
+   self.loss = self._minScore:sum()
+   return self.output
+end
+
+-- Kmeans has its own criterion hence gradInput are zeros
+function Kmeans:updateGradInput(input, gradOuput)
+   local zeros = self.gradInput
+   zeros:resize(input:size()):zero() -- same size as input
+   return zeros
+end
+
+-- We define kmeans update rule as c -> c + scale * 1/n * sum_i (x-c).
+-- n is no. of x's belonging to c.
+-- gradWeight accumulates the negative of that step, so that gradient descent applies it.
+function Kmeans:accGradParameters(input, gradOutput, scale)
+   local scale = self.scale or scale or 1 -- self.scale takes precedence over the argument
+   assert(scale > 0 , " Scale has to be positive.")
+
+   -- Update cluster sample count
+   local batchSize = input:size(1)
+   self._cscAdder = self._cscAdder or self.weight.new()
+   self._cscAdder:resize(batchSize):fill(1)
+   self.clusterSampleCount:zero()
+   self.clusterSampleCount:indexAdd(1, self._minIndx, self._cscAdder) -- _minIndx is set by updateOutput
+   
+   -- -scale * (x - c[k]) per sample, where c[k] is the center nearest to x
+   self._gradWeight = self._gradWeight or self.gradWeight.new()
+   self._gradWeight:index(self.weight, 1, self._minIndx)
+   self._gradWeight:mul(-1) 
+   self._gradWeight:add(input)
+   self._gradWeight:mul(-scale)
+   -- sum the per-sample terms into the row of their cluster
+   self._gradWeight2 = self._gradWeight2 or self.gradWeight.new()
+   self._gradWeight2:resizeAs(self.gradWeight):zero()
+   self._gradWeight2:indexAdd(1, self._minIndx, self._gradWeight)
+   
+   -- scale/n * sum_i (x-c)
+   self._ccounts = self._ccounts or self.clusterSampleCount.new()
+   self._ccounts:resize(self.k):copy(self.clusterSampleCount)
+   self._ccounts:add(0.0000001) -- prevent division by zero errors
+   
+   self._gradWeight2:cdiv(self._ccounts:view(self.k,1):expandAs(self.gradWeight))
+   
+   self.gradWeight:add(self._gradWeight2)
+end
+
+function Kmeans:type(type, tensorCache)
+   if type then
+      -- prevent premature memory allocations: drop the scratch buffers, which
+      -- updateOutput/accGradParameters re-create on demand
+      local scratch = {
+         '_expandedSamples', '_clusterDistances', '_temp', '_tempExpanded',
+         '_tempWeight', '_tempWeightExp', '_expandedWeight', '_minScore',
+         '_minIndx', '_cscAdder',
+      }
+      for _, name in ipairs(scratch) do
+         self[name] = nil
+      end
+   end
+
+   return parent.type(self, type, tensorCache)
+end
diff --git a/LookupTable.lua b/LookupTable.lua
new file mode 100644
index 0000000..668afa8
--- /dev/null
+++ b/LookupTable.lua
@@ -0,0 +1,17 @@
+local LookupTable, parent = nn.LookupTable, nn.Module
+
+function LookupTable:maxParamNorm(maxOutNorm, maxInNorm) -- renormalizes self.weight in place
+   maxOutNorm = self.maxOutNorm or maxOutNorm or self.maxInNorm or maxInNorm
+   if not (maxOutNorm or maxInNorm) then
+      return
+   end
+   -- NOTE(review): maxOutNorm falls back to the in-norm values above, while self.maxInNorm never reaches maxInNorm -- confirm intent
+   if maxOutNorm and maxOutNorm > 0 then
+      -- cols feed into output neurons
+      self.weight:renorm(2, 2, maxOutNorm)
+   end
+   if maxInNorm and maxInNorm > 0 then
+      -- rows feed out from input neurons
+      self.weight:renorm(2, 1, maxInNorm)
+   end
+end
diff --git a/Module.lua b/Module.lua
index 05b6085..5aa5145 100644
--- a/Module.lua
+++ b/Module.lua
@@ -39,6 +39,403 @@ function Module:setOutputStep(step)
    end
 end
 
+function Module:sparseParameters() -- default: identical to parameters()
+   return self:parameters()
+end
+
+function Module:updateParameters(learningRate)
+   -- sparse params can have different learningRate scales per param
+   local params, gradParams, scales = self:sparseParameters()
+   if not params then return end
+   for idx, param in pairs(params) do -- pairs for sparse params
+      local scale = 1
+      if scales and scales[idx] then scale = scales[idx] end
+      param:add(-learningRate*scale, gradParams[idx])
+   end
+end
+
+function Module:zeroGradParameters()
+   -- the second value returned by sparseParameters holds the gradient tensors
+   local _, gradParams = self:sparseParameters()
+   if not gradParams then
+      return
+   end
+   for _, gradParam in pairs(gradParams) do gradParam:zero() end -- pairs for sparse params
+end
+
+------------------------ clone and type --------------------------------
+
+Module.dpnn_parameters = {'weight', 'bias'} -- attribute names sharedClone treats as parameters
+Module.dpnn_gradParameters = {'gradWeight', 'gradBias'} -- likewise for gradient attributes
+
+-- efficient version of :
+-- clone = self:clone()
+-- clone:share(self, paramNames, gradParamNames)
+-- Note that this method is the very bane of my existence.
+-- I have worked on it too many times...
+function Module:sharedClone(shareParams, shareGradParams, stepClone) -- returns the clone
+   shareParams = (shareParams == nil) and true or shareParams -- defaults to true
+   shareGradParams = (shareGradParams == nil) and true or shareGradParams -- defaults to true
+
+   if stepClone and self.dpnn_stepclone then
+      -- this is for AbstractRecurrent modules (in rnn)
+      return self
+   end
+
+   local pointers = {} -- to params/gradParams (dont clone params/gradParams)
+   local scdone = {} -- torch.pointer(obj) -> moduleTree of the objects already visited
+
+   -- 1. remove all params/gradParams
+   -- returns (what was removed from obj, obj itself or nil when obj must not be cloned)
+   local function recursiveRemove(obj) -- remove modules
+      local moduleTree
+      local isTable = type(obj) == 'table'
+      if torch.isTypeOf(obj, 'nn.Module') then
+         assert(isTable)
+         if stepClone and obj.dpnn_stepclone then
+            -- this is for AbstractRecurrent modules (in rnn)
+            moduleTree = obj
+            obj = nil
+            isTable = false
+         elseif obj.dpnn_sharedClone then
+            -- allow to use a custom sharedClone method on one module
+            moduleTree = obj
+            obj = nil
+            isTable = false
+         elseif scdone[torch.pointer(obj)] then
+            moduleTree = scdone[torch.pointer(obj)]
+         else
+            -- remove the params, gradParams. Save for later.
+            local params = {}
+
+            if shareParams then
+               for i,paramName in ipairs(obj.dpnn_parameters) do
+                  local param = obj[paramName]
+                  if param then
+                     params[paramName] = param
+                     obj[paramName] = nil
+                     if torch.isTensor(param) and param.storage and param:storage() then
+                        pointers[torch.pointer(param:storage():data())] = true
+                     end
+                  end
+               end
+            end
+
+            if shareGradParams then
+               for i,paramName in ipairs(obj.dpnn_gradParameters) do
+                  local gradParam = obj[paramName]
+                  if gradParam then
+                     params[paramName] = gradParam
+                     obj[paramName] = nil
+                     if torch.isTensor(gradParam) and gradParam.storage and gradParam:storage() then
+                        pointers[torch.pointer(gradParam:storage():data())] = true
+                     end
+                  end
+               end
+            end
+
+            -- find all obj.attribute tensors that share storage with the shared params
+            for paramName, param in pairs(obj) do
+               if torch.isTensor(param) and param:storage() then
+                  if pointers[torch.pointer(param:storage():data())] then
+                     params[paramName] = param
+                     obj[paramName] = nil
+                  end
+               end
+            end
+
+            moduleTree = params
+
+            scdone[torch.pointer(obj)] = moduleTree
+
+            for k,v in pairs(obj) do
+               moduleTree[k], obj[k] = recursiveRemove(v)
+            end
+
+         end
+      elseif isTable then
+         if scdone[torch.pointer(obj)] then
+            moduleTree = scdone[torch.pointer(obj)]
+         else
+            assert(not moduleTree)
+            moduleTree = {}
+            for k,v in pairs(obj) do
+               moduleTree[k], obj[k] = recursiveRemove(v)
+            end
+            scdone[torch.pointer(obj)] = moduleTree
+         end
+
+      end
+
+      return moduleTree, obj
+   end
+
+   local moduleTree, original = recursiveRemove(self)
+   assert(original)
+
+   -- 2. clone everything but parameters, gradients and modules (removed above)
+
+   local clone = self:clone()
+
+   -- 3. add back to self/clone everything that was removed in step 1
+
+   local function recursiveSet(clone, original, moduleTree)
+      assert(clone)
+      assert(original)
+      if scdone[torch.pointer(original)] then
+         for k,param in pairs(moduleTree) do
+            if torch.isTypeOf(param,'nn.Module') then
+               if param.dpnn_sharedClone then
+                  -- Call the custom sharedClone
+                  clone[k] = param:dpnn_sharedClone()
+               else
+                  -- AbstractRecurrent instances branch here with stepClone = true
+                  clone[k] = param
+               end
+               original[k] = param
+            elseif torch.isTensor(param) then
+               if param.storage then
+                  clone[k] = param.new():set(param) -- new tensor object set to the original's data
+                  original[k] = param
+               else -- for torch.MultiCudaTensor
+                  clone[k] = param
+                  original[k] = param
+               end
+            elseif type(param) == 'table' then
+               recursiveSet(clone[k], original[k], param)
+            end
+         end
+         scdone[torch.pointer(original)] = nil -- restore each visited object only once
+      end
+
+   end
+
+   recursiveSet(clone, self, moduleTree)
+
+   return clone
+end
+
+-- we override this method such that hidden modules
+-- will be included in the getParameters call.
+-- Hidden modules are common for recurrent modules that
+-- have internal references to modules that share parameters
+-- with the main modules.
+-- These must also be included in the getParameters() call in order
+-- to maintain shared storage for tensors.
+function Module:getParameters()
+   -- temporary container used only to collect every reachable module
+   local collector = nn.Container()
+   collector:add(self)
+
+   -- recursive get all modules (modules, sharedclones, etc.)
+   local function collect(tbl)
+      for _, item in pairs(tbl) do
+         local isModule = torch.isTypeOf(item, 'nn.Module')
+         if isModule and not item.dpnn_getParameters_found then
+            collector:add(item)
+            item.dpnn_getParameters_found = true -- marker: collect each module once
+            collect(item)
+         elseif not isModule and torch.type(item) == 'table' then
+            collect(item)
+         end
+      end
+   end
+
+   collect(self)
+
+   -- remove the temporary markers again
+   for _, m in ipairs(collector.modules) do
+      m.dpnn_getParameters_found = nil
+   end
+
+   -- get ALL parameters
+   local parameters, gradParameters = collector:parameters()
+   return Module.flatten(parameters), Module.flatten(gradParameters)
+end
+
+----------------- serialization (see nn.Serial) -------------------
+
+Module.dpnn_mediumEmpty = {'output', 'gradInput', 'momGradParams', 'dpnn_input'} -- emptied by mediumSerial
+Module.dpnn_lightEmpty = Module.dpnn_gradParameters -- additionally emptied by lightSerial (same table object)
+-- defaults to heavy serialization (no attribute is emptied)
+Module.dpnn_serialEmpty = {}
+
+-- sets the serialization behavior (attribute names listed in empty) of the entire module structure
+function Module:serialMode(empty)
+   assert(torch.type(empty) == 'table', "Expecting table at arg 1")
+   self.dpnn_serialEmpty = empty
+   -- propagate to every module nested (at any table depth) inside this one
+   local function propagate(container)
+      for _, member in pairs(container) do
+         if torch.isTypeOf(member, 'nn.Module') then
+            member:serialMode(empty)
+         elseif torch.type(member) == 'table' then
+            propagate(member)
+         end
+      end
+   end
+   propagate(self)
+   return self
+end
+
+-- serialMode : serialize everything
+function Module:heavySerial() -- an empty list: no attribute gets emptied
+   return self:serialMode({})
+end
+
+-- serialMode : serialize everything except dpnn_mediumEmpty attributes
+function Module:mediumSerial()
+   -- each module uses its own dpnn_mediumEmpty list
+   self.dpnn_serialEmpty = self.dpnn_mediumEmpty
+
+   -- set the serial of all encapsulated modules
+   local function descend(container)
+      for _, member in pairs(container) do
+         if torch.isTypeOf(member, 'nn.Module') then
+            member:mediumSerial()
+         elseif torch.type(member) == 'table' then
+            descend(member)
+         end
+      end
+   end
+   descend(self)
+   return self
+end
+
+-- serialMode : serialize everything except dpnn_mediumEmpty and dpnn_lightEmpty attributes
+function Module:lightSerial()
+   -- start from a copy of the medium list, then append the light names
+   local emptied = _.clone(self.dpnn_mediumEmpty)
+   self.dpnn_serialEmpty = emptied
+   for idx, name in ipairs(self.dpnn_lightEmpty) do
+      emptied[#emptied + 1] = name
+   end
+
+   -- set the serial of all encapsulated modules
+   local function descend(container)
+      for key, member in pairs(container) do
+         if torch.isTypeOf(member, 'nn.Module') then
+            member:lightSerial()
+         elseif torch.type(member) == 'table' then
+            descend(member)
+         end
+      end
+   end
+   descend(self)
+   return self
+end
+
+function Module:getSerialState(states) -- states: optional memo table (object -> state), shared across the recursion
+   states = states or {}
+
+   -- dont get the serial state of the same module twice (reuse existing)
+   if states[self] then
+      return states[self]
+   end
+
+   local _ = require 'moses'
+   -- returns the object structure as tables (i.e. without metatables)
+   local function recursiveState(tbl)
+      local state = _.map(tbl,
+         function(k,v) -- NOTE(review): relies on moses passing (key, value) to the callback -- confirm for the installed version
+            if torch.isTypeOf(tbl, 'nn.Module') and _.contains(tbl.dpnn_serialEmpty, k) then
+               -- "empties" module attributes found in empty
+               if torch.type(v) == 'table' then
+                  -- empty table
+                  return {}
+               elseif torch.isTensor(v) then
+                  -- empty tensor
+                  return v.new()
+               else
+                  -- not table nor tensor? then serialize as is
+                  return v
+               end
+            elseif torch.isTypeOf(v, 'nn.Module') then
+               -- recursive, yet can be overwritten
+               return v:getSerialState(states)
+            elseif torch.type(v) == 'table' then
+               -- in case it is a table of modules
+               if not states[v] then
+                  states[v] = recursiveState(v)
+               end
+               return states[v]
+            else
+               return v
+            end
+         end
+      )
+      return state
+   end
+
+   local state = recursiveState(self)
+
+   -- include typename so that module can be reconstructed from the state
+   state.dpnn_typename = torch.type(self)
+   states[self] = state
+
+   return state
+end
+
+-- decorates self with nn.Serial
+function Module:Serial(tensortype) -- tensortype is handed to nn.Serial unchanged
+   return nn.Serial(self, tensortype)
+end
+
+----------------------- for training -----------------------------
+
+-- useful to get the output size: forwards a random input of size insize
+-- I chose this method name because it is less likely to be overridden.
+function Module:outside(insize)
+   local input
+   if torch.type(insize) == 'table' then
+      -- table.unpack is missing on plain Lua 5.1, where unpack is a global
+      input = torch.randn((table.unpack or unpack)(insize))
+   else
+      input = torch.randn(insize)
+   end
+   return self:updateOutput(input):size()
+end
+
+-- for those interested in implementing the visitor design pattern
+function Module:accept(visitor) -- visitor must provide a visit(module) method
+   visitor:visit(self)
+end
+
+-- Can be used as a regularizer instead of weight decay
+-- Assumes that parameters are arranged (output dim x ... x input dim)
+function Module:maxParamNorm(maxOutNorm, maxInNorm)
+   -- this allows each module to set its own max[Out,In]Norm
+   maxOutNorm = self.maxOutNorm or maxOutNorm
+   maxInNorm = self.maxInNorm or maxInNorm
+   if not (maxOutNorm or maxInNorm) then
+      return
+   end
+   -- containers only forward the (possibly overridden) norms to their children
+   if self.modules then
+      for i,module in ipairs(self.modules) do
+         module:maxParamNorm(maxOutNorm, maxInNorm)
+      end
+   else
+      local params = self:parameters()
+      -- nothing to constrain without parameters. Only params is tested: also testing
+      -- an undeclared global gradParams would skip the constraint whenever a script defines one.
+      if not params then return end
+      for k,param in pairs(params) do -- pairs for sparse params
+         -- By default, only affects non-1D params.
+         if param:dim() > 1 then
+            if maxOutNorm and maxOutNorm > 0 then
+               -- rows feed into output neurons
+               param:renorm(2, 1, maxOutNorm)
+            end
+            if maxInNorm and maxInNorm > 0 then
+               -- cols feed out from input neurons
+               param:renorm(2, param:dim(), maxInNorm)
+            end
+         end
+      end
+   end
+end
+
+
 -- set the maximum number of backpropagation through time (BPTT) time-steps
 function Module:maxBPTTstep(rho)
    if self.modules then
@@ -84,4 +481,232 @@ function Module:setGradHiddenState(step, gradHiddenState)
          module:setGradHiddenState(step, gradHiddenState[i])
       end
    end
-end
\ No newline at end of file
+end
+
+-- Similar to maxParamNorm, but norm is global to Module for which
+-- this is called. Unless moduleLocal is true, in which case, the
+-- norm constraint is applied to the norm of all parameters in each
+-- component (non-container) module.
+function Module:gradParamClip(cutoffNorm, moduleLocal) -- returns the gradient norm measured before rescaling
+   -- this allows each module to set its own cutoffNorm
+   cutoffNorm = self.cutoffNorm or cutoffNorm
+   if cutoffNorm <= 0 then
+      return 0 -- clipping disabled : report 0 (not nil) so the recursive sum below stays numeric
+   end
+   if self.moduleLocal ~= nil then
+      moduleLocal = self.moduleLocal
+   end
+   -- moduleLocal containers clip each child separately and combine the children's norms
+   local norm = 0
+   if moduleLocal and self.modules then
+      for i,module in ipairs(self.modules) do
+         norm = norm + math.pow(module:gradParamClip(cutoffNorm, moduleLocal), 2)
+      end
+      norm = math.sqrt(norm)
+   else
+      local params, gradParams = self:parameters()
+      if not (params and gradParams) then
+         return norm
+      end
+      for k,gradParam in pairs(gradParams) do -- pairs for sparse params
+         if torch.type(gradParam) == 'torch.CudaTensor' then
+            cutorch.withDevice(gradParam:getDevice(), function() -- support multi-device models
+               norm = norm + math.pow(gradParam:norm(),2)
+            end)
+         else
+            norm = norm + math.pow(gradParam:norm(),2)
+         end
+      end
+      norm = math.sqrt(norm)
+      if norm > cutoffNorm then
+         -- rescale gradParams to obtain desired cutoffNorm
+         for k,gradParam in pairs(gradParams) do
+            if torch.type(gradParam) == 'torch.CudaTensor' then
+               cutorch.withDevice(gradParam:getDevice(), function() -- support multi-device models
+                  gradParam:mul(cutoffNorm/norm)
+               end)
+            else
+               gradParam:mul(cutoffNorm/norm)
+            end
+         end
+      end
+   end
+   return norm
+end
+
+-- Adds weight decay to the gradients of params with dim() >= wdMinDim (default 2).
+-- TODO : allow inplace weightDecay (before calling accUpdateGradParameters)
+function Module:weightDecay(wdFactor, wdMinDim)
+   -- this allows each module to set its own hyper-parameters
+   wdFactor = self.wdFactor or wdFactor
+   if wdFactor <= 0 then
+      return
+   end
+   wdMinDim = self.wdMinDim or wdMinDim or 2
+   -- containers delegate to their children; other modules update their own gradParams
+   if self.modules then
+      for i,module in ipairs(self.modules) do
+         module:weightDecay(wdFactor, wdMinDim)
+      end
+   else
+      local params, gradParams = self:parameters()
+      if not (params and gradParams) then
+         return
+      end
+      -- params and gradParams are indexed with the same keys
+      for i,param in pairs(params) do -- pairs for sparse params
+         if param:dim() >= wdMinDim then
+            gradParams[i]:add(wdFactor, param) -- gradParam = gradParam + wdFactor * param
+         end
+      end
+   end
+end
+
+function Module:momentumGradParameters() -- lazily creates and returns self.momGradParams
+   if (not self.momGradParams) or _.isEmpty(self.momGradParams) then
+      local params, gradParams = self:parameters()
+      if not gradParams or _.isEmpty(gradParams) then
+         return -- no gradients to track : returns nil
+      end
+      self.momGradParams = {}
+      for i,gradParam in pairs(gradParams) do -- each buffer starts as a copy of the current gradient
+         if torch.type(gradParam) == 'torch.CudaTensor' then
+            cutorch.withDevice(gradParam:getDevice(), function() -- support multi-device models
+               self.momGradParams[i] = gradParam.new():resizeAs(gradParam):copy(gradParam)
+            end)
+         else
+            self.momGradParams[i] = gradParam.new():resizeAs(gradParam):copy(gradParam)
+         end
+      end
+   end
+   return self.momGradParams
+end
+
+-- uses momentum learning to update gradParams (gradParams are modified in place)
+function Module:updateGradParameters(momFactor, momDamp, momNesterov)
+   -- this allows each module to set its own hyper-parameters
+   momFactor = self.momFactor or momFactor
+   if momFactor <= 0 then
+      return
+   end
+   momDamp = self.momDamp or momDamp or momFactor -- dampening defaults to momFactor
+   if self.momNesterov ~= nil then
+      momNesterov = self.momNesterov
+   end
+   -- containers delegate to their children; other modules update their own gradParams
+   if self.modules then
+      for i,module in ipairs(self.modules) do
+         module:updateGradParameters(momFactor, momDamp, momNesterov)
+      end
+   else
+      local params, gradParams = self:parameters()
+      if (not params) or _.isEmpty(params) then
+         return
+      end
+      local momGradParams = self:momentumGradParameters()
+      for i,gradParam in pairs(gradParams) do
+         momGradParams[i]:mul(momFactor) -- buffer = momFactor * buffer
+         momGradParams[i]:add(1-momDamp, gradParam) -- buffer = buffer + (1-momDamp) * gradParam
+      end
+      -- write the result back into gradParams
+      if momNesterov then
+         for i,gradParam in pairs(gradParams) do
+            gradParam:add(momFactor, momGradParams[i]) -- gradParam = gradParam + momFactor * buffer
+         end
+      else
+         for i,gradParam in pairs(gradParams) do
+            gradParam:copy(momGradParams[i])
+         end
+      end
+   end
+end
+
+function Module:checkParameters() -- raises an error if the sum of any parameter tensor is NaN
+   local params = self:parameters() or {}
+   for k,param in pairs(params) do
+      if _.isNaN(param:sum()) then
+         error("NaN Error for param at index " ..k) -- space added so the index is readable
+      end
+   end
+end
+
+function Module:dontBackward() -- turns every backward-pass method of this instance into a no-op
+   self.backward = function() end
+   self.updateGradInput = function() end
+   self.accGradParameters = function() end
+   self.accUpdateGradParameters = function() end
+   return self -- allows chaining
+end
+
+function Module:contiguousInput(input, backward) -- returns input, or a contiguous copy of it
+   if backward then
+      return self.dpnn_cinput or input -- NOTE(review): the buffer is never cleared, so it may be stale
+   end
+   if input:isContiguous() then
+      return input
+   end
+   self.dpnn_cinput = self.dpnn_cinput or input.new() -- copy buffer, allocated once and reused
+   self.dpnn_cinput:resizeAs(input):copy(input)
+   return self.dpnn_cinput
+end
+
+function Module:toBatch(tensor, nDim, batchDim) -- views an nDim-dimensional tensor as a batch of one
+   local batchDim = batchDim or 1
+   if tensor:dim() == nDim then
+      self.dpnn_online = true -- read back by fromBatch to undo the reshaping
+      local size = tensor:size():totable()
+      table.insert(size, batchDim, 1) -- singleton dimension at position batchDim
+      tensor = tensor:view(table.unpack(size))
+   else
+      self.dpnn_online = false
+   end
+   return tensor
+end
+
+function Module:fromBatch(tensor, batchDim) -- undoes toBatch : removes the singleton batch dimension
+   if self.dpnn_online then
+      local size = tensor:size():totable()
+      assert(table.remove(size, batchDim or 1) == 1) -- default 1 as in toBatch (nil would pop the last dim)
+      tensor = tensor:view(table.unpack(size))
+   end
+   return tensor
+end
+
+function Module:extrapolateType() -- returns the most frequent tensor type among self's parameters
+   local params = self:parameters() -- fixed : previously indexed the global `module` instead of self
+   if params then
+      -- extrapolate the tensor type of the module
+      local types = {}
+      for i, param in ipairs(params) do
+         local tensorType = torch.type(param)
+         types[tensorType] = (types[tensorType] or 0) + 1
+      end
+      local maxCount = 0
+      local maxType
+      for tensorType, count in pairs(types) do -- ties are resolved by pairs order (unspecified)
+         if count > maxCount then
+            maxType = tensorType -- fixed : `maxtype` wrote a global, so nil was always returned
+            maxCount = count
+         end
+      end
+      return maxType
+   end
+   return nil --unknown otherwise
+end
+
+function Module:profile() -- sets dpnn_profile = true on self and, recursively, on all sub-modules
+   if self.modules then
+      for i, module in ipairs(self.modules) do
+         module:profile()
+      end
+   end
+   self.dpnn_profile = true
+end
+
+function Module:reinforce(reward) -- forwards reward to every sub-module; does nothing else here
+   if self.modules then
+      for i, module in ipairs(self.modules) do
+         module:reinforce(reward)
+      end
+   end
+end
diff --git a/ModuleCriterion.lua b/ModuleCriterion.lua
new file mode 100644
index 0000000..bfc79ef
--- /dev/null
+++ b/ModuleCriterion.lua
@@ -0,0 +1,44 @@
+local ModuleCriterion, parent = torch.class("nn.ModuleCriterion", "nn.Criterion")
+
+function ModuleCriterion:__init(criterion, inputModule, targetModule, castTarget)
+   self.inputModule = inputModule -- optional : forwarded on input before the criterion
+   self.targetModule = targetModule -- optional : forwarded on target before the criterion
+   self.castTarget = (castTarget == nil) and true or castTarget -- default true; read by type()
+   if self.inputModule then
+      local params = self.inputModule:parameters()
+      if params and #params > 0 then
+         print"Warning: nn.ModuleCriterion doesn't support parameter updates"
+      end
+   end
+   self.criterion = criterion -- the wrapped criterion
+end
+
+function ModuleCriterion:updateOutput(input, target) -- loss of the (optionally) transformed pair
+   local inputModule, targetModule = self.inputModule, self.targetModule
+   if inputModule then self.input = inputModule:forward(input) end
+   if targetModule then self.target = targetModule:forward(target) end
+   -- fall back to the raw arguments when no transformed value has been stored
+   local criterionInput = self.input or input
+   local criterionTarget = self.target or target
+   self.output = self.criterion:forward(criterionInput, criterionTarget)
+   return self.output
+end
+
+function ModuleCriterion:updateGradInput(input, target) -- gradient w.r.t. the raw (untransformed) input
+   self.gradInput = self.criterion:backward(self.input or input, self.target or target)
+   if self.inputModule then
+      self.gradInput = self.inputModule:backward(input, self.gradInput) -- chain through inputModule
+   end
+   return self.gradInput -- note : targetModule is never backpropagated through
+end
+
+function ModuleCriterion:type(type, typecache) -- casts the wrapped modules and criterion, then self
+   if self.inputModule then
+      self.inputModule:type(type, typecache)
+   end
+   if self.castTarget and self.targetModule then -- targetModule is left untouched when castTarget is false
+      self.targetModule:type(type, typecache)
+   end
+   self.criterion:type(type, typecache)
+   return parent.type(self, type, typecache)
+end
diff --git a/NCECriterion.lua b/NCECriterion.lua
new file mode 100644
index 0000000..1a6b935
--- /dev/null
+++ b/NCECriterion.lua
@@ -0,0 +1,102 @@
+------------------------------------------------------------------------
+--[[ Noise Contrast Estimation Criterion ]]--
+-- Ref.: A. http://mi.eng.cam.ac.uk/~xc257/papers/ICASSP2015-rnnlm-nce.pdf
+--       B. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
+------------------------------------------------------------------------
+local NCECriterion, parent = torch.class("nn.NCECriterion", "nn.Criterion")
+local eps = 0.0000001
+
+function NCECriterion:__init()
+   parent.__init(self)
+   self.sizeAverage = true -- when true, loss and gradients are divided by Pmt:size(1)
+   -- one gradient tensor per element of the input table {Pmt, Pms, Pnt, Pns}
+   self.gradInput = {torch.Tensor(), torch.Tensor(), torch.Tensor(), torch.Tensor()}
+end
+
+function NCECriterion:updateOutput(inputTable, target) -- returns the NCE loss; target is not used
+   -- P_model(target), P_model(sample), P_noise(target), P_noise(sample)
+   local Pmt, Pms, Pnt, Pns = unpack(inputTable)
+   local k = Pms:size(2)
+   -- Pmt and Pnt must be 1D, Pms and Pns must be 2D
+   assert(Pmt:dim() == 1)
+   assert(Pms:dim() == 2)
+   assert(Pnt:dim() == 1)
+   assert(Pns:dim() == 2)
+   
+   -- equation 5 in ref. A
+   -- (eps is added to both denominators)
+   -- eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt) 
+   self._Pom = self._Pom or Pmt.new()
+   self._Pom:resizeAs(Pmt):copy(Pmt)
+   self._Pomdiv = self._Pomdiv or Pmt.new() -- denominator, reused by updateGradInput
+   self._Pomdiv:resizeAs(Pmt):copy(Pmt)
+   self._Pomdiv:add(k, Pnt):add(eps)
+   self._Pom:cdiv(self._Pomdiv)
+   
+   -- eq 5.2 : P(origin=noise) = k*Pns / (Pms + k*Pns)
+   self._Pon = self._Pon or Pns.new()
+   self._Pon:resizeAs(Pns):copy(Pns):mul(k)
+   self._Pondiv = self._Pondiv or Pms.new() -- denominator, reused by updateGradInput
+   self._Pondiv:resizeAs(Pms):copy(Pms)
+   self._Pondiv:add(k, Pns):add(eps)
+   self._Pon:cdiv(self._Pondiv)
+   
+   -- equation 6 in ref. A
+   -- output = - ( sum(ln Pom) + sum(ln Pon) )
+   self._lnPom = self._lnPom or self._Pom.new()
+   self._lnPom:log(self._Pom)
+   
+   self._lnPon = self._lnPon or self._Pon.new()
+   self._lnPon:log(self._Pon)
+   
+   local lnPomsum = self._lnPom:sum()
+   local lnPonsum = self._lnPon:sum()
+   
+   self.output = - (lnPomsum + lnPonsum)
+   
+   if self.sizeAverage then
+      self.output = self.output / Pmt:size(1)
+   end
+   
+   return self.output
+end
+
+function NCECriterion:updateGradInput(inputTable, target) -- relies on buffers set by updateOutput
+   assert(#self.gradInput == 4)
+   local Pmt, Pms, Pnt, Pns = unpack(inputTable)
+   local k = Pms:size(2)
+   
+   -- equation 7 in ref. A
+   -- (results are written into self.gradInput[1] and self.gradInput[2])
+   -- d ln(Pom) / d input = -k*Pnt / ( Pmt * (Pmt + k*Pnt) )
+   local dlnPom = self.gradInput[1]
+   dlnPom = dlnPom or Pnt.new()
+   dlnPom:resizeAs(Pnt):copy(Pnt):mul(-k)
+   dlnPom:cdiv(self._Pomdiv)
+   Pmt:add(eps) -- temporarily modifies the input in place to avoid dividing by zero
+   dlnPom:cdiv(Pmt) -- d ln(Pmt) / d Pmt = 1 / Pmt
+   Pmt:add(-eps) -- undo the temporary shift
+   
+   -- d ln(Pon) / d input = Pms / ( Pms * (Pms + k*Pns) )
+   local dlnPon = self.gradInput[2]
+   dlnPon = dlnPon or Pms.new()
+   dlnPon:resizeAs(Pms):copy(Pms)
+   dlnPon:cdiv(self._Pondiv)
+   Pms:add(eps) -- temporarily modifies the input in place to avoid dividing by zero
+   dlnPon:cdiv(Pms) -- d ln(Pms) / d Pms = 1 / Pms
+   Pms:add(-eps) -- undo the temporary shift
+   -- gradients w.r.t. the noise probabilities are zero; only (re)built when the size changes
+   if self.gradInput[3]:nElement() ~= Pnt:nElement() then
+      self.gradInput[3]:resizeAs(Pnt):zero()
+   end
+   if self.gradInput[4]:nElement() ~= Pns:nElement() then
+      self.gradInput[4]:resizeAs(Pns):zero()
+   end
+   
+   if self.sizeAverage then
+      dlnPom:div(Pmt:size(1))
+      dlnPon:div(Pmt:size(1))
+   end
+   
+   return self.gradInput
+end
diff --git a/NCEModule.lua b/NCEModule.lua
new file mode 100644
index 0000000..881cf5a
--- /dev/null
+++ b/NCEModule.lua
@@ -0,0 +1,439 @@
+------------------------------------------------------------------------
+--[[ Noise Contrast Estimation Module]]--
+-- Ref.: A. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
+------------------------------------------------------------------------
+local _ = require 'moses'
+local NCEModule, parent = torch.class("nn.NCEModule", "nn.Linear")
+NCEModule.version = 6 -- better bias init
+
+-- for efficient serialization using nn.Serial
+local empty = _.clone(parent.dpnn_mediumEmpty) -- start from the parent's list and append to it
+table.insert(empty, 'sampleidx')
+table.insert(empty, 'sampleprob')
+table.insert(empty, '_noiseidx')
+table.insert(empty, '_noiseprob')
+table.insert(empty, '_weight')
+table.insert(empty, '_gradWeight')
+table.insert(empty, '_gradOutput')
+table.insert(empty, '_tgradOutput')
+NCEModule.dpnn_mediumEmpty = empty
+
+-- for sharedClone
+local params = _.clone(parent.dpnn_parameters) -- start from the parent's list and append to it
+table.insert(params, 'unigrams')
+table.insert(params, 'Z')
+NCEModule.dpnn_parameters = params
+
+function NCEModule:__init(inputSize, outputSize, k, unigrams, Z)
+   parent.__init(self, inputSize, outputSize)
+   assert(torch.type(k) == 'number')
+   assert(torch.isTensor(unigrams))
+   self.k = k -- number of noise samples
+   self.unigrams = unigrams -- noise distribution (normalized in place by fastNoise)
+   self.Z = torch.Tensor{Z or -1} -- normalization constant; a value <= 0 is estimated in updateOutput
+   -- by default, the same k noise samples are shared by the whole batch
+   self.batchnoise = true
+   
+   self:fastNoise()
+   
+   -- output is {P_linear(target|input), P_linear(samples|input), P_noise(target), P_noise(samples)}
+   self.output = {torch.Tensor(), torch.Tensor(), torch.Tensor(), torch.Tensor()}
+   self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+function NCEModule:reset(stdv) -- (re)initializes weight and bias; returns self
+   if not stdv then
+      stdv = 1./math.sqrt(self.weight:size(2)) -- default range derived from the weight's 2nd dim
+      self.weight:uniform(-stdv, stdv)
+      -- this is useful for Z = 1
+      self.bias:fill(-math.log(self.bias:size(1)))
+   else
+      self.weight:uniform(-stdv, stdv)
+      self.bias:uniform(-stdv, stdv)
+   end
+   return self
+end
+
+function NCEModule:fastNoise()
+   -- we use alias to speedup multinomial sampling (see noiseSample method)
+   require 'torchx'
+   assert(torch.AliasMultinomial, "update torchx : luarocks install torchx")
+   self.unigrams:div(self.unigrams:sum()) -- normalizes unigrams in place so that they sum to 1
+   self.aliasmultinomial = torch.AliasMultinomial(self.unigrams)
+   self.aliasmultinomial.dpnn_parameters = {'J', 'q'}
+end
+
+function NCEModule:updateOutput(inputTable) -- inputTable is {input (2D), target (1D)}
+   local input, target = unpack(inputTable)
+   assert(input:dim() == 2)
+   assert(target:dim() == 1)
+   local batchsize = input:size(1)
+   local inputsize = self.weight:size(2)
+   -- branch 1 : evaluation with self.normalized set; self.output becomes a single tensor
+   if self.train == false and self.normalized then
+      self.linout = self.linout or input.new()
+      -- full linear + softmax
+      local nElement = self.linout:nElement()
+      self.linout:resize(batchsize, self.weight:size(1))
+      if self.linout:nElement() ~= nElement then
+         self.linout:zero()
+      end
+      self.addBuffer = self.addBuffer or input.new()
+      if self.addBuffer:nElement() ~= batchsize then
+         self.addBuffer:resize(batchsize):fill(1)
+      end
+      self.weight.addmm(self.linout, 0, self.linout, 1, input, self.weight:t())
+      if self.bias then self.linout:addr(1, self.addBuffer, self.bias) end
+      self.output = torch.type(self.output) == 'table' and input.new() or self.output
+      if self.logsoftmax then
+         input.THNN.LogSoftMax_updateOutput(
+            self.linout:cdata(),
+            self.output:cdata()
+         )
+      else
+         input.THNN.SoftMax_updateOutput(
+            self.linout:cdata(),
+            self.output:cdata()
+         )
+      end
+   elseif self.batchnoise then -- branch 2 : one set of k noise samples shared by the whole batch
+      self.output = (torch.type(self.output) == 'table' and #self.output == 4) and self.output
+         or {input.new(), input.new(), input.new(), input.new()}
+      assert(torch.type(target) == 'torch.CudaTensor' or torch.type(target) == 'torch.LongTensor')
+      self.sampleidx = self.sampleidx or target.new()
+      
+      -- the last elements contain the target indices
+      self.sampleidx:resize(self.k + batchsize)
+      self.sampleidx:narrow(1,self.k+1,batchsize):copy(target)
+      
+      -- sample k noise samples
+      self:noiseSample(self.sampleidx, 1, self.k)
+      self.sampleidx:resize(self.k + batchsize) -- noiseSample resized it to (1 x k); restore the full length
+      
+      -- build (batchsize+k, inputsize) weight tensor
+      self._weight = self._weight or self.bias.new()
+      self.weight.index(self._weight, self.weight, 1, self.sampleidx)
+      assert(self._weight:nElement() == (self.k+batchsize)*inputsize)
+      self._weight:resize(self.k+batchsize, inputsize)
+      
+      -- build (batchsize+k,) bias tensor
+      self._bias = self._bias or self.bias.new()
+      self._bias:index(self.bias, 1, self.sampleidx)
+      assert(self._bias:nElement() == (self.k+batchsize))
+      self._bias:resize(self.k+batchsize)
+      
+      -- separate sample and target weight matrices and bias vectors
+      local sweight = self._weight:narrow(1, 1, self.k)
+      local tweight = self._weight:narrow(1, self.k+1, batchsize)
+      local sbias = self._bias:narrow(1, 1, self.k)
+      local tbias = self._bias:narrow(1, self.k+1, batchsize)
+      
+      -- get model probability of targets (batchsize,)
+      local Pmt = self.output[1]
+      self._pm = self._pm or input.new()
+      self._pm:cmul(input, tweight)
+      Pmt:sum(self._pm, 2):resize(batchsize)
+      Pmt:add(tbias)
+      Pmt:exp()
+      
+      -- get model probability of samples (batchsize x k) samples
+      local Pms = self.output[2]
+      Pms:resize(batchsize, self.k)
+      Pms:copy(sbias:view(1,self.k):expand(batchsize, self.k))
+      Pms:addmm(1, Pms, 1, input, sweight:t())
+      Pms:exp()
+      
+      if self.Z[1] <= 0 then
+         -- approximate Z using current batch
+         self.Z[1] = Pms:mean()*self.weight:size(1)
+         print("normalization constant Z approximated to "..self.Z[1])
+      end
+      
+      -- divide by normalization constant
+      Pms:div(self.Z[1]) 
+      Pmt:div(self.Z[1])
+      
+      -- get noise probability (pn) for all samples
+      
+      self.sampleprob = self.sampleprob or Pms.new()
+      self.sampleprob = self:noiseProb(self.sampleprob, self.sampleidx)
+      
+      local Pnt = self.sampleprob:narrow(1,self.k+1,target:size(1))
+      local Pns = self.sampleprob:narrow(1,1,self.k)
+      Pns = Pns:resize(1, self.k):expand(batchsize, self.k)
+      
+      self.output[3]:set(Pnt)
+      self.output[4]:set(Pns)
+   else -- branch 3 : a separate set of k noise samples for every example of the batch
+      self.output = (torch.type(self.output) == 'table' and #self.output == 4) and self.output
+         or {input.new(), input.new(), input.new(), input.new()}
+      self.sampleidx = self.sampleidx or target.new()
+      
+      -- the first column will contain the target indices
+      self.sampleidx:resize(batchsize, self.k+1)
+      self.sampleidx:select(2,1):copy(target)
+      
+      self._sampleidx = self._sampleidx or self.sampleidx.new()
+      self._sampleidx:resize(batchsize, self.k)
+      
+      -- sample (batchsize x k) noise samples
+      self:noiseSample(self._sampleidx, batchsize, self.k)
+      
+      self.sampleidx:narrow(2,2,self.k):copy(self._sampleidx)
+      
+      -- make sure that targets are still first column of sampleidx
+      if not self.testedtargets then
+         for i=1,math.min(target:size(1),3) do
+            assert(self.sampleidx[{i,1}] == target[i])
+         end
+         self.testedtargets = true
+      end
+      
+      -- build (batchsize x k+1 x inputsize) weight tensor
+      self._weight = self._weight or self.bias.new()
+      self.weight.index(self._weight, self.weight, 1, self.sampleidx:view(-1))
+      assert(self._weight:nElement() == batchsize*(self.k+1)*inputsize)
+      self._weight:resize(batchsize, self.k+1, inputsize)
+      
+      -- build (batchsize x k+1) bias tensor
+      self._bias = self._bias or self.bias.new()
+      self._bias:index(self.bias, 1, self.sampleidx:view(-1))
+      assert(self._bias:nElement() == batchsize*(self.k+1))
+      self._bias:resize(batchsize, self.k+1)
+      
+      -- get model probability (pm) of sample and target (batchsize x k+1) samples
+      self._pm = self._pm or input.new()
+      self._pm:resizeAs(self._bias):copy(self._bias)
+      self._pm:resize(batchsize, 1, self.k+1)
+      local _input = input:view(batchsize, 1, inputsize)
+      self._pm:baddbmm(1, self._pm, 1, _input, self._weight:transpose(2,3))
+      self._pm:resize(batchsize, self.k+1)
+      self._pm:exp()
+      
+      if self.Z[1] <= 0 then
+         -- approximate Z using current batch
+         self.Z[1] = self._pm:mean()*self.weight:size(1)
+         print("normalization constant Z approximated to "..self.Z[1])
+      end
+      
+      self._pm:div(self.Z[1]) -- divide by normalization constant
+      
+      -- separate target from sample model probabilities
+      local Pmt = self._pm:select(2,1)
+      local Pms = self._pm:narrow(2,2,self.k)
+      
+      self.output[1]:set(Pmt)
+      self.output[2]:set(Pms)
+      
+      -- get noise probability (pn) for all samples
+      
+      self.sampleprob = self.sampleprob or self._pm.new()
+      self.sampleprob = self:noiseProb(self.sampleprob, self.sampleidx)
+      
+      local Pnt = self.sampleprob:select(2,1)
+      local Pns = self.sampleprob:narrow(2,2,self.k)
+      
+      self.output[3]:set(Pnt)
+      self.output[4]:set(Pns)
+   end
+   
+   return self.output
+end
+
+function NCEModule:updateGradInput(inputTable, gradOutput) -- relies on buffers set by updateOutput
+   local input, target = unpack(inputTable)
+   assert(input:dim() == 2)
+   assert(target:dim() == 1)
+   local dPmt, dPms = gradOutput[1], gradOutput[2] -- gradOutput[3] and [4] are not read
+   local batchsize = input:size(1)
+   local inputsize = self.weight:size(2)
+   
+   if self.batchnoise then
+      local Pmt, Pms = self.output[1], self.output[2]
+      
+      -- separate sample and target weight matrices
+      local sweight = self._weight:narrow(1, 1, self.k)
+      local tweight = self._weight:narrow(1, self.k+1, batchsize)
+      
+      -- the rest of equation 7
+      -- d Pm / d linear = exp(linear)/z
+      self._gradOutput = self._gradOutput or dPms.new()
+      self._tgradOutput = self._tgradOutput or dPmt.new()
+      self._gradOutput:cmul(dPms, Pms)
+      self._tgradOutput:cmul(dPmt, Pmt)
+      
+      -- gradient of linear
+      self.gradInput[1] = self.gradInput[1] or input.new()
+      self.gradInput[1]:cmul(self._tgradOutput:view(batchsize, 1):expandAs(tweight), tweight)
+      self.gradInput[1]:addmm(1, 1, self._gradOutput, sweight)
+   else
+      -- the rest of equation 7 (combine both sides of + sign into one tensor)
+      self._gradOutput = self._gradOutput or dPmt.new()
+      self._gradOutput:resize(batchsize, self.k+1)
+      self._gradOutput:select(2,1):copy(dPmt)
+      self._gradOutput:narrow(2,2,self.k):copy(dPms)
+      self._gradOutput:resize(batchsize, 1, self.k+1)
+      -- d Pm / d linear = exp(linear)/z
+      self._gradOutput:cmul(self._pm)
+      
+      -- gradient of linear
+      self.gradInput[1] = self.gradInput[1] or input.new()
+      self.gradInput[1]:resize(batchsize, 1, inputsize):zero()
+      self.gradInput[1]:baddbmm(0, 1, self._gradOutput, self._weight)
+      self.gradInput[1]:resizeAs(input)
+   end
+   -- gradient w.r.t. target is all zeros; only (re)built when the number of targets changes
+   self.gradInput[2] = self.gradInput[2] or input.new()
+   if self.gradInput[2]:nElement() ~= target:nElement() then
+      self.gradInput[2]:resize(target:size()):zero()
+   end
+   
+   return self.gradInput
+end
+
+function NCEModule:accGradParameters(inputTable, gradOutput, scale) -- uses buffers set by updateGradInput
+   local input, target = unpack(inputTable)
+   assert(input:dim() == 2)
+   assert(target:dim() == 1)
+   local batchsize = input:size(1)
+   local inputsize = self.weight:size(2)
+   scale = scale or 1 -- nn convention : scale is optional; without a default, mul(scale) below fails on nil
+   if self.batchnoise then
+      self._gradWeight = self._gradWeight or self.bias.new()
+      self._gradWeight:resizeAs(self._weight):zero() -- (batchsize + k) x inputsize
+      
+      local sgradWeight = self._gradWeight:narrow(1, 1, self.k)
+      local tgradWeight = self._gradWeight:narrow(1, self.k+1, batchsize)
+      
+      self._gradOutput:mul(scale)
+      self._tgradOutput:mul(scale)
+      
+      sgradWeight:addmm(0, sgradWeight, 1, self._gradOutput:t(), input)
+      tgradWeight:cmul(self._tgradOutput:view(batchsize, 1):expandAs(self.gradInput[1]), input)
+      -- scatter-add the gathered gradients back into the rows selected by sampleidx
+      self.gradWeight:indexAdd(1, self.sampleidx, self._gradWeight)
+      self.gradBias:indexAdd(1, self.sampleidx:narrow(1,self.k+1,batchsize), self._tgradOutput)
+      self._tgradOutput:sum(self._gradOutput, 1) -- reuse buffer
+      self.gradBias:indexAdd(1, self.sampleidx:sub(1,self.k), self._tgradOutput:view(-1))
+      
+   else
+      self._gradWeight = self._gradWeight or self.bias.new()
+      self._gradWeight:resizeAs(self._weight):zero() -- batchsize x k+1 x inputsize
+      self._gradOutput:resize(batchsize, self.k+1, 1)
+      self._gradOutput:mul(scale)
+      local _input = input:view(batchsize, 1, inputsize)
+      self._gradWeight:baddbmm(0, self._gradWeight, 1, self._gradOutput, _input)
+      -- scatter-add the gathered gradients back into the rows selected by sampleidx
+      local sampleidx = self.sampleidx:view(batchsize * (self.k+1))
+      local _gradWeight = self._gradWeight:view(batchsize * (self.k+1), inputsize)
+      self.gradWeight:indexAdd(1, sampleidx, _gradWeight)
+      
+      local _gradOutput = self._gradOutput:view(batchsize * (self.k+1))
+      self.gradBias:indexAdd(1, sampleidx, _gradOutput)
+   end
+end
+
+function NCEModule:type(type, cache) -- casts the module while keeping unigrams/aliasmultinomial as they are
+   if type then
+      self.sampleidx = nil -- drop cached buffers when a target type is given
+      self.sampleprob = nil
+      self._noiseidx = nil
+      self._noiseprob = nil
+      self._metaidx = nil
+      self._gradOutput = nil
+      self._tgradOutput = nil
+      self._gradWeight = nil
+      self._weight = nil
+   end
+   local unigrams = self.unigrams
+   self.unigrams = nil -- detached so that parent.type does not see it; restored below
+   local am = self.aliasmultinomial
+   
+   local rtn
+   if type and torch.type(self.weight) == 'torch.MultiCudaTensor' then
+      assert(type == 'torch.CudaTensor', "Cannot convert a multicuda NCEModule to anything other than cuda")
+      local weight = self.weight
+      local gradWeight = self.gradWeight
+      self.weight = nil -- detached during the cast and restored unchanged afterwards
+      self.gradWeight = nil
+      
+      rtn = parent.type(self, type, cache)
+      
+      assert(torch.type(self.aliasmultinomial.J) ~= 'torch.CudaTensor')
+      self.weight = weight
+      self.gradWeight = gradWeight
+   else
+      rtn = parent.type(self, type, cache)
+   end
+   
+   self.unigrams = unigrams
+   self.aliasmultinomial = am -- put back the reference saved before the cast
+   return rtn
+end
+
+function NCEModule:noiseProb(sampleprob, sampleidx) -- fills sampleprob with unigrams[sampleidx]; returns it
+   assert(sampleprob)
+   assert(sampleidx)
+   self._noiseprob = self._noiseprob or self.unigrams.new()
+   self._noiseidx = self._noiseidx or torch.LongTensor()
+   self._noiseidx:resize(sampleidx:size()):copy(sampleidx) -- copy of the indices as a LongTensor
+   -- look up the unigram value of every (flattened) index
+   self._noiseprob:index(self.unigrams, 1, self._noiseidx:view(-1))
+   -- result has the same size as sampleidx
+   sampleprob:resize(sampleidx:size()):copy(self._noiseprob)
+   return sampleprob
+end
+
+function NCEModule:noiseSample(sampleidx, batchsize, k) -- fills sampleidx with (batchsize x k) noise draws
+   if torch.type(sampleidx) == 'torch.LongTensor' then
+      sampleidx:resize(batchsize, k)
+      self.aliasmultinomial:batchdraw(sampleidx)
+      return sampleidx
+   end
+   self._noiseidx = self._noiseidx or torch.LongTensor() -- LongTensor staging buffer for other index types
+   local staging = self._noiseidx:resize(batchsize, k)
+   self.aliasmultinomial:batchdraw(staging)
+   sampleidx:resize(batchsize, k):copy(staging)
+   return sampleidx
+end
+
+function NCEModule:clearState() -- drops cached buffers and empties output/gradInput; returns self
+   self.sampleidx = nil
+   self.sampleprob = nil
+   self._noiseidx = nil
+   self._noiseprob = nil
+   self._tgradOutput = nil
+   self._gradOutput = nil
+   if torch.isTensor(self.output) then
+      self.output:set()
+   else
+      for i,output in ipairs(self.output) do
+         output:set()
+      end
+   end
+   for i,gradInput in ipairs(self.gradInput) do gradInput:set() end
+   -- return self, as nn.Module:clearState does, so that calls can be chained
+   return self
+end
+
+function NCEModule:multicuda(device1, device2) -- splits weight and gradWeight column-wise over two devices
+   assert(device1 and device2, "specify two devices as arguments")
+   require 'torchx'
+   assert(torchx.version and torchx.version >= 1, "update torchx: luarocks install torchx")
+   -- cast to float first; the column slices below are then converted with :cuda() on each device
+   self:float()
+   -- first round(isize/2) columns go to device1, the remaining columns to device2
+   local isize = self.weight:size(2)
+   local weights = {
+      cutorch.withDevice(device1, function() return self.weight[{{}, {1, torch.round(isize/2)}}]:cuda() end),
+      cutorch.withDevice(device2, function() return self.weight[{{}, {torch.round(isize/2)+1, isize}}]:cuda() end)
+   }
+   self.weight = torch.MultiCudaTensor(2, weights)
+   local gradWeights = {
+      cutorch.withDevice(device1, function() return self.gradWeight[{{}, {1, torch.round(isize/2)}}]:cuda() end),
+      cutorch.withDevice(device2, function() return self.gradWeight[{{}, {torch.round(isize/2)+1, isize}}]:cuda() end)
+   }
+   self.gradWeight = torch.MultiCudaTensor(2, gradWeights)
+   -- cast the rest of the module; NCEModule:type keeps the MultiCudaTensor weights as they are
+   self:cuda()
+end
diff --git a/NaN.lua b/NaN.lua
new file mode 100644
index 0000000..b80f6a0
--- /dev/null
+++ b/NaN.lua
@@ -0,0 +1,72 @@
+------------------------------------------------------------------------
+--[[ NaN ]]--
+-- Asserts that outputs and gradInputs do not contain NaNs.
+-- Useful for locating the source of NaN errors.
+------------------------------------------------------------------------
+local NaN, parent = torch.class("nn.NaN", "nn.Decorator")
+
+local idseq = 0 -- last id handed out; shared by all nn.NaN instances created through this file
+function NaN.newId() -- returns the next id of the sequence (1, 2, 3, ...)
+   idseq = idseq + 1
+   return idseq
+end
+
+function NaN:__init(module, id) -- decorates module; id defaults to a fresh NaN.newId()
+   parent.__init(self, module)
+   self.id = id or NaN.newId() -- printed by __tostring__ to identify this decorator
+end
+
+function NaN:recursiveIsNaN(tensor) -- true if tensor, or any value nested in a table, sums to NaN
+   local isNaN = false
+   if torch.type(tensor) == 'table' then
+      for k,v in pairs(tensor) do
+         isNaN = self:recursiveIsNaN(v)
+         if isNaN then break end
+      end
+   elseif tensor ~= nil then -- nil (e.g. no parameters returned) counts as "no NaN" instead of crashing
+      local _ = require 'moses'
+      isNaN = _.isNaN(tensor:sum())
+   end
+   return isNaN
+end
+
+function NaN:updateOutput(input) -- forwards through the wrapped module and errors on a NaN output
+   self.output = self.modules[1]:updateOutput(input)
+   if self:recursiveIsNaN(self.output) then
+      if self:recursiveIsNaN(input) then -- report the earliest culprit : input, then parameters, then output
+         error(string.format("NaN found in input of module :\n%s", self:__tostring__()))
+      elseif self:recursiveIsNaN(self:parameters()) then
+         error(string.format("NaN found in parameters of module :\n%s", self:__tostring__()))
+      end
+      error(string.format("NaN found in output of module :\n%s", self:__tostring__()))
+   end
+   return self.output
+end
+
+function NaN:updateGradInput(input, gradOutput) -- backward through the wrapped module; errors on a NaN gradInput
+   self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+   if self:recursiveIsNaN(self.gradInput) then
+      if self:recursiveIsNaN(gradOutput) then -- the NaN was already present in the incoming gradient
+         error(string.format("NaN found in gradOutput of module :\n%s", self:__tostring__()))
+      end
+      error(string.format("NaN found in gradInput of module :\n%s", self:__tostring__()))
+   end
+   return self.gradInput
+end
+
+function NaN:accGradParameters(input, gradOutput, scale) -- accumulates, then errors on NaN gradParameters
+   self.modules[1]:accGradParameters(input, gradOutput, scale)
+   local params, gradParams = self:parameters()
+   if self:recursiveIsNaN(gradParams) then
+      error(string.format("NaN found in gradParameters of module :\n%s", self:__tostring__()))
+   end
+end
+
+function NaN:__tostring__() -- "<type>(<id>) @ <description of the wrapped module>"
+   local selfstring = torch.type(self) .. '(' .. self.id .. ')'
+   if self.modules[1].__tostring__ then
+      return selfstring .. ' @ ' .. self.modules[1]:__tostring__()
+   else -- wrapped module has no __tostring__ : fall back to its type name
+      return selfstring .. ' @ ' .. torch.type(self.modules[1])
+   end
+end
diff --git a/OneHot.lua b/OneHot.lua
new file mode 100644
index 0000000..702e162
--- /dev/null
+++ b/OneHot.lua
@@ -0,0 +1,65 @@
+local OneHot, parent = torch.class('nn.OneHot', 'nn.Module')
+
+-- adapted from https://github.com/karpathy/char-rnn
+-- and https://github.com/hughperkins/char-lstm
+
+-- Constructor.
+-- outputSize : stored as `self.outputSize`; updateOutput uses it as the length
+-- of the extra trailing dimension appended to the output.
+function OneHot:__init(outputSize)
+   parent.__init(self)
+   self.outputSize = outputSize
+end
+
+-- Encodes `input` (a number or a tensor of indices) as one-hot vectors.
+-- The output has the size of the input followed by one extra dimension of
+-- length `self.outputSize`; it is zero everywhere except for a 1 scattered at
+-- the position given by each index. A number input yields a 1D output.
+function OneHot:updateOutput(input)
+   local dims
+   if type(input) ~= 'number' then
+      dims = input:size():totable()
+   else
+      -- wrap the scalar into a single-element index tensor
+      if self:type() == 'torch.CudaTensor' then
+         self._single = self._single or torch.CudaTensor():resize(1)
+      else
+         self._single = self._single or torch.LongTensor():resize(1)
+      end
+      self._single[1] = input
+      input = self._single
+      dims = {}
+   end
+
+   -- output size = input size followed by the one-hot dimension
+   dims[#dims + 1] = self.outputSize
+   self.output:resize(unpack(dims)):zero()
+
+   -- view the indices with a trailing singleton dimension so that they can
+   -- be scattered along the last dimension of the output
+   dims[#dims] = 1
+   local indices = input:view(unpack(dims))
+
+   local inputType = torch.type(input)
+   if inputType ~= 'torch.CudaTensor' and inputType ~= 'torch.ClTensor' then
+      if torch.type(self.output) == 'torch.CudaTensor' then
+         -- module is cuda but the input is not: copy the indices to a cuda buffer
+         self._input = self._input or torch.CudaTensor()
+         self._input:resize(indices:size()):copy(indices)
+         indices = self._input
+      elseif inputType ~= 'torch.LongTensor' then
+         -- module is not cuda and the input is not long: copy the indices to a long buffer
+         self._input = self._input or torch.LongTensor()
+         self._input:resize(indices:size()):copy(indices)
+         indices = self._input
+      end
+   end
+   self.output:scatter(self.output:dim(), indices, 1)
+
+   return self.output
+end
+
+-- Returns a dummy gradient: 0 for a number input, otherwise `self.gradInput`
+-- resized to the size of the input and filled with zeros.
+function OneHot:updateGradInput(input, gradOutput)
+   if type(input) ~= 'number' then
+      self.gradInput:resize(input:size()):zero()
+      return self.gradInput
+   end
+   return 0
+end
+
+-- Casts the module to `type`, or returns the current type when `type` is nil.
+-- The index buffers (`_single`, `_input`) are dropped on a cast so that they
+-- are re-created with the right tensor type by the next updateOutput.
+-- They are left untouched when the method is used as a getter (no `type`
+-- argument): updateOutput itself calls `self:type()` and would otherwise
+-- throw its cached buffers away on every forward with a number input.
+-- Returns whatever the parent implementation returns.
+function OneHot:type(type, typecache)
+   if type then
+      self._single = nil
+      self._input = nil
+   end
+   return parent.type(self, type, typecache)
+end
diff --git a/PCAColorTransform.lua b/PCAColorTransform.lua
new file mode 100644
index 0000000..69f16d1
--- /dev/null
+++ b/PCAColorTransform.lua
@@ -0,0 +1,117 @@
+--[[
+   Color transformation module: Commonly used data augmentation technique.
+   Random color noise is added to input image/images based on the Principal
+   Component Analysis (PCA) of pixel values.
+
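+   Arguments
+   -> inputChannels: number of input channels; must match the number of
+      eigen vectors and eigen values.
+   -> eigenVectors: Each row represents an eigen vector.
+   -> eigenValues: Corresponding eigen values.
+   -> std: std of the gaussian distribution used for augmentation (default 0.1).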
+--]]
+
+local PCAColorTransform, Parent = torch.class('nn.PCAColorTransform', 'nn.Module')
+
+-- Constructor.
+-- inputChannels : number of channels of the inputs
+-- eigenVectors  : square 2D tensor with `inputChannels` rows
+-- eigenValues   : tensor holding `inputChannels` elements
+-- std           : standard deviation of the sampled noise (default 0.1)
+-- The module starts in training mode (`self.train = true`).
+function PCAColorTransform:__init(inputChannels, eigenVectors, eigenValues, std)
+   Parent.__init(self)
+
+   self.train = true
+   self.inputChannels = inputChannels
+
+   -- validate the shapes of the PCA components against the channel count
+   local nVectors = eigenVectors:size(1)
+   assert(inputChannels == nVectors,
+          "Number of input channels do not match number of eigen vectors.")
+   assert(eigenVectors:size(2) == nVectors,
+          "Invalid dimensionality: eigen vectors.")
+   assert(inputChannels == eigenValues:nElement(),
+          "Number of input channels do not match number of eigen values.")
+
+   self.std = std or 0.1
+   self.eigenVectors = eigenVectors
+   self.eigenValues = eigenValues
+end
+
+-- Copies `input` to the output and, in training mode, adds per-channel noise
+-- that is constant over the spatial dimensions of each image.
+-- Accepts a 3D (channels x height x width) or 4D (batch x channels x height
+-- x width) input; any other dimensionality raises an error in training mode.
+-- For each image, one value per channel is drawn from N(0, std), multiplied
+-- by the eigen values, and multiplied by the eigenVectors matrix.
+function PCAColorTransform:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if not self.train then
+      return self.output
+   end
+
+   -- lazily allocate the work buffers with the tensor type of the output
+   local buffers = {'noise', 'alphas', '_tempNoise', '_tempNoiseExpanded',
+                    '_tempNoiseSamples', '_tempLambda', '_tempLambdaExpanded'}
+   for _, name in ipairs(buffers) do
+      self[name] = self[name] or self.output.new()
+   end
+
+   local nDim = self.output:nDimension()
+   if nDim ~= 3 and nDim ~= 4 then
+      error("Invalid input dimensionality.")
+   end
+
+   -- a single image is handled as a batch of size one
+   local batched = (nDim == 4)
+   local batchSize = batched and self.output:size(1) or 1
+   local offset = batched and 1 or 0
+   local channels = self.output:size(offset + 1)
+   local height = self.output:size(offset + 2)
+   local width = self.output:size(offset + 3)
+   assert(channels == self.inputChannels)
+
+   -- Randomly sample noise for each channel and scale by eigen values
+   self.alphas:resize(channels, batchSize)
+   self.alphas:normal(0, self.std)
+   self._tempLambda = self.eigenValues:view(self.inputChannels, 1)
+   self._tempLambdaExpanded = self._tempLambda:expand(channels, batchSize)
+   self.alphas:cmul(self._tempLambdaExpanded)
+
+   -- Scale by eigen vectors
+   self.noise:resize(batchSize, self.inputChannels):zero()
+   self.noise:t():addmm(self.eigenVectors, self.alphas)
+
+   -- Add noise to the input, broadcast over the spatial dimensions
+   if batched then
+      self._tempNoise = self.noise:view(batchSize, self.inputChannels, 1, 1)
+      self._tempNoiseExpanded:expand(self._tempNoise, batchSize,
+                                     channels, height, width)
+   else
+      self._tempNoise = self.noise:view(self.inputChannels, 1, 1)
+      self._tempNoiseExpanded:expand(self._tempNoise, channels,
+                                     height, width)
+   end
+   self.output:add(self._tempNoiseExpanded)
+
+   return self.output
+end
+
+-- Passes `gradOutput` through unchanged (as a copy) in training mode.
+-- Raises an error when the module is not in training mode.
+function PCAColorTransform:updateGradInput(input, gradOutput)
+   if not self.train then
+      error('backprop only defined while training')
+   end
+   self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+   return self.gradInput
+end
+
+-- Casts the module to `type`, or returns the current type when `type` is nil.
+-- The work buffers created by updateOutput are dropped on a cast so that they
+-- are re-created with the new tensor type; they are kept when the method is
+-- only used as a getter (no `type` argument).
+-- The result of the parent implementation is returned: without it, calls such
+-- as `module:type(t)` evaluate to nil and cannot be chained or assigned.
+function PCAColorTransform:type(type, tensorCache)
+   if type then
+      self.noise = nil
+      self.alphas = nil
+      self._tempLambda = nil
+      self._tempLambdaExpanded = nil
+      self._tempNoise = nil
+      self._tempNoiseExpanded = nil
+      self._tempNoiseSamples = nil
+   end
+   return Parent.type(self, type, tensorCache)
+end
+
+-- One-line description: type name, number of input channels and noise std.
+function PCAColorTransform:__tostring__()
+   local template = '%s channels: %d, std: %f'
+   return template:format(torch.type(self), self.inputChannels, self.std)
+end
diff --git a/ParallelTable.lua b/ParallelTable.lua
new file mode 100644
index 0000000..95584a5
--- /dev/null
+++ b/ParallelTable.lua
@@ -0,0 +1,44 @@
+local ParallelTable, parent = nn.ParallelTable, nn.Container
+
+-- Switches ParallelTable to profiling mode: the forward/backward methods are
+-- redefined on the ParallelTable class so that each one prints the time taken
+-- by every child module, then `parent.profile(self)` is called.
+-- When cutorch is loaded, the device is synchronized before each timing is read.
+function ParallelTable:profile()
+   -- synchronizes (when cutorch is available) and prints the time elapsed
+   -- since `start` for `module`, using `label` as the method description
+   local function report(module, label, start)
+      if cutorch then cutorch.synchronize() end
+      print(torch.type(module)..label..sys.clock() - start.." s")
+   end
+
+   function ParallelTable:updateOutput(input)
+      for i,module in ipairs(self.modules) do
+         local start = sys.clock()
+         self.output[i] = module:updateOutput(input[i])
+         report(module, ' updateOutput: ', start)
+      end
+      return self.output
+   end
+
+   function ParallelTable:updateGradInput(input, gradOutput)
+      for i,module in ipairs(self.modules) do
+         local start = sys.clock()
+         self.gradInput[i] = module:updateGradInput(input[i], gradOutput[i])
+         report(module, ' updateGradInput: ', start)
+      end
+      return self.gradInput
+   end
+
+   function ParallelTable:accGradParameters(input, gradOutput, scale)
+      scale = scale or 1
+      for i,module in ipairs(self.modules) do
+         local start = sys.clock()
+         module:accGradParameters(input[i], gradOutput[i], scale)
+         report(module, ' accGradParameters: ', start)
+      end
+   end
+
+   function ParallelTable:accUpdateGradParameters(input, gradOutput, lr)
+      lr = lr or 1
+      for i,module in ipairs(self.modules) do
+         local start = sys.clock()
+         module:accUpdateGradParameters(input[i], gradOutput[i], lr)
+         report(module, ' accUpdateGradParameters: ', start)
+      end
+   end
+
+   parent.profile(self)
+end
diff --git a/PrintSize.lua b/PrintSize.lua
new file mode 100644
index 0000000..1f1c64e
--- /dev/null
+++ b/PrintSize.lua
@@ -0,0 +1,36 @@
+local PrintSize, parent = torch.class('nn.PrintSize', 'nn.Module')
+
+-- Constructor.
+-- prefix : label printed in front of the sizes (defaults to "PrintSize")
+function PrintSize:__init(prefix)
+   parent.__init(self)
+   self.prefix = prefix or "PrintSize"
+end
+
+-- Prints what is known about the size of `input` and forwards it unchanged:
+-- a table input is printed as is, a nil input is reported as 'missing size',
+-- and anything else is printed through its `size()` method.
+function PrintSize:updateOutput(input)
+   self.output = input
+
+   local kind = torch.type(input)
+   local size
+   if kind == 'table' then
+      size = input
+   elseif kind == 'nil' then
+      size = 'missing size'
+   else
+      size = input:size()
+   end
+
+   print(self.prefix..":input\n", size)
+   return self.output
+end
+
+
+-- Prints what is known about the size of `gradOutput` and passes it back
+-- unchanged as the gradInput: a table is printed as is, nil is reported as
+-- 'missing size', and anything else is printed through its `size()` method.
+function PrintSize:updateGradInput(input, gradOutput)
+   local kind = torch.type(gradOutput)
+   local size
+   if kind == 'table' then
+      size = gradOutput
+   elseif kind == 'nil' then
+      size = 'missing size'
+   else
+      size = gradOutput:size()
+   end
+
+   print(self.prefix..":gradOutput\n", size)
+   self.gradInput = gradOutput
+   return self.gradInput
+end
+
diff --git a/Profile.lua b/Profile.lua
new file mode 100755
index 0000000..36cd909
--- /dev/null
+++ b/Profile.lua
@@ -0,0 +1,55 @@
+local ProfileModule, parent = torch.class("nn.Profile", "nn.Decorator")
+
+-- Constructor.
+-- module         : the module to decorate and time
+-- print_interval : number of passes between two reports (default 100)
+-- name           : label used in the reports (default: type name of `module`)
+function ProfileModule:__init(module, print_interval, name)
+   parent.__init(self, module)
+   self.module = module
+   self.name = name or torch.type(module)
+   self.print_interval = print_interval or 100
+
+   -- running counters, reset every time a report is printed
+   self.numFwds, self.summedFwdTime = 0, 0
+   self.numBwds, self.summedBwdTime = 0, 0
+
+   self.timer = torch.Timer()
+end
+
+-- Runs and times the forward pass of the decorated module. The elapsed time is
+-- accumulated and, every `print_interval` passes, the total is printed and the
+-- counters are reset. Returns the decorated module's output.
+function ProfileModule:updateOutput(input)
+   self.timer:reset()
+   self.output = self.module:updateOutput(input)
+   local elapsed = self.timer:time().real
+
+   self.summedFwdTime = self.summedFwdTime + elapsed
+   self.numFwds = self.numFwds + 1
+   if self.numFwds % self.print_interval ~= 0 then
+      return self.output
+   end
+
+   print (string.format('%s took %.3f seconds for %d forward passes',
+      self.name, self.summedFwdTime, self.print_interval))
+   self.numFwds = 0
+   self.summedFwdTime = 0
+   return self.output
+end
+
+-- Runs and times `updateGradInput` of the decorated module. The elapsed time
+-- is accumulated and, every `print_interval` passes, the total is printed and
+-- the counters are reset. Returns the decorated module's gradInput.
+function ProfileModule:updateGradInput(input, gradOutput)
+   self.timer:reset()
+   self.gradInput = self.module:updateGradInput(input, gradOutput)
+   local elapsed = self.timer:time().real
+
+   self.summedBwdTime = self.summedBwdTime + elapsed
+   self.numBwds = self.numBwds + 1
+   if self.numBwds % self.print_interval ~= 0 then
+      return self.gradInput
+   end
+
+   print (string.format('%s took %.3f seconds for %d backward passes',
+      self.name, self.summedBwdTime, self.print_interval))
+   self.numBwds = 0
+   self.summedBwdTime = 0
+   return self.gradInput
+end
+
+-- Installs a dummy serialization on the torch.Timer metatable so that a module
+-- holding a Timer can be saved, cloned, etc.: nothing is written for a timer,
+-- and reading one back returns the object handed to `read` unchanged.
+local function makeTorchTimerSerializable()
+   local mt = getmetatable(torch.Timer())
+   mt.__factory = torch.Timer
+   mt.write = function(object, file) end
+   mt.read = function(object, file, versionNumber)
+      return object
+   end
+end
+
+-- applied once, when this file is loaded
+makeTorchTimerSerializable()
diff --git a/README.md b/README.md
index 26cae5d..e748684 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,60 @@ Criterions used for handling sequential inputs and targets :
  * [RepeaterCriterion](#rnn.RepeaterCriterion) : repeatedly applies the same criterion with the same target on a sequence.
 
 
+This package also provides many useful features that aren't part of the main nn package.
+These include [sharedClone](#nn.Module.sharedClone), which allows you to clone a module and share
+parameters or gradParameters with the original module, without incurring any memory overhead.
+We also redefined [type](#nn.Module.type) such that the type-cast preserves Tensor sharing within a structure of modules.
+
+The package provides the following Modules:
+
+ * [Decorator](#nn.Decorator) : abstract class to change the behaviour of an encapsulated module ;
+ * [DontCast](#nn.DontCast) : prevent encapsulated module from being casted by `Module:type()` ;
+ * [Serial](#nn.Serial) : decorate a module to make its serialized output more compact ;
+ * [NaN](#nn.NaN) : decorate a module to detect the source of NaN errors ;
+ * [Profile](#nn.Profile) : decorate a module to time its forwards and backwards passes ;
+ * [Inception](#nn.Inception) : implements the Inception module of the GoogLeNet article ;
+ * [Collapse](#nn.Collapse) : just like `nn.View(-1)`;
+ * [Convert](#nn.Convert) : convert between different tensor types or shapes;
+ * [ZipTable](#nn.ZipTable) : zip a table of tables into a table of tables;
+ * [ZipTableOneToMany](#nn.ZipTableOneToMany) : zip a table of element `el` and table of elements into a table of pairs of element `el` and table elements;
+ * [CAddTensorTable](#nn.CAddTensorTable) : adds a tensor to a table of tensors of the same size;
+ * [ReverseTable](#nn.ReverseTable) : reverse the order of elements in a table;
+ * [PrintSize](#nn.PrintSize) : prints the size of inputs and gradOutputs (useful for debugging);
+ * [Clip](#nn.Clip) : clips the inputs to a min and max value;
+ * [Constant](#nn.Constant) : outputs a constant value given an input (which is ignored);
+ * [SpatialUniformCrop](#nn.SpatialUniformCrop) : uniformly crops patches from a input;
+ * [SpatialGlimpse](#nn.SpatialGlimpse) : takes a foveated glimpse of an image at a given location;
+ * [WhiteNoise](#nn.WhiteNoise) : adds isotropic Gaussian noise to the signal when in training mode;
+ * [OneHot](#nn.OneHot) : transforms a tensor of indices into [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding;
+ * [Kmeans](#nn.Kmeans) : [Kmeans](https://en.wikipedia.org/wiki/K-means_clustering) clustering layer. Forward computes distances with respect to centroids and returns index of closest centroid. Centroids can be updated using gradient descent. Centroids could be initialized randomly or by using [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B) algorithm;
+ * [SpatialRegionDropout](#nn.SpatialRegionDropout) : Randomly drops out a region (top, bottom, leftmost, rightmost) of the input image. Works with batch and any number of channels;
+ * [FireModule](#nn.FireModule) : FireModule as mentioned in the [SqueezeNet](http://arxiv.org/pdf/1602.07360v1.pdf);
+ * [NCEModule](#nn.NCEModule) : optimized placeholder for a `Linear` + `SoftMax` using [noise-contrastive estimation](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf).
+ * [SpatialFeatNormalization](#nn.SpatialFeatNormalization) : Module for widely used preprocessing step of mean zeroing and standardization for images.
+ * [SpatialBinaryConvolution](#nn.SpatialBinaryConvolution) : Module for binary spatial convolution (Binary weights) as mentioned in [XNOR-Net](http://arxiv.org/pdf/1603.05279v2.pdf).
+ * [SimpleColorTransform](#nn.SimpleColorTransform) : Module for adding independent random noise to input image channels.
+ * [PCAColorTransform](#nn.PCAColorTransform) : Module for adding noise to input image using Principal Components Analysis.
+
+The following modules and criterions can be used to implement the REINFORCE algorithm :
+
+ * [Reinforce](#nn.Reinforce) : abstract class for REINFORCE modules;
+ * [ReinforceBernoulli](#nn.ReinforceBernoulli) : samples from Bernoulli distribution;
+ * [ReinforceNormal](#nn.ReinforceNormal) : samples from Normal distribution;
+ * [ReinforceGamma](#nn.ReinforceGamma) : samples from Gamma distribution;
+ * [ReinforceCategorical](#nn.ReinforceCategorical) : samples from Categorical (Multinomial with one sample) distribution;
+ * [VRClassReward](#nn.VRClassReward) : criterion for variance-reduced classification-based reward;
+ * [BinaryClassReward](#nn.BinaryClassReward) : criterion for variance-reduced binary classification reward (like `VRClassReward`, but for binary classes);
+
+Additional differentiable criterions
+ * [BinaryLogisticRegression](#nn.BLR) : criterion for binary logistic regression;
+ * [SpatialBinaryLogisticRegression](#nn.SpatialBLR) : criterion for pixel wise binary logistic regression;
+ * [NCECriterion](#nn.NCECriterion) : criterion exclusively used with [NCEModule](#nn.NCEModule).
+ * [ModuleCriterion](#nn.ModuleCriterion) : adds an optional `inputModule` and `targetModule` before a decorated criterion;
+ * [BinaryLogisticRegression](#nn.BLR) : criterion for binary logistic regression.
+ * [SpatialBinaryLogisticRegression](#nn.SpatialBLR) : criterion for pixel wise binary logistic regression.
+
+
 <a name='rnn.examples'></a>
 ## Examples ##
 
@@ -66,6 +120,8 @@ The following are example training scripts using this package :
   * [Sequence to Sequence mapping using encoder-decoder RNNs](https://github.com/rahul-iisc/seq2seq-mapping) : a complete training example using synthetic data.
   * [ConvLSTM](https://github.com/viorik/ConvLSTM) is a repository for training a [Spatio-temporal video autoencoder with differentiable memory](http://arxiv.org/abs/1511.06309).
   * An [time series example](https://github.com/rracinskij/rnntest01/blob/master/rnntest01.lua) for univariate timeseries prediction.
+  * [Sagar Waghmare](https://github.com/sagarwaghmare69) wrote a nice [tutorial](tutorials/ladder.md) on how to use rnn with nngraph to reproduce the [Lateral Connections in Denoising Autoencoders Support Supervised Learning](http://arxiv.org/pdf/1504.08215.pdf).
+
 
 ## Citation ##
 
@@ -1273,3 +1329,1085 @@ repeatedly applied using the same `criterion` to each element in the `input` seq
 The output of `forward` is the sum of all individual losses in the sequence.
 This is useful for implementing models like [RCNNs](http://jmlr.org/proceedings/papers/v32/pinheiro14.pdf),
 which are repeatedly presented with the same target.
+
+<a name='nn.Module'></a>
+## Module ##
+
+The Module interface has been further extended with methods that facilitate
+stochastic gradient descent like [updateGradParameters](#nn.Module.updateGradParameters) (i.e. momentum learning),
+[weightDecay](#nn.Module.weightDecay), [maxParamNorm](#nn.Module.maxParamNorm) (for regularization), and so on.
+
+<a name='nn.Module.dpnn_parameters'></a>
+### Module.dpnn_parameters ###
+
+A table that specifies the name of parameter attributes.
+Defaults to `{'weight', 'bias'}`, which is a static variable (i.e. table exists in class namespace).
+Sub-classes can define their own table statically.
+
+<a name='nn.Module.dpnn_gradParameters'></a>
+### Module.dpnn_gradParameters ###
+
+A table that specifies the name of gradient w.r.t. parameter attributes.
+Defaults to `{'gradWeight', 'gradBias'}`, which is a static variable (i.e. table exists in class namespace).
+Sub-classes can define their own table statically.
+
+<a name='nn.Module.type'></a>
+### [self] Module:type(type_str) ###
+
+This function converts all the parameters of a module to the given `type_str`.
+The `type_str` can be one of the types defined for [torch.Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md)
+like `torch.DoubleTensor`, `torch.FloatTensor` and `torch.CudaTensor`.
+Unlike the [type method](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.type)
+defined in [nn](https://github.com/torch/nn), this one was overridden to
+maintain the sharing of [storage](https://github.com/torch/torch7/blob/master/doc/storage.md#storage)
+among Tensors. This is especially useful when cloned modules share `parameters` and `gradParameters`.
+
+<a name='nn.Module.sharedClone'></a>
+### [clone] Module:sharedClone([shareParams, shareGradParams]) ###
+
+Similar to [clone](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.clone).
+Yet when `shareParams = true` (the default), the cloned module will share the parameters
+with the original module.
+Furthermore, when `shareGradParams = true` (the default), the clone module will share
+the gradients w.r.t. parameters with the original module.
+This is equivalent to :
+```lua
+clone = mlp:clone()
+clone:share(mlp, 'weight', 'bias', 'gradWeight', 'gradBias')
+```
+yet it is much more efficient, especially for modules with lots of parameters, as these
+Tensors aren't needlessly copied during the `clone`.
+This is particularly useful for [Recurrent neural networks](https://github.com/Element-Research/rnn/blob/master/README.md)
+which require efficient copies with shared parameters and gradient w.r.t. parameters for each time-step.
+
+<a name='nn.Module.maxParamNorm'></a>
+### Module:maxParamNorm([maxOutNorm, maxInNorm]) ###
+
+This method implements a hard constraint on the upper bound of the norm of output and/or input neuron weights
+[(Hinton et al. 2012, p. 2)](http://arxiv.org/pdf/1207.0580.pdf) .
+In a weight matrix, this is a constraint on rows (`maxOutNorm`) and/or columns (`maxInNorm`), respectively.
+Has a regularization effect analogous to [weightDecay](#nn.Module.weightDecay), but with easier to optimize hyper-parameters.
+Assumes that parameters are arranged (`output dim x ... x input dim`).
+Only affects parameters with more than one dimension.
+The method should normally be called after [updateParameters](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.updateParameters).
+It uses the C/CUDA optimized [torch.renorm](https://github.com/torch/torch7/blob/master/doc/maths.md#torch.renorm) function.
+Hint : `maxOutNorm = 2` usually does the trick.
+
+<a name='nn.Module.momentumGradParameters'></a>
+### [momGradParams] Module:momentumGradParameters() ###
+
+Returns a table of Tensors (`momGradParams`). For each element in the
+table, a corresponding parameter (`params`) and gradient w.r.t. parameters
+(`gradParams`) is returned by a call to [parameters](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.parameters).
+This method is used internally by [updateGradParameters](#nn.Module.updateGradParameters).
+
+<a name='nn.Module.updateGradParameters'></a>
+### Module:updateGradParameters(momFactor [, momDamp, momNesterov]) ###
+
+Applies classic momentum or Nesterov momentum [(Sutskever, Martens et al, 2013)](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf) to parameter gradients.
+Each parameter Tensor (`params`) has a corresponding Tensor of the same size for gradients w.r.t. parameters (`gradParams`).
+When using momentum learning, another Tensor is added for each parameter Tensor (`momGradParams`).
+This method should be called before [updateParameters](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.updateParameters)
+as it affects the gradients w.r.t. parameters.
+
+Classic momentum is computed as follows :
+
+```lua
+momGradParams = momFactor*momGradParams + (1-momDamp)*gradParams
+gradParams = momGradParams
+```
+
+where `momDamp` has a default value of `momFactor`.
+
+Nesterov momentum (`momNesterov = true`) is computed as follows (the first line is the same as classic momentum):
+
+```lua
+momGradParams = momFactor*momGradParams + (1-momDamp)*gradParams
+gradParams = gradParams + momFactor*momGradParams
+```
+The default is to use classic momentum (`momNesterov = false`).
+
+<a name='nn.Module.weightDecay'></a>
+### Module:weightDecay(wdFactor [, wdMinDim]) ###
+
+Decays the weight of the parameterized models.
+Implements an L2 norm loss on parameters with dimensions greater or equal to `wdMinDim` (default is 2).
+The resulting gradients are stored into the corresponding gradients w.r.t. parameters.
+Such that this method should be called before [updateParameters](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.updateParameters).
+
+<a name='nn.Module.gradParamClip'></a>
+### Module:gradParamClip(cutoffNorm [, moduleLocal]) ###
+
+Implements a constraint on the norm of gradients w.r.t. parameters [(Pascanu et al. 2012)](http://arxiv.org/pdf/1211.5063.pdf).
+When `moduleLocal = false` (the default), the norm is calculated globally over the Module on which this is called.
+So if you call it on an MLP, the norm is computed on the concatenation of all parameter Tensors.
+When `moduleLocal = true`, the norm constraint is applied
+to the norm of all parameters in each component (non-container) module.
+This method is useful to prevent the exploding gradient in
+[Recurrent neural networks](https://github.com/Element-Research/rnn/blob/master/README.md).
+
+<a name='nn.Module.reinforce'></a>
+### Module:reinforce(reward) ###
+
+This method is used by Criterions that implement the REINFORCE algorithm like [VRClassReward](#nn.VRClassReward).
+Unlike vanilla backpropagation (gradient descent using the chain rule),
+REINFORCE Criterions broadcast a `reward` to all REINFORCE modules between the `forward` and the `backward`.
+In this way, when the following call to `backward` reaches the REINFORCE modules,
+these will compute a `gradInput` using the broadcasted `reward`.
+The `reward` is broadcast to all REINFORCE modules contained
+within `model` by calling `model:reinforce(reward)`.
+Note that the `reward` should be a 1D tensor of size `batchSize`,
+i.e. each example in a batch has its own scalar reward.
+
+Refer to [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)
+for a complete training script making use of the REINFORCE interface.
+
+<a name='nn.Decorator'></a>
+## Decorator ##
+
+```lua
+dmodule = nn.Decorator(module)
+```
+
+This module is an abstract class used to decorate a `module`. This means
+that method calls to `dmodule` will call the same method on the encapsulated
+`module`, and return its results.
+
+<a name='nn.DontCast'></a>
+## DontCast ##
+
+```lua
+dmodule = nn.DontCast(module)
+```
+
+This module is a decorator. Use it to decorate a module that you don't
+want to be cast when the `type()` method is called.
+
+```lua
+module = nn.DontCast(nn.Linear(3,4):float())
+module:double()
+th> print(module:forward(torch.FloatTensor{1,2,3}))
+ 1.0927
+-1.9380
+-1.8158
+-0.0805
+[torch.FloatTensor of size 4]
+```
+
+<a name='nn.Serial'></a>
+## Serial ##
+
+```lua
+dmodule = nn.Serial(module, [tensortype])
+dmodule:[light,medium,heavy]Serial()
+```
+
+This module is a decorator that can be used to control the serialization/deserialization
+behavior of the encapsulated module. Basically, making the resulting string or
+file heavy (the default), medium or light in terms of size.
+
+Furthermore, when specified, the `tensortype` attribute (e.g *torch.FloatTensor*, *torch.DoubleTensor* and so on.),
+determines what type the module will be cast to during serialization.
+Note that this will also be the type of the deserialized object.
+The default serialization `tensortype` is `nil`, i.e. the module is serialized as is.
+
+The `heavySerial()` has the serialization process serialize every attribute in the module graph,
+which is the default behavior of nn.
+
+The `mediumSerial()` has the serialization process serialize
+everything except the attributes specified in each module's `dpnn_mediumEmpty`
+table, which has a default value of `{'output', 'gradInput', 'momGradParams', 'dpnn_input'}`.
+During serialization, whether they be tables or Tensors, these attributes are emptied (no storage).
+Some modules overwrite the default `Module.dpnn_mediumEmpty` static attribute with their own.
+
+The `lightSerial()` has the serialization process empty
+everything a call to `mediumSerial(type)` would (so it uses `dpnn_mediumEmpty`).
+But also empties all the parameter gradients specified by the
+attribute `dpnn_gradParameters`, which defaults to `{'gradWeight', 'gradBias'}`.
+
+We recommend using `mediumSerial()` for training, and `lightSerial()` for
+production (feed-forward-only models).
+
+<a name='nn.NaN'></a>
+## NaN ##
+
+```lua
+dmodule = nn.NaN(module, [id])
+```
+
+The `NaN` module asserts that the `output` and `gradInput` of the decorated `module` do not contain NaNs.
+This is useful for locating the source of those pesky NaN errors.
+The `id` defaults to automatically incremented values of `1,2,3,...`.
+
+For example :
+
+```lua
+linear = nn.Linear(3,4)
+mlp = nn.Sequential()
+mlp:add(nn.NaN(nn.Identity()))
+mlp:add(nn.NaN(linear))
+mlp:add(nn.NaN(nn.Linear(4,2)))
+print(mlp)
+```
+
+As you can see, the `NaN` layers have unique ids :
+
+```lua
+nn.Sequential {
+  [input -> (1) -> (2) -> (3) -> output]
+  (1): nn.NaN(1) @ nn.Identity
+  (2): nn.NaN(2) @ nn.Linear(3 -> 4)
+  (3): nn.NaN(3) @ nn.Linear(4 -> 2)
+}
+```
+
+And if we fill the `bias` of the linear module with NaNs and call `forward`:
+
+```lua
+nan = math.log(math.log(0)) -- this is a nan value
+linear.bias:fill(nan)
+mlp:forward(torch.randn(2,3))
+```
+
+We get a nice error message:
+```lua
+/usr/local/share/lua/5.1/dpnn/NaN.lua:39: NaN found in parameters of module :
+nn.NaN(2) @ nn.Linear(3 -> 4)
+```
+
+For a quick one-liner to catch NaNs anywhere inside a model (for example, a `nn.Sequential` or any other `nn.Container`), we can use this with the `nn.Module.replace` function:
+```lua
+model:replace(function(module) return nn.NaN(module) end)
+```
+
+<a name='nn.Profile'></a>
+## Profile ##
+
+```lua
+dmodule = nn.Profile(module, [print_interval, [name] ])
+```
+
+The `Profile` module times each forward and backward pass of the decorated `module`. It prints this information after `print_interval` passes, which is `100` by default. For timing multiple modules, the `name` argument allows this information to be printed accompanied by a name, which by default is the type of the decorated `module`.
+
+This is useful for profiling new modules you develop, and tracking down bottlenecks in the speed of a network.
+
+The timer and print statement can add a small amount of overhead to the overall speed.
+
+As an example:
+
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.Identity())
+mlp:add(nn.Linear(1000,1000))
+mlp:add(nn.Tanh())
+mlp:replace(function(module) return nn.Profile(module, 1000) end)
+inp = torch.randn(1000)
+gradOutput = torch.randn(1000)
+for i=1,1000 do
+   mlp:forward(inp)
+   mlp:backward(inp, gradOutput)
+end
+```
+
+results in the following profile information:
+
+```
+nn.Identity took 0.026 seconds for 1000 forward passes
+nn.Linear took 0.119 seconds for 1000 forward passes
+nn.Tanh took 0.061 seconds for 1000 forward passes
+nn.Tanh took 0.032 seconds for 1000 backward passes
+nn.Linear took 0.161 seconds for 1000 backward passes
+nn.Identity took 0.026 seconds for 1000 backward passes
+```
+
+It's good practice to profile modules after a single forwards and backwards pass, since the initial pass often has to allocate memory. Thus, in the example above, you would run another 1000 forwards and backwards passes to time the modules in their normal mode of operation:
+
+```
+for i=1,1000 do
+   mlp:forward(inp)
+   mlp:backward(inp, gradOutput)
+end
+```
+
+<a name='nn.Inception'></a>
+## Inception ##
+References :
+
+  * A. [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842)
+  * B. [GoogLeNet](http://image-net.org/challenges/LSVRC/2014/slides/GoogLeNet.pptx)
+
+```lua
+module = nn.Inception(config)
+```
+
+This module uses `n`+2 parallel "columns".
+The original paper uses 2+2 where the first two are (but there could be more than two):
+
+  * 1x1 conv (reduce) -> relu -> 5x5 conv -> relu
+  * 1x1 conv (reduce) -> relu -> 3x3 conv -> relu
+
+and where the other two are :
+
+  * 3x3 maxpool -> 1x1 conv (reduce/project) -> relu
+  * 1x1 conv (reduce) -> relu.
+
+This module allows the first group of columns to be of any
+number while the last group consists of exactly two columns.
+The 1x1 convolutions are used to reduce the number of input channels
+(or filters) such that the capacity of the network doesn't explode.
+We refer to these here as *reduce*.
+Since each column seems to have one and only one reduce, their initial
+configuration options are specified in lists of n+2 elements.
+
+The sole argument `config` is a table taking the following key-values :
+
+  * Required Arguments :
+   * `inputSize` : number of input channels or colors, e.g. 3;
+   * `outputSize` : numbers of filters in the non-1x1 convolution kernel sizes, e.g. `{32,48}`
+   * `reduceSize` : numbers of filters in the 1x1 convolutions (reduction) used in each column, e.g. `{48,64,32,32}`. The last 2 are used respectively for the max pooling (projection) column (the last column in the paper) and the column that has nothing but a 1x1 conv (the first column in the paper). This table should have two elements more than the outputSize
+  * Optional Arguments :
+   * `reduceStride` : strides of the 1x1 (reduction) convolutions. Defaults to `{1,1,...}`.
+   * `transfer` : transfer function like `nn.Tanh`,`nn.Sigmoid`, `nn.ReLU`, `nn.Identity`, etc. It is used after each reduction (1x1 convolution) and convolution. Defaults to `nn.ReLU`.
+   * `batchNorm` : set this to `true` to use batch normalization. Defaults to `false`. Note that batch normalization can be awesome
+   * `padding` : set this to `true` to add padding to the input of the convolutions such that output width and height are same as that of the original non-padded `input`. Defaults to `true`.
+   * `kernelSize` : size (`height = width`) of the non-1x1 convolution kernels. Defaults to `{5,3}`.
+   * `kernelStride` : stride of the kernels (`height = width`) of the convolution. Defaults to `{1,1}`
+   * `poolSize`: size (`height = width`) of the spatial max pooling used in the next-to-last column. Defaults to 3.
+   * `poolStride` : stride (`height = width`) of the spatial max pooling. Defaults to 1.
+
+
+For a complete example using this module, refer to the following :
+ * [deep inception training script](https://github.com/nicholas-leonard/dp/blob/master/examples/deepinception.lua) ;
+ * [openface facial recognition](https://github.com/cmusatyalab/openface) (the model definition is [here](https://github.com/cmusatyalab/openface/blob/master/models/openface/nn4.def.lua)).
+
+<a name='nn.Collapse'></a>
+## Collapse ##
+
+```lua
+module = nn.Collapse(nInputDim)
+```
+
+This module is the equivalent of:
+```
+view = nn.View(-1)
+view:setNumInputDim(nInputDim)
+```
+It collapses all non-batch dimensions. This is useful for converting
+a spatial feature map to the single dimension required by a dense
+hidden layer like Linear.
+
+<a name='nn.Convert'></a>
+## Convert ##
+
+```lua
+module = nn.Convert([inputShape, outputShape])
+```
+Module to convert between different data formats.
+For example, we can flatten images by using :
+```lua
+module = nn.Convert('bchw', 'bf')
+```
+or equivalently
+```lua
+module = nn.Convert('chw', 'f')
+```
+Let's try it with an input:
+```lua
+print(module:forward(torch.randn(3,2,3,1)))
+ 0.5692 -0.0190  0.5243  0.7530  0.4230  1.2483
+-0.9142  0.6013  0.5608 -1.0417 -1.4014  1.0177
+-1.5207 -0.1641 -0.4166  1.4810 -1.1725 -1.0037
+[torch.DoubleTensor of size 3x6]
+```
+You could also try:
+
+```lua
+module = nn.Convert('chw', 'hwc')
+input = torch.randn(1,2,3,2)
+input:select(2,1):fill(1)
+input:select(2,2):fill(2)
+print(input)
+(1,1,.,.) =
+  1  1
+  1  1
+  1  1
+(1,2,.,.) =
+  2  2
+  2  2
+  2  2
+[torch.DoubleTensor of size 1x2x3x2]
+print(module:forward(input))
+(1,1,.,.) =
+  1  2
+  1  2
+
+(1,2,.,.) =
+  1  2
+  1  2
+
+(1,3,.,.) =
+  1  2
+  1  2
+[torch.DoubleTensor of size 1x3x2x2]
+```
+
+
+Furthermore, it automatically converts the `input` to have the same type as `self.output`
+(i.e. the type of the module).
+So you can also just use it for automatic input type conversions:
+```lua
+module = nn.Convert()
+print(module.output) -- type of module
+[torch.DoubleTensor with no dimension]
+input = torch.FloatTensor{1,2,3}
+print(module:forward(input))
+ 1
+ 2
+ 3
+[torch.DoubleTensor of size 3]
+```
+
+<a name='nn.ZipTable'></a>
+## ZipTable ##
+
+```lua
+module = nn.ZipTable()
+```
+
+Zips a table of tables into a table of tables.
+
+Example:
+```lua
+print(module:forward{ {'a1','a2'}, {'b1','b2'}, {'c1','c2'} })
+{ {'a1','b1','c1'}, {'a2','b2','c2'} }
+```
+
+<a name='nn.ZipTableOneToMany'></a>
+## ZipTableOneToMany ##
+
+```lua
+module = nn.ZipTableOneToMany()
+```
+
+Zips a table of element `el` and table of elements `tab` into a table of tables, where the i-th table contains the element `el` and the i-th element in table `tab`
+
+Example:
+```lua
+print(module:forward{ 'el', {'a','b','c'} })
+{ {'el','a'}, {'el','b'}, {'el','c'} }
+```
+
+<a name='nn.CAddTensorTable'></a>
+## CAddTensorTable ##
+
+```lua
+module = nn.CAddTensorTable()
+```
+
+Adds the first element `el` of the input table `tab` to each tensor contained in the second element of `tab`, which is itself a table
+
+Example:
+```lua
+print(module:forward{ (0,1,1), {(0,0,0),(1,1,1)} })
+{ (0,1,1), (1,2,2) }
+```
+
+
+<a name='nn.ReverseTable'></a>
+## ReverseTable ##
+
+```lua
+module = nn.ReverseTable()
+```
+
+Reverses the order of elements in a table.
+
+Example:
+
+```lua
+print(module:forward{1,2,3,4})
+{4,3,2,1}
+```
+
+<a name='nn.PrintSize'></a>
+## PrintSize ##
+
+```lua
+module = nn.PrintSize(name)
+```
+
+This module is useful for debugging complicated module composites.
+It prints the size of the `input` and `gradOutput` during `forward`
+and `backward` propagation respectively.
+The `name` is a string used to identify the module alongside the printed size.
+
+<a name='nn.Clip'></a>
+## Clip ##
+
+```lua
+module = nn.Clip(minval, maxval)
+```
+
+This module clips `input` values such that the output is between `minval` and `maxval`.
+
+<a name='nn.Constant'></a>
+## Constant ##
+
+```lua
+module = nn.Constant(value, nInputDim)
+```
+
+This module outputs a constant value given an input.
+If `nInputDim` is specified, it uses the input to determine the size of the batch.
+The `value` is then replicated over the batch.
+Otherwise, the `value` Tensor is output as is.
+During `backward`, the returned `gradInput` is a zero Tensor of the same size as the `input`.
+This module has no trainable parameters.
+
+You can use this with nn.ConcatTable() to append constant inputs to an input :
+
+```lua
+nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity())
+```
+
+This is useful when you want to output a value that is independent of the
+input to the neural network (see [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)).
+
+<a name='nn.SpatialUniformCrop'></a>
+## SpatialUniformCrop ##
+
+```lua
+module = nn.SpatialUniformCrop(oheight, owidth)
+```
+
+During training, this module will output a cropped patch of size `oheight, owidth`
+within the boundaries of the `input` image.
+For each example, a location is sampled from a uniform distribution
+such that each possible patch has an equal probability of being sampled.
+
+During evaluation, the center patch is cropped and output.
+
+This module is commonly used at the input layer to artificially
+augment the size of the dataset to prevent overfitting.
+
+<a name='nn.SpatialGlimpse'></a>
+## SpatialGlimpse ##
+Ref. A. [Recurrent Model for Visual Attention](http://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf)
+
+```lua
+module = nn.SpatialGlimpse(size, depth, scale)
+```
+
+A glimpse is the concatenation of down-scaled cropped images of
+increasing scale around a given location in a given image.
+The input is a pair of Tensors: `{image, location}`
+`location` are `(y,x)` coordinates of the center of the different scales
+of patches to be cropped from image `image`.
+Coordinates are between `(-1,-1)` (top-left) and `(1,1)` (bottom-right).
+The `output` is a batch of glimpses taken in image at location `(y,x)`.
+
+`size` can be either a scalar which specifies the `width = height` of glimpses,
+or a table of `{height, width}` to support a rectangular shape of glimpses.
+`depth` is number of patches to crop per glimpse (one patch per depth).
+`scale` determines the `size(t) = scale * size(t-1)` of successive cropped patches.
+
+So basically, this module can be used to focus the attention of the model
+on a region of the input `image`.
+It is commonly used with the [RecurrentAttention](https://github.com/Element-Research/rnn#rnn.RecurrentAttention)
+module (see [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)).
+
+<a name='nn.WhiteNoise'></a>
+## WhiteNoise ##
+
+```lua
+module = nn.WhiteNoise([mean, stdev])
+```
+
+Useful in training [Denoising Autoencoders](http://arxiv.org/pdf/1507.02672v1.pdf).
+Takes `mean` and `stdev` of the normal distribution as input.
+Default values for mean and standard deviation are 0 and 0.1 respectively.
+With `module:training()`, noise is added during forward.
+During `backward`, gradients are passed through as is.
+With `module:evaluate()` the mean is added to the input.
+
+<a name='nn.SpatialRegionDropout'></a>
+## SpatialRegionDropout ##
+
+```lua
+module = nn.SpatialRegionDropout(p)
+```
+Following is an example of `SpatialRegionDropout` outputs on the famous lena image.
+
+**Input**
+
+![Lena](tutorials/lena.jpg)
+
+**Outputs**
+
+![Lena](tutorials/srd1.jpg)           ![Lena](tutorials/srd2.jpg)
+
+<a name='nn.FireModule'></a>
+## FireModule ##
+Ref: http://arxiv.org/pdf/1602.07360v1.pdf
+```lua
+module = nn.FireModule(nInputPlane, s1x1, e1x1, e3x3, activation)
+```
+FireModule is comprised of two submodules 1) A *squeeze* convolution module comprised of `1x1` filters followed by 2) an *expand* module that is comprised of a mix of `1x1` and `3x3` convolution filters.
+Arguments: `s1x1`: number of `1x1` filters in the squeeze submodule, `e1x1`: number of `1x1` filters in the expand submodule, `e3x3`: number of `3x3` filters in the expand submodule. It is recommended that `s1x1` be less than `(e1x1+e3x3)` if you want to limit the number of input channels to the `3x3` filters in the expand submodule.
+FireModule works only with batches, for single sample convert the sample to a batch of size 1.
+
+<a name='nn.SpatialFeatNormalization'></a>
+## SpatialFeatNormalization ##
+```lua
+module = nn.SpatialFeatNormalization(mean, std)
+```
+This module normalizes each feature channel of the input image based on its corresponding mean and standard deviation scalar values. This module does not learn the `mean` and `std`; they are provided as arguments.
+
+<a name='nn.SpatialBinaryConvolution'></a>
+## SpatialBinaryConvolution ##
+
+```lua
+module = nn.SpatialBinaryConvolution(nInputPlane, nOutputPlane, kW, kH)
+```
+Functioning of SpatialBinaryConvolution is similar to nn/SpatialConvolution. Only difference is that Binary weights are used for forward/backward and floating point weights are used for weight updates. Check **Binary-Weight-Network** section of [XNOR-net](http://arxiv.org/pdf/1603.05279v2.pdf).
+
+<a name='nn.SimpleColorTransform'></a>
+## SimpleColorTransform ##
+
+```lua
+range = torch.rand(inputChannels) -- Typically range is specified by user.
+module = nn.SimpleColorTransform(inputChannels, range)
+```
+This module performs a simple data augmentation technique. The SimpleColorTransform module adds random noise to each color channel independently. A more advanced data augmentation technique adds noise using the principal components of the color channels; for that, please check **PCAColorTransform**.
+
+<a name='nn.PCAColorTransform'></a>
+## PCAColorTransform ##
+
+```lua
+eigenVectors = torch.rand(inputChannels, inputChannels) -- Eigen Vectors
+eigenValues = torch.rand(inputChannels) -- Eigen Values
+std = 0.1 -- Std deviation of normal distribution with mean zero for noise.
+module = nn.PCAColorTransform(inputChannels, eigenVectors, eigenValues, std)
+```
+This module performs data augmentation using Principal Component Analysis of pixel values. When in training mode, multiples of principal components are added to input image pixels. The magnitude of the value added (noise) depends on the corresponding eigen value and a random value sampled from a Gaussian distribution with mean zero and `std` (default 0.1) standard deviation. This technique was used in the famous [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) paper.
+
+<a name = 'nn.OneHot'></a>
+## OneHot ##
+
+```lua
+module = nn.OneHot(outputSize)
+```
+
+Transforms a tensor of `input` indices having integer values between 1 and `outputSize` into
+a tensor of one-hot vectors of size `outputSize`.
+
+Forward an index to get a one-hot vector :
+
+```lua
+> module = nn.OneHot(5) -- 5 classes
+> module:forward(torch.LongTensor{3})
+ 0  0  1  0  0
+[torch.DoubleTensor of size 1x5]
+```
+
+Forward a batch of 3 indices. Notice that these need not be stored as `torch.LongTensor` :
+
+```lua
+> module:forward(torch.Tensor{3,2,1})
+ 0  0  1  0  0
+ 0  1  0  0  0
+ 1  0  0  0  0
+[torch.DoubleTensor of size 3x5]
+```
+
+Forward batch of `2 x 3` indices :
+
+```lua
+> module:forward(torch.Tensor{{3,2,1},{1,2,3}})
+(1,.,.) =
+  0  0  1  0  0
+  0  1  0  0  0
+  1  0  0  0  0
+
+(2,.,.) =
+  1  0  0  0  0
+  0  1  0  0  0
+  0  0  1  0  0
+[torch.DoubleTensor of size 2x3x5]
+```
+
+<a name='nn.Kmeans'></a>
+## Kmeans ##
+
+```lua
+km = nn.Kmeans(k, dim)
+```
+
+`k` is the number of centroids and `dim` is the dimensionality of samples.
+You can either initialize centroids randomly from input samples or by using *kmeans++* algorithm.
+
+```lua
+km:initRandom(samples) -- Randomly initialize centroids from input samples.
+km:initKmeansPlus(samples) -- Use Kmeans++ to initialize centroids.
+```
+
+Example showing how to use Kmeans module to do standard Kmeans clustering.
+
+```lua
+attempts = 10
+iter = 100 -- Number of iterations
+bestKm = nil
+bestLoss = math.huge
+learningRate = 1
+for j=1, attempts do
+   local km = nn.Kmeans(k, dim)
+   km:initKmeansPlus(samples)
+   for i=1, iter do
+      km:zeroGradParameters()
+      km:forward(samples) -- sets km.loss
+      km:backward(samples, gradOutput) -- gradOutput is ignored
+
+      -- Gradient Descent weight/centroids update
+      km:updateParameters(learningRate)
+   end
+
+   if km.loss < bestLoss then
+      bestLoss = km.loss
+      bestKm = km:clone()
+   end
+end
+```
+The `nn.Kmeans()` module maintains the loss only for the latest forward. If you want to maintain the loss over the whole dataset, then you would need to do it by adding up the module loss for every forward.
+
+You can also use `nn.Kmeans()` as an auxiliary layer in your network.
+A call to `forward` will generate an `output` containing the index of the nearest cluster for each sample in the batch.
+The `gradInput` generated by `updateGradInput` will be zero.
+
+<a name='nn.ModuleCriterion'></a>
+## ModuleCriterion ##
+
+```lua
+criterion = nn.ModuleCriterion(criterion [, inputModule, targetModule, castTarget])
+```
+
+This criterion decorates a `criterion` by allowing the `input` and `target` to be
+fed through an optional `inputModule` and `targetModule` before being passed to the
+`criterion`. The `inputModule` must not contain parameters as these would not be updated.
+
+When `castTarget = true` (the default), the `targetModule` is cast along with the `inputModule` and
+`criterion`. Otherwise, the `targetModule` isn't.
+
+<a name='nn.NCEModule'></a>
+## NCEModule
+Ref. A [RNNLM training with NCE for Speech Recognition](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf)
+
+```lua
+ncem = nn.NCEModule(inputSize, outputSize, k, unigrams, [Z])
+```
+
+When used in conjunction with [NCECriterion](#nn.NCECriterion),
+the `NCEModule` implements [noise-contrastive estimation](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf).
+
+The point of the NCE is to speedup computation for large `Linear` + `SoftMax` layers.
+Computing a forward/backward for `Linear(inputSize, outputSize)` for a large `outputSize` can be very expensive.
+This is common when implementing language models with large vocabularies of a million words.
+In such cases, NCE can be an efficient alternative to computing the full `Linear` + `SoftMax` during training and
+cross-validation.
+
+The `inputSize` and `outputSize` are the same as for the `Linear` module.
+The number of noise samples to be drawn per example is `k`. A value of 25 should work well.
+Increasing it will yield better results, while a smaller value will be more efficient to process.
+The `unigrams` is a tensor of size `outputSize` that contains the frequencies or probability distribution over classes.
+It is used to sample noise samples via a fast implementation of `torch.multinomial`.
+The `Z` is the normalization constant of the approximated SoftMax.
+The default is `math.exp(9)` as specified in Ref. A.
+
+For inference, or measuring perplexity, the full `Linear` + `SoftMax` will need to
+be computed. The `NCEModule` can do this by switching on the following :
+
+```lua
+ncem:evaluate()
+ncem.normalized = true
+```
+
+Furthermore, to simulate `Linear` + `LogSoftMax` instead, one need only add the following to the above:
+
+```lua
+ncem.logsoftmax = true
+```
+
+An example is provided via the rnn package.
+
+<a name='nn.NCECriterion'></a>
+## NCECriterion
+
+```lua
+ncec = nn.NCECriterion()
+```
+
+This criterion only works with an [NCEModule](#nn.NCEModule) on the output layer.
+Together, they implement [noise-contrastive estimation](https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf).
+
+
+<a name='nn.Reinforce'></a>
+## Reinforce ##
+Ref A. [Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning](http://incompleteideas.net/sutton/williams-92.pdf)
+
+Abstract class for modules that implement the REINFORCE algorithm (ref. A).
+
+```lua
+module = nn.Reinforce([stochastic])
+```
+
+The `reinforce(reward)` method is called by a special Reward Criterion (e.g. [VRClassReward](#nn.VRClassReward)).
+After which, when backward is called, the reward will be used to generate gradInputs.
+When `stochastic=true`, the module is stochastic (i.e. samples from a distribution)
+during evaluation and training.
+When `stochastic=false` (the default), the module is only stochastic during training.
+
+The REINFORCE rule for a module can be summarized as follows :
+```lua
+            d ln(f(output,input))
+gradInput = ---------------------  * reward
+                  d input
+```
+where the `reward` is what is provided by a Reward criterion like
+[VRClassReward](#nn.VRClassReward) via the [reinforce](#nn.Module.reinforce) method.
+The criterion will normally be responsible for the following formula :
+```lua
+reward = a*(R - b)
+```
+where `a` is the alpha of the original paper, i.e. a reward scale,
+`R` is the raw reward (usually 0 or 1), and `b` is the baseline reward,
+which is often taken to be the expected raw reward `R`.
+
+The `output` is usually sampled from a probability distribution `f()`
+parameterized by the `input`.
+See [ReinforceBernoulli](#nn.ReinforceBernoulli) for a concrete derivation.
+
+Also, as you can see, the gradOutput is ignored. So within a backpropagation graph,
+the `Reinforce` modules will replace the backpropagated gradients (`gradOutput`)
+with their own obtained from the broadcasted `reward`.
+
+<a name='nn.ReinforceBernoulli'></a>
+## ReinforceBernoulli ##
+Ref A. [Simple Statistical Gradient-Following Algorithms for
+Connectionist Reinforcement Learning](http://incompleteideas.net/sutton/williams-92.pdf)
+
+```lua
+module = nn.ReinforceBernoulli([stochastic])
+```
+
+A [Reinforce](#nn.Reinforce) subclass that implements the REINFORCE algorithm
+(ref. A p.230-236) for the Bernoulli probability distribution.
+Inputs are bernoulli probabilities `p`.
+During training, outputs are samples drawn from this distribution.
+During evaluation, when `stochastic=false`, outputs are the same as the inputs.
+Uses the REINFORCE algorithm (ref. A p.230-236) which is
+implemented through the [reinforce](#nn.Module.reinforce) interface (`gradOutputs` are ignored).
+
+Given the following variables :
+
+ * `f` : bernoulli probability mass function
+ * `x` : the sampled values (0 or 1) (i.e. `self.output`)
+ * `p` : probability of sampling a 1
+
+the derivative of the log bernoulli w.r.t. probability `p` is :
+```
+d ln(f(output,input))   d ln(f(x,p))    (x - p)
+--------------------- = ------------ = ---------
+      d input               d p         p(1 - p)
+```
+
+<a name='nn.ReinforceNormal'></a>
+## ReinforceNormal ##
+Ref A. [Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning](http://incompleteideas.net/sutton/williams-92.pdf)
+
+```lua
+module = nn.ReinforceNormal(stdev, [stochastic])
+```
+
+A [Reinforce](#nn.Reinforce) subclass that implements the REINFORCE algorithm
+(ref. A p.238-239) for a Normal (i.e. Gaussian) probability distribution.
+Inputs are the means of the normal distribution.
+The `stdev` argument specifies the standard deviation of the distribution.
+During training, outputs are samples drawn from this distribution.
+During evaluation, when `stochastic=false`, outputs are the same as the inputs, i.e. the means.
+Uses the REINFORCE algorithm (ref. A p.238-239) which is
+implemented through the [reinforce](#nn.Module.reinforce) interface (`gradOutputs` are ignored).
+
+Given the following variables :
+
+  * `f` : normal probability density function
+  * `x` : the sampled values (i.e. `self.output`)
+  * `u` : mean (`input`)
+  * `s` : standard deviation (`self.stdev`)
+
+the derivative of log normal w.r.t. mean `u` is :
+```
+d ln(f(x,u,s))   (x - u)
+-------------- = -------
+     d u           s^2
+```
+
+As an example, it is used to sample locations for the [RecurrentAttention](https://github.com/Element-Research/rnn#rnn.RecurrentAttention)
+module (see [this example](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua)).
+
+<a name='nn.ReinforceGamma'></a>
+## ReinforceGamma ##
+Ref A. [Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning](http://incompleteideas.net/sutton/williams-92.pdf)
+
+```lua
+module = nn.ReinforceGamma(scale, [stochastic])
+```
+
+A [Reinforce](#nn.Reinforce) subclass that implements the REINFORCE algorithm
+(ref. A) for a [Gamma probability distribution](https://en.wikipedia.org/wiki/Gamma_distribution)
+parametrized by shape (k) and scale (theta) variables.
+Inputs are the shapes of the gamma distribution.
+During training, outputs are samples drawn from this distribution.
+During evaluation, when `stochastic=false`, outputs are equal to the mean, defined as the product of
+shape and scale, i.e. `k*theta`.
+Uses the REINFORCE algorithm (ref. A) which is
+implemented through the [reinforce](#nn.Module.reinforce) interface (`gradOutputs` are ignored).
+
+Given the following variables :
+
+  * `f` : gamma probability density function
+  * `g` : digamma function
+  * `x` : the sampled values (i.e. `self.output`)
+  * `k` : shape (`input`)
+  * `t` : scale
+
+the derivative of log gamma w.r.t. shape `k` is :
+```
+d ln(f(x,k,t))
+-------------- = ln(x) - g(k) - ln(t)
+      d k
+```
+
+<a name='nn.ReinforceCategorical'></a>
+## ReinforceCategorical ##
+Ref A. [Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning](http://incompleteideas.net/sutton/williams-92.pdf)
+
+```lua
+module = nn.ReinforceCategorical([stochastic])
+```
+
+A [Reinforce](#nn.Reinforce) subclass that implements the REINFORCE algorithm
+(ref. A) for a Categorical (i.e. Multinomial with one sample) probability distribution.
+Inputs are the categorical probabilities of the distribution : `p[1], p[2], ..., p[k]`.
+These are usually the output of a SoftMax.
+For `n` categories, both the `input` and `output` are of size `batchSize x n`.
+During training, outputs are samples drawn from this distribution.
+The outputs are returned in one-hot encoding i.e.
+the output for each example has exactly one category having a 1, while the remainder are zero.
+During evaluation, when `stochastic=false`, outputs are the same as the inputs, i.e. the probabilities `p`.
+Uses the REINFORCE algorithm (ref. A) which is
+implemented through the [reinforce](#nn.Module.reinforce) interface (`gradOutputs` are ignored).
+
+
+Given the following variables :
+
+  * `f` : categorical probability mass function
+  * `x` : the sampled indices (one per sample) (`self.output` is the one-hot encoding of these indices)
+  * `p` : probability vector (`p[1], p[2], ..., p[k]`) (`input`)
+
+the derivative of log categorical w.r.t. probability vector `p` is :
+```
+d ln(f(x,p))     1/p[i]    if i = x
+------------ =
+    d p          0         otherwise
+```
+
+<a name='nn.VRClassReward'></a>
+## VRClassReward ##
+Ref A. [Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning](http://incompleteideas.net/sutton/williams-92.pdf)
+
+This Reward criterion implements the REINFORCE algorithm (ref. A) for classification models.
+Specifically, it is a Variance Reduced (VR) classification reinforcement learning (reward-based) criterion.
+
+```lua
+vcr = nn.VRClassReward(module [, scale, criterion])
+```
+
+While it conforms to the Criterion interface (which it inherits),
+it does not backpropagate gradients (except for the baseline `b`; see below).
+Instead, a `reward` is broadcast to the `module` via the [reinforce](#nn.Module.reinforce) method.
+
+The criterion implements the following formula :
+```lua
+reward = a*(R - b)
+```
+where `a` is the alpha described in Ref. A, i.e. a reward `scale` (defaults to 1),
+`R` is the raw reward (0 for incorrect and 1 for correct classification),
+and `b` is the baseline reward, which is often taken to be the expected raw reward `R`.
+
+The `target` of the criterion is a tensor of class indices.
+The `input` to the criterion is a table `{y,b}` where `y` is the probability
+(or log-probability) of classes (usually the output of a SoftMax),
+and `b` is the baseline reward discussed above.
+
+For each example, if `argmax(y)` is equal to the `target` class, the raw reward `R = 1`, otherwise `R = 0`.
+
+As for `b`, its `gradInputs` are obtained from the `criterion`, which defaults to `MSECriterion`.
+The `criterion`'s target is the commensurate raw reward `R`.
+Using `a*(R-b)` instead of `a*R` to obtain a `reward` is what makes this class variance reduced (VR).
+By reducing the variance, the training can converge faster (Ref. A).
+The predicted `b` can be nothing more than the expectation `E(R)`.
+
+Note : for RNNs with R = 1 for last step in sequence, encapsulate it
+in `nn.ModuleCriterion(VRClassReward, nn.SelectTable(-1))`.
+
+For an example, this criterion is used along with the [RecurrentAttention](https://github.com/Element-Research/rnn#rnn.RecurrentAttention)
+module to [train a recurrent model for visual attention](https://github.com/Element-Research/rnn/blob/master/examples/recurrent-visual-attention.lua).
+
+<a name='nn.BinaryClassReward'></a>
+## BinaryClassReward ##
+
+```lua
+bcr = nn.BinaryClassReward(module [, scale, criterion])
+```
+
+This module implements [VRClassReward](#nn.VRClassReward) for binary classification problems.
+So basically, the `input` is still a table of two tensors.
+The first input tensor is of size `batchsize` containing Bernoulli probabilities.
+The second input tensor is the baseline prediction described in `VRClassReward`.
+The targets contain zeros and ones.
+
+<a name='nn.BLR'></a>
+## BinaryLogisticRegression ##
+Ref A. [Learning to Segment Object Candidates](http://arxiv.org/pdf/1506.06204v2.pdf)
+This criterion implements the score criterion mentioned in (ref. A).
+
+```lua
+criterion = nn.BinaryLogisticRegression()
+```
+
+BinaryLogisticRegression implements following cost function for binary classification.
+
+```
+
+ log( 1 + exp( -y_k * score(x_k) ) )
+
+```
+where `y_k` is the binary target and `score(x_k)` is the corresponding prediction. `y_k` has value `{-1, +1}` and `score(x_k)` has value in `[-1, +1]`.
+
+<a name='nn.SpatialBLR'></a>
+## SpatialBinaryLogisticRegression ##
+Ref A. [Learning to Segment Object Candidates](http://arxiv.org/pdf/1506.06204v2.pdf)
+
+This criterion implements the spatial component of the criterion mentioned in  (ref. A).
+
+```lua
+criterion = nn.SpatialBinaryLogisticRegression()
+```
+
+SpatialBinaryLogisticRegression implements following cost function for binary pixel classification.
+```
+   1
+_______ sum_ij [ log( 1 + exp( -m_ij * f_ij ) ) ]
+ 2*w*h
+```
+where `m_ij` is target binary image and `f_ij` is the corresponding prediction. `m_ij` has value `{-1, +1}` and `f_ij` has value in `[-1, +1]`.
+
diff --git a/Reinforce.lua b/Reinforce.lua
new file mode 100644
index 0000000..d7e5f93
--- /dev/null
+++ b/Reinforce.lua
@@ -0,0 +1,52 @@
+------------------------------------------------------------------------
+--[[ Reinforce ]]--
+-- Ref A. http://incompleteideas.net/sutton/williams-92.pdf
+-- Abstract class for modules that use the REINFORCE algorithm (ref A).
+-- The reinforce(reward) method is called by a special Reward Criterion.
+-- After which, when backward is called, the reward will be used to 
+-- generate gradInputs. The gradOutput is usually ignored.
+------------------------------------------------------------------------
+local Reinforce, parent = torch.class("nn.Reinforce", "nn.Module") -- parent is used below to reach the inherited nn.Module methods
+
+function Reinforce:__init(stochastic)
+   parent.__init(self)
+   -- Constructor. stochastic is stored exactly as given (nil when omitted) : true makes the
+   -- module stochastic during evaluation and training, false only during training.
+   self.stochastic = stochastic
+end
+
+-- Called by a Reward Criterion : passes reward on to parent.reinforce, then keeps it for later use.
+function Reinforce:reinforce(reward)
+   parent.reinforce(self, reward)
+   self.reward = reward -- read back by rewardAs, which asserts that it is a 1D tensor
+end
+
+function Reinforce:updateOutput(input)
+   return self.output:set(input) -- default forward : output is set to input; set() yields self.output, which must be returned
+end
+
+function Reinforce:updateGradInput(input, gradOutput)
+   local reward = self:rewardAs(input) -- gradOutput is not read : gradInput is the reward as sized by rewardAs
+   return self.gradInput:resizeAs(reward):copy(reward) -- returned so callers of backward receive gradInput
+end
+
+-- Helper for updateGradInput : returns the 1D self.reward, expanded when needed to the size of input.
+function Reinforce:rewardAs(input)
+   local reward = self.reward
+   assert(reward:dim() == 1)
+   if input:isSameSizeAs(reward) then return reward end
+   local nReward = reward:size(1)
+   if nReward ~= input:size(1) then
+      -- first sizes differ : assume input is in online-mode and convert it with toBatch
+      input = self:toBatch(input, input:dim())
+      assert(nReward == input:size(1), nReward.." ~= "..input:size(1))
+   end
+   self._reward = self._reward or reward.new()
+   self.__reward = self.__reward or reward.new()
+   -- shape is {nReward, 1, ..., 1}, with one entry per dimension of input
+   local shape = input:size():fill(1):totable()
+   shape[1] = nReward
+   self._reward:view(reward, table.unpack(shape))
+   self.__reward:expandAs(self._reward, input)
+   return self.__reward
+end
diff --git a/ReinforceBernoulli.lua b/ReinforceBernoulli.lua
new file mode 100644
index 0000000..a8ca0e6
--- /dev/null
+++ b/ReinforceBernoulli.lua
@@ -0,0 +1,51 @@
+------------------------------------------------------------------------
+--[[ ReinforceBernoulli ]]-- 
+-- Ref A. http://incompleteideas.net/sutton/williams-92.pdf
+-- Inputs are bernoulli probabilities (p) 
+-- Outputs are samples drawn from this distribution.
+-- Uses the REINFORCE algorithm (ref. A p.230-236) which is 
+-- implemented through the nn.Module:reinforce(reward) interface.
+-- gradOutputs are ignored (REINFORCE algorithm).
+------------------------------------------------------------------------
+local ReinforceBernoulli, parent = torch.class("nn.ReinforceBernoulli", "nn.Reinforce")
+
+-- Forward pass.
+-- input : tensor of Bernoulli probabilities p.
+-- When sampling is active (self.stochastic is set, or self.train is not
+-- false) the output holds 0/1 samples with P(output = 1) = input;
+-- otherwise the output is a copy of the probabilities themselves.
+function ReinforceBernoulli:updateOutput(input)
+   self.output:resizeAs(input)
+
+   local sampling = self.stochastic or self.train ~= false
+   if not sampling then
+      -- use p for evaluation
+      self.output:copy(input)
+      return self.output
+   end
+
+   -- draw u ~ U(0,1) per element; the sample is 1 exactly where u < p
+   local noise = self._uniform or input.new()
+   self._uniform = noise
+   noise:resizeAs(input):uniform(0,1)
+   self.output:lt(noise, input)
+   return self.output
+end
+
+-- Backward pass (REINFORCE).
+-- input      : tensor of Bernoulli probabilities p
+-- gradOutput : ignored
+-- Returns self.gradInput = -reward * (x - p) / (p(1 - p) + eps), where x is
+-- self.output and reward is self.reward broadcast by rewardAs(input).
+function ReinforceBernoulli:updateGradInput(input, gradOutput)
+   -- Note that gradOutput is ignored
+   -- f : bernoulli probability mass function
+   -- x : the sampled values (0 or 1) (self.output)
+   -- p : probability of sampling a 1
+   -- derivative of log bernoulli w.r.t. p
+   -- d ln(f(x,p))    (x - p)
+   -- ------------ = ---------
+   --     d p         p(1 - p)
+   self.gradInput:resizeAs(input)
+   -- (x - p)
+   self.gradInput:copy(self.output):add(-1, input)
+   -- divide by p(1 - p)
+   self._div = self._div or input.new()
+   self._div:resizeAs(input)
+   self._div:fill(1):add(-1, input):cmul(input)
+   -- prevent division by zero when p is exactly 0 or 1 (same epsilon as
+   -- nn.ReinforceCategorical); without it those entries evaluate to 0/0 = NaN
+   self._div:add(0.00000001)
+   self.gradInput:cdiv(self._div)
+   
+   -- multiply by reward 
+   self.gradInput:cmul(self:rewardAs(input))
+   -- multiply by -1 ( gradient descent on input )
+   self.gradInput:mul(-1)
+   return self.gradInput
+end
+
+
diff --git a/ReinforceCategorical.lua b/ReinforceCategorical.lua
new file mode 100644
index 0000000..7f66e21
--- /dev/null
+++ b/ReinforceCategorical.lua
@@ -0,0 +1,57 @@
+------------------------------------------------------------------------
+--[[ ReinforceCategorical ]]-- 
+-- Ref A. http://incompleteideas.net/sutton/williams-92.pdf
+-- Inputs are a vector of categorical prob : (p[1], p[2], ..., p[k]) 
+-- Outputs are samples drawn from this distribution (one-hot encoded).
+-- Uses the REINFORCE algorithm (ref. A sec 6. p.230-236) which is 
+-- implemented through the nn.Module:reinforce(reward) interface.
+-- gradOutputs are ignored (REINFORCE algorithm).
+------------------------------------------------------------------------
+local ReinforceCategorical, parent = torch.class("nn.ReinforceCategorical", "nn.Reinforce")
+
+-- Forward pass.
+-- input : categorical probabilities, one distribution per row (2D) or a
+--         single distribution (1D).
+-- When sampling is active (self.stochastic is set, or self.train is not
+-- false) the output is the one-hot encoding of one category sampled per
+-- distribution; otherwise the output is a copy of the probabilities.
+function ReinforceCategorical:updateOutput(input)
+   self.output:resizeAs(input)
+   self._index = self._index or ((torch.type(input) == 'torch.CudaTensor') and torch.CudaTensor() or torch.LongTensor())
+   if self.stochastic or self.train ~= false then
+      -- sample from categorical with p = input
+      -- (the epsilon used against division by zero is only needed, and only
+      -- applied, in updateGradInput; sampling reads the raw probabilities)
+      input.multinomial(self._index, input, 1)
+      -- one hot encoding along the category (last) dimension, so that both
+      -- batched (2D) and single (1D) inputs are handled
+      self.output:zero()
+      self.output:scatter(input:dim(), self._index, 1)
+   else
+      -- use p for evaluation
+      self.output:copy(input)
+   end
+   return self.output
+end
+
+-- Backward pass (REINFORCE); gradOutput is ignored.
+--   f : categorical probability mass function
+--   x : the sample produced by updateOutput (self.output)
+--   p : probability vector (p[1], p[2], ..., p[k])
+-- derivative of log categorical w.r.t. p
+--   d ln(f(x,p))     1/p[i]    if i = x
+--   ------------ =
+--       d p          0         otherwise
+-- Returns self.gradInput = -reward * self.output / (p + eps).
+function ReinforceCategorical:updateGradInput(input, gradOutput)
+   local grad = self.gradInput
+   grad:resizeAs(input):zero()
+   grad:copy(self.output)
+
+   -- p + eps : prevents a division by zero error
+   local smoothed = self._input or input.new()
+   self._input = smoothed
+   smoothed:resizeAs(input):copy(input):add(0.00000001)
+   grad:cdiv(smoothed)
+
+   -- scale by the reward, then negate ( gradient descent on input )
+   grad:cmul(self:rewardAs(input))
+   grad:mul(-1)
+   return self.gradInput
+end
+
+-- Type conversion. The cached index buffer is dropped before delegating to
+-- the parent; updateOutput re-creates it lazily based on the input's type.
+-- Returns whatever the parent's type() returns.
+function ReinforceCategorical:type(type, tc)
+   self._index = nil
+   return parent.type(self, type, tc)
+end
diff --git a/ReinforceGamma.lua b/ReinforceGamma.lua
new file mode 100644
index 0000000..8d21a23
--- /dev/null
+++ b/ReinforceGamma.lua
@@ -0,0 +1,129 @@
+------------------------------------------------------------------------
+--[[ ReinforceGamma ]]-- 
+-- Ref A. http://incompleteideas.net/sutton/williams-92.pdf
+-- Inputs are the shape (k) and scale (theta) of a multivariate Gamma distribution.
+-- Outputs are samples drawn from these distributions.
+-- Scale is provided as a constructor argument, or as the second element of
+-- an input table {shape, scale}.
+-- Uses the REINFORCE algorithm (ref. A sec 6. p.237-239) which is 
+-- implemented through the nn.Module:reinforce(reward) interface.
+-- gradOutputs are ignored (REINFORCE algorithm).
+------------------------------------------------------------------------
+
+
+local ReinforceGamma, parent = torch.class("nn.ReinforceGamma", "nn.Reinforce")
+
+-- Constructor.
+-- scale      : optional scale of the distribution (a number or a tensor, see
+--              updateOutput). When omitted, the module expects {shape, scale}
+--              table inputs and gradInput becomes a table of two tensors.
+-- stochastic : forwarded to the parent constructor.
+function ReinforceGamma:__init(scale, stochastic)
+   require('randomkit') -- needed to sample gamma dist : luarocks install randomkit
+   require('cephes') -- needed to compute digamma for gradient
+   parent.__init(self, stochastic)
+   self.scale = scale
+   if not scale then
+      self.gradInput = {torch.Tensor(), torch.Tensor()}
+   end
+end
+
+-- Forward pass.
+-- input : either the shape tensor (scale then comes from the constructor),
+--         or a table {shape, scale}.
+-- scale may be a number, a tensor of the same size as shape, or a tensor
+-- with one dimension less than shape, which is then expanded along a new
+-- leading dimension (the expanded view is kept in self.__scale).
+-- When sampling is active the output holds Gamma samples, otherwise it
+-- holds shape * scale.
+function ReinforceGamma:updateOutput(input)
+   local shape, scale = input, self.scale
+   if torch.type(input) == 'table' then
+      -- input is {shape, scale}
+      assert(#input == 2)
+      shape, scale = unpack(input)
+   end
+   assert(scale)
+   
+   self.output:resizeAs(shape)
+
+   if torch.type(scale) == 'number' then
+      scale = shape.new():resizeAs(shape):fill(scale)
+   elseif torch.isTensor(scale) then
+      if scale:dim() == shape:dim() then
+         assert(scale:isSameSizeAs(shape))
+      else
+         assert(scale:dim()+1 == shape:dim())
+         self._scale = self._scale or scale.new()
+         self._scale:view(scale,1,table.unpack(scale:size():totable()))
+         self.__scale = self.__scale or scale.new()
+         self.__scale:expandAs(self._scale, shape)
+         scale = self.__scale
+      end
+   else
+      -- it is the scale argument whose type is being validated here
+      error"unsupported scale type"
+   end
+
+   if self.stochastic or self.train ~= false then
+      self.output:copy(randomkit.gamma(shape:squeeze():float(),scale:squeeze():float()))
+   else
+      -- use maximum a posteriori (MAP) estimate
+      self.output:copy(shape):cmul(scale)
+   end
+
+   return self.output
+end
+
+-- Backward pass (REINFORCE); gradOutput is ignored.
+-- input : the shape tensor, or a table {shape, scale} (in which case
+--         self.gradInput is a table {gradShape, gradScale}).
+-- Returns self.gradInput.
+function ReinforceGamma:updateGradInput(input, gradOutput)
+   -- Note that gradOutput is ignored
+   -- f : Gamma probability density function
+   -- g : digamma function (derivative of the log-gamma function)
+   -- x : the sampled values (self.output)
+   -- shape : shape parameter of gamma dist
+   -- scale: scale parameter of gamma dist
+
+   local shape, scale = input, self.scale
+   local gradShape, gradScale = self.gradInput, nil
+   if torch.type(input) == 'table' then
+      shape, scale = unpack(input)
+      gradShape, gradScale = unpack(self.gradInput)
+   end
+   assert(scale)
+    
+   -- Derivative of log gamma w.r.t. shape :
+   -- d ln(f(x,shape,scale))
+   -- ---------------------- = ln(x) - g(shape) - ln(scale)
+   --         d shape
+   gradShape:resizeAs(shape)
+
+   if torch.type(scale) == 'number' then
+      scale = shape.new():resizeAs(shape):fill(scale)
+   elseif scale:dim() ~= shape:dim() then
+      -- scale has fewer dimensions than shape: use the expanded view that
+      -- updateOutput stored in self.__scale so the element-wise ops below
+      -- see tensors of matching size.
+      -- (`not a == b` parses as `(not a) == b`, hence the explicit `~=`.)
+      scale = self.__scale
+   end
+   gradShape:copy(cephes.digamma(shape:float()))
+   gradShape:mul(-1)
+
+   self._logOutput = self._logOutput or self.output.new()
+   self._logOutput:log( self.output )
+   
+   self._logScale = self._logScale or scale.new()
+   self._logScale:log( scale )
+
+   gradShape:add( self._logOutput )
+   gradShape:add(-1, self._logScale )
+
+   -- multiply by variance reduced reward
+   gradShape:cmul(self:rewardAs(shape) )
+   -- multiply by -1 ( gradient descent on shape )
+   gradShape:mul(-1)
+   
+   -- Derivative of log Gamma w.r.t. scale :
+   -- d ln(f(x,shape,scale))      x      shape
+   -- ---------------------- = ------- - -----
+   --         d scale          scale^2   scale
+   
+   if gradScale then
+      -- NOTE(review): when scale was broadcast above, gradScale takes the
+      -- size of the expanded scale rather than of the scale input; confirm
+      -- that this is what callers expect.
+      gradScale:resizeAs(scale)
+      gradScale:copy( torch.cdiv(self.output, torch.pow(scale,2)) )
+      gradScale:add(-1, torch.cdiv(shape, scale) )
+      gradScale:cmul( self:rewardAs(scale) )
+      gradScale:mul(-1)
+   end
+
+   return self.gradInput
+end
+
+-- Type conversion. The log buffers are dropped before delegating to the
+-- parent; updateGradInput re-creates them lazily from the current tensors.
+-- Returns whatever the parent's type() returns.
+function ReinforceGamma:type(type,cache)
+   self._logOutput = nil
+   self._logScale = nil
+   return parent.type(self,type,cache)
+end
diff --git a/ReinforceNormal.lua b/ReinforceNormal.lua
new file mode 100644
index 0000000..e5a9866
--- /dev/null
+++ b/ReinforceNormal.lua
@@ -0,0 +1,124 @@
+------------------------------------------------------------------------
+--[[ ReinforceNormal ]]-- 
+-- Ref A. http://incompleteideas.net/sutton/williams-92.pdf
+-- Inputs are the mean (mu) of a multivariate normal distribution. 
+-- Outputs are samples drawn from these distributions.
+-- Standard deviation is provided as a constructor argument, or as the
+-- second element of an input table {mean, stdev}.
+-- Uses the REINFORCE algorithm (ref. A sec 6. p.237-239) which is 
+-- implemented through the nn.Module:reinforce(reward) interface.
+-- gradOutputs are ignored (REINFORCE algorithm).
+------------------------------------------------------------------------
+local ReinforceNormal, parent = torch.class("nn.ReinforceNormal", "nn.Reinforce")
+
+-- Constructor.
+-- stdev      : optional standard deviation (a number or a tensor, see
+--              updateOutput). When omitted, the module expects {mean, stdev}
+--              table inputs and gradInput becomes a table of two tensors.
+-- stochastic : forwarded to the parent constructor.
+function ReinforceNormal:__init(stdev, stochastic)
+   parent.__init(self, stochastic)
+   self.stdev = stdev
+   if not stdev then
+      self.gradInput = {torch.Tensor(), torch.Tensor()}
+   end
+end
+
+-- Forward pass.
+-- input : either the mean tensor (stdev then comes from the constructor),
+--         or a table {mean, stdev}.
+-- stdev may be a number, a tensor of the same size as mean, or a tensor
+-- with one dimension less than mean, which is then expanded along a new
+-- leading dimension (the expanded view is kept in self.__stdev).
+-- When sampling is active the output is mean + stdev * N(0,1) noise,
+-- otherwise it is a copy of mean.
+function ReinforceNormal:updateOutput(input)
+   local mean, stdev = input, self.stdev
+   if torch.type(input) == 'table' then
+      -- input is {mean, stdev}
+      assert(#input == 2)
+      mean, stdev = unpack(input)
+   end
+   assert(stdev)
+   
+   self.output:resizeAs(mean)
+   
+   if self.stochastic or self.train ~= false then
+      self.output:normal()
+      -- multiply by standard deviations
+      if torch.type(stdev) == 'number' then
+         self.output:mul(stdev)
+      elseif torch.isTensor(stdev) then
+         if stdev:dim() == mean:dim() then
+            assert(stdev:isSameSizeAs(mean))
+            self.output:cmul(stdev)
+         else
+            assert(stdev:dim()+1 == mean:dim())
+            self._stdev = self._stdev or stdev.new()
+            self._stdev:view(stdev,1,table.unpack(stdev:size():totable()))
+            self.__stdev = self.__stdev or stdev.new()
+            self.__stdev:expandAs(self._stdev, mean)
+            self.output:cmul(self.__stdev)
+         end
+      else
+         -- it is the stdev argument whose type is being validated here
+         error"unsupported stdev type"
+      end
+      
+      -- shift the scaled noise so that it is centered on the mean
+      self.output:add(mean)
+   else
+      -- use maximum a posteriori (MAP) estimate
+      self.output:copy(mean)
+   end
+   return self.output
+end
+
+-- Backward pass (REINFORCE); gradOutput is ignored.
+--   f : normal probability density function
+--   x : the sampled values (self.output)
+--   u : mean (mu)
+--   s : standard deviation (sigma)
+-- input is the mean tensor, or a table {mean, stdev} (in which case
+-- self.gradInput is a table {gradMean, gradStdev}).
+-- Returns self.gradInput.
+function ReinforceNormal:updateGradInput(input, gradOutput)
+   local mean, stdev, gradMean, gradStdev
+   if torch.type(input) == 'table' then
+      mean, stdev = unpack(input)
+      gradMean, gradStdev = unpack(self.gradInput)
+   else
+      mean, stdev = input, self.stdev
+      gradMean = self.gradInput
+   end
+   assert(stdev)
+
+   -- Derivative of log normal w.r.t. mean :
+   -- d ln(f(x,u,s))   (x - u)
+   -- -------------- = -------
+   --      d u           s^2
+   gradMean:resizeAs(mean)
+   gradMean:copy(self.output):add(-1, mean)
+
+   if torch.type(stdev) == 'number' then
+      gradMean:div(stdev^2)
+   else
+      -- a stdev of lower dimensionality was expanded by updateOutput
+      local sigma = (stdev:dim() == mean:dim()) and stdev or self.__stdev
+      gradMean:cdiv(sigma):cdiv(sigma)
+   end
+   -- scale by the reward, then negate ( gradient descent on mean )
+   gradMean:cmul(self:rewardAs(mean))
+   gradMean:mul(-1)
+
+   -- Derivative of log normal w.r.t. stdev :
+   -- d ln(f(x,u,s))   (x - u)^2 - s^2
+   -- -------------- = ---------------
+   --      d s              s^3
+   if gradStdev then
+      gradStdev:resizeAs(stdev)
+      -- numerator: (x - u)^2 - s^2
+      gradStdev:copy(self.output):add(-1, mean):pow(2)
+      local buffer = self._stdev2 or stdev.new()
+      self._stdev2 = buffer
+      buffer:resizeAs(stdev):copy(stdev):cmul(stdev)
+      gradStdev:add(-1, buffer)
+      -- denominator: s^3 (plus a small constant)
+      buffer:cmul(stdev):add(0.00000001)
+      gradStdev:cdiv(buffer)
+      -- scale by the reward, then negate ( gradient descent on stdev )
+      gradStdev:cmul(self:rewardAs(stdev))
+      gradStdev:mul(-1)
+   end
+
+   return self.gradInput
+end
diff --git a/ReverseTable.lua b/ReverseTable.lua
new file mode 100644
index 0000000..69660a0
--- /dev/null
+++ b/ReverseTable.lua
@@ -0,0 +1,39 @@
+local ReverseTable, parent = torch.class("nn.ReverseTable", "nn.Module")
+
+-- Constructor. output and gradInput are plain Lua tables because this
+-- module maps a table to a table (see updateOutput / updateGradInput).
+function ReverseTable:__init()
+   parent.__init(self)
+   self.output = {}
+   self.gradInput = {}
+end
+
+-- Forward pass: fills self.output with the elements of inputTable in
+-- reverse order (the elements themselves are not copied) and returns it.
+function ReverseTable:updateOutput(inputTable)
+   assert(torch.type(inputTable) == 'table', "Expecting table at arg 1")
+
+   -- drop whatever the previous call left behind
+   local out = self.output
+   for idx in ipairs(out) do
+      out[idx] = nil
+   end
+
+   -- out[pos] takes the element found at the mirrored position
+   local count = #inputTable
+   for pos = 1, count do
+      out[pos] = inputTable[count - pos + 1]
+   end
+   return self.output
+end
+
+-- Backward pass: fills self.gradInput with the elements of gradOutputTable
+-- in reverse order (the elements themselves are not copied) and returns it.
+-- inputTable is not used.
+function ReverseTable:updateGradInput(inputTable, gradOutputTable)
+   -- drop whatever the previous call left behind
+   local grads = self.gradInput
+   for idx in ipairs(grads) do
+      grads[idx] = nil
+   end
+
+   -- grads[pos] takes the element found at the mirrored position
+   local count = #gradOutputTable
+   for pos = 1, count do
+      grads[pos] = gradOutputTable[count - pos + 1]
+   end
+   return self.gradInput
+end
diff --git a/Sequential.lua b/Sequential.lua
new file mode 100644
index 0000000..a33c47d
--- /dev/null
+++ b/Sequential.lua
@@ -0,0 +1,98 @@
+local Sequential, parent = nn.Sequential, nn.Container
+
+-- Switches nn.Sequential to profiling mode: updateOutput, updateGradInput,
+-- accGradParameters, backward and accUpdateGradParameters are replaced by
+-- versions that print the wall-clock time spent in every child module.
+-- The replacements are assigned on the Sequential class table itself.
+-- Finally the parent's profile() is called on self.
+function Sequential:profile()
+
+   -- Waits for pending GPU work (only when cutorch is loaded), then prints
+   -- the time elapsed on `timer` for `module` under the label `phase`.
+   local function report(module, phase, timer)
+      if cutorch then cutorch.synchronize() end
+      print(torch.type(module)..' '..phase..': '..timer:time().real.." s")
+   end
+
+   function Sequential:updateOutput(input)
+      local out = input
+      for i=1,#self.modules do
+         local timer = torch.Timer()
+         local module = self.modules[i]
+         out = module:updateOutput(out)
+         report(module, 'updateOutput', timer)
+      end
+      self.output = out
+      return out
+   end
+
+   function Sequential:updateGradInput(input, gradOutput)
+      local grad = gradOutput
+      local module = self.modules[#self.modules]
+      -- walk from the last module down to the second one; each module
+      -- receives the output of the module below it as its input
+      for i=#self.modules-1,1,-1 do
+         local below = self.modules[i]
+         local timer = torch.Timer()
+         grad = module:updateGradInput(below.output, grad)
+         report(module, 'updateGradInput', timer)
+         module = below
+      end
+      -- the first module receives the container's own input
+      local timer = torch.Timer()
+      grad = module:updateGradInput(input, grad)
+      report(module, 'updateGradInput', timer)
+      self.gradInput = grad
+      return grad
+   end
+
+   function Sequential:accGradParameters(input, gradOutput, scale)
+      scale = scale or 1
+
+      local grad = gradOutput
+      local module = self.modules[#self.modules]
+      for i=#self.modules-1,1,-1 do
+         local below = self.modules[i]
+         local timer = torch.Timer()
+         module:accGradParameters(below.output, grad, scale)
+         report(module, 'accGradParameters', timer)
+         grad = module.gradInput
+         module = below
+      end
+
+      local timer = torch.Timer()
+      module:accGradParameters(input, grad, scale)
+      report(module, 'accGradParameters', timer)
+   end
+
+   function Sequential:backward(input, gradOutput, scale)
+      scale = scale or 1
+      local grad = gradOutput
+      local module = self.modules[#self.modules]
+      for i=#self.modules-1,1,-1 do
+         local below = self.modules[i]
+         local timer = torch.Timer()
+         grad = module:backward(below.output, grad, scale)
+         report(module, 'backward', timer)
+         module.gradInput = grad
+         module = below
+      end
+      local timer = torch.Timer()
+      grad = module:backward(input, grad, scale)
+      report(module, 'backward', timer)
+      self.gradInput = grad
+      return grad
+   end
+
+   function Sequential:accUpdateGradParameters(input, gradOutput, lr)
+      local grad = gradOutput
+      local module = self.modules[#self.modules]
+      for i=#self.modules-1,1,-1 do
+         local below = self.modules[i]
+         local timer = torch.Timer()
+         module:accUpdateGradParameters(below.output, grad, lr)
+         report(module, 'accUpdateGradParameters', timer)
+         grad = module.gradInput
+         module = below
+      end
+
+      local timer = torch.Timer()
+      module:accUpdateGradParameters(input, grad, lr)
+      report(module, 'accUpdateGradParameters', timer)
+   end
+
+   parent.profile(self)
+end
diff --git a/Serial.lua b/Serial.lua
new file mode 100644
index 0000000..b597de9
--- /dev/null
+++ b/Serial.lua
@@ -0,0 +1,52 @@
+------------------------------------------------------------------------
+--[[ Serial ]]--
+-- Decorator that modifies the serialization/deserialization 
+-- behaviour of encapsulated module.
+------------------------------------------------------------------------
+local _ = require 'moses'
+local Serial, parent = torch.class("nn.Serial", "nn.Decorator")
+
+-- Constructor.
+-- module     : the module to decorate (handed to the parent constructor)
+-- tensortype : optional string matching 'torch.*Tensor'; when given, write()
+--              casts the serialized state to this type.
+function Serial:__init(module, tensortype)
+   parent.__init(self, module)
+   self.tensortype = tensortype
+   if not tensortype then
+      return
+   end
+   assert(tensortype:find('torch.*Tensor'), "Expecting tensortype (e.g. torch.LongTensor) at arg1")
+end
+
+-- Custom serialization hook.
+-- Takes the table returned by self:getSerialState(), re-attaches torch
+-- metatables to every (nested) table that carries a dpnn_typename field,
+-- optionally casts the result to self.tensortype, and writes a
+-- metatable-free copy of the top-level table to `file`.
+function Serial:write(file)
+   local state = self:getSerialState()
+
+   -- depth-first: nested tables are handled before the table containing them
+   local function restoreTypes(tbl)
+      for _, value in pairs(tbl) do
+         if torch.type(value) == 'table' then
+            restoreTypes(value)
+         end
+      end
+
+      local typename = tbl.dpnn_typename
+      if typename then
+         torch.setmetatable(tbl, typename)
+      end
+   end
+   restoreTypes(state)
+
+   -- typecast before serialization (useful for cuda)
+   if self.tensortype then
+      state:type(self.tensortype)
+   end
+
+   -- removes self's metatable
+   state = _.map(state, function(k,v) return v end)
+
+   file:writeObject(state)
+end
+
+-- Custom deserialization hook: reads one object from `file` and copies
+-- each of its key/value pairs onto self.
+function Serial:read(file)
+   local state = file:readObject()
+   for k,v in pairs(state) do
+      self[k] = v
+   end
+end
+
+
diff --git a/SimpleColorTransform.lua b/SimpleColorTransform.lua
new file mode 100644
index 0000000..97b83ea
--- /dev/null
+++ b/SimpleColorTransform.lua
@@ -0,0 +1,90 @@
+--[[
+   Simple Color transformation module: This module implements a simple data
+   augmentation technique of changing the pixel values of the input image by
+   adding small, randomly sampled quantities to each channel.
+   Works only with 3D (single image) or 4D (batch of images) inputs.
+--]]
+
+local SimpleColorTransform, Parent = torch.class('nn.SimpleColorTransform', 'nn.Module')
+
+-- Constructor.
+-- inputChannels : number of channels of the expected input
+-- range         : tensor with one value per channel; updateOutput samples
+--                 the noise of channel i uniformly from [-range[i], range[i]]
+-- Fails when the number of values in range differs from inputChannels.
+function SimpleColorTransform:__init(inputChannels, range)
+   Parent.__init(self)
+
+   self.train = true
+   self.inputChannels = inputChannels
+   assert(inputChannels == range:nElement(),
+          "Number of input channels and number of range values don't match.")
+   self.range = range
+end
+
+-- Forward pass.
+-- The output is a copy of the input. While self.train is set, one noise
+-- value per channel (and per sample for 4D inputs) is drawn uniformly from
+-- [-range[i], range[i]] and added to every pixel of that channel.
+-- Accepts 4D (batch x channels x height x width) and 3D
+-- (channels x height x width) inputs; anything else raises an error
+-- while training.
+function SimpleColorTransform:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if not self.train then
+      return self.output
+   end
+
+   self.noise = self.noise or self.output.new()
+   self._tempNoise = self._tempNoise or self.output.new()
+   self._tempNoiseExpanded = self._tempNoiseExpanded or self.output.new()
+   self._tempNoiseSamples = self._tempNoiseSamples or self.output.new()
+
+   local nDim = self.output:nDimension()
+   if nDim == 4 then
+      local batchSize, channels = self.output:size(1), self.output:size(2)
+      local height, width = self.output:size(3), self.output:size(4)
+      assert(channels == self.inputChannels)
+
+      -- one noise value per (sample, channel)
+      self.noise:resize(batchSize, channels)
+      for c = 1, channels do
+         self.noise[{{}, {c}}]:uniform(-self.range[c], self.range[c])
+      end
+      self._tempNoise = self.noise:view(batchSize, self.inputChannels, 1, 1)
+      self._tempNoiseExpanded:expand(self._tempNoise, batchSize,
+                                     channels, height, width)
+   elseif nDim == 3 then
+      local channels = self.output:size(1)
+      local height, width = self.output:size(2), self.output:size(3)
+      assert(channels == self.inputChannels)
+
+      -- one noise value per channel
+      self.noise:resize(channels)
+      for c = 1, channels do
+         self.noise[c] = torch.uniform(-self.range[c], self.range[c])
+      end
+      self._tempNoise = self.noise:view(self.inputChannels, 1, 1)
+      self._tempNoiseExpanded:expand(self._tempNoise, channels,
+                                     height, width)
+   else
+      error("Invalid input dimensionality.")
+   end
+
+   -- materialise the expanded view, then add it to the output
+   self._tempNoiseSamples:resizeAs(self._tempNoiseExpanded)
+                         :copy(self._tempNoiseExpanded)
+   self.output:add(self._tempNoiseSamples)
+   return self.output
+end
+
+-- Backward pass: the added noise does not depend on the input, so the
+-- gradient is a copy of gradOutput. Raises an error when self.train is
+-- not set.
+function SimpleColorTransform:updateGradInput(input, gradOutput)
+   if not self.train then
+      error('backprop only defined while training')
+   end
+   self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+   return self.gradInput
+end
+
+-- Type conversion. The noise buffers are dropped first; updateOutput
+-- re-creates them lazily with the type of the output tensor.
+-- Returns the result of the parent's type() so that calls such as
+-- module:type(...) can be chained or assigned.
+function SimpleColorTransform:type(type, tensorCache)
+   self.noise = nil
+   self._tempNoise = nil
+   self._tempNoiseExpanded = nil
+   self._tempNoiseSamples = nil
+   return Parent.type(self, type, tensorCache)
+end
+
+-- Human-readable name of the module.
+function SimpleColorTransform:__tostring__()
+   return 'SimpleColorTransform'
+end
diff --git a/SpatialBatchNormalization.lua b/SpatialBatchNormalization.lua
new file mode 100644
index 0000000..1b2fdf8
--- /dev/null
+++ b/SpatialBatchNormalization.lua
@@ -0,0 +1,12 @@
+local BN, parent = nn.SpatialBatchNormalization, nn.Module
+local _ = require 'moses'
+
+-- Extend a copy of nn.Module's dpnn_mediumEmpty list with the names of the
+-- fields specific to batch normalization, and install it on the class.
+local empty = _.clone(parent.dpnn_mediumEmpty)
+local extraFields = {
+   'buffer', 'buffer2', 'centered', 'std', 'normalized', 'output', 'gradInput'
+}
+for i = 1, #extraFields do
+   table.insert(empty, extraFields[i])
+end
+BN.dpnn_mediumEmpty = empty
diff --git a/SpatialBinaryConvolution.lua b/SpatialBinaryConvolution.lua
new file mode 100644
index 0000000..6365f8e
--- /dev/null
+++ b/SpatialBinaryConvolution.lua
@@ -0,0 +1,173 @@
+-- Reference: http://arxiv.org/abs/1603.05279
+-- We use floating point Matrix-Matrix multiplication as in SpatialConvolution.
+-- Filters are made binary {-1, +1} using Sign.
+-- Convolution output is scaled by L1-norm of the filters.
+
+-- Inheriting nn/SpatialConvolution.
+
+local SpatialBinaryConvolution, parent = torch.class('nn.SpatialBinaryConvolution', 'nn.SpatialConvolution')
+
+-- Constructor; takes the same arguments as the parent convolution and
+-- forwards them unchanged. The bias is disabled through the parent's noBias.
+function SpatialBinaryConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+   parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+   parent.noBias(self)
+
+   self.iwh = self.nInputPlane * self.kW * self.kH -- divisor used for self.alphas
+   self.owh = self.nOutputPlane * self.kW * self.kH -- divisor used for self.gradInputAlphas
+   self.train = true
+end
+
+-- Switches the module to training mode (self.train = true).
+function SpatialBinaryConvolution:training()
+   self.train = true
+end
+
+-- Switches the module to evaluation mode (self.train = false); in this mode
+-- updateOutput restores the real-valued weights right after the convolution.
+function SpatialBinaryConvolution:evaluate()
+   self.train = false
+end
+
+-- Function to binarize weights and compute L1 norms.
+-- Side effects (all on self):
+--   gradInputAlphas : L1 norm of the weights of each input plane, divided by owh
+--   alphas          : L1 norm of the weights of each output plane, divided by iwh
+--   tempWeight      : ends up holding a copy of the real-valued weights
+--   weight          : overwritten in place with +1 (entries >= 0) and -1 (entries < 0)
+function SpatialBinaryConvolution:binarizeWeight()
+   self.tempWeight = self.tempWeight or self.weight.new()
+
+   -- Grad Input alphas
+   self.gradInputAlphas = self.gradInputAlphas or self.weight.new()
+   self.gradInputAlphas:resize(self.nInputPlane)
+
+   -- tempWeight temporarily holds a copy of the weights with dimensions 1
+   -- and 2 swapped, so that each input plane can be viewed as one row
+   local temp = self.weight:transpose(1,2)
+   self.tempWeight:resizeAs(temp):copy(temp)
+   self.gradInputAlphas:norm(self.tempWeight:view(self.nInputPlane, -1), 1, 2)
+   self.gradInputAlphas:div(self.owh) -- 1/owh
+
+   -- alphas
+   -- from here on tempWeight is the backup of the real-valued weights
+   self.tempWeight:resizeAs(self.weight):copy(self.weight)
+   self.alphas = self.alphas or self.weight.new()
+   self.alphas:resize(self.nOutputPlane)
+   self.alphas:norm(self.weight:view(self.nOutputPlane, -1), 1, 2)
+   self.alphas:div(self.iwh) -- 1/iwh
+
+   -- Mask buffer for the comparisons below; its type depends on the weight type
+   if not self.wmask then
+      if torch.type(self.weight) == 'torch.CudaTensor' then
+         self.wmask = torch.CudaTensor()
+      else
+         self.wmask = torch.ByteTensor()
+      end
+   end
+
+   -- Binarizing weights: entries >= 0 become 1 first, then entries < 0 become -1
+   self.weight.ge(self.wmask, self.weight, 0)
+   self.weight[self.wmask] = 1
+   self.weight.lt(self.wmask, self.weight, 0)
+   self.weight[self.wmask] = -1
+end
+
+-- Forward pass: binarizes the weights, runs the parent convolution with
+-- them, and scales every output plane by its alpha (see binarizeWeight).
+-- When self.train is false the real-valued weights are restored before
+-- returning.
+function SpatialBinaryConvolution:updateOutput(input)
+   -- Binarize Weights
+   self:binarizeWeight()
+
+   -- Convolution
+   self.output = parent.updateOutput(self, input)
+
+   -- Buffers used to scale the output by alphas
+   self._tempAlphas = self._tempAlphas or self.output.new()
+   self._tempAlphasExpanded = self._tempAlphasExpanded or self.output.new()
+   self._tempAlphasSamples = self._tempAlphasSamples or self.output.new()
+
+   local out, nPlane = self.output, self.nOutputPlane
+   if out:nDimension() == 4 then
+      -- batch x plane x height x width
+      local batchSize, height, width = out:size(1), out:size(3), out:size(4)
+      self._tempAlphas = self.alphas:view(1, nPlane, 1, 1)
+      self._tempAlphasExpanded:expand(self._tempAlphas, batchSize,
+                                      nPlane, height, width)
+   else
+      -- plane x height x width
+      local height, width = out:size(2), out:size(3)
+      self._tempAlphas = self.alphas:view(nPlane, 1, 1)
+      self._tempAlphasExpanded:expand(self._tempAlphas, nPlane,
+                                      height, width)
+   end
+   -- materialise the expanded view, then scale the output element-wise
+   self._tempAlphasSamples:resizeAs(self._tempAlphasExpanded)
+                          :copy(self._tempAlphasExpanded)
+   out:cmul(self._tempAlphasSamples)
+
+   -- In evaluate mode.
+   if not self.train then self.weight:copy(self.tempWeight) end
+
+   return self.output
+end
+
+-- Backward pass w.r.t. the input: runs the parent's updateGradInput and
+-- scales every input plane of the result by its gradInputAlpha (computed by
+-- binarizeWeight). Returns self.gradInput.
+function SpatialBinaryConvolution:updateGradInput(input, gradOutput)
+   self.gradInput = parent.updateGradInput(self, input, gradOutput)
+
+   -- Scale gradInput by gradAlphas
+   -- Each buffer is cached under its own field so that it is reused between
+   -- calls (and reset by type()), instead of being re-allocated every time.
+   self._tempGradAlphas = self._tempGradAlphas or self.gradInput.new()
+   self._tempGradAlphasExpanded = self._tempGradAlphasExpanded or self.gradInput.new()
+   self._tempGradAlphasSamples = self._tempGradAlphasSamples or self.gradInput.new()
+   if self.gradInput:nDimension() == 4 then
+      local batchSize = self.gradInput:size(1)
+      local height = self.gradInput:size(3)
+      local width = self.gradInput:size(4)
+
+      self._tempGradAlphas = self.gradInputAlphas:view(1, self.nInputPlane,
+                                                       1, 1)
+      self._tempGradAlphasExpanded:expand(self._tempGradAlphas,
+                                          batchSize, self.nInputPlane,
+                                          height, width)
+      self._tempGradAlphasSamples:resizeAs(self._tempGradAlphasExpanded)
+                                 :copy(self._tempGradAlphasExpanded)
+
+      self.gradInput:cmul(self._tempGradAlphasSamples)
+   else
+      local height = self.gradInput:size(2)
+      local width = self.gradInput:size(3)
+
+      self._tempGradAlphas = self.gradInputAlphas:view(self.nInputPlane,
+                                                       1, 1)
+      self._tempGradAlphasExpanded:expand(self._tempGradAlphas,
+                                          self.nInputPlane,
+                                          height, width)
+      self._tempGradAlphasSamples:resizeAs(self._tempGradAlphasExpanded)
+                                 :copy(self._tempGradAlphasExpanded)
+
+      self.gradInput:cmul(self._tempGradAlphasSamples)
+   end
+   return self.gradInput
+end
+
+-- Accumulates the parameter gradients through the parent implementation,
+-- then restores the real-valued weights saved in self.tempWeight.
+function SpatialBinaryConvolution:accGradParameters(input, gradOutput, scale)
+
+   parent.accGradParameters(self, input, gradOutput, scale)
+
+   --[[
+   Copy the floating point weights back so that the weight update is applied
+   to them. This could be done separately after forward and after backward,
+   but to avoid an additional copy it is done once, at the end of backward.
+   --]]
+
+   self.weight:copy(self.tempWeight)
+end
+
+-- Type conversion. All lazily created buffers are dropped first; they are
+-- re-created with the new tensor type the next time they are needed.
+-- Returns the result of the parent's type() so that calls such as
+-- module:type(...) can be chained or assigned.
+function SpatialBinaryConvolution:type(type, tensorCache)
+   self.tempWeight = nil
+   self.alphas = nil
+   self.gradInputAlphas = nil
+   self.wmask = nil
+
+   self._tempAlphas = nil
+   self._tempAlphasExpanded = nil
+   self._tempAlphasSamples = nil
+
+   self._tempGradAlphas = nil
+   self._tempGradAlphasExpanded = nil
+   self._tempGradAlphasSamples = nil
+
+   return parent.type(self, type, tensorCache)
+end
+
+-- Human-readable description: the parent's description with a
+-- "Binary Convolution: " prefix.
+function SpatialBinaryConvolution:__tostring__()
+   return "Binary Convolution: "..parent.__tostring__(self)
+end
diff --git a/SpatialBinaryLogisticRegression.lua b/SpatialBinaryLogisticRegression.lua
new file mode 100644
index 0000000..85fba99
--- /dev/null
+++ b/SpatialBinaryLogisticRegression.lua
@@ -0,0 +1,80 @@
+------------------------------------------------------------------------
+--[[ SpatialBinaryLogisticRegression ]]--
+-- Takes an image of size batchSize x nChannel x height x width as input.
+-- Computes Binary Logistic Regression Cost.
+-- Useful for 2 class pixel classification.
+------------------------------------------------------------------------
+
+local SpatialBinaryLogisticRegression, parent = torch.class('nn.SpatialBinaryLogisticRegression', 'nn.Criterion')
+
+-- Constructor. Takes no arguments; self.sizeAverage defaults to true, which
+-- makes the cost and its gradient additionally divided by the batch size.
+function SpatialBinaryLogisticRegression:__init()
+   parent.__init(self)
+   self.sizeAverage = true
+end
+
+-- Computes the binary logistic regression cost
+--    sum(log(1 + exp(-target .* input))) / normalizer
+-- input, target : tensors of identical size, either 1 x h x w (single image)
+--    or k x 1 x h x w (batch); anything else fails an assertion.
+-- normalizer : 2*k*h*w when self.sizeAverage is set, 2*h*w otherwise.
+-- Side effects: caches exp(-target .* input) in self._baseExponents and
+-- 1 + exp(-target .* input) in self._coeff (both reused by updateGradInput)
+-- and stores the cost in self.output.
+-- Returns the cost as a number.
+-- NOTE(review): targets are presumably encoded as -1/+1 - confirm with callers.
+function SpatialBinaryLogisticRegression:updateOutput(input, target)
+   local inputDim = input:nDimension()
+   local targetDim = target:nDimension()
+
+   -- Check dimensions of input and target
+   assert(inputDim == targetDim, "nDimension of input and target don't match.")
+   assert(inputDim == 4 or inputDim == 3, "Expecting image or batch on images")
+
+   for i=1,inputDim do
+      assert(input:size(i) == target:size(i),
+                                  "Input and target dimensions don't match.")
+   end
+
+   -- Check batch or single image
+   if inputDim == 4 then
+      self._isBatch = true
+      assert(input:size(2) == 1, "No. of channels should be 1.")
+      self._k = input:size(1)
+      self._h = input:size(3)
+      self._w = input:size(4)
+   else
+      self._isBatch = false
+      assert(input:size(1) == 1, "No. of channels should be 1.")
+      self._k = 1
+      self._h = input:size(2)
+      self._w = input:size(3)
+   end
+
+   self._baseExponents = self._baseExponents or input.new()
+   self._coeff = self._coeff or input.new()
+   self._logCoeff = self._logCoeff or input.new()
+
+   -- Compute exponent = -target*input
+   self._baseExponents:resize(input:size()):copy(input)
+   self._baseExponents:cmul(target)
+   self._baseExponents:mul(-1)
+   -- Compute exp(exponent)
+   self._baseExponents:exp()
+
+   -- coeff = 1 + exp(exponent)
+   self._coeff:resize(input:size()):copy(self._baseExponents)
+   self._coeff:add(1)
+
+   self._logCoeff:resize(input:size()):copy(self._coeff)
+   self._logCoeff:log()
+
+   local normalizer = 2 * self._h * self._w
+   if self.sizeAverage then
+      normalizer = normalizer * self._k
+   end
+
+   -- keep the criterion's output field in sync with the returned value so
+   -- that code reading criterion.output after forward() sees the actual cost
+   self.output = self._logCoeff:sum()/normalizer
+   return self.output
+end
+
+-- Gradient of the cost w.r.t. input:
+--    -target .* exp(-target .* input) ./ (1 + exp(-target .* input))
+-- divided by the same normalizer as the cost. Reads the buffers
+-- self._baseExponents / self._coeff and the sizes self._k, self._h, self._w
+-- written by updateOutput, so updateOutput must have been called first.
+function SpatialBinaryLogisticRegression:updateGradInput(input, target)
+   self.gradInput = self.gradInput or input.new()
+   local grad = self.gradInput
+
+   grad:resize(target:size()):copy(target):mul(-1)
+   grad:cmul(self._baseExponents):cdiv(self._coeff)
+
+   local normalizer
+   if self.sizeAverage then
+      normalizer = 2 * self._k * self._h * self._w
+   else
+      normalizer = 2 * self._h * self._w
+   end
+   grad:div(normalizer)
+
+   return grad
+end
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
new file mode 100644
index 0000000..a3144eb
--- /dev/null
+++ b/SpatialConvolution.lua
@@ -0,0 +1,9 @@
+-- Gives nn.SpatialConvolution its own dpnn_mediumEmpty list: a copy of the
+-- nn.Module list extended with the convolution's buffer field names.
+-- NOTE(review): the list is presumably consumed by the serialization helpers
+-- in Module.lua - confirm there.
+local SpatialConvolution, parent = nn.SpatialConvolution, nn.Module
+local _ = require 'moses'
+
+local empty = _.clone(parent.dpnn_mediumEmpty)
+local extraFields = {'finput', 'fgradinput', '_input', '_gradOutput'}
+for idx = 1, #extraFields do
+   table.insert(empty, extraFields[idx])
+end
+SpatialConvolution.dpnn_mediumEmpty = empty
diff --git a/SpatialConvolutionMM.lua b/SpatialConvolutionMM.lua
new file mode 100644
index 0000000..4b50658
--- /dev/null
+++ b/SpatialConvolutionMM.lua
@@ -0,0 +1,3 @@
+-- nn.SpatialConvolutionMM shares the dpnn_mediumEmpty list of
+-- nn.SpatialConvolution. This relies on SpatialConvolution.lua having been
+-- loaded first (init.lua requires it right before this file).
+local SpatialConvolutionMM = nn.SpatialConvolutionMM
+
+SpatialConvolutionMM.dpnn_mediumEmpty = nn.SpatialConvolution.dpnn_mediumEmpty
diff --git a/SpatialFeatNormalization.lua b/SpatialFeatNormalization.lua
new file mode 100644
index 0000000..1aca767
--- /dev/null
+++ b/SpatialFeatNormalization.lua
@@ -0,0 +1,73 @@
+--[[
+   Color normalization (mean zeroing and dividing by standard deviation).
+   Basic preprocessing step widely used in training classifier with images.
+--]]
+
+local SpatialFeatNormalization, Parent = torch.class('nn.SpatialFeatNormalization', 'nn.Module')
+
+-- mean : 1D tensor with one mean per feature (channel); copied.
+-- std  : optional tensor of per-feature standard deviations; copied into a
+--        buffer sized like mean. Defaults to all ones.
+-- Raises an error when mean is not 1D.
+function SpatialFeatNormalization:__init(mean, std)
+   Parent.__init(self)
+   if mean:dim() ~= 1 then
+      error('<SpatialFeatNormalization> Mean/Std should be 1D.')
+   end
+   self.mean = torch.Tensor():resizeAs(mean):copy(mean)
+   self.std = torch.Tensor():resizeAs(mean)
+   if std == nil then
+      self.std:fill(1)
+   else
+      self.std:copy(std)
+   end
+   self.noOfFeats = mean:size(1)
+end
+
+-- output = (input - mean[f]) / std[f] for every feature plane f.
+-- input is 4D (features in dimension 2) or 3D (features in dimension 1);
+-- any other dimensionality, or a feature count different from
+-- self.noOfFeats, raises an error.
+function SpatialFeatNormalization:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+
+   local nDim = input:dim()
+   local featDim
+   if nDim == 4 then
+      featDim = 2 -- batch of images
+   elseif nDim == 3 then
+      featDim = 1 -- single image
+   else
+      error('<SpatialFeatNormalization> invalid input dims.')
+   end
+
+   if input:size(featDim) ~= self.noOfFeats then
+      error('<SpatialFeatNormalization> No. of Feats dont match.')
+   end
+
+   for feat = 1, self.noOfFeats do
+      -- select returns a view, so the in-place ops modify self.output
+      local plane = self.output:select(featDim, feat)
+      plane:add(-self.mean[feat])
+      plane:div(self.std[feat])
+   end
+   return self.output
+end
+
+-- gradInput = gradOutput / std[f] for every feature plane f.
+-- The layout is taken from gradOutput (copied into self.gradInput): 4D with
+-- features in dimension 2, or 3D with features in dimension 1. Any other
+-- dimensionality or feature count raises an error.
+function SpatialFeatNormalization:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+
+   local nDim = self.gradInput:dim()
+   local featDim
+   if nDim == 4 then
+      featDim = 2 -- batch of images
+   elseif nDim == 3 then
+      featDim = 1 -- single image
+   else
+      error('<SpatialFeatNormalization> invalid self.gradInput dims.')
+   end
+
+   if self.gradInput:size(featDim) ~= self.noOfFeats then
+      error('<SpatialFeatNormalization> No. of Feats dont match.')
+   end
+
+   for feat = 1, self.noOfFeats do
+      self.gradInput:select(featDim, feat):div(self.std[feat])
+   end
+   return self.gradInput
+end
diff --git a/SpatialGlimpse.lua b/SpatialGlimpse.lua
new file mode 100644
index 0000000..4b6782b
--- /dev/null
+++ b/SpatialGlimpse.lua
@@ -0,0 +1,184 @@
+------------------------------------------------------------------------
+--[[ SpatialGlimpse ]]--
+-- Ref A.: http://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf
+-- a glimpse is the concatenation of down-scaled cropped images of
+-- increasing scale around a given location in a given image.
+-- input is a pair of Tensors: {image, location}
+-- locations are x,y coordinates of the center of cropped patches.
+-- Coordinates are between -1,-1 (top-left) and 1,1 (bottom right)
+-- output is a batch of glimpses taken in image at location (x,y)
+-- glimpse size is {height, width}, or width only if square-shaped
+-- depth is number of patches to crop per glimpse (one patch per scale)
+-- Each successive patch is scale x size of the previous patch
+------------------------------------------------------------------------
+local SpatialGlimpse, parent = torch.class("nn.SpatialGlimpse", "nn.Module")
+
+-- size  : glimpse size, either a number (square) or a table {height, width}
+-- depth : number of patches per glimpse (default 3)
+-- scale : size ratio between successive patches (default 2)
+-- The down-scaling module is average pooling when scale == 2 and
+-- nn.SpatialReSampling (from nnx) otherwise.
+function SpatialGlimpse:__init(size, depth, scale)
+   nn.require('nnx')
+   if torch.type(size) ~= 'table' then
+      self.width, self.height = size, size
+   else
+      self.height, self.width = size[1], size[2]
+   end
+   self.depth = depth or 3
+   self.scale = scale or 2
+
+   -- every geometry setting has to be a plain number
+   for _, field in ipairs{'width', 'height', 'depth', 'scale'} do
+      assert(torch.type(self[field]) == 'number')
+   end
+
+   parent.__init(self)
+   self.gradInput = {torch.Tensor(), torch.Tensor()}
+   if self.scale ~= 2 then
+      self.module = nn.SpatialReSampling{oheight=self.height,owidth=self.width}
+   else
+      self.module = nn.SpatialAveragePooling(2,2,2,2)
+   end
+   self.modules = {self.module}
+end
+
+-- Forward pass of a bandwidth limited sensor which focuses on a location.
+-- inputTable : {image, location}; after self:toBatch the image must be 4D and
+--    the location 2D (asserted below). The first location component is used
+--    as y and the second as x, both expected in [-1, 1].
+-- For every sample, self.depth patches are cropped around the location from a
+-- zero-padded copy of the image. The first patch is self.height x self.width;
+-- each following patch is self.scale times larger and is reduced back to
+-- self.height x self.width by self.module.
+-- Returns self.output, resized to (batch, depth * channels, height, width)
+-- before self:fromBatch is applied.
+-- NOTE(review): toBatch/fromBatch are defined outside this file; presumably
+-- they add/remove a leading batch dimension for non-batched input - confirm.
+function SpatialGlimpse:updateOutput(inputTable)
+   nn.require('nnx')
+   assert(torch.type(inputTable) == 'table')
+   assert(#inputTable >= 2)
+   local input, location = unpack(inputTable)
+   input, location = self:toBatch(input, 3), self:toBatch(location, 1)
+   assert(input:dim() == 4 and location:dim() == 2)
+
+   -- 5D while filling so that each depth level can be indexed separately
+   self.output:resize(input:size(1), self.depth, input:size(2), self.height, self.width)
+
+   -- scratch buffers, created once and reused across calls
+   self._crop = self._crop or self.output.new()
+   self._pad = self._pad or input.new()
+
+   for sampleIdx=1,self.output:size(1) do
+      local outputSample = self.output[sampleIdx]
+      local inputSample = input[sampleIdx]
+      local yx = location[sampleIdx]
+      -- (-1,-1) top left corner, (1,1) bottom right corner of image
+      local y, x = yx:select(1,1), yx:select(1,2)
+      -- (0,0), (1,1)
+      y, x = (y+1)/2, (x+1)/2
+
+      -- for each depth of glimpse : pad, crop, downscale
+      local glimpseWidth = math.floor(self.width)
+      local glimpseHeight = math.floor(self.height)
+      for depth=1,self.depth do
+         local dst = outputSample[depth]
+         if depth > 1 then
+            -- patch size grows by self.scale at every depth level
+            glimpseWidth = math.floor(glimpseWidth*self.scale)
+            glimpseHeight = math.floor(glimpseHeight*self.scale)
+         end
+
+         -- add zero padding (glimpse could be partially out of bounds)
+         local padWidth = math.floor((glimpseWidth-1)/2)
+         local padHeight = math.floor((glimpseHeight-1)/2)
+         self._pad:resize(input:size(2), input:size(3)+padHeight*2, input:size(4)+padWidth*2):zero()
+         local center = self._pad:narrow(2,padHeight+1,input:size(3)):narrow(3,padWidth+1,input:size(4))
+         center:copy(inputSample)
+
+         -- crop it
+         -- h, w : largest valid 0-based crop offsets inside the padded image
+         local h, w = self._pad:size(2)-glimpseHeight, self._pad:size(3)-glimpseWidth
+         -- these locals shadow the normalized y, x for the rest of this iteration only
+         local y, x = math.floor(math.min(h,math.max(0,y*h))), math.floor(math.min(w,math.max(0,x*w)))
+
+         if depth == 1 then
+            -- the first patch already has the output size: copy it as is
+            dst:copy(self._pad:narrow(2,y+1,glimpseHeight):narrow(3,x+1,glimpseWidth))
+         else
+            self._crop:resize(input:size(2), glimpseHeight, glimpseWidth)
+            self._crop:copy(self._pad:narrow(2,y+1,glimpseHeight):narrow(3,x+1,glimpseWidth))
+
+            if torch.type(self.module) == 'nn.SpatialAveragePooling' then
+               -- reconfigure the pooling kernel and stride to the ratio between
+               -- the current patch size and the output size (must be even)
+               local poolWidth = glimpseWidth/self.width
+               assert(poolWidth % 2 == 0)
+               local poolHeight = glimpseHeight/self.height
+               assert(poolHeight % 2 == 0)
+               self.module.kW = poolWidth
+               self.module.kH = poolHeight
+               self.module.dW = poolWidth
+               self.module.dH = poolHeight
+            end
+            dst:copy(self.module:updateOutput(self._crop))
+         end
+      end
+   end
+
+   -- merge the depth and channel dimensions
+   self.output:resize(input:size(1), self.depth*input:size(2), self.height, self.width)
+   self.output = self:fromBatch(self.output, 1)
+   return self.output
+end
+
+-- Backward pass. Scatters the gradient of every patch back to the image
+-- region it was cropped from and accumulates the contributions of all depths.
+-- inputTable : {image, location}, the same table given to updateOutput
+-- gradOutput : gradient w.r.t. the glimpse; viewed below as
+--    (batch, depth, channels, height, width)
+-- Returns self.gradInput = {gradImage, gradLocation}; gradLocation is all
+-- zeros because there is no backprop through the location.
+-- Uses the self._pad and self._crop buffers created by updateOutput, so a
+-- forward pass must have been run first.
+function SpatialGlimpse:updateGradInput(inputTable, gradOutput)
+   local input, location = unpack(inputTable)
+   -- rebuild the two gradient tensors if the table does not hold exactly two entries
+   if #self.gradInput ~= 2 then
+      self.gradInput = {input.new(), input.new()}
+   end
+   local gradInput, gradLocation = unpack(self.gradInput)
+   input, location = self:toBatch(input, 3), self:toBatch(location, 1)
+   gradOutput = self:toBatch(gradOutput, 3)
+
+   gradInput:resizeAs(input):zero()
+   gradLocation:resizeAs(location):zero() -- no backprop through location
+
+   gradOutput = gradOutput:view(input:size(1), self.depth, input:size(2), self.height, self.width)
+
+   for sampleIdx=1,gradOutput:size(1) do
+      local gradOutputSample = gradOutput[sampleIdx]
+      local gradInputSample = gradInput[sampleIdx]
+      local yx = location[sampleIdx] -- height, width
+      -- (-1,-1) top left corner, (1,1) bottom right corner of image
+      local y, x = yx:select(1,1), yx:select(1,2)
+      -- (0,0), (1,1)
+      y, x = (y+1)/2, (x+1)/2
+
+      -- for each depth of glimpse : pad, crop, downscale
+      local glimpseWidth = math.floor(self.width)
+      local glimpseHeight = math.floor(self.height)
+      for depth=1,self.depth do
+         local src = gradOutputSample[depth]
+         if depth > 1 then
+            glimpseWidth = math.floor(glimpseWidth*self.scale)
+            glimpseHeight = math.floor(glimpseHeight*self.scale)
+         end
+
+         -- add zero padding (glimpse could be partially out of bounds)
+         local padWidth = math.floor((glimpseWidth-1)/2)
+         local padHeight = math.floor((glimpseHeight-1)/2)
+         self._pad:resize(input:size(2), input:size(3)+padHeight*2, input:size(4)+padWidth*2):zero()
+
+         -- same crop window computation as in updateOutput
+         local h, w = self._pad:size(2)-glimpseHeight, self._pad:size(3)-glimpseWidth
+         local y, x = math.floor(math.min(h,math.max(0,y*h))), math.floor(math.min(w,math.max(0,x*w)))
+         local pad = self._pad:narrow(2, y+1, glimpseHeight):narrow(3, x+1, glimpseWidth)
+
+         -- upscale glimpse for different depths
+         if depth == 1 then
+            pad:copy(src)
+         else
+            -- NOTE(review): only the size of self._crop is set here, its values
+            -- are whatever the last forward pass left; presumably fine because
+            -- the module's input gradient does not depend on them - confirm.
+            self._crop:resize(input:size(2), glimpseHeight, glimpseWidth)
+
+            if torch.type(self.module) == 'nn.SpatialAveragePooling' then
+               -- restore the pooling geometry used for this depth in forward
+               local poolWidth = glimpseWidth/self.width
+               assert(poolWidth % 2 == 0)
+               local poolHeight = glimpseHeight/self.height
+               assert(poolHeight % 2 == 0)
+               self.module.kW = poolWidth
+               self.module.kH = poolHeight
+               self.module.dW = poolWidth
+               self.module.dH = poolHeight
+            end
+
+            pad:copy(self.module:updateGradInput(self._crop, src))
+         end
+
+         -- copy into gradInput tensor (excluding padding)
+         gradInputSample:add(self._pad:narrow(2, padHeight+1, input:size(3)):narrow(3, padWidth+1, input:size(4)))
+      end
+   end
+
+   self.gradInput[1] = self:fromBatch(gradInput, 1)
+   self.gradInput[2] = self:fromBatch(gradLocation, 1)
+
+   return self.gradInput
+end
diff --git a/SpatialMaxPooling.lua b/SpatialMaxPooling.lua
new file mode 100644
index 0000000..1d6669c
--- /dev/null
+++ b/SpatialMaxPooling.lua
@@ -0,0 +1,6 @@
+-- Gives nn.SpatialMaxPooling its own dpnn_mediumEmpty list: a copy of the
+-- nn.Module list with the 'indices' field name appended.
+local SpatialMaxPooling, parent = nn.SpatialMaxPooling, nn.Module
+local _ = require 'moses'
+
+local mediumEmpty = _.clone(parent.dpnn_mediumEmpty)
+table.insert(mediumEmpty, #mediumEmpty + 1, 'indices')
+SpatialMaxPooling.dpnn_mediumEmpty = mediumEmpty
diff --git a/SpatialRegionDropout.lua b/SpatialRegionDropout.lua
new file mode 100644
index 0000000..78c4a39
--- /dev/null
+++ b/SpatialRegionDropout.lua
@@ -0,0 +1,80 @@
+--[[
+   Dropout edges rows or columns to simulate imperfect bounding boxes. 
+--]]
+
+local SpatialRegionDropout, Parent = torch.class('nn.SpatialRegionDropout', 'nn.Module')
+
+-- p : ratio of rows or columns to drop, 0 <= p < 1 (default 0.2).
+-- The module starts in training mode with an empty noise buffer.
+function SpatialRegionDropout:__init(p)
+   Parent.__init(self)
+   self.p = p or 0.2 -- ratio of total number of rows or cols
+   self.train = true
+   self.noise = torch.Tensor()
+   local ratio = self.p
+   if ratio >= 1 or ratio < 0 then
+      error('<SpatialRegionDropout> illegal percentage, must be 0 <= p < 1')
+   end
+end
+
+-- Replaces the dropout ratio. Unlike the constructor, the value is stored
+-- without any range check.
+function SpatialRegionDropout:setp(p)
+   self.p = p
+end
+
+-- Region Types
+-- 1: Dropout p ratio of top rows
+-- 2: Dropout p ratio of bottom rows
+-- 3: Dropout p ratio of leftmost cols
+-- 4: Dropout p ratio of rightmost cols
+--
+-- Copies input to self.output and, in training mode, zeroes one randomly
+-- chosen border region and divides the noise mask by (1 - p).
+-- input : 4D (nbatch, nfeat, h, w) or 3D (nfeat, h, w) tensor; other
+--    dimensionalities raise an error (training mode only).
+-- Side effects: draws self.regionType with torch.random(4) and stores the
+-- mask in self.noise, which updateGradInput reuses.
+-- When floor(size * p) is 0 (e.g. p == 0 or a very small input) nothing is
+-- dropped; an empty index range is no longer constructed in that case.
+function SpatialRegionDropout:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if self.train then
+      self.noise:resizeAs(input):fill(1)
+      self.regionType = torch.random(4)
+
+      local nDim = input:dim()
+      if nDim ~= 3 and nDim ~= 4 then
+         error('Input must be 4D (nbatch, nfeat, h, w) or 3D (nfeat, h, w)')
+      end
+
+      -- rows live in the second to last dimension, columns in the last one
+      local dim = nDim
+      if self.regionType <= 2 then
+         dim = nDim - 1
+      end
+      local size = input:size(dim)
+      local nDrop = math.floor(size*self.p)
+
+      if nDrop > 0 then
+         -- odd region types start at the first row/col, even ones end at the last
+         local first = 1
+         if self.regionType % 2 == 0 then
+            first = size - nDrop + 1
+         end
+         self.noise:narrow(dim, first, nDrop):fill(0)
+      end
+
+      self.noise:div(1-self.p)
+      self.output:cmul(self.noise)
+   end
+   return self.output
+end
+
+-- Masks gradOutput with the noise generated by the last updateOutput call.
+-- Raises an error outside of training mode.
+function SpatialRegionDropout:updateGradInput(input, gradOutput)
+   if not self.train then
+      error('Backpropagation is only defined for training.')
+   end
+   local grad = self.gradInput
+   grad:resizeAs(gradOutput):copy(gradOutput)
+   grad:cmul(self.noise)
+   return self.gradInput
+end
+
+-- Description made of the class name and the dropout ratio.
+function SpatialRegionDropout:__tostring__()
+   local className = torch.type(self)
+   return ('%s p: %f'):format(className, self.p)
+end
diff --git a/SpatialUniformCrop.lua b/SpatialUniformCrop.lua
new file mode 100644
index 0000000..ba81119
--- /dev/null
+++ b/SpatialUniformCrop.lua
@@ -0,0 +1,121 @@
+local SpatialUniformCrop, parent = torch.class("nn.SpatialUniformCrop", "nn.Module")
+
+-- oheight : height of the output crop
+-- owidth  : width of the output crop (defaults to oheight)
+-- scale   : optional table giving the range of random crop scales, read
+--           later as scale.min / scale.max or scale[1] / scale[2]. When
+--           given, crops are resized to oheight x owidth with
+--           nn.SpatialReSampling (from nnx).
+function SpatialUniformCrop:__init(oheight, owidth, scale)
+   nn.require('nnx')
+   parent.__init(self)
+   -- resolve the output size first so that the scaler also sees the
+   -- defaulted width (it used to receive owidth = nil)
+   self.oheight = oheight
+   self.owidth = owidth or oheight
+   self.scale = scale or nil
+   if self.scale ~= nil then
+      assert(torch.type(scale)=='table')
+      self.scaler = nn.SpatialReSampling{owidth=self.owidth, oheight=self.oheight}
+   end
+end
+
+-- Maps the offsets (h, w) sampled for a scaled crop of size
+-- soheight x sowidth to the top-left corner (h1, w1) of that crop inside an
+-- iH x iW image. Both coordinates are clamped to be at least 1.
+-- Shared by updateOutput and updateGradInput so that both passes always
+-- address the same region.
+local function scaledCropOrigin(iH, iW, soheight, sowidth, h, w)
+   local ch = math.ceil(iH/2 - (iH-soheight)/2 + h)
+   -- the horizontal centre is derived from the image width; it used to be
+   -- computed from iH, which misplaced the crop on non-square images
+   local cw = math.ceil(iW/2 - (iW-sowidth)/2 + w)
+   local h1 = math.max(1, ch - math.ceil(soheight/2))
+   local w1 = math.max(1, cw - math.ceil(sowidth/2))
+   return h1, w1
+end
+
+-- Crops every sample of input to self.oheight x self.owidth.
+-- Training mode (self.train ~= false): one random crop per sample; when
+-- self.scale is set the crop size is additionally multiplied by a random
+-- scale and the crop is resized with self.scaler. The sampled offsets are
+-- stored in self.coord (and the scales in self.scales) for updateGradInput.
+-- Evaluation mode: a single centre crop of the whole batch; self.coord is
+-- not updated in this mode.
+-- Returns self.output after self:fromBatch.
+-- NOTE(review): toBatch/fromBatch are defined outside this file; presumably
+-- they add/remove a leading batch dimension for non-batched input - confirm.
+function SpatialUniformCrop:updateOutput(input)
+   nn.require('nnx')
+   input = self:toBatch(input, 3)
+
+   self.output:resize(input:size(1), input:size(2), self.oheight, self.owidth)
+   self.coord = self.coord or torch.IntTensor()
+   self.coord:resize(input:size(1), 2)
+
+   if self.scale ~= nil then
+      self.scales = self.scales or torch.FloatTensor()
+      self.scales:resize(input:size(1))
+   end
+
+   local iH, iW = input:size(3), input:size(4)
+   if self.train ~= false then
+      if self.scale ~= nil then
+         for i=1,input:size(1) do
+            -- do random crop
+            -- store the scale first and read it back: the buffer holds
+            -- floats, and the backward pass must derive exactly the same
+            -- crop size from the stored value
+            self.scales[i] = torch.uniform(self.scale['min'] or self.scale[1], self.scale['max'] or self.scale[2])
+            local s = self.scales[i]
+            local soheight = math.ceil(s*self.oheight)
+            local sowidth = math.ceil(s*self.owidth)
+
+            local h = math.ceil(torch.uniform(1e-2, iH-soheight))
+            local w = math.ceil(torch.uniform(1e-2, iW-sowidth))
+
+            local h1, w1 = scaledCropOrigin(iH, iW, soheight, sowidth, h, w)
+            local crop = input[i]:narrow(2, h1, soheight):narrow(3, w1, sowidth)
+
+            self.output[i]:copy(self.scaler:forward(crop))
+            -- save crop coordinates for backward
+            self.coord[{i,1}] = h
+            self.coord[{i,2}] = w
+         end
+      else
+         for i=1,input:size(1) do
+            -- do random crop
+            local h1 = math.ceil(torch.uniform(1e-2, iH-self.oheight))
+            local w1 = math.ceil(torch.uniform(1e-2, iW-self.owidth))
+            local crop = input[i]:narrow(2,h1,self.oheight):narrow(3,w1,self.owidth)
+            self.output[i]:copy(crop)
+            -- save crop coordinates for backward
+            self.coord[{i,1}] = h1
+            self.coord[{i,2}] = w1
+         end
+      end
+   else
+      -- use center crop; clamp to 1 because indices are 1-based and the
+      -- offset is 0 when the input already has the output size
+      local h1 = math.max(1, math.ceil((iH-self.oheight)/2))
+      local w1 = math.max(1, math.ceil((iW-self.owidth)/2))
+      local crop = input:narrow(3,h1,self.oheight):narrow(4,w1,self.owidth)
+      self.output:copy(crop)
+   end
+
+   self.output = self:fromBatch(self.output, 1)
+   return self.output
+end
+
+-- Routes gradOutput back to the image region each crop was taken from; the
+-- rest of self.gradInput is zero.
+-- Reads the offsets (and scales) saved by the last training-mode
+-- updateOutput call from self.coord / self.scales.
+-- Returns self.gradInput after self:fromBatch.
+function SpatialUniformCrop:updateGradInput(input, gradOutput)
+   input = self:toBatch(input, 3)
+   gradOutput = self:toBatch(gradOutput, 3)
+
+   self.gradInput:resizeAs(input):zero()
+   if self.scale ~= nil then
+      local iH, iW = input:size(3), input:size(4)
+      for i=1,input:size(1) do
+         local s = self.scales[i]
+         local soheight = math.ceil(s*self.oheight)
+         local sowidth = math.ceil(s*self.owidth)
+
+         local h, w = self.coord[{i,1}], self.coord[{i,2}]
+         local h1, w1 = scaledCropOrigin(iH, iW, soheight, sowidth, h, w)
+
+         local crop = input[i]:narrow(2, h1, soheight):narrow(3, w1, sowidth)
+         local samplerGradInput = self.scaler:updateGradInput(crop, gradOutput[i])
+
+         self.gradInput[i]:narrow(2, h1, soheight):narrow(3, w1, sowidth):copy(samplerGradInput)
+      end
+   else
+      for i=1,input:size(1) do
+         local h1, w1 = self.coord[{i,1}], self.coord[{i,2}]
+         self.gradInput[i]:narrow(2,h1,self.oheight):narrow(3,w1,self.owidth):copy(gradOutput[i])
+      end
+   end
+
+   self.gradInput = self:fromBatch(self.gradInput, 1)
+   return self.gradInput
+end
+
+-- Casts the module to tensor type `type` and returns the parent's result.
+-- self.coord is dropped first so that it is not converted by the cast;
+-- updateOutput re-creates it as a torch.IntTensor when it is missing.
+function SpatialUniformCrop:type(type, cache)
+   self.coord = nil
+   return parent.type(self, type, cache)
+end
diff --git a/StepLSTM.lua b/StepLSTM.lua
index 5fb701f..42533a4 100644
--- a/StepLSTM.lua
+++ b/StepLSTM.lua
@@ -207,5 +207,3 @@ function StepLSTM:type(type, ...)
    self:clearState()
    return parent.type(self, type, ...)
 end
-
-StepLSTM.toFastLSTM = nn.SeqLSTM.toFastLSTM
\ No newline at end of file
diff --git a/TotalDropout.lua b/TotalDropout.lua
new file mode 100644
index 0000000..b239fec
--- /dev/null
+++ b/TotalDropout.lua
@@ -0,0 +1,38 @@
+------------------------------------------------------------------------
+--[[ TotalDropout ]]--
+-- Like vanilla Dropout, but on the entire inputs.
+-- So either the input is entirely forwarded or entirely zeroed.
+------------------------------------------------------------------------
+local TotalDropout, parent = torch.class("nn.TotalDropout", "nn.Module")
+
+-- p : probability of zeroing the entire input, 0 <= p < 1 (default 0.5).
+-- The module starts in training mode.
+function TotalDropout:__init(p)
+   local prob = p or 0.5
+   self.p = prob
+   self.train = true
+   if prob >= 1 or prob < 0 then
+      error('<TotalDropout> illegal percentage, must be 0 <= p < 1')
+   end
+   parent.__init(self)
+end
+
+-- Copies input to self.output. In training mode a single Bernoulli(1 - p)
+-- sample is drawn, stored in self.noise and multiplied into the whole output,
+-- so the input is either forwarded unchanged or entirely zeroed.
+function TotalDropout:updateOutput(input)
+   local out = self.output
+   out:resizeAs(input):copy(input)
+   if not self.train then
+      return out
+   end
+   self.noise = torch.bernoulli(1-self.p)
+   out:mul(self.noise)
+   return out
+end
+
+-- Multiplies gradOutput by the noise sample drawn in the last updateOutput
+-- call. Raises an error outside of training mode.
+function TotalDropout:updateGradInput(input, gradOutput)
+   if not self.train then
+      error('backprop only defined while training')
+   end
+   local grad = self.gradInput
+   grad:resizeAs(gradOutput):copy(gradOutput)
+   -- mask the gradients with the same noise value used in the forward pass
+   grad:mul(self.noise)
+   return self.gradInput
+end
+
+-- Description made of the class name and the dropout probability.
+function TotalDropout:__tostring__()
+  local className = torch.type(self)
+  return ('%s(%f)'):format(className, self.p)
+end
diff --git a/VRClassReward.lua b/VRClassReward.lua
new file mode 100644
index 0000000..9c3561a
--- /dev/null
+++ b/VRClassReward.lua
@@ -0,0 +1,94 @@
+------------------------------------------------------------------------
+--[[ VRClassReward ]]--
+-- Variance reduced classification reinforcement criterion.
+-- input : {class prediction, baseline reward}
+-- Reward is 1 for success, Reward is 0 otherwise.
+-- reward = scale*(Reward - baseline) where baseline is 2nd input element
+-- Note : for RNNs with R = 1 for last step in sequence, encapsulate it
+-- in nn.ModuleCriterion(VRClassReward, nn.SelectTable(-1))
+------------------------------------------------------------------------
+local VRClassReward, parent = torch.class("nn.VRClassReward", "nn.Criterion")
+
+-- module    : module whose reinforce(reward) method receives the reward
+-- scale     : reward given for a correct classification (default 1)
+-- criterion : criterion used to learn the baseline (default nn.MSECriterion)
+function VRClassReward:__init(module, scale, criterion)
+   parent.__init(self)
+   self.sizeAverage = true
+   self.gradInput = {torch.Tensor()}
+   self.scale = scale or 1 -- scale of reward
+   self.module = module -- so it can call module:reinforce(reward)
+   self.criterion = criterion or nn.MSECriterion() -- baseline criterion
+end
+
+-- input  : table whose first element holds the class predictions
+-- target : class indices
+-- The predicted class is the arg-max over dimension 2. self.reward holds
+-- self.scale for every correctly classified sample and 0 otherwise.
+-- Returns -sum(reward), divided by the batch size when self.sizeAverage.
+function VRClassReward:updateOutput(input, target)
+   assert(torch.type(input) == 'table')
+   local scores = self:toBatch(input[1], 1)
+
+   if not self._maxVal then
+      self._maxVal = scores.new()
+   end
+   if not self._maxIdx then
+      if torch.type(scores) == 'torch.CudaTensor' then
+         self._maxIdx = torch.CudaLongTensor()
+      else
+         self._maxIdx = torch.LongTensor()
+      end
+   end
+
+   -- max class value is class prediction
+   self._maxVal:max(self._maxIdx, scores, 2)
+
+   -- CUDA long indices are copied into a CudaTensor before the comparison
+   local predicted = self._maxIdx
+   if torch.type(predicted) == 'torch.CudaLongTensor' then
+      self.__maxIdx = self.__maxIdx or torch.CudaTensor()
+      self.__maxIdx:resize(predicted:size()):copy(predicted)
+      predicted = self.__maxIdx
+   end
+
+   -- bring the target to the tensor type of the predictions
+   if torch.type(predicted) ~= torch.type(target) then
+      self._target = self._target or predicted.new()
+      self._target:resize(target:size()):copy(target)
+      target = self._target
+   end
+
+   -- reward = scale when correctly classified
+   self._reward = self._reward or predicted.new()
+   self._reward:eq(predicted, target)
+   self.reward = self.reward or scores.new()
+   self.reward:resize(self._reward:size(1)):copy(self._reward)
+   self.reward:mul(self.scale)
+
+   -- loss = -sum(reward)
+   local loss = -self.reward:sum()
+   if self.sizeAverage then
+      loss = loss/scores:size(1)
+   end
+   self.output = loss
+   return self.output
+end
+
+-- inputTable : {class prediction, baseline reward}
+-- Broadcasts the variance-reduced reward (reward - baseline, divided by the
+-- batch size when self.sizeAverage) to self.module through reinforce().
+-- Returns self.gradInput: a zero gradient for the class prediction and the
+-- baseline criterion's gradient for the baseline.
+-- Requires self.reward from a preceding updateOutput call.
+function VRClassReward:updateGradInput(inputTable, target)
+   local classPred = self:toBatch(inputTable[1], 1)
+   local baseline = self:toBatch(inputTable[2], 1)
+
+   -- reduce variance of reward using baseline
+   if not self.vrReward then
+      self.vrReward = self.reward.new()
+   end
+   local vrReward = self.vrReward
+   vrReward:resizeAs(self.reward):copy(self.reward)
+   vrReward:add(-1, baseline)
+   if self.sizeAverage then
+      vrReward:div(classPred:size(1))
+   end
+   -- broadcast reward to modules
+   self.module:reinforce(vrReward)
+
+   -- zero gradInput (this criterion has no gradInput for class pred)
+   local gradPred = self.gradInput[1]
+   gradPred:resizeAs(classPred):zero()
+   self.gradInput[1] = self:fromBatch(gradPred, 1)
+
+   -- learn the baseline reward
+   self.criterion:forward(baseline, self.reward)
+   local gradBaseline = self.criterion:backward(baseline, self.reward)
+   self.gradInput[2] = self:fromBatch(gradBaseline, 1)
+   return self.gradInput
+end
+
+-- Casts the criterion to tensor type `type` and returns the parent's result.
+-- The index/comparison buffers are dropped so that updateOutput re-creates
+-- them with the types it needs. This includes self._reward, which must keep
+-- the tensor type of the predicted indices for the eq() comparison and would
+-- otherwise be converted to `type` by the cast.
+-- self.module is detached during the cast so that the reinforced module is
+-- not converted through this criterion.
+function VRClassReward:type(type)
+   self._maxVal = nil
+   self._maxIdx = nil
+   self.__maxIdx = nil
+   self._target = nil
+   self._reward = nil
+   local module = self.module
+   self.module = nil
+   local ret = parent.type(self, type)
+   self.module = module
+   return ret
+end
diff --git a/WhiteNoise.lua b/WhiteNoise.lua
new file mode 100644
index 0000000..518e749
--- /dev/null
+++ b/WhiteNoise.lua
@@ -0,0 +1,38 @@
+local WhiteNoise, Parent = torch.class('nn.WhiteNoise', 'nn.Module')
+
+-- mean : mean of the additive Gaussian noise (default 0)
+-- std  : standard deviation of the noise (default 0.1)
+function WhiteNoise:__init(mean, std)
+   Parent.__init(self)
+   self.noise = torch.Tensor()
+   self.mean = mean or 0
+   -- std corresponds to 50% for MNIST training data std.
+   self.std = std or 0.1
+end
+
+-- Training mode (self.train ~= false): output = input + N(mean, std) noise,
+-- with the noise kept in self.noise.
+-- Evaluation mode: output = input + mean (the addition is skipped when the
+-- mean is 0).
+function WhiteNoise:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if self.train == false then
+      if self.mean ~= 0 then
+         self.output:add(self.mean)
+      end
+      return self.output
+   end
+   self.noise:resizeAs(input):normal(self.mean, self.std)
+   self.output:add(self.noise)
+   return self.output
+end
+
+-- The noise is additive, so gradOutput is passed through unchanged.
+-- Raises an error in evaluation mode.
+function WhiteNoise:updateGradInput(input, gradOutput)
+   if self.train == false then
+      error('backprop only defined while training')
+   end
+   -- Simply return the gradients.
+   self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+   return self.gradInput
+end
+
+-- Description made of the class name, the mean and the standard deviation.
+function WhiteNoise:__tostring__()
+  local className = torch.type(self)
+  return ('%s mean: %f, std: %f'):format(className, self.mean, self.std)
+end
diff --git a/ZipTable.lua b/ZipTable.lua
new file mode 100644
index 0000000..55b261d
--- /dev/null
+++ b/ZipTable.lua
@@ -0,0 +1,34 @@
+local ZipTable, parent = torch.class('nn.ZipTable', 'nn.Container')
+
+-- input : { {a1,a2}, {b1,b2}, {c1,c2} }
+-- output : { {a1,b1,c1}, {a2,b2,c2} }
+-- Constructor: output and gradInput start out as empty tables.
+function ZipTable:__init()
+   parent.__init(self)
+   self.output, self.gradInput = {}, {}
+end
+
+-- Transposes a table of tables: output[j][i] = inputTable[i][j].
+-- A fresh output table is built on every call.
+function ZipTable:updateOutput(inputTable)
+   local zipped = {}
+   self.output = zipped
+   for i, inTable in ipairs(inputTable) do
+      for j, input in ipairs(inTable) do
+         if not zipped[j] then
+            zipped[j] = {}
+         end
+         zipped[j][i] = input
+      end
+   end
+   return self.output
+end
+
+-- Transposes the gradients back: gradInput[j][i] = gradOutputTable[i][j].
+-- A fresh gradInput table is built on every call.
+function ZipTable:updateGradInput(inputTable, gradOutputTable)
+   local unzipped = {}
+   self.gradInput = unzipped
+   for i, gradOutTable in ipairs(gradOutputTable) do
+      for j, gradOutput in ipairs(gradOutTable) do
+         if not unzipped[j] then
+            unzipped[j] = {}
+         end
+         unzipped[j][i] = gradOutput
+      end
+   end
+   return self.gradInput
+end
+
diff --git a/ZipTableOneToMany.lua b/ZipTableOneToMany.lua
new file mode 100644
index 0000000..fe8b385
--- /dev/null
+++ b/ZipTableOneToMany.lua
@@ -0,0 +1,37 @@
+local ZipTableOneToMany, parent = torch.class('nn.ZipTableOneToMany', 'nn.Container')
+
+-- based on ZipTable in dpnn
+
+-- input : { v, {a, b, c} } 
+-- output : { {v,a}, {v,b}, {v,c} }
+-- Constructor: output and gradInput start out as empty tables.
+function ZipTableOneToMany:__init()
+   parent.__init(self)
+   self.output, self.gradInput = {}, {}
+   -- buffer accumulating the gradient of the shared element during backward
+   self.gradInputEl = torch.Tensor()
+end
+
+-- input : {v, {a, b, c, ...}}
+-- Pairs the single element with every entry of the table:
+-- output = {{v, a}, {v, b}, {v, c}, ...}
+function ZipTableOneToMany:updateOutput(input)
+   assert(#input == 2, "input must be table of element and table")
+   local shared, others = input[1], input[2]
+   local pairsOut = {}
+   for i, other in ipairs(others) do
+      pairsOut[i] = {shared, other}
+   end
+   self.output = pairsOut
+   return self.output
+end
+
+-- gradOutput : {{gradV1, gradA}, {gradV2, gradB}, ...}
+-- The gradient of the shared element is the sum of the first entry of every
+-- pair (accumulated in self.gradInputEl); the second entries are passed
+-- through as a table. Returns {gradInputEl, {gradA, gradB, ...}}.
+function ZipTableOneToMany:updateGradInput(input, gradOutput)
+   assert(#input == 2, "input must be table of element and table")
+   local shared = input[1]
+   local gradShared = self.gradInputEl
+   gradShared:resizeAs(shared):zero()
+   local gradOthers = {}
+   for i, gradPair in ipairs(gradOutput) do
+      gradShared:add(gradPair[1])
+      gradOthers[i] = gradPair[2]
+   end
+   self.gradInput = {gradShared, gradOthers}
+   return self.gradInput
+end
+
diff --git a/init.lua b/init.lua
index 78e8255..1493f0e 100644
--- a/init.lua
+++ b/init.lua
@@ -1,11 +1,22 @@
-require 'dpnn'
 require 'torchx'
-dpnn.version = dpnn.version or 0
-assert(dpnn.version > 1, "Please update dpnn : luarocks install dpnn")
+local _ = require 'moses'
+require 'nn'
 
 -- create global rnn table:
 rnn = {}
-rnn.version = 2.3 -- deprecated Recurrent and FastLSTM
+rnn.version = 2.4 -- merge dpnn
+
+-- lua 5.2 compat
+
+-- Requires the package `packagename` (a string). When loading fails, an
+-- installation hint naming that package is printed and the original error is
+-- raised again.
+function nn.require(packagename)
+   assert(torch.type(packagename) == 'string')
+   local success, message = pcall(require, packagename)
+   if not success then
+      -- the hint used to name 'nnx' whatever package was actually missing
+      print("missing package "..packagename..": run 'luarocks install "..packagename.."'")
+      error(message)
+   end
+end
+
 
 -- c lib:
 require "paths"
@@ -13,67 +24,132 @@ paths.require 'librnn'
 
 unpack = unpack or table.unpack
 
-torch.include('rnn', 'recursiveUtils.lua')
-torch.include('rnn', 'utils.lua')
+require('rnn.recursiveUtils')
+require('rnn.utils')
 
--- extensions to nn.Module
-torch.include('rnn', 'Module.lua')
+-- extensions to existing nn.Module
+require('rnn.Module')
+require('rnn.Container')
+require('rnn.Sequential')
+require('rnn.ParallelTable')
+require('rnn.LookupTable')
+require('rnn.Dropout')
 
--- override nn.Dropout
-torch.include('rnn', 'Dropout.lua')
+-- extensions to existing criterions
+require('rnn.Criterion')
 
--- for testing:
-torch.include('rnn', 'test.lua')
-torch.include('rnn', 'bigtest.lua')
+-- decorator modules
+require('rnn.Decorator')
+require('rnn.Serial')
+require('rnn.DontCast')
+require('rnn.NaN')
+require('rnn.Profile')
+
+-- extensions to make serialization more efficient
+require('rnn.SpatialMaxPooling')
+require('rnn.SpatialConvolution')
+require('rnn.SpatialConvolutionMM')
+require('rnn.SpatialBatchNormalization')
+require('rnn.BatchNormalization')
+
+
+-- modules
+require('rnn.PrintSize')
+require('rnn.Convert')
+require('rnn.Constant')
+require('rnn.Collapse')
+require('rnn.ZipTable')
+require('rnn.ZipTableOneToMany')
+require('rnn.CAddTensorTable')
+require('rnn.ReverseTable')
+require('rnn.Dictionary')
+require('rnn.Inception')
+require('rnn.Clip')
+require('rnn.SpatialUniformCrop')
+require('rnn.SpatialGlimpse')
+require('rnn.WhiteNoise')
+require('rnn.ArgMax')
+require('rnn.CategoricalEntropy')
+require('rnn.TotalDropout')
+require('rnn.Kmeans')
+require('rnn.OneHot')
+require('rnn.SpatialRegionDropout')
+require('rnn.FireModule')
+require('rnn.SpatialFeatNormalization')
+require('rnn.ZeroGrad')
+require('rnn.LinearNoBias')
+require('rnn.SAdd')
+require('rnn.CopyGrad')
+require('rnn.VariableLength')
+require('rnn.StepLSTM')
+require('rnn.LookupTableMaskZero')
+require('rnn.MaskZero')
+require('rnn.TrimZero')
+require('rnn.SpatialBinaryConvolution')
+require('rnn.SimpleColorTransform')
+require('rnn.PCAColorTransform')
 
--- support modules
-torch.include('rnn', 'ZeroGrad.lua')
-torch.include('rnn', 'LinearNoBias.lua')
-torch.include('rnn', 'SAdd.lua')
-torch.include('rnn', 'CopyGrad.lua')
-torch.include('rnn', 'VariableLength.lua')
+-- Noise Contrastive Estimation
+require('rnn.NCEModule')
+require('rnn.NCECriterion')
+
+-- REINFORCE
+require('rnn.Reinforce')
+require('rnn.ReinforceGamma')
+require('rnn.ReinforceBernoulli')
+require('rnn.ReinforceNormal')
+require('rnn.ReinforceCategorical')
+
+-- REINFORCE criterions
+require('rnn.VRClassReward')
+require('rnn.BinaryClassReward')
+
+-- criterions
+require('rnn.ModuleCriterion')
+require('rnn.BinaryLogisticRegression')
+require('rnn.SpatialBinaryLogisticRegression')
+
+-- for testing:
+require('rnn.test')
+require('rnn.bigtest')
 
 -- recurrent modules
-torch.include('rnn', 'AbstractRecurrent.lua')
-torch.include('rnn', 'Recursor.lua')
-torch.include('rnn', 'Recurrence.lua')
-torch.include('rnn', 'LinearRNN.lua')
-torch.include('rnn', 'LookupRNN.lua')
-torch.include('rnn', 'LSTM.lua')
-torch.include('rnn', 'RecLSTM.lua')
-torch.include('rnn', 'GRU.lua')
-torch.include('rnn', 'Mufuru.lua')
-torch.include('rnn', 'NormStabilizer.lua')
+require('rnn.AbstractRecurrent')
+require('rnn.Recursor')
+require('rnn.Recurrence')
+require('rnn.LinearRNN')
+require('rnn.LookupRNN')
+require('rnn.LSTM')
+require('rnn.RecLSTM')
+require('rnn.GRU')
+require('rnn.Mufuru')
+require('rnn.NormStabilizer')
 
 -- sequencer modules
-torch.include('rnn', 'AbstractSequencer.lua')
-torch.include('rnn', 'Repeater.lua')
-torch.include('rnn', 'Sequencer.lua')
-torch.include('rnn', 'BiSequencer.lua')
-torch.include('rnn', 'BiSequencerLM.lua')
-torch.include('rnn', 'RecurrentAttention.lua')
+require('rnn.AbstractSequencer')
+require('rnn.Repeater')
+require('rnn.Sequencer')
+require('rnn.BiSequencer')
+require('rnn.BiSequencerLM')
+require('rnn.RecurrentAttention')
 
 -- sequencer + recurrent modules
-torch.include('rnn', 'SeqLSTM.lua')
-torch.include('rnn', 'SeqLSTMP.lua')
-torch.include('rnn', 'SeqGRU.lua')
-torch.include('rnn', 'SeqReverseSequence.lua')
-torch.include('rnn', 'SeqBRNN.lua')
-
--- step modules
-torch.include('rnn', 'StepLSTM.lua')
-torch.include('rnn', 'LookupTableMaskZero.lua')
-torch.include('rnn', 'MaskZero.lua')
-torch.include('rnn', 'TrimZero.lua')
+require('rnn.SeqLSTM')
+require('rnn.SeqLSTMP')
+require('rnn.SeqGRU')
+require('rnn.SeqReverseSequence')
+require('rnn.SeqBRNN')
 
 -- recurrent criterions:
-torch.include('rnn', 'SequencerCriterion.lua')
-torch.include('rnn', 'RepeaterCriterion.lua')
-torch.include('rnn', 'MaskZeroCriterion.lua')
+require('rnn.SequencerCriterion')
+require('rnn.RepeaterCriterion')
+require('rnn.MaskZeroCriterion')
 
 -- deprecated modules
-torch.include('rnn', 'FastLSTM.lua')
-torch.include('rnn', 'Recurrent.lua')
+require('rnn.FastLSTM')
+require('rnn.Recurrent')
 
 -- prevent likely name conflicts
 nn.rnn = rnn
+
+return rnn
\ No newline at end of file
diff --git a/rocks/rnn-scm-1.rockspec b/rocks/rnn-scm-1.rockspec
index 9228354..8acdd9d 100644
--- a/rocks/rnn-scm-1.rockspec
+++ b/rocks/rnn-scm-1.rockspec
@@ -18,7 +18,6 @@ A library to build RNNs, LSTMs, GRUs, BRNNs, BLSTMs, and so forth and so on.
 dependencies = {
    "torch >= 7.0",
    "nn >= 1.0",
-   "dpnn >= 1.0",
    "torchx >= 1.0"
 }
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..08be104
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,2 @@
+
+install_files(${INSTALL_PREFIX} test.lua)
diff --git a/test/bigtest.lua b/test/bigtest.lua
index 745c6ce..2df543a 100644
--- a/test/bigtest.lua
+++ b/test/bigtest.lua
@@ -114,6 +114,359 @@ function rnnbigtest.NCE_nan()
 
 end
 
+function rnnbigtest.Reinforce()
+   -- let us try to reinforce an mlp to learn a simple distribution
+   local n = 10
+   local inputs = torch.Tensor(n,3):uniform(0,0.1)
+   local targets = torch.Tensor(n):fill(0)
+   local stdev = 0.5
+   local beta = 0.9 -- NOTE(review): not referenced anywhere else in this function
+   local alpha = 1
+   local lr = 0.1
+
+   for i=1,inputs:size(1) do -- per row: column j gets a value in [0.9,1.1) and j becomes the class target
+      local j = (i % inputs:size(2)) + 1
+      inputs[{i,j}] = torch.uniform(0.9,1.1)
+      targets[i] = j
+   end
+
+   local M = 10 -- maximum number of restarts (mlp:reset()) attempted by train()
+   local function train(mlp, cost, N, name) -- up to M restarts of up to N epochs each; asserts the reward improved enough
+      local converged = false
+      local baseReward
+      local reward
+      for i=1,M do
+         mlp:reset()
+
+         baseReward = 0 -- negated mean cost right after the reset, before any update
+         for i=1,inputs:size(1) do -- NOTE(review): this and the inner loops below shadow the outer loop variable i
+            mlp:evaluate()
+            local target = targets:narrow(1,i,1)
+            local output = mlp:forward(inputs:narrow(1,i,1))
+            baseReward = baseReward - cost:forward(output, target)
+         end
+         baseReward = baseReward/inputs:size(1)
+
+         for k=1,N do
+
+            for i=1,inputs:size(1) do -- one parameter update per sample (batch of size 1)
+               mlp:training()
+               mlp:zeroGradParameters()
+               local target = targets:narrow(1,i,1)
+               local output = mlp:forward(inputs:narrow(1,i,1))
+               local err = cost:forward(output, target) -- NOTE(review): err itself is never read
+               local gradOutput = cost:backward(output, target)
+               mlp:backward(inputs:narrow(1,i,1), gradOutput)
+               mlp:updateParameters(lr)
+            end
+
+            reward = 0 -- negated mean cost in evaluate mode after this epoch
+            for i=1,inputs:size(1) do
+               mlp:evaluate()
+               local target = targets:narrow(1,i,1)
+               local output = mlp:forward(inputs:narrow(1,i,1))
+               reward = reward - cost:forward(output, target)
+            end
+            reward = reward/inputs:size(1)
+
+            -- is the baseReward lesser than 70% of reward after training?
+            -- i.e. did the reward increase sufficiently?
+            if reward*0.7 > baseReward then
+               converged = true
+               break
+            end
+         end
+
+         if reward*0.7 > baseReward then -- same condition again, so the restart loop stops as well
+            converged = true
+            break
+         end
+      end
+
+      mytester:assert(converged, name.." did not converge : "..reward.."*0.7 < "..baseReward)
+   end
+
+   -- ReinforceNormal
+   local hiddenSize = 200
+   local N = 10
+   local mlp = nn.Sequential()
+   mlp:add(nn.Linear(inputs:size(2),hiddenSize))
+   mlp:add(nn.Tanh())
+   mlp:add(nn.ReinforceNormal(stdev))
+   mlp:add(nn.Clip(-1,1))
+   mlp:add(nn.Linear(hiddenSize, inputs:size(2)))
+   mlp:add(nn.SoftMax())
+
+   local concat = nn.ConcatTable() -- output is { mlp output, Constant+Add branch }; presumably the baseline for VRClassReward - confirm
+   concat:add(mlp)
+   concat:add( nn.Sequential():add( nn.Constant(1,1) ):add(nn.Add(1)) )
+
+   local cost = nn.VRClassReward(concat, alpha)
+
+   train(concat, cost, N, 'ReinforceNormal')
+
+   -- ReinforceGamma
+   local hiddenSize = 200 -- NOTE(review): from here on each section re-declares hiddenSize, N, mlp, concat and cost
+   local N = 10
+   local mlp = nn.Sequential()
+   mlp:add(nn.Linear(inputs:size(2),hiddenSize))
+   mlp:add(nn.Sigmoid())
+   mlp:add(nn.ReinforceGamma(stdev))
+   mlp:add(nn.Linear(hiddenSize, inputs:size(2)))
+   mlp:add(nn.SoftMax())
+
+   local concat = nn.ConcatTable()
+   concat:add(mlp)
+   concat:add( nn.Sequential():add( nn.Constant(1,1) ):add(nn.Add(1)) )
+
+   local cost = nn.VRClassReward(concat, alpha)
+
+   train(concat, cost, N, 'ReinforceGamma')
+
+   -- ReinforceBernoulli
+   local hiddenSize = 20
+   local N = 30
+   local mlp = nn.Sequential()
+   mlp:add(nn.Linear(inputs:size(2),hiddenSize))
+   mlp:add(nn.Sigmoid())
+   mlp:add(nn.ReinforceBernoulli())
+   mlp:add(nn.Linear(hiddenSize, inputs:size(2)))
+   mlp:add(nn.SoftMax())
+
+   local concat = nn.ConcatTable()
+   concat:add(mlp)
+   concat:add( nn.Sequential():add( nn.Constant(1,1) ):add(nn.Add(1)) )
+
+   local cost = nn.VRClassReward(concat, alpha)
+
+   train(concat, cost, N, 'ReinforceBernoulli')
+
+   -- ReinforceCategorical
+   local hiddenSize = 200
+   local N = 10
+   local mlp = nn.Sequential()
+   mlp:add(nn.Linear(inputs:size(2),hiddenSize))
+   mlp:add(nn.Tanh())
+   mlp:add(nn.Linear(hiddenSize, inputs:size(2)))
+   mlp:add(nn.SoftMax())
+   mlp:add(nn.AddConstant(0.00001)) -- small offset added to the SoftMax output before it is sampled from
+   mlp:add(nn.ReinforceCategorical())
+
+   local concat = nn.ConcatTable()
+   concat:add(mlp)
+   concat:add( nn.Sequential():add( nn.Constant(1,1) ):add(nn.Add(1)) )
+
+   local cost = nn.VRClassReward(concat, alpha)
+
+   train(concat, cost, N, 'ReinforceCategorical')
+end
+
+-- Unit Test Kmeans layer : trains nn.Kmeans by gradient descent from two kinds of initialization and checks the best loss
+function rnnbigtest.Kmeans()
+   local k = 10
+   local dim = 5
+   local batchSize = 1000
+   local input = torch.rand(batchSize, dim)
+   for i=1, batchSize do -- every row is filled with a single integer drawn from 1..k
+      input[i]:fill(torch.random(1, k))
+   end
+
+   local verbose = false
+
+   local attempts = 10
+   local iter = 100
+   local bestLoss = 100000000 -- NOTE(review): never reset inside the loops below, so it is the best loss over all configurations run so far
+   local bestKm = nil -- NOTE(review): not used below
+   local tempLoss = 0
+   local learningRate = 1
+
+   local initTypes = {'random', 'kmeans++'}
+   local hasCuda = pcall(function() require 'cunn' end)
+   local useCudas = {false, hasCuda} -- the second entry is only true when cunn could be loaded
+   for _, initType in pairs(initTypes) do
+      for _, useCuda in pairs(useCudas) do
+
+         sys.tic()
+         for j=1, attempts do
+            local km = nn.Kmeans(k, dim)
+
+            if initType == 'kmeans++' then
+               km:initKmeansPlus(input)
+            else
+               km:initRandom(input)
+            end
+
+            if useCuda then km:cuda() end -- NOTE(review): input is not converted here - confirm Kmeans accepts that
+            for i=1, iter do
+               km:zeroGradParameters()
+
+               km:forward(input)
+               km:backward(input, gradOutput) -- NOTE(review): gradOutput is not defined in this function (a global, presumably nil) - confirm Kmeans ignores it
+
+               -- Gradient descent
+               km.weight:add(-learningRate, km.gradWeight)
+               tempLoss = km.loss
+            end
+            if verbose then print("Attempt Loss " .. j ..": " .. tempLoss) end
+            if tempLoss < bestLoss then
+               bestLoss = tempLoss
+            end
+         end
+         if verbose then
+            print("InitType: " .. initType .. " useCuda: " .. tostring(useCuda))
+            print("Best Loss: " .. bestLoss)
+            print("Total time: " .. sys.toc())
+         end
+         if initType == 'kmeans++' then -- kmeans++ is held to a much tighter bound than random initialization
+            mytester:assert(bestLoss < 0.00001)
+         else
+            mytester:assert(bestLoss < 500)
+         end
+      end
+   end
+end
+
+function rnnbigtest.NCE_benchmark() -- prints timings of NCEModule+NCECriterion vs Linear+SoftMax+ClassNLLCriterion; asserts nothing
+   local hasCuda = pcall(function() require 'cunn' end) -- make sure to import cunn before initializing large tensors, else weird segfault...
+
+   local nclass = 1000000
+   local hiddensize = 200
+   local batchsize = 50
+   local nloop = 5 -- number of repetitions inside every timed section
+   local k = 25
+   local unigrams = torch.Tensor(nclass):uniform(0,1)
+   local mlp = nn.Sequential()
+      :add(nn.Linear(hiddensize, nclass))
+      :add(nn.SoftMax())
+   local nll = nn.ClassNLLCriterion()
+
+   local nce = nn.NCEModule(hiddensize, nclass, k, unigrams) -- uses k instead of repeating the literal 25
+   local crit = nn.NCECriterion()
+
+   local input = torch.randn(batchsize, hiddensize)
+   local target = torch.LongTensor(batchsize):random(1,nclass)
+
+   local sync = function() return end -- no-op unless CUDA is in use
+   if hasCuda then -- reuses the result of the single cunn probe above
+      input = input:cuda()
+      target = target:cuda()
+      nce:cuda()
+      crit:cuda()
+      mlp:cuda()
+      nll:cuda()
+      sync = function() cutorch.synchronize() end
+   end
+
+   local output = nce:forward{input, target} -- untimed first pass through both pipelines
+   local loss = crit:forward(output, target)
+   local gradOutput = crit:backward(output, target)
+   local gradInput = nce:backward({input, target}, gradOutput)
+
+   local output = mlp:forward(input)
+   local loss = nll:forward(output, target)
+   local gradOutput = nll:backward(output, target)
+   local gradInput = mlp:backward(input, gradOutput)
+
+   sync()
+   local a = torch.Timer()
+   for i=1,nloop do
+      output = nce:forward{input, target}
+   end
+   sync()
+   local ncefwd = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      loss = crit:forward(output, target)
+   end
+   sync()
+   local critfwd = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      gradOutput = crit:backward(output, target)
+   end
+   sync()
+   local critbwd = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      gradInput = nce:backward({input, target}, gradOutput)
+   end
+   sync()
+   local ncebwd = a:time().real
+
+   -- mlp nll
+   local a = torch.Timer()
+   for i=1,nloop do
+      output = mlp:forward(input)
+   end
+   sync()
+   local mlpfwd = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      loss = nll:forward(output, target)
+   end
+   sync()
+   local nllfwd = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      gradOutput = nll:backward(output, target)
+   end
+   sync()
+   local nllbwd = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      gradInput = mlp:backward(input, gradOutput)
+   end
+   sync()
+   local mlpbwd = a:time().real
+
+   local ncetotal = ncefwd+critfwd+critbwd+ncebwd
+   local lintotal = mlpfwd+nllfwd+nllbwd+mlpbwd
+   print("module:forward (nce vs linear)", ncefwd, mlpfwd)
+   print("criterion:forward (nce vs nll)", critfwd, nllfwd)
+   print("criterion:backward (nce vs nll)", critbwd, nllbwd)
+   print("module:backward (nce vs linear)", ncebwd, mlpbwd)
+   print("total (nce vs linear)", ncetotal, lintotal, lintotal/ncetotal)
+
+   if not (cunn and cutorch.getDeviceCount() > 1) then -- the multi-GPU part below needs cunn and at least two devices
+      return
+   end
+
+   nce:multicuda(1,2)
+
+   local output = nce:forward{input, target}
+   local loss = crit:forward(output, target)
+   local gradOutput = crit:backward(output, target)
+   local gradInput = nce:backward({input, target}, gradOutput)
+   sync()
+
+   local a = torch.Timer()
+   for i=1,nloop do
+      output = nce:forward{input, target}
+   end
+   sync()
+   local ncefwd2 = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      gradInput = nce:backward({input, target}, gradOutput)
+   end
+   sync()
+   local ncebwd2 = a:time().real
+
+   local total1 = ncefwd+ncebwd
+   local total2 = ncefwd2+ncebwd2
+   print("module:forward (1 vs 2 gpu)", ncefwd, ncefwd2)
+   print("module:backward (1 vs 2 gpu)", ncebwd, ncebwd2)
+   print("total (1 vs 2 gpu)", total1, total2, total2/total1)
+end
+
+
 function rnnbigtest.LSTM()
    local seqlen, batchsize = 30, 32
    local inputsize, outputsize = 512, 512
@@ -176,6 +529,7 @@ function rnnbigtest.LSTM()
    end
 end
 
+
 function rnn.bigtest(tests)
    mytester = torch.Tester()
    mytester:add(rnnbigtest)
diff --git a/test/test.lua b/test/test.lua
index 52d85e6..becfae4 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -7098,7 +7098,7 @@ function rnntest.RecLSTM()
    local reclstm = nn.RecLSTM(inputsize, outputsize)
    local lstm = nn.Sequencer(reclstm)
 
-   local input = torch.Tensor(seqlen, batchsize, inputsize)
+   local input = torch.randn(seqlen, batchsize, inputsize)
    local output = lstm:forward(input)
 
    local seqlstm = nn.SeqLSTM(inputsize, outputsize)
@@ -7111,7 +7111,7 @@ function rnntest.RecLSTM()
    lstm:zeroGradParameters()
    seqlstm:zeroGradParameters()
 
-   local gradOutput = torch.Tensor(seqlen, batchsize, outputsize)
+   local gradOutput = torch.randn(seqlen, batchsize, outputsize)
    local gradInput = lstm:backward(input, gradOutput)
 
    local gradInput2 = seqlstm:backward(input, gradOutput)
@@ -7202,7 +7202,2587 @@ function rnntest.LookupRNN()
    end
 end
 
-function rnn.test(tests, benchmark_, exclude)
+
+function rnntest.Module_sharedClone()
+
+   local function testrnn(mlp, name) -- compares a copy of mlp with a sharedClone of a further copy over two forward/backward/update rounds
+      mlp:zeroGradParameters()
+      local mlp = mlp:clone()
+      local clone = mlp:clone():sharedClone(true, true)
+
+      for i=1,2 do
+         local input = torch.randn(2,3)
+         local gradOutput = torch.randn(2,4)
+
+         local output = mlp:forward(input)
+         local gradInput = mlp:backward(input, gradOutput)
+         local output4 = clone:forward(input)
+         local gradInput4 = clone:backward(input, gradOutput)
+
+         mytester:assertTensorEq(output, output4, 0.00001, name.." updateOutput")
+         mytester:assertTensorEq(gradInput, gradInput4, 0.00001, name.." updateGradInput")
+
+         mlp:updateParameters(0.1)
+         clone:updateParameters(0.1)
+
+         local params, gradParams = mlp:parameters()
+         local params2, gradParams2 = clone:parameters()
+
+         mytester:assert(#params == #params2, name.." num params err")
+         mytester:assert(#gradParams == #gradParams2, name.." num gradParams err")
+
+         for i,param in ipairs(params) do
+            mytester:assertTensorEq(param, params2[i], 0.00001, name.." params2 err "..i)
+            mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.00001, name.." gradParams2 err "..i)
+         end
+      end
+   end
+
+   local function test(mlp, name) -- checks a clone()+share() copy and a sharedClone() copy against the modules they were made from
+      mlp:zeroGradParameters()
+      local clone = mlp:clone()
+      clone:share(mlp,"weight","bias","gradWeight","gradBias") -- this actually won't work for nn.Recurrent
+
+      local mlp2 = mlp:clone() -- not shared with mlp
+      local clone2 = mlp2:sharedClone(true, true)
+      mlp2.__test = 1 -- plain fields must stay independent between a module and its sharedClone
+      clone2.__test = 2
+      mytester:assert(mlp2.__test ~= clone2.__test)
+
+      local params, gradParams = mlp:parameters()
+      local params4, gradParams4 = clone:parameters()
+      local params2, gradParams2 = clone2:parameters()
+      local params3, gradParams3 = mlp2:parameters()
+
+      mytester:assert(#params == #params2, name.." num params err")
+      mytester:assert(#params3 == #params2, name.." num params err")
+      mytester:assert(#gradParams == #gradParams2, name.." num gradParams err")
+      mytester:assert(#gradParams == #gradParams3, name.." num gradParams err")
+
+      local input = torch.randn(2,3)
+      local gradOutput = torch.randn(2,4)
+
+      local output = mlp:forward(input)
+      local gradInput = mlp:backward(input, gradOutput)
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(param, params4[i], 0.00001, name.." params4  err "..i)
+         mytester:assertTensorEq(gradParams[i], gradParams4[i], 0.00001, name.." gradParams4 err "..i)
+      end
+
+      local output4 = clone:forward(input)
+      local gradInput4 = clone:backward(input, gradOutput)
+
+      mytester:assertTensorEq(output, output4, 0.00001, name.." updateOutput")
+      mytester:assertTensorEq(gradInput, gradInput4, 0.00001, name.." updateGradInput")
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(param, params4[i], 0.00001, name.." params4  err "..i)
+         mytester:assertTensorEq(gradParams[i], gradParams4[i], 0.00001, name.." gradParams4 err "..i)
+      end
+
+      local output2 = clone2:forward(input)
+      local gradInput2 = clone2:backward(input, gradOutput)
+
+      mytester:assertTensorEq(output, output2, 0.00001, name.." updateOutput")
+      mytester:assertTensorEq(gradInput, gradInput2, 0.00001, name.." updateGradInput")
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(params2[i], params3[i], 0.00001, name.." params 2 3  err "..i)
+         mytester:assertTensorEq(gradParams2[i], gradParams3[i], 0.00001, name.." gradParams 2 3 err "..i)
+      end
+
+      local output3 = mlp2:forward(input)
+      local gradInput3 = mlp2:backward(input, gradOutput)
+
+      mytester:assertTensorEq(output3, output2, 0.00001, name.." updateOutput")
+      mytester:assertTensorEq(gradInput3, gradInput2, 0.00001, name.." updateGradInput")
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(params2[i], params3[i], 0.00001, name.." params 2 3  err "..i)
+         mytester:assertTensorEq(gradParams2[i], gradParams3[i], 0.00001, name.." gradParams 2 3 err "..i)
+      end
+
+      mlp:updateParameters(0.1)
+      mlp2:updateParameters(0.1)
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(param, params3[i], 0.00001, name.." params3 (mlp vs mlp:clone()) err "..i) -- fail
+         mytester:assertTensorEq(gradParams[i], gradParams3[i], 0.00001, name.." gradParams3 err "..i) -- fail
+      end
+   end
+
+   test(nn.Linear(3,4), 'linear')
+
+   local mlp = nn.Sequential()
+   mlp:add(nn.Linear(3,7))
+   mlp:add(nn.Tanh())
+   mlp:add(nn.Euclidean(7,4))
+   mlp:add(nn.LogSoftMax())
+   test(mlp, 'sequential')
+
+
+   local function test2(rnn, name) -- two forward/backward/update rounds on a module and its sharedClone(), compared after each round
+      rnn:zeroGradParameters()
+      local clone = rnn:sharedClone()
+
+      local input = torch.randn(2,3)
+      local gradOutput = torch.randn(2,4)
+
+      local output = rnn:forward(input)
+      local gradInput = rnn:backward(input, gradOutput)
+      local output2 = clone:forward(input)
+      local gradInput2 = clone:backward(input, gradOutput)
+
+      mytester:assertTensorEq(output, output2, 0.00001, name.." updateOutput")
+      mytester:assertTensorEq(gradInput, gradInput2, 0.00001, name.." updateGradInput")
+
+      rnn:updateParameters(0.1)
+      clone:updateParameters(0.1)
+
+      local params, gradParams = rnn:parameters()
+      local params2, gradParams2 = clone:parameters()
+
+      mytester:assert(#params == #params2, name.." num params err")
+      mytester:assert(#gradParams == #gradParams2, name.." num gradParams err")
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(param, params2[i], 0.00001, name.." params (rnn vs rnn:sharedClone()) err "..i)
+         mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.00001, name.." gradParams (rnn vs rnn:sharedClone()) err "..i)
+      end
+
+      local output = rnn:forward(input)
+      local gradInput = rnn:backward(input, gradOutput)
+      local output2 = clone:forward(input)
+      local gradInput2 = clone:backward(input, gradOutput)
+
+      mytester:assertTensorEq(output, output2, 0.00001, name.." updateOutput")
+      mytester:assertTensorEq(gradInput, gradInput2, 0.00001, name.." updateGradInput")
+
+      rnn:updateParameters(0.1)
+      clone:updateParameters(0.1)
+
+      local params, gradParams = rnn:parameters()
+      local params2, gradParams2 = clone:parameters()
+
+      mytester:assert(#params == #params2, name.." num params err")
+      mytester:assert(#gradParams == #gradParams2, name.." num gradParams err")
+
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(param, params2[i], 0.00001, name.." params (rnn vs rnn:sharedClone()) err "..i)
+         mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.00001, name.." gradParams (rnn vs rnn:sharedClone()) err "..i)
+      end
+   end
+
+   if pcall(function() require 'rnn' end) then -- NOTE(review): these tests now ship with rnn itself, so this guard presumably always passes - confirm
+      local rnn = nn.Recurrent(4,nn.Linear(3,4),nn.Linear(4,4), nn.Sigmoid(), 999) -- this local hides the global rnn table inside the branch
+      testrnn(rnn, 'rnn1')
+      local seq = nn.Sequential()
+      seq:add(nn.Repeater(nn.Recurrent(2,nn.Linear(3,2),nn.Linear(2,2), nn.Sigmoid(), 999), 3))
+      seq:add(nn.Sequencer(nn.Linear(2,4)))
+      seq:add(nn.SelectTable(-1))
+      test2(seq, 'rnn2')
+      test2(seq, 'rnn3')
+   end
+
+   if pcall(function() require 'nngraph' end) then -- the nngraph part only runs when nngraph can be loaded
+      local lin1 = nn.Linear(10, 10)
+      local p1, gp1 = lin1:getParameters()
+
+      local lin2_ = lin1:clone()
+
+      local x = nn.Identity()()
+      local y = lin2_(x)
+
+      local lin2 = nn.gModule({x}, {y})
+
+      local lin3 = lin2:sharedClone()
+
+      local input = torch.randn(4, 10)
+      local gradOutput = torch.randn(4, 10)
+
+      lin1:zeroGradParameters()
+      lin2:zeroGradParameters()
+
+      local params1, gradParams1 = lin1:parameters()
+      local params2, gradParams2 = lin2:parameters()
+      local params3, gradParams3 = lin3:parameters()
+
+      local output1 = lin1:forward(input)
+      local gradInput1 = lin1:backward(input, gradOutput)
+      lin1:updateParameters(0.1)
+
+      local output2 = lin2:forward(input)
+      local gradInput2 = lin2:backward(input, gradOutput)
+      lin2:updateParameters(0.1)
+
+      mytester:assertTensorEq(output1, output2, 0.000001)
+      mytester:assertTensorEq(gradInput1, gradInput2, 0.000001)
+
+      for i=1,#params2 do
+         mytester:assertTensorEq(params2[i], params3[i], 0.000001, "sharedClone nngraph param err "..i)
+         mytester:assertTensorEq(gradParams2[i], gradParams3[i], 0.000001, "sharedClone nngraph gradParam err "..i)
+         mytester:assertTensorEq(params1[i], params3[i], 0.000001, "sharedClone nngraph param err "..i)
+         mytester:assertTensorEq(gradParams1[i], gradParams3[i], 0.000001, "sharedClone nngraph gradParam err "..i)
+      end
+
+      -- ok now lets forward/backward/update lin1 and lin3 to test sharedClone
+
+      local output1 = lin1:forward(input)
+      local gradInput1 = lin1:backward(input, gradOutput)
+
+      local output3 = lin3:forward(input)
+      local gradInput3 = lin3:backward(input, gradOutput)
+
+      for i=1,#params2 do
+         mytester:assertTensorEq(params2[i], params3[i], 0.000001, "sharedClone nngraph param err "..i)
+         mytester:assertTensorEq(gradParams2[i], gradParams3[i], 0.000001, "sharedClone nngraph gradParam err "..i)
+         mytester:assertTensorEq(params1[i], params3[i], 0.000001, "sharedClone nngraph param err "..i)
+         mytester:assertTensorEq(gradParams1[i], gradParams3[i], 0.000001, "sharedClone nngraph gradParam err "..i)
+      end
+
+      mytester:assertTensorEq(output1, output3, 0.000001)
+      mytester:assertTensorEq(gradInput1, gradInput3, 0.000001)
+
+      for i=1,#params2 do
+         mytester:assertTensorEq(gradParams1[i], gradParams3[i], 0.000001, "sharedClone nngraph gradParam err "..i)
+      end
+
+   end
+end
+
+function rnntest.Module_gradParamClip()
+   -- the layers are never run here; they only supply a varied set of parameters
+   local net = nn.Sequential()
+      :add(nn.Linear(10,10))
+      :add(nn.Euclidean(15,12))
+      :add(nn.SpatialConvolution(5,5,5,5))
+      :add(nn.LookupTable(100,100))
+   local _, grads = net:getParameters()
+   grads:uniform(-1,1)
+   local fullNorm = grads:norm()
+   local clipped = net:clone()
+   -- case 1 : cutoff at half the norm; the clone's gradients are expected to shrink by cutoff/norm
+   local reported = clipped:gradParamClip(fullNorm/2)
+   mytester:assert(math.abs(reported-fullNorm) < 0.000001, "Module:gradParamClip norm err "..reported.." ~= "..fullNorm)
+   grads:mul((fullNorm/2) / fullNorm)
+   local _, clippedGrads = clipped:getParameters()
+   mytester:assertTensorEq(grads, clippedGrads, 0.000001, "Module:gradParamClip clip err")
+
+   -- case 2 : cutoff at twice the norm; the clone's gradients are expected to stay as they are
+   local smallNorm = grads:norm()
+   local reported2 = clipped:gradParamClip(smallNorm*2)
+   mytester:assert(math.abs(reported2-smallNorm) < 0.000001, "Module:gradParamClip norm 2 err "..reported2.." ~= "..smallNorm)
+   mytester:assertTensorEq(grads, clippedGrads, 0.000001, "Module:gradParamClip clip 2 err")
+end
+
+function rnntest.Module_getParameters()
+   -- a shared clone stored as a plain field of its source module must still share after getParameters
+   local owner = nn.Linear(3,4)
+   local hidden = owner:sharedClone()
+   owner.sharedClone = hidden
+   local flat, gradFlat = owner:getParameters()
+   flat:add(-1)
+   gradFlat:fill(-1)
+
+   local ownerParams, ownerGrads = owner:parameters()
+   local hiddenParams, hiddenGrads = hidden:parameters()
+
+   for idx, ownerParam in ipairs(ownerParams) do
+      mytester:assertTensorEq(ownerParam, hiddenParams[idx], 0.000001, "getParameters param err "..idx)
+      mytester:assertTensorEq(ownerGrads[idx], hiddenGrads[idx], 0.000001, "getParameters gradParam err "..idx)
+   end
+end
+
+function rnntest.Serial()
+   local function test(mlp, name) -- local on purpose: a bare `function test` here would create/overwrite a global
+      local input = torch.randn(4,3)
+      local gradOutput = torch.randn(4,7)
+      local mlp2 = mlp:clone():Serial()
+
+      local output = mlp:forward(input):clone() -- reference results from the undecorated module
+      local gradInput = mlp:backward(input, gradOutput):clone()
+
+      local output2 = mlp2:forward(input)
+      local gradInput2 = mlp2:backward(input, gradOutput)
+
+      mytester:assertTensorEq(output, output2, 0.000001, name.." serial forward error")
+      mytester:assertTensorEq(gradInput, gradInput2, 0.00001, name.." serial backward error")
+
+      mlp2:mediumSerial()
+      mlp2.tensortype = 'torch.FloatTensor'
+      local mlp3 = mlp2:clone() -- the checks below expect this copy to come back with emptied float buffers
+
+      mytester:assert(mlp3.modules[1].output:nElement() == 0, name.." serial medium empty err")
+      mytester:assert(torch.type(mlp3.modules[1].output) == 'torch.FloatTensor', name.." serial medium type err")
+
+      mlp:zeroGradParameters()
+      local output = mlp:forward(input)
+      local gradInput = mlp:backward(input, gradOutput)
+
+      mlp3:zeroGradParameters()
+      local output2 = mlp3:forward(input:float())
+      local gradInput2 = mlp3:backward(input:float(), gradOutput:float())
+
+      mytester:assertTensorEq(output:float(), output2, 0.000001, name.." serial forward error")
+      mytester:assertTensorEq(gradInput:float(), gradInput2, 0.00001, name.." serial backward error")
+
+      local params, gradParams = mlp:parameters()
+      local params2, gradParams2 = mlp3:parameters()
+      mytester:assert(#params == #params2)
+      for i,param in ipairs(params) do
+         mytester:assertTensorEq(param:float(), params2[i], 0.00001, name.." params err "..i)
+         mytester:assertTensorEq(gradParams[i]:float(), gradParams2[i], 0.00001, name.." gradParams err "..i)
+      end
+   end
+
+   local mlp = nn.Sequential():extend(
+      nn.Linear(3,4),
+      nn.Tanh(),
+      nn.Linear(4,5),
+      nn.Sequential():extend(
+         nn.Linear(5,6),
+         nn.Tanh(),
+         nn.Linear(6,7)
+      )
+   )
+
+   test(mlp, 'mlp')
+
+   if pcall(function() require 'rnn' end) then
+      local seq = nn.Sequential()
+      seq:add(nn.Repeater(nn.Recurrent(2,nn.Linear(3,2),nn.Linear(2,2), nn.Sigmoid(), 999), 3))
+      seq:add(nn.Sequencer(nn.Linear(2,7)))
+      seq:add(nn.SelectTable(-1))
+      test(seq, 'rnn2')
+   end
+end
+
+function rnntest.Convert()
+   -- batch mode
+   local c = nn.Convert('bchw', 'chwb')
+   local input = torch.randn(8,3,5,5)
+   local output = c:forward(input)
+   local output2 = input:transpose(1,4):transpose(1,3):transpose(1,2) -- expected result built with plain transposes
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->chwb")
+   local gradInput = c:backward(input, output) -- feeding the output back must reproduce the input
+   mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd bchw->chwb")
+   local c = nn.Convert('bchw', 'bf')
+   local output = c:forward(input)
+   local output2 = input:view(8,-1)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->bf")
+   c:float()
+   local output = c:forward(input:float())
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type()")
+   local output = c:forward(input) -- double input into the float module
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float")
+   -- non-batch mode
+   local c = nn.Convert('chw', 'hwc')
+   local input = torch.randn(3,5,5)
+   local output = c:forward(input)
+   local output2 = input:transpose(1,3):transpose(1,2)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->hwc non-batch")
+   local gradInput = c:backward(input, output)
+   mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd chw->hwc non-batch")
+   local c = nn.Convert('chw', 'f')
+   local output = c:forward(input)
+   local output2 = input:view(-1)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->f non-batch") -- label matches the 'chw' -> 'f' conversion under test
+   c:float()
+   local output = c:forward(input:float())
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() non-batch")
+   local output = c:forward(input)
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float non-batch")
+end
+
+function rnntest.Collapse()
+   -- nn.Collapse(3): the forward output is compared with the input viewed as
+   -- (batch, -1); backward, fed the forward output, is compared with the
+   -- original input in both values and size.
+   local collapse = nn.Collapse(3)
+   local batch = torch.randn(8,3,4,5)
+   local flat = collapse:forward(batch)
+   mytester:assertTensorEq(batch:view(8,-1), flat, 0.000001, "Collapse:forward")
+   local grad = collapse:backward(batch, flat)
+   mytester:assertTensorEq(grad, batch, 0.000001, "Collapse:backward")
+   local gradDims, batchDims = grad:size():totable(), batch:size():totable()
+   mytester:assertTableEq(gradDims, batchDims, 0.000001, "Collapse:backward size")
+
+   -- same checks on a non-contiguous (transposed) view of the input
+   local batchT = batch:transpose(1,4)
+   local flatT = collapse:forward(batchT)
+   local expectedT = batchT:contiguous():view(5,-1)
+   mytester:assertTensorEq(expectedT, flatT, 0.000001, "Collapse:forward non-contiguous")
+   local gradT = collapse:backward(batchT, flatT)
+   mytester:assertTensorEq(gradT, batchT, 0.000001, "Collapse:backward non-contiguous")
+   local gradTDims, batchTDims = gradT:size():totable(), batchT:size():totable()
+   mytester:assertTableEq(gradTDims, batchTDims, 0.000001, "Collapse:backward size non-contiguous")
+end
+
+function rnntest.ZipTable()
+   -- input : { {a1,a2}, {b1,b2}, {c1,c2} }
+   -- output : { {a1,b1,c1}, {a2,b2,c2} }
+   local zipper = nn.ZipTable()
+   local nested = {}
+   for row = 1, 3 do
+      nested[row] = {torch.randn(3,4), torch.randn(3,4)}
+   end
+   -- (i, j) positions of the input that get spot-checked
+   local probes = { {1,1}, {1,2}, {3,2} }
+
+   local zipped = zipper:forward(nested)
+   mytester:assert(#zipped == 2, "ZipTable #output")
+   mytester:assert(#(zipped[1]) == 3, "ZipTable #output[1]")
+   for n = 1, #probes do
+      local i, j = probes[n][1], probes[n][2]
+      -- element (i, j) of the input must appear at (j, i) of the output
+      mytester:assertTensorEq(nested[i][j], zipped[j][i], 0.000001, "ZipTable input"..i..j)
+   end
+
+   -- the zipped output is fed back as gradOutput, so gradInput must equal the input
+   local unzipped = zipper:backward(nested, zipped)
+   mytester:assert(#unzipped == 3, "ZipTable #gradInput")
+   mytester:assert(#(unzipped[1]) == 2, "ZipTable #gradInput[1]")
+   for n = 1, #probes do
+      local i, j = probes[n][1], probes[n][2]
+      mytester:assertTensorEq(nested[i][j], unzipped[i][j], 0.000001, "ZipTable gradInput"..i..j)
+   end
+end
+
+function rnntest.ZipTableOneToMany()
+   -- input : { v, {a,b,c} }
+   -- output : { {v,a}, {v,b}, {v,c} }
+   local z = nn.ZipTableOneToMany()
+   local input = { torch.randn(3), { torch.randn(4), torch.rand(4), torch.rand(4) } }
+   local output = z:forward(input)
+   mytester:assert(#output == 3, "ZipTableOneToMany #output")
+   mytester:assert(#(output[1]) == 2, "ZipTableOneToMany #output[1]")
+   mytester:assert(#(output[2]) == 2, "ZipTableOneToMany #output[2]")
+   mytester:assert(#(output[3]) == 2, "ZipTableOneToMany #output[3]")
+   -- v must be the first element of every output pair
+   mytester:assertTensorEq(input[1], output[1][1], 0.000001, "ZipTableOneToMany input1 output11")
+   mytester:assertTensorEq(input[1], output[2][1], 0.000001, "ZipTableOneToMany input1 output21")
+   mytester:assertTensorEq(input[1], output[3][1], 0.000001, "ZipTableOneToMany input1 output31")
+   -- a, b, c must be the second element of their respective pair
+   mytester:assertTensorEq(input[2][1], output[1][2], 0.000001, "ZipTableOneToMany input21")
+   mytester:assertTensorEq(input[2][2], output[2][2], 0.000001, "ZipTableOneToMany input22")
+   mytester:assertTensorEq(input[2][3], output[3][2], 0.000001, "ZipTableOneToMany input23")
+   -- the forward output is reused as gradOutput
+   local gradInput = z:backward(input, output)
+   mytester:assert(#gradInput == 2, "ZipTableOneToMany #gradInput")
+   mytester:assert(#(gradInput[2]) == 3, "ZipTableOneToMany #gradInput[2]")
+   mytester:assertTensorEq(input[2][1], gradInput[2][1], 0.000001, "ZipTableOneToMany gradInput21")
+   mytester:assertTensorEq(input[2][2], gradInput[2][2], 0.000001, "ZipTableOneToMany gradInput22")
+   -- label fixed: this checks gradInput[2][3]
+   mytester:assertTensorEq(input[2][3], gradInput[2][3], 0.000001, "ZipTableOneToMany gradInput23")
+   -- v occurs in all three output pairs, so its gradient is expected to be 3*v;
+   -- label fixed: this checks gradInput[1]
+   mytester:assertTensorEq(torch.mul(input[1], 3), gradInput[1], 0.000001, "ZipTableOneToMany gradInput1")
+end
+
+function rnntest.CAddTensorTable()
+   -- input : { v, {a,b,c} }
+   -- output : { v+a, v+b, v+c }
+   local adder = nn.CAddTensorTable()
+   local input = { torch.randn(3), { torch.randn(3), torch.rand(3), torch.rand(3) } }
+   local shared, addends = input[1], input[2]
+
+   local sums = adder:forward(input)
+   mytester:assert(#sums == 3, "CAddTensorTable #output")
+   for i = 1, 3 do
+      local label = "CAddTensorTable input2"..i.." output"..i
+      mytester:assertTensorEq(shared + addends[i], sums[i], 0.00001, label)
+   end
+
+   -- the forward output is reused as gradOutput
+   local grads = adder:backward(input, sums)
+   mytester:assert(#grads == 2, "CAddTensorTable #gradInput")
+   mytester:assert(#(grads[2]) == 3, "CAddTensorTable #gradInput[2]")
+   for i = 1, 3 do
+      -- each addend receives its own gradOutput unchanged
+      mytester:assertTensorEq(sums[i], grads[2][i], 0.000001, "CAddTensorTable gradInput2"..i)
+   end
+   -- the shared tensor v receives the sum of all gradOutputs
+   local total = sums[1] + sums[2] + sums[3]
+   mytester:assertTensorEq(total, grads[1], 0.000001, "CAddTensorTable gradInput1")
+end
+
+function rnntest.ReverseTable()
+   -- input : { a, b, c, d }
+   -- output : { d, c, b, a }
+   local reverser = nn.ReverseTable()
+   local input = {torch.randn(3,4), torch.randn(3,4), torch.randn(3,4), torch.randn(3,4)}
+   local reversed = reverser:forward(input)
+
+   mytester:assert(#reversed == 4, "ReverseTable #output")
+   local count = #input
+   for k = 1, count do
+      -- output position k must hold the input element counted from the end
+      local src = count - k + 1
+      mytester:assertTensorEq(input[src], reversed[k], 0.00001, "ReverseTable output err "..k)
+   end
+
+   -- reversing the (already reversed) output in backward must give back the input order
+   local restored = reverser:backward(input, reversed)
+   mytester:assert(#restored == 4, "ReverseTable #gradInput")
+   for i = 1, count do
+      mytester:assertTensorEq(restored[i], input[i], 0.00001, "ReverseTable gradInput err "..i)
+   end
+end
+
+function rnntest.Inception()
+   -- Smoke test for nn.Inception: output/gradInput sizes and finiteness of
+   -- parameters after init, after an update step and after maxParamNorm.
+   local inSize = {8,3,32,32}
+   -- output channels : 16 + 24 + 8 + 12
+   local outSize = {8,16+24+8+12,32,32}
+   local input = torch.rand(unpack(inSize))
+   local gradOutput = torch.randn(unpack(outSize))
+   local incep = nn.Inception{inputSize=3, outputSize={16,24}, reduceSize={14,16,8,12}}
+
+   -- asserts that the sum of every parameter tensor is finite
+   -- (the loop index is not named `_`, which is the global utility table used here)
+   local function assertFiniteParams(errmsg)
+      for idx, param in ipairs(incep:parameters()) do
+         mytester:assert(_.isFinite(param:sum()), errmsg)
+      end
+   end
+
+   assertFiniteParams('inception init error')
+
+   local output = incep:forward(input)
+   mytester:assertTableEq(output:size():totable(), outSize, 0.00001)
+   mytester:assert(_.isFinite(output:sum()))
+
+   incep:zeroGradParameters()
+   local gradInput = incep:backward(input, gradOutput)
+   mytester:assertTableEq(gradInput:size():totable(), inSize, 0.00001)
+   mytester:assert(_.isFinite(gradInput:sum()))
+
+   incep:updateParameters(0.1)
+   assertFiniteParams('inception update error')
+
+   incep:maxParamNorm(1)
+   assertFiniteParams('inception maxNorm error')
+end
+
+function rnntest.SpatialUniformCrop()
+   -- needs the nnx package
+   local hasNnx = pcall(function() require "nnx" end)
+   if not hasNnx then
+      return
+   end
+
+   -- builds an 8x3xHxW batch in which sample i is filled with the value i
+   local function constantBatch(height, width)
+      local ids = torch.range(1,8):view(8,1,1,1):expand(8,3,height,width)
+      return torch.Tensor(8,3,height,width):copy(ids)
+   end
+
+   local input = constantBatch(10, 10)
+   local gradOutput = constantBatch(4, 4)
+   local cropper = nn.SpatialUniformCrop(4)
+   local output, gradInput
+   for iter = 1, 100 do
+      output = cropper:forward(input)
+      gradInput = cropper:backward(input, gradOutput)
+   end
+   for i = 1, 8 do
+      -- any crop of a constant image keeps that constant
+      local outDiff = math.abs(output[i]:mean() - i)
+      mytester:assert(outDiff < 0.0001, "SpatialUniformCrop output err "..i)
+      -- expected mean is i*4*4/(10*10); presumably the 4x4 gradOutput is
+      -- placed into an otherwise zero 10x10 gradInput
+      local gradDiff = math.abs(gradInput[i]:mean() - ((i*4*4)/(10*10)))
+      mytester:assert(gradDiff < 0.0001, "SpatialUniformCrop gradInput err"..i)
+   end
+
+   -- scaled crop : 120x120 image of zeros with a 60x60 block of ones
+   local image = torch.zeros(1, 1, 120, 120)
+   image[1]:narrow(2, 30, 60):narrow(3, 30, 60):fill(1)
+   local scale = {min = 0.8, max = 1.2}
+
+   local scaledCropper = nn.SpatialUniformCrop(100, 100, scale)
+   local cropped = scaledCropper:forward(image)
+   local scaledGrad = scaledCropper:backward(image, cropped)
+   mytester:assert(scaledGrad:max() ~= nil, "SpatialUniformCrop scaling error.")
+end
+
+function rnntest.DontCast()
+   -- Checks nn.DontCast: the wrapped module keeps its own tensor type when the
+   -- decorator is cast, while (with castin enabled) the decorator accepts and
+   -- returns tensors of the type it was cast to. Tensor inputs are tested
+   -- first, then table inputs.
+   local input = torch.randn(3,4)
+   local gradOutput = torch.randn(3,2)
+   local linear = nn.Linear(4,2):float()
+   -- mlp wraps the float module but is fed double tensors
+   local mlp = nn.DontCast(linear, true)
+   linear:zeroGradParameters()
+   -- independent float copy used to compute reference results
+   local linear = linear:clone()
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assert(torch.type(output) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput) == 'torch.DoubleTensor')
+   local output2 = linear:forward(input:float())
+   local gradInput2 = linear:backward(input:float(), gradOutput:float())
+   mytester:assertTensorEq(output:float(), output2, 0.000001)
+   mytester:assertTensorEq(gradInput:float(), gradInput2, 0.000001)
+   -- without castin, float in gives float out
+   local mlp3 = nn.DontCast(linear:clone())
+   mlp3:zeroGradParameters()
+   local output3 = mlp3:forward(input:float())
+   local gradInput3 = mlp3:backward(input:float(), gradOutput:float())
+   mytester:assert(torch.type(output3) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput3) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3, output2, 0.000001)
+   mytester:assertTensorEq(gradInput3, gradInput2, 0.000001)
+   mlp:float()
+   local output4 = mlp:forward(input:float())
+   local gradInput4 = mlp:backward(input:float(), gradOutput:float())
+   mytester:assert(torch.type(output4) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3, output4, 0.000001)
+   mytester:assertTensorEq(gradInput3, gradInput4, 0.000001)
+   mlp:double()
+   mytester:assert(torch.type(linear.output) == 'torch.FloatTensor')
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   -- check the freshly computed results (previously the stale output4 and
+   -- gradInput4 were re-checked, leaving these results untested for type)
+   mytester:assert(torch.type(output) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput) == 'torch.DoubleTensor')
+   mytester:assertTensorEq(output3, output:float(), 0.000001)
+   mytester:assertTensorEq(gradInput3, gradInput:float(), 0.000001)
+
+   -- test table inputs/outputs
+   local input = {torch.randn(3,4), torch.randn(3,4)}
+   local gradOutput = {torch.randn(3,2), torch.randn(3,2)}
+   local linear = nn.ParallelTable():add(nn.Linear(4,2)):add(nn.Linear(4,2)):float()
+   local mlp = nn.DontCast(linear, true)
+   linear:zeroGradParameters()
+   local linear = linear:clone()
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assert(torch.type(output[1]) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput[1]) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(output[2]) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput[2]) == 'torch.DoubleTensor')
+   -- float copies of every table, used with the reference module
+   local finput = _.map(input, function(k,v) return v:float() end)
+   local foutput = _.map(output, function(k,v) return v:float() end)
+   local fgradInput = _.map(gradInput, function(k,v) return v:float() end)
+   local fgradOutput = _.map(gradOutput, function(k,v) return v:float() end)
+   local output2 = linear:forward(finput)
+   local gradInput2 = linear:backward(finput, fgradOutput)
+   mytester:assertTensorEq(foutput[1], output2[1], 0.000001)
+   mytester:assertTensorEq(foutput[2], output2[2], 0.000001)
+   mytester:assertTensorEq(fgradInput[1], gradInput2[1], 0.000001)
+   mytester:assertTensorEq(fgradInput[2], gradInput2[2], 0.000001)
+   local mlp3 = nn.DontCast(linear:clone())
+   mlp3:zeroGradParameters()
+   local output3 = mlp3:forward(finput)
+   local gradInput3 = mlp3:backward(finput, fgradOutput)
+   mytester:assert(torch.type(output3[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput3[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(output3[2]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput3[2]) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3[1], output2[1], 0.000001)
+   mytester:assertTensorEq(gradInput3[1], gradInput2[1], 0.000001)
+   mytester:assertTensorEq(output3[2], output2[2], 0.000001)
+   mytester:assertTensorEq(gradInput3[2], gradInput2[2], 0.000001)
+   mlp:float()
+   local output4 = mlp:forward(finput)
+   local gradInput4 = mlp:backward(finput, fgradOutput)
+   mytester:assert(torch.type(output4[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(output4[2]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4[2]) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3[1], output4[1], 0.000001)
+   mytester:assertTensorEq(gradInput3[1], gradInput4[1], 0.000001)
+   mytester:assertTensorEq(output3[2], output4[2], 0.000001)
+   mytester:assertTensorEq(gradInput3[2], gradInput4[2], 0.000001)
+   mlp:double()
+   -- the reference module must be unaffected by casting the decorator
+   mytester:assert(torch.type(linear.output) == 'table')
+   mytester:assert(torch.type(linear.output[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(linear.output[2]) == 'torch.FloatTensor')
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assertTensorEq(output3[1], output[1]:float(), 0.000001)
+   mytester:assertTensorEq(gradInput3[1], gradInput[1]:float(), 0.000001)
+end
+
+function rnntest.ModuleCriterion()
+   -- Compares nn.ModuleCriterion(criterion, inputModule) against applying the
+   -- input module and the criterion by hand.
+   local input = torch.randn(8,4)
+   local target = torch.randn(8,4)
+   local inputModule = nn.Tanh()
+   local criterion = nn.MSECriterion()
+   local mc = nn.ModuleCriterion(criterion, inputModule)
+
+   local err = mc:forward(input, target)
+   local gradInput = mc:backward(input, target)
+
+   -- reference : module forward -> criterion forward/backward -> module backward
+   local output = inputModule:forward(input)
+   local err2 = criterion:forward(output, target)
+   local gradOutput = criterion:backward(output, target)
+   local gradInput2 = inputModule:backward(input, gradOutput)
+
+   -- label fixed: this assertion compares the forward error values
+   mytester:assert(err == err2, "ModuleCriterion forward err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "ModuleCriterion backward err")
+end
+
+function rnntest.ReinforceNormal()
+   -- Tests nn.ReinforceNormal with a scalar stdev given to the constructor,
+   -- then with a {mean, stdev} table input. The reference gradients below are
+   -- built with in-place tensor operations, so statement order matters.
+   local input = torch.randn(500,1000) -- means
+   local gradOutput = torch.Tensor() -- will be ignored
+   local reward = torch.randn(500)
+   -- test scalar stdev
+   local stdev = 1
+   local rn = nn.ReinforceNormal(stdev)
+   local output = rn:forward(input)
+   mytester:assert(input:isSameSizeAs(output), "ReinforceNormal forward size err")
+   -- root-mean-square deviation of the samples from their means, compared with stdev
+   local outstd = math.sqrt((input - output):pow(2):mean())
+   local err = math.abs(outstd - stdev)
+   mytester:assert(err < 0.1, "ReinforceNormal forward std err")
+   rn:reinforce(reward)
+   local gradInput = rn:updateGradInput(input, gradOutput)
+   -- reference : -reward * (output - mean) / stdev^2
+   local gradInput2 = output:clone()
+   gradInput2:add(-1, input):div(stdev^2)
+   -- one reward per row, expanded over the columns
+   local reward2 = reward:view(500,1):expandAs(input)
+   gradInput2:cmul(reward2):mul(-1)
+   mytester:assertTensorEq(gradInput2, gradInput, 0.00001, "ReinforceNormal backward err")
+   -- test input {mean, stdev}
+   local mean, stdev = torch.randn(4,10), torch.rand(4,10)
+   local input = {mean, stdev}
+   local rn = nn.ReinforceNormal()
+   local output = rn:updateOutput(input)
+   local reward = torch.randn(4)
+   rn:reinforce(reward)
+   local gradInput = rn:backward(input, gradOutput)
+   mytester:assert(mean:isSameSizeAs(output), "ReinforceNormal forward table input - output size err")
+   mytester:assert(gradInput[1]:isSameSizeAs(mean), "ReinforceNormal backward table input - mean size err")
+   mytester:assert(gradInput[2]:isSameSizeAs(stdev), "ReinforceNormal backward table input - stdev size err")
+   -- reference : -reward * ((output - mean)^2 - stdev^2) / (stdev^3 + 1e-8)
+   local gradStdev = output:clone():add(-1, mean):pow(2)
+   local stdev2 = torch.cmul(stdev,stdev)
+   gradStdev:add(-1,stdev2)
+   -- stdev2 is turned into stdev^3 (plus a small constant) in place, after
+   -- stdev^2 has been subtracted above
+   stdev2:cmul(stdev):add(0.00000001)
+   gradStdev:cdiv(stdev2)
+   local reward2 = reward:view(4,1):expandAs(gradStdev)
+   gradStdev:cmul(reward2):mul(-1)
+   mytester:assertTensorEq(gradInput[2], gradStdev, 0.000001, "ReinforceNormal backward table input - gradStdev err")
+end
+
+function rnntest.ReinforceGamma()
+   -- Tests nn.ReinforceGamma with a scalar scale given to the constructor,
+   -- then with a {shape, scale} table input. Skipped when the randomkit or
+   -- cephes packages cannot be loaded.
+   if not pcall(function() require 'randomkit'; require 'cephes' end) then
+      return
+   end
+   local input = torch.rand(500,1000):fill(250) -- shapes
+   local gradOutput = torch.Tensor() -- will be ignored
+   local reward = torch.randn(500)
+   -- test scalar scale
+   local scale = 2
+   local rn = nn.ReinforceGamma(scale)
+   local output = rn:forward(input)
+   mytester:assert(input:isSameSizeAs(output), "ReinforceGamma forward size err")
+   local outmean = torch.mean(output)
+   -- expected value of distribution is shape*scale
+   local err = math.abs(outmean - torch.mean(torch.mul(input,scale)))
+   mytester:assert(err < 0.1, "ReinforceGamma forward mean err")
+   rn:reinforce(reward)
+   local gradInput = rn:updateGradInput(input, gradOutput)
+   -- reference : -reward * (log(output) - digamma(shape) - log(scale))
+   local gradInput2 = torch.log(output:clone())
+   gradInput2:add(-1, cephes.digamma(input))
+   gradInput2:add(-1*torch.log(scale) )
+   -- one reward per row, expanded over the columns
+   local reward2 = reward:view(500,1):expandAs(input)
+   gradInput2:cmul(reward2):mul(-1)
+   mytester:assertTensorEq(gradInput2, gradInput, 0.00001, "ReinforceGamma backward err")
+   -- test input {shape, scale}
+   local shape, scale = torch.rand(4,10), torch.rand(4,10)
+   local input = {shape, scale}
+   local rn = nn.ReinforceGamma()
+   local output = rn:updateOutput(input)
+   local reward = torch.randn(4)
+   rn:reinforce(reward)
+   local gradInput = rn:backward(input, gradOutput)
+   mytester:assert(shape:isSameSizeAs(output), "ReinforceGamma forward table input - output size err")
+   -- labels fixed: the table input holds shape and scale, not mean and stdev
+   mytester:assert(gradInput[1]:isSameSizeAs(shape), "ReinforceGamma backward table input - shape size err")
+   mytester:assert(gradInput[2]:isSameSizeAs(scale), "ReinforceGamma backward table input - scale size err")
+   -- reference : -reward * (output / scale^2 - shape / scale)
+   local gradScale = torch.cdiv(output:clone(), torch.pow(scale,2) )
+   gradScale:add( -1, torch.cdiv( shape, scale) )
+   local reward2 = reward:view(4,1):expandAs(gradScale)
+   gradScale:cmul(reward2):mul(-1)
+   mytester:assertTensorEq(gradInput[2], gradScale, 0.000001, "ReinforceGamma backward table input - gradScale err")
+end
+
+function rnntest.ReinforceBernoulli()
+   -- every row of the input holds the same 10 probabilities of sampling a 1
+   local input = torch.Tensor(1000,10)
+   local prob = torch.rand(1,10)
+   input:copy(prob:expandAs(input))
+   local gradOutput = torch.Tensor() -- will be ignored
+   local reward = torch.randn(1000)
+   local sampler = nn.ReinforceBernoulli()
+   local sample = sampler:forward(input)
+   mytester:assert(input:isSameSizeAs(sample), "ReinforceBernoulli forward size err")
+   mytester:assert(sample:min() == 0, "ReinforceBernoulli forward min val err")
+   mytester:assert(sample:max() == 1, "ReinforceBernoulli forward max val err")
+   -- every sampled element must be exactly 0 or 1
+   local allBinary = true
+   sample:apply(function(x)
+      if x ~= 1 and x ~= 0 then
+         allBinary = false
+      end
+   end)
+   mytester:assert(allBinary, "ReinforceBernoulli forward binary val err")
+   -- per-column frequency of ones versus the requested probabilities
+   local freq = sample:mean(1)
+   local freqErr = (prob - freq):abs():mean()
+   mytester:assert(freqErr < 0.05, "ReinforceBernoulli forward p err")
+
+   sampler:reinforce(reward)
+   local gradInput = sampler:updateGradInput(input, gradOutput)
+   -- reference : -reward * (sample - p) / ((1 - p) * p)
+   local expectedGrad = sample:clone()
+   local denom = sample:clone():fill(1)
+   denom:add(-1, input):cmul(input)
+   expectedGrad:add(-1, input):cdiv(denom)
+   -- one reward per row, expanded over the columns
+   local rewardPerElem = reward:view(1000,1):expandAs(input)
+   expectedGrad:cmul(rewardPerElem):mul(-1)
+   mytester:assertTensorEq(expectedGrad, gradInput, 0.00001, "ReinforceBernoulli backward err")
+end
+
+function rnntest.ReinforceCategorical()
+   -- every row of the input holds the same 10 probabilities, normalized to sum to 1
+   local input = torch.Tensor(1000,10)
+   local prob = torch.rand(1,10)
+   prob:div(prob:sum())
+   input:copy(prob:expandAs(input))
+   local gradOutput = torch.Tensor() -- will be ignored
+   local reward = torch.randn(1000)
+   local sampler = nn.ReinforceCategorical()
+   local sample = sampler:forward(input)
+   mytester:assert(input:isSameSizeAs(sample), "ReinforceCategorical forward size err")
+   mytester:assert(sample:min() == 0, "ReinforceCategorical forward min val err")
+   mytester:assert(sample:max() == 1, "ReinforceCategorical forward max val err")
+   -- 1000 rows and a total of 1000 over 0/1 entries
+   mytester:assert(sample:sum() == 1000, "ReinforceCategorical forward sum err")
+   -- every sampled element must be exactly 0 or 1
+   local allBinary = true
+   sample:apply(function(x)
+      if x ~= 1 and x ~= 0 then
+         allBinary = false
+      end
+   end)
+   mytester:assert(allBinary, "ReinforceCategorical forward binary val err")
+   -- per-column frequency of ones versus the requested probabilities
+   local freq = sample:mean(1)
+   local freqErr = (prob - freq):abs():mean()
+   mytester:assert(freqErr < 0.05, "ReinforceCategorical forward p err")
+
+   sampler:reinforce(reward)
+   local gradInput = sampler:updateGradInput(input, gradOutput)
+   -- reference : -reward * sample / (p + 1e-8)
+   local expectedGrad = sample:clone()
+   local safeProb = input+0.00000001
+   expectedGrad:cdiv(safeProb)
+   -- one reward per row, expanded over the columns
+   local rewardPerElem = reward:view(1000,1):expandAs(input)
+   expectedGrad:cmul(rewardPerElem):mul(-1)
+   mytester:assertTensorEq(expectedGrad, gradInput, 0.00001, "ReinforceCategorical backward err")
+end
+
+function rnntest.VRClassReward()
+   -- input is { class scores, baseline }, both float tensors
+   local scores = torch.randn(13,10):float()
+   local baseline = torch.randn(13,1):float()
+   local input = {scores, baseline}
+   local target = torch.IntTensor(13):random(1,10)
+   local rf = nn.Reinforce():float()
+   local vrc = nn.VRClassReward(rf):float()
+   local err = vrc:forward(input, target)
+   local gradInput = vrc:backward(input, target)
+
+   -- reference reward : 1 where the arg-max class equals the target, 0 elsewhere
+   local argmax = select(2, scores:max(2))
+   local predicted = argmax:select(2,1):int()
+   local reward = torch.eq(predicted, target):float()
+   local expectedErr = -reward:mean()
+   mytester:assert(err == expectedErr, "VRClassReward forward err")
+   -- the baseline gradient is compared with an MSE gradient towards the reward
+   local mse = nn.MSECriterion():float()
+   local expectedBaselineGrad = mse:backward(baseline, reward)
+   mytester:assertTensorEq(gradInput[2], expectedBaselineGrad, 0.000001, "VRClassReward backward baseline err")
+   local classGradSum = math.abs(gradInput[1]:sum())
+   mytester:assert(classGradSum < 0.000001, "VRClassReward backward class err")
+
+   -- optional comparison against the CUDA implementation
+   local hasCunn = pcall(function() require 'cunn' end)
+   if hasCunn then
+      -- keep references to the CPU gradients before the criterion is moved
+      local cpuGrad = {gradInput[1], gradInput[2]}
+      input[1], input[2] = input[1]:cuda(), input[2]:cuda()
+      target = target:cuda()
+      rf:cuda()
+      vrc:cuda()
+
+      local cudaErr = vrc:forward(input, target)
+      local cudaGrad = vrc:backward(input, target)
+
+      mytester:assert(math.abs(err - cudaErr) < 0.000001, "VRClassReward forward cuda err")
+      mytester:assertTensorEq(cpuGrad[2], cudaGrad[2]:float(), 0.000001, "VRClassReward backward baseline cuda err")
+      mytester:assertTensorEq(cpuGrad[1], cudaGrad[1]:float(), 0.000001, "VRClassReward backward class cuda err")
+   end
+end
+
+function rnntest.BinaryClassReward()
+   -- input is { values drawn uniformly from [0,1], baseline }
+   local input = {torch.Tensor(10), torch.randn(10,1)}
+   input[1]:uniform(0,1)
+   local target = torch.LongTensor(10):random(0,1)
+   local rf = nn.Reinforce()
+   local bcr = nn.BinaryClassReward(rf)
+   local err = bcr:forward(input, target)
+   local gradInput = bcr:backward(input, target)
+
+   -- reference reward : 1 where (value > 0.5) agrees with the target, 0 elsewhere
+   local predicted = input[1].new():gt(input[1], 0.5)
+   local reward = torch.eq(predicted:long(), target):double()
+   local expectedErr = -reward:mean()
+   mytester:assert(err == expectedErr, "BinaryClassReward forward err")
+   -- the baseline gradient is compared with an MSE gradient towards the reward
+   local expectedBaselineGrad = nn.MSECriterion():backward(input[2], reward)
+   mytester:assertTensorEq(gradInput[2], expectedBaselineGrad, 0.000001, "BinaryClassReward backward baseline err")
+   local zeros = torch.zeros(input[1]:size())
+   mytester:assertTensorEq(gradInput[1], zeros, 0.000001, "BinaryClassReward backward class err")
+
+   -- test against VRClassReward : encode each binary decision as a one-hot
+   -- row over two classes and shift the targets from {0,1} to {1,2}
+   local oneHot = torch.Tensor(10,2):zero()
+   local input2 = {oneHot, input[2]}
+   local target2 = torch.add(target, 1)
+   for i = 1, 10 do
+      local class
+      if input[1][i] > 0.5 then
+         class = 2
+      else
+         class = 1
+      end
+      input2[1][i][class] = 1
+   end
+   local rf2 = nn.Reinforce()
+   local vrc = nn.VRClassReward(rf2)
+   local vrcErr = vrc:forward(input2, target2)
+   mytester:assert(math.abs(err - vrcErr) < 0.0000001)
+   local vrcGrad = vrc:backward(input2, target2)
+   mytester:assertTensorEq(gradInput[2], vrcGrad[2], 0.0000001)
+   mytester:assertTensorEq(rf2.reward, rf.reward, 0.0000001)
+end
+
+function rnntest.Clip()
+   -- nn.Clip forward is compared with a manually clamped copy of the input;
+   -- backward is compared with the unchanged gradOutput.
+   local input = torch.randn(200,300)
+   local gradOutput = torch.randn(200,300)
+   local lower, upper = -0.05, 0.1
+   local clipper = nn.Clip(lower, upper)
+   local clipped = clipper:forward(input)
+
+   -- reference : overwrite everything above the maximum, then everything
+   -- below the minimum, using byte masks as indices
+   local expected = input:clone()
+   local tooHigh = input.new():gt(input, upper)
+   expected[tooHigh:type("torch.ByteTensor")] = upper
+   local tooLow = input.new():lt(input, lower)
+   expected[tooLow:type("torch.ByteTensor")] = lower
+   mytester:assertTensorEq(clipped, expected, 0.00001, "Clip forward err")
+
+   local gradInput = clipper:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput, gradOutput, 0.00001, "Clip backward err")
+end
+
+function rnntest.Constant()
+   -- nn.Constant(value, 2) fed a batch of 20 samples: the output is compared
+   -- with the value repeated 20 times, the gradInput with an all-zero input.
+   local input = torch.randn(20,3,7)
+   local gradOutput = torch.randn(20,30,6) -- never passed to the module
+   local value = torch.randn(30,6)
+   local constant = nn.Constant(value:clone(), 2)
+   local output = constant:forward(input)
+   -- the forward output is reused as gradOutput
+   local gradInput = constant:backward(input, output)
+   local expected = value:view(1,30,6):expand(20,30,6)
+   mytester:assertTensorEq(expected, output, 0.000001, "Constant forward err")
+   -- input:zero() clears the input in place to build the zero reference
+   local zeroed = input:zero()
+   mytester:assertTensorEq(gradInput, zeroed, 0.000001, "Constant backward err")
+end
+
+function rnntest.SpatialGlimpse()
+   if not pcall(function() require "image" end) then return end -- needs the image package
+   if not pcall(function() require "nnx" end) then return end -- needs the nnx package
+   local batchSize = 1
+   local inputSize = {2,8,8}
+   local glimpseSize = 4
+   local input = torch.Tensor(batchSize, unpack(inputSize))
+   input:range(1,input:nElement())
+   input:resize(batchSize, unpack(inputSize))
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse center 4 output depth=1 err")
+   local outputSize = {batchSize, inputSize[1]*3, glimpseSize, glimpseSize}
+   mytester:assertTableEq(output:size():totable(), outputSize, 0.000001, "SpatialGlimpse output size err")
+
+   local input2 = torch.Tensor(unpack(inputSize))
+   input2:range(1,input2:nElement())
+   input2:resize(unpack(inputSize))
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location2 = torch.Tensor(2):fill(0) -- center patch
+   local output2 = sg:forward{input2,location2}
+   mytester:assertTensorEq(output2, output[1], 0.00001, "SpatialGlimpse online output depth=1 err")
+
+   local glimpseSize = 5
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,2,glimpseSize):narrow(4,2,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse center 5 output depth=1 err")
+
+   local glimpseSize = 4
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(-1) -- top left corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize, glimpseSize)
+   local padSize = math.floor((glimpseSize-1)/2)
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+padSize*2, inputSize[3]+padSize*2):zero()
+   pad:narrow(3, padSize + 1, inputSize[2]):narrow(4, padSize + 1, inputSize[3]):copy(input)
+   local output2 = pad:narrow(3,1,glimpseSize):narrow(4,1,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse top-left 4 output depth=1 err")
+
+   local glimpseSize = 5
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(-1) -- top left corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize, glimpseSize)
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+glimpseSize, inputSize[3]+glimpseSize):zero()
+   pad:narrow(3, (glimpseSize-1)/2 + 1, inputSize[2]):narrow(4, (glimpseSize-1)/2 + 1, inputSize[3]):copy(input)
+   local output2 = pad:narrow(3,1,glimpseSize):narrow(4,1,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse top-left 5 output depth=1 err")
+
+   local glimpseSize = 4
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(1) -- bottom-right corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize, glimpseSize)
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+glimpseSize, inputSize[3]+glimpseSize):zero()
+   pad:narrow(3, math.floor((glimpseSize-1)/2 + 1), inputSize[2]):narrow(4, math.floor((glimpseSize-1)/2 + 1), inputSize[3]):copy(input)
+   local output2 = pad:narrow(3,inputSize[2]-1,glimpseSize):narrow(4,inputSize[3]-1,glimpseSize)
+   --print('bottom-right', output2, output_:select(2, 1))
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse bottom-right 4 output depth=1 err")
+
+   local glimpseSize = 5
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(1) -- bottom-right corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize, glimpseSize)
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+glimpseSize, inputSize[3]+glimpseSize):zero()
+   pad:narrow(3, (glimpseSize-1)/2, inputSize[2]):narrow(4, (glimpseSize-1)/2, inputSize[3]):copy(input)
+   local output2 = pad:narrow(3,inputSize[2]-1,glimpseSize):narrow(4,inputSize[3]-1,glimpseSize)
+   --print('bottom-right', output2, output_:select(2, 1))
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse bottom-right 5 output depth=1 err")
+
+   local glimpseSize = 4
+   local sg = nn.SpatialGlimpse(glimpseSize, 1)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 1, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse center 4 output depth=1 err")
+   local gradInput = sg:backward({input,location}, output)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize):copy(output_:select(2,1))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse backward 4 depth 1 error")
+
+   -- test with spatial resampling
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   sg.module = nn.SpatialReSampling{owidth=glimpseSize,oheight=glimpseSize}
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   gradOutput:view(batchSize, 2, 2, glimpseSize, glimpseSize):select(2,1):fill(0) -- ignore first scale of glimpse
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local srs = nn.SpatialReSampling{oheight=glimpseSize*2,owidth=glimpseSize*2}
+   local gradInput2 = srs:updateGradInput(gradInput[1], output_:select(2,2))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse backward 4 depth 2 error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   sg.module = nn.SpatialReSampling{owidth=glimpseSize,oheight=glimpseSize}
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize):copy(output_:select(2,1))
+   gradInput2:add(srs:updateGradInput(gradInput[1], output_:select(2,2)))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse backward 4 depth 2 full error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   sg.module = nn.SpatialReSampling{owidth=glimpseSize,oheight=glimpseSize}
+   local output2 = sg:forward{input[1], location[1]}
+   local gradInput2 = sg:backward({input[1], location[1]}, gradOutput[1])
+   mytester:assertTensorEq(gradInput[1][1], gradInput2[1], 0.000001, "SpatialGlimpse backward online img err")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2], 0.000001, "SpatialGlimpse backward online loc err")
+
+   -- test with spatial avg pool
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   gradOutput:view(batchSize, 2, 2, glimpseSize, glimpseSize):select(2,1):fill(0) -- ignore first scale of glimpse
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local srs = nn.SpatialAveragePooling(2,2,2,2)
+   local gradInput2 = srs:updateGradInput(gradInput[1], output_:select(2,2))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse avgpool backward 4 depth 2 error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize):copy(output_:select(2,1))
+   gradInput2:add(srs:updateGradInput(gradInput[1], output_:select(2,2)))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse avgpool backward 4 depth 2 full error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   local output2 = sg:forward{input[1], location[1]}
+   local gradInput2 = sg:backward({input[1], location[1]}, gradOutput[1])
+   mytester:assertTensorEq(gradInput[1][1], gradInput2[1], 0.000001, "SpatialGlimpse avgpool backward online img err")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2], 0.000001, "SpatialGlimpse avgpool backward online loc err")
+
+   -- test avg pool with cuda
+   if not pcall(function() require "cunn" end) then return end -- needs the cunn package
+   local input = input:cuda()
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2):cuda()
+   local location = torch.CudaTensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   gradOutput:view(batchSize, 2, 2, glimpseSize, glimpseSize):select(2,1):fill(0) -- ignore first scale of glimpse
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local srs = nn.SpatialAveragePooling(2,2,2,2):cuda()
+   local gradInput2 = srs:updateGradInput(gradInput[1], output_:select(2,2))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse avgpool backward 4 depth 2 error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2):cuda()
+   local location = torch.CudaTensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize, glimpseSize)
+   local output2 = input:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize)
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpse avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,3,glimpseSize):narrow(4,3,glimpseSize):copy(output_:select(2,1))
+   gradInput2:add(srs:updateGradInput(gradInput[1], output_:select(2,2)))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpse avgpool backward 4 depth 2 full error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2):cuda()
+   local output2 = sg:forward{input[1], location[1]}
+   local gradInput2 = sg:backward({input[1], location[1]}, gradOutput[1])
+   mytester:assertTensorEq(gradInput[1][1], gradInput2[1], 0.000001, "SpatialGlimpse avgpool backward online img err")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2], 0.000001, "SpatialGlimpse avgpool backward online loc err")
+
+   if false then
+      -- benchmark GPU vs CPU
+      local location = torch.FloatTensor(32,2):uniform(-1,1)
+      local input = torch.FloatTensor(32,3,224,224):uniform(0,1)
+      local gradOutput = torch.FloatTensor(32,9,32,32):uniform(0,1)
+      local sg = nn.SpatialGlimpse(32, 3, 2):float()
+      sg:forward{input,location}
+      local a = torch.Timer()
+      for i=1,5 do
+         sg:forward{input,location}
+      end
+      local fwdCPUtime = a:time().real
+
+      sg:cuda()
+      location = location:cuda()
+      input = input:cuda()
+      gradOutput = gradOutput:cuda()
+      sg:forward{input,location}
+      a = torch.Timer()
+      for i=1,5 do
+         sg:forward{input,location}
+      end
+      local fwdGPUtime = a:time().real
+      print(fwdGPUtime, fwdCPUtime, fwdCPUtime/fwdGPUtime)
+      -- 0.13885092735291  2.0344181060791  14.651815042678
+   end
+end
+
+function rnntest.SpatialGlimpse_backwardcompat()
+   if not pcall(function() require "nnx" end) then return end -- needs the nnx package
+   -- Regression test against a frozen legacy implementation (nn.SG, defined below) that is known to work.
+   -- This is ugly, but it lets us check that the newer nn.SpatialGlimpse still produces the same forward output.
+   local SG, parent = torch.class("nn.SG", "nn.Module")
+
+   function SG:__init(size, depth, scale)
+      self.size = size -- side length of the square glimpse (height == width)
+      self.depth = depth or 3 -- number of glimpse scales extracted per sample
+      self.scale = scale or 2 -- window growth factor between consecutive depths
+
+      assert(torch.type(self.size) == 'number')
+      assert(torch.type(self.depth) == 'number')
+      assert(torch.type(self.scale) == 'number')
+      parent.__init(self)
+      self.gradInput = {torch.Tensor(), torch.Tensor()} -- {gradient w.r.t. image, gradient w.r.t. location}
+      if self.scale == 2 then -- scale 2 downsamples by average pooling, any other scale by nnx resampling
+         self.module = nn.SpatialAveragePooling(2,2,2,2)
+      else
+         self.module = nn.SpatialReSampling{oheight=size,owidth=size}
+      end
+      self.modules = {self.module} -- the forward/backward code reaches the downscaler as self.modules[1]
+   end
+
+   -- Forward pass of a bandwidth limited sensor which focuses on a location: inputTable = {input, location}.
+   -- Locations index the x,y coord of the center of the output glimpse; returns self.output.
+   function SG:updateOutput(inputTable)
+      assert(torch.type(inputTable) == 'table')
+      assert(#inputTable >= 2)
+      local input, location = unpack(inputTable)
+      input, location = self:toBatch(input, 3), self:toBatch(location, 1) -- presumably adds a batch dimension when missing (toBatch is defined elsewhere; TODO confirm)
+      assert(input:dim() == 4 and location:dim() == 2)
+
+      self.output:resize(input:size(1), self.depth, input:size(2), self.size, self.size) -- batch x depth x channels x size x size
+
+      self._crop = self._crop or self.output.new() -- scratch buffers, created on first use and kept on the module
+      self._pad = self._pad or input.new()
+
+      for sampleIdx=1,self.output:size(1) do
+         local outputSample = self.output[sampleIdx]
+         local inputSample = input[sampleIdx]
+         local xy = location[sampleIdx]
+         -- (-1,-1) top left corner, (1,1) bottom right corner of image
+         local x, y = xy:select(1,1), xy:select(1,2)
+         -- rescale to (0,0) top left corner, (1,1) bottom right corner
+         x, y = (x+1)/2, (y+1)/2
+
+         -- for each depth of glimpse : pad, crop, downscale
+         local glimpseSize = math.floor(self.size)
+         for depth=1,self.depth do
+            local dst = outputSample[depth]
+            if depth > 1 then
+               glimpseSize = math.floor(glimpseSize*self.scale) -- every further depth widens the window by self.scale
+            end
+
+            -- add zero padding (glimpse could be partially out of bounds)
+            local padSize = math.floor((glimpseSize-1)/2)
+            self._pad:resize(input:size(2), input:size(3)+padSize*2, input:size(4)+padSize*2):zero()
+            local center = self._pad:narrow(2,padSize+1,input:size(3)):narrow(3,padSize+1,input:size(4))
+            center:copy(inputSample)
+
+            -- crop it: h, w are the largest valid zero-based offsets; x, y are clamped to them and floored
+            local h, w = self._pad:size(2)-glimpseSize, self._pad:size(3)-glimpseSize
+            local x, y = math.floor(math.min(h,math.max(0,x*h))), math.floor(math.min(w,math.max(0,y*w)))
+
+            if depth == 1 then -- first depth is copied at native resolution
+               dst:copy(self._pad:narrow(2,x+1,glimpseSize):narrow(3,y+1,glimpseSize))
+            else
+               self._crop:resize(input:size(2), glimpseSize, glimpseSize)
+               self._crop:copy(self._pad:narrow(2,x+1,glimpseSize):narrow(3,y+1,glimpseSize))
+
+               if torch.type(self.module) == 'nn.SpatialAveragePooling' then -- kernel == stride == glimpseSize/size
+                  local poolSize = glimpseSize/self.size
+                  assert(poolSize % 2 == 0)
+                  self.modules[1].kW = poolSize
+                  self.modules[1].kH = poolSize
+                  self.modules[1].dW = poolSize
+                  self.modules[1].dH = poolSize
+               end
+               dst:copy(self.modules[1]:updateOutput(self._crop))
+            end
+         end
+      end
+
+      self.output:resize(input:size(1), self.depth*input:size(2), self.size, self.size) -- fold the depth dimension into the channel dimension
+      self.output = self:fromBatch(self.output, 1) -- presumably removes the batch dimension again for non-batch input (fromBatch is defined elsewhere; TODO confirm)
+      return self.output
+   end
+
+   function SG:updateGradInput(inputTable, gradOutput) -- Backward pass: accumulates gradOutput into an input-sized gradient; the location gradient stays zero
+      local input, location = unpack(inputTable)
+      local gradInput, gradLocation = unpack(self.gradInput)
+      input, location = self:toBatch(input, 3), self:toBatch(location, 1)
+      gradOutput = self:toBatch(gradOutput, 3)
+
+      gradInput:resizeAs(input):zero()
+      gradLocation:resizeAs(location):zero() -- no backprop through location
+
+      gradOutput = gradOutput:view(input:size(1), self.depth, input:size(2), self.size, self.size) -- split the merged dimension back into depth x channels
+
+      for sampleIdx=1,gradOutput:size(1) do
+         local gradOutputSample = gradOutput[sampleIdx]
+         local gradInputSample = gradInput[sampleIdx]
+         local xy = location[sampleIdx] -- height, width
+         -- (-1,-1) top left corner, (1,1) bottom right corner of image
+         local x, y = xy:select(1,1), xy:select(1,2)
+         -- rescale to (0,0) top left corner, (1,1) bottom right corner
+         x, y = (x+1)/2, (y+1)/2
+
+         -- for each depth of glimpse : zero the pad buffer, write the (upscaled) gradient into the crop window, strip the padding
+         local glimpseSize = self.size
+         for depth=1,self.depth do
+            local src = gradOutputSample[depth]
+            if depth > 1 then
+               glimpseSize = glimpseSize*self.scale
+            end
+
+            -- add zero padding (glimpse could be partially out of bounds)
+            local padSize = math.floor((glimpseSize-1)/2)
+            self._pad:resize(input:size(2), input:size(3)+padSize*2, input:size(4)+padSize*2):zero()
+
+            local h, w = self._pad:size(2)-glimpseSize, self._pad:size(3)-glimpseSize
+            local x, y = math.min(h,math.max(0,x*h)),  math.min(w,math.max(0,y*w)) -- NOTE(review): unlike updateOutput, the offsets are not floored here; confirm that is intended
+            local pad = self._pad:narrow(2, x+1, glimpseSize):narrow(3, y+1, glimpseSize)
+
+            -- upscale glimpse for different depths
+            if depth == 1 then
+               pad:copy(src)
+            else
+               self._crop:resize(input:size(2), glimpseSize, glimpseSize) -- only resized, not refilled: passed below as the input argument of updateGradInput
+
+               if torch.type(self.module) == 'nn.SpatialAveragePooling' then
+                  local poolSize = glimpseSize/self.size
+                  assert(poolSize % 2 == 0)
+                  self.modules[1].kW = poolSize
+                  self.modules[1].kH = poolSize
+                  self.modules[1].dW = poolSize
+                  self.modules[1].dH = poolSize
+               end
+
+               pad:copy(self.modules[1]:updateGradInput(self._crop, src))
+            end
+
+            -- copy into gradInput tensor (excluding padding)
+            gradInputSample:add(self._pad:narrow(2, padSize+1, input:size(3)):narrow(3, padSize+1, input:size(4)))
+         end
+      end
+
+      self.gradInput[1] = self:fromBatch(gradInput, 1)
+      self.gradInput[2] = self:fromBatch(gradLocation, 1)
+
+      return self.gradInput
+   end
+
+   local batchSize = 1
+   local inputSize = {2,8,8}
+   local glimpseSize = 4
+   local input = torch.randn(batchSize, unpack(inputSize))
+   input:resize(batchSize, unpack(inputSize)) -- same shape the tensor was created with
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2) -- module under test
+   local sg2 = nn.SG(glimpseSize, 2) -- legacy reference defined above
+
+   for i=1,10 do -- compare forward outputs at 10 random locations (backward is not compared)
+      local location = torch.Tensor(batchSize, 2):uniform(-0.9,0.9)
+      local output = sg:forward{input,location}
+      local output2 = sg2:forward{input,location}
+      mytester:assertTensorEq(output, output2, 0.0000001, "SpatialGlimpse err")
+   end
+
+end
+
+-- Test rectangle-shaped glimpse sampling: nn.SpatialGlimpse built with a {height, width} size table (forward crops, backward, online mode, CUDA).
+function rnntest.SpatialGlimpseRect()
+   if not pcall(function() require "image" end) then return end -- needs the image package
+   if not pcall(function() require "nnx" end) then return end -- needs the nnx package
+   local batchSize = 1
+   local inputSize = {2,8,8}
+
+   local glimpseSize = {4,2} -- {height, width}
+   local input = torch.Tensor(batchSize, unpack(inputSize))
+   input:range(1,input:nElement()) -- fill with the values 1..N
+   input:resize(batchSize, unpack(inputSize))
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize[1], glimpseSize[2]) -- batch x depth x channels x height x width (3 depths expected when none is given)
+   local y0 = (input:size(3)-glimpseSize[1])/2 + 1 -- 1-based top-left corner of the centered crop
+   local x0 = (input:size(4)-glimpseSize[2])/2 + 1
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect center 4 output depth=1 err")
+   local outputSize = {batchSize, inputSize[1]*3, glimpseSize[1], glimpseSize[2]}
+   mytester:assertTableEq(output:size():totable(), outputSize, 0.000001, "SpatialGlimpseRect output size err")
+
+   local input2 = torch.Tensor(unpack(inputSize)) -- same data without the batch dimension ("online" mode)
+   input2:range(1,input2:nElement())
+   input2:resize(unpack(inputSize))
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location2 = torch.Tensor(2):fill(0) -- center patch
+   local output2 = sg:forward{input2,location2}
+   mytester:assertTensorEq(output2, output[1], 0.00001, "SpatialGlimpseRect online output depth=1 err")
+
+   local glimpseSize = {5,3} -- odd height and width
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local y0 = math.floor((input:size(3)-glimpseSize[1])/2) + 1
+   local x0 = math.floor((input:size(4)-glimpseSize[2])/2) + 1
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect center 5 output depth=1 err")
+
+   local glimpseSize = {4,3}
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(-1) -- top left corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local padSize = {math.floor((glimpseSize[1]-1)/2), math.floor((glimpseSize[2]-1)/2)}
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+padSize[1]*2, inputSize[3]+padSize[2]*2):zero() -- expected result is cut from a zero-padded copy of the input
+   pad:narrow(3, padSize[1] + 1, inputSize[2]):narrow(4, padSize[2] + 1, inputSize[3]):copy(input)
+   local output2 = pad:narrow(3,1,glimpseSize[1]):narrow(4,1,glimpseSize[2])
+   --print('top-left', output2, output_:select(2, 1))
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect top-left 4 output depth=1 err")
+
+   local glimpseSize = {5,4}
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(-1) -- top left corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+glimpseSize[1], inputSize[3]+glimpseSize[2]):zero()
+   local y0 = math.floor((glimpseSize[1]-1)/2) + 1
+   local x0 = math.floor((glimpseSize[2]-1)/2) + 1
+   pad:narrow(3, y0, inputSize[2]):narrow(4, x0, inputSize[3]):copy(input)
+   local output2 = pad:narrow(3,1,glimpseSize[1]):narrow(4,1,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect top-left 5 output depth=1 err")
+
+   local glimpseSize = {3,4}
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(1) -- bottom-right corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+glimpseSize[1], inputSize[3]+glimpseSize[2]):zero()
+   local y0 = math.floor((glimpseSize[1]-1)/2) + 1
+   local x0 = math.floor((glimpseSize[2]-1)/2) + 1
+   pad:narrow(3, y0, inputSize[2]):narrow(4, x0, inputSize[3]):copy(input)
+   local dy = math.floor((glimpseSize[1])/2)
+   local dx = math.floor((glimpseSize[2])/2)
+   local output2 = pad:narrow(3,inputSize[2]-dy+1,glimpseSize[1]):narrow(4,inputSize[3]-dx+1,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect bottom-right 4 output depth=1 err")
+
+   local glimpseSize = {4,5}
+   local sg = nn.SpatialGlimpse(glimpseSize)
+   local location = torch.Tensor(batchSize, 2):fill(1) -- bottom-right corner patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 3, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local pad = torch.Tensor(batchSize, inputSize[1], inputSize[2]+glimpseSize[1], inputSize[3]+glimpseSize[2]):zero()
+   local y0 = math.floor((glimpseSize[1])/2)
+   local x0 = math.floor((glimpseSize[2])/2)
+   pad:narrow(3, y0, inputSize[2]):narrow(4, x0, inputSize[3]):copy(input)
+   local dy = math.floor((glimpseSize[1])/2)
+   local dx = math.floor((glimpseSize[2])/2)
+   local output2 = pad:narrow(3,inputSize[2]-dy+1,glimpseSize[1]):narrow(4,inputSize[3]-dx+1,glimpseSize[2])
+   --print('bottom-right', output2, output_:select(2, 1))
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect bottom-right 5 output depth=1 err")
+
+   -- test gradients (depth 1: the gradient is the glimpse written back into its crop window)
+   local glimpseSize = {4,4} -- {height, width}
+   local sg = nn.SpatialGlimpse(glimpseSize, 1)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 1, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local y0 = math.floor((input:size(3)-glimpseSize[1])/2) + 1
+   local x0 = math.floor((input:size(4)-glimpseSize[2])/2) + 1
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect center 4 output depth=1 err")
+   local gradInput = sg:backward({input,location}, output)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2]):copy(output_:select(2,1))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect backward 4 depth 1 error")
+
+   -- test with spatial resampling (the module's downscaler is swapped for nnx SpatialReSampling)
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   sg.module = nn.SpatialReSampling{owidth=glimpseSize[2],oheight=glimpseSize[1]}
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local y0 = math.floor((input:size(3)-glimpseSize[1])/2) + 1
+   local x0 = math.floor((input:size(4)-glimpseSize[2])/2) + 1
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   gradOutput:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2]):select(2,1):fill(0) -- ignore first scale of glimpse
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local srs = nn.SpatialReSampling{oheight=glimpseSize[1]*2,owidth=glimpseSize[2]*2} -- glimpseSize is {height, width}
+   local gradInput2 = srs:updateGradInput(gradInput[1], output_:select(2,2))
+   --print('SpatialReSampling', gradInput2, gradInput[1])
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect backward 4 depth 2 error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   sg.module = nn.SpatialReSampling{owidth=glimpseSize[2],oheight=glimpseSize[1]}
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local gradInput2 = input:clone():zero() -- expected: depth-1 glimpse in its window plus the upscaled depth-2 gradient
+   gradInput2:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2]):copy(output_:select(2,1))
+   gradInput2:add(srs:updateGradInput(gradInput[1], output_:select(2,2)))
+   --print('SpatialReSampling', gradInput2, gradInput[1])
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect backward 4 depth 2 full error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2) -- online (non-batch) backward must match the first sample of the batch result
+   sg.module = nn.SpatialReSampling{owidth=glimpseSize[2],oheight=glimpseSize[1]}
+   local output2 = sg:forward{input[1], location[1]}
+   local gradInput2 = sg:backward({input[1], location[1]}, gradOutput[1])
+   mytester:assertTensorEq(gradInput[1][1], gradInput2[1], 0.000001, "SpatialGlimpseRect backward online img err")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2], 0.000001, "SpatialGlimpseRect backward online loc err")
+
+   -- test with spatial avg pool (module left as constructed; reference is a 2x2 SpatialAveragePooling)
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local y0 = math.floor((input:size(3)-glimpseSize[1])/2) + 1
+   local x0 = math.floor((input:size(4)-glimpseSize[2])/2) + 1
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   gradOutput:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2]):select(2,1):fill(0) -- ignore first scale of glimpse
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local srs = nn.SpatialAveragePooling(2,2,2,2)
+   local gradInput2 = srs:updateGradInput(gradInput[1], output_:select(2,2))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect avgpool backward 4 depth 2 error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   local location = torch.Tensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2]):copy(output_:select(2,1))
+   gradInput2:add(srs:updateGradInput(gradInput[1], output_:select(2,2)))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect avgpool backward 4 depth 2 full error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2)
+   local output2 = sg:forward{input[1], location[1]}
+   local gradInput2 = sg:backward({input[1], location[1]}, gradOutput[1])
+   mytester:assertTensorEq(gradInput[1][1], gradInput2[1], 0.000001, "SpatialGlimpseRect avgpool backward online img err")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2], 0.000001, "SpatialGlimpseRect avgpool backward online loc err")
+
+   -- test avg pool with cuda (same three avg-pool checks repeated on CUDA tensors; y0/x0 are reused from above)
+   if not pcall(function() require "cunn" end) then return end -- needs the cunn package
+   local input = input:cuda()
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2):cuda()
+   local location = torch.CudaTensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   gradOutput:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2]):select(2,1):fill(0) -- ignore first scale of glimpse
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local srs = nn.SpatialAveragePooling(2,2,2,2):cuda()
+   local gradInput2 = srs:updateGradInput(gradInput[1], output_:select(2,2))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect avgpool backward 4 depth 2 error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2):cuda()
+   local location = torch.CudaTensor(batchSize, 2):fill(0) -- center patch
+   local output = sg:forward{input,location}
+   local output_ = output:view(batchSize, 2, inputSize[1], glimpseSize[1], glimpseSize[2])
+   local output2 = input:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2])
+   mytester:assertTensorEq(output2, output_:select(2, 1), 0.00001, "SpatialGlimpseRect avgpool center 4 output depth=1 err")
+   local gradOutput = output:clone()
+   local gradInput = sg:backward({input,location}, gradOutput)
+   local gradInput2 = input:clone():zero()
+   gradInput2:narrow(3,y0,glimpseSize[1]):narrow(4,x0,glimpseSize[2]):copy(output_:select(2,1))
+   gradInput2:add(srs:updateGradInput(gradInput[1], output_:select(2,2)))
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.000001, "SpatialGlimpseRect avgpool backward 4 depth 2 full error")
+
+   local sg = nn.SpatialGlimpse(glimpseSize, 2):cuda()
+   local output2 = sg:forward{input[1], location[1]}
+   local gradInput2 = sg:backward({input[1], location[1]}, gradOutput[1])
+   mytester:assertTensorEq(gradInput[1][1], gradInput2[1], 0.000001, "SpatialGlimpseRect avgpool backward online img err")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2], 0.000001, "SpatialGlimpseRect avgpool backward online loc err")
+
+   if false then -- disabled: flip to true to run the benchmark manually
+      -- benchmark GPU vs CPU
+      local location = torch.FloatTensor(32,2):uniform(-1,1)
+      local input = torch.FloatTensor(32,3,224,224):uniform(0,1)
+      local gradOutput = torch.FloatTensor(32,9,32,32):uniform(0,1)
+      local sg = nn.SpatialGlimpse({32,24}, 3, 2):float()
+      sg:forward{input,location} -- one untimed call before the timed loop
+      local a = torch.Timer()
+      for i=1,5 do
+         sg:forward{input,location}
+      end
+      local fwdCPUtime = a:time().real
+
+      sg:cuda()
+      location = location:cuda()
+      input = input:cuda()
+      gradOutput = gradOutput:cuda()
+      sg:forward{input,location}
+      a = torch.Timer()
+      for i=1,5 do
+         sg:forward{input,location}
+      end
+      local fwdGPUtime = a:time().real
+      print(fwdGPUtime, fwdCPUtime, fwdCPUtime/fwdGPUtime)
+      --
+   end
+end
+
+-- nn.ArgMax: forward must match the indices from torch.max; backward must return zeros.
+function rnntest.ArgMax()
+   local batchSize, inputSize = 3, 5
+   local input = torch.randn(batchSize, inputSize)
+   local gradOutput = torch.randn(batchSize):long()
+   -- reference values: row-wise argmax indices and an all-zero gradient
+   local _, maxIdx = torch.max(input, 2)
+   local expected, zeros = maxIdx:select(2,1), input:clone():zero()
+   local argmax = nn.ArgMax(1,1) -- asLong omitted
+   local output = argmax:forward(input)
+   mytester:assertTensorEq(expected, output, 0.000001, "ArgMax output asLong err")
+   mytester:assertTensorEq(argmax:backward(input, gradOutput), zeros, 0.000001, "ArgMax gradInput asLong err")
+   -- asLong disabled: the output is compared against the indices cast to double
+   argmax = nn.ArgMax(1,1,false)
+   output = argmax:forward(input)
+   mytester:assertTensorEq(expected:double(), output, 0.000001, "ArgMax output not asLong err")
+   mytester:assertTensorEq(argmax:backward(input, gradOutput), zeros, 0.000001, "ArgMax gradInput not asLong err")
+end
+
+-- nn.CategoricalEntropy: forward returns its input; ce.entropy and gradInput are checked against a manual computation.
+function rnntest.CategoricalEntropy()
+   local batchSize, inputSize = 10, 5
+   local logits = torch.randn(batchSize, inputSize)
+   local input = nn.SoftMax():updateOutput(logits)
+   local gradOutput = torch.Tensor(batchSize, inputSize):zero()
+   local ce = nn.CategoricalEntropy()
+   mytester:assertTensorEq(input, ce:forward(input), 0.0000001, "CategoricalEntropy output err")
+   local gradInput = ce:backward(input, gradOutput)
+   -- p: per-category sums over the batch, normalized to sum to one
+   local p = input:sum(1)[1]
+   p:div(p:sum())
+   local logP = torch.log(p + 0.000001)
+   p:cmul(logP) -- in place: p now holds p*log(p)
+   local expectedEntropy = -p:sum()
+   mytester:assert(math.abs(ce.entropy - expectedEntropy) < 0.000001, "CategoricalEntropy entropy err")
+   -- -1*(-1 - log(p(x))) = 1 + log(p(x)), then divided by sum(input) and by batchSize (all in place on logP)
+   logP:add(1):div(input:sum()):div(batchSize)
+   local expectedGrad = logP:view(1,inputSize):expandAs(input)
+   mytester:assertTensorEq(expectedGrad, gradInput, 0.000001, "CategoricalEntropy gradInput err")
+end
+
+-- Runs nn.TotalDropout ten times on the same input. Whenever td.noise is 0
+-- both output and gradInput must sum to zero; otherwise they must equal the
+-- input and gradOutput respectively. The number of pass-through rounds must
+-- fall strictly between 1 and 10.
+function rnntest.TotalDropout()
+   local nRows, nCols = 4, 3
+   local tol = 0.000001
+   local input = torch.randn(nRows, nCols)
+   local gradOutput = torch.randn(nRows, nCols)
+   local td = nn.TotalDropout()
+
+   local keptCount = 0
+   for _ = 1, 10 do
+      local output = td:forward(input)
+      local gradInput = td:backward(input, gradOutput)
+      if td.noise ~= 0 then
+         -- nothing dropped: both passes are the identity
+         mytester:assertTensorEq(output, input, tol, "TotalDropout forward 1 err")
+         mytester:assertTensorEq(gradInput, gradOutput, tol, "TotalDropout backward 1 err")
+         keptCount = keptCount + 1
+      else
+         -- everything dropped
+         mytester:assert(output:sum() == 0, "TotalDropout forward 0 err")
+         mytester:assert(gradInput:sum() == 0, "TotalDropout backward 0 err")
+      end
+   end
+   mytester:assert(keptCount > 1 and keptCount < 10, "TotalDropout bernoulli error")
+end
+
+
+-- Unit Test WhiteNoise
+-- Training mode: noise added to an all-zero input must have a mean within
+-- 0.01 of zero and a standard deviation in [0, 0.15).
+-- Evaluation mode: the output must stay exactly zero.
+-- Backward: updateGradInput must return gradOutput unchanged.
+function rnntest.WhiteNoise()
+   local input = torch.zeros(3, 28, 28)
+   local noiseModule = nn.WhiteNoise()
+
+   -- Training
+   local noisy = noiseModule:forward(input)
+   local noisyMean = noisy:mean()
+   local noisyStd = noisy:std()
+   mytester:assert(noisyMean > -0.01 and noisyMean < 0.01)
+   mytester:assert(noisyStd < 0.15 and noisyStd >= 0)
+
+   -- Evaluate
+   noiseModule:evaluate()
+   local clean = noiseModule:forward(input)
+   local cleanMean = clean:mean()
+   local cleanStd = clean:std()
+   mytester:assert(cleanMean == 0)
+   mytester:assert(cleanStd == 0)
+
+   -- backprop
+   noiseModule:training()
+   local gradOutput = torch.rand(3, 28, 28)
+   local gradInput = noiseModule:updateGradInput(input, gradOutput)
+   mytester:assertTensorEq(gradOutput, gradInput, 0.000001, "WhiteNoise backward err")
+end
+
+-- Unit Test SpatialBinaryLogisticRegression criterion
+-- Constant-filled input/target tensors are built for every combination of
+-- values in {1, 0, -1}. The criterion's loss and the first element of its
+-- gradInput are then compared, within `precision`, against closed-form
+-- expressions. The sweep runs on a batch of images and on a single image.
+function rnntest.SpatialBinaryLogisticRegression()
+   local crit = nn.SpatialBinaryLogisticRegression()
+   local h, w = 28, 28
+   local levels = {1, 0, -1}
+
+   -- Runs the full value sweep on the given tensors; `k` is the image count
+   -- that appears in the denominator of the expected gradient.
+   local function sweep(input, target, k)
+      for _, i in pairs(levels) do
+         for _, t in pairs(levels) do
+            input:fill(i)
+            target:fill(t)
+            local expo = math.exp(-1*i*t)
+
+            -- Check forward
+            local loss = crit:updateOutput(input, target)
+            local myLoss = math.log(1+expo)/2
+            mytester:assert( loss >= myLoss-precision and loss <= myLoss+precision,
+                             "SpatialBinaryLogisticRegression cost incorrect.")
+
+            -- Check backward: take the first element by indexing [1] once
+            -- per input dimension
+            local g1 = crit:updateGradInput(input, target)
+            for _ = 1, input:dim() do
+               g1 = g1[1]
+            end
+            local gi = (1/(1+expo))*expo*(-1*t)/(2*k*h*w)
+            mytester:assert( g1 >= gi-precision and g1 <= gi+precision,
+                            "SpatialBinaryLogisticRegression gradInput error.")
+         end
+      end
+   end
+
+   -- Working with batch of images
+   local batch = 32
+   sweep(torch.zeros(batch, 1, h, w), torch.zeros(batch, 1, h, w), batch)
+
+   -- Working with single image
+   sweep(torch.zeros(1, h, w), torch.zeros(1, h, w), 1)
+end
+
+-- Unit Test BinaryLogisticRegression criterion
+-- Constant-filled input/target tensors are built for every combination of
+-- values in {1, 0, -1}. The criterion's loss and gradInput[1][1] are then
+-- compared, within `precision`, against closed-form expressions. The sweep
+-- runs first with matching k x 1 shapes, then with a 1 x k input against a
+-- k x 1 target (same element count, different shape).
+function rnntest.BinaryLogisticRegression()
+   local crit = nn.BinaryLogisticRegression()
+   local k = 32
+   local levels = {1, 0, -1}
+
+   -- Runs the full value sweep on the given pair of tensors.
+   local function sweep(input, target)
+      for _, i in pairs(levels) do
+         for _, t in pairs(levels) do
+            input:fill(i)
+            target:fill(t)
+            local expo = math.exp(-1*i*t)
+
+            -- Check forward
+            local loss = crit:updateOutput(input, target)
+            local myLoss = math.log(1+expo)
+            mytester:assert( loss >= myLoss-precision and loss <= myLoss+precision,
+                             "BinaryLogisticRegression cost incorrect.")
+
+            -- Check backward
+            local gradInput = crit:updateGradInput(input, target)
+            local g1 = gradInput[1][1]
+            local gi = (1/(1+expo))*expo*(-1*t)/(k)
+            mytester:assert( g1 >= gi-precision and g1 <= gi+precision,
+                            "BinaryLogisticRegression gradInput error.")
+         end
+      end
+   end
+
+   -- Working with batch of images
+   sweep(torch.zeros(k, 1), torch.zeros(k, 1))
+
+   -- Input and target shapes not matching
+   sweep(torch.zeros(1, k), torch.zeros(k, 1))
+end
+
+-- Unit Test SpatialRegionDropout
+-- Feeds a constant-valued image through the module and checks that the mean
+-- of both the output and the gradInput stays within `precision` of that
+-- constant. The second loop pass moves model and input to the GPU when cunn
+-- can be loaded; otherwise it repeats the CPU pass.
+function rnntest.SpatialRegionDropout()
+   local cudaAvailable = pcall(function() require 'cunn' end)
+   local p = 0.2
+   local value = 2
+   local lower, upper = value-precision, value+precision
+   local model = nn.SpatialRegionDropout(p)
+   local input = torch.zeros(3, 100, 100):fill(value)
+
+   for _, useCuda in pairs({false, cudaAvailable}) do
+      if useCuda then
+         model:cuda()
+         input = input:cuda()
+      end
+
+      local outMean = model:forward(input):mean()
+      mytester:assert( outMean >= lower and outMean <= upper,
+                       "SpatialRegionDropout forward mean value incorrect.")
+
+      -- the input doubles as gradOutput
+      local gradMean = model:backward(input, input):mean()
+      mytester:assert( gradMean >= lower and gradMean <= upper,
+                       "SpatialRegionDropout backward mean value incorrect.")
+   end
+end
+
+-- Unit Test SpatialBinaryConvolution
+-- Smoke test with no value assertions: one forward/backward pass on a random
+-- 3 x 224 x 224 image must complete without raising. The second loop pass
+-- uses the GPU when cunn can be loaded.
+function rnntest.SpatialBinaryConvolution()
+   local cudaAvailable = pcall(function() require 'cunn' end)
+   local nInputPlane, nOutputPlane = 3, 16
+   local kW, kH = 3, 3
+   local height, width = 224, 224
+
+   local model = nn.SpatialBinaryConvolution(nInputPlane, nOutputPlane, kW, kH)
+   local input = torch.rand(nInputPlane, height, width)
+
+   for _, useCuda in pairs({false, cudaAvailable}) do
+      if useCuda then
+         model:cuda()
+         input = input:cuda()
+      end
+      model:zeroGradParameters()
+      -- the forward output is reused as gradOutput
+      local output = model:forward(input)
+      model:backward(input, output)
+   end
+end
+
+-- Unit Test SimpleColorTransform
+-- With a constant-valued batch as input, the standard deviation of the
+-- output must not exceed the range plus 10%, and gradInput must sum to the
+-- same value as the gradOutput that was passed in (the input itself).
+-- The second loop pass uses the GPU when cunn can be loaded.
+function rnntest.SimpleColorTransform()
+   local cudaAvailable = pcall(function() require 'cunn' end)
+   local value = 10
+   local rangeValue = 2
+   local slack = rangeValue*0.1
+   local range = torch.zeros(3):fill(rangeValue)
+   local model = nn.SimpleColorTransform(3, range)
+   local input = torch.zeros(32, 3, 100, 100):fill(value)
+
+   for _, useCuda in pairs({false, cudaAvailable}) do
+      if useCuda then
+         model:cuda()
+         input = input:cuda()
+      end
+
+      local output = model:forward(input)
+      mytester:assert(output:std() <= rangeValue+slack,
+                       "SimpleColorTransform output value incorrect.")
+
+      local gradInput = model:backward(input, input)
+      mytester:assert(gradInput:sum() == input:sum(),
+                       "SimpleColorTransform gradInput value incorrect.")
+   end
+end
+
+-- Unit Test PCAColorTransform
+-- With a constant-valued batch as input, the standard deviation of the
+-- output must not exceed rangeValue plus a slack of rangeValue * 3 * std,
+-- and gradInput must sum to the same value as the gradOutput that was passed
+-- in (the input itself). The second loop pass uses the GPU when cunn can be
+-- loaded.
+function rnntest.PCAColorTransform()
+   local cudaAvailable = pcall(function() require 'cunn' end)
+   local std = 0.1
+   local value = 145
+   local rangeValue = 1800
+   local slack = rangeValue * 3 * std
+
+   local eigenVectors = torch.Tensor({
+      { 0.58786434,  0.56388045,  0.58004685},
+      {-0.65427388, -0.0902746,   0.75085031},
+      {-0.47575331,  0.82090763, -0.31586303},
+   })
+   local eigenValues = torch.Tensor({4491.21, 722.85, 68.07})
+   local model = nn.PCAColorTransform(3, eigenVectors, eigenValues, std)
+   local input = torch.zeros(32, 3, 100, 100):fill(value)
+
+   for _, useCuda in pairs({false, cudaAvailable}) do
+      if useCuda then
+         model:cuda()
+         input = input:cuda()
+      end
+
+      local output = model:forward(input)
+      mytester:assert(output:std() <= rangeValue+slack,
+                       "PCAColorTransform output value incorrect.")
+
+      local gradInput = model:backward(input, input)
+      mytester:assert(gradInput:sum() == input:sum(),
+                       "PCAColorTransform gradInput value incorrect.")
+   end
+end
+
+-- Unit Test FireModule
+-- Smoke test with no value assertions: for each activation name in the list,
+-- a FireModule is built and one forward/backward pass is run on a random
+-- 1 x 3 x 32 x 32 input. Each activation is run on the CPU and, when cunn
+-- can be loaded, on the GPU (otherwise the CPU pass is repeated).
+function rnntest.FireModule()
+   local hasCuda = pcall(function() require 'cunn' end)
+   local useCudas = {false, hasCuda}
+   local activations = {'ReLU', 'Tanh', 'Sigmoid'}
+   local nInputPlane = 3
+   local width = 32
+   local height = 32
+   local s1x1 = 16
+   local e1x1 = 16
+   local e3x3 = 16
+   for _, activation in pairs(activations) do
+      for _, useCuda in pairs(useCudas) do
+         -- The activation name has to be handed to the constructor;
+         -- without it every iteration would build the same default module
+         -- and the outer loop would test nothing new.
+         local model = nn.FireModule(nInputPlane, s1x1, e1x1, e3x3, activation)
+         local input = torch.rand(1, nInputPlane, height, width)
+         if useCuda then
+            model:cuda()
+            input = input:cuda()
+         end
+         -- the forward output is reused as gradOutput
+         local output = model:forward(input)
+         local gradInput = model:backward(input, output)
+      end
+   end
+end
+
+-- Unit Test SpatialFeatNormalization
+-- Input is filled with 2, with per-channel mean 1 and std 0.5. The forward
+-- output must average exactly 2, and the gradInput obtained by feeding that
+-- output back as gradOutput must average exactly 4.
+-- The second loop pass uses the GPU when cunn can be loaded.
+function rnntest.SpatialFeatNormalization()
+   local cudaAvailable = pcall(function() require 'cunn' end)
+   local input = torch.zeros(3, 32, 32):fill(2)
+   local mean = torch.zeros(3):fill(1)
+   local std = torch.zeros(3):fill(0.5)
+   local expectedOutputMean, expectedGradMean = 2, 4
+
+   for _, useCuda in pairs({false, cudaAvailable}) do
+      local model = nn.SpatialFeatNormalization(mean, std)
+      if useCuda then
+         model:cuda()
+         input = input:cuda()
+      end
+
+      local output = model:forward(input)
+      local gradInput = model:backward(input, output)
+      mytester:assert( output:mean() == expectedOutputMean,
+                     "SpatialFeatNormalization forward mean value incorrect.")
+      mytester:assert( gradInput:mean() == expectedGradMean,
+                     "SpatialFeatNormalization backward mean value incorrect.")
+   end
+end
+
+-- Tests nn.OneHot in three configurations:
+--  1. a 1D batch of class indices (plus a plain Lua number as input),
+--  2. the same checks after converting the module to CUDA (when cunn loads),
+--  3. a 2D batch of indices with nn.OneHot(nClass, 2), again on CPU and CUDA.
+-- Expected one-hot rows are built by selecting rows of an identity matrix.
+-- Backward is expected to return zeros shaped like the input.
+function rnntest.OneHot()
+   local nClass = 10
+
+   -- batch mode
+   local batchSize = 3
+   local input = torch.LongTensor(batchSize):random(1, nClass)
+   local gradOutput = torch.randn(batchSize, nClass)
+
+   local oh = nn.OneHot(nClass)
+
+   local output = oh:forward(input)
+   -- reference output: row i of the identity matrix is the one-hot code of class i
+   local output2 = torch.Tensor(batchSize, nClass):zero()
+   local eye = torch.eye(nClass)
+   output2:index(eye, 1, input)
+   mytester:assertTensorEq(output, output2, 0.000001, "OneHot forward batch err")
+   mytester:assert(output:dim() == 2)
+
+   -- non-batch mode (number input)
+   local num = 3
+   local output3 = torch.zeros(nClass)
+   output3[num] = 1.0
+   mytester:assertTensorEq(oh:forward(num), output3, 0.000001, "OneHot forward number err")
+
+   -- gradInput is compared with a zeroed double tensor shaped like the input
+   local gradInput = oh:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot backward batch err")
+
+   if pcall(function() require 'cunn' end) then
+      oh:cuda()
+
+      -- test with long input
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch long-cuda err")
+
+      -- test with cuda input
+      local input = input:cuda()
+      gradOutput = gradOutput:cuda()
+
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch cuda err")
+
+      -- the CUDA gradInput must match the CPU gradInput computed above
+      local gradInput2 = oh:backward(input, gradOutput)
+      mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot backward batch err")
+      cutorch.synchronize()
+
+      -- non-batch mode (number input)
+      mytester:assertTensorEq(oh:forward(num), output3:cuda(), 0.000001, "OneHot forward number err")
+   end
+
+   -- multi-dimensional input
+   -- (the locals below shadow the ones used by the 1D section above)
+   local inputSize = 2
+   local input = torch.LongTensor(batchSize, inputSize):random(1, nClass)
+   local gradOutput = torch.randn(batchSize, inputSize, nClass)
+
+   local oh = nn.OneHot(nClass, 2)
+
+   local output = oh:forward(input)
+   -- reference: one-hot encode the flattened indices, then restore the batch shape
+   local output2 = torch.Tensor(batchSize*inputSize, nClass):zero()
+   local eye = torch.eye(nClass)
+   output2:index(eye, 1, input:view(-1))
+   output2:resize(batchSize, inputSize, nClass)
+   mytester:assertTensorEq(output, output2, 0.000001, "OneHot 2d forward batch err")
+   mytester:assert(output:dim() == 3)
+
+   local gradInput = oh:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot 2d backward batch err")
+
+   if pcall(function() require 'cunn' end) then
+      oh:cuda()
+
+      -- test with long input
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch long-cuda err")
+
+      -- test with cuda input
+      local input = input:cuda()
+      gradOutput = gradOutput:cuda()
+
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch cuda err")
+
+      local gradInput2 = oh:backward(input, gradOutput)
+      mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot 2d backward batch err")
+
+      -- optional GPU vs CPU timing comparison; disabled unless the flag is flipped by hand
+      local benchmark = false
+      if benchmark then
+         local input = torch.FloatTensor(50, 50):random(1,65):cuda()
+
+         local oh = nn.OneHot(65):cuda()
+
+         -- one untimed forward call before the timed loop
+         oh:forward(input)
+         cutorch.synchronize()
+         local a = torch.Timer()
+         for i=1,10 do
+            oh:forward(input)
+         end
+         cutorch.synchronize()
+         local gputime = a:time().real
+
+         oh:float()
+         input = input:float()
+         oh:forward(input)
+         a = torch.Timer()
+         for i=1,10 do
+            oh:forward(input)
+         end
+         local cputime = a:time().real
+         print("Onehot GPU vs CPU time", gputime, cputime)
+      end
+   end
+end
+
+-- Tests nn.NCEModule (with batchnoise disabled) together with nn.NCECriterion.
+-- The module's four outputs are checked against a reference network
+-- Linear -> Exp -> MulConstant(1/Z) sharing the same weight and bias, the
+-- criterion's loss and gradients are checked against hand-written formulas,
+-- and the module's gradInput / parameter gradients are checked against the
+-- reference network. The whole sequence is repeated on CUDA when cunn loads.
+function rnntest.NCE_main()
+   local batchsize = 4
+   local k = 10
+   local inputsize = 3
+   local outputsize = 100
+
+   local noise = torch.Tensor(outputsize):random(1,100)
+
+   local ncem = nn.NCEModule(inputsize, outputsize, k, noise)
+   ncem.batchnoise = false
+   local ncec = nn.NCECriterion()
+
+   local input = torch.randn(batchsize, inputsize)
+   local target = torch.LongTensor(batchsize):random(1,outputsize)
+   local inputTable = {input, target}
+
+   -- test training
+
+   -- NCEModule.forward
+   local output = ncem:forward(inputTable)
+
+   mytester:assert(torch.type(output) == 'table')
+   mytester:assert(#output == 4)
+
+   -- the comments below read these as: m = model, n = noise, t = target, s = samples
+   local Pmt, Pms, Pnt, Pns = unpack(output)
+
+   mytester:assertTableEq(Pmt:size():totable(), {batchsize}, 0.0000001)
+   mytester:assertTableEq(Pms:size():totable(), {batchsize, k}, 0.0000001)
+   mytester:assertTableEq(Pnt:size():totable(), {batchsize}, 0.0000001)
+   mytester:assertTableEq(Pns:size():totable(), {batchsize, k}, 0.0000001)
+
+   mytester:assert(ncem.sampleidx:min() >= 1 and ncem.sampleidx:max() <= outputsize)
+
+   -- look the sampled indices up in the noise tensor: column 1 is compared
+   -- with Pnt, the remaining k columns with Pns
+   local sampleprob2 = noise:index(1, ncem.sampleidx:view(-1)):view(batchsize, k+1)
+   mytester:assertTensorEq(sampleprob2:select(2,1), Pnt, 0.0000001)
+   mytester:assertTensorEq(sampleprob2:narrow(2,2,k), Pns, 0.0000001)
+
+   -- reference model: exp(linear(input)) / Z over all classes, using the
+   -- NCE module's own weight, bias and Z
+   local linear = nn.Linear(inputsize, outputsize)
+   linear.weight:copy(ncem.weight)
+   linear.bias:copy(ncem.bias)
+   local mlp = nn.Sequential():add(linear):add(nn.Exp()):add(nn.MulConstant(1/ncem.Z[1]))
+
+   local output2_ = mlp:forward(input)
+   local output2 = torch.Tensor(batchsize, k+1)
+   -- keep only the classes that were sampled for each example
+   for i=1,batchsize do
+      output2[i]:index(output2_[i],1,ncem.sampleidx[i])
+   end
+   local Pmt2 = output2:select(2,1)
+   local Pms2 = output2:narrow(2,2,k)
+
+   mytester:assertTensorEq(Pmt, Pmt2, 0.000001)
+   mytester:assertTensorEq(Pms, Pms2, 0.000001)
+
+   -- NCECriterion.forward
+   local loss = ncec:forward(output, target)
+
+   -- eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt)
+   local Pom = Pmt:clone()
+   local mdiv = Pmt:clone():add(k, Pnt):add(0.0000001)
+   Pom:cdiv(mdiv)
+
+   -- eq 5.2 : P(origin=noise) = k*Pns / (Pms + k*Pns)
+   local Pon = Pns:clone():mul(k)
+   local ndiv = Pms:clone():add(k, Pns):add(0.0000001)
+   Pon:cdiv(ndiv)
+
+   -- equation 6 in ref. A
+
+   local lossm = torch.log(Pom):sum()
+   local lossn = torch.log(Pon):sum()
+
+   local loss2 = - (lossm + lossn)/batchsize
+
+   mytester:assert(math.abs(loss - loss2) < 0.000001)
+
+   -- NCECriterion.backward
+   local gradOutput = ncec:backward(output, target)
+
+   -- gradients for the two noise outputs (Pnt, Pns) are expected to sum to zero
+   mytester:assert(#gradOutput == 4)
+   mytester:assert(math.abs(gradOutput[3]:sum()) < 0.0000001)
+   mytester:assert(math.abs(gradOutput[4]:sum()) < 0.0000001)
+
+   local dPmt, dPms = gradOutput[1], gradOutput[2]
+
+   -- NOTE(review): the two expressions below match the derivative of loss2
+   -- with respect to Pmt and Pms (each divided by batchsize), not with
+   -- respect to the input as the original wording says -- confirm.
+   -- d Pmt / d input = -k*Pnt / ( Pmt * (Pmt + k*Pnt) )
+   local dPmt2 = torch.mul(Pnt, -k):cdiv(mdiv):cdiv(torch.add(Pmt, 0.0000001)):div(batchsize)
+   -- d Pms / d input = Pms / ( Pms * (Pms + k*Pns) )
+   local dPms2 = Pms:clone():cdiv(ndiv):cdiv(torch.add(Pms, 0.0000001)):div(batchsize)
+
+   mytester:assertTensorEq(dPmt, dPmt2, 0.0000001)
+   mytester:assertTensorEq(dPms, dPms2, 0.0000001)
+
+   -- NaN guard: x == x is false only when x is NaN
+   mytester:assert(dPmt:sum() == dPmt:sum())
+   mytester:assert(dPms:sum() == dPms:sum())
+
+   -- NCEModule.backward
+   ncem:zeroGradParameters()
+   local gradInput = ncem:backward(inputTable, gradOutput)
+
+   -- updateGradInput
+   -- scatter the (target, samples) gradients back onto the full class axis
+   -- so that they can be fed to the reference network
+   local gradOutput2_ = torch.zeros(batchsize, k+1)
+   gradOutput2_:select(2,1):copy(gradOutput[1])
+   gradOutput2_:narrow(2,2,k):copy(gradOutput[2])
+   local gradOutput2 = torch.zeros(batchsize, outputsize)
+   for i=1,batchsize do
+      gradOutput2[i]:indexAdd(1, ncem.sampleidx[i], gradOutput2_[i])
+   end
+   mlp:zeroGradParameters()
+   local gradInput2 = mlp:backward(input, gradOutput2)
+   mytester:assertTensorEq(gradInput[1], gradInput2, 0.0000001)
+
+   -- accGradParameters
+
+   local params, gradParams = ncem:parameters()
+   local params2, gradParams2 = mlp:parameters()
+
+   for i=1,#params do
+      mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.0000001)
+   end
+
+
+   if pcall(function() require 'cunn' end) then
+      -- test training with cuda
+      -- (same sequence of checks as above; the locals declared in this
+      -- branch shadow their CPU counterparts)
+
+      ncem:cuda()
+      ncec:cuda()
+
+      local input = input:cuda()
+      local target = target:cuda()
+
+      local inputTable = {input, target}
+
+      -- NCEModule.forward
+      local output = ncem:forward(inputTable)
+
+      mytester:assert(torch.type(output) == 'table')
+      mytester:assert(#output == 4)
+
+      local Pmt, Pms, Pnt, Pns = unpack(output)
+
+      mytester:assertTableEq(Pmt:size():totable(), {batchsize}, 0.0000001)
+      mytester:assertTableEq(Pms:size():totable(), {batchsize, k}, 0.0000001)
+      mytester:assertTableEq(Pnt:size():totable(), {batchsize}, 0.0000001)
+      mytester:assertTableEq(Pns:size():totable(), {batchsize, k}, 0.0000001)
+
+      mytester:assert(ncem.sampleidx:min() >= 1 and ncem.sampleidx:max() <= outputsize)
+
+      local sampleprob2 = noise:cuda():index(1, ncem.sampleidx:view(-1)):view(batchsize, k+1)
+
+      mytester:assertTensorEq(sampleprob2:select(2,1), Pnt, 0.0000001)
+      mytester:assertTensorEq(sampleprob2:narrow(2,2,k), Pns, 0.0000001)
+
+      local linear = nn.Linear(inputsize, outputsize)
+      linear.weight:copy(ncem.weight)
+      linear.bias:copy(ncem.bias)
+      local mlp = nn.Sequential():add(linear):add(nn.Exp()):add(nn.MulConstant(1/ncem.Z[1]))
+      mlp:cuda()
+
+      local output2_ = mlp:forward(input)
+      local output2 = torch.CudaTensor(batchsize, k+1)
+      for i=1,batchsize do
+         output2[i]:index(output2_[i],1,ncem.sampleidx[i])
+      end
+      local Pmt2 = output2:select(2,1)
+      local Pms2 = output2:narrow(2,2,k)
+
+      mytester:assertTensorEq(Pmt, Pmt2, 0.000001)
+      mytester:assertTensorEq(Pms, Pms2, 0.000001)
+
+      -- NCECriterion.forward
+      local loss = ncec:forward(output, target)
+
+      -- eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt)
+      local Pom = Pmt:clone()
+      local mdiv = Pmt:clone():add(k, Pnt):add(0.0000001)
+      Pom:cdiv(mdiv)
+
+      -- eq 5.2 : P(origin=noise) = k*Pns / (Pms + k*Pns)
+      local Pon = Pns:clone():mul(k)
+      local ndiv = Pms:clone():add(k, Pns):add(0.0000001)
+      Pon:cdiv(ndiv)
+
+      -- equation 6 in ref. A
+
+      local lossm = torch.log(Pom):sum()
+      local lossn = torch.log(Pon):sum()
+
+      local loss2 = - (lossm + lossn)/batchsize
+
+      mytester:assert(math.abs(loss - loss2) < 0.000001)
+
+      -- NCECriterion.backward
+      local gradOutput = ncec:backward(output, target)
+
+      mytester:assert(#gradOutput == 4)
+      mytester:assert(math.abs(gradOutput[3]:sum()) < 0.0000001)
+      mytester:assert(math.abs(gradOutput[4]:sum()) < 0.0000001)
+
+      local dPmt, dPms = gradOutput[1], gradOutput[2]
+
+      -- NOTE(review): as in the CPU section, these look like derivatives of
+      -- the loss with respect to Pmt / Pms -- confirm the wording.
+      -- d Pmt / d input = -k*Pnt / ( Pmt * (Pmt + k*Pnt) )
+      local dPmt2 = torch.mul(Pnt, -k):cdiv(mdiv):cdiv(torch.add(Pmt, 0.0000001)):div(batchsize)
+      -- d Pms / d input = Pms / ( Pms * (Pms + k*Pns) )
+      local dPms2 = Pms:clone():cdiv(ndiv):cdiv(torch.add(Pms, 0.0000001)):div(batchsize)
+
+      mytester:assertTensorEq(dPmt, dPmt2, 0.0000001)
+      mytester:assertTensorEq(dPms, dPms2, 0.0000001)
+
+      -- NaN guard: x == x is false only when x is NaN
+      mytester:assert(dPmt:sum() == dPmt:sum())
+      mytester:assert(dPms:sum() == dPms:sum())
+
+      -- NCEModule.backward
+      ncem:zeroGradParameters()
+      local gradInput = ncem:backward(inputTable, gradOutput)
+
+      -- updateGradInput
+      local gradOutput2_ = torch.zeros(batchsize, k+1):cuda()
+      gradOutput2_:select(2,1):copy(gradOutput[1])
+      gradOutput2_:narrow(2,2,k):copy(gradOutput[2])
+      local gradOutput2 = torch.zeros(batchsize, outputsize):cuda()
+      for i=1,batchsize do
+         gradOutput2[i]:indexAdd(1, ncem.sampleidx[i], gradOutput2_[i])
+      end
+      mlp:zeroGradParameters()
+      local gradInput2 = mlp:backward(input, gradOutput2)
+      mytester:assertTensorEq(gradInput[1], gradInput2, 0.0000001)
+
+      -- accGradParameters
+
+      local params, gradParams = ncem:parameters()
+      local params2, gradParams2 = mlp:parameters()
+
+      for i=1,#params do
+         mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.0000001)
+      end
+   end
+end
+
+-- Draws 1000 x 2500 samples through NCEModule:noiseSample from a random
+-- 10-class distribution and checks that the empirical class frequencies
+-- match the distribution to within 0.001.
+function rnntest.NCE_multinomial()
+   local nClass = 10
+   local nSample = 2500
+
+   -- random distribution, normalised to sum to 1
+   local probs = torch.Tensor(nClass):uniform(0,1)
+   probs:div(probs:sum())
+   local nce = nn.NCEModule(nClass, nClass, nSample, probs)
+
+   local samples = torch.LongTensor()
+   nce:noiseSample(samples, 1000, nSample)
+
+   -- histogram of the sampled class indices
+   local freq = torch.Tensor(nClass):zero()
+   samples:apply(function(classIdx)
+      freq[classIdx] = freq[classIdx] + 1
+   end)
+   freq:div(freq:sum())
+
+   mytester:assertTensorEq(probs, freq, 0.001)
+end
+
+-- Compares an nn.NCEModule running with batchnoise enabled (ncem) against a
+-- clone with batchnoise disabled (ncem2) whose noiseSample method is
+-- overridden to replay the first k sample indices drawn by ncem for every
+-- example. Outputs, criterion loss, criterion gradients, gradInput and
+-- parameter gradients of the two must agree. The sequence is repeated on
+-- CUDA when cunn loads.
+function rnntest.NCE_batchnoise()
+   local batchsize = 4
+   local k = 10
+   local inputsize = 3
+   local outputsize = 100
+
+   local noise = torch.Tensor(outputsize):random(1,100)
+
+   local ncem = nn.NCEModule(inputsize, outputsize, k, noise, 1)
+   assert(ncem.batchnoise)
+   local ncec = nn.NCECriterion()
+
+   -- reference module: identical parameters, but with batchnoise turned off
+   local ncem2 = ncem:clone()
+   ncem2.batchnoise = false
+   local ncec2 = ncec:clone()
+
+   local input = torch.randn(batchsize, inputsize)
+   local target = torch.LongTensor(batchsize):random(1,outputsize)
+   local inputTable = {input, target}
+
+   -- test training
+
+   -- NCEModule.forward
+   local output = ncem:forward(inputTable)
+
+   mytester:assert(torch.type(output) == 'table')
+   mytester:assert(#output == 4)
+
+   local Pmt, Pms, Pnt, Pns = unpack(output)
+
+   mytester:assertTableEq(Pmt:size():totable(), {batchsize}, 0.0000001)
+   mytester:assertTableEq(Pms:size():totable(), {batchsize, k}, 0.0000001)
+   mytester:assertTableEq(Pnt:size():totable(), {batchsize}, 0.0000001)
+   mytester:assertTableEq(Pns:size():totable(), {batchsize, k}, 0.0000001)
+
+   mytester:assert(ncem.sampleidx:min() >= 1 and ncem.sampleidx:max() <= outputsize)
+
+   -- in this mode the flattened sampleidx is read as k shared entries
+   -- followed by batchsize entries: the first k are compared with every row
+   -- of Pns, the trailing ones with Pnt
+   local sampleprob2 = noise:index(1, ncem.sampleidx:view(-1))
+   mytester:assertTensorEq(sampleprob2:narrow(1,k+1,batchsize), Pnt, 0.0000001)
+   mytester:assertTensorEq(sampleprob2:narrow(1,1,k):contiguous():view(1,k):expand(batchsize, k), Pns, 0.0000001)
+
+   -- make the reference module deterministic: replay ncem's first k sample
+   -- indices for every example in the batch
+   function ncem2.noiseSample(self, sampleidx, batchsize, k)
+      sampleidx:resize(batchsize, k):copy(ncem.sampleidx:narrow(1,1,k):view(1, k):expand(batchsize, k))
+      return sampleidx
+   end
+
+   local output2 = ncem2:forward(inputTable)
+   local Pmt2, Pms2, Pnt2, Pns2 = unpack(output2)
+
+   mytester:assertTensorEq(Pmt, Pmt2, 0.000001)
+   mytester:assertTensorEq(Pms, Pms2, 0.000001)
+
+   -- NCECriterion.forward
+   -- (both criteria are fed the batchnoise module's output)
+   local loss = ncec:forward(output, target)
+   local loss2 = ncec2:forward(output, target)
+
+   mytester:assert(math.abs(loss - loss2) < 0.000001)
+
+   -- NCECriterion.backward
+   local gradOutput = ncec:backward(output, target)
+   local gradOutput2 = ncec2:backward(output, target)
+
+   -- gradients for the two noise outputs (Pnt, Pns) are expected to sum to zero
+   mytester:assert(#gradOutput == 4)
+   mytester:assert(math.abs(gradOutput[3]:sum()) < 0.0000001)
+   mytester:assert(math.abs(gradOutput[4]:sum()) < 0.0000001)
+
+   mytester:assertTensorEq(gradOutput[1], gradOutput2[1], 0.0000001)
+   mytester:assertTensorEq(gradOutput[2], gradOutput2[2], 0.0000001)
+
+   -- NCEModule.backward
+   ncem:zeroGradParameters()
+   local gradInput = ncem:backward(inputTable, gradOutput)
+
+   ncem2:zeroGradParameters()
+   local gradInput2 = ncem2:backward(inputTable, gradOutput2)
+
+   -- updateGradInput
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.0000001)
+
+   -- accGradParameters
+   local params, gradParams = ncem:parameters()
+   local params2, gradParams2 = ncem2:parameters()
+
+   for i=1,#params do
+      mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.0000001, tostring(gradParams[i])..tostring(gradParams2[i]))
+   end
+
+
+   if pcall(function() require 'cunn' end) then
+      -- test training with cuda
+      -- (same sequence of checks as above; the locals declared in this
+      -- branch shadow their CPU counterparts)
+
+      ncem:cuda()
+      ncec:cuda()
+
+      ncem2:cuda()
+      ncec2:cuda()
+
+      local input = input:cuda()
+      local target = target:cuda()
+      local noise = noise:cuda()
+
+      local inputTable = {input, target}
+
+      -- NCEModule.forward
+      local output = ncem:forward(inputTable)
+
+      mytester:assert(torch.type(output) == 'table')
+      mytester:assert(#output == 4)
+
+      local Pmt, Pms, Pnt, Pns = unpack(output)
+
+      mytester:assertTableEq(Pmt:size():totable(), {batchsize}, 0.0000001)
+      mytester:assertTableEq(Pms:size():totable(), {batchsize, k}, 0.0000001)
+      mytester:assertTableEq(Pnt:size():totable(), {batchsize}, 0.0000001)
+      mytester:assertTableEq(Pns:size():totable(), {batchsize, k}, 0.0000001)
+
+      mytester:assert(ncem.sampleidx:min() >= 1 and ncem.sampleidx:max() <= outputsize)
+
+      local sampleprob2 = noise:index(1, ncem.sampleidx:view(-1))
+      mytester:assertTensorEq(sampleprob2:narrow(1,k+1,batchsize), Pnt, 0.0000001)
+      mytester:assertTensorEq(sampleprob2:narrow(1,1,k):contiguous():view(1,k):expand(batchsize, k), Pns, 0.0000001)
+
+      -- re-install the replaying sampler on the reference module
+      function ncem2.noiseSample(self, sampleidx, batchsize, k)
+         sampleidx:resize(batchsize, k):copy(ncem.sampleidx:narrow(1,1,k):view(1, k):expand(batchsize, k))
+         return sampleidx
+      end
+
+      local output2 = ncem2:forward(inputTable)
+      local Pmt2, Pms2, Pnt2, Pns2 = unpack(output2)
+
+      mytester:assertTensorEq(Pmt, Pmt2, 0.000001)
+      mytester:assertTensorEq(Pms, Pms2, 0.000001)
+
+      -- NCECriterion.forward
+      local loss = ncec:forward(output, target)
+      local loss2 = ncec2:forward(output, target)
+
+      mytester:assert(math.abs(loss - loss2) < 0.000001)
+
+      -- NCECriterion.backward
+      local gradOutput = ncec:backward(output, target)
+      local gradOutput2 = ncec2:backward(output, target)
+
+      mytester:assert(#gradOutput == 4)
+      mytester:assert(math.abs(gradOutput[3]:sum()) < 0.0000001)
+      mytester:assert(math.abs(gradOutput[4]:sum()) < 0.0000001)
+
+      mytester:assertTensorEq(gradOutput[1], gradOutput2[1], 0.0000001)
+      mytester:assertTensorEq(gradOutput[2], gradOutput2[2], 0.0000001)
+
+      -- NCEModule.backward
+      ncem:zeroGradParameters()
+      local gradInput = ncem:backward(inputTable, gradOutput)
+
+      ncem2:zeroGradParameters()
+      local gradInput2 = ncem2:backward(inputTable, gradOutput2)
+
+      -- updateGradInput
+      mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.0000001)
+
+      -- accGradParameters
+      local params, gradParams = ncem:parameters()
+      local params2, gradParams2 = ncem2:parameters()
+
+      -- note: this tolerance (1e-6) is looser than the 1e-7 used on the CPU
+      for i=1,#params do
+         mytester:assertTensorEq(gradParams[i], gradParams2[i], 0.000001, tostring(gradParams[i])..tostring(gradParams2[i]))
+      end
+   end
+end
+
+
+-- Checks the nn.NaN decorator wrapped around an nn.Linear.
+-- Without NaNs, forward/backward through the decorator must give the same
+-- output, gradInput and parameter gradients as the bare Linear. Forward must
+-- raise when the input or the bias contains NaN, and backward must raise
+-- when gradOutput or gradBias contains NaN.
+function rnntest.NaN()
+   local moses = require 'moses'
+   local tol = 0.000001
+   local input = torch.randn(2,3)
+   local gradOutput = torch.randn(2,4)
+   local lin = nn.Linear(3,4)
+   lin:zeroGradParameters()
+   local nan = nn.NaN(lin)
+   mytester:assert(nan.id == 1)
+
+   -- Both helpers read the current `input` / `gradOutput` upvalues and
+   -- report whether the decorated call raised an error.
+   local function forwardRaises()
+      return not pcall(function() nan:forward(input) end)
+   end
+   local function backwardRaises()
+      return not pcall(function() nan:backward(input, gradOutput) end)
+   end
+
+   -- test that it works when no NaNs are present
+   local decoratedOutput = nan:forward(input):clone()
+   local decoratedGradInput = nan:backward(input, gradOutput):clone()
+   local decoratedGradWeight = lin.gradWeight:clone()
+   local decoratedGradBias = lin.gradBias:clone()
+   lin:zeroGradParameters()
+   local plainOutput = lin:forward(input)
+   local plainGradInput = lin:backward(input, gradOutput)
+   mytester:assertTensorEq(decoratedOutput, plainOutput, tol)
+   mytester:assertTensorEq(decoratedGradInput, plainGradInput, tol)
+   mytester:assertTensorEq(decoratedGradWeight, lin.gradWeight, tol)
+   mytester:assertTensorEq(decoratedGradBias, lin.gradBias, tol)
+
+   -- test with some NaNs
+   -- log(log(0)) turns every element of the input into NaN
+   input:zero():log():log()
+   local nanValue = input:sum()
+   mytester:assert(moses.isNaN(nanValue))
+   mytester:assert(forwardRaises())
+
+   -- NaN in the bias, clean input
+   lin.bias:fill(nanValue)
+   input = torch.randn(2,3)
+   mytester:assert(forwardRaises())
+
+   -- NaN in gradOutput, clean bias
+   lin.bias:uniform(0,1)
+   gradOutput:fill(nanValue)
+   mytester:assert(backwardRaises())
+
+   -- NaN in gradBias, clean gradOutput
+   gradOutput:uniform(0,1)
+   lin.gradBias:fill(nanValue)
+   mytester:assert(backwardRaises())
+end
+
+-- Checks that the forward time accumulated by nn.Profile
+-- (net.summedFwdTime) stays within 5% of the wall-clock time measured
+-- around the same forward calls, so the profiling overhead is small.
+-- The measurement is taken twice: on a fresh module, and again after one
+-- extra forward call.
+function rnntest.profile()
+   -- timing the forward pass introduces some overhead to the module
+   -- We want to make sure this overhead isn't too large
+   local mx_overhead = 0.05
+   local print_every = 1000
+   local net = nn.Profile(nn.Linear(1024,1024), print_every)
+   local inp = torch.randn(1, 1024)
+   local timer = torch.Timer()
+
+   -- Runs print_every-1 forward passes and returns their summed wall time.
+   local function timedRound()
+      local elapsed = 0
+      for _ = 1, print_every-1 do
+         timer:reset()
+         net:forward(inp)
+         elapsed = elapsed + timer:time().real
+      end
+      return elapsed
+   end
+
+   local firstTotal = timedRound()
+   mytester:assert(math.abs(net.summedFwdTime - firstTotal) / firstTotal < mx_overhead)
+
+   -- print_every-th call, left untimed
+   -- NOTE(review): presumably this is where Profile prints and resets its
+   -- accumulated time -- confirm against nn.Profile
+   net:forward(inp)
+
+   -- Do the same test, now that all the memory has already been allocated
+   local secondTotal = timedRound()
+   mytester:assert(math.abs(net.summedFwdTime - secondTotal) / secondTotal < mx_overhead)
+end
+
+function rnntest.NCE_multicuda()
+   -- Compares an NCEModule on which multicuda(1, 2) was called against a
+   -- plain :cuda() clone of the same module: outputs, input gradients,
+   -- updated parameters and momentum buffers must all match.
+   -- The test returns silently (is skipped) when torchx or cunn cannot be
+   -- loaded or when fewer than two CUDA devices are reported.
+   if not pcall(function() require 'torchx' end) then
+      return
+   end
+   if not pcall(function() require 'cunn' end) then
+      return
+   end
+   if cutorch.getDeviceCount() < 2 then
+      return
+   end
+   assert(torchx.version and torchx.version >= 1, "Update torchx")
+
+   local nclass = 1000
+   local hiddensize = 20
+   local batchsize = 5
+   local k = 25
+   local unigrams = torch.Tensor(nclass):uniform(0,1)
+   -- fixed noise indices, shared by both modules through noiseSample below
+   local noise = torch.LongTensor(batchsize, k):random(1,nclass)
+
+   local crit = nn.NCECriterion():cuda()
+   local crit2 = nn.NCECriterion():cuda()
+
+   local nce = nn.NCEModule(hiddensize, nclass, k, unigrams)
+   -- randomly pick one of the two batchnoise settings for this run
+   nce.batchnoise = math.random() < 0.5
+
+   -- make it deterministic
+   -- (replaces the sampler with one that copies the first `batchsize` rows
+   -- of the pre-drawn `noise` tensor)
+   nce.noiseSample = function(self, sampleidx, batchsize, k)
+      sampleidx:resize(batchsize, k)
+      sampleidx:copy(noise:narrow(1,1,batchsize))
+      return sampleidx
+   end
+
+   -- nce2 is the reference: cloned before multicuda() is applied to nce
+   local nce2 = nce:clone()
+   nce2:cuda()
+
+   local input = torch.randn(batchsize, hiddensize):cuda()
+   local target = torch.LongTensor(batchsize):random(1,nclass):cuda()
+
+   nce:multicuda(1, 2)
+
+   -- forward/backward through the multicuda module
+   local output = nce:forward{input, target}
+   local loss = crit:forward(output, target)
+   local gradOutput = crit:backward(output, target)
+   nce:zeroGradParameters()
+   local gradInput = nce:backward({input, target}, gradOutput)
+
+   -- same pass through the reference module
+   local output2 = nce2:forward{input, target}
+   local loss2 = crit2:forward(output2, target)
+   local gradOutput2 = crit2:backward(output2, target)
+   nce2:zeroGradParameters()
+   local gradInput2 = nce2:backward({input, target}, gradOutput2)
+
+   -- the module output is a table; its first four entries are compared
+   mytester:assertTensorEq(output[1], output2[1], 0.00001)
+   mytester:assertTensorEq(output[2], output2[2], 0.00001)
+   mytester:assertTensorEq(output[3], output2[3], 0.00001)
+   mytester:assertTensorEq(output[4], output2[4], 0.00001)
+
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.00001)
+   mytester:assertTensorEq(gradInput[2], gradInput2[2], 0.00001)
+
+
+   nce2:updateParameters(0.1)
+   nce:updateParameters(0.1)
+
+   -- After multicuda(), nce.weight and nce.gradWeight expose a `tensors`
+   -- list. Entry 1 is compared with columns 1..hiddensize/2 of the reference
+   -- and entry 2 with the remaining columns.
+   -- NOTE(review): presumably one entry per device -- confirm in NCEModule.
+   mytester:assertTensorEq(nce2.bias, nce.bias, 0.000001)
+   mytester:assertTensorEq(nce2.gradBias, nce.gradBias, 0.000001)
+   mytester:assertTensorEq(nce2.weight[{{},{1,hiddensize/2}}]:float(), nce.weight.tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.weight[{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.weight.tensors[2]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1,hiddensize/2}}]:float(), nce.gradWeight.tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.gradWeight.tensors[2]:float(), 0.000001)
+
+   -- test momentum
+   nce2:updateGradParameters(0.9)
+   nce:updateGradParameters(0.9)
+
+   -- momGradParams[1] is compared column-half by column-half, like the weight
+   mytester:assertTensorEq(nce2.gradBias, nce.gradBias, 0.000001)
+   mytester:assertTensorEq(nce2.momGradParams[1][{{},{1,hiddensize/2}}]:float(), nce.momGradParams[1].tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.momGradParams[1][{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.momGradParams[1].tensors[2]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1,hiddensize/2}}]:float(), nce.gradWeight.tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.gradWeight.tensors[2]:float(), 0.000001)
+end
+
+function rnn.test(tests, exclude)
    mytester = torch.Tester()
    mytester:add(rnntest)
    math.randomseed(os.time())
diff --git a/test/test_firemodule.lua b/test/test_firemodule.lua
new file mode 100644
index 0000000..ef01f23
--- /dev/null
+++ b/test/test_firemodule.lua
@@ -0,0 +1,40 @@
+require 'nn'
+require 'dpnn'
+require 'cunn'
+require 'cutorch'
+
+--torch.setdefaulttensortype('torch.FloatTensor')
+
+-- FireModule issue 45
+--[[
+m = nn.Sequential()
+m:add(nn.FireModule(1,1,1,1))
+_, p = m:getParameters()
+print(p:sum())
+
+m = m:cuda()
+_, p = m:getParameters()
+print(p:sum())
+
+m:zeroGradParameters()
+print(p:sum())--]]
+
+
+-- Testing FireModule
+-- Smoke test: runs one forward/backward pass through nn.FireModule on the
+-- CPU, then again after moving the model to GPU 1, printing summary
+-- statistics of the input gradient each time. Nothing is asserted; the
+-- output is meant to be inspected by hand.
+-- All variables are local so the script does not leak globals.
+
+-- Prints mean, std, min and max of a tensor on one line.
+local function printStats(t)
+   print(t:mean(), t:std(), t:min(), t:max())
+end
+
+local input = torch.rand(1, 3, 6, 6)
+local model = nn.FireModule(3, 1, 1, 1, 'Tanh')
+print(model)
+print(model.module)
+-- The returned flat tensors are not used below; the call is kept because the
+-- original script makes it before the forward pass.
+local _parameters, _gradParameters = model:getParameters()
+local output = model:forward(input)
+local grads = torch.rand(output:size())
+local gi = model:backward(input, grads)
+printStats(gi)
+
+-- Same pass on the GPU
+cutorch.setDevice(1)
+model:cuda()
+print(model.module.modules[1].finput)
+local cinput = input:cuda()
+output = model:forward(cinput)
+-- reuse the CUDA copy of the input instead of converting it a second time
+gi = model:backward(cinput, grads:cuda())
+printStats(gi)
diff --git a/tutorials/ladder.md b/tutorials/ladder.md
new file mode 100644
index 0000000..591a21b
--- /dev/null
+++ b/tutorials/ladder.md
@@ -0,0 +1,107 @@
+# Lateral Connections in Denoising Autoencoders Support Supervised Learning
+
+In this tutorial we will see how to implement a ladder network as explained in [[1](http://arxiv.org/pdf/1504.08215.pdf)]. In this paper the authors show how unsupervised learning using a denoising autoencoder with lateral connections helps improve the classification accuracy in supervised learning.
+
+To reproduce the results reported in the paper, please run the following command (the best test error we got was **`0.6%`**). To run this script you will need the following torch packages: [`nn`](https://github.com/torch/nn), [`nngraph`](https://github.com/torch/nngraph), [`dp`](https://github.com/nicholas-leonard/dp), [`dpnn`](https://github.com/Element-Research/dpnn), [`optim`](https://github.com/torch/optim) and, if using CUDA (```--useCuda``` flag), [`cunn`](https://github.com/torch/cunn) & [`cutorch`](https://github.com/torch/cutorch).
+```
+   th tutorials/ladder.lua --verbose --eta 500 --epochs 100 --learningRate 0.002 --linearDecay --endLearningRate 0 --startEpoch 50 --useCuda --deviceId 1 --noiseSigma 0.3 --useBatchNorm --batchSize 100 --adam --noValidation --attempts 10
+```
+
+The unsupervised learning (denoising) task supplements the supervised learning task (classification in this case). As in autoencoders this network has an encoder and a decoder. The output of encoder is also used for classification. The output of encoder is **`N`** dimensional where **`N`** is number of classes. This **`N`** dimensional vector is used for computing classification cost as well as feeds into the decoder.
+
+## Classification
+Encoder/classifier units are defined as
+```lua
+   Z = nn.BatchNormalization(hidden_units)(nn.Linear(inputDims, hidden_units)(previous_H))
+```
+where
+```lua
+   H = nn.ReLU()(nn.CMul()(nn.Add()(Z)))
+```
+For first layer **`previous_H`** is the corrupted input.
+```lua
+   input = nn.WhiteNoise(mean, sigma)
+```
+
+**`H`** for last encoder unit is defined as
+```lua
+   H = nn.LogSoftMax()(nn.CMul()(nn.Add()(Z)))
+```
+Last **`H`** feeds into the negative log likelihood criterion.
+
+## Denoising
+Typically, in a denoising autoencoder the input samples are corrupted using Dropout [```nn.Dropout```](https://github.com/torch/nn/blob/master/Dropout.lua), but in this paper the authors use isotropic Gaussian noise [```nn.WhiteNoise```](https://github.com/Element-Research/dpnn/blob/master/WhiteNoise.lua) with zero mean.
+
+### Lateral Connections in Autoencoder
+**`Z`** units in the encoder are laterally connected to the corresponding units in the decoder. The output of the decoder unit for neuron `i` is defined by
+```
+   z^_i = a_i1 * z_i + a_i2 * sigmoid(a_i3 * z_i + a_i4) + a_i5
+```
+where 
+```
+   a_ij = c_ij * u_i + d_ij
+```
+**`U`** is the output of the decoder unit's ```nn.Linear()```. For the topmost layer **`U`** is zero. **`Z`** is the output of the corresponding encoder unit (this is the lateral connection: the decoder takes the output of its previous unit through **`U`** as well as the output of the corresponding encoder unit). For the lowest layer of the decoder **`Z`** is the corrupted input signal. **`c_j`** and **`d_j`** are trainable weight vectors (their elements are the `c_ij` and `d_ij` above). This forms the crux of the ladder network. It can be easily implemented using **`nngraph`** as follows.
+
+For the topmost layer **`U`**`= 0` and **`Z`** is the batch normalized output from the corresponding (in this case last) encoder/classifier unit. **`Z^`** for topmost layer is defined as
+```lua
+   z_hat1 = nn.CMul(hiddens[i])(Z)
+   z_hat2 = nn.CMul(hiddens[i])(Z)
+   z_hat3 = nn.CMul(hiddens[i])(Z)
+   z_hat34 = nn.Add(hiddens[i])(z_hat3)
+   z_hatSigmoid34 = nn.Sigmoid()(z_hat34)
+   z_hat234 = nn.CMulTable()({z_hat2, z_hatSigmoid34})
+   z_hat5 = nn.CMul(hiddens[i])(Z)
+
+   -- Z_hat = z^
+   Z_hat = nn.CAddTable()({z_hat1, z_hat234, z_hat5})
+```
+
+For lower decoder units **`Z^`** is defined as
+```lua
+   
+      u = nn.Linear()(previous_Z_hat)
+
+      cu1 = nn.CMul(hidden_units)(u)
+      du1 = nn.Add(hidden_units)(u)
+      a1 = nn.CAddTable()({cu1, du1})
+      cu2 = nn.CMul(hidden_units)(u)
+      du2 = nn.Add(hidden_units)(u)
+      a2 = nn.CAddTable()({cu2, du2})
+      cu3 = nn.CMul(hidden_units)(u)
+      du3 = nn.Add(hidden_units)(u)
+      a3 = nn.CAddTable()({cu3, du3})
+      cu4 = nn.CMul(hidden_units)(u)
+      du4 = nn.Add(hidden_units)(u)
+      a4 = nn.CAddTable()({cu4, du4})
+      cu5 = nn.CMul(hidden_units)(u)
+      du5 = nn.Add(hidden_units)(u)
+      a5 = nn.CAddTable()({cu5, du5})
+
+      z_hat1 = nn.CMulTable()({a1, z})
+      z_hat2 = nn.CMulTable()({a3, z})
+      z_hat3 = nn.Sigmoid()(nn.CAddTable()({z_hat2, a4}))
+      z_hat4 = nn.CMulTable()({a2, z_hat3})
+      Z_hat = nn.CAddTable()({z_hat1, z_hat4, a5})
+```
+`Z_hat` is `z^`. Final `Z_hat` is the output of decoder and feeds into the mean squared error criterion.
+
+## Criterions
+Negative log likelihood criterion is used for classification task.
+```lua
+   nll = nn.ClassNLLCriterion()
+```
+Mean squared error is used for the auxiliary task.
+```lua
+   mse = nn.MSECriterion()
+```
+These two training criterions are combined using `eta`, which determines the weight of the auxiliary task. If `eta` is zero then the model is trained for classification only.
+Combined criterion
+```lua
+   criterions = nn.ParallelCriterion()
+   criterions:add(nll)
+   criterions:add(mse, eta)
+```
+
+## References
+[1] Rasmus, Antti, Harri Valpola, and Tapani Raiko. "Lateral Connections in Denoising Autoencoders Support Supervised Learning." arXiv preprint arXiv:1504.08215 (2015).
diff --git a/tutorials/ladder_network/ladder.lua b/tutorials/ladder_network/ladder.lua
new file mode 100644
index 0000000..5e556cf
--- /dev/null
+++ b/tutorials/ladder_network/ladder.lua
@@ -0,0 +1,444 @@
+--[[!
+   Implementation of ladder as mentioned in http://arxiv.org/pdf/1504.08215.pdf
+--]]
+
+require 'nn'
+require 'dp'
+require 'dpnn'
+require 'math'
+require 'xlua'
+require 'optim'
+require 'nngraph'
+
+-- Cuda
+require 'cutorch'
+require 'cunn'
+
+-- Help functions
+require 'ladder_help_funcs'
+
+torch.setdefaulttensortype("torch.FloatTensor")
+op = xlua.OptionParser('%prog [options]')
+
+-- Data
+op:option{'--noValidation', action='store_true', dest='noValidation',
+          help='Use validation data for training as well.', default=false}
+op:option{'--best', action='store_true', dest='best',
+          help='Use best training or validation model.', default=false}
+
+-- Model parameters
+op:option{'--noOfClasses', action='store', dest='noOfClasses',
+          help='Number of classes.', default=10} -- MNIST data
+op:option{'--noiseSigma', action='store', dest='noiseSigma',
+          help='Stdev for noise for denoising autoencoder (Mean is zero).',
+          default=0}
+op:option{'--hiddens', action='store', dest='hiddens',
+          help='Hiddens units', default='{1000, 500, 250, 250, 250}'}
+op:option{'--useBatchNorm', action='store_true', dest='useBatchNorm',
+          help='Use batch normalization.', default=false}
+op:option{'--weightTied', action='store_true', dest='weightTied',
+          help='Tie weights of decoder with encoder.', default=false}
+
+-- Criterion and learning
+op:option{'--attempts', action='store', dest='attempts',
+          help='Run attempts independent experiments.', default=1}
+op:option{'--eta', action='store', dest='eta',
+          help='If zero then only classifier cost is considered.', default=0}
+op:option{'--batchSize', action='store', dest='batchSize',
+          help='Batch Size.',default=32}
+op:option{'--epochs', action='store', dest='epochs',
+          help='Number of epochs.',default=100}
+op:option{'--maxTries', action='store', dest='maxTries',
+          help='Number of tries for stopping.',default=0}
+op:option{'--learningRate', action='store', dest='learningRate',
+          help='Learning rate',default=0.002}
+op:option{'--learningRateDecay', action='store', dest='learningRateDecay',
+          help='Learning rate decay',default=1e-7}
+op:option{'--linearDecay', action='store_true', dest='linearDecay',
+          help='Linearly reduce learning rate', default=false}
+op:option{'--startEpoch', action='store', dest='startEpoch',
+          help='Epoch number when to start linear decay.',default=1}
+op:option{'--endLearningRate', action='store', dest='endLearningRate',
+          help='Learning rate at last epoch',default=0.0}
+op:option{'--momentum', action='store', dest='momentum',
+          help='Learning Momemtum',default=0}
+op:option{'--loss', action='store_true', dest='loss',
+          help='If true use loss for early stopping else confusion matrix.',
+          default=false}
+op:option{'--adam', action='store_true', dest='adam',
+          help='Use adaptive moment estimation optimizer.', default=false}
+
+-- Use Cuda
+op:option{'--useCuda', action='store_true', dest='useCuda', help='Use GPU',
+          default=false}
+op:option{'--deviceId', action='store', dest='deviceId', help='GPU device Id',
+          default=2}
+
+-- Print debug messages
+op:option{'--verbose', action='store_true', dest='verbose',
+          help='Print apppropriate debug messages.', default=false}
+
+-- Command line arguments
+opt = op:parse()
+op:summarize()
+
+-- Data
+noValidation = opt.noValidation
+best = opt.best
+verbose = opt.verbose
+
+   -- Cuda
+useCuda = opt.useCuda
+deviceId = tonumber(opt.deviceId)
+
+-- MNIST Data source
+ds = dp.Mnist{}
+
+attempts = tonumber(opt.attempts)
+testAccus = torch.zeros(attempts)
+trData = {}
+tvData = {}
+tsData = {}
+for attempt=1,attempts do
+
+   local t1, t2
+
+   trData.data, t1, t2 = ds:get('train', 'input', 'bchw', 'float')
+   trData.labels, t1, t2 = ds:get('train', 'target')
+   trData.size = function() return trData.data:size()[1] end
+
+   tvData.data, t1, t2 = ds:get('valid', 'input', 'bchw', 'float')
+   tvData.labels, t1, t2 = ds:get('valid', 'target')
+   tvData.size = function() return tvData.data:size()[1] end
+
+   tsData.data, t1, t2 = ds:get('test', 'input', 'bchw', 'float')
+   tsData.labels, t1, t2 = ds:get('test', 'target')
+   tsData.size = function() return tsData.data:size()[1] end
+   collectgarbage()
+
+   local tempSample = trData.data[1]
+   local channels = tempSample:size(1)
+   local width = tempSample:size(2)
+   local height = tempSample:size(3)
+   local linFeats = channels * height * width
+
+   -- MNIST
+   local classes = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '10'}
+   local confusion = optim.ConfusionMatrix(classes)
+
+   -- Model
+   local noOfClasses = tonumber(opt.noOfClasses)
+   local noiseSigma = tonumber(opt.noiseSigma)
+   local inputHiddens = dp.returnString(opt.hiddens)
+   local useBatchNorm = opt.useBatchNorm
+   local weightTied = opt.weightTied
+
+
+   hiddens = {linFeats}
+   for i=1,#inputHiddens do
+      hiddens[#hiddens+1] = inputHiddens[i]
+   end
+   hiddens[#hiddens+1] = noOfClasses
+
+   -- encoder input
+   local input = nil
+   if noiseSigma ~= 0 then
+      if verbose then print("Add noise to the samples.") end
+      input = nn.WhiteNoise(0, noiseSigma)()
+   else
+      input = nn.Identity()()
+   end
+
+   -- encoder model
+   local encoderLayers = {}
+   local Zs = {}
+   Zs[1] = input
+   local Hs = {}
+   Hs[1] = input
+   for i=2,#hiddens do
+      -- Zs
+      encoderLayers[i] = nn.Linear(hiddens[i-1], hiddens[i])
+      if useBatchNorm then
+         Zs[i] = nn.BatchNormalization(hiddens[i])
+                                      (encoderLayers[i](Hs[i-1]))
+      else
+         Zs[i] = encoderLayers[i](Hs[i-1])
+      end
+     
+      -- Hs
+      if i==#hiddens then
+         Hs[i] = nn.CMul(hiddens[i])(nn.Add(hiddens[i])(Zs[i]))
+      else
+         Hs[i] = nn.ReLU()(nn.CMul(hiddens[i])(nn.Add(hiddens[i])(Zs[i])))
+      end
+   end
+
+   -- classifier
+   local classifier = nn.LogSoftMax()(Hs[#Hs])
+
+   -- Decoder
+   -- Built from the top layer (i == #hiddens) down to the input layer
+   -- (i == 1). Z_hats[i] is the reconstruction node for layer i; it combines
+   -- the lateral input Zs[i] with the signal u coming from Z_hats[i+1].
+   -- All intermediate graph nodes are locals so nothing leaks into the global
+   -- environment between layers or attempts.
+   local decoderLayers = {}
+   local Z_hats = {}
+
+   -- Returns the node CMul(size)(u) + Add(size)(u), i.e. a = c * u + d with
+   -- trainable c and d. Modules are created in the order CMul, Add,
+   -- CAddTable, matching the original hand-unrolled code.
+   local function affine(u, size)
+      local cu = nn.CMul(size)(u)
+      local du = nn.Add(size)(u)
+      return nn.CAddTable()({cu, du})
+   end
+
+   for i=#hiddens,1,-1 do
+
+      -- u = 0 hence no cij
+      if i==#hiddens then
+         local z_hat1 = nn.CMul(hiddens[i])(Zs[i])
+         local z_hat2 = nn.CMul(hiddens[i])(Zs[i])
+         local z_hat3 = nn.CMul(hiddens[i])(Zs[i])
+         local z_hat34 = nn.Add(hiddens[i])(z_hat3)
+         local z_hatSigmoid34 = nn.Sigmoid()(z_hat34)
+         local z_hat234 = nn.CMulTable()({z_hat2, z_hatSigmoid34})
+         local z_hat5 = nn.CMul(hiddens[i])(Zs[i])
+         Z_hats[i] = nn.CAddTable()({z_hat1, z_hat234, z_hat5})
+      else
+         decoderLayers[i] = nn.Linear(hiddens[i+1], hiddens[i])
+         if weightTied then
+            if verbose then print("Tying encoder-decoder weights.") end
+            -- NOTE(review): :t() yields a transposed view of the encoder
+            -- weight; confirm the sharing survives model:cuda() and
+            -- model:getParameters() later in the script.
+            decoderLayers[i].weight:set(encoderLayers[i+1].weight:t())
+            decoderLayers[i].gradWeight:set(encoderLayers[i+1].gradWeight:t())
+         end
+
+         local u = decoderLayers[i](Z_hats[i+1])
+
+         -- a_k = c_k * u + d_k for k = 1..5
+         local a1 = affine(u, hiddens[i])
+         local a2 = affine(u, hiddens[i])
+         local a3 = affine(u, hiddens[i])
+         local a4 = affine(u, hiddens[i])
+         local a5 = affine(u, hiddens[i])
+
+         -- z_hat = a1 * z + a2 * sigmoid(a3 * z + a4) + a5
+         local z_hat1 = nn.CMulTable()({a1, Zs[i]})
+         local z_hat2 = nn.CMulTable()({a3, Zs[i]})
+         local z_hat3 = nn.Sigmoid()(nn.CAddTable()({z_hat2, a4}))
+         local z_hat4 = nn.CMulTable()({a2, z_hat3})
+         Z_hats[i] = nn.CAddTable()({z_hat1, z_hat4, a5})
+      end
+   end
+   local model = nn.gModule({input}, {classifier, Z_hats[1]--[[Decoder--]]})
+   if verbose then print(model) end
+
+   -- Criterion and learning
+   -- Criterion
+   local eta = tonumber(opt.eta)
+   local criterions = nn.ParallelCriterion()
+   local nll = nn.ClassNLLCriterion()
+   local mse = nn.MSECriterion()
+   criterions:add(nll)
+   criterions:add(mse, eta)
+
+   -- Learning
+   local batchSize = tonumber(opt.batchSize)
+   local epochs = tonumber(opt.epochs)
+   local maxTries = tonumber(opt.maxTries)
+   local learningRate = tonumber(opt.learningRate)
+   local learningRateDecay = tonumber(opt.learningRateDecay)
+   local linearDecay = opt.linearDecay
+   local startEpoch = tonumber(opt.startEpoch)
+   local endLearningRate = tonumber(opt.endLearningRate)
+   assert(epochs > startEpoch, "startEpoch should be smaller than epochs.")   
+
+   -- Per-epoch learning-rate schedule used when --linearDecay is set:
+   -- entries 1..startEpoch hold learningRate, followed by a linear ramp from
+   -- learningRate to endLearningRate with (epochs - startEpoch + 1) points.
+   -- The ramp length no longer depends on endLearningRate. The previous
+   -- torch.range step (-learningRate/(epochs-startEpoch)) ignored it, so any
+   -- non-zero endLearningRate gave a schedule shorter than `epochs` and
+   -- learningRates[i] indexed out of range in late epochs. For
+   -- endLearningRate == 0 the values are the same as before.
+   -- Declared local so the schedule does not leak between attempts.
+   local learningRates
+   if linearDecay then
+      if verbose then print("Using linear decay.") end
+      -- the assert above guarantees epochs > startEpoch, so decaySteps >= 1
+      local decaySteps = epochs - startEpoch
+      local constantPart = torch.zeros(startEpoch):fill(learningRate)
+      local ramp = torch.linspace(learningRate, endLearningRate,
+                                  decaySteps + 1)
+      learningRates = torch.cat(constantPart, ramp)
+   end
+
+   local momentum = tonumber(opt.momentum)
+   local loss = opt.loss
+   local adam = opt.adam
+
+   -- Optimizer
+   local optimState = {
+                       coefL1 = 0,
+                       coefL2 = 0,
+                       learningRate = learningRate,
+                       weightDecay = 0.0,
+                       momentum = momentum,
+                       learningRateDecay = learningRateDecay
+                      }
+
+   -- If true use Adaptive moment estimation else SGD.
+   if adam then
+      if verbose then print("Using Adaptive moment estimation optimizer.") end
+      optimMethod = optim.adam
+   else
+      if verbose then print("Using Stocastic gradient descent optimizer.") end
+      optimMethod = optim.sgd
+   end
+   if verbose then
+      print(optimMethod)
+      print(optimState)
+   end
+
+
+   if useCuda then
+      if verbose then print("Using GPU: "..deviceId) end
+      cutorch.setDevice(deviceId)
+      if verbose then print("GPU set") end
+      model:cuda()
+      if verbose then print("Model copied to GPU.") end
+      criterions:cuda()
+      if verbose then print("Criterion copied to GPU.") end
+   else
+      if verbose then print("Not using GPU.") end
+   end
+
+   -- Retrieve parameters and gradients
+   parameters, gradParameters = model:getParameters()
+
+   -- Reshape samples from images to vectors
+   trData.data = trData.data:reshape(trData.size(1), linFeats)
+   tvData.data = tvData.data:reshape(tvData.size(1), linFeats)
+   tsData.data = tsData.data:reshape(tsData.size(1), linFeats)
+   collectgarbage()
+
+   if noValidation then
+      trData.data = torch.cat(trData.data, tvData.data, 1)
+      trData.labels = torch.cat(trData.labels, tvData.labels, 1)
+      tvData.data = nil
+      tvData.labels = nil
+      collectgarbage()
+   end
+
+   if verbose then
+      print(trData)
+      print(tvData)
+      print(tsData)
+   end
+
+   -- Training
+   local displayProgress = verbose
+   local classifierIndx = 1
+   local trainAccu = 0
+   local validAccu = 0
+   local bestTrainAccu = 0
+   local bestValidAccu = 0
+   local trainLoss = 0
+   local validLoss = 0
+   local bestTrainLoss = math.huge
+   local bestValidLoss = math.huge
+   local bestTrainModel = nn.Sequential()
+   local bestValidModel = nn.Sequential()
+   local earlyStopCount = 0
+   for i=1, epochs do
+      if linearDecay then
+         optimState.learningRate = learningRates[i]
+      end
+      -- Training
+      -- Runs one training epoch and returns its loss.
+      -- The last argument used to be the misspelled `classiferIndx`, an
+      -- undefined global, so nil was passed instead of the local
+      -- classifierIndx declared above.
+      -- NOTE(review): trainLogger is not defined anywhere in this script, so
+      -- nil is passed for it -- confirm the helper tolerates that.
+      trainLoss = model_train_multi_criterion(model, criterions,
+                                              parameters, gradParameters, trData,
+                                              optimMethod, optimState, batchSize,
+                                              i, confusion, trainLogger,
+                                              useCuda, displayProgress,
+                                              classifierIndx)
+      confusion:updateValids()
+      if loss then
+         if verbose then
+            print("Current train loss: ".. trainLoss
+                     ..", best train loss: " .. bestTrainLoss)
+         end
+         if trainLoss < bestTrainLoss then
+            bestTrainLoss = trainLoss
+            bestTrainModel = model:clone()
+            print(confusion)
+         end
+      else -- Using classification accuracy for saving best train model
+         trainAccu = confusion.totalValid * 100
+         if bestTrainAccu < trainAccu then
+            bestTrainAccu = trainAccu
+            bestTrainModel = model:clone()
+            bestTrainLoss = trainLoss
+         end
+         if verbose then
+            print("Current train accu: ".. trainAccu
+                     ..", best train accu: " .. bestTrainAccu
+                     ..", best train loss: " .. bestTrainLoss)
+         end
+      end
+
+      -- Validating
+      if not noValidation then
+         validLoss = model_test_multi_criterion(model, criterions,
+                                                tvData, confusion,
+                                                useCuda, classifierIndx)
+         confusion:updateValids()
+         if loss then
+            if verbose then
+               print("Current valid loss: ".. validLoss
+                        ..", best valid loss: " .. bestValidLoss)
+            end
+            if validLoss < bestValidLoss then
+               earlyStopCount = 0
+               bestValidLoss = validLoss
+               bestValidModel = model:clone()
+               print(confusion)
+            else
+               earlyStopCount = earlyStopCount + 1
+            end
+         else
+            validAccu = confusion.totalValid * 100
+            if bestValidAccu < validAccu then
+               earlyStopCount = 0
+               bestValidAccu = validAccu
+               bestValidModel = model:clone()
+               bestValidLoss = validLoss
+            else
+               earlyStopCount = earlyStopCount + 1
+            end
+            if verbose then
+               print("Current valid accu: ".. validAccu
+                     ..", best valid accu: " .. bestValidAccu
+                     ..", best valid loss: " .. bestValidLoss)
+            end
+         end
+         if verbose then
+            print(noiseSigma, weightTied, useBatchNorm, eta, earlyStopCount)
+         end
+      end
+
+      if maxTries ~= 0 then
+         if earlyStopCount >= maxTries then
+            if verbose then print("Early stopping at epoch: " .. i) end
+            break
+         end
+      end
+   end
+
+   -- Testing
+   if best then
+      if noValidation then
+         testLoss = model_test_multi_criterion(bestTrainModel, criterions,
+                                               tsData, confusion,
+                                               useCuda, classifierIndx)
+      else
+         testLoss = model_test_multi_criterion(bestValidModel, criterions,
+                                               tsData, confusion,
+                                               useCuda, classifierIndx)
+      end
+   else
+      testLoss = model_test_multi_criterion(model, criterions,
+                                            tsData, confusion,
+                                            useCuda, classifierIndx)
+   end
+   confusion:updateValids()
+   testAccu = confusion.totalValid * 100
+   testAccus[attempt] = testAccu
+   if verbose then
+      print("Attempt: " .. tostring(attempt) .. " Test Accu: " .. testAccu)
+   end
+end
+print("Test accuracies.")
+print(testAccus)
+print("Max Test Error is: " .. tostring(100 - testAccus:max()) .. "%")
diff --git a/tutorials/ladder_network/ladder_help_funcs.lua b/tutorials/ladder_network/ladder_help_funcs.lua
new file mode 100644
index 0000000..e6fe25e
--- /dev/null
+++ b/tutorials/ladder_network/ladder_help_funcs.lua
@@ -0,0 +1,220 @@
+require 'csvigo'
+require 'string'
+require 'xlua'
+require 'lfs'
+
+--- Run one training epoch over `trainData` in shuffled mini-batches.
+--
+-- Each mini-batch is copied into a Float (or Cuda, when `useCuda` is set)
+-- buffer, pushed through `model`, and scored by `criterions` against the
+-- target table `{labels, inputs}`. Gradients are divided by the number of
+-- samples in the batch, optionally extended with L1/L2 penalties taken from
+-- `optimState.coefL1` / `optimState.coefL2`, and handed to `optimMethod`.
+--
+-- @param model           network; `model:forward` must return a table whose
+--                        entry `classifierIndx` is fed to the confusion matrix
+-- @param criterions      criterion called as `forward(outputs, {labels, inputs})`
+-- @param parameters      flattened parameter tensor of `model`
+-- @param gradParameters  flattened gradient tensor of `model`
+-- @param trainData       table with `data`, `labels` and a `size()` function
+--                        (called without `self`)
+-- @param optimMethod     optimiser called as `optimMethod(feval, parameters, optimState)`
+-- @param optimState      optimiser state; `coefL1` / `coefL2` default to 0
+-- @param batchSize       number of samples per mini-batch
+-- @param epoch           epoch number, only used for printing (default 1)
+-- @param confusion       confusion matrix; zeroed, then updated per batch
+-- @param trainLogger     optional logger receiving the train accuracy
+-- @param useCuda         when truthy, batches are staged in CudaTensors
+-- @param displayProgress when truthy, show an xlua progress bar
+-- @param classifierIndx  index of the classifier output (default 1)
+-- @return sum of the un-normalised criterion values divided by `trainData.size()`
+
+-- File-level scratch references to the tensors last passed to the
+-- confusion matrix (kept at file scope as in the original layout).
+local conTargets, conOutputs
+function model_train_multi_criterion(model, criterions, parameters,
+                                     gradParameters, trainData,
+                                     optimMethod, optimState, batchSize,
+                                     epoch, confusion, trainLogger,
+                                     useCuda, displayProgress, classifierIndx)
+
+   model:training()
+   confusion:zero()
+   local displayProgress = displayProgress or false
+   local classifierIndx = classifierIndx or 1
+
+   -- epoch tracker
+   local epoch = epoch or 1
+
+   local totalLoss = 0
+   local nSamples = trainData.size()
+
+   -- shuffle at each epoch
+   local shuffle = torch.randperm(nSamples)
+
+   local sampleSize = trainData.data[1]:size()
+   -- 1-D labels hold one scalar per sample; otherwise each label is a vector.
+   local isScalar = trainData.labels:size():size() == 1
+
+   print("Doing epoch on training data:")
+   print("Online epoch # " .. epoch .. " [batchSize = " .. batchSize .. "]")
+
+   local time = sys.clock()
+
+   -- CPU staging buffer for the targets of one mini-batch
+   local targets
+   if isScalar then
+      targets = torch.Tensor(batchSize)
+   else
+      targets = torch.Tensor(batchSize, trainData.labels[1]:size()[1])
+   end
+
+   -- CPU staging buffer for the samples of one mini-batch
+   local inputs
+   local sizeLen = sampleSize:size()
+   if sizeLen == 1 then
+      inputs = torch.Tensor(batchSize, sampleSize[1])
+   elseif sizeLen == 2 then
+      inputs = torch.Tensor(batchSize, sampleSize[1], sampleSize[2])
+   elseif sizeLen == 3 then
+      inputs = torch.Tensor(batchSize, sampleSize[1], sampleSize[2],
+                                       sampleSize[3])
+   else
+      -- Fail here instead of crashing later on a nil `inputs` buffer.
+      error("Invalid Sample Size")
+   end
+
+   local trainInputs = useCuda and torch.CudaTensor() or torch.FloatTensor()
+   local trainTargets = useCuda and torch.CudaTensor() or torch.FloatTensor()
+   local criterionTargets
+
+   for t = 1, nSamples, batchSize do
+      if displayProgress then xlua.progress(t, nSamples) end
+
+      -- create mini batch (the last one may hold fewer than batchSize samples)
+      local last = math.min(t + batchSize - 1, nSamples)
+      local count = last - t + 1
+      for i = t, last do
+         local slot = i - t + 1
+         inputs[slot] = trainData.data[shuffle[i]]
+         targets[slot] = trainData.labels[shuffle[i]]
+      end
+
+      local inputs_ = inputs[{{1, count}}]
+      trainInputs:resize(inputs_:size()):copy(inputs_)
+
+      local targets_ = targets[{{1, count}}]
+      trainTargets:resize(targets_:size()):copy(targets_)
+
+      criterionTargets = {trainTargets, trainInputs}
+
+      -- create closure to evaluate F(X) and df/dX
+      local feval = function(x)
+         -- Get new parameters
+         if x ~= parameters then
+            parameters:copy(x)
+         end
+
+         -- reset gradients
+         gradParameters:zero()
+
+         -- evaluate function for complete mini batch
+         local outputs = model:forward(trainInputs)
+         local f = criterions:forward(outputs, criterionTargets)
+         -- Total Loss (accumulated before per-batch normalisation)
+         totalLoss = totalLoss + f
+
+         local df_do = criterions:backward(outputs, criterionTargets)
+         model:backward(trainInputs, df_do)
+
+         if useCuda then
+            conOutputs = outputs[classifierIndx]:float()
+            conTargets = trainTargets:float()
+         else
+            conOutputs = outputs[classifierIndx]
+            conTargets = trainTargets
+         end
+
+         confusion:batchAdd(conOutputs, conTargets)
+
+         -- Normalize gradients
+         local batchCount = trainInputs:size()[1]
+         gradParameters:div(batchCount)
+         f = f/batchCount
+
+         -- L1/L2 Regularization; a missing coefficient means "disabled"
+         local coefL1 = optimState.coefL1 or 0
+         local coefL2 = optimState.coefL2 or 0
+         if coefL1 ~= 0 or coefL2 ~= 0 then
+            -- Update loss with regularizer
+            f = f + coefL1 * torch.norm(parameters, 1)
+            f = f + coefL2 * torch.norm(parameters, 2)^2/2
+
+            -- Gradients: coefL1 * sign(w) + coefL2 * w. Both coefficients
+            -- come from optimState (the original read the L2 one from an
+            -- unrelated global `opt`).
+            gradParameters:add(coefL1, torch.sign(parameters))
+            gradParameters:add(coefL2, parameters)
+         end
+
+         -- return f and df/dX
+         return f, gradParameters
+      end
+
+      -- optimize on current mini batch # Using SGD/adam
+      optimMethod(feval, parameters, optimState)
+   end
+
+   -- time taken
+   time = sys.clock() - time
+   time = time/nSamples
+   print("\n==> time to learn 1 sample = " .. (time*1000) .. "ms")
+
+   -- Total loss
+   totalLoss = totalLoss/nSamples
+
+   -- update logger
+   if trainLogger ~= nil then
+      -- Refresh the derived accuracy fields before reading totalValid,
+      -- mirroring how the test accuracy is read by the calling script.
+      confusion:updateValids()
+      trainLogger:add{["% mean class accuracy (train set)"] =
+                      confusion.totalValid * 100}
+   end
+   return totalLoss
+end
+
+--- Evaluate `model` on `testData` and return the per-sample loss.
+--
+-- The model is switched to evaluation mode and `confusion` is zeroed, then
+-- filled from output number `classifierIndx` (default 1). The criterion is
+-- always scored against the target table `{labels, inputs}`.
+-- With `useCuda` the data is processed in chunks of 64 samples staged in
+-- CudaTensors; otherwise the whole set goes through in a single forward pass.
+--
+-- @param model          network whose forward output is a table
+-- @param criterions     criterion called as `forward(outputs, {labels, inputs})`
+-- @param testData       table with `data`, `labels` and a `size()` function
+-- @param confusion      confusion matrix, zeroed and updated in place
+-- @param useCuda        truthy to evaluate through CudaTensor buffers
+-- @param classifierIndx index of the classifier output (default 1)
+-- @return accumulated criterion value divided by `testData.size()`
+function model_test_multi_criterion(model, criterions, testData, confusion,
+                                    useCuda, classifierIndx)
+   local startedAt = sys.clock()
+   model:evaluate()
+   confusion:zero()
+   classifierIndx = classifierIndx or 1
+
+   local nSamples = testData.size()
+   local lossSum = 0
+
+   if not useCuda then
+      -- CPU path: one forward pass over the complete data set
+      local allInputs, allTargets = testData.data, testData.labels
+      local outputs = model:forward(allInputs)
+      lossSum = criterions:forward(outputs, {allTargets, allInputs})
+      confusion:batchAdd(outputs[classifierIndx], allTargets)
+   else
+      -- GPU path: fixed-size chunks, reusing the same device buffers
+      local chunk = 64
+      local gpuInputs = torch.CudaTensor()
+      local gpuTargets = torch.CudaTensor()
+      for first = 1, nSamples, chunk do
+         local last = math.min(first + chunk - 1, nSamples)
+         local cpuInputs = testData.data[{{first, last}}]
+         local cpuTargets = testData.labels[{{first, last}}]
+
+         -- Copy input and targets to cuda
+         gpuInputs:resize(cpuInputs:size()):copy(cpuInputs)
+         gpuTargets:resize(cpuTargets:size()):copy(cpuTargets)
+
+         local preds = model:forward(gpuInputs)
+         lossSum = lossSum + criterions:forward(preds,
+                                                {gpuTargets, gpuInputs})
+         -- the confusion matrix is fed CPU-side float predictions and labels
+         confusion:batchAdd(preds[classifierIndx]:float(), cpuTargets)
+      end
+   end
+
+   -- time taken per sample
+   local perSample = (sys.clock() - startedAt) / nSamples
+   print("\n==> time to test 1 sample = " .. (perSample*1000) .. "ms")
+
+   -- Total loss, averaged over the number of samples
+   return lossSum / nSamples
+end
diff --git a/tutorials/lena.jpg b/tutorials/lena.jpg
new file mode 100644
index 0000000..9181d48
Binary files /dev/null and b/tutorials/lena.jpg differ
diff --git a/tutorials/srd1.jpg b/tutorials/srd1.jpg
new file mode 100644
index 0000000..76971ff
Binary files /dev/null and b/tutorials/srd1.jpg differ
diff --git a/tutorials/srd2.jpg b/tutorials/srd2.jpg
new file mode 100644
index 0000000..ffacaff
Binary files /dev/null and b/tutorials/srd2.jpg differ