Merge pull request #27 from torch/gpu-fixes
Optimizations and GPU fixes
Authored by nicholas-leonard on May 12, 2017
Commit 6ca4c57, merging 2 parents: 38a6e59 + e5508b7
Showing 13 changed files with 443 additions and 373 deletions.
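Two changes recur throughout the diff below. First, reusable per-step buffers such as self.gates are no longer zeroed on every call: the element count is captured before the resize, and the buffer is only zero-filled when the resize actually changed its size. Second, index tensors used for argmin and masked selection are allocated as torch.CudaLongTensor when the input is a CUDA tensor, so index results stay on the GPU instead of bouncing through a CPU torch.LongTensor. A minimal Lua sketch of the conditional-zeroing pattern (the helper name and arguments are illustrative, not part of the commit):

   -- Resize a reusable buffer, zeroing it only when its storage actually changed.
   local function resizeMaybeZero(buf, rows, cols)
      local nElement = buf:nElement()    -- element count before the resize
      buf:resize(rows, cols)
      if nElement ~= rows * cols then    -- size changed, so contents are stale garbage
         buf:zero()
      end
      return buf
   end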
58 changes: 30 additions & 28 deletions Kmeans.lua
@@ -25,7 +25,7 @@ end

-- Reset
function Kmeans:reset(stdev)
local stdev = stdev or 1
stdev = stdev or 1
self.weight:uniform(-stdev, stdev)
end

@@ -55,7 +55,7 @@ function Kmeans:initKmeansPlus(input, p)
local inputDim = input:nDimension()
assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
local noOfSamples = input:size(1)

local pcount = math.ceil((1-self.p)*noOfSamples)
if pcount <= 0 then pcount = 1 end

@@ -71,13 +71,13 @@ function Kmeans:initKmeansPlus(input, p)
distances:resize(noOfSamples):fill(math.huge)
local maxScores = self.weight.new()
local maxIndx = self.weight.new()

for k=initializedK, self.k do
clusters = self.weight[{{initializedK-1, initializedK-1}}]
for i=1, noOfSamples do
temp:expand(input[{{i}}], 1, self.dim)
expandedSample:resize(temp:size()):copy(temp)

-- Squared Euclidean distance
expandedSample:add(-1, clusters)
clusterDistances:norm(expandedSample, 2, 2)
@@ -135,20 +135,20 @@ function Kmeans:updateOutput(input)
self._clusterDistances:resize(self.k, batchSize)

self._minScore = self._minScore or self.weight.new()
self._minIndx = self._minIndx or torch.LongTensor()
self._minIndx = self._minIndx or (torch.isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor())
self._minScore:min(self._minIndx, self._clusterDistances, 1)
self._minIndx:resize(batchSize)

self.output:resize(batchSize):copy(self._minIndx)
self.loss = self._minScore:sum()
return self.output

return self.output
end

-- Kmeans has its own criterion hence gradInput are zeros
function Kmeans:updateGradInput(input, gradOuput)
self.gradInput:resize(input:size()):zero()

return self.gradInput
end

@@ -165,41 +165,43 @@ function Kmeans:accGradParameters(input, gradOutput, scale)
self._cscAdder:resize(batchSize):fill(1)
self.clusterSampleCount:zero()
self.clusterSampleCount:indexAdd(1, self._minIndx, self._cscAdder)

-- scale * (x[k]-c[k]) where k is nearest cluster to x
self._gradWeight = self._gradWeight or self.gradWeight.new()
self._gradWeight:index(self.weight, 1, self._minIndx)
self._gradWeight:mul(-1)
self._gradWeight:mul(-1)
self._gradWeight:add(input)
self._gradWeight:mul(-scale)

self._gradWeight2 = self._gradWeight2 or self.gradWeight.new()
self._gradWeight2:resizeAs(self.gradWeight):zero()
self._gradWeight2:indexAdd(1, self._minIndx, self._gradWeight)

-- scale/n * sum_i (x-c)
self._ccounts = self._ccounts or self.clusterSampleCount.new()
self._ccounts:resize(self.k):copy(self.clusterSampleCount)
self._ccounts:add(0.0000001) -- prevent division by zero errors

self._gradWeight2:cdiv(self._ccounts:view(self.k,1):expandAs(self.gradWeight))

self.gradWeight:add(self._gradWeight2)
end

function Kmeans:clearState()
-- prevent premature memory allocations
self._expandedSamples = nil
self._clusterDistances = nil
self._temp = nil
self._tempExpanded = nil
self._tempWeight = nil
self._tempWeightExp = nil
self._expandedWeight = nil
self._minScore = nil
self._minIndx = nil
self._cscAdder = nil
end

function Kmeans:type(type, tensorCache)
if type then
-- prevent premature memory allocations
self._expandedSamples = nil
self._clusterDistances = nil
self._temp = nil
self._tempExpanded = nil
self._tempWeight = nil
self._tempWeightExp = nil
self._expandedWeight = nil
self._minScore = nil
self._minIndx = nil
self._cscAdder = nil
end
self:clearState()
return parent.type(self, type, tensorCache)
end
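Two things stand out in Kmeans.lua: the _minIndx buffer now matches the device of the input (torch.CudaLongTensor on GPU, torch.LongTensor on CPU), and the temporary-buffer cleanup that previously lived inside Kmeans:type() has moved into a dedicated clearState() method, which type() calls before delegating to the parent class. A hedged conversion sketch (assuming the nn.Kmeans(k, dim) constructor from this package and an installed cutorch/cunn):

   require 'rnn'
   require 'cunn'                  -- provides :cuda() conversion and torch.CudaLongTensor
   local km = nn.Kmeans(10, 32)    -- 10 clusters over 32-dimensional samples (assumed signature)
   km:cuda()                       -- Kmeans:type() now runs clearState() first, so stale
                                   -- CPU-side buffers are dropped before the conversion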
4 changes: 2 additions & 2 deletions MaskZeroCriterion.lua
@@ -32,8 +32,8 @@ function MaskZeroCriterion:updateOutput(input, target)
self._oneMask = self._oneMask or self.zeroMask.new()
self._oneMask:lt(self.zeroMask, 1)
-- 1,0,1 -> 1,3
self._indices = self._indices or torch.LongTensor()
self._range = self._range or torch.LongTensor()
self._indices = self._indices or torch.isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor()
self._range = self._range or self._indices.new()
self._range:range(1,self._oneMask:nElement())
self._indices:maskedSelect(self._range, self._oneMask)
-- indexSelect the input
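MaskZeroCriterion gets the same device-aware treatment: _indices becomes a torch.CudaLongTensor for CUDA inputs, and _range is created with _indices.new() so it always shares that type. The surrounding idiom converts a keep-mask into a list of row indices for the later index/maskedSelect calls; a small CPU-side sketch with illustrative values:

   -- Turn a keep-mask (1 = keep, 0 = zero-masked row) into row indices,
   -- matching the "1,0,1 -> 1,3" comment above. Values are illustrative.
   local oneMask = torch.ByteTensor{1, 0, 1}
   local range = torch.LongTensor():range(1, oneMask:nElement())   -- {1, 2, 3}
   local indices = torch.LongTensor()
   indices:maskedSelect(range, oneMask)                            -- indices = {1, 3}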
16 changes: 11 additions & 5 deletions SeqGRU.lua
@@ -97,10 +97,15 @@ function SeqGRU:updateOutput(input)

local h = self.output
h:resize(seqlen, batchsize, outputsize):zero()
self.gates:resize(seqlen, batchsize, 3 * outputsize):zero()

local nElement = self.gates:nElement()
self.gates:resize(seqlen, batchsize, 3 * outputsize)
if nElement ~= seqlen * batchsize * 3 * outputsize then
self.gates:zero()
end

local prev_h = h0
if input.nn.StepGRU_updateOutput and not self.forceLua then
if input.nn and input.nn.StepGRU_updateOutput and not self.forceLua then
for t = 1, seqlen do
local cur_x, next_h, gates = input[t], h[t], self.gates[t]
cur_x.nn.StepGRU_updateOutput(self.weight, self.bias,
@@ -152,7 +157,7 @@ function SeqGRU:backward(input, gradOutput, scale)
self.gradInput:resizeAs(input):zero()

local grad_next_h = self.grad_hT or self.buffer1:zero()
if input.nn.StepGRU_backward and not self.forceLua then
if input.nn and input.nn.StepGRU_backward and not self.forceLua then
for t = seqlen, 1, -1 do
local cur_x, next_h = input[t], h[t]
local prev_h = (t == 1) and self.h0 or h[t - 1]
@@ -184,15 +189,16 @@ function SeqGRU:backward(input, gradOutput, scale)
local u = self.gates[{t, {}, {outputsize + 1, 2 * outputsize}}]
local hc = self.gates[{t, {}, {2 * outputsize + 1, 3 * outputsize}}]

local grad_a = self.grad_a_buffer:resize(batchsize, 3 * outputsize):zero()
local grad_a = self.grad_a_buffer:resize(batchsize, 3 * outputsize)

local grad_ar = grad_a[{{}, {1, outputsize}}]
local grad_au = grad_a[{{}, {outputsize + 1, 2 * outputsize}}]
local grad_ahc = grad_a[{{}, {2 * outputsize + 1, 3 * outputsize}}]

-- use grad_au as temporary buffer to compute grad_ahc.

local grad_hc = grad_au:fill(0):addcmul(grad_next_h, -1, u, grad_next_h)
grad_ahc:fill(1):addcmul(-1, hc,hc):cmul(grad_hc)
grad_ahc:fill(1):addcmul(-1, hc, hc):cmul(grad_hc)
local grad_r = grad_au:fill(0):addmm(grad_ahc, Wh[{{}, {2 * outputsize + 1, 3 * outputsize}}]:t() ):cmul(prev_h)
grad_ar:fill(1):add(-1, r):cmul(r):cmul(grad_r)

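In SeqGRU (and SeqLSTM below) the gates buffer is now zeroed only when its element count changes, and the fast-path dispatch is guarded with input.nn so tensor types without an nn table fall back to the pure-Lua implementation instead of raising an error. The unconditional zero of grad_a_buffer is also dropped, which appears safe because each of its three slices (grad_ar, grad_au, grad_ahc) is overwritten via fill()/addcmul/addmm before it is read. A hedged usage sketch of the shape convention and the effect of buffer reuse (rnn package API assumed):

   require 'rnn'
   local seqlen, batchsize, inputsize, outputsize = 5, 8, 10, 20
   local gru = nn.SeqGRU(inputsize, outputsize)
   local x = torch.randn(seqlen, batchsize, inputsize)   -- seqlen x batchsize x inputsize
   for i = 1, 3 do
      -- after the first call the gates buffer already has the right element count,
      -- so the per-call zero-fill is skipped
      local h = gru:forward(x)                           -- seqlen x batchsize x outputsize
   end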
9 changes: 7 additions & 2 deletions SeqLSTM.lua
@@ -145,10 +145,15 @@ function SeqLSTM:updateOutput(input)
local h, c = self.output, self.cell
h:resize(seqlen, batchsize, outputsize)
c:resize(seqlen, batchsize, hiddensize)

local nElement = self.gates:nElement()
self.gates:resize(seqlen, batchsize, 4 * hiddensize)
if nElement ~= seqlen * batchsize * 4 * hiddensize then
self.gates:zero()
end

local prev_h, prev_c = h0, c0
if input.nn.StepLSTM_updateOutput and not self.forceLua then
if input.nn and input.nn.StepLSTM_updateOutput and not self.forceLua then
for t = 1, seqlen do
local cur_x, next_h, next_c, gates = input[t], h[t], c[t], self.gates[t]

@@ -225,7 +230,7 @@ function SeqLSTM:backward(input, gradOutput, scale)

local grad_next_h = self.grad_hT or self.buffer1:zero()
local grad_next_c = self.grad_cT or self.buffer2:zero()
if input.nn.StepLSTM_backward and not self.forceLua then
if input.nn and input.nn.StepLSTM_backward and not self.forceLua then
for t = seqlen, 1, -1 do
local cur_x, next_h, next_c = input[t], h[t], c[t]
local prev_h, prev_c
14 changes: 9 additions & 5 deletions StepGRU.lua
@@ -37,7 +37,7 @@ end
function StepGRU:updateOutput(input)
self.recompute_backward = true
local cur_x, prev_h, next_h = input[1], input[2], self.output
if cur_x.nn.StepGRU_updateOutput and not self.forceLua then
if cur_x.nn and cur_x.nn.StepGRU_updateOutput and not self.forceLua then
cur_x.nn.StepGRU_updateOutput(self.weight, self.bias, self.gates,
cur_x, prev_h,
self.inputsize, self.outputsize,
@@ -52,8 +52,12 @@ function StepGRU:updateOutput(input)
local Wh = self.weight:narrow(1, inputsize + 1, self.outputsize)

next_h:resize(batchsize, outputsize)
self.gates:resize(batchsize, 3 * outputsize):zero()
local gates = self.gates
local nElement = gates:nElement()
gates:resize(batchsize, 3 * outputsize)
if nElement ~= batchsize * 3 * outputsize then
gates:zero()
end

gates:addmm(bias_expand, cur_x, Wx)
local sub_gates = gates:narrow(2, 1, 2 * outputsize)
@@ -92,7 +96,6 @@ function StepGRU:backward(input, gradOutput, scale)
scale = scale or 1.0
assert(scale == 1.0, 'must have scale=1')

--
local grad_gates = torch.getBuffer('StepGRU', 'grad_gates', self.gates) -- batchsize x 3*outputsize
local buffer = torch.getBuffer('StepGRU', 'buffer', self.gates) -- 1 x 3*outputsize

@@ -101,7 +104,7 @@ function StepGRU:backward(input, gradOutput, scale)
nn.utils.recursiveZeroMask(grad_next_h, self.zeroMask)
end

if cur_x.nn.StepGRU_backward and not self.forceLua then
if cur_x.nn and cur_x.nn.StepGRU_backward and not self.forceLua then
cur_x.nn.StepGRU_backward(self.weight, self.gates,
self.gradWeight, self.gradBias, grad_gates, buffer,
cur_x, prev_h, grad_next_h,
@@ -125,7 +128,8 @@ function StepGRU:backward(input, gradOutput, scale)
local update_gate = gates:narrow(2, outputsize + 1, outputsize)
local hidden_candidate = gates:narrow(2, 2 * outputsize + 1, outputsize)

grad_gates:resize(batchsize, 3 * outputsize):zero()
grad_gates:resize(batchsize, 3 * outputsize)

local grad_reset_gate = grad_gates:narrow(2, 1, outputsize)
local grad_update_gate = grad_gates:narrow(2, outputsize + 1, outputsize)
local grad_hidden_candidate = grad_gates:narrow(2, 2 * outputsize + 1, outputsize)
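StepGRU applies the same guards at the single-step level: the gates buffer is only zeroed when its size changes, and grad_gates is no longer zero-filled before the backward math, presumably because each gradient slice is overwritten before it is read. For reference, the column layout of the batchsize x 3*outputsize gates tensor implied by the narrows above:

   -- columns [1, outputsize]                  reset gate        r  = sig(Wx * x + Wh * prev_h + b)
   -- columns [outputsize+1, 2*outputsize]     update gate       u  = sig(Wx * x + Wh * prev_h + b)
   -- columns [2*outputsize+1, 3*outputsize]   hidden candidate  hc = tanh(Wx * x + Wh * (r . prev_h) + b)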
13 changes: 9 additions & 4 deletions StepLSTM.lua
@@ -51,7 +51,7 @@ function StepLSTM:updateOutput(input)
self.recompute_backward = true
local cur_x, prev_h, prev_c = input[1], input[2], input[3]
local next_h, next_c = self.output[1], self.output[2]
if cur_x.nn.StepLSTM_updateOutput and not self.forceLua then
if cur_x.nn and cur_x.nn.StepLSTM_updateOutput and not self.forceLua then
if self.weightO then -- LSTMP
self.hidden = self.hidden or cur_x.new()
cur_x.nn.StepLSTM_updateOutput(self.weight, self.bias, self.gates,
@@ -82,8 +82,12 @@ function StepLSTM:updateOutput(input)
next_h:resize(batchsize, hiddensize)
next_c:resize(batchsize, hiddensize)

self.gates:resize(batchsize, 4 * hiddensize):zero()
local gates = self.gates
local nElement = gates:nElement()
gates:resize(batchsize, 4 * hiddensize)
if nElement ~= batchsize * 4 * hiddensize then
gates:zero()
end

-- forward
gates:addmm(bias_expand, cur_x, Wx)
@@ -134,7 +138,7 @@ function StepLSTM:backward(input, gradOutput, scale)
nn.utils.recursiveZeroMask({grad_next_h, grad_next_c}, self.zeroMask)
end

if cur_x.nn.StepLSTM_backward and not self.forceLua then
if cur_x.nn and cur_x.nn.StepLSTM_backward and not self.forceLua then
if self.weightO then -- LSTMP
local grad_hidden = torch.getBuffer('StepLSTM', 'grad_hidden', self.hidden)
cur_x.nn.StepLSTM_backward(self.weight, self.gates,
@@ -182,7 +186,8 @@ function StepLSTM:backward(input, gradOutput, scale)
local output_gate = gates[{{}, {2 * hiddensize + 1, 3 * hiddensize}}]
local input_transform = gates[{{}, {3 * hiddensize + 1, 4 * hiddensize}}]

grad_gates:resize(batchsize, 4 * hiddensize):zero()
grad_gates:resize(batchsize, 4 * hiddensize)

local grad_input_gate = grad_gates[{{}, {1, hiddensize}}]
local grad_forget_gate = grad_gates[{{}, {hiddensize + 1, 2 * hiddensize}}]
local grad_output_gate = grad_gates[{{}, {2 * hiddensize + 1, 3 * hiddensize}}]
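For StepLSTM the gates tensor is batchsize x 4*hiddensize, and the narrows above imply this column layout:

   -- columns [1, hiddensize]                  input gate
   -- columns [hiddensize+1, 2*hiddensize]     forget gate
   -- columns [2*hiddensize+1, 3*hiddensize]   output gate
   -- columns [3*hiddensize+1, 4*hiddensize]   input transform (candidate cell value)

The weightO branch is the LSTMP variant, where an extra projection matrix maps the hidden state to the output size after each step; its gates bookkeeping follows the same layout.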
4 changes: 2 additions & 2 deletions benchmark/README.md
@@ -1,4 +1,4 @@
# Benchmark

On CPU, using Ubuntu 16.04, using float32, Torch LSTM boasts 886 samples/sec compared to TF’s 809 samples/sec for LSTM with 512 hiddensize and 64 batchsize.
On the other hand, for 128 hiddensize and 32 batchsize, Torch has 3950 compared to TF’s 4130 samples/sec.
On CPU, using Ubuntu 16.04, using float32, Torch LSTM boasts 900 samples/sec compared to TF’s 809 samples/sec for LSTM with 512 hiddensize and 64 batchsize.
On the other hand, for 128 hiddensize and 32 batchsize, Torch has 3990 compared to TF’s 4130 samples/sec.
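The numbers above compare single-layer LSTM throughput in samples/sec at two sizes; with this commit the CPU Torch figures move from 886 to 900 samples/sec (512 hiddensize, 64 batchsize) and from 3950 to 3990 samples/sec (128 hiddensize, 32 batchsize). A hedged sketch of how such a throughput measurement could be set up with this package (the actual benchmark scripts are not part of this diff, so the sizes and sample-counting convention here are illustrative):

   require 'rnn'
   local seqlen, batchsize, hiddensize = 50, 64, 512
   local lstm = nn.SeqLSTM(hiddensize, hiddensize):float()   -- float32, as in the README
   local x = torch.FloatTensor(seqlen, batchsize, hiddensize):normal()
   local iters = 20
   local timer = torch.Timer()
   for i = 1, iters do lstm:forward(x) end
   -- counting each sequence in the batch as one sample
   print(iters * batchsize / timer:time().real .. ' samples/sec')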
6 changes: 3 additions & 3 deletions generic/StepGRU.c
@@ -22,7 +22,10 @@ static int nn_(StepGRU_updateOutput)(lua_State *L) {
buffer->size[0] = batchsize;

THTensor_(resize2d)(next_h, batchsize, outputsize);
long nElement = THTensor_(nElement)(gates);
THTensor_(resize2d)(gates, batchsize, 3 * outputsize);
if (nElement != batchsize * 3 * outputsize)
THTensor_(fill)(gates, 0);

THTensor *Wx = THTensor_(newNarrow)(weight, 0, 0, inputsize);
THTensor *Wh = THTensor_(newNarrow)(weight, 0, inputsize, outputsize);
@@ -32,8 +35,6 @@ static int nn_(StepGRU_updateOutput)(lua_State *L) {
THTensor *update_gate = THTensor_(newNarrow)(gates, 1, outputsize, outputsize); // u = sig(Wx * x + Wh * prev_h + b)
THTensor *hidden_candidate = THTensor_(newNarrow)(gates, 1, 2*outputsize, outputsize); // hc = tanh(Wx * x + Wh * r . prev_h + b)

//THTensor_(fill)(gates, 0);

// forward
THTensor_(addmm)(gates, 1, buffer, 1, cur_x, Wx);
THTensor_(addmm)(sub_gates, 1, sub_gates, 1, prev_h, sub_Wh);
@@ -84,7 +85,6 @@ static int nn_(StepGRU_backward)(lua_State *L) {
THTensor_(resize2d)(grad_cur_x, batchsize, inputsize);
THTensor_(resize2d)(grad_prev_h, batchsize, outputsize);
THTensor_(resize2d)(grad_gates, batchsize, 3 * outputsize);
THTensor_(fill)(grad_gates, 0);

THTensor *Wx = THTensor_(newNarrow)(weight, 0, 0, inputsize);
THTensor *Wh = THTensor_(newNarrow)(weight, 0, inputsize, outputsize);
6 changes: 3 additions & 3 deletions generic/StepLSTM.c
@@ -29,9 +29,10 @@ static int nn_(StepLSTM_updateOutput)(lua_State *L) {

THTensor_(resize2d)(next_h, batchsize, hiddensize);
THTensor_(resize2d)(next_c, batchsize, hiddensize);

long nElement = THTensor_(nElement)(gates);
THTensor_(resize2d)(gates, batchsize, 4 * hiddensize);
//THTensor_(fill)(gates, 0);
if (nElement != batchsize * 4 * hiddensize)
THTensor_(fill)(gates, 0);

// forward
THTensor_(addmm)(gates, 1, buffer, 1, cur_x, Wx);
@@ -147,7 +148,6 @@ static int nn_(StepLSTM_backward)(lua_State *L) {
THTensor *grad_Wh = THTensor_(newNarrow)(gradWeight, 0, inputsize, outputsize);

THTensor_(resize2d)(grad_gates, batchsize, 4 * hiddensize);
THTensor_(fill)(grad_gates, 0);

THTensor *grad_input_gate = THTensor_(newNarrow)(grad_gates, 1, 0, hiddensize);
THTensor *grad_forget_gate = THTensor_(newNarrow)(grad_gates, 1, hiddensize, hiddensize);
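The C kernels mirror the Lua-side change: the previous element count of gates is captured with THTensor_(nElement) before THTensor_(resize2d), and THTensor_(fill)(gates, 0) only runs when that count changed; the unconditional fill of grad_gates before the backward computation is removed as well, presumably because every slice of grad_gates is fully written before it is read, as in the Lua paths above.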
(Diffs for the remaining 4 of the 13 changed files did not load and are not shown.)
