Merge pull request #27 from torch/gpu-fixes
Optimizations and GPU fixes
Authored by nicholas-leonard on May 12, 2017
Commit 6ca4c57, merging 2 parents: 38a6e59 + e5508b7
Showing 13 changed files with 443 additions and 373 deletions.
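Two changes recur throughout the diff below. First, reusable per-step buffers such as self.gates are no longer zeroed on every call: the element count is captured before the resize, and the buffer is only zero-filled when the resize actually changed its size. Second, index tensors used for argmin and masked selection are allocated as torch.CudaLongTensor when the input is a CUDA tensor, so index results stay on the GPU instead of bouncing through a CPU torch.LongTensor. A minimal Lua sketch of the conditional-zeroing pattern (the helper name and arguments are illustrative, not part of the commit):

   -- Resize a reusable buffer, zeroing it only when its storage actually changed.
   local function resizeMaybeZero(buf, rows, cols)
      local nElement = buf:nElement()    -- element count before the resize
      buf:resize(rows, cols)
      if nElement ~= rows * cols then    -- size changed, so contents are stale garbage
         buf:zero()
      end
      return buf
   end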
58 changes: 30 additions & 28 deletions Kmeans.lua
@@ -25,7 +25,7 @@ end

-- Reset
function Kmeans:reset(stdev)
local stdev = stdev or 1
stdev = stdev or 1
self.weight:uniform(-stdev, stdev)
end

@@ -55,7 +55,7 @@ function Kmeans:initKmeansPlus(input, p)
local inputDim = input:nDimension()
assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
local noOfSamples = input:size(1)

local pcount = math.ceil((1-self.p)*noOfSamples)
if pcount <= 0 then pcount = 1 end

@@ -71,13 +71,13 @@ function Kmeans:initKmeansPlus(input, p)
distances:resize(noOfSamples):fill(math.huge)
local maxScores = self.weight.new()
local maxIndx = self.weight.new()

for k=initializedK, self.k do
clusters = self.weight[{{initializedK-1, initializedK-1}}]
for i=1, noOfSamples do
temp:expand(input[{{i}}], 1, self.dim)
expandedSample:resize(temp:size()):copy(temp)

-- Squared Euclidean distance
expandedSample:add(-1, clusters)
clusterDistances:norm(expandedSample, 2, 2)
@@ -135,20 +135,20 @@ function Kmeans:updateOutput(input)
self._clusterDistances:resize(self.k, batchSize)

self._minScore = self._minScore or self.weight.new()
self._minIndx = self._minIndx or torch.LongTensor()
self._minIndx = self._minIndx or (torch.isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor())
self._minScore:min(self._minIndx, self._clusterDistances, 1)
self._minIndx:resize(batchSize)

self.output:resize(batchSize):copy(self._minIndx)
self.loss = self._minScore:sum()
return self.output

return self.output
end

-- Kmeans has its own criterion hence gradInput are zeros
function Kmeans:updateGradInput(input, gradOuput)
self.gradInput:resize(input:size()):zero()

return self.gradInput
end

@@ -165,41 +165,43 @@ function Kmeans:accGradParameters(input, gradOutput, scale)
self._cscAdder:resize(batchSize):fill(1)
self.clusterSampleCount:zero()
self.clusterSampleCount:indexAdd(1, self._minIndx, self._cscAdder)

-- scale * (x[k]-c[k]) where k is nearest cluster to x
self._gradWeight = self._gradWeight or self.gradWeight.new()
self._gradWeight:index(self.weight, 1, self._minIndx)
self._gradWeight:mul(-1)
self._gradWeight:mul(-1)
self._gradWeight:add(input)
self._gradWeight:mul(-scale)

self._gradWeight2 = self._gradWeight2 or self.gradWeight.new()
self._gradWeight2:resizeAs(self.gradWeight):zero()
self._gradWeight2:indexAdd(1, self._minIndx, self._gradWeight)

-- scale/n * sum_i (x-c)
self._ccounts = self._ccounts or self.clusterSampleCount.new()
self._ccounts:resize(self.k):copy(self.clusterSampleCount)
self._ccounts:add(0.0000001) -- prevent division by zero errors

self._gradWeight2:cdiv(self._ccounts:view(self.k,1):expandAs(self.gradWeight))

self.gradWeight:add(self._gradWeight2)
end

function Kmeans:clearState()
-- prevent premature memory allocations
self._expandedSamples = nil
self._clusterDistances = nil
self._temp = nil
self._tempExpanded = nil
self._tempWeight = nil
self._tempWeightExp = nil
self._expandedWeight = nil
self._minScore = nil
self._minIndx = nil
self._cscAdder = nil
end

function Kmeans:type(type, tensorCache)
if type then
-- prevent premature memory allocations
self._expandedSamples = nil
self._clusterDistances = nil
self._temp = nil
self._tempExpanded = nil
self._tempWeight = nil
self._tempWeightExp = nil
self._expandedWeight = nil
self._minScore = nil
self._minIndx = nil
self._cscAdder = nil
end
self:clearState()
return parent.type(self, type, tensorCache)
end
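Two things stand out in Kmeans.lua: the _minIndx buffer now matches the device of the input (torch.CudaLongTensor on GPU, torch.LongTensor on CPU), and the temporary-buffer cleanup that previously lived inside Kmeans:type() has moved into a dedicated clearState() method, which type() calls before delegating to the parent class. A hedged conversion sketch (assuming the nn.Kmeans(k, dim) constructor from this package and an installed cutorch/cunn):

   require 'rnn'
   require 'cunn'                  -- provides :cuda() conversion and torch.CudaLongTensor
   local km = nn.Kmeans(10, 32)    -- 10 clusters over 32-dimensional samples (assumed signature)
   km:cuda()                       -- Kmeans:type() now runs clearState() first, so stale
                                   -- CPU-side buffers are dropped before the conversion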
4 changes: 2 additions & 2 deletions MaskZeroCriterion.lua
@@ -32,8 +32,8 @@ function MaskZeroCriterion:updateOutput(input, target)
self._oneMask = self._oneMask or self.zeroMask.new()
self._oneMask:lt(self.zeroMask, 1)
-- 1,0,1 -> 1,3
self._indices = self._indices or torch.LongTensor()
self._range = self._range or torch.LongTensor()
self._indices = self._indices or torch.isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor()
self._range = self._range or self._indices.new()
self._range:range(1,self._oneMask:nElement())
self._indices:maskedSelect(self._range, self._oneMask)
-- indexSelect the input
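MaskZeroCriterion gets the same device-aware treatment: _indices becomes a torch.CudaLongTensor for CUDA inputs, and _range is created with _indices.new() so it always shares that type. The surrounding idiom converts a keep-mask into a list of row indices for the later index/maskedSelect calls; a small CPU-side sketch with illustrative values:

   -- Turn a keep-mask (1 = keep, 0 = zero-masked row) into row indices,
   -- matching the "1,0,1 -> 1,3" comment above. Values are illustrative.
   local oneMask = torch.ByteTensor{1, 0, 1}
   local range = torch.LongTensor():range(1, oneMask:nElement())   -- {1, 2, 3}
   local indices = torch.LongTensor()
   indices:maskedSelect(range, oneMask)                            -- indices = {1, 3}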
16 changes: 11 additions & 5 deletions SeqGRU.lua
@@ -97,10 +97,15 @@ function SeqGRU:updateOutput(input)

local h = self.output
h:resize(seqlen, batchsize, outputsize):zero()
self.gates:resize(seqlen, batchsize, 3 * outputsize):zero()

local nElement = self.gates:nElement()
self.gates:resize(seqlen, batchsize, 3 * outputsize)
if nElement ~= seqlen * batchsize * 3 * outputsize then
self.gates:zero()
end

local prev_h = h0
if input.nn.StepGRU_updateOutput and not self.forceLua then
if input.nn and input.nn.StepGRU_updateOutput and not self.forceLua then
for t = 1, seqlen do
local cur_x, next_h, gates = input[t], h[t], self.gates[t]
cur_x.nn.StepGRU_updateOutput(self.weight, self.bias,
@@ -152,7 +157,7 @@ function SeqGRU:backward(input, gradOutput, scale)
self.gradInput:resizeAs(input):zero()

local grad_next_h = self.grad_hT or self.buffer1:zero()
if input.nn.StepGRU_backward and not self.forceLua then
if input.nn and input.nn.StepGRU_backward and not self.forceLua then
for t = seqlen, 1, -1 do
local cur_x, next_h = input[t], h[t]
local prev_h = (t == 1) and self.h0 or h[t - 1]
@@ -184,15 +189,16 @@ function SeqGRU:backward(input, gradOutput, scale)
local u = self.gates[{t, {}, {outputsize + 1, 2 * outputsize}}]
local hc = self.gates[{t, {}, {2 * outputsize + 1, 3 * outputsize}}]

local grad_a = self.grad_a_buffer:resize(batchsize, 3 * outputsize):zero()
local grad_a = self.grad_a_buffer:resize(batchsize, 3 * outputsize)

local grad_ar = grad_a[{{}, {1, outputsize}}]
local grad_au = grad_a[{{}, {outputsize + 1, 2 * outputsize}}]
local grad_ahc = grad_a[{{}, {2 * outputsize + 1, 3 * outputsize}}]

-- use grad_au as temporary buffer to compute grad_ahc.

local grad_hc = grad_au:fill(0):addcmul(grad_next_h, -1, u, grad_next_h)
grad_ahc:fill(1):addcmul(-1, hc,hc):cmul(grad_hc)
grad_ahc:fill(1):addcmul(-1, hc, hc):cmul(grad_hc)
local grad_r = grad_au:fill(0):addmm(grad_ahc, Wh[{{}, {2 * outputsize + 1, 3 * outputsize}}]:t() ):cmul(prev_h)
grad_ar:fill(1):add(-1, r):cmul(r):cmul(grad_r)

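In SeqGRU (and SeqLSTM below) the gates buffer is now zeroed only when its element count changes, and the fast-path dispatch is guarded with input.nn so tensor types without an nn table fall back to the pure-Lua implementation instead of raising an error. The unconditional zero of grad_a_buffer is also dropped, which appears safe because each of its three slices (grad_ar, grad_au, grad_ahc) is overwritten via fill()/addcmul/addmm before it is read. A hedged usage sketch of the shape convention and the effect of buffer reuse (rnn package API assumed):

   require 'rnn'
   local seqlen, batchsize, inputsize, outputsize = 5, 8, 10, 20
   local gru = nn.SeqGRU(inputsize, outputsize)
   local x = torch.randn(seqlen, batchsize, inputsize)   -- seqlen x batchsize x inputsize
   for i = 1, 3 do
      -- after the first call the gates buffer already has the right element count,
      -- so the per-call zero-fill is skipped
      local h = gru:forward(x)                           -- seqlen x batchsize x outputsize
   end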
9 changes: 7 additions & 2 deletions SeqLSTM.lua
@@ -145,10 +145,15 @@ function SeqLSTM:updateOutput(input)
local h, c = self.output, self.cell
h:resize(seqlen, batchsize, outputsize)
c:resize(seqlen, batchsize, hiddensize)

local nElement = self.gates:nElement()
self.gates:resize(seqlen, batchsize, 4 * hiddensize)
if nElement ~= seqlen * batchsize * 4 * hiddensize then
self.gates:zero()
end

local prev_h, prev_c = h0, c0
if input.nn.StepLSTM_updateOutput and not self.forceLua then
if input.nn and input.nn.StepLSTM_updateOutput and not self.forceLua then
for t = 1, seqlen do
local cur_x, next_h, next_c, gates = input[t], h[t], c[t], self.gates[t]

@@ -225,7 +230,7 @@ function SeqLSTM:backward(input, gradOutput, scale)

local grad_next_h = self.grad_hT or self.buffer1:zero()
local grad_next_c = self.grad_cT or self.buffer2:zero()
if input.nn.StepLSTM_backward and not self.forceLua then
if input.nn and input.nn.StepLSTM_backward and not self.forceLua then
for t = seqlen, 1, -1 do
local cur_x, next_h, next_c = input[t], h[t], c[t]
local prev_h, prev_c
14 changes: 9 additions & 5 deletions StepGRU.lua
@@ -37,7 +37,7 @@ end
function StepGRU:updateOutput(input)
self.recompute_backward = true
local cur_x, prev_h, next_h = input[1], input[2], self.output
if cur_x.nn.StepGRU_updateOutput and not self.forceLua then
if cur_x.nn and cur_x.nn.StepGRU_updateOutput and not self.forceLua then
cur_x.nn.StepGRU_updateOutput(self.weight, self.bias, self.gates,
cur_x, prev_h,
self.inputsize, self.outputsize,
@@ -52,8 +52,12 @@ function StepGRU:updateOutput(input)
local Wh = self.weight:narrow(1, inputsize + 1, self.outputsize)

next_h:resize(batchsize, outputsize)
self.gates:resize(batchsize, 3 * outputsize):zero()
local gates = self.gates
local nElement = gates:nElement()
gates:resize(batchsize, 3 * outputsize)
if nElement ~= batchsize * 3 * outputsize then
gates:zero()
end

gates:addmm(bias_expand, cur_x, Wx)
local sub_gates = gates:narrow(2, 1, 2 * outputsize)
@@ -92,7 +96,6 @@ function StepGRU:backward(input, gradOutput, scale)
scale = scale or 1.0
assert(scale == 1.0, 'must have scale=1')

--
local grad_gates = torch.getBuffer('StepGRU', 'grad_gates', self.gates) -- batchsize x 3*outputsize
local buffer = torch.getBuffer('StepGRU', 'buffer', self.gates) -- 1 x 3*outputsize

@@ -101,7 +104,7 @@ function StepGRU:backward(input, gradOutput, scale)
nn.utils.recursiveZeroMask(grad_next_h, self.zeroMask)
end

if cur_x.nn.StepGRU_backward and not self.forceLua then
if cur_x.nn and cur_x.nn.StepGRU_backward and not self.forceLua then
cur_x.nn.StepGRU_backward(self.weight, self.gates,
self.gradWeight, self.gradBias, grad_gates, buffer,
cur_x, prev_h, grad_next_h,
@@ -125,7 +128,8 @@ function StepGRU:backward(input, gradOutput, scale)
local update_gate = gates:narrow(2, outputsize + 1, outputsize)
local hidden_candidate = gates:narrow(2, 2 * outputsize + 1, outputsize)

grad_gates:resize(batchsize, 3 * outputsize):zero()
grad_gates:resize(batchsize, 3 * outputsize)

local grad_reset_gate = grad_gates:narrow(2, 1, outputsize)
local grad_update_gate = grad_gates:narrow(2, outputsize + 1, outputsize)
local grad_hidden_candidate = grad_gates:narrow(2, 2 * outputsize + 1, outputsize)
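StepGRU applies the same guards at the single-step level: the gates buffer is only zeroed when its size changes, and grad_gates is no longer zero-filled before the backward math, presumably because each gradient slice is overwritten before it is read. For reference, the column layout of the batchsize x 3*outputsize gates tensor implied by the narrows above:

   -- columns [1, outputsize]                  reset gate        r  = sig(Wx * x + Wh * prev_h + b)
   -- columns [outputsize+1, 2*outputsize]     update gate       u  = sig(Wx * x + Wh * prev_h + b)
   -- columns [2*outputsize+1, 3*outputsize]   hidden candidate  hc = tanh(Wx * x + Wh * (r . prev_h) + b)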
13 changes: 9 additions & 4 deletions StepLSTM.lua
@@ -51,7 +51,7 @@ function StepLSTM:updateOutput(input)
self.recompute_backward = true
local cur_x, prev_h, prev_c = input[1], input[2], input[3]
local next_h, next_c = self.output[1], self.output[2]
if cur_x.nn.StepLSTM_updateOutput and not self.forceLua then
if cur_x.nn and cur_x.nn.StepLSTM_updateOutput and not self.forceLua then
if self.weightO then -- LSTMP
self.hidden = self.hidden or cur_x.new()
cur_x.nn.StepLSTM_updateOutput(self.weight, self.bias, self.gates,
@@ -82,8 +82,12 @@ function StepLSTM:updateOutput(input)
next_h:resize(batchsize, hiddensize)
next_c:resize(batchsize, hiddensize)

self.gates:resize(batchsize, 4 * hiddensize):zero()
local gates = self.gates
local nElement = gates:nElement()
gates:resize(batchsize, 4 * hiddensize)
if nElement ~= batchsize * 4 * hiddensize then
gates:zero()
end

-- forward
gates:addmm(bias_expand, cur_x, Wx)
@@ -134,7 +138,7 @@ function StepLSTM:backward(input, gradOutput, scale)
nn.utils.recursiveZeroMask({grad_next_h, grad_next_c}, self.zeroMask)
end

if cur_x.nn.StepLSTM_backward and not self.forceLua then
if cur_x.nn and cur_x.nn.StepLSTM_backward and not self.forceLua then
if self.weightO then -- LSTMP
local grad_hidden = torch.getBuffer('StepLSTM', 'grad_hidden', self.hidden)
cur_x.nn.StepLSTM_backward(self.weight, self.gates,
@@ -182,7 +186,8 @@ function StepLSTM:backward(input, gradOutput, scale)
local output_gate = gates[{{}, {2 * hiddensize + 1, 3 * hiddensize}}]
local input_transform = gates[{{}, {3 * hiddensize + 1, 4 * hiddensize}}]

grad_gates:resize(batchsize, 4 * hiddensize):zero()
grad_gates:resize(batchsize, 4 * hiddensize)

local grad_input_gate = grad_gates[{{}, {1, hiddensize}}]
local grad_forget_gate = grad_gates[{{}, {hiddensize + 1, 2 * hiddensize}}]
local grad_output_gate = grad_gates[{{}, {2 * hiddensize + 1, 3 * hiddensize}}]
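For StepLSTM the gates tensor is batchsize x 4*hiddensize, and the narrows above imply this column layout:

   -- columns [1, hiddensize]                  input gate
   -- columns [hiddensize+1, 2*hiddensize]     forget gate
   -- columns [2*hiddensize+1, 3*hiddensize]   output gate
   -- columns [3*hiddensize+1, 4*hiddensize]   input transform (candidate cell value)

The weightO branch is the LSTMP variant, where an extra projection matrix maps the hidden state to the output size after each step; its gates bookkeeping follows the same layout.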
4 changes: 2 additions & 2 deletions benchmark/README.md
@@ -1,4 +1,4 @@
# Benchmark

On CPU, using Ubuntu 16.04, using float32, Torch LSTM boasts 886 samples/sec compared to TF’s 809 samples/sec for LSTM with 512 hiddensize and 64 batchsize.
On the other hand, for 128 hiddensize and 32 batchsize, Torch has 3950 compared to TF’s 4130 samples/sec.
On CPU, using Ubuntu 16.04, using float32, Torch LSTM boasts 900 samples/sec compared to TF’s 809 samples/sec for LSTM with 512 hiddensize and 64 batchsize.
On the other hand, for 128 hiddensize and 32 batchsize, Torch has 3990 compared to TF’s 4130 samples/sec.
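The numbers above compare single-layer LSTM throughput in samples/sec at two sizes; with this commit the CPU Torch figures move from 886 to 900 samples/sec (512 hiddensize, 64 batchsize) and from 3950 to 3990 samples/sec (128 hiddensize, 32 batchsize). A hedged sketch of how such a throughput measurement could be set up with this package (the actual benchmark scripts are not part of this diff, so the sizes and sample-counting convention here are illustrative):

   require 'rnn'
   local seqlen, batchsize, hiddensize = 50, 64, 512
   local lstm = nn.SeqLSTM(hiddensize, hiddensize):float()   -- float32, as in the README
   local x = torch.FloatTensor(seqlen, batchsize, hiddensize):normal()
   local iters = 20
   local timer = torch.Timer()
   for i = 1, iters do lstm:forward(x) end
   -- counting each sequence in the batch as one sample
   print(iters * batchsize / timer:time().real .. ' samples/sec')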
6 changes: 3 additions & 3 deletions generic/StepGRU.c
@@ -22,7 +22,10 @@ static int nn_(StepGRU_updateOutput)(lua_State *L) {
buffer->size[0] = batchsize;

THTensor_(resize2d)(next_h, batchsize, outputsize);
long nElement = THTensor_(nElement)(gates);
THTensor_(resize2d)(gates, batchsize, 3 * outputsize);
if (nElement != batchsize * 3 * outputsize)
THTensor_(fill)(gates, 0);

THTensor *Wx = THTensor_(newNarrow)(weight, 0, 0, inputsize);
THTensor *Wh = THTensor_(newNarrow)(weight, 0, inputsize, outputsize);
@@ -32,8 +35,6 @@ static int nn_(StepGRU_updateOutput)(lua_State *L) {
THTensor *update_gate = THTensor_(newNarrow)(gates, 1, outputsize, outputsize); // u = sig(Wx * x + Wh * prev_h + b)
THTensor *hidden_candidate = THTensor_(newNarrow)(gates, 1, 2*outputsize, outputsize); // hc = tanh(Wx * x + Wh * r . prev_h + b)

//THTensor_(fill)(gates, 0);

// forward
THTensor_(addmm)(gates, 1, buffer, 1, cur_x, Wx);
THTensor_(addmm)(sub_gates, 1, sub_gates, 1, prev_h, sub_Wh);
@@ -84,7 +85,6 @@ static int nn_(StepGRU_backward)(lua_State *L) {
THTensor_(resize2d)(grad_cur_x, batchsize, inputsize);
THTensor_(resize2d)(grad_prev_h, batchsize, outputsize);
THTensor_(resize2d)(grad_gates, batchsize, 3 * outputsize);
THTensor_(fill)(grad_gates, 0);

THTensor *Wx = THTensor_(newNarrow)(weight, 0, 0, inputsize);
THTensor *Wh = THTensor_(newNarrow)(weight, 0, inputsize, outputsize);
6 changes: 3 additions & 3 deletions generic/StepLSTM.c
@@ -29,9 +29,10 @@ static int nn_(StepLSTM_updateOutput)(lua_State *L) {

THTensor_(resize2d)(next_h, batchsize, hiddensize);
THTensor_(resize2d)(next_c, batchsize, hiddensize);

long nElement = THTensor_(nElement)(gates);
THTensor_(resize2d)(gates, batchsize, 4 * hiddensize);
//THTensor_(fill)(gates, 0);
if (nElement != batchsize * 4 * hiddensize)
THTensor_(fill)(gates, 0);

// forward
THTensor_(addmm)(gates, 1, buffer, 1, cur_x, Wx);
@@ -147,7 +148,6 @@ static int nn_(StepLSTM_backward)(lua_State *L) {
THTensor *grad_Wh = THTensor_(newNarrow)(gradWeight, 0, inputsize, outputsize);

THTensor_(resize2d)(grad_gates, batchsize, 4 * hiddensize);
THTensor_(fill)(grad_gates, 0);

THTensor *grad_input_gate = THTensor_(newNarrow)(grad_gates, 1, 0, hiddensize);
THTensor *grad_forget_gate = THTensor_(newNarrow)(grad_gates, 1, hiddensize, hiddensize);
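The C kernels mirror the Lua-side change: the previous element count of gates is captured with THTensor_(nElement) before THTensor_(resize2d), and THTensor_(fill)(gates, 0) only runs when that count changed; the unconditional fill of grad_gates before the backward computation is removed as well, presumably because every slice of grad_gates is fully written before it is read, as in the Lua paths above.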
(Diffs for the remaining 4 of the 13 changed files did not load and are not shown.)
