From 919c6f4eb59a57cc7245e6be55057399f5eb5a6b Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 22:39:46 -0400 Subject: [PATCH 01/36] - first commit --- src/operator/cudnn_rnn-inl.h | 208 ++++++++++++++++ src/operator/rnn-inl.h | 471 +++++++++++++++++++++++++++++++++++ src/operator/rnn.cc | 41 +++ src/operator/rnn.cu | 33 +++ 4 files changed, 753 insertions(+) create mode 100644 src/operator/cudnn_rnn-inl.h create mode 100644 src/operator/rnn-inl.h create mode 100644 src/operator/rnn.cc create mode 100644 src/operator/rnn.cu diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h new file mode 100644 index 000000000000..37895c2b2488 --- /dev/null +++ b/src/operator/cudnn_rnn-inl.h @@ -0,0 +1,208 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file cudnn_spatial_transformer-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ +#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ + +#include +#include +#include "./rnn-inl.h" +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +template +class CuDNNRNNOp : public Operator { + public: + explicit CuDNNRNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + rnn_mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + rnn_mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + rnn_mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + rnn_mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implmented"; + } + // RNN Direction + switch (param_.direction) { + case rnn_enum::kUnidirectional: + rnn_direction_ = CUDNN_UNIDIRECTIONAL; + break; + case rnn_enum::kBidirectional: + rnn_direction_ = CUDNN_BIDIRECTIONAL; + break; + default: + LOG(FATAL) << "Not implmented"; + } + } + // ~CuDNNRNNOp() { + // if (init_cudnn_) { + // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); + // } + // } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor out = out_data[st::kOut].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); + // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // if (!init_cudnn_) { + // Init(s, in_data, out_data); + // } + // CHECK_EQ(data.CheckContiguous(), true); + // CHECK_EQ(out.CheckContiguous(), true); + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, + // st_desc_, + // loc.dptr_, + // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + // } + // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // grid.dptr_, + // &beta, + // out_desc_, + // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + } + // + virtual void 
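  // The commented-out body above is scaffolding carried over from
  // cudnn_spatial_transformer-inl.h. A minimal sketch of the cuDNN v5 call
  // this Forward() is meant to make once the descriptors are set up in
  // Init() (descriptor and workspace names here are assumptions):
  //
  //   CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, seq_length,
  //                                    x_desc_vec, x_ptr, hx_desc_, hx_ptr,
  //                                    cx_desc_, cx_ptr, w_desc_, w_ptr,
  //                                    y_desc_vec, y_ptr, hy_desc_, hy_ptr,
  //                                    cy_desc_, cy_ptr,
  //                                    workspace, workspace_bytes,
  //                                    reserve_space, reserve_bytes),
  //            CUDNN_STATUS_SUCCESS);
  //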
Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // CHECK_EQ(out_grad.size(), 1); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor grad = out_grad[st::kOut].get(s); + // Tensor ddata = in_grad[st::kData].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); + // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in + // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // typename DataType::ScaleType alpha_dgrid = 1.0f; + // typename DataType::ScaleType beta_dgrid = 0.0f; + // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // &beta, + // in_desc_/*reuse in_desc_*/, + // ddata.dptr_/*output*/, + // &alpha_dgrid, + // out_desc_/*reuse out_desc_*/, + // grad.dptr_, + // grid.dptr_, + // &beta_dgrid, + // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, + // st_desc_, + // grid.dptr_, + // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); + // } + } + // + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // if (!init_cudnn_) { + // init_cudnn_ = true; + // // Tensor data = in_data[st::kData].get(s); + // // Tensor out = out_data[st::kOut].get(s); + // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + // } + } + + bool init_cudnn_; + cudnnDataType_t dtype_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t rnn_mode_; + cudnnDirectionMode_t rnn_direction_; + cudnnRNNInputMode_t rnn_input_mode_; + cudnnDropoutDescriptor_t rnn_dropout_; + // cudnnTensorDescriptor_t in_desc_; + // cudnnTensorDescriptor_t out_desc_; + #if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format_; + #endif + RNNParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace 
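// A worked example for the parameter-size helpers defined in rnn-inl.h just
// below (illustrative numbers, not from the patch): with inputSize = 10 and
// hiddenSize = 20, one LSTM layer needs
//   hiddenSize * (hiddenSize + inputSize + 2) * 4
//     = 20 * (20 + 10 + 2) * 4 = 2560
// weights + biases, the factor 4 being the LSTM's four gates (3 for GRU,
// 1 for the vanilla relu/tanh RNNs).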
op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h new file mode 100644 index 000000000000..3a538f001d5b --- /dev/null +++ b/src/operator/rnn-inl.h @@ -0,0 +1,471 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file rnn-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_OPERATOR_RNN_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace rnn_enum { + enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNDirectionType {kUnidirectional, kBidirectional}; + enum RNNOpResource {kTempSpace}; +} + +// A utility function to calculate input size + +inline int rnn_single_param_size(int inputSize, + int hiddenSize, + int mode){ + int size = hiddenSize * (hiddenSize + inputSize + 2); + // Different RNN's have different num weights + switch(mode) + { + case rnn_enum::kRnnRelu: + size *= 1 ; + break; + case rnn_enum::kRnnTanh: + size *= 1; + break; + case rnn_enum::kLstm: + size *= 4; + break; + case rnn_enum::kGru: + size *= 3; + break; + } + return size; +} + +inline int rnn_param_size(int layerNum, + int inputSize, + int hiddenSize, + int direction, + int mode){ + // get size of first layer + int size = rnn_single_param_size(inputSize, hiddenSize, mode); + // get size of remaining layers + if(direction == rnn_enum::kUnidirectional) + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); + else // bidirectional case: input size increases by 2 + size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + return size; +} + +struct RNNParam : public dmlc::Parameter { + uint32_t state_size; + uint32_t num_layers; + uint64_t workspace; + bool batch_first; + int direction; + int mode; + + DMLC_DECLARE_PARAMETER(RNNParam) { + DMLC_DECLARE_FIELD(state_size) + .describe("size of the state for each layer"); + + DMLC_DECLARE_FIELD(num_layers) + .describe("number of stacked layers"); + + DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) + .describe("Tmp workspace for RNN (MB)"); + + DMLC_DECLARE_FIELD(direction) + .add_enum("unidirectional", rnn_enum::kUnidirectional) + .add_enum("bidirectional", rnn_enum::kBidirectional) + .describe("specifies the recurrence pattern"); + + DMLC_DECLARE_FIELD(mode) + .add_enum("rnn_relu", rnn_enum::kRnnRelu) + .add_enum("rnn_tanh", rnn_enum::kRnnTanh) + .add_enum("lstm", rnn_enum::kLstm) + .add_enum("gru", rnn_enum::kGru) + .describe("the type of RNN to compute"); + } +}; + +template +class RNNOp : public Operator { + public: + explicit RNNOp(RNNParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
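    // 1 MB = 2^20 bytes, so (workspace << 20) converts MB to bytes, and the
    // division by sizeof(real_t) (4 for float32) turns bytes into an element
    // count; e.g. the default 512 MB becomes 512 * 2^20 / 4 = 134,217,728
    // elements.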
+ param_.workspace = (param_.workspace << 20) / sizeof(real_t); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; +// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); + +// CHECK_EQ(in_data.size(), expected); +// CHECK_EQ(out_data.size(), 1); +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor out = out_data[rnn_enum::kOut].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(out.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(out.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), +// param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, +// gstride * (gid + 1)); +// tmpc = dot(wmat[gid].T(), temp_dst[gid]); +// } +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// out.Slice(i, i + step) = pack_col2patch(temp_col, +// out.Slice(i, i + step).shape_, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1); // RNN only support dilate equals 1 +// } else { +// Shape<4> pshape = out.Slice(i, i + step).shape_; +// pshape[2] += 2 * param_.pad[0]; +// pshape[3] += 2 * param_.pad[1]; +// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, +// pshape, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1), // RNN only support dilate equals 1 +// out[i][0].shape_); +// } +// } +// if (!param_.no_bias) { +// // add bias, broadcast bias to dim 1: channel +// Tensor bias = in_data[rnn_enum::kBias].get(s); +// out += broadcast<1>(bias, out.shape_); +// } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful +// CHECK_EQ(out_grad.size(), 1); +// size_t expected = param_.no_bias == 0 ? 
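// The commented-out Forward/Backward bodies here (num_group, kernel, pad,
// unpack_patch2col, no_bias) are convolution code kept only as a placeholder;
// none of those fields exist on RNNParam, and patch 02 of this series
// replaces the blocks with "TODO: add MShadow implementation" stubs.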
3 : 2; +// CHECK(in_data.size() == expected && in_grad.size() == expected); +// CHECK_EQ(req.size(), expected); +// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); +// // get data +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor grad = out_grad[rnn_enum::kOut].get(s); +// Tensor gdata = in_grad[rnn_enum::kData].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// Tensor gwmat = +// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(grad.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// if (i == 0) { +// Tensor tmp_gwmat = gwmat[gid]; +// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); +// } else { +// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); +// } +// } +// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// temp_dst[gid] = dot(wmat[gid], tmpc); +// } +// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, +// mshadow::Shape4(gdata.shape_[1], +// step, +// gdata.size(2), +// gdata.size(3)))); +// } +// } +// if (!param_.no_bias) { +// Tensor gbias = in_grad[rnn_enum::kBias].get(s); +// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); +// } + } + + private: +// inline index_t InitTemp(const mshadow::Shape<4> &ishape, +// const mshadow::Shape<4> &oshape) { +// const int ksize_y = param_.kernel[0]; +// const int ksize_x = param_.kernel[1]; +// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, +// oshape[2] * oshape[3]); +// shape_dstunit_ = mshadow::Shape3(param_.num_group, +// oshape[1] / param_.num_group, +// oshape[2] * oshape[3]); +// // See convolution for workspace calculations +// nstep_ = std::max( +// std::min( +// static_cast( +// param_.workspace / (shape_colunit_.Size() + 
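// Shape conventions used by InferShape below: data is
// (seqLength, batch, inputDim), hidden state is
// (numLayers * numDirections, batch, stateSize), and output is
// (seqLength, batch, numDirections * stateSize). Illustrative example: a
// 2-layer bidirectional LSTM on (25, 32, 100) input with state_size = 50
// yields state shape (4, 32, 50) and output shape (25, 32, 100). Note that
// the code reads batchSize from dshape[0], which under the documented layout
// is seqLength, so one of the two is off.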
shape_dstunit_.Size())), +// ishape[0]), +// 1U); + +// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], +// shape_colunit_[1] * nstep_); +// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * nstep_); +// index_t required_size = scol.Size() + sdst.Size(); +// CHECK_GE(param_.workspace, required_size) +// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" +// << "Given: " << param_.workspace * sizeof(DType); +// return required_size; +// } + + private: + RNNParam param_; +}; // class RNNOp + + + + +template +Operator* CreateOp(RNNParam param, int dtype); + +#if DMLC_USE_CXX11 +class RNNProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "weight", "state", "cell_state"}; + } else { + return {"data", "weight", "state"}; + } + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3) \ + << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; + // Infer hidden state + cell state + int batchSize = dshape[0]; + int inputSize = dshape[2]; + int numDirections = 1; + if(param_.direction == rnn_enum::kBidirectional){ + numDirections = 2; + } + int total_layers = numDirections * param_.num_layers; // double for bidirectional + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + if (param_.mode == rnn_enum::kLstm){ + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kCellStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + } + // infer weight size + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, + param_.mode); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + // infer output size + TShape oshape = dshape; + oshape[3] = numDirections * param_.state_size; + // infer output state size + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batchSize; + outStateShape[2] = param_.state_size; + + out_shape->clear(); + out_shape->push_back(oshape); + out_shape->push_back(outStateShape); + if (param_.mode == rnn_enum::kLstm) + out_shape->push_back(outStateShape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new RNNProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "RNN"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.mode == rnn_enum::kLstm) + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + else + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + RNNParam param_; +}; // class RNNProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc new file mode 100644 index 000000000000..40f7f705718d --- /dev/null +++ b/src/operator/rnn.cc @@ -0,0 +1,41 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file rnn.cc + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(RNNParam param, int dtype) { + LOG(FATAL) << "RNN is only available for gpu at the moment."; + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new RNNOp(param); + }); + return op; +} + +Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(RNNParam); + +MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) +.describe("Apply a recurrent layer to input.") +.add_argument("data", "Symbol", "Input data to RNN") +.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") +.add_arguments(RNNParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu new file mode 100644 index 000000000000..2cb482f591b2 --- /dev/null +++ b/src/operator/rnn.cu @@ -0,0 +1,33 @@ +/*! 
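 * (On the rnn.cc registration just above: the operator is exposed under the
 * name "RNN"; the add_argument names "hidden_state"/"cell_state" differ from
 * ListArguments(), which calls the same inputs "state" and "cell_state" --
 * the binding order follows ListArguments().)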
+ * Copyright (c) 2015 by Contributors + * \file rnn.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" +#include +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +#include "./cudnn_rnn-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(RNNParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNRNNOp(param); + }) +#else + 1; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new SpatialTransformerOp(param); + }) +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + return op; +} + +} // namespace op +} // namespace mxnet From 7025db87033b4846c046fdcea74eacafa54127e3 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 23:17:47 -0400 Subject: [PATCH 02/36] - removed unnecssary commented out code - fixed error in output shape inference --- src/operator/rnn-inl.h | 207 +++-------------------------------------- 1 file changed, 12 insertions(+), 195 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 3a538f001d5b..37150bf58878 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -29,7 +29,6 @@ namespace rnn_enum { } // A utility function to calculate input size - inline int rnn_single_param_size(int inputSize, int hiddenSize, int mode){ @@ -116,86 +115,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; -// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); - -// CHECK_EQ(in_data.size(), expected); -// CHECK_EQ(out_data.size(), 1); -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor out = out_data[rnn_enum::kOut].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(out.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(out.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), -// param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, -// gstride * (gid + 1)); -// tmpc = 
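// (In rnn.cu above: the stray "1;" and the SpatialTransformerOp fallback in
// the #else branch are leftovers from the file this was templated on and
// would not compile without cuDNN; patch 03 replaces the branch with a
// LOG(FATAL) explaining that RNN currently requires cuDNN.)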
dot(wmat[gid].T(), temp_dst[gid]); -// } -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// out.Slice(i, i + step) = pack_col2patch(temp_col, -// out.Slice(i, i + step).shape_, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1); // RNN only support dilate equals 1 -// } else { -// Shape<4> pshape = out.Slice(i, i + step).shape_; -// pshape[2] += 2 * param_.pad[0]; -// pshape[3] += 2 * param_.pad[1]; -// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, -// pshape, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1), // RNN only support dilate equals 1 -// out[i][0].shape_); -// } -// } -// if (!param_.no_bias) { -// // add bias, broadcast bias to dim 1: channel -// Tensor bias = in_data[rnn_enum::kBias].get(s); -// out += broadcast<1>(bias, out.shape_); -// } + // TODO: add MShadow implementation } virtual void Backward(const OpContext &ctx, @@ -207,125 +127,13 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful -// CHECK_EQ(out_grad.size(), 1); -// size_t expected = param_.no_bias == 0 ? 3 : 2; -// CHECK(in_data.size() == expected && in_grad.size() == expected); -// CHECK_EQ(req.size(), expected); -// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); -// // get data -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor grad = out_grad[rnn_enum::kOut].get(s); -// Tensor gdata = in_grad[rnn_enum::kData].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// Tensor gwmat = -// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(grad.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// if (i == 0) { -// Tensor tmp_gwmat = gwmat[gid]; -// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); -// } else { -// gwmat[gid] += dot(temp_dst[gid], 
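// (The (i == 0) ? Assign(...) : "+=" pattern in the deleted code above is
// MXNet's way of honouring the gradient request type: the first batch slice
// writes gwmat according to req[kWeight], and every later slice accumulates.)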
tmpc.T()); -// } -// } -// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// temp_dst[gid] = dot(wmat[gid], tmpc); -// } -// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, -// mshadow::Shape4(gdata.shape_[1], -// step, -// gdata.size(2), -// gdata.size(3)))); -// } -// } -// if (!param_.no_bias) { -// Tensor gbias = in_grad[rnn_enum::kBias].get(s); -// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); -// } + // TODO: add MShadow implementation } - private: -// inline index_t InitTemp(const mshadow::Shape<4> &ishape, -// const mshadow::Shape<4> &oshape) { -// const int ksize_y = param_.kernel[0]; -// const int ksize_x = param_.kernel[1]; -// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, -// oshape[2] * oshape[3]); -// shape_dstunit_ = mshadow::Shape3(param_.num_group, -// oshape[1] / param_.num_group, -// oshape[2] * oshape[3]); -// // See convolution for workspace calculations -// nstep_ = std::max( -// std::min( -// static_cast( -// param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), -// ishape[0]), -// 1U); - -// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], -// shape_colunit_[1] * nstep_); -// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * nstep_); -// index_t required_size = scol.Size() + sdst.Size(); -// CHECK_GE(param_.workspace, required_size) -// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" -// << "Given: " << param_.workspace * sizeof(DType); -// return required_size; -// } - private: RNNParam param_; }; // class RNNOp - - - template Operator* CreateOp(RNNParam param, int dtype); @@ -340,6 +148,14 @@ class RNNProp : public OperatorProperty { } } + std::vector ListOutputs() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"output", "final_state", "final_state_cell"}; + } else { + return {"output", "final_state"}; + } + } + void Init(const std::vector >& kwargs) override { param_.Init(kwargs); } @@ -386,7 +202,7 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[3] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; @@ -396,6 +212,7 @@ class RNNProp : public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; From e7c2e98df7aef890682890021c34ab05e4ac1157 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 10 Jul 2016 00:55:39 -0400 Subject: [PATCH 03/36] - some renaming - added cudnn destructors --- src/operator/cudnn_rnn-inl.h | 163 +++++++++++++++++++++++------------ src/operator/rnn-inl.h | 34 ++++---- src/operator/rnn.cc | 6 +- src/operator/rnn.cu | 5 +- 4 files changed, 129 insertions(+), 79 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 37895c2b2488..61d6d2c2f23a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -23,16 +23,16 @@ class CuDNNRNNOp : public Operator { // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: - rnn_mode_ = 
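// (Patch 02, visible above, also named the outputs via ListOutputs() --
// "output", "final_state", plus "final_state_cell" for LSTM -- and fixed the
// output shape inference to write numDirections * state_size into oshape[2],
// the feature axis of the rank-3 (seqLength, batch, feature) output, instead
// of the out-of-range oshape[3].)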
CUDNN_RNN_RELU; + mode_ = CUDNN_RNN_RELU; break; case rnn_enum::kRnnTanh: - rnn_mode_ = CUDNN_RNN_TANH; + mode_ = CUDNN_RNN_TANH; break; case rnn_enum::kLstm: - rnn_mode_ = CUDNN_LSTM; + mode_ = CUDNN_LSTM; break; case rnn_enum::kGru: - rnn_mode_ = CUDNN_GRU; + mode_ = CUDNN_GRU; break; default: LOG(FATAL) << "Not implmented"; @@ -40,22 +40,31 @@ class CuDNNRNNOp : public Operator { // RNN Direction switch (param_.direction) { case rnn_enum::kUnidirectional: - rnn_direction_ = CUDNN_UNIDIRECTIONAL; + direction_ = CUDNN_UNIDIRECTIONAL; break; case rnn_enum::kBidirectional: - rnn_direction_ = CUDNN_BIDIRECTIONAL; + direction_ = CUDNN_BIDIRECTIONAL; break; default: LOG(FATAL) << "Not implmented"; } } - // ~CuDNNRNNOp() { - // if (init_cudnn_) { - // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); - // } - // } + + ~CuDNNRNNOp() { + if (init_cudnn_) { + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + } + } + } virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -150,52 +159,96 @@ class CuDNNRNNOp : public Operator { const std::vector &in_data, const std::vector &out_data) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // if (!init_cudnn_) { - // init_cudnn_ = true; - // // Tensor data = in_data[st::kData].get(s); - // // Tensor out = out_data[st::kOut].get(s); - // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + #if CUDNN_MAJOR == 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(in_data.size(), 4); + CHECK_EQ(out_data.size(), 3); + } + else{ + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + } + + if (!init_cudnn_) { + init_cudnn_ = true; + + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), 
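// Descriptor lifecycle for the Init() code above (cuDNN v5): every
// cudnnCreate*Descriptor call must be paired with a cudnnDestroy*Descriptor,
// which is exactly what the new ~CuDNNRNNOp() does, guarded by init_cudnn_
// so teardown only happens if Init() ever ran.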
CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } - // } + // Create tensors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + } + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + } } - - bool init_cudnn_; + cudnnDataType_t dtype_; + bool init_cudnn_; cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t rnn_mode_; - cudnnDirectionMode_t rnn_direction_; - cudnnRNNInputMode_t rnn_input_mode_; - cudnnDropoutDescriptor_t rnn_dropout_; - // cudnnTensorDescriptor_t in_desc_; - // cudnnTensorDescriptor_t out_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + + cudnnTensorDescriptor_t x_desc_; + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t y_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnFilterDescriptor_t w_desc_; + #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; #endif diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 37150bf58878..2729a2ff49cc 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,16 +21,16 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; - enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } // A utility function to calculate input size inline int rnn_single_param_size(int inputSize, - int hiddenSize, + int hiddenSize, int mode){ int size = hiddenSize * 
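// Stride layout used for the tensor descriptors above: a packed rank-3
// tensor of shape (d0, d1, d2) has strides {d1 * d2, d2, 1}; e.g. data of
// shape (seqLength, batch, inputSize) gets {batch * inputSize, inputSize, 1},
// which is what cudnnSetTensorNdDescriptor expects for a fully-packed layout.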
(hiddenSize + inputSize + 2); // Different RNN's have different num weights @@ -52,10 +52,10 @@ inline int rnn_single_param_size(int inputSize, return size; } -inline int rnn_param_size(int layerNum, +inline int rnn_param_size(int layerNum, int inputSize, - int hiddenSize, - int direction, + int hiddenSize, + int direction, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); @@ -194,26 +194,26 @@ class RNNProp : public OperatorProperty { Shape3(total_layers, batchSize, param_.state_size)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, - param_.direction, + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; oshape[2] = numDirections * param_.state_size; - // infer output state size + // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batchSize; outStateShape[2] = param_.state_size; - out_shape->clear(); + out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -236,7 +236,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; } @@ -256,9 +256,9 @@ class RNNProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { if (param_.mode == rnn_enum::kLstm) - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; else - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; } std::vector ForwardResource( diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 40f7f705718d..2a485e5ef224 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -33,9 +33,9 @@ DMLC_REGISTER_PARAMETER(RNNParam); MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") -.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") -.add_arguments(RNNParam::__FIELDS__()); +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 2cb482f591b2..fb90daf19b41 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -21,10 +21,7 @@ Operator* CreateOp(RNNParam param, int dtype) { op = new CuDNNRNNOp(param); }) #else - 1; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new SpatialTransformerOp(param); - }) + LOG(FATAL) << "RNN is only 
available for cuDNN at the moment."; #endif // MXNET_USE_CUDNN && CUDNN_MAJOR return op; } From 6af1646bb730b966f327e569e46ab6871c859b32 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 17 Jul 2016 16:01:48 -0400 Subject: [PATCH 04/36] - added dropout --- src/operator/cudnn_rnn-inl.h | 166 +++++++++++++++++++++++++++-------- src/operator/rnn-inl.h | 5 ++ 2 files changed, 135 insertions(+), 36 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 61d6d2c2f23a..90bf5cbc9bc7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -20,6 +20,8 @@ class CuDNNRNNOp : public Operator { this->param_ = param; init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -72,9 +74,48 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + // get input + output tensors + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + // if (param_.mode == rnn_enum::kLstm){ + // CHECK_EQ(in_data.size(), 4); + // CHECK_EQ(out_data.size(), 3); + // } + // else{ + // CHECK_EQ(in_data.size(), 3); + // CHECK_EQ(out_data.size(), 2); + // } + // // Get tensors + // + // Tensor data = in_data[rnn_enum::kData].get(s); + // Tensor params = in_data[rnn_enum::kParams].get(s); + // Tensor state = in_data[rnn_enum::kStateIn].get(s); + + // Tensor out = out_data[rnn_enum::kOut].get(s); + // Tensor out_state = out_data[rnn_enum::kOut].get(s); + + // if (param_.mode == rnn_enum::kLstm){ + // Tensor cell_state = + // in_data[rnn_enum::kCellStateIn].get(s); + // Tensor out_cell_state = + // in_data[rnn_enum::kCellStateOut].get(s); + // } + // // Tensor data = in_data[st::kData].get(s); // Tensor out = out_data[st::kOut].get(s); // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); @@ -162,8 +203,7 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ CHECK_EQ(in_data.size(), 4); CHECK_EQ(out_data.size(), 3); } @@ -171,64 +211,118 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(in_data.size(), 3); CHECK_EQ(out_data.size(), 2); } - if (!init_cudnn_) { init_cudnn_ = true; - + // get input + output tensors Tensor data = in_data[rnn_enum::kData].get(s); Tensor params = in_data[rnn_enum::kParams].get(s); Tensor state = in_data[rnn_enum::kStateIn].get(s); Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ Tensor cell_state = in_data[rnn_enum::kCellStateIn].get(s); Tensor out_cell_state = in_data[rnn_enum::kCellStateOut].get(s); } + // Create descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), 
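// The dropout descriptor created below still needs a states buffer before
// cudnnSetDropoutDescriptor can be called; a minimal sketch, assuming a
// hypothetical GPU allocation helper and a seed_ member:
//
//   size_t state_bytes;
//   CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, &state_bytes),
//            CUDNN_STATUS_SUCCESS);
//   void *states = gpu_alloc(state_bytes);  // hypothetical helper
//   CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_,
//                                      param_.p, states, state_bytes,
//                                      seed_), CUDNN_STATUS_SUCCESS);
//
// cuDNN does not take ownership of the buffer, so it must outlive the
// descriptor.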
CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // Create tensors CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); } + // set dropout + // cudnnSetDropoutDescriptor(dropout_desc_, + // s->dnn_handle_, + // param_.p, + // void * states, + // size_t stateSizeInBytes, + // unsigned long long seed) + // set RNN + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Set params + int dim_params[3] = {params.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_params + ), CUDNN_STATUS_SUCCESS); + // Get strides + int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; + int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; + int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; + int stride_out_state[3] = + {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; + + // cuDNN needs int arrays for dim, not index_t array used in Shape + int dim_data[3]; + int dim_state[3]; + int dim_out[3]; + int dim_out_state[3]; + std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); + std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); + std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); + std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } + // set the tensor descriptors + CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + dtype_, + 3, + dim_data, + stride_data + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + dtype_, + 3, + dim_out, + stride_out + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + // LSTM has two extra descriptors + if 
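// (cudnnSetRNNDescriptor above ties the pieces together: hiddenSize =
// param_.state_size, numLayers = param_.num_layers, the dropout descriptor,
// CUDNN_LINEAR_INPUT, the direction enum, the cell type chosen in the
// constructor, and the math dtype.)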
(param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + } } } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 2729a2ff49cc..d81ed1637756 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -74,6 +74,7 @@ struct RNNParam : public dmlc::Parameter { bool batch_first; int direction; int mode; + float p; DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -96,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { .add_enum("lstm", rnn_enum::kLstm) .add_enum("gru", rnn_enum::kGru) .describe("the type of RNN to compute"); + + DMLC_DECLARE_FIELD(p).set_default(0.) + .set_range(0, 1) + .describe("Fraction of the input that gets dropped out at training time"); } }; From 050ca51ce382bc88ce56d6f2d198d7d1ae90739c Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 00:28:48 -0400 Subject: [PATCH 05/36] - major refactor - completed forward evaluation --- src/operator/cudnn_rnn-inl.h | 481 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 39 ++- 2 files changed, 277 insertions(+), 243 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 90bf5cbc9bc7..134044321ad7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2016 by Contributors - * \file cudnn_spatial_transformer-inl.h + * \file cudnn_rnn-inl.h * \brief * \author Sebastian Bodenstein */ @@ -21,7 +21,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -40,31 +40,29 @@ class CuDNNRNNOp : public Operator { LOG(FATAL) << "Not implmented"; } // RNN Direction - switch (param_.direction) { - case rnn_enum::kUnidirectional: - direction_ = CUDNN_UNIDIRECTIONAL; - break; - case rnn_enum::kBidirectional: - direction_ = CUDNN_BIDIRECTIONAL; - break; - default: - LOG(FATAL) << "Not implmented"; - } + direction_ = param_.bidirectional ? 
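// Patch 05 replaces the direction enum with a bool, so the cuDNN mapping
// collapses to this one ternary. The dropout fraction p added to RNNParam in
// patch 04 defaults to 0 (no dropout) and is restricted to [0, 1] by
// set_range.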
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } ~CuDNNRNNOp() { if (init_cudnn_) { - CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + for(int i = 0; i < x_desc_vec_.size(); ++i){ + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + } CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); - } } } @@ -74,77 +72,83 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
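// param_.lstm_q_ is a cached is-LSTM flag used for the expected tensor
// counts: LSTM carries an extra cell-state input and output, hence
// 4 inputs / 3 outputs versus 3 / 2 for the other cell types.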
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); Stream *s = ctx.get_stream(); - if(!init_cudnn_){ - Init(s, in_data, out_data); - } // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kStateIn].get(s); - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor hy = out_data[rnn_enum::kStateOut].get(s); + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } - // if (param_.mode == rnn_enum::kLstm){ - // CHECK_EQ(in_data.size(), 4); - // CHECK_EQ(out_data.size(), 3); - // } - // else{ - // CHECK_EQ(in_data.size(), 3); - // CHECK_EQ(out_data.size(), 2); - // } - // // Get tensors - // - // Tensor data = in_data[rnn_enum::kData].get(s); - // Tensor params = in_data[rnn_enum::kParams].get(s); - // Tensor state = in_data[rnn_enum::kStateIn].get(s); - // Tensor out = out_data[rnn_enum::kOut].get(s); - // Tensor out_state = out_data[rnn_enum::kOut].get(s); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } - // if (param_.mode == rnn_enum::kLstm){ - // Tensor cell_state = - // in_data[rnn_enum::kCellStateIn].get(s); - // Tensor out_cell_state = - // in_data[rnn_enum::kCellStateOut].get(s); - // } - // - // Tensor data = in_data[st::kData].get(s); - // Tensor out = out_data[st::kOut].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); - // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // if (!init_cudnn_) { - // Init(s, in_data, out_data); - // } - // CHECK_EQ(data.CheckContiguous(), true); - // CHECK_EQ(out.CheckContiguous(), true); - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, - // st_desc_, - // loc.dptr_, - // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); - // } - // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // grid.dptr_, - // &beta, - // out_desc_, - // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + if (ctx.is_train) { + // training mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_ + reserve_space_size_), s); + CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + } else { + // inference mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_), 
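// Workspace layout in the training branch above: one flat buffer of
// workspace_size_ + reserve_space_size_ elements is requested, and cuDNN is
// handed the two halves separately -- [0, workspace_size_) is reusable
// scratch, while the reserve tail must stay intact between forward training
// and the matching backward calls. The inference branch here only needs the
// scratch part.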
s); + CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + } } // virtual void Backward(const OpContext &ctx, @@ -155,46 +159,12 @@ class CuDNNRNNOp : public Operator { const std::vector &in_grad, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // CHECK_EQ(out_grad.size(), 1); - // Stream *s = ctx.get_stream(); - // Tensor data = in_data[st::kData].get(s); - // Tensor grad = out_grad[st::kOut].get(s); - // Tensor ddata = in_grad[st::kData].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); - // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in - // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // typename DataType::ScaleType alpha_dgrid = 1.0f; - // typename DataType::ScaleType beta_dgrid = 0.0f; - // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // &beta, - // in_desc_/*reuse in_desc_*/, - // ddata.dptr_/*output*/, - // &alpha_dgrid, - // out_desc_/*reuse out_desc_*/, - // grad.dptr_, - // grid.dptr_, - // &beta_dgrid, - // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, - // st_desc_, - // grid.dptr_, - // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); - // } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data.size(), out_expected); } - // private: inline void Init(mshadow::Stream *s, const std::vector &in_data, @@ -203,126 +173,193 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - if(param_.mode == rnn_enum::kLstm){ - CHECK_EQ(in_data.size(), 4); - CHECK_EQ(out_data.size(), 3); - } - else{ - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 2); - } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
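
The in_expected/out_expected checks encode the operator's I/O arity, which follows directly from the enums in rnn-inl.h:

    // LSTM (lstm_q_): in_data  = {kData, kParams, kStateIn, kCellStateIn} -> 4
    //                 out_data = {kOut, kStateOut, kCellStateOut}         -> 3
    // RNN/GRU:        in_data  = {kData, kParams, kStateIn}               -> 3
    //                 out_data = {kOut, kStateOut}                        -> 2
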
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = x.shape_[0]; + dimA[1] = x.shape_[2]; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = x.shape_[0]; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - if(param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; - // Create descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 
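
Each per-time-step descriptor uses a fully packed layout. A minimal self-contained sketch of the dimA/strideA convention (the sizes are illustrative, not taken from the patch):

    int dimA[3]    = {64, 128, 1};                     // e.g. [batch, input features, 1]
    int strideA[3] = {dimA[1] * dimA[2], dimA[2], 1};  // = {128, 1, 1}: each stride is the
                                                       // product of all faster-varying dims
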
2 : 1); + dimA[1] = x.shape_[0]; //minibatch + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcy_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); - } - // set dropout - // cudnnSetDropoutDescriptor(dropout_desc_, - // s->dnn_handle_, - // param_.p, - // void * states, - // size_t stateSizeInBytes, - // unsigned long long seed) - // set RNN - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); - // Set params - int dim_params[3] = {params.shape_[0], 1, 1}; - CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, - format_, 3, - dim_params + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // Get strides - int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; - int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; - int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; - int stride_out_state[3] = - {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; - - // cuDNN needs int arrays for dim, not index_t array used in Shape - int dim_data[3]; - int dim_state[3]; - int dim_out[3]; - int dim_out_state[3]; - std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); - std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); - std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); - std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - - // set the tensor descriptors - CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, - dim_data, - stride_data + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, - dim_state, - stride_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, - dim_out, - stride_out + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, - dim_out_state, - stride_out_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // LSTM has two extra descriptors - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dim_state, 
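
Worked example of the state-tensor geometry set here (illustrative sizes); the leading axis doubles under bidirection because each direction keeps its own per-layer state:

    // num_layers = 2, bidirectional = true, batch = 32, state_size = 100:
    // dimA    = {2 * 2, 32, 100}   // [layers * directions, batch, hidden]
    // strideA = {3200, 100, 1}     // packed, as with the per-step descriptors
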
- stride_state - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dim_out_state, - stride_out_state + CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + + // Get temp space sizes + CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + workspace_size_ = workspace_byte_ / sizeof(DType) + 1; + reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1; + + // Set param descriptors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); + int dim_w[3] = {w.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_ ), CUDNN_STATUS_SUCCESS); - } + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.pkeep_, // keep probability + NULL, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + } } @@ -333,15 +370,17 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; + unsigned long long seed_ = 4553; + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, reserve_space_size_, dropout_size_; - cudnnTensorDescriptor_t x_desc_; - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t y_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t w_desc_, dw_desc_; #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; @@ -352,4 +391,4 @@ class CuDNNRNNOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ +#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index d81ed1637756..53189d100ef2 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -24,7 +24,6 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, 
kGru}; - enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } @@ -55,26 +54,27 @@ inline int rnn_single_param_size(int inputSize, inline int rnn_param_size(int layerNum, int inputSize, int hiddenSize, - int direction, + bool bidirectional, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(direction == rnn_enum::kUnidirectional) - size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); - else // bidirectional case: input size increases by 2 + if(bidirectional) size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + else + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; } struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - uint64_t workspace; bool batch_first; - int direction; + bool bidirectional; int mode; - float p; + float p, pkeep_; + int seq_length_; + bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -83,13 +83,8 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); - DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) - .describe("Tmp workspace for RNN (MB)"); - - DMLC_DECLARE_FIELD(direction) - .add_enum("unidirectional", rnn_enum::kUnidirectional) - .add_enum("bidirectional", rnn_enum::kBidirectional) - .describe("specifies the recurrence pattern"); + DMLC_DECLARE_FIELD(bidirectional).set_default(false) + .describe("whether to use bidirectional recurrent layers"); DMLC_DECLARE_FIELD(mode) .add_enum("rnn_relu", rnn_enum::kRnnRelu) @@ -108,9 +103,12 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - this->param_ = p; // convert MBytes first to Bytes and then to elements. - param_.workspace = (param_.workspace << 20) / sizeof(real_t); + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, @@ -185,10 +183,7 @@ class RNNProp : public OperatorProperty { // Infer hidden state + cell state int batchSize = dshape[0]; int inputSize = dshape[2]; - int numDirections = 1; - if(param_.direction == rnn_enum::kBidirectional){ - numDirections = 2; - } + int numDirections = param_.bidirectional ? 
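
A worked instance of rnn_single_param_size: the hiddenSize * (hiddenSize + inputSize + 2) term counts one gate's hidden-to-hidden and input-to-hidden weights plus two bias vectors, and the mode multiplier is the gate count (illustrative numbers, not from the patch):

    // inputSize = 10, hiddenSize = 20, mode = kLstm (4 gates):
    // 4 * 20 * (20 + 10 + 2) = 2560 parameters for the first layer
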
2 : 1;
     int total_layers = numDirections * param_.num_layers;  // double for bidirectional
     SHAPE_ASSIGN_CHECK(*in_shape,
                        rnn_enum::kStateIn,
@@ -202,7 +197,7 @@ class RNNProp : public OperatorProperty {
     int weight_size = rnn_param_size(param_.num_layers,
                                      inputSize,
                                      param_.state_size,
-                                     param_.direction,
+                                     param_.bidirectional,
                                      param_.mode);
     SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size));
     // infer output size

From f81d8e97c119a0976d1827fb4f056dc40b20515a Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Mon, 18 Jul 2016 02:25:19 -0400
Subject: [PATCH 06/36] - added parameter size test
 - fixed bug where cudnnGetRNNParamsSize needs to be called after
   cudnnSetRNNDescriptor

---
 src/operator/cudnn_rnn-inl.h | 64 +++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 134044321ad7..3a40b2f67fd7 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -150,7 +150,7 @@ class CuDNNRNNOp : public Operator {
                                       ), CUDNN_STATUS_SUCCESS);
     }
   }
-  //
+
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
@@ -182,6 +182,9 @@ class CuDNNRNNOp : public Operator {
     // get input + output tensors
     Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
     Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
+
+    param_.seq_length_ = x.shape_[1];
+
     // Tensor Descriptors
     std::vector<cudnnTensorDescriptor_t> x_vec(param_.seq_length_);
     std::vector<cudnnTensorDescriptor_t> y_vec(param_.seq_length_);
@@ -305,7 +308,29 @@ class CuDNNRNNOp : public Operator {
                                           strideA
                                           ), CUDNN_STATUS_SUCCESS);

-      // Get temp space sizes
+      // Create Dropout descriptors
+      CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
+                                         &dropout_byte_
+                                         ), CUDNN_STATUS_SUCCESS);
+      dropout_size_ = dropout_byte_ / sizeof(DType);
+      CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
+                                         s->dnn_handle_,
+                                         param_.pkeep_,  // keep probability
+                                         NULL,
+                                         dropout_byte_,
+                                         seed_), CUDNN_STATUS_SUCCESS);
+      // RNN descriptors
+      CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_,
+                                     param_.state_size,
+                                     param_.num_layers,
+                                     dropout_desc_,
+                                     input_mode_,
+                                     direction_,
+                                     mode_,
+                                     dtype_), CUDNN_STATUS_SUCCESS);
+      // Get temp space sizes
       CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_,
                                         rnn_desc_,
                                         param_.seq_length_,
@@ -318,8 +343,17 @@ class CuDNNRNNOp : public Operator {
                                               x_desc_vec_.data(),
                                               &reserve_space_byte_
                                               ), CUDNN_STATUS_SUCCESS);
-      workspace_size_ = workspace_byte_ / sizeof(DType) + 1;
-      reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1;
+      workspace_size_ = workspace_byte_ / sizeof(DType);
+      reserve_space_size_ = reserve_space_byte_ / sizeof(DType);
+
+      // check that number of params are correct
+      size_t cudnn_param_size;
+      CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_,
+                                     rnn_desc_,
+                                     x_desc_vec_[0],
+                                     &cudnn_param_size,
+                                     dtype_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size);

       // Set param descriptors
       CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS);
@@ -337,28 +371,6 @@ class CuDNNRNNOp : public Operator {
                                           3,
                                           dim_w
                                           ), CUDNN_STATUS_SUCCESS);
-      // Create Dropout descriptors
-      CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
-                                         &dropout_byte_
-                                         ), CUDNN_STATUS_SUCCESS);
-      dropout_size_ = dropout_byte_ / sizeof(DType);
-
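
The substance of this commit in one place: the size queries read rnn_desc_, so the descriptor setup has to precede them. The required call order (all calls shown appear in the hunk above):

    // 1. cudnnSetDropoutDescriptor(dropout_desc_, ...);              // consumed by step 2
    // 2. cudnnSetRNNDescriptor(rnn_desc_, ..., dropout_desc_, ...);
    // 3. cudnnGetRNNWorkspaceSize / cudnnGetRNNTrainingReserveSize /
    //    cudnnGetRNNParamsSize(..., rnn_desc_, ...);                 // the moved call
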
CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, - s->dnn_handle_, - param_.pkeep_, // keep probability - NULL, - dropout_byte_, - seed_), CUDNN_STATUS_SUCCESS); - // RNN descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); } } From 812b7d4a80c5efdf4e83a469ef55e85a7f24a583 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 10:32:55 -0400 Subject: [PATCH 07/36] - checks for contiguous input tensors - more consistent param names - removed 'batch_first' option for now. Might add it later again --- src/operator/cudnn_rnn-inl.h | 119 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 37 ++++++----- 2 files changed, 82 insertions(+), 74 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 3a40b2f67fd7..8c6eae9dc984 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -92,15 +92,24 @@ class CuDNNRNNOp : public Operator { cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + if(!init_cudnn_){ Init(s, in_data, out_data); } + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + if (ctx.is_train) { - // training mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_ + reserve_space_size_), s); CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -125,9 +134,6 @@ class CuDNNRNNOp : public Operator { ), CUDNN_STATUS_SUCCESS); } else { // inference mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_), s); CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -182,8 +188,9 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - - param_.seq_length_ = x.shape_[1]; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; // Tensor Descriptors std::vector x_vec(param_.seq_length_); @@ -193,49 +200,51 @@ class CuDNNRNNOp : public Operator { int dimA[3]; int strideA[3]; for (int i = 0; i < param_.seq_length_; i++) { - CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); - - dimA[0] = x.shape_[0]; - dimA[1] = x.shape_[2]; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + 
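
The new shape bookkeeping assumes the time-major layout that cuDNN's RNN API works in; in summary:

    // x.shape_ = [seq_length, batch_size, input_size]   (time-major)
    // param_.seq_length_ = x.shape_[0];  // also the number of per-step descriptors
    // param_.batch_size_ = x.shape_[1];
    // param_.input_size_ = x.shape_[2];
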
dimA[2] = 1; + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - dimA[0] = x.shape_[0]; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } x_desc_vec_ = x_vec; y_desc_vec_ = y_vec; @@ -243,9 +252,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = x.shape_[0]; //minibatch - dimA[2] = param_.state_size; + dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size_; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -323,8 +332,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, + param_.state_size_, + param_.num_layers_, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 53189d100ef2..a4252b7e8fe5 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,20 +67,19 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size; - uint32_t num_layers; - bool batch_first; + uint32_t state_size_; + uint32_t num_layers_; bool bidirectional; int mode; float p, pkeep_; - int seq_length_; + int seq_length_, batch_size_, input_size_; bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size) + DMLC_DECLARE_FIELD(state_size_) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers) + DMLC_DECLARE_FIELD(num_layers_) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -179,35 +178,35 @@ class RNNProp : public OperatorProperty { const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; - // Infer hidden state + cell state - int batchSize = dshape[0]; - int inputSize = dshape[2]; + << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; + // Get input sizes + int batch_size = 
dshape[1]; + int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional + int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kCellStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, + int weight_size = rnn_param_size(param_.num_layers_, + input_size, + param_.state_size_, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size_; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; - outStateShape[1] = batchSize; - outStateShape[2] = param_.state_size; + outStateShape[1] = batch_size; + outStateShape[2] = param_.state_size_; out_shape->clear(); out_shape->push_back(oshape); From a7f64e243dc7401a341e9b45b80941eeb4333d51 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Tue, 19 Jul 2016 23:06:39 +0200 Subject: [PATCH 08/36] - fixed input names --- src/operator/rnn-inl.h | 41 +++++++++++++++++++++++++---------------- src/operator/rnn.cc | 4 ++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index a4252b7e8fe5..98f8a5953d70 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,18 +144,28 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "weight", "state", "cell_state"}; + return {"data", "parameters", "state", "cell_state"}; } else { - return {"data", "weight", "state"}; + return {"data", "parameters", "state"}; } } std::vector ListOutputs() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"output", "final_state", "final_state_cell"}; - } else { - return {"output", "final_state"}; - } + if (param_.mode == rnn_enum::kLstm) + return {"output", "state", "state_cell"}; + else + return {"output", "state"}; + } + + int NumOutputs() const override { + if (param_.mode == rnn_enum::kLstm) + return 3; + else + return 2; + } + + int NumVisibleOutputs() const override { + return 1; } void Init(const std::vector >& kwargs) override { @@ -171,15 +181,15 @@ class RNNProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + CHECK_EQ(in_shape->size(), 4) << "Input:[data, parameters, state, cell_state]"; } else { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, parameters, state]"; } const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; - // Get input sizes + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // 
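
For reference, the complete set of shapes this InferShape establishes, with D = 2 if bidirectional else 1 and L = num_layers_:

    // data                 : [seq_length, batch, input_size]
    // state (+ cell state) : [D * L, batch, state_size_]
    // parameters           : [rnn_param_size(...)]   (one flat vector)
    // output               : [seq_length, batch, D * state_size_]
    // output state(s)      : [D * L, batch, state_size_]
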
data: [sequence len, batch, input dimension] int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; @@ -192,17 +202,16 @@ class RNNProp : public OperatorProperty { rnn_enum::kCellStateIn, Shape3(total_layers, batch_size, param_.state_size_)); } - // infer weight size - int weight_size = rnn_param_size(param_.num_layers_, + // calculate parameter vector length + int param_size = rnn_param_size(param_.num_layers_, input_size, param_.state_size_, param_.bidirectional, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); - // infer output size + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + // output: [sequence len, batch, output size] TShape oshape = dshape; oshape[2] = numDirections * param_.state_size_; - // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 2a485e5ef224..5e3b2b8894af 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -20,7 +20,7 @@ Operator *CreateOp(RNNParam param, int dtype) { } Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); @@ -34,7 +34,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") -.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("state", "Symbol", "initial hidden state of the RNN") .add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op From e311b8691d08160342a83cd698a2b95dcba0e53f Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 12:50:59 +0200 Subject: [PATCH 09/36] - added backward method --- src/operator/cudnn_rnn-inl.h | 95 ++++++++++++++++++++++++++++++++++-- src/operator/rnn-inl.h | 8 +-- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 8c6eae9dc984..6a642f6428f8 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -80,7 +80,7 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kStateIn].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor hy = out_data[rnn_enum::kStateOut].get(s); @@ -88,8 +88,8 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; - cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; } CHECK_EQ(x.CheckContiguous(), true); @@ -169,7 +169,94 @@ class CuDNNRNNOp : public Operator { size_t out_expected = param_.lstm_q_ ? 
3 : 2; CHECK_EQ(in_data.size(), in_expected); CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_grad.size(), in_expected); + CHECK_EQ(out_grad.size(), out_expected); + + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor hy = in_data[rnn_enum::kStateOut].get(s); + Tensor dhy = out_grad[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + + DType * cx_ptr = NULL; + // DType * cy_ptr = NULL; + DType * dcx_ptr = NULL; + DType * dcy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm){ + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + } + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + + CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy.dptr_, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); } private: inline void Init(mshadow::Stream *s, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 98f8a5953d70..fd68fd628432 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,8 +21,8 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; - enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNOpInputs {kData, kParams, kState, kStateCell}; + enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNOpResource {kTempSpace}; } @@ -195,11 +195,11 @@ class RNNProp : public OperatorProperty { int numDirections = param_.bidirectional ? 
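
A sketch of the data flow the two cuDNN calls above implement; note that cudnnRNNBackwardWeights accumulates into dw rather than overwriting it, and both calls must see the same reserve space the training-mode forward pass filled:

    // cudnnRNNBackwardData:    y, dy, dhy, dcy, w, hx, cx, reserve  ->  dx, dhx, dcx
    // cudnnRNNBackwardWeights: x, hx, y, workspace, reserve         ->  dw (accumulated)
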
2 : 1; int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateIn, + rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kCellStateIn, + rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size_)); } // calculate parameter vector length From ccb1ae53b6f460ec9ab93d390a0f31ba0a671003 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 13:16:50 +0200 Subject: [PATCH 10/36] - small fix for in/out names --- src/operator/rnn-inl.h | 8 ++++---- src/operator/rnn.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index fd68fd628432..137bebed5c06 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,7 +144,7 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "cell_state"}; + return {"data", "parameters", "state", "state_cell"}; } else { return {"data", "parameters", "state"}; } @@ -164,9 +164,9 @@ class RNNProp : public OperatorProperty { return 2; } - int NumVisibleOutputs() const override { - return 1; - } + // int NumVisibleOutputs() const override { + // return 1; + // } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 5e3b2b8894af..337410c8ddc1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -35,7 +35,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet From 9b5e38382d94f667700297d5ef56cb61664581cf Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 14:24:57 +0200 Subject: [PATCH 11/36] - fixed bug: parameters can't have underscore --- src/operator/cudnn_rnn-inl.h | 10 +++++----- src/operator/rnn-inl.h | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 6a642f6428f8..d696ead26255 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -314,7 +314,7 @@ class CuDNNRNNOp : public Operator { strideA ), CUDNN_STATUS_SUCCESS); dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; dimA[2] = 1; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; @@ -339,9 +339,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size_; + dimA[2] = param_.state_size; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -419,8 +419,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size_, - param_.num_layers_, + param_.state_size, + param_.num_layers, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 137bebed5c06..ed0cf0db84b1 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,8 +67,8 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size_; - uint32_t num_layers_; + uint32_t state_size; + uint32_t num_layers; bool bidirectional; int mode; float p, pkeep_; @@ -76,10 +76,10 @@ struct RNNParam : public dmlc::Parameter { bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size_) + DMLC_DECLARE_FIELD(state_size) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers_) + DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -193,29 +193,29 @@ class RNNProp : public OperatorProperty { int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers_; // double for bidirectional + int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); } // calculate parameter vector length - int param_size = rnn_param_size(param_.num_layers_, + int param_size = rnn_param_size(param_.num_layers, input_size, - param_.state_size_, + param_.state_size, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); // output: [sequence len, batch, output size] TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size_; + oshape[2] = numDirections * param_.state_size; TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; - outStateShape[2] = param_.state_size_; + outStateShape[2] = param_.state_size; out_shape->clear(); out_shape->push_back(oshape); From 8997a5d96e1aed7927f5e4cfd10e481d5e968bec Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 16:36:07 +0200 Subject: [PATCH 12/36] - fixed off-by-two error in weight shape inference for bidirectional net - moved calculated param to cudnn_rnn-inl.h --- src/operator/cudnn_rnn-inl.h | 7 ++++++- src/operator/rnn-inl.h | 10 +++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index d696ead26255..1fd7afc90e3a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -41,6 +41,12 @@ class CuDNNRNNOp : public Operator { } // RNN Direction direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Other + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } ~CuDNNRNNOp() { @@ -212,7 +218,6 @@ class CuDNNRNNOp : public Operator { Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); - CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, rnn_desc_, param_.seq_length_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index ed0cf0db84b1..b51216bf9d4d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -59,8 +59,10 @@ inline int rnn_param_size(int layerNum, // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(bidirectional) + if(bidirectional){ size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + size *= 2; + } else size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; @@ -102,12 +104,6 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - // convert MBytes first to Bytes and then to elements. - param_.pkeep_ = 1.0f - param_.p; - if(param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, From 77bf61c2173d2e1f73504393e21feee8010902c9 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:24:56 +0200 Subject: [PATCH 13/36] - added option to control num outputs --- src/operator/cudnn_rnn-inl.h | 5 ++++- src/operator/rnn-inl.h | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 1fd7afc90e3a..0c943bab7da0 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -7,9 +7,12 @@ #ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#include #include +#include +#include +#include #include "./rnn-inl.h" + namespace mxnet { namespace op { #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index b51216bf9d4d..d036e299e519 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -71,7 +71,7 @@ inline int rnn_param_size(int layerNum, struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - bool bidirectional; + bool bidirectional, state_outputs; int mode; float p, pkeep_; int seq_length_, batch_size_, input_size_; @@ -97,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) .describe("Fraction of the input that gets dropped out at training time"); + + DMLC_DECLARE_FIELD(state_outputs).set_default(false) + .describe("Whether to have the states as symbol outputs."); + } }; @@ -160,9 +164,11 @@ class RNNProp : public OperatorProperty { return 2; } - // int NumVisibleOutputs() const override { - // return 1; - // } + int NumVisibleOutputs() const override { + int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; + int num_outputs = param_.state_outputs ? 
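
Worked example of the corrected count for a 2-layer bidirectional LSTM with inputSize = 10, hiddenSize = 20 (illustrative numbers): layers above the first see the concatenated outputs of both directions, and the two directions hold disjoint weights, hence the final doubling:

    // per direction:   4*20*(20+10+2) + 4*20*(20+2*20+2) = 2560 + 4960 = 7520
    // both directions: 7520 * 2 = 15040 parameters
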
(mode_num + 1) : 1; + return num_outputs; + } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); @@ -193,11 +199,11 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size)); - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size)); - } + // calculate parameter vector length int param_size = rnn_param_size(param_.num_layers, input_size, @@ -217,7 +223,7 @@ class RNNProp : public OperatorProperty { out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if(param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -240,6 +246,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; From 62d6f8e33b7d4b01178d85541a328b424418d462 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:52:17 +0200 Subject: [PATCH 14/36] - removed lint --- src/operator/cudnn_rnn-inl.h | 137 +++++++++++++++-------------------- src/operator/rnn-inl.h | 32 ++++---- src/operator/rnn.cc | 5 +- src/operator/rnn.cu | 2 +- 4 files changed, 77 insertions(+), 99 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 0c943bab7da0..f3bfc1eac1fe 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -24,7 +24,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -46,7 +46,7 @@ class CuDNNRNNOp : public Operator { direction_ = param_.bidirectional ? 
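
The visible-output logic above enumerates to:

    // state_outputs == false          -> 1: {output}
    // state_outputs == true, RNN/GRU  -> 2: {output, state}
    // state_outputs == true, LSTM     -> 3: {output, state, state_cell}
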
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; // Other param_.pkeep_ = 1.0f - param_.p; - if(param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) param_.lstm_q_ = true; else param_.lstm_q_ = false; @@ -54,7 +54,7 @@ class CuDNNRNNOp : public Operator { ~CuDNNRNNOp() { if (init_cudnn_) { - for(int i = 0; i < x_desc_vec_.size(); ++i){ + for (int i = 0; i < x_desc_vec_.size(); ++i) { CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); @@ -63,18 +63,18 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); } } - + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, @@ -96,7 +96,7 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; } @@ -107,9 +107,9 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(y.CheckContiguous(), true); CHECK_EQ(hy.CheckContiguous(), true); - if(!init_cudnn_){ + if (!init_cudnn_) { Init(s, in_data, out_data); - } + } // Get temp space int temp_size = workspace_size_; @@ -117,8 +117,8 @@ class CuDNNRNNOp : public Operator { Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); - - if (ctx.is_train) { + + if (ctx.is_train) { CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -139,8 +139,7 @@ class CuDNNRNNOp : public Operator { temp_space.dptr_, workspace_byte_, temp_space.dptr_ + workspace_size_, - reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); + reserve_space_byte_), CUDNN_STATUS_SUCCESS); } else { // inference mode CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, @@ -161,11 +160,10 @@ class CuDNNRNNOp : public Operator { cy_desc_, cy_ptr, temp_space.dptr_, - workspace_byte_ - ), CUDNN_STATUS_SUCCESS); + workspace_byte_), CUDNN_STATUS_SUCCESS); } } - + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, @@ -198,7 +196,7 @@ class CuDNNRNNOp : public Operator { // DType * cy_ptr = NULL; DType * dcx_ptr = NULL; DType * dcy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; dcx_ptr = 
(in_grad[rnn_enum::kStateCell].get(s)).dptr_; @@ -211,9 +209,9 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(y.CheckContiguous(), true); CHECK_EQ(hy.CheckContiguous(), true); - if(!init_cudnn_){ + if (!init_cudnn_) { Init(s, in_data, out_data); - } + } // Get temp space int temp_size = workspace_size_; @@ -247,25 +245,24 @@ class CuDNNRNNOp : public Operator { temp_space.dptr_, workspace_byte_, temp_space.dptr_ + workspace_size_, - reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, hx_desc_, - hx.dptr_, - y_desc_vec_.data(), + hx.dptr_, + y_desc_vec_.data(), y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, dw.dptr_, - temp_space.dptr_ + workspace_size_, - reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); + temp_space.dptr_ + workspace_size_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); } + private: inline void Init(mshadow::Stream *s, const std::vector &in_data, @@ -299,7 +296,7 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); - + dimA[0] = param_.batch_size_; dimA[1] = param_.input_size_; dimA[2] = 1; @@ -307,21 +304,19 @@ class CuDNNRNNOp : public Operator { dimA[1] = param_.input_size_; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; - strideA[2] = 1; + strideA[2] = 1; CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - dimA[0] = param_.batch_size_; + strideA), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; dimA[2] = 1; strideA[0] = dimA[2] * dimA[1]; @@ -332,21 +327,19 @@ class CuDNNRNNOp : public Operator { dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); } x_desc_vec_ = x_vec; y_desc_vec_ = y_vec; dx_desc_vec_ = dx_vec; dy_desc_vec_ = dy_vec; - // set the state tensors + // set the state tensors dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); dimA[1] = param_.batch_size_; dimA[2] = param_.state_size; @@ -367,64 +360,55 @@ class CuDNNRNNOp : public Operator { dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); // Create Dropout descriptors CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, - &dropout_byte_ - ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_), CUDNN_STATUS_SUCCESS); dropout_size_ = dropout_byte_ / sizeof(DType); CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, - param_.pkeep_, // keep probability + param_.pkeep_, // keep probability NULL, dropout_byte_, seed_), CUDNN_STATUS_SUCCESS); - // RNN descriptors + // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, param_.state_size, @@ -434,19 +418,17 @@ class CuDNNRNNOp : public Operator { direction_, mode_, dtype_), CUDNN_STATUS_SUCCESS); - // Get temp space sizes + // Get temp space sizes CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), - &workspace_byte_ - ), CUDNN_STATUS_SUCCESS); + &workspace_byte_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), - &reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); + &reserve_space_byte_), CUDNN_STATUS_SUCCESS); workspace_size_ = workspace_byte_ / sizeof(DType); reserve_space_size_ = reserve_space_byte_ / sizeof(DType); @@ -467,15 +449,12 @@ class CuDNNRNNOp : public Operator { dtype_, format_, 3, - dim_w - ), CUDNN_STATUS_SUCCESS); + dim_w), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, dtype_, format_, 3, - dim_w - ), CUDNN_STATUS_SUCCESS); - + dim_w), CUDNN_STATUS_SUCCESS); } } @@ -486,7 +465,7 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - unsigned long long seed_ = 4553; + unsigned long long seed_ = 1337ull; size_t workspace_byte_, reserve_space_byte_, dropout_byte_; int workspace_size_, reserve_space_size_, dropout_size_; @@ -496,7 +475,7 @@ class CuDNNRNNOp : public Operator { cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - cudnnFilterDescriptor_t w_desc_, dw_desc_; + cudnnFilterDescriptor_t w_desc_, dw_desc_; #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 
d036e299e519..2c7d20fe279c 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -30,13 +30,12 @@ namespace rnn_enum { // A utility function to calculate input size inline int rnn_single_param_size(int inputSize, int hiddenSize, - int mode){ + int mode) { int size = hiddenSize * (hiddenSize + inputSize + 2); // Different RNN's have different num weights - switch(mode) - { + switch (mode) { case rnn_enum::kRnnRelu: - size *= 1 ; + size *= 1; break; case rnn_enum::kRnnTanh: size *= 1; @@ -55,16 +54,16 @@ inline int rnn_param_size(int layerNum, int inputSize, int hiddenSize, bool bidirectional, - int mode){ + int mode) { // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(bidirectional){ + if (bidirectional) { size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); size *= 2; + } else { + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); } - else - size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; } @@ -75,7 +74,7 @@ struct RNNParam : public dmlc::Parameter { int mode; float p, pkeep_; int seq_length_, batch_size_, input_size_; - bool lstm_q_; // whether type is lstm + bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -93,14 +92,13 @@ struct RNNParam : public dmlc::Parameter { .add_enum("lstm", rnn_enum::kLstm) .add_enum("gru", rnn_enum::kGru) .describe("the type of RNN to compute"); - + DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) .describe("Fraction of the input that gets dropped out at training time"); DMLC_DECLARE_FIELD(state_outputs).set_default(false) .describe("Whether to have the states as symbol outputs."); - } }; @@ -117,7 +115,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO: add MShadow implementation + // TODO(sbodenstein): add MShadow implementation } virtual void Backward(const OpContext &ctx, @@ -129,7 +127,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO: add MShadow implementation + // TODO(sbodenstein): add MShadow implementation } private: @@ -153,14 +151,14 @@ class RNNProp : public OperatorProperty { std::vector ListOutputs() const override { if (param_.mode == rnn_enum::kLstm) return {"output", "state", "state_cell"}; - else + else return {"output", "state"}; } int NumOutputs() const override { if (param_.mode == rnn_enum::kLstm) return 3; - else + else return 2; } @@ -195,7 +193,7 @@ class RNNProp : public OperatorProperty { int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 
2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional + int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size)); @@ -223,7 +221,7 @@ class RNNProp : public OperatorProperty { out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if(param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 337410c8ddc1..3067c8e986c1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -19,8 +19,9 @@ Operator *CreateOp(RNNParam param, int dtype) { return op; } -Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { +Operator *RNNProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index fb90daf19b41..bf914026019d 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -21,7 +21,7 @@ Operator* CreateOp(RNNParam param, int dtype) { op = new CuDNNRNNOp(param); }) #else - LOG(FATAL) << "RNN is only available for cuDNN at the moment."; + LOG(FATAL) << "RNN is only available for cuDNN at the moment."; #endif // MXNET_USE_CUDNN && CUDNN_MAJOR return op; } From 8b3c6b9ade40bfa00f2a1929a3fdc87f75da0709 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 15:47:41 +0200 Subject: [PATCH 15/36] - correct handling of backward dependencies --- src/operator/cudnn_rnn-inl.h | 27 +++++++++++++++------------ src/operator/rnn-inl.h | 20 ++++++++++++++++---- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index f3bfc1eac1fe..3f63bc4de0f5 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -187,27 +187,30 @@ class CuDNNRNNOp : public Operator { Tensor dw = in_grad[rnn_enum::kParams].get(s); Tensor hx = in_data[rnn_enum::kState].get(s); Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor hy = in_data[rnn_enum::kStateOut].get(s); - Tensor dhy = out_grad[rnn_enum::kStateOut].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor dy = out_grad[rnn_enum::kOut].get(s); - DType * cx_ptr = NULL; - // DType * cy_ptr = NULL; - DType * dcx_ptr = NULL; - DType * dcy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm) { + // only need kStateOut grad when state_outputs is true + void * dhy_ptr = NULL; + if (param_.state_outputs) + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + + // Deal with lstm + void * dcx_ptr = NULL; + void * dcy_ptr = NULL; + void * cx_ptr = NULL; + + if(param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; } - + if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); CHECK_EQ(hx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(hy.CheckContiguous(), true); if (!init_cudnn_) { Init(s, in_data, out_data); @@ -227,7
+230,7 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_.data(), dy.dptr_, dhy_desc_, - dhy.dptr_, + dhy_ptr, dcy_desc_, dcy_ptr, w_desc_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 2c7d20fe279c..91284074b5d4 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -264,10 +264,22 @@ class RNNProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - if (param_.mode == rnn_enum::kLstm) - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; - else - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; + std::vector dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams], + in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]}; + + if (param_.state_outputs) { + dep.push_back(out_data[rnn_enum::kStateOut]); + dep.push_back(out_grad[rnn_enum::kStateOut]); + } + + if (param_.mode == rnn_enum::kLstm) { + dep.push_back(in_data[rnn_enum::kStateCell]); + if(param_.state_outputs) { + dep.push_back(out_data[rnn_enum::kStateCellOut]); + dep.push_back(out_grad[rnn_enum::kStateCellOut]); + } + } + return dep; } std::vector ForwardResource( From 82ac0417e37c34d8026f0dae6d49db21fb2991d4 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 15:55:56 +0200 Subject: [PATCH 16/36] - fix lint --- src/operator/cudnn_rnn-inl.h | 6 +++--- src/operator/rnn-inl.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 3f63bc4de0f5..d5deca2af2f8 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -197,16 +197,16 @@ class CuDNNRNNOp : public Operator { // Deal with lstm void * dcx_ptr = NULL; - void * dcy_ptr = NULL; + void * dcy_ptr = NULL; void * cx_ptr = NULL; - if(param_.mode == rnn_enum::kLstm) { + if (param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; } if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - + CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); CHECK_EQ(hx.CheckContiguous(), true); diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 91284074b5d4..ad4d21736345 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -274,7 +274,7 @@ class RNNProp : public OperatorProperty { if (param_.mode == rnn_enum::kLstm) { dep.push_back(in_data[rnn_enum::kStateCell]); - if(param_.state_outputs) { + if (param_.state_outputs) { dep.push_back(out_data[rnn_enum::kStateCellOut]); dep.push_back(out_grad[rnn_enum::kStateCellOut]); } From d1d7ce35278227c77590ae7797a0043aa99fee13 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 22:39:46 -0400 Subject: [PATCH 17/36] - first commit --- src/operator/cudnn_rnn-inl.h | 208 ++++++++++++++++ src/operator/rnn-inl.h | 471 +++++++++++++++++++++++++++++++++++ src/operator/rnn.cc | 41 +++ src/operator/rnn.cu | 33 +++ 4 files changed, 753 insertions(+) create mode 100644 src/operator/cudnn_rnn-inl.h create mode 100644 src/operator/rnn-inl.h create mode 100644 src/operator/rnn.cc create mode 100644 src/operator/rnn.cu diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h new file mode 100644 index 000000000000..37895c2b2488 --- /dev/null +++ b/src/operator/cudnn_rnn-inl.h @@ -0,0 +1,208 @@ 
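A minimal sketch (an editorial illustration, not part of the patch series) of the dependency logic that patches 15 and 16 above settle on: the RNN backward pass always needs the input, the parameters, the initial state, the output, and the output gradient, and only conditionally needs the extra state outputs. The string names below are hypothetical; the real RNNProp::DeclareBackwardDependency returns TBlob references keyed by the rnn_enum indices.

#include <string>
#include <vector>

// Mirrors the branching in RNNProp::DeclareBackwardDependency.
std::vector<std::string> RNNBackwardDeps(bool lstm, bool state_outputs) {
  std::vector<std::string> dep = {"data", "params", "state", "out", "grad_out"};
  if (state_outputs) {   // hy / dhy are needed only if states are symbol outputs
    dep.push_back("state_out");
    dep.push_back("grad_state_out");
  }
  if (lstm) {            // LSTM additionally carries a cell state
    dep.push_back("state_cell");
    if (state_outputs) {
      dep.push_back("state_cell_out");
      dep.push_back("grad_state_cell_out");
    }
  }
  return dep;  // e.g. RNNBackwardDeps(true, true).size() == 10
}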
+/*! + * Copyright (c) 2016 by Contributors + * \file cudnn_spatial_transformer-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ +#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ + +#include +#include +#include "./rnn-inl.h" +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +template +class CuDNNRNNOp : public Operator { + public: + explicit CuDNNRNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + rnn_mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + rnn_mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + rnn_mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + rnn_mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implmented"; + } + // RNN Direction + switch (param_.direction) { + case rnn_enum::kUnidirectional: + rnn_direction_ = CUDNN_UNIDIRECTIONAL; + break; + case rnn_enum::kBidirectional: + rnn_direction_ = CUDNN_BIDIRECTIONAL; + break; + default: + LOG(FATAL) << "Not implmented"; + } + } + // ~CuDNNRNNOp() { + // if (init_cudnn_) { + // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); + // } + // } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor out = out_data[st::kOut].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); + // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // if (!init_cudnn_) { + // Init(s, in_data, out_data); + // } + // CHECK_EQ(data.CheckContiguous(), true); + // CHECK_EQ(out.CheckContiguous(), true); + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, + // st_desc_, + // loc.dptr_, + // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + // } + // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // grid.dptr_, + // &beta, + // out_desc_, + // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + } + // + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // CHECK_EQ(out_grad.size(), 1); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor grad = out_grad[st::kOut].get(s); + // Tensor ddata = in_grad[st::kData].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); + // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = 
out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in + // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // typename DataType::ScaleType alpha_dgrid = 1.0f; + // typename DataType::ScaleType beta_dgrid = 0.0f; + // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // &beta, + // in_desc_/*reuse in_desc_*/, + // ddata.dptr_/*output*/, + // &alpha_dgrid, + // out_desc_/*reuse out_desc_*/, + // grad.dptr_, + // grid.dptr_, + // &beta_dgrid, + // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, + // st_desc_, + // grid.dptr_, + // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); + // } + } + // + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // if (!init_cudnn_) { + // init_cudnn_ = true; + // // Tensor data = in_data[st::kData].get(s); + // // Tensor out = out_data[st::kOut].get(s); + // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + // } + } + + bool init_cudnn_; + cudnnDataType_t dtype_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t rnn_mode_; + cudnnDirectionMode_t rnn_direction_; + cudnnRNNInputMode_t rnn_input_mode_; + cudnnDropoutDescriptor_t rnn_dropout_; + // cudnnTensorDescriptor_t in_desc_; + // cudnnTensorDescriptor_t out_desc_; + #if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format_; + #endif + RNNParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h new file mode 100644 index 000000000000..3a538f001d5b --- /dev/null +++ b/src/operator/rnn-inl.h @@ -0,0 +1,471 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file rnn-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_OPERATOR_RNN_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace rnn_enum { + enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNDirectionType {kUnidirectional, kBidirectional}; + enum RNNOpResource {kTempSpace}; +} + +// A utility function to calculate input size + +inline int rnn_single_param_size(int inputSize, + int hiddenSize, + int mode){ + int size = hiddenSize * (hiddenSize + inputSize + 2); + // Different RNN's have different num weights + switch(mode) + { + case rnn_enum::kRnnRelu: + size *= 1 ; + break; + case rnn_enum::kRnnTanh: + size *= 1; + break; + case rnn_enum::kLstm: + size *= 4; + break; + case rnn_enum::kGru: + size *= 3; + break; + } + return size; +} + +inline int rnn_param_size(int layerNum, + int inputSize, + int hiddenSize, + int direction, + int mode){ + // get size of first layer + int size = rnn_single_param_size(inputSize, hiddenSize, mode); + // get size of remaining layers + if(direction == rnn_enum::kUnidirectional) + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); + else // bidirectional case: input size increases by 2 + size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + return size; +} + +struct RNNParam : public dmlc::Parameter { + uint32_t state_size; + uint32_t num_layers; + uint64_t workspace; + bool batch_first; + int direction; + int mode; + + DMLC_DECLARE_PARAMETER(RNNParam) { + DMLC_DECLARE_FIELD(state_size) + .describe("size of the state for each layer"); + + DMLC_DECLARE_FIELD(num_layers) + .describe("number of stacked layers"); + + DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) + .describe("Tmp workspace for RNN (MB)"); + + DMLC_DECLARE_FIELD(direction) + .add_enum("unidirectional", rnn_enum::kUnidirectional) + .add_enum("bidirectional", rnn_enum::kBidirectional) + .describe("specifies the recurrence pattern"); + + DMLC_DECLARE_FIELD(mode) + .add_enum("rnn_relu", rnn_enum::kRnnRelu) + .add_enum("rnn_tanh", rnn_enum::kRnnTanh) + .add_enum("lstm", rnn_enum::kLstm) + .add_enum("gru", rnn_enum::kGru) + .describe("the type of RNN to compute"); + } +}; + +template +class RNNOp : public Operator { + public: + explicit RNNOp(RNNParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
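// [Editorial note, not part of the patch.] The assignment below converts the
// user-facing workspace limit, declared above in megabytes, into an element
// count: `x << 20` multiplies by 2^20 (bytes per MB), and dividing by
// sizeof(real_t) turns bytes into elements. Assuming 4-byte real_t and the
// default workspace = 512, this gives (512 << 20) / 4 = 134217728 elements.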
+ param_.workspace = (param_.workspace << 20) / sizeof(real_t); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; +// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); + +// CHECK_EQ(in_data.size(), expected); +// CHECK_EQ(out_data.size(), 1); +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor out = out_data[rnn_enum::kOut].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(out.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(out.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), +// param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, +// gstride * (gid + 1)); +// tmpc = dot(wmat[gid].T(), temp_dst[gid]); +// } +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// out.Slice(i, i + step) = pack_col2patch(temp_col, +// out.Slice(i, i + step).shape_, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1); // RNN only support dilate equals 1 +// } else { +// Shape<4> pshape = out.Slice(i, i + step).shape_; +// pshape[2] += 2 * param_.pad[0]; +// pshape[3] += 2 * param_.pad[1]; +// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, +// pshape, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1), // RNN only support dilate equals 1 +// out[i][0].shape_); +// } +// } +// if (!param_.no_bias) { +// // add bias, broadcast bias to dim 1: channel +// Tensor bias = in_data[rnn_enum::kBias].get(s); +// out += broadcast<1>(bias, out.shape_); +// } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful +// CHECK_EQ(out_grad.size(), 1); +// size_t expected = param_.no_bias == 0 ? 
3 : 2; +// CHECK(in_data.size() == expected && in_grad.size() == expected); +// CHECK_EQ(req.size(), expected); +// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); +// // get data +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor grad = out_grad[rnn_enum::kOut].get(s); +// Tensor gdata = in_grad[rnn_enum::kData].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// Tensor gwmat = +// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(grad.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// if (i == 0) { +// Tensor tmp_gwmat = gwmat[gid]; +// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); +// } else { +// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); +// } +// } +// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// temp_dst[gid] = dot(wmat[gid], tmpc); +// } +// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, +// mshadow::Shape4(gdata.shape_[1], +// step, +// gdata.size(2), +// gdata.size(3)))); +// } +// } +// if (!param_.no_bias) { +// Tensor gbias = in_grad[rnn_enum::kBias].get(s); +// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); +// } + } + + private: +// inline index_t InitTemp(const mshadow::Shape<4> &ishape, +// const mshadow::Shape<4> &oshape) { +// const int ksize_y = param_.kernel[0]; +// const int ksize_x = param_.kernel[1]; +// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, +// oshape[2] * oshape[3]); +// shape_dstunit_ = mshadow::Shape3(param_.num_group, +// oshape[1] / param_.num_group, +// oshape[2] * oshape[3]); +// // See convolution for workspace calculations +// nstep_ = std::max( +// std::min( +// static_cast( +// param_.workspace / (shape_colunit_.Size() + 
shape_dstunit_.Size())), +// ishape[0]), +// 1U); + +// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], +// shape_colunit_[1] * nstep_); +// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * nstep_); +// index_t required_size = scol.Size() + sdst.Size(); +// CHECK_GE(param_.workspace, required_size) +// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" +// << "Given: " << param_.workspace * sizeof(DType); +// return required_size; +// } + + private: + RNNParam param_; +}; // class RNNOp + + + + +template +Operator* CreateOp(RNNParam param, int dtype); + +#if DMLC_USE_CXX11 +class RNNProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "weight", "state", "cell_state"}; + } else { + return {"data", "weight", "state"}; + } + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3) \ + << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; + // Infer hidden state + cell state + int batchSize = dshape[0]; + int inputSize = dshape[2]; + int numDirections = 1; + if(param_.direction == rnn_enum::kBidirectional){ + numDirections = 2; + } + int total_layers = numDirections * param_.num_layers; // double for bidirectional + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + if (param_.mode == rnn_enum::kLstm){ + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kCellStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + } + // infer weight size + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, + param_.mode); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + // infer output size + TShape oshape = dshape; + oshape[3] = numDirections * param_.state_size; + // infer output state size + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batchSize; + outStateShape[2] = param_.state_size; + + out_shape->clear(); + out_shape->push_back(oshape); + out_shape->push_back(outStateShape); + if (param_.mode == rnn_enum::kLstm) + out_shape->push_back(outStateShape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new RNNProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "RNN"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.mode == rnn_enum::kLstm) + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + else + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + RNNParam param_; +}; // class RNNProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc new file mode 100644 index 000000000000..40f7f705718d --- /dev/null +++ b/src/operator/rnn.cc @@ -0,0 +1,41 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file rnn.cc + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(RNNParam param, int dtype) { + LOG(FATAL) << "RNN is only available for gpu at the moment."; + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new RNNOp(param); + }); + return op; +} + +Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(RNNParam); + +MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) +.describe("Apply a recurrent layer to input.") +.add_argument("data", "Symbol", "Input data to RNN") +.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") +.add_arguments(RNNParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu new file mode 100644 index 000000000000..2cb482f591b2 --- /dev/null +++ b/src/operator/rnn.cu @@ -0,0 +1,33 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file rnn.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" +#include +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +#include "./cudnn_rnn-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(RNNParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNRNNOp(param); + }) +#else + 1; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new SpatialTransformerOp(param); + }) +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + return op; +} + +} // namespace op +} // namespace mxnet From fde1cb30e85f6841b68a9d5ecc1bd27278d73d78 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 23:17:47 -0400 Subject: [PATCH 18/36] - removed unnecessary commented-out code - fixed error in output shape inference --- src/operator/rnn-inl.h | 207 +++-------------------------------------- 1 file changed, 12 insertions(+), 195 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 3a538f001d5b..37150bf58878 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -29,7 +29,6 @@ namespace rnn_enum { } // A utility function to calculate input size - inline int rnn_single_param_size(int inputSize, int hiddenSize, int mode){ @@ -116,86 +115,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; -// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); - -// CHECK_EQ(in_data.size(), expected); -// CHECK_EQ(out_data.size(), 1); -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor out = out_data[rnn_enum::kOut].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(out.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(out.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), -// param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, -// gstride * (gid + 1)); -// tmpc =
dot(wmat[gid].T(), temp_dst[gid]); -// } -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// out.Slice(i, i + step) = pack_col2patch(temp_col, -// out.Slice(i, i + step).shape_, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1); // RNN only support dilate equals 1 -// } else { -// Shape<4> pshape = out.Slice(i, i + step).shape_; -// pshape[2] += 2 * param_.pad[0]; -// pshape[3] += 2 * param_.pad[1]; -// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, -// pshape, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1), // RNN only support dilate equals 1 -// out[i][0].shape_); -// } -// } -// if (!param_.no_bias) { -// // add bias, broadcast bias to dim 1: channel -// Tensor bias = in_data[rnn_enum::kBias].get(s); -// out += broadcast<1>(bias, out.shape_); -// } + // TODO: add MShadow implementation } virtual void Backward(const OpContext &ctx, @@ -207,125 +127,13 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful -// CHECK_EQ(out_grad.size(), 1); -// size_t expected = param_.no_bias == 0 ? 3 : 2; -// CHECK(in_data.size() == expected && in_grad.size() == expected); -// CHECK_EQ(req.size(), expected); -// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); -// // get data -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor grad = out_grad[rnn_enum::kOut].get(s); -// Tensor gdata = in_grad[rnn_enum::kData].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// Tensor gwmat = -// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(grad.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// if (i == 0) { -// Tensor tmp_gwmat = gwmat[gid]; -// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); -// } else { -// gwmat[gid] += dot(temp_dst[gid], 
tmpc.T()); -// } -// } -// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// temp_dst[gid] = dot(wmat[gid], tmpc); -// } -// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, -// mshadow::Shape4(gdata.shape_[1], -// step, -// gdata.size(2), -// gdata.size(3)))); -// } -// } -// if (!param_.no_bias) { -// Tensor gbias = in_grad[rnn_enum::kBias].get(s); -// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); -// } + // TODO: add MShadow implementation } - private: -// inline index_t InitTemp(const mshadow::Shape<4> &ishape, -// const mshadow::Shape<4> &oshape) { -// const int ksize_y = param_.kernel[0]; -// const int ksize_x = param_.kernel[1]; -// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, -// oshape[2] * oshape[3]); -// shape_dstunit_ = mshadow::Shape3(param_.num_group, -// oshape[1] / param_.num_group, -// oshape[2] * oshape[3]); -// // See convolution for workspace calculations -// nstep_ = std::max( -// std::min( -// static_cast( -// param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), -// ishape[0]), -// 1U); - -// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], -// shape_colunit_[1] * nstep_); -// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * nstep_); -// index_t required_size = scol.Size() + sdst.Size(); -// CHECK_GE(param_.workspace, required_size) -// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" -// << "Given: " << param_.workspace * sizeof(DType); -// return required_size; -// } - private: RNNParam param_; }; // class RNNOp - - - template Operator* CreateOp(RNNParam param, int dtype); @@ -340,6 +148,14 @@ class RNNProp : public OperatorProperty { } } + std::vector ListOutputs() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"output", "final_state", "final_state_cell"}; + } else { + return {"output", "final_state"}; + } + } + void Init(const std::vector >& kwargs) override { param_.Init(kwargs); } @@ -386,7 +202,7 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[3] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; @@ -396,6 +212,7 @@ class RNNProp : public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; From 7a8a11b53fb0e143e24fa9f15f345f99140a548e Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 10 Jul 2016 00:55:39 -0400 Subject: [PATCH 19/36] - some renaming - added cudnn destructors --- src/operator/cudnn_rnn-inl.h | 163 +++++++++++++++++++++++------------ src/operator/rnn-inl.h | 34 ++++---- src/operator/rnn.cc | 6 +- src/operator/rnn.cu | 5 +- 4 files changed, 129 insertions(+), 79 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 37895c2b2488..61d6d2c2f23a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -23,16 +23,16 @@ class CuDNNRNNOp : public Operator { // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: - rnn_mode_ = 
CUDNN_RNN_RELU; + mode_ = CUDNN_RNN_RELU; break; case rnn_enum::kRnnTanh: - rnn_mode_ = CUDNN_RNN_TANH; + mode_ = CUDNN_RNN_TANH; break; case rnn_enum::kLstm: - rnn_mode_ = CUDNN_LSTM; + mode_ = CUDNN_LSTM; break; case rnn_enum::kGru: - rnn_mode_ = CUDNN_GRU; + mode_ = CUDNN_GRU; break; default: LOG(FATAL) << "Not implmented"; @@ -40,22 +40,31 @@ class CuDNNRNNOp : public Operator { // RNN Direction switch (param_.direction) { case rnn_enum::kUnidirectional: - rnn_direction_ = CUDNN_UNIDIRECTIONAL; + direction_ = CUDNN_UNIDIRECTIONAL; break; case rnn_enum::kBidirectional: - rnn_direction_ = CUDNN_BIDIRECTIONAL; + direction_ = CUDNN_BIDIRECTIONAL; break; default: LOG(FATAL) << "Not implmented"; } } - // ~CuDNNRNNOp() { - // if (init_cudnn_) { - // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); - // } - // } + + ~CuDNNRNNOp() { + if (init_cudnn_) { + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + } + } + } virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -150,52 +159,96 @@ class CuDNNRNNOp : public Operator { const std::vector &in_data, const std::vector &out_data) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // if (!init_cudnn_) { - // init_cudnn_ = true; - // // Tensor data = in_data[st::kData].get(s); - // // Tensor out = out_data[st::kOut].get(s); - // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + #if CUDNN_MAJOR == 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(in_data.size(), 4); + CHECK_EQ(out_data.size(), 3); + } + else{ + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + } + + if (!init_cudnn_) { + init_cudnn_ = true; + + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), 
CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } - // } + // Create tensors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + } + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + } } - - bool init_cudnn_; + cudnnDataType_t dtype_; + bool init_cudnn_; cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t rnn_mode_; - cudnnDirectionMode_t rnn_direction_; - cudnnRNNInputMode_t rnn_input_mode_; - cudnnDropoutDescriptor_t rnn_dropout_; - // cudnnTensorDescriptor_t in_desc_; - // cudnnTensorDescriptor_t out_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + + cudnnTensorDescriptor_t x_desc_; + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t y_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnFilterDescriptor_t w_desc_; + #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; #endif diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 37150bf58878..2729a2ff49cc 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,16 +21,16 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; - enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } // A utility function to calculate input size inline int rnn_single_param_size(int inputSize, - int hiddenSize, + int hiddenSize, int mode){ int size = hiddenSize * 
(hiddenSize + inputSize + 2); // Different RNN's have different num weights @@ -52,10 +52,10 @@ inline int rnn_single_param_size(int inputSize, return size; } -inline int rnn_param_size(int layerNum, +inline int rnn_param_size(int layerNum, int inputSize, - int hiddenSize, - int direction, + int hiddenSize, + int direction, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); @@ -194,26 +194,26 @@ class RNNProp : public OperatorProperty { Shape3(total_layers, batchSize, param_.state_size)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, - param_.direction, + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; oshape[2] = numDirections * param_.state_size; - // infer output state size + // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batchSize; outStateShape[2] = param_.state_size; - out_shape->clear(); + out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -236,7 +236,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; } @@ -256,9 +256,9 @@ class RNNProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { if (param_.mode == rnn_enum::kLstm) - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; else - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; } std::vector ForwardResource( diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 40f7f705718d..2a485e5ef224 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -33,9 +33,9 @@ DMLC_REGISTER_PARAMETER(RNNParam); MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") -.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") -.add_arguments(RNNParam::__FIELDS__()); +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 2cb482f591b2..fb90daf19b41 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -21,10 +21,7 @@ Operator* CreateOp(RNNParam param, int dtype) { op = new CuDNNRNNOp(param); }) #else - 1; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new SpatialTransformerOp(param); - }) + LOG(FATAL) << "RNN is only 
available for cuDNN at the moment."; #endif // MXNET_USE_CUDNN && CUDNN_MAJOR return op; } From 8979b01ba3c845e58b7c3dde6e759d6b02da8e01 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 17 Jul 2016 16:01:48 -0400 Subject: [PATCH 20/36] - added dropout --- src/operator/cudnn_rnn-inl.h | 166 +++++++++++++++++++++++++++-------- src/operator/rnn-inl.h | 5 ++ 2 files changed, 135 insertions(+), 36 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 61d6d2c2f23a..90bf5cbc9bc7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -20,6 +20,8 @@ class CuDNNRNNOp : public Operator { this->param_ = param; init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -72,9 +74,48 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + // get input + output tensors + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + // if (param_.mode == rnn_enum::kLstm){ + // CHECK_EQ(in_data.size(), 4); + // CHECK_EQ(out_data.size(), 3); + // } + // else{ + // CHECK_EQ(in_data.size(), 3); + // CHECK_EQ(out_data.size(), 2); + // } + // // Get tensors + // + // Tensor data = in_data[rnn_enum::kData].get(s); + // Tensor params = in_data[rnn_enum::kParams].get(s); + // Tensor state = in_data[rnn_enum::kStateIn].get(s); + + // Tensor out = out_data[rnn_enum::kOut].get(s); + // Tensor out_state = out_data[rnn_enum::kOut].get(s); + + // if (param_.mode == rnn_enum::kLstm){ + // Tensor cell_state = + // in_data[rnn_enum::kCellStateIn].get(s); + // Tensor out_cell_state = + // in_data[rnn_enum::kCellStateOut].get(s); + // } + // // Tensor data = in_data[st::kData].get(s); // Tensor out = out_data[st::kOut].get(s); // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); @@ -162,8 +203,7 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ CHECK_EQ(in_data.size(), 4); CHECK_EQ(out_data.size(), 3); } @@ -171,64 +211,118 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(in_data.size(), 3); CHECK_EQ(out_data.size(), 2); } - if (!init_cudnn_) { init_cudnn_ = true; - + // get input + output tensors Tensor data = in_data[rnn_enum::kData].get(s); Tensor params = in_data[rnn_enum::kParams].get(s); Tensor state = in_data[rnn_enum::kStateIn].get(s); Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ Tensor cell_state = in_data[rnn_enum::kCellStateIn].get(s); Tensor out_cell_state = in_data[rnn_enum::kCellStateOut].get(s); } + // Create descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), 
CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // Create tensors CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); } + // set dropout + // cudnnSetDropoutDescriptor(dropout_desc_, + // s->dnn_handle_, + // param_.p, + // void * states, + // size_t stateSizeInBytes, + // unsigned long long seed) + // set RNN + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Set params + int dim_params[3] = {params.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_params + ), CUDNN_STATUS_SUCCESS); + // Get strides + int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; + int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; + int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; + int stride_out_state[3] = + {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; + + // cuDNN needs int arrays for dim, not index_t array used in Shape + int dim_data[3]; + int dim_state[3]; + int dim_out[3]; + int dim_out_state[3]; + std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); + std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); + std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); + std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } + // set the tensor descriptors + CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + dtype_, + 3, + dim_data, + stride_data + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + dtype_, + 3, + dim_out, + stride_out + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + // LSTM has two extra descriptors + if 
(param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + } } } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 2729a2ff49cc..d81ed1637756 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -74,6 +74,7 @@ struct RNNParam : public dmlc::Parameter { bool batch_first; int direction; int mode; + float p; DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -96,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { .add_enum("lstm", rnn_enum::kLstm) .add_enum("gru", rnn_enum::kGru) .describe("the type of RNN to compute"); + + DMLC_DECLARE_FIELD(p).set_default(0.) + .set_range(0, 1) + .describe("Fraction of the input that gets dropped out at training time"); } }; From 7861b3de9b8091f7d9243c6c34b41416c39bd069 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 00:28:48 -0400 Subject: [PATCH 21/36] - major refactor - completed forward evaluation --- src/operator/cudnn_rnn-inl.h | 481 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 39 ++- 2 files changed, 277 insertions(+), 243 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 90bf5cbc9bc7..134044321ad7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2016 by Contributors - * \file cudnn_spatial_transformer-inl.h + * \file cudnn_rnn-inl.h * \brief * \author Sebastian Bodenstein */ @@ -21,7 +21,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -40,31 +40,29 @@ class CuDNNRNNOp : public Operator { LOG(FATAL) << "Not implmented"; } // RNN Direction - switch (param_.direction) { - case rnn_enum::kUnidirectional: - direction_ = CUDNN_UNIDIRECTIONAL; - break; - case rnn_enum::kBidirectional: - direction_ = CUDNN_BIDIRECTIONAL; - break; - default: - LOG(FATAL) << "Not implmented"; - } + direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } ~CuDNNRNNOp() { if (init_cudnn_) { - CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + for(int i = 0; i < x_desc_vec_.size(); ++i){ + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + } CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); - } } } @@ -74,77 +72,83 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
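// The expected tensor counts mirror the rnn_enum lists: inputs are
// {data, parameters, state} plus the cell state for LSTM, outputs are
// {output, state} plus the cell state for LSTM. As a quick sketch:
//   LSTM (lstm_q_ = true): in_expected = 4, out_expected = 3
//   RNN relu/tanh, GRU:    in_expected = 3, out_expected = 2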
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); Stream *s = ctx.get_stream(); - if(!init_cudnn_){ - Init(s, in_data, out_data); - } // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kStateIn].get(s); - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor hy = out_data[rnn_enum::kStateOut].get(s); + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } - // if (param_.mode == rnn_enum::kLstm){ - // CHECK_EQ(in_data.size(), 4); - // CHECK_EQ(out_data.size(), 3); - // } - // else{ - // CHECK_EQ(in_data.size(), 3); - // CHECK_EQ(out_data.size(), 2); - // } - // // Get tensors - // - // Tensor data = in_data[rnn_enum::kData].get(s); - // Tensor params = in_data[rnn_enum::kParams].get(s); - // Tensor state = in_data[rnn_enum::kStateIn].get(s); - // Tensor out = out_data[rnn_enum::kOut].get(s); - // Tensor out_state = out_data[rnn_enum::kOut].get(s); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } - // if (param_.mode == rnn_enum::kLstm){ - // Tensor cell_state = - // in_data[rnn_enum::kCellStateIn].get(s); - // Tensor out_cell_state = - // in_data[rnn_enum::kCellStateOut].get(s); - // } - // - // Tensor data = in_data[st::kData].get(s); - // Tensor out = out_data[st::kOut].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); - // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // if (!init_cudnn_) { - // Init(s, in_data, out_data); - // } - // CHECK_EQ(data.CheckContiguous(), true); - // CHECK_EQ(out.CheckContiguous(), true); - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, - // st_desc_, - // loc.dptr_, - // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); - // } - // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // grid.dptr_, - // &beta, - // out_desc_, - // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + if (ctx.is_train) { + // training mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_ + reserve_space_size_), s); + CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + } else { + // inference mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_), 
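// Workspace is scratch memory either pass may overwrite freely; the reserve
// space must additionally survive from the training forward pass to the
// matching backward pass. That is why the training branch above requests
// workspace_size_ + reserve_space_size_ elements and passes cuDNN the two
// regions as temp_space.dptr_ and temp_space.dptr_ + workspace_size_, while
// inference, having no backward pass, only needs the workspace requested here.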
s); + CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + } } // virtual void Backward(const OpContext &ctx, @@ -155,46 +159,12 @@ class CuDNNRNNOp : public Operator { const std::vector &in_grad, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // CHECK_EQ(out_grad.size(), 1); - // Stream *s = ctx.get_stream(); - // Tensor data = in_data[st::kData].get(s); - // Tensor grad = out_grad[st::kOut].get(s); - // Tensor ddata = in_grad[st::kData].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); - // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in - // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // typename DataType::ScaleType alpha_dgrid = 1.0f; - // typename DataType::ScaleType beta_dgrid = 0.0f; - // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // &beta, - // in_desc_/*reuse in_desc_*/, - // ddata.dptr_/*output*/, - // &alpha_dgrid, - // out_desc_/*reuse out_desc_*/, - // grad.dptr_, - // grid.dptr_, - // &beta_dgrid, - // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, - // st_desc_, - // grid.dptr_, - // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); - // } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data.size(), out_expected); } - // private: inline void Init(mshadow::Stream *s, const std::vector &in_data, @@ -203,126 +173,193 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - if(param_.mode == rnn_enum::kLstm){ - CHECK_EQ(in_data.size(), 4); - CHECK_EQ(out_data.size(), 3); - } - else{ - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 2); - } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = x.shape_[0]; + dimA[1] = x.shape_[2]; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = x.shape_[0]; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - if(param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; - // Create descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 
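// The loop above builds one 3-D descriptor per time step with fully packed
// strides. As a sketch for the input descriptors, with hypothetical sizes
// batch = 32 and input dimension = 128:
//   dimA    = {32, 128, 1};   // {batch, input size, trailing 1}
//   strideA = {128, 1, 1};    // strideA[k] = product of the dims after k
// (the y/dy descriptors are identical except that dimA[1] becomes
// state_size, doubled when bidirectional). The state tensors set up next
// use {total layers, batch, state_size} instead: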
2 : 1); + dimA[1] = x.shape_[0]; //minibatch + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcy_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); - } - // set dropout - // cudnnSetDropoutDescriptor(dropout_desc_, - // s->dnn_handle_, - // param_.p, - // void * states, - // size_t stateSizeInBytes, - // unsigned long long seed) - // set RNN - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); - // Set params - int dim_params[3] = {params.shape_[0], 1, 1}; - CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, - format_, 3, - dim_params + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // Get strides - int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; - int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; - int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; - int stride_out_state[3] = - {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; - - // cuDNN needs int arrays for dim, not index_t array used in Shape - int dim_data[3]; - int dim_state[3]; - int dim_out[3]; - int dim_out_state[3]; - std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); - std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); - std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); - std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - - // set the tensor descriptors - CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, - dim_data, - stride_data + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, - dim_state, - stride_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, - dim_out, - stride_out + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, - dim_out_state, - stride_out_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // LSTM has two extra descriptors - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dim_state, 
- stride_state - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dim_out_state, - stride_out_state + CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + + // Get temp space sizes + CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + workspace_size_ = workspace_byte_ / sizeof(DType) + 1; + reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1; + + // Set param descriptors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); + int dim_w[3] = {w.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_ ), CUDNN_STATUS_SUCCESS); - } + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.pkeep_, // keep probability + NULL, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + } } @@ -333,15 +370,17 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; + unsigned long long seed_ = 4553; + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, reserve_space_size_, dropout_size_; - cudnnTensorDescriptor_t x_desc_; - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t y_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t w_desc_, dw_desc_; #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; @@ -352,4 +391,4 @@ class CuDNNRNNOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ +#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index d81ed1637756..53189d100ef2 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -24,7 +24,6 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, 
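// Each of these modes selects a cuDNN cell type in the operator's
// constructor; the mode-dependent gate multiplier behind
// rnn_single_param_size is, as a sketch:
//   kRnnRelu, kRnnTanh -> CUDNN_RNN_RELU / CUDNN_RNN_TANH (1 gate)
//   kLstm              -> CUDNN_LSTM                      (4 gates)
//   kGru               -> CUDNN_GRU                       (3 gates)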
kGru}; - enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } @@ -55,26 +54,27 @@ inline int rnn_single_param_size(int inputSize, inline int rnn_param_size(int layerNum, int inputSize, int hiddenSize, - int direction, + bool bidirectional, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(direction == rnn_enum::kUnidirectional) - size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); - else // bidirectional case: input size increases by 2 + if(bidirectional) size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + else + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; } struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - uint64_t workspace; bool batch_first; - int direction; + bool bidirectional; int mode; - float p; + float p, pkeep_; + int seq_length_; + bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -83,13 +83,8 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); - DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) - .describe("Tmp workspace for RNN (MB)"); - - DMLC_DECLARE_FIELD(direction) - .add_enum("unidirectional", rnn_enum::kUnidirectional) - .add_enum("bidirectional", rnn_enum::kBidirectional) - .describe("specifies the recurrence pattern"); + DMLC_DECLARE_FIELD(bidirectional).set_default(false) + .describe("whether to use bidirectional recurrent layers"); DMLC_DECLARE_FIELD(mode) .add_enum("rnn_relu", rnn_enum::kRnnRelu) @@ -108,9 +103,12 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - this->param_ = p; // convert MBytes first to Bytes and then to elements. - param_.workspace = (param_.workspace << 20) / sizeof(real_t); + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, @@ -185,10 +183,7 @@ class RNNProp : public OperatorProperty { // Infer hidden state + cell state int batchSize = dshape[0]; int inputSize = dshape[2]; - int numDirections = 1; - if(param_.direction == rnn_enum::kBidirectional){ - numDirections = 2; - } + int numDirections = param_.bidirectional ? 
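// An illustrative example of the shapes this infers (hypothetical values:
// sequence length 10, batch 32, input size 128, state_size 256,
// num_layers 2, bidirectional = true, hence numDirections = 2):
//   data:     (10, 32, 128)
//   state in: (4, 32, 256)    // total_layers x batch x state_size
//   output:   (10, 32, 512)   // state_size * numDirections per step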
2 : 1; int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateIn, Shape3(total_layers, batchSize, param_.state_size)); @@ -202,7 +197,7 @@ class RNNProp : public OperatorProperty { int weight_size = rnn_param_size(param_.num_layers, inputSize, param_.state_size, - param_.direction, + param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size From c1382b35eb12b37d518da8750142ee07bab2cc45 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 02:25:19 -0400 Subject: [PATCH 22/36] - added parameter size test - fixed bug where cudnnGetRNNParamsSize needs to be called after cudnnSetRNNDescriptor --- src/operator/cudnn_rnn-inl.h | 64 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 134044321ad7..3a40b2f67fd7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -150,7 +150,7 @@ class CuDNNRNNOp : public Operator { ), CUDNN_STATUS_SUCCESS); } } - // + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, @@ -182,6 +182,9 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); + + param_.seq_length_ = x.shape_[1]; + // Tensor Descriptors std::vector x_vec(param_.seq_length_); std::vector y_vec(param_.seq_length_); @@ -305,7 +308,29 @@ class CuDNNRNNOp : public Operator { strideA ), CUDNN_STATUS_SUCCESS); - // Get temp space + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_ + ), CUDNN_STATUS_SUCCESS); + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.pkeep_, // keep probability + NULL, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Get temp space sizes CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), &workspace_byte_ ), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), &reserve_space_byte_ ), CUDNN_STATUS_SUCCESS); - workspace_size_ = workspace_byte_ / sizeof(DType) + 1; - reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1; + workspace_size_ = workspace_byte_ / sizeof(DType); + reserve_space_size_ = reserve_space_byte_ / sizeof(DType); + + // check that the number of params is correct + size_t cudnn_param_size; + CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_, + rnn_desc_, + x_desc_vec_[0], + &cudnn_param_size, + dtype_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); // Set param descriptors CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); int dim_w[3] = {w.shape_[0], 1, 1}; CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, dtype_, format_, 3, dim_w ), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, dtype_, format_, 3, dim_w ), CUDNN_STATUS_SUCCESS); - // Create Dropout descriptors - CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, - &dropout_byte_ - ), CUDNN_STATUS_SUCCESS); - dropout_size_ = dropout_byte_ / sizeof(DType); - 
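// The reorder above is the point of this patch: cudnnGetRNNWorkspaceSize,
// cudnnGetRNNTrainingReserveSize and cudnnGetRNNParamsSize all consult
// rnn_desc_, so the dropout and RNN descriptors must be fully configured
// before any size query runs. A sketch of the required order:
//   1. cudnnCreateDropoutDescriptor + cudnnSetDropoutDescriptor
//   2. cudnnCreateRNNDescriptor     + cudnnSetRNNDescriptor
//   3. workspace / reserve-space / params-size queries
//   4. filter descriptors (w_desc_, dw_desc_) sized from the checked
//      parameter vector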
CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, - s->dnn_handle_, - param_.pkeep_, // keep probability - NULL, - dropout_byte_, - seed_), CUDNN_STATUS_SUCCESS); - // RNN descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); } } From f87c003bbd27f4dabf5c87adba0b86a604269562 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 10:32:55 -0400 Subject: [PATCH 23/36] - checks for contiguous input tensors - more consistent param names - removed 'batch_first' option for now. Might add it later again --- src/operator/cudnn_rnn-inl.h | 119 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 37 ++++++----- 2 files changed, 82 insertions(+), 74 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 3a40b2f67fd7..8c6eae9dc984 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -92,15 +92,24 @@ class CuDNNRNNOp : public Operator { cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + if(!init_cudnn_){ Init(s, in_data, out_data); } + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + if (ctx.is_train) { - // training mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_ + reserve_space_size_), s); CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -125,9 +134,6 @@ class CuDNNRNNOp : public Operator { ), CUDNN_STATUS_SUCCESS); } else { // inference mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_), s); CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -182,8 +188,9 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - - param_.seq_length_ = x.shape_[1]; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; // Tensor Descriptors std::vector x_vec(param_.seq_length_); @@ -193,49 +200,51 @@ class CuDNNRNNOp : public Operator { int dimA[3]; int strideA[3]; for (int i = 0; i < param_.seq_length_; i++) { - CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); - - dimA[0] = x.shape_[0]; - dimA[1] = x.shape_[2]; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + 
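// cuDNN reads these buffers as flat, fully packed memory: the strides set
// below are strideA = {dimA[1] * dimA[2], dimA[2], 1}, i.e. no padding
// between elements. That is also why Forward now guards every tensor with
// CheckContiguous() before handing its dptr_ to cuDNN.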
dimA[2] = 1; + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - dimA[0] = x.shape_[0]; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } x_desc_vec_ = x_vec; y_desc_vec_ = y_vec; @@ -243,9 +252,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = x.shape_[0]; //minibatch - dimA[2] = param_.state_size; + dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size_; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -323,8 +332,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, + param_.state_size_, + param_.num_layers_, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 53189d100ef2..a4252b7e8fe5 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,20 +67,19 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size; - uint32_t num_layers; - bool batch_first; + uint32_t state_size_; + uint32_t num_layers_; bool bidirectional; int mode; float p, pkeep_; - int seq_length_; + int seq_length_, batch_size_, input_size_; bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size) + DMLC_DECLARE_FIELD(state_size_) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers) + DMLC_DECLARE_FIELD(num_layers_) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -179,35 +178,35 @@ class RNNProp : public OperatorProperty { const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; - // Infer hidden state + cell state - int batchSize = dshape[0]; - int inputSize = dshape[2]; + << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; + // Get input sizes + int batch_size = 
dshape[1]; + int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional + int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kCellStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, + int weight_size = rnn_param_size(param_.num_layers_, + input_size, + param_.state_size_, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size_; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; - outStateShape[1] = batchSize; - outStateShape[2] = param_.state_size; + outStateShape[1] = batch_size; + outStateShape[2] = param_.state_size_; out_shape->clear(); out_shape->push_back(oshape); From 8b84ef0afd2ac99c5a879d146848a4f17f48ec62 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Tue, 19 Jul 2016 23:06:39 +0200 Subject: [PATCH 24/36] - fixed input names --- src/operator/rnn-inl.h | 41 +++++++++++++++++++++++++---------------- src/operator/rnn.cc | 4 ++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index a4252b7e8fe5..98f8a5953d70 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,18 +144,28 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "weight", "state", "cell_state"}; + return {"data", "parameters", "state", "cell_state"}; } else { - return {"data", "weight", "state"}; + return {"data", "parameters", "state"}; } } std::vector ListOutputs() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"output", "final_state", "final_state_cell"}; - } else { - return {"output", "final_state"}; - } + if (param_.mode == rnn_enum::kLstm) + return {"output", "state", "state_cell"}; + else + return {"output", "state"}; + } + + int NumOutputs() const override { + if (param_.mode == rnn_enum::kLstm) + return 3; + else + return 2; + } + + int NumVisibleOutputs() const override { + return 1; } void Init(const std::vector >& kwargs) override { @@ -171,15 +181,15 @@ class RNNProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + CHECK_EQ(in_shape->size(), 4) << "Input:[data, parameters, state, cell_state]"; } else { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, parameters, state]"; } const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; - // Get input sizes + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // 
data: [sequence len, batch, input dimension] int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; @@ -192,17 +202,16 @@ class RNNProp : public OperatorProperty { rnn_enum::kCellStateIn, Shape3(total_layers, batch_size, param_.state_size_)); } - // infer weight size - int weight_size = rnn_param_size(param_.num_layers_, + // calculate parameter vector length + int param_size = rnn_param_size(param_.num_layers_, input_size, param_.state_size_, param_.bidirectional, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); - // infer output size + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + // output: [sequence len, batch, output size] TShape oshape = dshape; oshape[2] = numDirections * param_.state_size_; - // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 2a485e5ef224..5e3b2b8894af 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -20,7 +20,7 @@ Operator *CreateOp(RNNParam param, int dtype) { } Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); @@ -34,7 +34,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") -.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("state", "Symbol", "initial hidden state of the RNN") .add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op From d50f2dc528da00d6707ca39316132b0704c97eb1 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 12:50:59 +0200 Subject: [PATCH 25/36] - added backward method --- src/operator/cudnn_rnn-inl.h | 95 ++++++++++++++++++++++++++++++++++-- src/operator/rnn-inl.h | 8 +-- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 8c6eae9dc984..6a642f6428f8 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -80,7 +80,7 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kStateIn].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor hy = out_data[rnn_enum::kStateOut].get(s); @@ -88,8 +88,8 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; - cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; } CHECK_EQ(x.CheckContiguous(), true); @@ -169,7 +169,94 @@ class CuDNNRNNOp : public Operator { size_t out_expected = param_.lstm_q_ ? 
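// A sketch of the backward sequence implemented below (cuDNN v5):
// cudnnRNNBackwardData first propagates dy/dhy/dcy back to dx/dhx/dcx using
// the reserve space written by cudnnRNNForwardTraining, then
// cudnnRNNBackwardWeights accumulates (adds) the weight gradient into dw
// from that same reserve space. Both calls therefore only make sense after
// a training-mode forward pass over the identical sequence.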
3 : 2; CHECK_EQ(in_data.size(), in_expected); CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_grad.size(), in_expected); + CHECK_EQ(out_grad.size(), out_expected); + + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor hy = in_data[rnn_enum::kStateOut].get(s); + Tensor dhy = out_grad[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + + DType * cx_ptr = NULL; + // DType * cy_ptr = NULL; + DType * dcx_ptr = NULL; + DType * dcy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm){ + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + } + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + + CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy.dptr_, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); } private: inline void Init(mshadow::Stream *s, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 98f8a5953d70..fd68fd628432 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,8 +21,8 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; - enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNOpInputs {kData, kParams, kState, kStateCell}; + enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNOpResource {kTempSpace}; } @@ -195,11 +195,11 @@ class RNNProp : public OperatorProperty { int numDirections = param_.bidirectional ? 
2 : 1; int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateIn, + rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kCellStateIn, + rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size_)); } // calculate parameter vector length From dc55e74bc324e7232c5b4089d6e96fb51d33ae74 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 13:16:50 +0200 Subject: [PATCH 26/36] - small fix for in/out names --- src/operator/rnn-inl.h | 8 ++++---- src/operator/rnn.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index fd68fd628432..137bebed5c06 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,7 +144,7 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "cell_state"}; + return {"data", "parameters", "state", "state_cell"}; } else { return {"data", "parameters", "state"}; } @@ -164,9 +164,9 @@ class RNNProp : public OperatorProperty { return 2; } - int NumVisibleOutputs() const override { - return 1; - } + // int NumVisibleOutputs() const override { + // return 1; + // } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 5e3b2b8894af..337410c8ddc1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -35,7 +35,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet From 8bd215cd6c97b17eb4297500359a7e4011425585 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 14:24:57 +0200 Subject: [PATCH 27/36] - fixed bug: parameters can't have underscore --- src/operator/cudnn_rnn-inl.h | 10 +++++----- src/operator/rnn-inl.h | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 6a642f6428f8..d696ead26255 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -314,7 +314,7 @@ class CuDNNRNNOp : public Operator { strideA ), CUDNN_STATUS_SUCCESS); dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; dimA[2] = 1; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; @@ -339,9 +339,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size_; + dimA[2] = param_.state_size; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -419,8 +419,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size_, - param_.num_layers_, + param_.state_size, + param_.num_layers, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 137bebed5c06..ed0cf0db84b1 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,8 +67,8 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size_; - uint32_t num_layers_; + uint32_t state_size; + uint32_t num_layers; bool bidirectional; int mode; float p, pkeep_; @@ -76,10 +76,10 @@ struct RNNParam : public dmlc::Parameter { bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size_) + DMLC_DECLARE_FIELD(state_size) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers_) + DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -193,29 +193,29 @@ class RNNProp : public OperatorProperty { int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers_; // double for bidirectional + int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); } // calculate parameter vector length - int param_size = rnn_param_size(param_.num_layers_, + int param_size = rnn_param_size(param_.num_layers, input_size, - param_.state_size_, + param_.state_size, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); // output: [sequence len, batch, output size] TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size_; + oshape[2] = numDirections * param_.state_size; TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; - outStateShape[2] = param_.state_size_; + outStateShape[2] = param_.state_size; out_shape->clear(); out_shape->push_back(oshape); From 2e333fcb54f5549faf6e10971de716b981cf3698 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 16:36:07 +0200 Subject: [PATCH 28/36] - fixed off-by-two error in weight shape inference for bidirectional net - moved calculated param to cudnn_rnn-inl.h --- src/operator/cudnn_rnn-inl.h | 7 ++++++- src/operator/rnn-inl.h | 10 +++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index d696ead26255..1fd7afc90e3a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -41,6 +41,12 @@ class CuDNNRNNOp : public Operator { } // RNN Direction direction_ = param_.bidirectional ? 
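// The off-by-two fix of this patch, sketched with the helpers from
// rnn-inl.h: each direction keeps its own full copy of the weights, and
// layers after the first consume the concatenated (2 * hiddenSize) outputs
// of both directions, so for the bidirectional case
//   size  = rnn_single_param_size(inputSize, hiddenSize, mode)
//         + (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode);
//   size *= 2;  // one copy per direction
// which is exactly the rnn_param_size change in the rnn-inl.h hunk of this
// patch.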
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Other + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } ~CuDNNRNNOp() { @@ -212,7 +218,6 @@ class CuDNNRNNOp : public Operator { Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); - CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, rnn_desc_, param_.seq_length_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index ed0cf0db84b1..b51216bf9d4d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -59,8 +59,10 @@ inline int rnn_param_size(int layerNum, // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(bidirectional) + if(bidirectional){ size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + size *= 2; + } else size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; @@ -102,12 +104,6 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - // convert MBytes first to Bytes and then to elements. - param_.pkeep_ = 1.0f - param_.p; - if(param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, From 430bd0365195d715bd123c097607b0d782482b9a Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:24:56 +0200 Subject: [PATCH 29/36] - added option to control num outputs --- src/operator/cudnn_rnn-inl.h | 5 ++++- src/operator/rnn-inl.h | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 1fd7afc90e3a..0c943bab7da0 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -7,9 +7,12 @@ #ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#include #include +#include +#include +#include #include "./rnn-inl.h" + namespace mxnet { namespace op { #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index b51216bf9d4d..d036e299e519 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -71,7 +71,7 @@ inline int rnn_param_size(int layerNum, struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - bool bidirectional; + bool bidirectional, state_outputs; int mode; float p, pkeep_; int seq_length_, batch_size_, input_size_; @@ -97,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) .describe("Fraction of the input that gets dropped out at training time"); + + DMLC_DECLARE_FIELD(state_outputs).set_default(false) + .describe("Whether to have the states as symbol outputs."); + } }; @@ -160,9 +164,11 @@ class RNNProp : public OperatorProperty { return 2; } - // int NumVisibleOutputs() const override { - // return 1; - // } + int NumVisibleOutputs() const override { + int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; + int num_outputs = param_.state_outputs ? 
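// state_outputs controls how many of the symbol's outputs are exposed to
// the user. As a sketch of NumVisibleOutputs():
//   state_outputs = false           -> 1  (just "output")
//   state_outputs = true, RNN/GRU   -> 2  (output, state)
//   state_outputs = true, LSTM      -> 3  (output, state, state_cell)
// The hidden outputs still count toward NumOutputs(); they are simply not
// returned.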
(mode_num + 1) : 1; + return num_outputs; + } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); @@ -193,11 +199,11 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size)); - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size)); - } + // calculate parameter vector length int param_size = rnn_param_size(param_.num_layers, input_size, @@ -217,7 +223,7 @@ class RNNProp : public OperatorProperty { out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if(param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -240,6 +246,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; From 4dbe1367f9204517ee91e62e43db2ac4a8c87c58 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:52:17 +0200 Subject: [PATCH 30/36] - removed lint --- src/operator/cudnn_rnn-inl.h | 137 +++++++++++++++-------------------- src/operator/rnn-inl.h | 32 ++++---- src/operator/rnn.cc | 5 +- src/operator/rnn.cu | 2 +- 4 files changed, 77 insertions(+), 99 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 0c943bab7da0..f3bfc1eac1fe 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -24,7 +24,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -46,7 +46,7 @@ class CuDNNRNNOp : public Operator { direction_ = param_.bidirectional ? 
From 4dbe1367f9204517ee91e62e43db2ac4a8c87c58 Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 13:52:17 +0200
Subject: [PATCH 30/36] - removed lint

---
 src/operator/cudnn_rnn-inl.h | 137 +++++++++++++++--------------------
 src/operator/rnn-inl.h       |  32 ++++----
 src/operator/rnn.cc          |   5 +-
 src/operator/rnn.cu          |   2 +-
 4 files changed, 77 insertions(+), 99 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 0c943bab7da0..f3bfc1eac1fe 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -24,7 +24,7 @@ class CuDNNRNNOp : public Operator {
     init_cudnn_ = false;
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
     // Defaults
-    input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet
+    input_mode_ = CUDNN_LINEAR_INPUT;  // Don't support this yet
     // RNN Mode
     switch (param_.mode) {
       case rnn_enum::kRnnRelu:
@@ -46,7 +46,7 @@ class CuDNNRNNOp : public Operator {
     direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
     // Other
     param_.pkeep_ = 1.0f - param_.p;
-    if(param_.mode == rnn_enum::kLstm)
+    if (param_.mode == rnn_enum::kLstm)
       param_.lstm_q_ = true;
     else
       param_.lstm_q_ = false;
@@ -54,7 +54,7 @@ class CuDNNRNNOp : public Operator {
 
   ~CuDNNRNNOp() {
     if (init_cudnn_) {
-      for(int i = 0; i < x_desc_vec_.size(); ++i){
+      for (int i = 0; i < x_desc_vec_.size(); ++i) {
         CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
         CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
         CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
@@ -63,18 +63,18 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); 
+      CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); 
+      CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS);
     }
   }
-  
+
   virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
                        const std::vector<TBlob> &aux_args) {
@@ -96,7 +96,7 @@ class CuDNNRNNOp : public Operator {
 
     DType * cx_ptr = NULL;
     DType * cy_ptr = NULL;
-    if (param_.mode == rnn_enum::kLstm){
+    if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
       cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
     }
@@ -107,9 +107,9 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(y.CheckContiguous(), true);
     CHECK_EQ(hy.CheckContiguous(), true);
 
-    if(!init_cudnn_){
+    if (!init_cudnn_) {
       Init(s, in_data, out_data);
-    } 
+    }
 
     // Get temp space
     int temp_size = workspace_size_;
@@ -117,8 +117,8 @@ class CuDNNRNNOp : public Operator {
     Tensor<gpu, 1, DType> temp_space =
       ctx.requested[rnn_enum::kTempSpace].get_space_typed<gpu, 1, DType>(
                               mshadow::Shape1(temp_size), s);
-    
-    if (ctx.is_train) { 
+
+    if (ctx.is_train) {
       CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_,
                        rnn_desc_,
                        param_.seq_length_,
@@ -139,8 +139,7 @@ class CuDNNRNNOp : public Operator {
                        temp_space.dptr_,
                        workspace_byte_,
                        temp_space.dptr_ + workspace_size_,
-                       reserve_space_byte_
-                       ), CUDNN_STATUS_SUCCESS);
+                       reserve_space_byte_), CUDNN_STATUS_SUCCESS);
     } else {
       // inference mode
       CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_,
@@ -161,11 +160,10 @@ class CuDNNRNNOp : public Operator {
                        cy_desc_,
                        cy_ptr,
                        temp_space.dptr_,
-                       workspace_byte_
-                       ), CUDNN_STATUS_SUCCESS);
+                       workspace_byte_), CUDNN_STATUS_SUCCESS);
     }
   }
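The forward path above requests one scratch allocation and carves it up: the first `workspace_size_` elements serve as the cuDNN workspace, and the tail is the training-only reserve space that must survive until the backward call. A hedged sketch of that bookkeeping, with float as a stand-in for DType; the real byte counts come from `cudnnGetRNNWorkspaceSize` and `cudnnGetRNNTrainingReserveSize`:

    #include <cstddef>

    // Split one contiguous buffer into the cuDNN workspace and the training
    // reserve space, as the Forward code above does.
    struct RnnScratch {
      float* workspace;   // first workspace_bytes/sizeof(float) elements
      float* reserve;     // immediately after the workspace; training only
    };

    inline RnnScratch split_scratch(float* base, size_t workspace_bytes,
                                    bool is_train) {
      // note: integer division rounds down, so callers should size the
      // buffer in whole elements, as the patch does with workspace_size_
      size_t workspace_elems = workspace_bytes / sizeof(float);
      RnnScratch s;
      s.workspace = base;
      s.reserve = is_train ? base + workspace_elems : nullptr;
      return s;
    }

At inference time only the workspace half is allocated at all, which is why `temp_size` adds `reserve_space_size_` conditionally on `ctx.is_train`.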
-  
+
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
                         const std::vector<TBlob> &in_grad,
                         const std::vector<TBlob> &aux_args) {
@@ -198,7 +196,7 @@ class CuDNNRNNOp : public Operator {
     // DType * cy_ptr = NULL;
     DType * dcx_ptr = NULL;
     DType * dcy_ptr = NULL;
-    if (param_.mode == rnn_enum::kLstm){
+    if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
       // cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
       dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
@@ -211,9 +209,9 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(y.CheckContiguous(), true);
     CHECK_EQ(hy.CheckContiguous(), true);
 
-    if(!init_cudnn_){
+    if (!init_cudnn_) {
       Init(s, in_data, out_data);
-    } 
+    }
 
     // Get temp space
     int temp_size = workspace_size_;
@@ -247,25 +245,24 @@ class CuDNNRNNOp : public Operator {
                     temp_space.dptr_,
                     workspace_byte_,
                     temp_space.dptr_ + workspace_size_,
-                    reserve_space_byte_
-                    ), CUDNN_STATUS_SUCCESS);
+                    reserve_space_byte_), CUDNN_STATUS_SUCCESS);
-    CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_,
-                    rnn_desc_,
-                    param_.seq_length_,
-                    x_desc_vec_.data(),
-                    x.dptr_,
+    CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_,
+                                     rnn_desc_,
+                                     param_.seq_length_,
+                                     x_desc_vec_.data(),
+                                     x.dptr_,
                                      hx_desc_,
-                    hx.dptr_,
-                    y_desc_vec_.data(),
+                                     hx.dptr_,
+                                     y_desc_vec_.data(),
                                      y.dptr_,
-                    temp_space.dptr_,
-                    workspace_byte_,
-                    dw_desc_,
+                                     temp_space.dptr_,
+                                     workspace_byte_,
+                                     dw_desc_,
                                      dw.dptr_,
-                    temp_space.dptr_ + workspace_size_,
-                    reserve_space_byte_
-                    ), CUDNN_STATUS_SUCCESS);
+                                     temp_space.dptr_ + workspace_size_,
+                                     reserve_space_byte_), CUDNN_STATUS_SUCCESS);
   }
+
  private:
   inline void Init(mshadow::Stream<gpu> *s,
                    const std::vector<TBlob> &in_data,
@@ -299,7 +296,7 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS);
-      
+
       dimA[0] = param_.batch_size_;
       dimA[1] = param_.input_size_;
       dimA[2] = 1;
       dimA[1] = param_.input_size_;
       strideA[0] = dimA[2] * dimA[1];
       strideA[1] = dimA[2];
-      strideA[2] = 1; 
+      strideA[2] = 1;
 
       CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i],
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i],
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
-      dimA[0] = param_.batch_size_; 
+      dimA[0] = param_.batch_size_;
       dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size;
       dimA[2] = 1;
       strideA[0] = dimA[2] * dimA[1];
@@ -332,21 +327,19 @@ class CuDNNRNNOp : public Operator {
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i],
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
     }
     x_desc_vec_ = x_vec;
     y_desc_vec_ = y_vec;
     dx_desc_vec_ = dx_vec;
     dy_desc_vec_ = dy_vec;
-    // set the state tensors 
+    // set the state tensors
     dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1);
     dimA[1] = param_.batch_size_;
     dimA[2] = param_.state_size;
@@ -367,64 +360,55 @@ class CuDNNRNNOp : public Operator {
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
 
     // Create Dropout descriptors
     CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
-    CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, 
-                      &dropout_byte_
-                      ), CUDNN_STATUS_SUCCESS);
+    CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
+                                       &dropout_byte_), CUDNN_STATUS_SUCCESS);
     dropout_size_ = dropout_byte_ / sizeof(DType);
     CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
                       s->dnn_handle_,
-                      param_.pkeep_, // keep probability
+                      param_.pkeep_,  // keep probability
                       NULL,
                       dropout_byte_,
                       seed_), CUDNN_STATUS_SUCCESS);
-    // RNN descriptors 
+    // RNN descriptors
     CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_,
                       param_.state_size,
                       param_.num_layers,
                       dropout_desc_,
                       input_mode_,
                       direction_,
                       mode_,
                       dtype_), CUDNN_STATUS_SUCCESS);
-    // Get temp space sizes 
+    // Get temp space sizes
     CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_,
                       rnn_desc_,
                       param_.seq_length_,
                       x_desc_vec_.data(),
-                      &workspace_byte_
-                      ), CUDNN_STATUS_SUCCESS);
+                      &workspace_byte_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_,
                       rnn_desc_,
                       param_.seq_length_,
                       x_desc_vec_.data(),
-                      &reserve_space_byte_
-                      ), CUDNN_STATUS_SUCCESS);
+                      &reserve_space_byte_), CUDNN_STATUS_SUCCESS);
     workspace_size_ = workspace_byte_ / sizeof(DType);
     reserve_space_size_ = reserve_space_byte_ / sizeof(DType);
@@ -467,15 +449,12 @@ class CuDNNRNNOp : public Operator {
                       dtype_,
                       format_,
                       3,
-                      dim_w
-                      ), CUDNN_STATUS_SUCCESS);
+                      dim_w), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_,
                       dtype_,
                       format_,
                       3,
-                      dim_w
-                      ), CUDNN_STATUS_SUCCESS);
-    
+                      dim_w), CUDNN_STATUS_SUCCESS);
   }
 }
@@ -486,7 +465,7 @@ class CuDNNRNNOp : public Operator {
   cudnnDirectionMode_t direction_;
   cudnnRNNInputMode_t input_mode_;
   cudnnDropoutDescriptor_t dropout_desc_;
-  unsigned long long seed_ = 4553;
+  unsigned long long seed_ = 1337ull;
   size_t workspace_byte_, reserve_space_byte_, dropout_byte_;
   int workspace_size_, reserve_space_size_, dropout_size_;
@@ -496,7 +475,7 @@ class CuDNNRNNOp : public Operator {
   cudnnTensorDescriptor_t dhx_desc_, dcx_desc_;
   cudnnTensorDescriptor_t dhy_desc_, dcy_desc_;
 
-  cudnnFilterDescriptor_t w_desc_, dw_desc_; 
+  cudnnFilterDescriptor_t w_desc_, dw_desc_;
 
 #if CUDNN_MAJOR == 5
   cudnnTensorFormat_t format_;
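The cuDNN 5 RNN API consumes a vector of per-time-step 3-D descriptors, and the Init code above relies on fully packed [batch, feature, 1] strides for each of them. A compact sketch of that dims-to-strides rule; it uses no cuDNN calls so it runs anywhere, and the example sizes are illustrative:

    #include <cassert>

    // Fully packed strides for a 3-D tensor: the innermost stride is 1, and
    // each outer stride is the product of the dims it skips over.
    static void packed_strides(const int dim[3], int stride[3]) {
      stride[2] = 1;
      stride[1] = dim[2];
      stride[0] = dim[2] * dim[1];
    }

    int main() {
      // one time step of input: [batch, input_size, 1], e.g. batch 32, input 128
      int dimA[3] = {32, 128, 1};
      int strideA[3];
      packed_strides(dimA, strideA);
      assert(strideA[0] == 128 && strideA[1] == 1 && strideA[2] == 1);
      // element (b, f, 0) lives at offset b*strideA[0] + f*strideA[1]
      return 0;
    }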
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index d036e299e519..2c7d20fe279c 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -30,13 +30,12 @@ namespace rnn_enum {
 // A utility function to calculate input size
 inline int rnn_single_param_size(int inputSize,
                                  int hiddenSize,
-                                 int mode){
+                                 int mode) {
   int size = hiddenSize * (hiddenSize + inputSize + 2);
   // Different RNN's have different num weights
-  switch(mode)
-  {
+  switch (mode) {
     case rnn_enum::kRnnRelu:
-      size *= 1 ;
+      size *= 1;
       break;
     case rnn_enum::kRnnTanh:
       size *= 1;
@@ -55,16 +54,16 @@ inline int rnn_param_size(int layerNum,
                           int inputSize,
                           int hiddenSize,
                           bool bidirectional,
-                          int mode){
+                          int mode) {
   // get size of first layer
   int size = rnn_single_param_size(inputSize, hiddenSize, mode);
   // get size of remaining layers
-  if(bidirectional){
+  if (bidirectional) {
     size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode);
     size *= 2;
+  } else {
+    size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode);
   }
-  else
-    size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode);
   return size;
 }
@@ -75,7 +74,7 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
   int mode;
   float p, pkeep_;
   int seq_length_, batch_size_, input_size_;
-  bool lstm_q_; // whether type is lstm
+  bool lstm_q_;  // whether type is lstm
 
   DMLC_DECLARE_PARAMETER(RNNParam) {
     DMLC_DECLARE_FIELD(state_size)
@@ -93,14 +92,13 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
     .add_enum("lstm", rnn_enum::kLstm)
     .add_enum("gru", rnn_enum::kGru)
     .describe("the type of RNN to compute");
-    
+
     DMLC_DECLARE_FIELD(p).set_default(0.)
     .set_range(0, 1)
     .describe("Fraction of the input that gets dropped out at training time");
 
     DMLC_DECLARE_FIELD(state_outputs).set_default(false)
     .describe("Whether to have the states as symbol outputs.");
-
   }
 };
@@ -117,7 +115,7 @@ class RNNOp : public Operator {
                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    // TODO: add MShadow implementation
+    // TODO(sbodenstein): add MShadow implementation
   }
 
   virtual void Backward(const OpContext &ctx,
@@ -129,7 +127,7 @@ class RNNOp : public Operator {
                         const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    // TODO: add MShadow implementation
+    // TODO(sbodenstein): add MShadow implementation
   }
 
  private:
@@ -153,14 +151,14 @@ class RNNProp : public OperatorProperty {
   std::vector<std::string> ListOutputs() const override {
     if (param_.mode == rnn_enum::kLstm)
       return {"output", "state", "state_cell"};
-    else 
+    else
       return {"output", "state"};
   }
 
   int NumOutputs() const override {
     if (param_.mode == rnn_enum::kLstm)
       return 3;
-    else 
+    else
       return 2;
   }
 
@@ -195,7 +193,7 @@ class RNNProp : public OperatorProperty {
     int batch_size = dshape[1];
     int input_size = dshape[2];
     int numDirections = param_.bidirectional ? 2 : 1;
-    int total_layers = numDirections * param_.num_layers; // double for bidirectional
+    int total_layers = numDirections * param_.num_layers;  // double for bidirectional
 
     SHAPE_ASSIGN_CHECK(*in_shape,
               rnn_enum::kState,
               Shape3(total_layers, batch_size, param_.state_size));
@@ -223,7 +221,7 @@ class RNNProp : public OperatorProperty {
     out_shape->push_back(oshape);
     out_shape->push_back(outStateShape);
     // Deal with lstm cell state
-    if(param_.mode == rnn_enum::kLstm)
+    if (param_.mode == rnn_enum::kLstm)
       out_shape->push_back(outStateShape);
     return true;
   }
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index 337410c8ddc1..3067c8e986c1 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -19,8 +19,9 @@ Operator *CreateOp<cpu>(RNNParam param, int dtype) {
   return op;
 }
 
-Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-    std::vector<int> *in_type) const {
+Operator *RNNProp::CreateOperatorEx(Context ctx,
+                                    std::vector<TShape> *in_shape,
+                                    std::vector<int> *in_type) const {
   std::vector<TShape> out_shape, aux_shape;
   std::vector<int> out_type, aux_type;
   CHECK(InferType(in_type, &out_type, &aux_type));
diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu
index fb90daf19b41..bf914026019d 100644
--- a/src/operator/rnn.cu
+++ b/src/operator/rnn.cu
@@ -21,7 +21,7 @@ Operator* CreateOp<gpu>(RNNParam param, int dtype) {
     op = new CuDNNRNNOp<DType>(param);
   })
 #else
-  LOG(FATAL) << "RNN is only available for cuDNN at the moment."; 
+  LOG(FATAL) << "RNN is only available for cuDNN at the moment.";
 #endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
   return op;
 }
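The rnn.cc/rnn.cu split touched above follows the usual MXNet pattern: a dtype switch instantiates the concrete operator, and the GPU path is compiled in only when cuDNN 5 is present. A hedged sketch of the dispatch shape, using local stand-ins rather than the real `MSHADOW_REAL_TYPE_SWITCH` macro and operator classes:

    #include <cstdio>

    enum DTypeFlag { kFloat32 = 0, kFloat64 = 1 };  // stand-in type flags

    struct Op { virtual ~Op() {} };
    template <typename DType> struct RnnOp : Op {};  // stand-in operator

    // MSHADOW_REAL_TYPE_SWITCH expands to a switch of roughly this shape,
    // binding DType to the concrete C++ type for the requested flag.
    Op* create_op(int dtype) {
      Op* op = nullptr;
      switch (dtype) {
        case kFloat32: op = new RnnOp<float>(); break;
        case kFloat64: op = new RnnOp<double>(); break;
        default: std::fprintf(stderr, "unsupported dtype\n");
      }
      return op;
    }

    int main() {
      Op* op = create_op(kFloat32);
      delete op;
      return 0;
    }

The compile-time `#if MXNET_USE_CUDNN` guard plays the same role one level up: without cuDNN there is simply no GPU implementation to dispatch to, hence the LOG(FATAL).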
From 27b23d25f276b7670b96824bca57cee63595caa3 Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 15:47:41 +0200
Subject: [PATCH 31/36] - correct handling of backward dependencies

---
 src/operator/cudnn_rnn-inl.h | 27 +++++++++++++++------------
 src/operator/rnn-inl.h       | 20 ++++++++++++++++----
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index f3bfc1eac1fe..3f63bc4de0f5 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -187,27 +187,30 @@ class CuDNNRNNOp : public Operator {
     Tensor<gpu, 1, DType> dw = in_grad[rnn_enum::kParams].get<gpu, 1, DType>(s);
     Tensor<gpu, 3, DType> hx = in_data[rnn_enum::kState].get<gpu, 3, DType>(s);
     Tensor<gpu, 3, DType> dhx = in_grad[rnn_enum::kState].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> hy = in_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> dhy = out_grad[rnn_enum::kStateOut].get<gpu, 3, DType>(s);
     Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
     Tensor<gpu, 3, DType> dy = out_grad[rnn_enum::kOut].get<gpu, 3, DType>(s);
 
-    DType * cx_ptr = NULL;
-    // DType * cy_ptr = NULL;
-    DType * dcx_ptr = NULL;
-    DType * dcy_ptr = NULL;
-    if (param_.mode == rnn_enum::kLstm) {
+    // only need kStateOut grad if state_outputs is true
+    void * dhy_ptr = NULL;
+    if (param_.state_outputs)
+      dhy_ptr = out_grad[rnn_enum::kStateOut].get<gpu, 3, DType>(s).dptr_;
+
+    // Deal with lstm
+    void * dcx_ptr = NULL;
+    void * dcy_ptr = NULL;
+    void * cx_ptr = NULL;
+
+    if(param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
-      // cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
       dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
-      dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
     }
-    
+    if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs)
+      dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
+
     CHECK_EQ(x.CheckContiguous(), true);
     CHECK_EQ(w.CheckContiguous(), true);
     CHECK_EQ(hx.CheckContiguous(), true);
     CHECK_EQ(y.CheckContiguous(), true);
-    CHECK_EQ(hy.CheckContiguous(), true);
 
     if (!init_cudnn_) {
       Init(s, in_data, out_data);
@@ -227,7 +230,7 @@ class CuDNNRNNOp : public Operator {
                     dy_desc_vec_.data(),
                     dy.dptr_,
                     dhy_desc_,
-                    dhy.dptr_,
+                    dhy_ptr,
                     dcy_desc_,
                     dcy_ptr,
                     w_desc_,
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 2c7d20fe279c..91284074b5d4 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -264,10 +264,22 @@ class RNNProp : public OperatorProperty {
       const std::vector<int> &out_grad,
       const std::vector<int> &in_data,
       const std::vector<int> &out_data) const override {
-    if (param_.mode == rnn_enum::kLstm)
-      return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]};
-    else
-      return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]};
+    std::vector<int> dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams],
+        in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]};
+
+    if (param_.state_outputs) {
+      dep.push_back(out_data[rnn_enum::kStateOut]);
+      dep.push_back(out_grad[rnn_enum::kStateOut]);
+    }
+
+    if (param_.mode == rnn_enum::kLstm) {
+      dep.push_back(in_data[rnn_enum::kStateCell]);
+      if(param_.state_outputs) {
+        dep.push_back(out_data[rnn_enum::kStateCellOut]);
+        dep.push_back(out_grad[rnn_enum::kStateCellOut]);
+      }
+    }
+    return dep;
   }
 
   std::vector<ResourceRequest> ForwardResource(
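Getting this dependency list right matters: under-declaring lets the engine free buffers the backward pass still reads, while over-declaring pins memory needlessly. A small sketch that makes the resulting sets explicit for the four mode/state_outputs combinations; the string IDs are mine, standing in for the integer variable IDs the real method returns:

    #include <cassert>
    #include <string>
    #include <vector>

    enum Mode { kRnnTanh, kLstm };

    // Mirrors the rewritten DeclareBackwardDependency: backward always needs
    // the inputs, the forward output and its gradient; the state outputs and
    // the LSTM cell state add entries only when they actually exist.
    static std::vector<std::string> backward_deps(Mode mode, bool state_outputs) {
      std::vector<std::string> dep = {"data", "params", "state",
                                      "out", "grad_out"};
      if (state_outputs) {
        dep.push_back("state_out");
        dep.push_back("grad_state_out");
      }
      if (mode == kLstm) {
        dep.push_back("state_cell");
        if (state_outputs) {
          dep.push_back("cell_out");
          dep.push_back("grad_cell_out");
        }
      }
      return dep;
    }

    int main() {
      assert(backward_deps(kRnnTanh, false).size() == 5);
      assert(backward_deps(kLstm, true).size() == 10);
      return 0;
    }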
From 2b5f26d0666277e59ad56c7a2da3e1625a38ffea Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 15:55:56 +0200
Subject: [PATCH 32/36] - fix lint

---
 src/operator/cudnn_rnn-inl.h | 6 +++---
 src/operator/rnn-inl.h       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 3f63bc4de0f5..d5deca2af2f8 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -197,16 +197,16 @@ class CuDNNRNNOp : public Operator {
 
     // Deal with lstm
     void * dcx_ptr = NULL;
-    void * dcy_ptr = NULL; 
+    void * dcy_ptr = NULL;
     void * cx_ptr = NULL;
 
-    if(param_.mode == rnn_enum::kLstm) {
+    if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
       dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
     }
     if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs)
       dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
-    
+
     CHECK_EQ(x.CheckContiguous(), true);
     CHECK_EQ(w.CheckContiguous(), true);
     CHECK_EQ(hx.CheckContiguous(), true);
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 91284074b5d4..ad4d21736345 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -274,7 +274,7 @@ class RNNProp : public OperatorProperty {
 
     if (param_.mode == rnn_enum::kLstm) {
       dep.push_back(in_data[rnn_enum::kStateCell]);
-      if(param_.state_outputs) {
+      if (param_.state_outputs) {
         dep.push_back(out_data[rnn_enum::kStateCellOut]);
         dep.push_back(out_grad[rnn_enum::kStateCellOut]);
       }

From ccd7004307487c1f479545fe641b2bf6d00d53ba Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 16:53:36 +0200
Subject: [PATCH 33/36] - fix type narrowing bug

---
 src/operator/cudnn_rnn-inl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index d5deca2af2f8..666c2a94e717 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -447,7 +447,8 @@ class CuDNNRNNOp : public Operator {
     // Set param descriptors
     CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS);
-    int dim_w[3] = {w.shape_[0], 1, 1};
+    int dim_w[3] = {1, 1, 1};
+    dim_w[0] = w.shape_[0];
    CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_,
                       dtype_,
                       format_,
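The "type narrowing bug" fixed above is a C++11 rule worth spelling out: `w.shape_[0]` is an unsigned `index_t`, and a braced initializer list rejects implicit narrowing conversions, so `int dim_w[3] = {w.shape_[0], 1, 1};` is ill-formed (a hard error under -Werror). Plain assignment after aggregate initialization sidesteps the rule, since narrowing is only banned inside the braces. A minimal reproduction, with a local stand-in for `index_t`:

    #include <cstdint>

    int main() {
      uint32_t n = 42;            // stand-in for mshadow::index_t w.shape_[0]
      // int bad[3] = {n, 1, 1};  // ill-formed: narrowing uint32_t -> int in {...}
      int dim_w[3] = {1, 1, 1};   // the patch's fix:
      dim_w[0] = n;               // plain assignment may narrow; no diagnostic
      return dim_w[0] == 42 ? 0 : 1;
    }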
From 8fd0d92e7b2c465c45ffba1edf3123cf7cd8cbef Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Sun, 24 Jul 2016 02:38:21 +0200
Subject: [PATCH 34/36] - fixed incorrect dropout parameter
 - added dropout states
 - fixed incorrect handling of variable outputs

---
 src/operator/cudnn_rnn-inl.h | 30 +++++++++++++-------
 src/operator/rnn-inl.h       | 54 ++++++++++++++++++++----------------
 2 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 666c2a94e717..e154a8af4740 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -45,7 +45,6 @@ class CuDNNRNNOp : public Operator {
     // RNN Direction
     direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
     // Other
-    param_.pkeep_ = 1.0f - param_.p;
     if (param_.mode == rnn_enum::kLstm)
       param_.lstm_q_ = true;
     else
@@ -72,6 +71,7 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudaFree(dropout_states_), CUDNN_STATUS_SUCCESS);
     }
   }
 
@@ -83,6 +83,9 @@ class CuDNNRNNOp : public Operator {
     using namespace mshadow;
     size_t in_expected = param_.lstm_q_ ? 4 : 3;
     size_t out_expected = param_.lstm_q_ ? 3 : 2;
+    if (!param_.state_outputs)
+      out_expected = 1;
+
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -90,9 +93,11 @@ class CuDNNRNNOp : public Operator {
     Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
     Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
     Tensor<gpu, 3, DType> hx = in_data[rnn_enum::kState].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> hy = out_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
+
+    void * hy_ptr = NULL;
+    if (param_.state_outputs)
+      hy_ptr = out_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s).dptr_;
 
     DType * cx_ptr = NULL;
     DType * cy_ptr = NULL;
@@ -105,19 +110,16 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(w.CheckContiguous(), true);
     CHECK_EQ(hx.CheckContiguous(), true);
     CHECK_EQ(y.CheckContiguous(), true);
-    CHECK_EQ(hy.CheckContiguous(), true);
 
     if (!init_cudnn_) {
       Init(s, in_data, out_data);
     }
-
     // Get temp space
     int temp_size = workspace_size_;
     temp_size += ctx.is_train ? reserve_space_size_ : 0;
     Tensor<gpu, 1, DType> temp_space =
       ctx.requested[rnn_enum::kTempSpace].get_space_typed<gpu, 1, DType>(
                               mshadow::Shape1(temp_size), s);
-
     if (ctx.is_train) {
       CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_,
                        rnn_desc_,
@@ -133,7 +135,7 @@ class CuDNNRNNOp : public Operator {
                        y_desc_vec_.data(),
                        y.dptr_,
                        hy_desc_,
-                       hy.dptr_,
+                       hy_ptr,
                        cy_desc_,
                        cy_ptr,
                        temp_space.dptr_,
@@ -156,7 +158,7 @@ class CuDNNRNNOp : public Operator {
                        y_desc_vec_.data(),
                        y.dptr_,
                        hy_desc_,
-                       hy.dptr_,
+                       hy_ptr,
                        cy_desc_,
                        cy_ptr,
                        temp_space.dptr_,
@@ -174,6 +176,9 @@ class CuDNNRNNOp : public Operator {
     using namespace mshadow;
     size_t in_expected = param_.lstm_q_ ? 4 : 3;
     size_t out_expected = param_.lstm_q_ ? 3 : 2;
+    if (!param_.state_outputs)
+      out_expected = 1;
+
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
     CHECK_EQ(in_grad.size(), in_expected);
@@ -276,6 +281,9 @@ class CuDNNRNNOp : public Operator {
 #endif
     size_t in_expected = param_.lstm_q_ ? 4 : 3;
     size_t out_expected = param_.lstm_q_ ? 3 : 2;
+    if (!param_.state_outputs)
+      out_expected = 1;
+
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
     if (!init_cudnn_) {
@@ -405,10 +413,11 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
                                        &dropout_byte_), CUDNN_STATUS_SUCCESS);
     dropout_size_ = dropout_byte_ / sizeof(DType);
+    CHECK_EQ(cudaMalloc(&dropout_states_, dropout_byte_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
                       s->dnn_handle_,
-                      param_.pkeep_,  // keep probability
-                      NULL,
+                      param_.p,  // keep probability
+                      dropout_states_,
                       dropout_byte_,
                       seed_), CUDNN_STATUS_SUCCESS);
     // RNN descriptors
@@ -469,6 +478,7 @@ class CuDNNRNNOp : public Operator {
   cudnnDirectionMode_t direction_;
   cudnnRNNInputMode_t input_mode_;
   cudnnDropoutDescriptor_t dropout_desc_;
+  void *dropout_states_;
   unsigned long long seed_ = 1337ull;
   size_t workspace_byte_, reserve_space_byte_, dropout_byte_;
   int workspace_size_, reserve_space_size_, dropout_size_;
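Two things changed in the dropout setup: the descriptor now receives a real device buffer for its RNG state (previously NULL), and the probability argument. cuDNN's `cudnnSetDropoutDescriptor` takes the probability of *dropping* an activation, so passing `param_.p` directly, rather than the old `pkeep_ = 1 - p`, is the substantive fix; the surviving "keep probability" comment in the diff is stale. A hedged CUDA sketch of the allocation and lifetime pattern the patch establishes (error handling trimmed; `handle` is assumed to be a valid cudnnHandle_t created elsewhere):

    #include <cudnn.h>
    #include <cuda_runtime.h>

    // The states buffer must stay alive for as long as the descriptor is in
    // use, and is freed alongside the descriptor (as the destructor now does).
    cudnnDropoutDescriptor_t make_dropout(cudnnHandle_t handle, float drop_prob,
                                          unsigned long long seed,
                                          void** states_out) {
      cudnnDropoutDescriptor_t desc;
      size_t state_bytes = 0;
      cudnnCreateDropoutDescriptor(&desc);
      cudnnDropoutGetStatesSize(handle, &state_bytes);  // per-handle state size
      cudaMalloc(states_out, state_bytes);              // device-side RNG state
      cudnnSetDropoutDescriptor(desc, handle,
                                drop_prob,              // probability to DROP
                                *states_out, state_bytes,
                                seed);
      return desc;
    }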
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index ad4d21736345..a70138adb7ce 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -149,20 +149,17 @@ class RNNProp : public OperatorProperty {
   }
 
   std::vector<std::string> ListOutputs() const override {
-    if (param_.mode == rnn_enum::kLstm)
-      return {"output", "state", "state_cell"};
+    std::vector<std::string> outputs = {"output"};
+    if (!param_.state_outputs)
+      return outputs;
     else
-      return {"output", "state"};
-  }
-
-  int NumOutputs() const override {
+      outputs.push_back("state");
     if (param_.mode == rnn_enum::kLstm)
-      return 3;
-    else
-      return 2;
+      outputs.push_back("state_cell");
+    return outputs;
   }
 
-  int NumVisibleOutputs() const override {
+  int NumOutputs() const override {
     int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1;
     int num_outputs = param_.state_outputs ? (mode_num + 1) : 1;
     return num_outputs;
@@ -209,21 +206,26 @@ class RNNProp : public OperatorProperty {
                     param_.bidirectional,
                     param_.mode);
     SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size));
+
+    out_shape->clear();
     // output: [sequence len, batch, output size]
     TShape oshape = dshape;
     oshape[2] = numDirections * param_.state_size;
-    TShape outStateShape = dshape;
-    outStateShape[0] = total_layers;
-    outStateShape[1] = batch_size;
-    outStateShape[2] = param_.state_size;
-
-    out_shape->clear();
     out_shape->push_back(oshape);
-    out_shape->push_back(outStateShape);
-    // Deal with lstm cell state
-    if (param_.mode == rnn_enum::kLstm)
+    if (!param_.state_outputs) {
+      return true;
+    } else {
+      // outStateShape: [layer_num, batch, state size]
+      TShape outStateShape = dshape;
+      outStateShape[0] = total_layers;
+      outStateShape[1] = batch_size;
+      outStateShape[2] = param_.state_size;
       out_shape->push_back(outStateShape);
-    return true;
+      // Deal with lstm cell state
+      if (param_.mode == rnn_enum::kLstm)
+        out_shape->push_back(outStateShape);
+      return true;
+    }
   }
 
   bool InferType(std::vector<int> *in_type,
@@ -243,11 +245,15 @@ class RNNProp : public OperatorProperty {
     }
     out_type->clear();
     out_type->push_back(dtype);
-    out_type->push_back(dtype);
-    // Deal with lstm cell state
-    if (param_.mode == rnn_enum::kLstm)
+    if (!param_.state_outputs) {
+      return true;
+    } else {
       out_type->push_back(dtype);
-    return true;
+      // Deal with lstm cell state
+      if (param_.mode == rnn_enum::kLstm)
+        out_type->push_back(dtype);
+      return true;
+    }
   }
 
   OperatorProperty* Copy() const override {
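The reshaped output list is easiest to verify with concrete numbers: `output` is [seq_len, batch, directions * state_size], while each emitted state is [layers * directions, batch, state_size]. A sketch of the shape computation mirroring InferShape above; the sizes in main() are illustrative only:

    #include <cassert>

    struct Shapes {
      int out[3];    // [seq_len, batch, directions * state_size]
      int state[3];  // [num_layers * directions, batch, state_size]
    };

    static Shapes rnn_output_shapes(int seq_len, int batch, int state_size,
                                    int num_layers, bool bidirectional) {
      int dirs = bidirectional ? 2 : 1;
      Shapes s;
      s.out[0] = seq_len;
      s.out[1] = batch;
      s.out[2] = dirs * state_size;        // fwd+bwd features concatenated
      s.state[0] = num_layers * dirs;      // one state slab per layer-direction
      s.state[1] = batch;
      s.state[2] = state_size;
      return s;
    }

    int main() {
      // seq 35, batch 20, hidden 200, 2 layers, bidirectional
      Shapes s = rnn_output_shapes(35, 20, 200, 2, true);
      assert(s.out[2] == 400);
      assert(s.state[0] == 4);
      return 0;
    }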
From 4f46668590822a1446e3b8cf6b390180f9fde200 Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Sun, 24 Jul 2016 11:05:49 +0200
Subject: [PATCH 35/36] - fix incorrect cell state forward handling

---
 src/operator/cudnn_rnn-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index e154a8af4740..69e092aa6d6a 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -103,7 +103,7 @@ class CuDNNRNNOp : public Operator {
     DType * cy_ptr = NULL;
     if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
-      cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
+      cy_ptr = (out_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
     }
 
     CHECK_EQ(x.CheckContiguous(), true);

From 3c50c5c25f23fb826489d761c282122f63753b8e Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Sun, 24 Jul 2016 23:53:54 +0200
Subject: [PATCH 36/36] - fixed lint by replacing unsigned long long with
 uint64_t

---
 src/operator/cudnn_rnn-inl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 69e092aa6d6a..5707846a781f 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <cstdint>
 #include "./rnn-inl.h"
 
 namespace mxnet {
@@ -479,7 +480,7 @@ class CuDNNRNNOp : public Operator {
   cudnnRNNInputMode_t input_mode_;
   cudnnDropoutDescriptor_t dropout_desc_;
   void *dropout_states_;
-  unsigned long long seed_ = 1337ull;
+  uint64_t seed_ = 1337ull;
   size_t workspace_byte_, reserve_space_byte_, dropout_byte_;
   int workspace_size_, reserve_space_size_, dropout_size_;