From 919c6f4eb59a57cc7245e6be55057399f5eb5a6b Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 22:39:46 -0400 Subject: [PATCH 01/36] - first commit --- src/operator/cudnn_rnn-inl.h | 208 ++++++++++++++++ src/operator/rnn-inl.h | 471 +++++++++++++++++++++++++++++++++++ src/operator/rnn.cc | 41 +++ src/operator/rnn.cu | 33 +++ 4 files changed, 753 insertions(+) create mode 100644 src/operator/cudnn_rnn-inl.h create mode 100644 src/operator/rnn-inl.h create mode 100644 src/operator/rnn.cc create mode 100644 src/operator/rnn.cu diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h new file mode 100644 index 000000000000..37895c2b2488 --- /dev/null +++ b/src/operator/cudnn_rnn-inl.h @@ -0,0 +1,208 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file cudnn_spatial_transformer-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ +#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ + +#include +#include +#include "./rnn-inl.h" +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +template +class CuDNNRNNOp : public Operator { + public: + explicit CuDNNRNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + rnn_mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + rnn_mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + rnn_mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + rnn_mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implmented"; + } + // RNN Direction + switch (param_.direction) { + case rnn_enum::kUnidirectional: + rnn_direction_ = CUDNN_UNIDIRECTIONAL; + break; + case rnn_enum::kBidirectional: + rnn_direction_ = CUDNN_BIDIRECTIONAL; + break; + default: + LOG(FATAL) << "Not implmented"; + } + } + // ~CuDNNRNNOp() { + // if (init_cudnn_) { + // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); + // } + // } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor out = out_data[st::kOut].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); + // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // if (!init_cudnn_) { + // Init(s, in_data, out_data); + // } + // CHECK_EQ(data.CheckContiguous(), true); + // CHECK_EQ(out.CheckContiguous(), true); + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, + // st_desc_, + // loc.dptr_, + // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + // } + // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // grid.dptr_, + // &beta, + // out_desc_, + // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + } + // + virtual void 
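  // The commented-out body above is scaffolding carried over from
  // cudnn_spatial_transformer-inl.h. A minimal sketch of the cuDNN v5 call
  // this Forward() is meant to make once the descriptors are set up in
  // Init() (descriptor and workspace names here are assumptions):
  //
  //   CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, seq_length,
  //                                    x_desc_vec, x_ptr, hx_desc_, hx_ptr,
  //                                    cx_desc_, cx_ptr, w_desc_, w_ptr,
  //                                    y_desc_vec, y_ptr, hy_desc_, hy_ptr,
  //                                    cy_desc_, cy_ptr,
  //                                    workspace, workspace_bytes,
  //                                    reserve_space, reserve_bytes),
  //            CUDNN_STATUS_SUCCESS);
  //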
Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // CHECK_EQ(out_grad.size(), 1); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor grad = out_grad[st::kOut].get(s); + // Tensor ddata = in_grad[st::kData].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); + // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in + // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // typename DataType::ScaleType alpha_dgrid = 1.0f; + // typename DataType::ScaleType beta_dgrid = 0.0f; + // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // &beta, + // in_desc_/*reuse in_desc_*/, + // ddata.dptr_/*output*/, + // &alpha_dgrid, + // out_desc_/*reuse out_desc_*/, + // grad.dptr_, + // grid.dptr_, + // &beta_dgrid, + // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, + // st_desc_, + // grid.dptr_, + // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); + // } + } + // + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // if (!init_cudnn_) { + // init_cudnn_ = true; + // // Tensor data = in_data[st::kData].get(s); + // // Tensor out = out_data[st::kOut].get(s); + // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + // } + } + + bool init_cudnn_; + cudnnDataType_t dtype_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t rnn_mode_; + cudnnDirectionMode_t rnn_direction_; + cudnnRNNInputMode_t rnn_input_mode_; + cudnnDropoutDescriptor_t rnn_dropout_; + // cudnnTensorDescriptor_t in_desc_; + // cudnnTensorDescriptor_t out_desc_; + #if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format_; + #endif + RNNParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace 
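// A worked example for the parameter-size helpers defined in rnn-inl.h just
// below (illustrative numbers, not from the patch): with inputSize = 10 and
// hiddenSize = 20, one LSTM layer needs
//   hiddenSize * (hiddenSize + inputSize + 2) * 4
//     = 20 * (20 + 10 + 2) * 4 = 2560
// weights + biases, the factor 4 being the LSTM's four gates (3 for GRU,
// 1 for the vanilla relu/tanh RNNs).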
op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h new file mode 100644 index 000000000000..3a538f001d5b --- /dev/null +++ b/src/operator/rnn-inl.h @@ -0,0 +1,471 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file rnn-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_OPERATOR_RNN_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace rnn_enum { + enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNDirectionType {kUnidirectional, kBidirectional}; + enum RNNOpResource {kTempSpace}; +} + +// A utility function to calculate input size + +inline int rnn_single_param_size(int inputSize, + int hiddenSize, + int mode){ + int size = hiddenSize * (hiddenSize + inputSize + 2); + // Different RNN's have different num weights + switch(mode) + { + case rnn_enum::kRnnRelu: + size *= 1 ; + break; + case rnn_enum::kRnnTanh: + size *= 1; + break; + case rnn_enum::kLstm: + size *= 4; + break; + case rnn_enum::kGru: + size *= 3; + break; + } + return size; +} + +inline int rnn_param_size(int layerNum, + int inputSize, + int hiddenSize, + int direction, + int mode){ + // get size of first layer + int size = rnn_single_param_size(inputSize, hiddenSize, mode); + // get size of remaining layers + if(direction == rnn_enum::kUnidirectional) + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); + else // bidirectional case: input size increases by 2 + size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + return size; +} + +struct RNNParam : public dmlc::Parameter { + uint32_t state_size; + uint32_t num_layers; + uint64_t workspace; + bool batch_first; + int direction; + int mode; + + DMLC_DECLARE_PARAMETER(RNNParam) { + DMLC_DECLARE_FIELD(state_size) + .describe("size of the state for each layer"); + + DMLC_DECLARE_FIELD(num_layers) + .describe("number of stacked layers"); + + DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) + .describe("Tmp workspace for RNN (MB)"); + + DMLC_DECLARE_FIELD(direction) + .add_enum("unidirectional", rnn_enum::kUnidirectional) + .add_enum("bidirectional", rnn_enum::kBidirectional) + .describe("specifies the recurrence pattern"); + + DMLC_DECLARE_FIELD(mode) + .add_enum("rnn_relu", rnn_enum::kRnnRelu) + .add_enum("rnn_tanh", rnn_enum::kRnnTanh) + .add_enum("lstm", rnn_enum::kLstm) + .add_enum("gru", rnn_enum::kGru) + .describe("the type of RNN to compute"); + } +}; + +template +class RNNOp : public Operator { + public: + explicit RNNOp(RNNParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
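    // 1 MB = 2^20 bytes, so (workspace << 20) converts MB to bytes, and the
    // division by sizeof(real_t) (4 for float32) turns bytes into an element
    // count; e.g. the default 512 MB becomes 512 * 2^20 / 4 = 134,217,728
    // elements.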
+ param_.workspace = (param_.workspace << 20) / sizeof(real_t); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; +// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); + +// CHECK_EQ(in_data.size(), expected); +// CHECK_EQ(out_data.size(), 1); +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor out = out_data[rnn_enum::kOut].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(out.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(out.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), +// param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, +// gstride * (gid + 1)); +// tmpc = dot(wmat[gid].T(), temp_dst[gid]); +// } +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// out.Slice(i, i + step) = pack_col2patch(temp_col, +// out.Slice(i, i + step).shape_, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1); // RNN only support dilate equals 1 +// } else { +// Shape<4> pshape = out.Slice(i, i + step).shape_; +// pshape[2] += 2 * param_.pad[0]; +// pshape[3] += 2 * param_.pad[1]; +// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, +// pshape, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1), // RNN only support dilate equals 1 +// out[i][0].shape_); +// } +// } +// if (!param_.no_bias) { +// // add bias, broadcast bias to dim 1: channel +// Tensor bias = in_data[rnn_enum::kBias].get(s); +// out += broadcast<1>(bias, out.shape_); +// } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful +// CHECK_EQ(out_grad.size(), 1); +// size_t expected = param_.no_bias == 0 ? 
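// The commented-out Forward/Backward bodies here (num_group, kernel, pad,
// unpack_patch2col, no_bias) are convolution code kept only as a placeholder;
// none of those fields exist on RNNParam, and patch 02 of this series
// replaces the blocks with "TODO: add MShadow implementation" stubs.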
3 : 2; +// CHECK(in_data.size() == expected && in_grad.size() == expected); +// CHECK_EQ(req.size(), expected); +// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); +// // get data +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor grad = out_grad[rnn_enum::kOut].get(s); +// Tensor gdata = in_grad[rnn_enum::kData].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// Tensor gwmat = +// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(grad.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// if (i == 0) { +// Tensor tmp_gwmat = gwmat[gid]; +// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); +// } else { +// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); +// } +// } +// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// temp_dst[gid] = dot(wmat[gid], tmpc); +// } +// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, +// mshadow::Shape4(gdata.shape_[1], +// step, +// gdata.size(2), +// gdata.size(3)))); +// } +// } +// if (!param_.no_bias) { +// Tensor gbias = in_grad[rnn_enum::kBias].get(s); +// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); +// } + } + + private: +// inline index_t InitTemp(const mshadow::Shape<4> &ishape, +// const mshadow::Shape<4> &oshape) { +// const int ksize_y = param_.kernel[0]; +// const int ksize_x = param_.kernel[1]; +// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, +// oshape[2] * oshape[3]); +// shape_dstunit_ = mshadow::Shape3(param_.num_group, +// oshape[1] / param_.num_group, +// oshape[2] * oshape[3]); +// // See convolution for workspace calculations +// nstep_ = std::max( +// std::min( +// static_cast( +// param_.workspace / (shape_colunit_.Size() + 
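// Shape conventions used by InferShape below: data is
// (seqLength, batch, inputDim), hidden state is
// (numLayers * numDirections, batch, stateSize), and output is
// (seqLength, batch, numDirections * stateSize). Illustrative example: a
// 2-layer bidirectional LSTM on (25, 32, 100) input with state_size = 50
// yields state shape (4, 32, 50) and output shape (25, 32, 100). Note that
// the code reads batchSize from dshape[0], which under the documented layout
// is seqLength, so one of the two is off.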
shape_dstunit_.Size())), +// ishape[0]), +// 1U); + +// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], +// shape_colunit_[1] * nstep_); +// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * nstep_); +// index_t required_size = scol.Size() + sdst.Size(); +// CHECK_GE(param_.workspace, required_size) +// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" +// << "Given: " << param_.workspace * sizeof(DType); +// return required_size; +// } + + private: + RNNParam param_; +}; // class RNNOp + + + + +template +Operator* CreateOp(RNNParam param, int dtype); + +#if DMLC_USE_CXX11 +class RNNProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "weight", "state", "cell_state"}; + } else { + return {"data", "weight", "state"}; + } + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3) \ + << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; + // Infer hidden state + cell state + int batchSize = dshape[0]; + int inputSize = dshape[2]; + int numDirections = 1; + if(param_.direction == rnn_enum::kBidirectional){ + numDirections = 2; + } + int total_layers = numDirections * param_.num_layers; // double for bidirectional + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + if (param_.mode == rnn_enum::kLstm){ + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kCellStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + } + // infer weight size + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, + param_.mode); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + // infer output size + TShape oshape = dshape; + oshape[3] = numDirections * param_.state_size; + // infer output state size + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batchSize; + outStateShape[2] = param_.state_size; + + out_shape->clear(); + out_shape->push_back(oshape); + out_shape->push_back(outStateShape); + if (param_.mode == rnn_enum::kLstm) + out_shape->push_back(outStateShape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new RNNProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "RNN"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.mode == rnn_enum::kLstm) + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + else + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + RNNParam param_; +}; // class RNNProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc new file mode 100644 index 000000000000..40f7f705718d --- /dev/null +++ b/src/operator/rnn.cc @@ -0,0 +1,41 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file rnn.cc + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(RNNParam param, int dtype) { + LOG(FATAL) << "RNN is only available for gpu at the moment."; + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new RNNOp(param); + }); + return op; +} + +Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(RNNParam); + +MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) +.describe("Apply a recurrent layer to input.") +.add_argument("data", "Symbol", "Input data to RNN") +.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") +.add_arguments(RNNParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu new file mode 100644 index 000000000000..2cb482f591b2 --- /dev/null +++ b/src/operator/rnn.cu @@ -0,0 +1,33 @@ +/*! 
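 * (On the rnn.cc registration just above: the operator is exposed under the
 * name "RNN"; the add_argument names "hidden_state"/"cell_state" differ from
 * ListArguments(), which calls the same inputs "state" and "cell_state" --
 * the binding order follows ListArguments().)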
+ * Copyright (c) 2015 by Contributors + * \file rnn.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" +#include +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +#include "./cudnn_rnn-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(RNNParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNRNNOp(param); + }) +#else + 1; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new SpatialTransformerOp(param); + }) +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + return op; +} + +} // namespace op +} // namespace mxnet From 7025db87033b4846c046fdcea74eacafa54127e3 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 23:17:47 -0400 Subject: [PATCH 02/36] - removed unnecssary commented out code - fixed error in output shape inference --- src/operator/rnn-inl.h | 207 +++-------------------------------------- 1 file changed, 12 insertions(+), 195 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 3a538f001d5b..37150bf58878 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -29,7 +29,6 @@ namespace rnn_enum { } // A utility function to calculate input size - inline int rnn_single_param_size(int inputSize, int hiddenSize, int mode){ @@ -116,86 +115,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; -// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); - -// CHECK_EQ(in_data.size(), expected); -// CHECK_EQ(out_data.size(), 1); -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor out = out_data[rnn_enum::kOut].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(out.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(out.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), -// param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, -// gstride * (gid + 1)); -// tmpc = 
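// (In rnn.cu above: the stray "1;" and the SpatialTransformerOp fallback in
// the #else branch are leftovers from the file this was templated on and
// would not compile without cuDNN; patch 03 replaces the branch with a
// LOG(FATAL) explaining that RNN currently requires cuDNN.)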
dot(wmat[gid].T(), temp_dst[gid]); -// } -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// out.Slice(i, i + step) = pack_col2patch(temp_col, -// out.Slice(i, i + step).shape_, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1); // RNN only support dilate equals 1 -// } else { -// Shape<4> pshape = out.Slice(i, i + step).shape_; -// pshape[2] += 2 * param_.pad[0]; -// pshape[3] += 2 * param_.pad[1]; -// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, -// pshape, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1), // RNN only support dilate equals 1 -// out[i][0].shape_); -// } -// } -// if (!param_.no_bias) { -// // add bias, broadcast bias to dim 1: channel -// Tensor bias = in_data[rnn_enum::kBias].get(s); -// out += broadcast<1>(bias, out.shape_); -// } + // TODO: add MShadow implementation } virtual void Backward(const OpContext &ctx, @@ -207,125 +127,13 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful -// CHECK_EQ(out_grad.size(), 1); -// size_t expected = param_.no_bias == 0 ? 3 : 2; -// CHECK(in_data.size() == expected && in_grad.size() == expected); -// CHECK_EQ(req.size(), expected); -// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); -// // get data -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor grad = out_grad[rnn_enum::kOut].get(s); -// Tensor gdata = in_grad[rnn_enum::kData].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// Tensor gwmat = -// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(grad.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// if (i == 0) { -// Tensor tmp_gwmat = gwmat[gid]; -// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); -// } else { -// gwmat[gid] += dot(temp_dst[gid], 
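// (The (i == 0) ? Assign(...) : "+=" pattern in the deleted code above is
// MXNet's way of honouring the gradient request type: the first batch slice
// writes gwmat according to req[kWeight], and every later slice accumulates.)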
tmpc.T()); -// } -// } -// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// temp_dst[gid] = dot(wmat[gid], tmpc); -// } -// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, -// mshadow::Shape4(gdata.shape_[1], -// step, -// gdata.size(2), -// gdata.size(3)))); -// } -// } -// if (!param_.no_bias) { -// Tensor gbias = in_grad[rnn_enum::kBias].get(s); -// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); -// } + // TODO: add MShadow implementation } - private: -// inline index_t InitTemp(const mshadow::Shape<4> &ishape, -// const mshadow::Shape<4> &oshape) { -// const int ksize_y = param_.kernel[0]; -// const int ksize_x = param_.kernel[1]; -// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, -// oshape[2] * oshape[3]); -// shape_dstunit_ = mshadow::Shape3(param_.num_group, -// oshape[1] / param_.num_group, -// oshape[2] * oshape[3]); -// // See convolution for workspace calculations -// nstep_ = std::max( -// std::min( -// static_cast( -// param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), -// ishape[0]), -// 1U); - -// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], -// shape_colunit_[1] * nstep_); -// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * nstep_); -// index_t required_size = scol.Size() + sdst.Size(); -// CHECK_GE(param_.workspace, required_size) -// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" -// << "Given: " << param_.workspace * sizeof(DType); -// return required_size; -// } - private: RNNParam param_; }; // class RNNOp - - - template Operator* CreateOp(RNNParam param, int dtype); @@ -340,6 +148,14 @@ class RNNProp : public OperatorProperty { } } + std::vector ListOutputs() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"output", "final_state", "final_state_cell"}; + } else { + return {"output", "final_state"}; + } + } + void Init(const std::vector >& kwargs) override { param_.Init(kwargs); } @@ -386,7 +202,7 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[3] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; @@ -396,6 +212,7 @@ class RNNProp : public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; From e7c2e98df7aef890682890021c34ab05e4ac1157 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 10 Jul 2016 00:55:39 -0400 Subject: [PATCH 03/36] - some renaming - added cudnn destructors --- src/operator/cudnn_rnn-inl.h | 163 +++++++++++++++++++++++------------ src/operator/rnn-inl.h | 34 ++++---- src/operator/rnn.cc | 6 +- src/operator/rnn.cu | 5 +- 4 files changed, 129 insertions(+), 79 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 37895c2b2488..61d6d2c2f23a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -23,16 +23,16 @@ class CuDNNRNNOp : public Operator { // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: - rnn_mode_ = 
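// (Patch 02, visible above, also named the outputs via ListOutputs() --
// "output", "final_state", plus "final_state_cell" for LSTM -- and fixed the
// output shape inference to write numDirections * state_size into oshape[2],
// the feature axis of the rank-3 (seqLength, batch, feature) output, instead
// of the out-of-range oshape[3].)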
CUDNN_RNN_RELU; + mode_ = CUDNN_RNN_RELU; break; case rnn_enum::kRnnTanh: - rnn_mode_ = CUDNN_RNN_TANH; + mode_ = CUDNN_RNN_TANH; break; case rnn_enum::kLstm: - rnn_mode_ = CUDNN_LSTM; + mode_ = CUDNN_LSTM; break; case rnn_enum::kGru: - rnn_mode_ = CUDNN_GRU; + mode_ = CUDNN_GRU; break; default: LOG(FATAL) << "Not implmented"; @@ -40,22 +40,31 @@ class CuDNNRNNOp : public Operator { // RNN Direction switch (param_.direction) { case rnn_enum::kUnidirectional: - rnn_direction_ = CUDNN_UNIDIRECTIONAL; + direction_ = CUDNN_UNIDIRECTIONAL; break; case rnn_enum::kBidirectional: - rnn_direction_ = CUDNN_BIDIRECTIONAL; + direction_ = CUDNN_BIDIRECTIONAL; break; default: LOG(FATAL) << "Not implmented"; } } - // ~CuDNNRNNOp() { - // if (init_cudnn_) { - // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); - // } - // } + + ~CuDNNRNNOp() { + if (init_cudnn_) { + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + } + } + } virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -150,52 +159,96 @@ class CuDNNRNNOp : public Operator { const std::vector &in_data, const std::vector &out_data) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // if (!init_cudnn_) { - // init_cudnn_ = true; - // // Tensor data = in_data[st::kData].get(s); - // // Tensor out = out_data[st::kOut].get(s); - // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + #if CUDNN_MAJOR == 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(in_data.size(), 4); + CHECK_EQ(out_data.size(), 3); + } + else{ + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + } + + if (!init_cudnn_) { + init_cudnn_ = true; + + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), 
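// Descriptor lifecycle for the Init() code above (cuDNN v5): every
// cudnnCreate*Descriptor call must be paired with a cudnnDestroy*Descriptor,
// which is exactly what the new ~CuDNNRNNOp() does, guarded by init_cudnn_
// so teardown only happens if Init() ever ran.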
CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } - // } + // Create tensors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + } + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + } } - - bool init_cudnn_; + cudnnDataType_t dtype_; + bool init_cudnn_; cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t rnn_mode_; - cudnnDirectionMode_t rnn_direction_; - cudnnRNNInputMode_t rnn_input_mode_; - cudnnDropoutDescriptor_t rnn_dropout_; - // cudnnTensorDescriptor_t in_desc_; - // cudnnTensorDescriptor_t out_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + + cudnnTensorDescriptor_t x_desc_; + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t y_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnFilterDescriptor_t w_desc_; + #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; #endif diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 37150bf58878..2729a2ff49cc 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,16 +21,16 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; - enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } // A utility function to calculate input size inline int rnn_single_param_size(int inputSize, - int hiddenSize, + int hiddenSize, int mode){ int size = hiddenSize * 
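// Stride layout used for the tensor descriptors above: a packed rank-3
// tensor of shape (d0, d1, d2) has strides {d1 * d2, d2, 1}; e.g. data of
// shape (seqLength, batch, inputSize) gets {batch * inputSize, inputSize, 1},
// which is what cudnnSetTensorNdDescriptor expects for a fully-packed layout.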
(hiddenSize + inputSize + 2); // Different RNN's have different num weights @@ -52,10 +52,10 @@ inline int rnn_single_param_size(int inputSize, return size; } -inline int rnn_param_size(int layerNum, +inline int rnn_param_size(int layerNum, int inputSize, - int hiddenSize, - int direction, + int hiddenSize, + int direction, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); @@ -194,26 +194,26 @@ class RNNProp : public OperatorProperty { Shape3(total_layers, batchSize, param_.state_size)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, - param_.direction, + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; oshape[2] = numDirections * param_.state_size; - // infer output state size + // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batchSize; outStateShape[2] = param_.state_size; - out_shape->clear(); + out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -236,7 +236,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; } @@ -256,9 +256,9 @@ class RNNProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { if (param_.mode == rnn_enum::kLstm) - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; else - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; } std::vector ForwardResource( diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 40f7f705718d..2a485e5ef224 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -33,9 +33,9 @@ DMLC_REGISTER_PARAMETER(RNNParam); MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") -.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") -.add_arguments(RNNParam::__FIELDS__()); +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 2cb482f591b2..fb90daf19b41 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -21,10 +21,7 @@ Operator* CreateOp(RNNParam param, int dtype) { op = new CuDNNRNNOp(param); }) #else - 1; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new SpatialTransformerOp(param); - }) + LOG(FATAL) << "RNN is only 
available for cuDNN at the moment."; #endif // MXNET_USE_CUDNN && CUDNN_MAJOR return op; } From 6af1646bb730b966f327e569e46ab6871c859b32 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 17 Jul 2016 16:01:48 -0400 Subject: [PATCH 04/36] - added dropout --- src/operator/cudnn_rnn-inl.h | 166 +++++++++++++++++++++++++++-------- src/operator/rnn-inl.h | 5 ++ 2 files changed, 135 insertions(+), 36 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 61d6d2c2f23a..90bf5cbc9bc7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -20,6 +20,8 @@ class CuDNNRNNOp : public Operator { this->param_ = param; init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -72,9 +74,48 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + // get input + output tensors + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + // if (param_.mode == rnn_enum::kLstm){ + // CHECK_EQ(in_data.size(), 4); + // CHECK_EQ(out_data.size(), 3); + // } + // else{ + // CHECK_EQ(in_data.size(), 3); + // CHECK_EQ(out_data.size(), 2); + // } + // // Get tensors + // + // Tensor data = in_data[rnn_enum::kData].get(s); + // Tensor params = in_data[rnn_enum::kParams].get(s); + // Tensor state = in_data[rnn_enum::kStateIn].get(s); + + // Tensor out = out_data[rnn_enum::kOut].get(s); + // Tensor out_state = out_data[rnn_enum::kOut].get(s); + + // if (param_.mode == rnn_enum::kLstm){ + // Tensor cell_state = + // in_data[rnn_enum::kCellStateIn].get(s); + // Tensor out_cell_state = + // in_data[rnn_enum::kCellStateOut].get(s); + // } + // // Tensor data = in_data[st::kData].get(s); // Tensor out = out_data[st::kOut].get(s); // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); @@ -162,8 +203,7 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ CHECK_EQ(in_data.size(), 4); CHECK_EQ(out_data.size(), 3); } @@ -171,64 +211,118 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(in_data.size(), 3); CHECK_EQ(out_data.size(), 2); } - if (!init_cudnn_) { init_cudnn_ = true; - + // get input + output tensors Tensor data = in_data[rnn_enum::kData].get(s); Tensor params = in_data[rnn_enum::kParams].get(s); Tensor state = in_data[rnn_enum::kStateIn].get(s); Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ Tensor cell_state = in_data[rnn_enum::kCellStateIn].get(s); Tensor out_cell_state = in_data[rnn_enum::kCellStateOut].get(s); } + // Create descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), 
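// The dropout descriptor created below still needs a states buffer before
// cudnnSetDropoutDescriptor can be called; a minimal sketch, assuming a
// hypothetical GPU allocation helper and a seed_ member:
//
//   size_t state_bytes;
//   CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, &state_bytes),
//            CUDNN_STATUS_SUCCESS);
//   void *states = gpu_alloc(state_bytes);  // hypothetical helper
//   CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_,
//                                      param_.p, states, state_bytes,
//                                      seed_), CUDNN_STATUS_SUCCESS);
//
// cuDNN does not take ownership of the buffer, so it must outlive the
// descriptor.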
CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // Create tensors CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); } + // set dropout + // cudnnSetDropoutDescriptor(dropout_desc_, + // s->dnn_handle_, + // param_.p, + // void * states, + // size_t stateSizeInBytes, + // unsigned long long seed) + // set RNN + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Set params + int dim_params[3] = {params.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_params + ), CUDNN_STATUS_SUCCESS); + // Get strides + int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; + int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; + int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; + int stride_out_state[3] = + {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; + + // cuDNN needs int arrays for dim, not index_t array used in Shape + int dim_data[3]; + int dim_state[3]; + int dim_out[3]; + int dim_out_state[3]; + std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); + std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); + std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); + std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } + // set the tensor descriptors + CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + dtype_, + 3, + dim_data, + stride_data + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + dtype_, + 3, + dim_out, + stride_out + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + // LSTM has two extra descriptors + if 
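// (cudnnSetRNNDescriptor above ties the pieces together: hiddenSize =
// param_.state_size, numLayers = param_.num_layers, the dropout descriptor,
// CUDNN_LINEAR_INPUT, the direction enum, the cell type chosen in the
// constructor, and the math dtype.)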
(param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + } } } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 2729a2ff49cc..d81ed1637756 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -74,6 +74,7 @@ struct RNNParam : public dmlc::Parameter { bool batch_first; int direction; int mode; + float p; DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -96,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { .add_enum("lstm", rnn_enum::kLstm) .add_enum("gru", rnn_enum::kGru) .describe("the type of RNN to compute"); + + DMLC_DECLARE_FIELD(p).set_default(0.) + .set_range(0, 1) + .describe("Fraction of the input that gets dropped out at training time"); } }; From 050ca51ce382bc88ce56d6f2d198d7d1ae90739c Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 00:28:48 -0400 Subject: [PATCH 05/36] - major refactor - completed forward evaluation --- src/operator/cudnn_rnn-inl.h | 481 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 39 ++- 2 files changed, 277 insertions(+), 243 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 90bf5cbc9bc7..134044321ad7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2016 by Contributors - * \file cudnn_spatial_transformer-inl.h + * \file cudnn_rnn-inl.h * \brief * \author Sebastian Bodenstein */ @@ -21,7 +21,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -40,31 +40,29 @@ class CuDNNRNNOp : public Operator { LOG(FATAL) << "Not implmented"; } // RNN Direction - switch (param_.direction) { - case rnn_enum::kUnidirectional: - direction_ = CUDNN_UNIDIRECTIONAL; - break; - case rnn_enum::kBidirectional: - direction_ = CUDNN_BIDIRECTIONAL; - break; - default: - LOG(FATAL) << "Not implmented"; - } + direction_ = param_.bidirectional ? 
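// Patch 05 replaces the direction enum with a bool, so the cuDNN mapping
// collapses to this one ternary. The dropout fraction p added to RNNParam in
// patch 04 defaults to 0 (no dropout) and is restricted to [0, 1] by
// set_range.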
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } ~CuDNNRNNOp() { if (init_cudnn_) { - CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + for(int i = 0; i < x_desc_vec_.size(); ++i){ + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + } CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); - } } } @@ -74,77 +72,83 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
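// param_.lstm_q_ is a cached is-LSTM flag used for the expected tensor
// counts: LSTM carries an extra cell-state input and output, hence
// 4 inputs / 3 outputs versus 3 / 2 for the other cell types.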
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); Stream *s = ctx.get_stream(); - if(!init_cudnn_){ - Init(s, in_data, out_data); - } // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kStateIn].get(s); - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor hy = out_data[rnn_enum::kStateOut].get(s); + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } - // if (param_.mode == rnn_enum::kLstm){ - // CHECK_EQ(in_data.size(), 4); - // CHECK_EQ(out_data.size(), 3); - // } - // else{ - // CHECK_EQ(in_data.size(), 3); - // CHECK_EQ(out_data.size(), 2); - // } - // // Get tensors - // - // Tensor data = in_data[rnn_enum::kData].get(s); - // Tensor params = in_data[rnn_enum::kParams].get(s); - // Tensor state = in_data[rnn_enum::kStateIn].get(s); - // Tensor out = out_data[rnn_enum::kOut].get(s); - // Tensor out_state = out_data[rnn_enum::kOut].get(s); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } - // if (param_.mode == rnn_enum::kLstm){ - // Tensor cell_state = - // in_data[rnn_enum::kCellStateIn].get(s); - // Tensor out_cell_state = - // in_data[rnn_enum::kCellStateOut].get(s); - // } - // - // Tensor data = in_data[st::kData].get(s); - // Tensor out = out_data[st::kOut].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); - // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // if (!init_cudnn_) { - // Init(s, in_data, out_data); - // } - // CHECK_EQ(data.CheckContiguous(), true); - // CHECK_EQ(out.CheckContiguous(), true); - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, - // st_desc_, - // loc.dptr_, - // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); - // } - // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // grid.dptr_, - // &beta, - // out_desc_, - // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + if (ctx.is_train) { + // training mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_ + reserve_space_size_), s); + CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + } else { + // inference mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_), 
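// Workspace layout in the training branch above: one flat buffer of
// workspace_size_ + reserve_space_size_ elements is requested, and cuDNN is
// handed the two halves separately -- [0, workspace_size_) is reusable
// scratch, while the reserve tail must stay intact between forward training
// and the matching backward calls. The inference branch here only needs the
// scratch part.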
s); + CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + } } // virtual void Backward(const OpContext &ctx, @@ -155,46 +159,12 @@ class CuDNNRNNOp : public Operator { const std::vector &in_grad, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // CHECK_EQ(out_grad.size(), 1); - // Stream *s = ctx.get_stream(); - // Tensor data = in_data[st::kData].get(s); - // Tensor grad = out_grad[st::kOut].get(s); - // Tensor ddata = in_grad[st::kData].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); - // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in - // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // typename DataType::ScaleType alpha_dgrid = 1.0f; - // typename DataType::ScaleType beta_dgrid = 0.0f; - // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // &beta, - // in_desc_/*reuse in_desc_*/, - // ddata.dptr_/*output*/, - // &alpha_dgrid, - // out_desc_/*reuse out_desc_*/, - // grad.dptr_, - // grid.dptr_, - // &beta_dgrid, - // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, - // st_desc_, - // grid.dptr_, - // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); - // } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data.size(), out_expected); } - // private: inline void Init(mshadow::Stream *s, const std::vector &in_data, @@ -203,126 +173,193 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - if(param_.mode == rnn_enum::kLstm){ - CHECK_EQ(in_data.size(), 4); - CHECK_EQ(out_data.size(), 3); - } - else{ - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 2); - } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
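
The in_expected/out_expected checks encode the operator's I/O arity, which follows directly from the enums in rnn-inl.h:

    // LSTM (lstm_q_): in_data  = {kData, kParams, kStateIn, kCellStateIn} -> 4
    //                 out_data = {kOut, kStateOut, kCellStateOut}         -> 3
    // RNN/GRU:        in_data  = {kData, kParams, kStateIn}               -> 3
    //                 out_data = {kOut, kStateOut}                        -> 2
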
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = x.shape_[0]; + dimA[1] = x.shape_[2]; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = x.shape_[0]; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - if(param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; - // Create descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 
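
Each per-time-step descriptor uses a fully packed layout. A minimal self-contained sketch of the dimA/strideA convention (the sizes are illustrative, not taken from the patch):

    int dimA[3]    = {64, 128, 1};                     // e.g. [batch, input features, 1]
    int strideA[3] = {dimA[1] * dimA[2], dimA[2], 1};  // = {128, 1, 1}: each stride is the
                                                       // product of all faster-varying dims
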
2 : 1); + dimA[1] = x.shape_[0]; //minibatch + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcy_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); - } - // set dropout - // cudnnSetDropoutDescriptor(dropout_desc_, - // s->dnn_handle_, - // param_.p, - // void * states, - // size_t stateSizeInBytes, - // unsigned long long seed) - // set RNN - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); - // Set params - int dim_params[3] = {params.shape_[0], 1, 1}; - CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, - format_, 3, - dim_params + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // Get strides - int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; - int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; - int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; - int stride_out_state[3] = - {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; - - // cuDNN needs int arrays for dim, not index_t array used in Shape - int dim_data[3]; - int dim_state[3]; - int dim_out[3]; - int dim_out_state[3]; - std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); - std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); - std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); - std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - - // set the tensor descriptors - CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, - dim_data, - stride_data + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, - dim_state, - stride_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, - dim_out, - stride_out + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, - dim_out_state, - stride_out_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // LSTM has two extra descriptors - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dim_state, 
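
Worked example of the state-tensor geometry set here (illustrative sizes); the leading axis doubles under bidirection because each direction keeps its own per-layer state:

    // num_layers = 2, bidirectional = true, batch = 32, state_size = 100:
    // dimA    = {2 * 2, 32, 100}   // [layers * directions, batch, hidden]
    // strideA = {3200, 100, 1}     // packed, as with the per-step descriptors
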
- stride_state - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dim_out_state, - stride_out_state + CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + + // Get temp space sizes + CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + workspace_size_ = workspace_byte_ / sizeof(DType) + 1; + reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1; + + // Set param descriptors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); + int dim_w[3] = {w.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_ ), CUDNN_STATUS_SUCCESS); - } + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.pkeep_, // keep probability + NULL, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + } } @@ -333,15 +370,17 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; + unsigned long long seed_ = 4553; + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, reserve_space_size_, dropout_size_; - cudnnTensorDescriptor_t x_desc_; - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t y_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t w_desc_, dw_desc_; #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; @@ -352,4 +391,4 @@ class CuDNNRNNOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ +#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index d81ed1637756..53189d100ef2 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -24,7 +24,6 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, 
kGru}; - enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } @@ -55,26 +54,27 @@ inline int rnn_single_param_size(int inputSize, inline int rnn_param_size(int layerNum, int inputSize, int hiddenSize, - int direction, + bool bidirectional, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(direction == rnn_enum::kUnidirectional) - size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); - else // bidirectional case: input size increases by 2 + if(bidirectional) size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + else + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; } struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - uint64_t workspace; bool batch_first; - int direction; + bool bidirectional; int mode; - float p; + float p, pkeep_; + int seq_length_; + bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -83,13 +83,8 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); - DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) - .describe("Tmp workspace for RNN (MB)"); - - DMLC_DECLARE_FIELD(direction) - .add_enum("unidirectional", rnn_enum::kUnidirectional) - .add_enum("bidirectional", rnn_enum::kBidirectional) - .describe("specifies the recurrence pattern"); + DMLC_DECLARE_FIELD(bidirectional).set_default(false) + .describe("whether to use bidirectional recurrent layers"); DMLC_DECLARE_FIELD(mode) .add_enum("rnn_relu", rnn_enum::kRnnRelu) @@ -108,9 +103,12 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - this->param_ = p; // convert MBytes first to Bytes and then to elements. - param_.workspace = (param_.workspace << 20) / sizeof(real_t); + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, @@ -185,10 +183,7 @@ class RNNProp : public OperatorProperty { // Infer hidden state + cell state int batchSize = dshape[0]; int inputSize = dshape[2]; - int numDirections = 1; - if(param_.direction == rnn_enum::kBidirectional){ - numDirections = 2; - } + int numDirections = param_.bidirectional ? 
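
A worked instance of rnn_single_param_size: the hiddenSize * (hiddenSize + inputSize + 2) term counts one gate's hidden-to-hidden and input-to-hidden weights plus two bias vectors, and the mode multiplier is the gate count (illustrative numbers, not from the patch):

    // inputSize = 10, hiddenSize = 20, mode = kLstm (4 gates):
    // 4 * 20 * (20 + 10 + 2) = 2560 parameters for the first layer
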
2 : 1;
     int total_layers = numDirections * param_.num_layers;  // double for bidirectional
     SHAPE_ASSIGN_CHECK(*in_shape,
                        rnn_enum::kStateIn,
@@ -202,7 +197,7 @@ class RNNProp : public OperatorProperty {
     int weight_size = rnn_param_size(param_.num_layers,
                                      inputSize,
                                      param_.state_size,
-                                     param_.direction,
+                                     param_.bidirectional,
                                      param_.mode);
     SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size));
     // infer output size

From f81d8e97c119a0976d1827fb4f056dc40b20515a Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Mon, 18 Jul 2016 02:25:19 -0400
Subject: [PATCH 06/36] - added parameter size test
 - fixed bug where cudnnGetRNNParamsSize needs to be called after
   cudnnSetRNNDescriptor

---
 src/operator/cudnn_rnn-inl.h | 64 +++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 134044321ad7..3a40b2f67fd7 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -150,7 +150,7 @@ class CuDNNRNNOp : public Operator {
                                       ), CUDNN_STATUS_SUCCESS);
     }
   }
-  //
+
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
@@ -182,6 +182,9 @@ class CuDNNRNNOp : public Operator {
     // get input + output tensors
     Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
     Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
+
+    param_.seq_length_ = x.shape_[1];
+
     // Tensor Descriptors
     std::vector<cudnnTensorDescriptor_t> x_vec(param_.seq_length_);
     std::vector<cudnnTensorDescriptor_t> y_vec(param_.seq_length_);
@@ -305,7 +308,29 @@ class CuDNNRNNOp : public Operator {
                                           strideA
                                           ), CUDNN_STATUS_SUCCESS);

-      // Get temp space sizes
+      // Create Dropout descriptors
+      CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
+                                         &dropout_byte_
+                                         ), CUDNN_STATUS_SUCCESS);
+      dropout_size_ = dropout_byte_ / sizeof(DType);
+      CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
+                                         s->dnn_handle_,
+                                         param_.pkeep_,  // keep probability
+                                         NULL,
+                                         dropout_byte_,
+                                         seed_), CUDNN_STATUS_SUCCESS);
+      // RNN descriptors
+      CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_,
+                                     param_.state_size,
+                                     param_.num_layers,
+                                     dropout_desc_,
+                                     input_mode_,
+                                     direction_,
+                                     mode_,
+                                     dtype_), CUDNN_STATUS_SUCCESS);
+      // Get temp space sizes
       CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_,
                                         rnn_desc_,
                                         param_.seq_length_,
@@ -318,8 +343,17 @@ class CuDNNRNNOp : public Operator {
                                               x_desc_vec_.data(),
                                               &reserve_space_byte_
                                               ), CUDNN_STATUS_SUCCESS);
-      workspace_size_ = workspace_byte_ / sizeof(DType) + 1;
-      reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1;
+      workspace_size_ = workspace_byte_ / sizeof(DType);
+      reserve_space_size_ = reserve_space_byte_ / sizeof(DType);
+
+      // check that number of params are correct
+      size_t cudnn_param_size;
+      CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_,
+                                     rnn_desc_,
+                                     x_desc_vec_[0],
+                                     &cudnn_param_size,
+                                     dtype_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size);

       // Set param descriptors
       CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS);
@@ -337,28 +371,6 @@ class CuDNNRNNOp : public Operator {
                                           3,
                                           dim_w
                                           ), CUDNN_STATUS_SUCCESS);
-      // Create Dropout descriptors
-      CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
-                                         &dropout_byte_
-                                         ), CUDNN_STATUS_SUCCESS);
-      dropout_size_ = dropout_byte_ / sizeof(DType);
-
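
The substance of this commit in one place: the size queries read rnn_desc_, so the descriptor setup has to precede them. The required call order (all calls shown appear in the hunk above):

    // 1. cudnnSetDropoutDescriptor(dropout_desc_, ...);              // consumed by step 2
    // 2. cudnnSetRNNDescriptor(rnn_desc_, ..., dropout_desc_, ...);
    // 3. cudnnGetRNNWorkspaceSize / cudnnGetRNNTrainingReserveSize /
    //    cudnnGetRNNParamsSize(..., rnn_desc_, ...);                 // the moved call
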
CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, - s->dnn_handle_, - param_.pkeep_, // keep probability - NULL, - dropout_byte_, - seed_), CUDNN_STATUS_SUCCESS); - // RNN descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); } } From 812b7d4a80c5efdf4e83a469ef55e85a7f24a583 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 10:32:55 -0400 Subject: [PATCH 07/36] - checks for contiguous input tensors - more consistent param names - removed 'batch_first' option for now. Might add it later again --- src/operator/cudnn_rnn-inl.h | 119 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 37 ++++++----- 2 files changed, 82 insertions(+), 74 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 3a40b2f67fd7..8c6eae9dc984 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -92,15 +92,24 @@ class CuDNNRNNOp : public Operator { cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + if(!init_cudnn_){ Init(s, in_data, out_data); } + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + if (ctx.is_train) { - // training mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_ + reserve_space_size_), s); CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -125,9 +134,6 @@ class CuDNNRNNOp : public Operator { ), CUDNN_STATUS_SUCCESS); } else { // inference mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_), s); CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -182,8 +188,9 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - - param_.seq_length_ = x.shape_[1]; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; // Tensor Descriptors std::vector x_vec(param_.seq_length_); @@ -193,49 +200,51 @@ class CuDNNRNNOp : public Operator { int dimA[3]; int strideA[3]; for (int i = 0; i < param_.seq_length_; i++) { - CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); - - dimA[0] = x.shape_[0]; - dimA[1] = x.shape_[2]; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + 
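
The new shape bookkeeping assumes the time-major layout that cuDNN's RNN API works in; in summary:

    // x.shape_ = [seq_length, batch_size, input_size]   (time-major)
    // param_.seq_length_ = x.shape_[0];  // also the number of per-step descriptors
    // param_.batch_size_ = x.shape_[1];
    // param_.input_size_ = x.shape_[2];
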
dimA[2] = 1; + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - dimA[0] = x.shape_[0]; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } x_desc_vec_ = x_vec; y_desc_vec_ = y_vec; @@ -243,9 +252,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = x.shape_[0]; //minibatch - dimA[2] = param_.state_size; + dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size_; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -323,8 +332,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, + param_.state_size_, + param_.num_layers_, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 53189d100ef2..a4252b7e8fe5 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,20 +67,19 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size; - uint32_t num_layers; - bool batch_first; + uint32_t state_size_; + uint32_t num_layers_; bool bidirectional; int mode; float p, pkeep_; - int seq_length_; + int seq_length_, batch_size_, input_size_; bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size) + DMLC_DECLARE_FIELD(state_size_) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers) + DMLC_DECLARE_FIELD(num_layers_) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -179,35 +178,35 @@ class RNNProp : public OperatorProperty { const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; - // Infer hidden state + cell state - int batchSize = dshape[0]; - int inputSize = dshape[2]; + << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; + // Get input sizes + int batch_size = 
dshape[1]; + int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional + int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kCellStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, + int weight_size = rnn_param_size(param_.num_layers_, + input_size, + param_.state_size_, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size_; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; - outStateShape[1] = batchSize; - outStateShape[2] = param_.state_size; + outStateShape[1] = batch_size; + outStateShape[2] = param_.state_size_; out_shape->clear(); out_shape->push_back(oshape); From a7f64e243dc7401a341e9b45b80941eeb4333d51 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Tue, 19 Jul 2016 23:06:39 +0200 Subject: [PATCH 08/36] - fixed input names --- src/operator/rnn-inl.h | 41 +++++++++++++++++++++++++---------------- src/operator/rnn.cc | 4 ++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index a4252b7e8fe5..98f8a5953d70 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,18 +144,28 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "weight", "state", "cell_state"}; + return {"data", "parameters", "state", "cell_state"}; } else { - return {"data", "weight", "state"}; + return {"data", "parameters", "state"}; } } std::vector ListOutputs() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"output", "final_state", "final_state_cell"}; - } else { - return {"output", "final_state"}; - } + if (param_.mode == rnn_enum::kLstm) + return {"output", "state", "state_cell"}; + else + return {"output", "state"}; + } + + int NumOutputs() const override { + if (param_.mode == rnn_enum::kLstm) + return 3; + else + return 2; + } + + int NumVisibleOutputs() const override { + return 1; } void Init(const std::vector >& kwargs) override { @@ -171,15 +181,15 @@ class RNNProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + CHECK_EQ(in_shape->size(), 4) << "Input:[data, parameters, state, cell_state]"; } else { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, parameters, state]"; } const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; - // Get input sizes + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // 
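
For reference, the complete set of shapes this InferShape establishes, with D = 2 if bidirectional else 1 and L = num_layers_:

    // data                 : [seq_length, batch, input_size]
    // state (+ cell state) : [D * L, batch, state_size_]
    // parameters           : [rnn_param_size(...)]   (one flat vector)
    // output               : [seq_length, batch, D * state_size_]
    // output state(s)      : [D * L, batch, state_size_]
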
data: [sequence len, batch, input dimension] int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; @@ -192,17 +202,16 @@ class RNNProp : public OperatorProperty { rnn_enum::kCellStateIn, Shape3(total_layers, batch_size, param_.state_size_)); } - // infer weight size - int weight_size = rnn_param_size(param_.num_layers_, + // calculate parameter vector length + int param_size = rnn_param_size(param_.num_layers_, input_size, param_.state_size_, param_.bidirectional, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); - // infer output size + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + // output: [sequence len, batch, output size] TShape oshape = dshape; oshape[2] = numDirections * param_.state_size_; - // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 2a485e5ef224..5e3b2b8894af 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -20,7 +20,7 @@ Operator *CreateOp(RNNParam param, int dtype) { } Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); @@ -34,7 +34,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") -.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("state", "Symbol", "initial hidden state of the RNN") .add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op From e311b8691d08160342a83cd698a2b95dcba0e53f Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 12:50:59 +0200 Subject: [PATCH 09/36] - added backward method --- src/operator/cudnn_rnn-inl.h | 95 ++++++++++++++++++++++++++++++++++-- src/operator/rnn-inl.h | 8 +-- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 8c6eae9dc984..6a642f6428f8 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -80,7 +80,7 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kStateIn].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor hy = out_data[rnn_enum::kStateOut].get(s); @@ -88,8 +88,8 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; - cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; } CHECK_EQ(x.CheckContiguous(), true); @@ -169,7 +169,94 @@ class CuDNNRNNOp : public Operator { size_t out_expected = param_.lstm_q_ ? 
3 : 2; CHECK_EQ(in_data.size(), in_expected); CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_grad.size(), in_expected); + CHECK_EQ(out_grad.size(), out_expected); + + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor hy = in_data[rnn_enum::kStateOut].get(s); + Tensor dhy = out_grad[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + + DType * cx_ptr = NULL; + // DType * cy_ptr = NULL; + DType * dcx_ptr = NULL; + DType * dcy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm){ + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + } + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + + CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy.dptr_, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); } private: inline void Init(mshadow::Stream *s, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 98f8a5953d70..fd68fd628432 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,8 +21,8 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; - enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNOpInputs {kData, kParams, kState, kStateCell}; + enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNOpResource {kTempSpace}; } @@ -195,11 +195,11 @@ class RNNProp : public OperatorProperty { int numDirections = param_.bidirectional ? 
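
A sketch of the data flow the two cuDNN calls above implement; note that cudnnRNNBackwardWeights accumulates into dw rather than overwriting it, and both calls must see the same reserve space the training-mode forward pass filled:

    // cudnnRNNBackwardData:    y, dy, dhy, dcy, w, hx, cx, reserve  ->  dx, dhx, dcx
    // cudnnRNNBackwardWeights: x, hx, y, workspace, reserve         ->  dw (accumulated)
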
2 : 1; int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateIn, + rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kCellStateIn, + rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size_)); } // calculate parameter vector length From ccb1ae53b6f460ec9ab93d390a0f31ba0a671003 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 13:16:50 +0200 Subject: [PATCH 10/36] - small fix for in/out names --- src/operator/rnn-inl.h | 8 ++++---- src/operator/rnn.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index fd68fd628432..137bebed5c06 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,7 +144,7 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "cell_state"}; + return {"data", "parameters", "state", "state_cell"}; } else { return {"data", "parameters", "state"}; } @@ -164,9 +164,9 @@ class RNNProp : public OperatorProperty { return 2; } - int NumVisibleOutputs() const override { - return 1; - } + // int NumVisibleOutputs() const override { + // return 1; + // } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 5e3b2b8894af..337410c8ddc1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -35,7 +35,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet From 9b5e38382d94f667700297d5ef56cb61664581cf Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 14:24:57 +0200 Subject: [PATCH 11/36] - fixed bug: parameters can't have underscore --- src/operator/cudnn_rnn-inl.h | 10 +++++----- src/operator/rnn-inl.h | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 6a642f6428f8..d696ead26255 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -314,7 +314,7 @@ class CuDNNRNNOp : public Operator { strideA ), CUDNN_STATUS_SUCCESS); dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; dimA[2] = 1; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; @@ -339,9 +339,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size_; + dimA[2] = param_.state_size; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -419,8 +419,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size_, - param_.num_layers_, + param_.state_size, + param_.num_layers, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 137bebed5c06..ed0cf0db84b1 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,8 +67,8 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size_; - uint32_t num_layers_; + uint32_t state_size; + uint32_t num_layers; bool bidirectional; int mode; float p, pkeep_; @@ -76,10 +76,10 @@ struct RNNParam : public dmlc::Parameter { bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size_) + DMLC_DECLARE_FIELD(state_size) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers_) + DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -193,29 +193,29 @@ class RNNProp : public OperatorProperty { int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers_; // double for bidirectional + int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); } // calculate parameter vector length - int param_size = rnn_param_size(param_.num_layers_, + int param_size = rnn_param_size(param_.num_layers, input_size, - param_.state_size_, + param_.state_size, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); // output: [sequence len, batch, output size] TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size_; + oshape[2] = numDirections * param_.state_size; TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; - outStateShape[2] = param_.state_size_; + outStateShape[2] = param_.state_size; out_shape->clear(); out_shape->push_back(oshape); From 8997a5d96e1aed7927f5e4cfd10e481d5e968bec Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 16:36:07 +0200 Subject: [PATCH 12/36] - fixed off-by-two error in weight shape inference for bidirectional net - moved calculated param to cudnn_rnn-inl.h --- src/operator/cudnn_rnn-inl.h | 7 ++++++- src/operator/rnn-inl.h | 10 +++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index d696ead26255..1fd7afc90e3a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -41,6 +41,12 @@ class CuDNNRNNOp : public Operator { } // RNN Direction direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Other + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } ~CuDNNRNNOp() { @@ -212,7 +218,6 @@ class CuDNNRNNOp : public Operator { Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); - CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, rnn_desc_, param_.seq_length_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index ed0cf0db84b1..b51216bf9d4d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -59,8 +59,10 @@ inline int rnn_param_size(int layerNum, // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(bidirectional) + if(bidirectional){ size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + size *= 2; + } else size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; @@ -102,12 +104,6 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - // convert MBytes first to Bytes and then to elements. - param_.pkeep_ = 1.0f - param_.p; - if(param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, From 77bf61c2173d2e1f73504393e21feee8010902c9 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:24:56 +0200 Subject: [PATCH 13/36] - added option to control num outputs --- src/operator/cudnn_rnn-inl.h | 5 ++++- src/operator/rnn-inl.h | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 1fd7afc90e3a..0c943bab7da0 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -7,9 +7,12 @@ #ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#include #include +#include +#include +#include #include "./rnn-inl.h" + namespace mxnet { namespace op { #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index b51216bf9d4d..d036e299e519 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -71,7 +71,7 @@ inline int rnn_param_size(int layerNum, struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - bool bidirectional; + bool bidirectional, state_outputs; int mode; float p, pkeep_; int seq_length_, batch_size_, input_size_; @@ -97,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) .describe("Fraction of the input that gets dropped out at training time"); + + DMLC_DECLARE_FIELD(state_outputs).set_default(false) + .describe("Whether to have the states as symbol outputs."); + } }; @@ -160,9 +164,11 @@ class RNNProp : public OperatorProperty { return 2; } - // int NumVisibleOutputs() const override { - // return 1; - // } + int NumVisibleOutputs() const override { + int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; + int num_outputs = param_.state_outputs ? 
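
Worked example of the corrected count for a 2-layer bidirectional LSTM with inputSize = 10, hiddenSize = 20 (illustrative numbers): layers above the first see the concatenated outputs of both directions, and the two directions hold disjoint weights, hence the final doubling:

    // per direction:   4*20*(20+10+2) + 4*20*(20+2*20+2) = 2560 + 4960 = 7520
    // both directions: 7520 * 2 = 15040 parameters
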
(mode_num + 1) : 1; + return num_outputs; + } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); @@ -193,11 +199,11 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size)); - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size)); - } + // calculate parameter vector length int param_size = rnn_param_size(param_.num_layers, input_size, @@ -217,7 +223,7 @@ class RNNProp : public OperatorProperty { out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if(param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -240,6 +246,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; From 62d6f8e33b7d4b01178d85541a328b424418d462 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:52:17 +0200 Subject: [PATCH 14/36] - removed lint --- src/operator/cudnn_rnn-inl.h | 137 +++++++++++++++-------------------- src/operator/rnn-inl.h | 32 ++++---- src/operator/rnn.cc | 5 +- src/operator/rnn.cu | 2 +- 4 files changed, 77 insertions(+), 99 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 0c943bab7da0..f3bfc1eac1fe 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -24,7 +24,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -46,7 +46,7 @@ class CuDNNRNNOp : public Operator { direction_ = param_.bidirectional ? 
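
The visible-output logic above enumerates to:

    // state_outputs == false          -> 1: {output}
    // state_outputs == true, RNN/GRU  -> 2: {output, state}
    // state_outputs == true, LSTM     -> 3: {output, state, state_cell}
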
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; // Other param_.pkeep_ = 1.0f - param_.p; - if(param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) param_.lstm_q_ = true; else param_.lstm_q_ = false; @@ -54,7 +54,7 @@ class CuDNNRNNOp : public Operator { ~CuDNNRNNOp() { if (init_cudnn_) { - for(int i = 0; i < x_desc_vec_.size(); ++i){ + for (int i = 0; i < x_desc_vec_.size(); ++i) { CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); @@ -63,18 +63,18 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); } } - + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, @@ -96,7 +96,7 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; } @@ -107,9 +107,9 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(y.CheckContiguous(), true); CHECK_EQ(hy.CheckContiguous(), true); - if(!init_cudnn_){ + if (!init_cudnn_) { Init(s, in_data, out_data); - } + } // Get temp space int temp_size = workspace_size_; @@ -117,8 +117,8 @@ class CuDNNRNNOp : public Operator { Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); - - if (ctx.is_train) { + + if (ctx.is_train) { CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -139,8 +139,7 @@ class CuDNNRNNOp : public Operator { temp_space.dptr_, workspace_byte_, temp_space.dptr_ + workspace_size_, - reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); + reserve_space_byte_), CUDNN_STATUS_SUCCESS); } else { // inference mode CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, @@ -161,11 +160,10 @@ class CuDNNRNNOp : public Operator { cy_desc_, cy_ptr, temp_space.dptr_, - workspace_byte_ - ), CUDNN_STATUS_SUCCESS); + workspace_byte_), CUDNN_STATUS_SUCCESS); } } - + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, @@ -198,7 +196,7 @@ class CuDNNRNNOp : public Operator { // DType * cy_ptr = NULL; DType * dcx_ptr = NULL; DType * dcy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; dcx_ptr = 
(in_grad[rnn_enum::kStateCell].get(s)).dptr_; @@ -211,9 +209,9 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(y.CheckContiguous(), true); CHECK_EQ(hy.CheckContiguous(), true); - if(!init_cudnn_){ + if (!init_cudnn_) { Init(s, in_data, out_data); - } + } // Get temp space int temp_size = workspace_size_; @@ -247,25 +245,24 @@ class CuDNNRNNOp : public Operator { temp_space.dptr_, workspace_byte_, temp_space.dptr_ + workspace_size_, - reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, hx_desc_, - hx.dptr_, - y_desc_vec_.data(), + hx.dptr_, + y_desc_vec_.data(), y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, dw.dptr_, - temp_space.dptr_ + workspace_size_, - reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); + temp_space.dptr_ + workspace_size_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); } + private: inline void Init(mshadow::Stream *s, const std::vector &in_data, @@ -299,7 +296,7 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); - + dimA[0] = param_.batch_size_; dimA[1] = param_.input_size_; dimA[2] = 1; @@ -307,21 +304,19 @@ class CuDNNRNNOp : public Operator { dimA[1] = param_.input_size_; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; - strideA[2] = 1; + strideA[2] = 1; CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - dimA[0] = param_.batch_size_; + strideA), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; dimA[2] = 1; strideA[0] = dimA[2] * dimA[1]; @@ -332,21 +327,19 @@ class CuDNNRNNOp : public Operator { dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); } x_desc_vec_ = x_vec; y_desc_vec_ = y_vec; dx_desc_vec_ = dx_vec; dy_desc_vec_ = dy_vec; - // set the state tensors + // set the state tensors dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); dimA[1] = param_.batch_size_; dimA[2] = param_.state_size; @@ -367,64 +360,55 @@ class CuDNNRNNOp : public Operator { dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, dtype_, 3, dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + strideA), CUDNN_STATUS_SUCCESS); // Create Dropout descriptors CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, - &dropout_byte_ - ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_), CUDNN_STATUS_SUCCESS); dropout_size_ = dropout_byte_ / sizeof(DType); CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, - param_.pkeep_, // keep probability + param_.pkeep_, // keep probability NULL, dropout_byte_, seed_), CUDNN_STATUS_SUCCESS); - // RNN descriptors + // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, param_.state_size, @@ -434,19 +418,17 @@ class CuDNNRNNOp : public Operator { direction_, mode_, dtype_), CUDNN_STATUS_SUCCESS); - // Get temp space sizes + // Get temp space sizes CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), - &workspace_byte_ - ), CUDNN_STATUS_SUCCESS); + &workspace_byte_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), - &reserve_space_byte_ - ), CUDNN_STATUS_SUCCESS); + &reserve_space_byte_), CUDNN_STATUS_SUCCESS); workspace_size_ = workspace_byte_ / sizeof(DType); reserve_space_size_ = reserve_space_byte_ / sizeof(DType); @@ -467,15 +449,12 @@ class CuDNNRNNOp : public Operator { dtype_, format_, 3, - dim_w - ), CUDNN_STATUS_SUCCESS); + dim_w), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, dtype_, format_, 3, - dim_w - ), CUDNN_STATUS_SUCCESS); - + dim_w), CUDNN_STATUS_SUCCESS); } } @@ -486,7 +465,7 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - unsigned long long seed_ = 4553; + unsigned long long seed_ = 1337ull; size_t workspace_byte_, reserve_space_byte_, dropout_byte_; int workspace_size_, reserve_space_size_, dropout_size_; @@ -496,7 +475,7 @@ class CuDNNRNNOp : public Operator { cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - cudnnFilterDescriptor_t w_desc_, dw_desc_; + cudnnFilterDescriptor_t w_desc_, dw_desc_; #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 
d036e299e519..2c7d20fe279c 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -30,13 +30,12 @@ namespace rnn_enum { // A utility function to calculate input size inline int rnn_single_param_size(int inputSize, int hiddenSize, - int mode){ + int mode) { int size = hiddenSize * (hiddenSize + inputSize + 2); // Different RNN's have different num weights - switch(mode) - { + switch (mode) { case rnn_enum::kRnnRelu: - size *= 1 ; + size *= 1; break; case rnn_enum::kRnnTanh: size *= 1; @@ -55,16 +54,16 @@ inline int rnn_param_size(int layerNum, int inputSize, int hiddenSize, bool bidirectional, - int mode){ + int mode) { // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(bidirectional){ + if (bidirectional) { size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); size *= 2; + } else { + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); } - else - size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; } @@ -75,7 +74,7 @@ struct RNNParam : public dmlc::Parameter { int mode; float p, pkeep_; int seq_length_, batch_size_, input_size_; - bool lstm_q_; // whether type is lstm + bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -93,14 +92,13 @@ struct RNNParam : public dmlc::Parameter { .add_enum("lstm", rnn_enum::kLstm) .add_enum("gru", rnn_enum::kGru) .describe("the type of RNN to compute"); - + DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) .describe("Fraction of the input that gets dropped out at training time"); DMLC_DECLARE_FIELD(state_outputs).set_default(false) .describe("Whether to have the states as symbol outputs."); - } }; @@ -117,7 +115,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO: add MShadow implementation + // TODO(sbodenstein): add MShadow implementation } virtual void Backward(const OpContext &ctx, @@ -129,7 +127,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO: add MShadow implementation + // TODO(sbodenstein): add MShadow implementation } private: @@ -153,14 +151,14 @@ class RNNProp : public OperatorProperty { std::vector ListOutputs() const override { if (param_.mode == rnn_enum::kLstm) return {"output", "state", "state_cell"}; - else + else return {"output", "state"}; } int NumOutputs() const override { if (param_.mode == rnn_enum::kLstm) return 3; - else + else return 2; } @@ -195,7 +193,7 @@ class RNNProp : public OperatorProperty { int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 
2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional + int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size)); @@ -223,7 +221,7 @@ class RNNProp : public OperatorProperty { out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if(param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 337410c8ddc1..3067c8e986c1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -19,8 +19,9 @@ Operator *CreateOp(RNNParam param, int dtype) { return op; } -Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { +Operator *RNNProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index fb90daf19b41..bf914026019d 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -21,7 +21,7 @@ Operator* CreateOp(RNNParam param, int dtype) { op = new CuDNNRNNOp(param); }) #else - LOG(FATAL) << "RNN is only available for cuDNN at the moment."; + LOG(FATAL) << "RNN is only available for cuDNN at the moment."; #endif // MXNET_USE_CUDNN && CUDNN_MAJOR return op; } From 8b3c6b9ade40bfa00f2a1929a3fdc87f75da0709 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 15:47:41 +0200 Subject: [PATCH 15/36] - correct handling of backward dependencies --- src/operator/cudnn_rnn-inl.h | 27 +++++++++++++++------------ src/operator/rnn-inl.h | 20 ++++++++++++++++---- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index f3bfc1eac1fe..3f63bc4de0f5 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -187,27 +187,30 @@ class CuDNNRNNOp : public Operator { Tensor dw = in_grad[rnn_enum::kParams].get(s); Tensor hx = in_data[rnn_enum::kState].get(s); Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor hy = in_data[rnn_enum::kStateOut].get(s); - Tensor dhy = out_grad[rnn_enum::kStateOut].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor dy = out_grad[rnn_enum::kOut].get(s); - DType * cx_ptr = NULL; - // DType * cy_ptr = NULL; - DType * dcx_ptr = NULL; - DType * dcy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm) { + // only need kStateOut grad when state_outputs is true + void * dhy_ptr = NULL; + if (param_.state_outputs) + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + + // Deal with lstm + void * dcx_ptr = NULL; + void * dcy_ptr = NULL; + void * cx_ptr = NULL; + + if(param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; } - + if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); CHECK_EQ(hx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(hy.CheckContiguous(), true); if (!init_cudnn_) { Init(s, in_data, out_data); @@ -227,7
+230,7 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_.data(), dy.dptr_, dhy_desc_, - dhy.dptr_, + dhy_ptr, dcy_desc_, dcy_ptr, w_desc_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 2c7d20fe279c..91284074b5d4 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -264,10 +264,22 @@ class RNNProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - if (param_.mode == rnn_enum::kLstm) - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; - else - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; + std::vector dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams], + in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]}; + + if (param_.state_outputs) { + dep.push_back(out_data[rnn_enum::kStateOut]); + dep.push_back(out_grad[rnn_enum::kStateOut]); + } + + if (param_.mode == rnn_enum::kLstm) { + dep.push_back(in_data[rnn_enum::kStateCell]); + if(param_.state_outputs) { + dep.push_back(out_data[rnn_enum::kStateCellOut]); + dep.push_back(out_grad[rnn_enum::kStateCellOut]); + } + } + return dep; } std::vector ForwardResource( From 82ac0417e37c34d8026f0dae6d49db21fb2991d4 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 15:55:56 +0200 Subject: [PATCH 16/36] - fix lint --- src/operator/cudnn_rnn-inl.h | 6 +++--- src/operator/rnn-inl.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 3f63bc4de0f5..d5deca2af2f8 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -197,16 +197,16 @@ class CuDNNRNNOp : public Operator { // Deal with lstm void * dcx_ptr = NULL; - void * dcy_ptr = NULL; + void * dcy_ptr = NULL; void * cx_ptr = NULL; - if(param_.mode == rnn_enum::kLstm) { + if (param_.mode == rnn_enum::kLstm) { cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; } if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - + CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); CHECK_EQ(hx.CheckContiguous(), true); diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 91284074b5d4..ad4d21736345 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -274,7 +274,7 @@ class RNNProp : public OperatorProperty { if (param_.mode == rnn_enum::kLstm) { dep.push_back(in_data[rnn_enum::kStateCell]); - if(param_.state_outputs) { + if (param_.state_outputs) { dep.push_back(out_data[rnn_enum::kStateCellOut]); dep.push_back(out_grad[rnn_enum::kStateCellOut]); } From d1d7ce35278227c77590ae7797a0043aa99fee13 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 22:39:46 -0400 Subject: [PATCH 17/36] - first commit --- src/operator/cudnn_rnn-inl.h | 208 ++++++++++++++++ src/operator/rnn-inl.h | 471 +++++++++++++++++++++++++++++++++++ src/operator/rnn.cc | 41 +++ src/operator/rnn.cu | 33 +++ 4 files changed, 753 insertions(+) create mode 100644 src/operator/cudnn_rnn-inl.h create mode 100644 src/operator/rnn-inl.h create mode 100644 src/operator/rnn.cc create mode 100644 src/operator/rnn.cu diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h new file mode 100644 index 000000000000..37895c2b2488 --- /dev/null +++ b/src/operator/cudnn_rnn-inl.h @@ -0,0 +1,208 @@ 
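A minimal sketch (an editorial illustration, not part of the patch series) of the dependency logic that patches 15 and 16 above settle on: the RNN backward pass always needs the input, the parameters, the initial state, the output, and the output gradient, and only conditionally needs the extra state outputs. The string names below are hypothetical; the real RNNProp::DeclareBackwardDependency returns TBlob references keyed by the rnn_enum indices.

#include <string>
#include <vector>

// Mirrors the branching in RNNProp::DeclareBackwardDependency.
std::vector<std::string> RNNBackwardDeps(bool lstm, bool state_outputs) {
  std::vector<std::string> dep = {"data", "params", "state", "out", "grad_out"};
  if (state_outputs) {   // hy / dhy are needed only if states are symbol outputs
    dep.push_back("state_out");
    dep.push_back("grad_state_out");
  }
  if (lstm) {            // LSTM additionally carries a cell state
    dep.push_back("state_cell");
    if (state_outputs) {
      dep.push_back("state_cell_out");
      dep.push_back("grad_state_cell_out");
    }
  }
  return dep;  // e.g. RNNBackwardDeps(true, true).size() == 10
}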
+/*! + * Copyright (c) 2016 by Contributors + * \file cudnn_spatial_transformer-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ +#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ + +#include +#include +#include "./rnn-inl.h" +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +template +class CuDNNRNNOp : public Operator { + public: + explicit CuDNNRNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + rnn_mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + rnn_mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + rnn_mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + rnn_mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implmented"; + } + // RNN Direction + switch (param_.direction) { + case rnn_enum::kUnidirectional: + rnn_direction_ = CUDNN_UNIDIRECTIONAL; + break; + case rnn_enum::kBidirectional: + rnn_direction_ = CUDNN_BIDIRECTIONAL; + break; + default: + LOG(FATAL) << "Not implmented"; + } + } + // ~CuDNNRNNOp() { + // if (init_cudnn_) { + // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); + // } + // } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor out = out_data[st::kOut].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); + // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // if (!init_cudnn_) { + // Init(s, in_data, out_data); + // } + // CHECK_EQ(data.CheckContiguous(), true); + // CHECK_EQ(out.CheckContiguous(), true); + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, + // st_desc_, + // loc.dptr_, + // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + // } + // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // grid.dptr_, + // &beta, + // out_desc_, + // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + } + // + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // CHECK_EQ(out_grad.size(), 1); + // Stream *s = ctx.get_stream(); + // Tensor data = in_data[st::kData].get(s); + // Tensor grad = out_grad[st::kOut].get(s); + // Tensor ddata = in_grad[st::kData].get(s); + // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); + // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); + // Tensor grid = 
out_data[st::kGridSrc] + // .get_with_shape(grid_shape, s); + // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in + // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator + // typename DataType::ScaleType alpha = 1.0f; + // typename DataType::ScaleType beta = 0.0f; + // typename DataType::ScaleType alpha_dgrid = 1.0f; + // typename DataType::ScaleType beta_dgrid = 0.0f; + // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, + // st_desc_, + // &alpha, + // in_desc_, + // data.dptr_, + // &beta, + // in_desc_/*reuse in_desc_*/, + // ddata.dptr_/*output*/, + // &alpha_dgrid, + // out_desc_/*reuse out_desc_*/, + // grad.dptr_, + // grid.dptr_, + // &beta_dgrid, + // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); + // if (param_.transform_type == st::kAffine) { + // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, + // st_desc_, + // grid.dptr_, + // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); + // } + } + // + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + // CHECK_EQ(in_data.size(), 2); + // CHECK_EQ(out_data.size(), 3); + // if (!init_cudnn_) { + // init_cudnn_ = true; + // // Tensor data = in_data[st::kData].get(s); + // // Tensor out = out_data[st::kOut].get(s); + // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + // } + } + + bool init_cudnn_; + cudnnDataType_t dtype_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t rnn_mode_; + cudnnDirectionMode_t rnn_direction_; + cudnnRNNInputMode_t rnn_input_mode_; + cudnnDropoutDescriptor_t rnn_dropout_; + // cudnnTensorDescriptor_t in_desc_; + // cudnnTensorDescriptor_t out_desc_; + #if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format_; + #endif + RNNParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h new file mode 100644 index 000000000000..3a538f001d5b --- /dev/null +++ b/src/operator/rnn-inl.h @@ -0,0 +1,471 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file rnn-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_OPERATOR_RNN_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace rnn_enum { + enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNDirectionType {kUnidirectional, kBidirectional}; + enum RNNOpResource {kTempSpace}; +} + +// A utility function to calculate input size + +inline int rnn_single_param_size(int inputSize, + int hiddenSize, + int mode){ + int size = hiddenSize * (hiddenSize + inputSize + 2); + // Different RNN's have different num weights + switch(mode) + { + case rnn_enum::kRnnRelu: + size *= 1 ; + break; + case rnn_enum::kRnnTanh: + size *= 1; + break; + case rnn_enum::kLstm: + size *= 4; + break; + case rnn_enum::kGru: + size *= 3; + break; + } + return size; +} + +inline int rnn_param_size(int layerNum, + int inputSize, + int hiddenSize, + int direction, + int mode){ + // get size of first layer + int size = rnn_single_param_size(inputSize, hiddenSize, mode); + // get size of remaining layers + if(direction == rnn_enum::kUnidirectional) + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); + else // bidirectional case: input size increases by 2 + size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + return size; +} + +struct RNNParam : public dmlc::Parameter { + uint32_t state_size; + uint32_t num_layers; + uint64_t workspace; + bool batch_first; + int direction; + int mode; + + DMLC_DECLARE_PARAMETER(RNNParam) { + DMLC_DECLARE_FIELD(state_size) + .describe("size of the state for each layer"); + + DMLC_DECLARE_FIELD(num_layers) + .describe("number of stacked layers"); + + DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) + .describe("Tmp workspace for RNN (MB)"); + + DMLC_DECLARE_FIELD(direction) + .add_enum("unidirectional", rnn_enum::kUnidirectional) + .add_enum("bidirectional", rnn_enum::kBidirectional) + .describe("specifies the recurrence pattern"); + + DMLC_DECLARE_FIELD(mode) + .add_enum("rnn_relu", rnn_enum::kRnnRelu) + .add_enum("rnn_tanh", rnn_enum::kRnnTanh) + .add_enum("lstm", rnn_enum::kLstm) + .add_enum("gru", rnn_enum::kGru) + .describe("the type of RNN to compute"); + } +}; + +template +class RNNOp : public Operator { + public: + explicit RNNOp(RNNParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
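// [Editorial note, not part of the patch.] The assignment below converts the
// user-facing workspace limit, declared above in megabytes, into an element
// count: `x << 20` multiplies by 2^20 (bytes per MB), and dividing by
// sizeof(real_t) turns bytes into elements. Assuming 4-byte real_t and the
// default workspace = 512, this gives (512 << 20) / 4 = 134217728 elements.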
+ param_.workspace = (param_.workspace << 20) / sizeof(real_t); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; +// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); + +// CHECK_EQ(in_data.size(), expected); +// CHECK_EQ(out_data.size(), 1); +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor out = out_data[rnn_enum::kOut].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(out.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(out.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), +// param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, +// gstride * (gid + 1)); +// tmpc = dot(wmat[gid].T(), temp_dst[gid]); +// } +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// out.Slice(i, i + step) = pack_col2patch(temp_col, +// out.Slice(i, i + step).shape_, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1); // RNN only support dilate equals 1 +// } else { +// Shape<4> pshape = out.Slice(i, i + step).shape_; +// pshape[2] += 2 * param_.pad[0]; +// pshape[3] += 2 * param_.pad[1]; +// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, +// pshape, +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// 1), // RNN only support dilate equals 1 +// out[i][0].shape_); +// } +// } +// if (!param_.no_bias) { +// // add bias, broadcast bias to dim 1: channel +// Tensor bias = in_data[rnn_enum::kBias].get(s); +// out += broadcast<1>(bias, out.shape_); +// } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful +// CHECK_EQ(out_grad.size(), 1); +// size_t expected = param_.no_bias == 0 ? 
3 : 2; +// CHECK(in_data.size() == expected && in_grad.size() == expected); +// CHECK_EQ(req.size(), expected); +// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); +// // get data +// Stream *s = ctx.get_stream(); +// Tensor data = in_data[rnn_enum::kData].get(s); +// Tensor grad = out_grad[rnn_enum::kOut].get(s); +// Tensor gdata = in_grad[rnn_enum::kData].get(s); +// Shape<3> wmat_shape = +// Shape3(param_.num_group, +// data.shape_[1] / param_.num_group, +// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); +// Tensor wmat = +// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// Tensor gwmat = +// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); +// #if defined(__CUDACC__) +// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) +// << "Must init CuBLAS handle in stream"; +// #endif +// const index_t nbatch = data.size(0); +// Tensor workspace = +// ctx.requested[rnn_enum::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); +// for (index_t i = 0; i < nbatch; i += nstep_) { +// const index_t step = std::min(nstep_, nbatch - i); +// Tensor temp_col = Tensor( +// workspace.dptr_, +// Shape2(shape_colunit_[0], +// shape_colunit_[1] * step), s); +// Tensor temp_dst = Tensor( +// workspace.dptr_ + temp_col.shape_.Size(), +// Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * step), s); +// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); +// if (param_.pad[0] == 0 && param_.pad[1] == 0) { +// temp_col = unpack_patch2col(grad.Slice(i, i + step), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } else { +// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), +// param_.kernel[0], +// param_.kernel[1], +// param_.stride[0], +// param_.stride[1], +// 1, 1); // RNN only support dilate equals 1 +// } +// const index_t gstride = temp_col.size(0) / param_.num_group; +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// if (i == 0) { +// Tensor tmp_gwmat = gwmat[gid]; +// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); +// } else { +// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); +// } +// } +// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { +// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { +// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); +// temp_dst[gid] = dot(wmat[gid], tmpc); +// } +// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, +// mshadow::Shape4(gdata.shape_[1], +// step, +// gdata.size(2), +// gdata.size(3)))); +// } +// } +// if (!param_.no_bias) { +// Tensor gbias = in_grad[rnn_enum::kBias].get(s); +// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); +// } + } + + private: +// inline index_t InitTemp(const mshadow::Shape<4> &ishape, +// const mshadow::Shape<4> &oshape) { +// const int ksize_y = param_.kernel[0]; +// const int ksize_x = param_.kernel[1]; +// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, +// oshape[2] * oshape[3]); +// shape_dstunit_ = mshadow::Shape3(param_.num_group, +// oshape[1] / param_.num_group, +// oshape[2] * oshape[3]); +// // See convolution for workspace calculations +// nstep_ = std::max( +// std::min( +// static_cast( +// param_.workspace / (shape_colunit_.Size() + 
shape_dstunit_.Size())), +// ishape[0]), +// 1U); + +// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], +// shape_colunit_[1] * nstep_); +// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], +// shape_dstunit_[1], +// shape_dstunit_[2] * nstep_); +// index_t required_size = scol.Size() + sdst.Size(); +// CHECK_GE(param_.workspace, required_size) +// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" +// << "Given: " << param_.workspace * sizeof(DType); +// return required_size; +// } + + private: + RNNParam param_; +}; // class RNNOp + + + + +template +Operator* CreateOp(RNNParam param, int dtype); + +#if DMLC_USE_CXX11 +class RNNProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "weight", "state", "cell_state"}; + } else { + return {"data", "weight", "state"}; + } + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3) \ + << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; + // Infer hidden state + cell state + int batchSize = dshape[0]; + int inputSize = dshape[2]; + int numDirections = 1; + if(param_.direction == rnn_enum::kBidirectional){ + numDirections = 2; + } + int total_layers = numDirections * param_.num_layers; // double for bidirectional + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + if (param_.mode == rnn_enum::kLstm){ + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kCellStateIn, + Shape3(total_layers, batchSize, param_.state_size)); + } + // infer weight size + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, + param_.mode); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + // infer output size + TShape oshape = dshape; + oshape[3] = numDirections * param_.state_size; + // infer output state size + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batchSize; + outStateShape[2] = param_.state_size; + + out_shape->clear(); + out_shape->push_back(oshape); + out_shape->push_back(outStateShape); + if (param_.mode == rnn_enum::kLstm) + out_shape->push_back(outStateShape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new RNNProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "RNN"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.mode == rnn_enum::kLstm) + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + else + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + RNNParam param_; +}; // class RNNProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc new file mode 100644 index 000000000000..40f7f705718d --- /dev/null +++ b/src/operator/rnn.cc @@ -0,0 +1,41 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file rnn.cc + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(RNNParam param, int dtype) { + LOG(FATAL) << "RNN is only available for gpu at the moment."; + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new RNNOp(param); + }); + return op; +} + +Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(RNNParam); + +MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) +.describe("Apply a recurrent layer to input.") +.add_argument("data", "Symbol", "Input data to RNN") +.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") +.add_arguments(RNNParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu new file mode 100644 index 000000000000..2cb482f591b2 --- /dev/null +++ b/src/operator/rnn.cu @@ -0,0 +1,33 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file rnn.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./rnn-inl.h" +#include +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +#include "./cudnn_rnn-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(RNNParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNRNNOp(param); + }) +#else + 1; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new SpatialTransformerOp(param); + }) +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + return op; +} + +} // namespace op +} // namespace mxnet From fde1cb30e85f6841b68a9d5ecc1bd27278d73d78 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sat, 9 Jul 2016 23:17:47 -0400 Subject: [PATCH 18/36] - removed unnecessary commented-out code - fixed error in output shape inference --- src/operator/rnn-inl.h | 207 +++-------------------------------------- 1 file changed, 12 insertions(+), 195 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 3a538f001d5b..37150bf58878 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -29,7 +29,6 @@ namespace rnn_enum { } // A utility function to calculate input size - inline int rnn_single_param_size(int inputSize, int hiddenSize, int mode){ @@ -116,86 +115,7 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; -// CHECK_EQ(req[rnn_enum::kOut], kWriteTo); - -// CHECK_EQ(in_data.size(), expected); -// CHECK_EQ(out_data.size(), 1); -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor out = out_data[rnn_enum::kOut].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(out.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(out.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(out.Slice(i, i + step), -// param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, -// gstride * (gid + 1)); -// tmpc =
dot(wmat[gid].T(), temp_dst[gid]); -// } -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// out.Slice(i, i + step) = pack_col2patch(temp_col, -// out.Slice(i, i + step).shape_, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1); // RNN only support dilate equals 1 -// } else { -// Shape<4> pshape = out.Slice(i, i + step).shape_; -// pshape[2] += 2 * param_.pad[0]; -// pshape[3] += 2 * param_.pad[1]; -// out.Slice(i, i + step) = crop(pack_col2patch(temp_col, -// pshape, -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// 1), // RNN only support dilate equals 1 -// out[i][0].shape_); -// } -// } -// if (!param_.no_bias) { -// // add bias, broadcast bias to dim 1: channel -// Tensor bias = in_data[rnn_enum::kBias].get(s); -// out += broadcast<1>(bias, out.shape_); -// } + // TODO: add MShadow implementation } virtual void Backward(const OpContext &ctx, @@ -207,125 +127,13 @@ class RNNOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful -// CHECK_EQ(out_grad.size(), 1); -// size_t expected = param_.no_bias == 0 ? 3 : 2; -// CHECK(in_data.size() == expected && in_grad.size() == expected); -// CHECK_EQ(req.size(), expected); -// CHECK_EQ(in_data[rnn_enum::kWeight].CheckContiguous(), true); -// // get data -// Stream *s = ctx.get_stream(); -// Tensor data = in_data[rnn_enum::kData].get(s); -// Tensor grad = out_grad[rnn_enum::kOut].get(s); -// Tensor gdata = in_grad[rnn_enum::kData].get(s); -// Shape<3> wmat_shape = -// Shape3(param_.num_group, -// data.shape_[1] / param_.num_group, -// param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); -// Tensor wmat = -// in_data[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// Tensor gwmat = -// in_grad[rnn_enum::kWeight].get_with_shape(wmat_shape, s); -// #if defined(__CUDACC__) -// CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) -// << "Must init CuBLAS handle in stream"; -// #endif -// const index_t nbatch = data.size(0); -// Tensor workspace = -// ctx.requested[rnn_enum::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(grad.shape_, data.shape_)), s); -// for (index_t i = 0; i < nbatch; i += nstep_) { -// const index_t step = std::min(nstep_, nbatch - i); -// Tensor temp_col = Tensor( -// workspace.dptr_, -// Shape2(shape_colunit_[0], -// shape_colunit_[1] * step), s); -// Tensor temp_dst = Tensor( -// workspace.dptr_ + temp_col.shape_.Size(), -// Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * step), s); -// temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); -// if (param_.pad[0] == 0 && param_.pad[1] == 0) { -// temp_col = unpack_patch2col(grad.Slice(i, i + step), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } else { -// temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), -// param_.kernel[0], -// param_.kernel[1], -// param_.stride[0], -// param_.stride[1], -// 1, 1); // RNN only support dilate equals 1 -// } -// const index_t gstride = temp_col.size(0) / param_.num_group; -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// if (i == 0) { -// Tensor tmp_gwmat = gwmat[gid]; -// Assign(tmp_gwmat, req[rnn_enum::kWeight], dot(temp_dst[gid], tmpc.T())); -// } else { -// gwmat[gid] += dot(temp_dst[gid], 
tmpc.T()); -// } -// } -// if (req[rnn_enum::kData] == kWriteTo || req[rnn_enum::kData] == kWriteInplace) { -// for (uint32_t gid = 0; gid < param_.num_group; ++gid) { -// Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); -// temp_dst[gid] = dot(wmat[gid], tmpc); -// } -// gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, -// mshadow::Shape4(gdata.shape_[1], -// step, -// gdata.size(2), -// gdata.size(3)))); -// } -// } -// if (!param_.no_bias) { -// Tensor gbias = in_grad[rnn_enum::kBias].get(s); -// Assign(gbias, req[rnn_enum::kBias], sumall_except_dim<1>(grad)); -// } + // TODO: add MShadow implementation } - private: -// inline index_t InitTemp(const mshadow::Shape<4> &ishape, -// const mshadow::Shape<4> &oshape) { -// const int ksize_y = param_.kernel[0]; -// const int ksize_x = param_.kernel[1]; -// shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, -// oshape[2] * oshape[3]); -// shape_dstunit_ = mshadow::Shape3(param_.num_group, -// oshape[1] / param_.num_group, -// oshape[2] * oshape[3]); -// // See convolution for workspace calculations -// nstep_ = std::max( -// std::min( -// static_cast( -// param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size())), -// ishape[0]), -// 1U); - -// mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], -// shape_colunit_[1] * nstep_); -// mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], -// shape_dstunit_[1], -// shape_dstunit_[2] * nstep_); -// index_t required_size = scol.Size() + sdst.Size(); -// CHECK_GE(param_.workspace, required_size) -// << "\nMinimum workspace size: " << required_size * sizeof(DType) << " Bytes\n" -// << "Given: " << param_.workspace * sizeof(DType); -// return required_size; -// } - private: RNNParam param_; }; // class RNNOp - - - template Operator* CreateOp(RNNParam param, int dtype); @@ -340,6 +148,14 @@ class RNNProp : public OperatorProperty { } } + std::vector ListOutputs() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"output", "final_state", "final_state_cell"}; + } else { + return {"output", "final_state"}; + } + } + void Init(const std::vector >& kwargs) override { param_.Init(kwargs); } @@ -386,7 +202,7 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[3] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; @@ -396,6 +212,7 @@ class RNNProp : public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; From 7a8a11b53fb0e143e24fa9f15f345f99140a548e Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 10 Jul 2016 00:55:39 -0400 Subject: [PATCH 19/36] - some renaming - added cudnn destructors --- src/operator/cudnn_rnn-inl.h | 163 +++++++++++++++++++++++------------ src/operator/rnn-inl.h | 34 ++++---- src/operator/rnn.cc | 6 +- src/operator/rnn.cu | 5 +- 4 files changed, 129 insertions(+), 79 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 37895c2b2488..61d6d2c2f23a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -23,16 +23,16 @@ class CuDNNRNNOp : public Operator { // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: - rnn_mode_ = 
CUDNN_RNN_RELU; + mode_ = CUDNN_RNN_RELU; break; case rnn_enum::kRnnTanh: - rnn_mode_ = CUDNN_RNN_TANH; + mode_ = CUDNN_RNN_TANH; break; case rnn_enum::kLstm: - rnn_mode_ = CUDNN_LSTM; + mode_ = CUDNN_LSTM; break; case rnn_enum::kGru: - rnn_mode_ = CUDNN_GRU; + mode_ = CUDNN_GRU; break; default: LOG(FATAL) << "Not implmented"; @@ -40,22 +40,31 @@ class CuDNNRNNOp : public Operator { // RNN Direction switch (param_.direction) { case rnn_enum::kUnidirectional: - rnn_direction_ = CUDNN_UNIDIRECTIONAL; + direction_ = CUDNN_UNIDIRECTIONAL; break; case rnn_enum::kBidirectional: - rnn_direction_ = CUDNN_BIDIRECTIONAL; + direction_ = CUDNN_BIDIRECTIONAL; break; default: LOG(FATAL) << "Not implmented"; } } - // ~CuDNNRNNOp() { - // if (init_cudnn_) { - // CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); - // // CHECK_EQ(cudnnDestroyTensorDescriptor(_desc_), CUDNN_STATUS_SUCCESS); - // } - // } + + ~CuDNNRNNOp() { + if (init_cudnn_) { + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + } + } + } virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -150,52 +159,96 @@ class CuDNNRNNOp : public Operator { const std::vector &in_data, const std::vector &out_data) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // if (!init_cudnn_) { - // init_cudnn_ = true; - // // Tensor data = in_data[st::kData].get(s); - // // Tensor out = out_data[st::kOut].get(s); - // CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateDropoutDescriptor(&rnn_dropout_), CUDNN_STATUS_SUCCESS); + #if CUDNN_MAJOR == 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(in_data.size(), 4); + CHECK_EQ(out_data.size(), 3); + } + else{ + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + } + + if (!init_cudnn_) { + init_cudnn_ = true; + + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), 
CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } - // } + // Create tensors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + } + + // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + // format_, + // dtype_, + // data.size(0), + // data.size(1), + // data.size(2), + // data.size(3)), CUDNN_STATUS_SUCCESS); + // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + // format_, + // dtype_, + // out.size(0), + // out.size(1), + // out.size(2), + // out.size(3)), CUDNN_STATUS_SUCCESS); + // if (param_.sampler_type == st::kBilinear) { + // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + // static_cast(out.size(2)), static_cast(out.size(3))}; + // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + // sampler_, + // dtype_, + // 4, + // dim) , CUDNN_STATUS_SUCCESS); + // } + } } - - bool init_cudnn_; + cudnnDataType_t dtype_; + bool init_cudnn_; cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t rnn_mode_; - cudnnDirectionMode_t rnn_direction_; - cudnnRNNInputMode_t rnn_input_mode_; - cudnnDropoutDescriptor_t rnn_dropout_; - // cudnnTensorDescriptor_t in_desc_; - // cudnnTensorDescriptor_t out_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + + cudnnTensorDescriptor_t x_desc_; + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t y_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnFilterDescriptor_t w_desc_; + #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; #endif diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 37150bf58878..2729a2ff49cc 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,16 +21,16 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kWeight, kStateIn, kCellStateIn}; + enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; - enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } // A utility function to calculate input size inline int rnn_single_param_size(int inputSize, - int hiddenSize, + int hiddenSize, int mode){ int size = hiddenSize * 
(hiddenSize + inputSize + 2); // Different RNN's have different num weights @@ -52,10 +52,10 @@ inline int rnn_single_param_size(int inputSize, return size; } -inline int rnn_param_size(int layerNum, +inline int rnn_param_size(int layerNum, int inputSize, - int hiddenSize, - int direction, + int hiddenSize, + int direction, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); @@ -194,26 +194,26 @@ class RNNProp : public OperatorProperty { Shape3(total_layers, batchSize, param_.state_size)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, - param_.direction, + int weight_size = rnn_param_size(param_.num_layers, + inputSize, + param_.state_size, + param_.direction, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kWeight, Shape1(weight_size)); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; oshape[2] = numDirections * param_.state_size; - // infer output state size + // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batchSize; outStateShape[2] = param_.state_size; - out_shape->clear(); + out_shape->clear(); out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -236,7 +236,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); - if (param_.mode == rnn_enum::kLstm) + if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; } @@ -256,9 +256,9 @@ class RNNProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { if (param_.mode == rnn_enum::kLstm) - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; else - return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kWeight]}; + return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]}; } std::vector ForwardResource( diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 40f7f705718d..2a485e5ef224 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -33,9 +33,9 @@ DMLC_REGISTER_PARAMETER(RNNParam); MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") -.add_argument("weight", "Symbol", "Weight for RNN layers") +.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks") -.add_arguments(RNNParam::__FIELDS__()); +.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 2cb482f591b2..fb90daf19b41 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -21,10 +21,7 @@ Operator* CreateOp(RNNParam param, int dtype) { op = new CuDNNRNNOp(param); }) #else - 1; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new SpatialTransformerOp(param); - }) + LOG(FATAL) << "RNN is only 
available for cuDNN at the moment."; #endif // MXNET_USE_CUDNN && CUDNN_MAJOR return op; } From 8979b01ba3c845e58b7c3dde6e759d6b02da8e01 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Sun, 17 Jul 2016 16:01:48 -0400 Subject: [PATCH 20/36] - added dropout --- src/operator/cudnn_rnn-inl.h | 166 +++++++++++++++++++++++++++-------- src/operator/rnn-inl.h | 5 ++ 2 files changed, 135 insertions(+), 36 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 61d6d2c2f23a..90bf5cbc9bc7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -20,6 +20,8 @@ class CuDNNRNNOp : public Operator { this->param_ = param; init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -72,9 +74,48 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + // get input + output tensors + Tensor data = in_data[rnn_enum::kData].get(s); + Tensor params = in_data[rnn_enum::kParams].get(s); + Tensor state = in_data[rnn_enum::kStateIn].get(s); + + Tensor out = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + + if (param_.mode == rnn_enum::kLstm){ + Tensor cell_state = + in_data[rnn_enum::kCellStateIn].get(s); + Tensor out_cell_state = + in_data[rnn_enum::kCellStateOut].get(s); + } + // if (param_.mode == rnn_enum::kLstm){ + // CHECK_EQ(in_data.size(), 4); + // CHECK_EQ(out_data.size(), 3); + // } + // else{ + // CHECK_EQ(in_data.size(), 3); + // CHECK_EQ(out_data.size(), 2); + // } + // // Get tensors + // + // Tensor data = in_data[rnn_enum::kData].get(s); + // Tensor params = in_data[rnn_enum::kParams].get(s); + // Tensor state = in_data[rnn_enum::kStateIn].get(s); + + // Tensor out = out_data[rnn_enum::kOut].get(s); + // Tensor out_state = out_data[rnn_enum::kOut].get(s); + + // if (param_.mode == rnn_enum::kLstm){ + // Tensor cell_state = + // in_data[rnn_enum::kCellStateIn].get(s); + // Tensor out_cell_state = + // in_data[rnn_enum::kCellStateOut].get(s); + // } + // // Tensor data = in_data[st::kData].get(s); // Tensor out = out_data[st::kOut].get(s); // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); @@ -162,8 +203,7 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ CHECK_EQ(in_data.size(), 4); CHECK_EQ(out_data.size(), 3); } @@ -171,64 +211,118 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(in_data.size(), 3); CHECK_EQ(out_data.size(), 2); } - if (!init_cudnn_) { init_cudnn_ = true; - + // get input + output tensors Tensor data = in_data[rnn_enum::kData].get(s); Tensor params = in_data[rnn_enum::kParams].get(s); Tensor state = in_data[rnn_enum::kStateIn].get(s); Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kOut].get(s); + Tensor out_state = out_data[rnn_enum::kStateOut].get(s); - if (param_.mode == rnn_enum::kLstm){ + if(param_.mode == rnn_enum::kLstm){ Tensor cell_state = in_data[rnn_enum::kCellStateIn].get(s); Tensor out_cell_state = in_data[rnn_enum::kCellStateOut].get(s); } + // Create descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), 
CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - // Create tensors CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + if (param_.mode == rnn_enum::kLstm){ CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); } + // set dropout + // cudnnSetDropoutDescriptor(dropout_desc_, + // s->dnn_handle_, + // param_.p, + // void * states, + // size_t stateSizeInBytes, + // unsigned long long seed) + // set RNN + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Set params + int dim_params[3] = {params.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_params + ), CUDNN_STATUS_SUCCESS); + // Get strides + int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; + int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; + int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; + int stride_out_state[3] = + {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; + + // cuDNN needs int arrays for dim, not index_t array used in Shape + int dim_data[3]; + int dim_state[3]; + int dim_out[3]; + int dim_out_state[3]; + std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); + std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); + std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); + std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - // CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - // format_, - // dtype_, - // data.size(0), - // data.size(1), - // data.size(2), - // data.size(3)), CUDNN_STATUS_SUCCESS); - // CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - // format_, - // dtype_, - // out.size(0), - // out.size(1), - // out.size(2), - // out.size(3)), CUDNN_STATUS_SUCCESS); - // if (param_.sampler_type == st::kBilinear) { - // int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), - // static_cast(out.size(2)), static_cast(out.size(3))}; - // CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, - // sampler_, - // dtype_, - // 4, - // dim) , CUDNN_STATUS_SUCCESS); - // } + // set the tensor descriptors + CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + dtype_, + 3, + dim_data, + stride_data + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + dtype_, + 3, + dim_out, + stride_out + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + // LSTM has two extra descriptors + if 
(param_.mode == rnn_enum::kLstm){ + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dim_state, + stride_state + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dim_out_state, + stride_out_state + ), CUDNN_STATUS_SUCCESS); + } } } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 2729a2ff49cc..d81ed1637756 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -74,6 +74,7 @@ struct RNNParam : public dmlc::Parameter { bool batch_first; int direction; int mode; + float p; DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -96,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { .add_enum("lstm", rnn_enum::kLstm) .add_enum("gru", rnn_enum::kGru) .describe("the type of RNN to compute"); + + DMLC_DECLARE_FIELD(p).set_default(0.) + .set_range(0, 1) + .describe("Fraction of the input that gets dropped out at training time"); } }; From 7861b3de9b8091f7d9243c6c34b41416c39bd069 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 00:28:48 -0400 Subject: [PATCH 21/36] - major refactor - completed forward evaluation --- src/operator/cudnn_rnn-inl.h | 481 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 39 ++- 2 files changed, 277 insertions(+), 243 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 90bf5cbc9bc7..134044321ad7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2016 by Contributors - * \file cudnn_spatial_transformer-inl.h + * \file cudnn_rnn-inl.h * \brief * \author Sebastian Bodenstein */ @@ -21,7 +21,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -40,31 +40,29 @@ class CuDNNRNNOp : public Operator { LOG(FATAL) << "Not implmented"; } // RNN Direction - switch (param_.direction) { - case rnn_enum::kUnidirectional: - direction_ = CUDNN_UNIDIRECTIONAL; - break; - case rnn_enum::kBidirectional: - direction_ = CUDNN_BIDIRECTIONAL; - break; - default: - LOG(FATAL) << "Not implmented"; - } + direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } ~CuDNNRNNOp() { if (init_cudnn_) { - CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_), CUDNN_STATUS_SUCCESS); + for(int i = 0; i < x_desc_vec_.size(); ++i){ + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + } CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); - } } } @@ -74,77 +72,83 @@ class CuDNNRNNOp : public Operator { const std::vector &out_data, const std::vector &aux_args) { using namespace mshadow; + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
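// The expected tensor counts mirror the rnn_enum lists: inputs are
// {data, parameters, state} plus the cell state for LSTM, outputs are
// {output, state} plus the cell state for LSTM. As a quick sketch:
//   LSTM (lstm_q_ = true): in_expected = 4, out_expected = 3
//   RNN relu/tanh, GRU:    in_expected = 3, out_expected = 2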
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); Stream *s = ctx.get_stream(); - if(!init_cudnn_){ - Init(s, in_data, out_data); - } // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kStateIn].get(s); - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor hy = out_data[rnn_enum::kStateOut].get(s); + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } - // if (param_.mode == rnn_enum::kLstm){ - // CHECK_EQ(in_data.size(), 4); - // CHECK_EQ(out_data.size(), 3); - // } - // else{ - // CHECK_EQ(in_data.size(), 3); - // CHECK_EQ(out_data.size(), 2); - // } - // // Get tensors - // - // Tensor data = in_data[rnn_enum::kData].get(s); - // Tensor params = in_data[rnn_enum::kParams].get(s); - // Tensor state = in_data[rnn_enum::kStateIn].get(s); - // Tensor out = out_data[rnn_enum::kOut].get(s); - // Tensor out_state = out_data[rnn_enum::kOut].get(s); + if(!init_cudnn_){ + Init(s, in_data, out_data); + } - // if (param_.mode == rnn_enum::kLstm){ - // Tensor cell_state = - // in_data[rnn_enum::kCellStateIn].get(s); - // Tensor out_cell_state = - // in_data[rnn_enum::kCellStateOut].get(s); - // } - // - // Tensor data = in_data[st::kData].get(s); - // Tensor out = out_data[st::kOut].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); - // Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // if (!init_cudnn_) { - // Init(s, in_data, out_data); - // } - // CHECK_EQ(data.CheckContiguous(), true); - // CHECK_EQ(out.CheckContiguous(), true); - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, - // st_desc_, - // loc.dptr_, - // grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); - // } - // CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // grid.dptr_, - // &beta, - // out_desc_, - // out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + if (ctx.is_train) { + // training mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_ + reserve_space_size_), s); + CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + } else { + // inference mode + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size_), 
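// Workspace is scratch memory either pass may overwrite freely; the reserve
// space must additionally survive from the training forward pass to the
// matching backward pass. That is why the training branch above requests
// workspace_size_ + reserve_space_size_ elements and passes cuDNN the two
// regions as temp_space.dptr_ and temp_space.dptr_ + workspace_size_, while
// inference, having no backward pass, only needs the workspace requested here.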
s); + CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy.dptr_, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + } } // virtual void Backward(const OpContext &ctx, @@ -155,46 +159,12 @@ class CuDNNRNNOp : public Operator { const std::vector &in_grad, const std::vector &aux_args) { using namespace mshadow; - // CHECK_EQ(in_data.size(), 2); - // CHECK_EQ(out_data.size(), 3); - // CHECK_EQ(out_grad.size(), 1); - // Stream *s = ctx.get_stream(); - // Tensor data = in_data[st::kData].get(s); - // Tensor grad = out_grad[st::kOut].get(s); - // Tensor ddata = in_grad[st::kData].get(s); - // Shape<3> loc_shape = Shape3(data.size(0), 2, 3); - // Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); - // Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); - // Tensor grid = out_data[st::kGridSrc] - // .get_with_shape(grid_shape, s); - // // do not use out_grad[st::kGridSrc], because dgrid is a intermediate tensor, and not include in - // // DeclareBackwardDependency, another, we can we reuse grid for inplace operator - // typename DataType::ScaleType alpha = 1.0f; - // typename DataType::ScaleType beta = 0.0f; - // typename DataType::ScaleType alpha_dgrid = 1.0f; - // typename DataType::ScaleType beta_dgrid = 0.0f; - // CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, - // st_desc_, - // &alpha, - // in_desc_, - // data.dptr_, - // &beta, - // in_desc_/*reuse in_desc_*/, - // ddata.dptr_/*output*/, - // &alpha_dgrid, - // out_desc_/*reuse out_desc_*/, - // grad.dptr_, - // grid.dptr_, - // &beta_dgrid, - // grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); - // if (param_.transform_type == st::kAffine) { - // CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, - // st_desc_, - // grid.dptr_, - // dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); - // } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data.size(), out_expected); } - // private: inline void Init(mshadow::Stream *s, const std::vector &in_data, @@ -203,126 +173,193 @@ class CuDNNRNNOp : public Operator { #if CUDNN_MAJOR == 5 format_ = CUDNN_TENSOR_NCHW; #endif - if(param_.mode == rnn_enum::kLstm){ - CHECK_EQ(in_data.size(), 4); - CHECK_EQ(out_data.size(), 3); - } - else{ - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 2); - } + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
3 : 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); if (!init_cudnn_) { init_cudnn_ = true; // get input + output tensors - Tensor data = in_data[rnn_enum::kData].get(s); - Tensor params = in_data[rnn_enum::kParams].get(s); - Tensor state = in_data[rnn_enum::kStateIn].get(s); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = x.shape_[0]; + dimA[1] = x.shape_[2]; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - Tensor out = out_data[rnn_enum::kOut].get(s); - Tensor out_state = out_data[rnn_enum::kStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = x.shape_[0]; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - if(param_.mode == rnn_enum::kLstm){ - Tensor cell_state = - in_data[rnn_enum::kCellStateIn].get(s); - Tensor out_cell_state = - in_data[rnn_enum::kCellStateOut].get(s); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; - // Create descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 
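// The loop above builds one 3-D descriptor per time step with fully packed
// strides. As a sketch for the input descriptors, with hypothetical sizes
// batch = 32 and input dimension = 128:
//   dimA    = {32, 128, 1};   // {batch, input size, trailing 1}
//   strideA = {128, 1, 1};    // strideA[k] = product of the dims after k
// (the y/dy descriptors are identical except that dimA[1] becomes
// state_size, doubled when bidirectional). The state tensors set up next
// use {total layers, batch, state_size} instead: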
2 : 1); + dimA[1] = x.shape_[0]; //minibatch + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&x_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcy_desc_), CUDNN_STATUS_SUCCESS); - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); - } - // set dropout - // cudnnSetDropoutDescriptor(dropout_desc_, - // s->dnn_handle_, - // param_.p, - // void * states, - // size_t stateSizeInBytes, - // unsigned long long seed) - // set RNN - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); - // Set params - int dim_params[3] = {params.shape_[0], 1, 1}; - CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, - format_, 3, - dim_params + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // Get strides - int stride_data[3] = {data.shape_[2]*data.shape_[1], data.shape_[2], 1}; - int stride_state[3] = {state.shape_[2]*state.shape_[1], state.shape_[2], 1}; - int stride_out[3] = {out.shape_[2]*out.shape_[1], out.shape_[2], 1}; - int stride_out_state[3] = - {out_state.shape_[2]*out_state.shape_[1], out_state.shape_[2], 1}; - - // cuDNN needs int arrays for dim, not index_t array used in Shape - int dim_data[3]; - int dim_state[3]; - int dim_out[3]; - int dim_out_state[3]; - std::copy(std::begin(data.shape_.shape_), std::end(data.shape_.shape_), std::begin(dim_data)); - std::copy(std::begin(state.shape_.shape_), std::end(state.shape_.shape_), std::begin(dim_state)); - std::copy(std::begin(out.shape_.shape_), std::end(out.shape_.shape_), std::begin(dim_out)); - std::copy(std::begin(out_state.shape_.shape_), std::end(out_state.shape_.shape_), std::begin(dim_out_state)); - - // set the tensor descriptors - CHECK_EQ(cudnnSetTensorNdDescriptor(x_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, - dim_data, - stride_data + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, - dim_state, - stride_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(y_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, - dim_out, - stride_out + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, - dim_out_state, - stride_out_state + dimA, + strideA ), CUDNN_STATUS_SUCCESS); - // LSTM has two extra descriptors - if (param_.mode == rnn_enum::kLstm){ - CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dim_state, 
- stride_state - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dim_out_state, - stride_out_state + CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + + // Get temp space sizes + CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + workspace_size_ = workspace_byte_ / sizeof(DType) + 1; + reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1; + + // Set param descriptors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); + int dim_w[3] = {w.shape_[0], 1, 1}; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w + ), CUDNN_STATUS_SUCCESS); + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_ ), CUDNN_STATUS_SUCCESS); - } + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.pkeep_, // keep probability + NULL, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + } } @@ -333,15 +370,17 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; + unsigned long long seed_ = 4553; + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, reserve_space_size_, dropout_size_; - cudnnTensorDescriptor_t x_desc_; - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t y_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t w_desc_, dw_desc_; #if CUDNN_MAJOR == 5 cudnnTensorFormat_t format_; @@ -352,4 +391,4 @@ class CuDNNRNNOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ +#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index d81ed1637756..53189d100ef2 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -24,7 +24,6 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, 
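// Each of these modes selects a cuDNN cell type in the operator's
// constructor; the mode-dependent gate multiplier behind
// rnn_single_param_size is, as a sketch:
//   kRnnRelu, kRnnTanh -> CUDNN_RNN_RELU / CUDNN_RNN_TANH (1 gate)
//   kLstm              -> CUDNN_LSTM                      (4 gates)
//   kGru               -> CUDNN_GRU                       (3 gates)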
kGru}; - enum RNNDirectionType {kUnidirectional, kBidirectional}; enum RNNOpResource {kTempSpace}; } @@ -55,26 +54,27 @@ inline int rnn_single_param_size(int inputSize, inline int rnn_param_size(int layerNum, int inputSize, int hiddenSize, - int direction, + bool bidirectional, int mode){ // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(direction == rnn_enum::kUnidirectional) - size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); - else // bidirectional case: input size increases by 2 + if(bidirectional) size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + else + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; } struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - uint64_t workspace; bool batch_first; - int direction; + bool bidirectional; int mode; - float p; + float p, pkeep_; + int seq_length_; + bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { DMLC_DECLARE_FIELD(state_size) @@ -83,13 +83,8 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); - DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) - .describe("Tmp workspace for RNN (MB)"); - - DMLC_DECLARE_FIELD(direction) - .add_enum("unidirectional", rnn_enum::kUnidirectional) - .add_enum("bidirectional", rnn_enum::kBidirectional) - .describe("specifies the recurrence pattern"); + DMLC_DECLARE_FIELD(bidirectional).set_default(false) + .describe("whether to use bidirectional recurrent layers"); DMLC_DECLARE_FIELD(mode) .add_enum("rnn_relu", rnn_enum::kRnnRelu) @@ -108,9 +103,12 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - this->param_ = p; // convert MBytes first to Bytes and then to elements. - param_.workspace = (param_.workspace << 20) / sizeof(real_t); + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, @@ -185,10 +183,7 @@ class RNNProp : public OperatorProperty { // Infer hidden state + cell state int batchSize = dshape[0]; int inputSize = dshape[2]; - int numDirections = 1; - if(param_.direction == rnn_enum::kBidirectional){ - numDirections = 2; - } + int numDirections = param_.bidirectional ? 
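// An illustrative example of the shapes this infers (hypothetical values:
// sequence length 10, batch 32, input size 128, state_size 256,
// num_layers 2, bidirectional = true, hence numDirections = 2):
//   data:     (10, 32, 128)
//   state in: (4, 32, 256)    // total_layers x batch x state_size
//   output:   (10, 32, 512)   // state_size * numDirections per step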
2 : 1; int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateIn, Shape3(total_layers, batchSize, param_.state_size)); @@ -202,7 +197,7 @@ class RNNProp : public OperatorProperty { int weight_size = rnn_param_size(param_.num_layers, inputSize, param_.state_size, - param_.direction, + param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size From c1382b35eb12b37d518da8750142ee07bab2cc45 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 02:25:19 -0400 Subject: [PATCH 22/36] - added parameter size test - fixed bug where cudnnGetRNNParamsSize needs to be called after cudnnSetRNNDescriptor --- src/operator/cudnn_rnn-inl.h | 64 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 134044321ad7..3a40b2f67fd7 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -150,7 +150,7 @@ class CuDNNRNNOp : public Operator { ), CUDNN_STATUS_SUCCESS); } } - // + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, @@ -182,6 +182,9 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); + + param_.seq_length_ = x.shape_[1]; + // Tensor Descriptors std::vector x_vec(param_.seq_length_); std::vector y_vec(param_.seq_length_); @@ -305,7 +308,29 @@ class CuDNNRNNOp : public Operator { strideA ), CUDNN_STATUS_SUCCESS); - // Get temp space + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_ + ), CUDNN_STATUS_SUCCESS); + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.pkeep_, // keep probability + NULL, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Get temp space sizes CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), &workspace_byte_ ), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, rnn_desc_, param_.seq_length_, x_desc_vec_.data(), &reserve_space_byte_ ), CUDNN_STATUS_SUCCESS); - workspace_size_ = workspace_byte_ / sizeof(DType) + 1; - reserve_space_size_ = reserve_space_byte_ / sizeof(DType) + 1; + workspace_size_ = workspace_byte_ / sizeof(DType); + reserve_space_size_ = reserve_space_byte_ / sizeof(DType); + + // check that the number of params is correct + size_t cudnn_param_size; + CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_, + rnn_desc_, + x_desc_vec_[0], + &cudnn_param_size, + dtype_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); // Set param descriptors CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); int dim_w[3] = {w.shape_[0], 1, 1}; CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, dtype_, format_, 3, dim_w ), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, dtype_, format_, 3, dim_w ), CUDNN_STATUS_SUCCESS); - // Create Dropout descriptors - CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, - &dropout_byte_ - ), CUDNN_STATUS_SUCCESS); - dropout_size_ = dropout_byte_ / sizeof(DType); - 
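// The reorder above is the point of this patch: cudnnGetRNNWorkspaceSize,
// cudnnGetRNNTrainingReserveSize and cudnnGetRNNParamsSize all consult
// rnn_desc_, so the dropout and RNN descriptors must be fully configured
// before any size query runs. A sketch of the required order:
//   1. cudnnCreateDropoutDescriptor + cudnnSetDropoutDescriptor
//   2. cudnnCreateRNNDescriptor     + cudnnSetRNNDescriptor
//   3. workspace / reserve-space / params-size queries
//   4. filter descriptors (w_desc_, dw_desc_) sized from the checked
//      parameter vector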
CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, - s->dnn_handle_, - param_.pkeep_, // keep probability - NULL, - dropout_byte_, - seed_), CUDNN_STATUS_SUCCESS); - // RNN descriptors - CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_), CUDNN_STATUS_SUCCESS); } } From f87c003bbd27f4dabf5c87adba0b86a604269562 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Mon, 18 Jul 2016 10:32:55 -0400 Subject: [PATCH 23/36] - checks for contiguous input tensors - more consistent param names - removed 'batch_first' option for now. Might add it later again --- src/operator/cudnn_rnn-inl.h | 119 +++++++++++++++++++---------------- src/operator/rnn-inl.h | 37 ++++++----- 2 files changed, 82 insertions(+), 74 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 3a40b2f67fd7..8c6eae9dc984 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -92,15 +92,24 @@ class CuDNNRNNOp : public Operator { cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; } + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + if(!init_cudnn_){ Init(s, in_data, out_data); } + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + if (ctx.is_train) { - // training mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_ + reserve_space_size_), s); CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -125,9 +134,6 @@ class CuDNNRNNOp : public Operator { ), CUDNN_STATUS_SUCCESS); } else { // inference mode - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(workspace_size_), s); CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, rnn_desc_, param_.seq_length_, @@ -182,8 +188,9 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - - param_.seq_length_ = x.shape_[1]; + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; // Tensor Descriptors std::vector x_vec(param_.seq_length_); @@ -193,49 +200,51 @@ class CuDNNRNNOp : public Operator { int dimA[3]; int strideA[3]; for (int i = 0; i < param_.seq_length_; i++) { - CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); - - dimA[0] = x.shape_[0]; - dimA[1] = x.shape_[2]; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + 
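// cuDNN reads these buffers as flat, fully packed memory: the strides set
// below are strideA = {dimA[1] * dimA[2], dimA[2], 1}, i.e. no padding
// between elements. That is also why Forward now guards every tensor with
// CheckContiguous() before handing its dptr_ to cuDNN.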
dimA[2] = 1; + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - dimA[0] = x.shape_[0]; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; - CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA - ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA + ), CUDNN_STATUS_SUCCESS); } x_desc_vec_ = x_vec; y_desc_vec_ = y_vec; @@ -243,9 +252,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = x.shape_[0]; //minibatch - dimA[2] = param_.state_size; + dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size_; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -323,8 +332,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, + param_.state_size_, + param_.num_layers_, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 53189d100ef2..a4252b7e8fe5 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,20 +67,19 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size; - uint32_t num_layers; - bool batch_first; + uint32_t state_size_; + uint32_t num_layers_; bool bidirectional; int mode; float p, pkeep_; - int seq_length_; + int seq_length_, batch_size_, input_size_; bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size) + DMLC_DECLARE_FIELD(state_size_) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers) + DMLC_DECLARE_FIELD(num_layers_) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -179,35 +178,35 @@ class RNNProp : public OperatorProperty { const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (seqLength, batch, inputDim)."; - // Infer hidden state + cell state - int batchSize = dshape[0]; - int inputSize = dshape[2]; + << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; + // Get input sizes + int batch_size = 
dshape[1]; + int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional + int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kCellStateIn, - Shape3(total_layers, batchSize, param_.state_size)); + Shape3(total_layers, batch_size, param_.state_size_)); } // infer weight size - int weight_size = rnn_param_size(param_.num_layers, - inputSize, - param_.state_size, + int weight_size = rnn_param_size(param_.num_layers_, + input_size, + param_.state_size_, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); // infer output size TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size; + oshape[2] = numDirections * param_.state_size_; // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; - outStateShape[1] = batchSize; - outStateShape[2] = param_.state_size; + outStateShape[1] = batch_size; + outStateShape[2] = param_.state_size_; out_shape->clear(); out_shape->push_back(oshape); From 8b84ef0afd2ac99c5a879d146848a4f17f48ec62 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Tue, 19 Jul 2016 23:06:39 +0200 Subject: [PATCH 24/36] - fixed input names --- src/operator/rnn-inl.h | 41 +++++++++++++++++++++++++---------------- src/operator/rnn.cc | 4 ++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index a4252b7e8fe5..98f8a5953d70 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,18 +144,28 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "weight", "state", "cell_state"}; + return {"data", "parameters", "state", "cell_state"}; } else { - return {"data", "weight", "state"}; + return {"data", "parameters", "state"}; } } std::vector ListOutputs() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"output", "final_state", "final_state_cell"}; - } else { - return {"output", "final_state"}; - } + if (param_.mode == rnn_enum::kLstm) + return {"output", "state", "state_cell"}; + else + return {"output", "state"}; + } + + int NumOutputs() const override { + if (param_.mode == rnn_enum::kLstm) + return 3; + else + return 2; + } + + int NumVisibleOutputs() const override { + return 1; } void Init(const std::vector >& kwargs) override { @@ -171,15 +181,15 @@ class RNNProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4) << "Input:[data, weight, state, cell_state]"; + CHECK_EQ(in_shape->size(), 4) << "Input:[data, parameters, state, cell_state]"; } else { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, state]"; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, parameters, state]"; } const TShape &dshape = (*in_shape)[rnn_enum::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 3) \ - << "Input data should be rank-3 tensor of dim (sequence length, batch size, input size)"; - // Get input sizes + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // 
data: [sequence len, batch, input dimension] int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; @@ -192,17 +202,16 @@ class RNNProp : public OperatorProperty { rnn_enum::kCellStateIn, Shape3(total_layers, batch_size, param_.state_size_)); } - // infer weight size - int weight_size = rnn_param_size(param_.num_layers_, + // calculate parameter vector length + int param_size = rnn_param_size(param_.num_layers_, input_size, param_.state_size_, param_.bidirectional, param_.mode); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(weight_size)); - // infer output size + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + // output: [sequence len, batch, output size] TShape oshape = dshape; oshape[2] = numDirections * param_.state_size_; - // infer output state size TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 2a485e5ef224..5e3b2b8894af 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -20,7 +20,7 @@ Operator *CreateOp(RNNParam param, int dtype) { } Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); @@ -34,7 +34,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") -.add_argument("hidden_state", "Symbol", "initial hidden state of the RNN") +.add_argument("state", "Symbol", "initial hidden state of the RNN") .add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op From d50f2dc528da00d6707ca39316132b0704c97eb1 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 12:50:59 +0200 Subject: [PATCH 25/36] - added backward method --- src/operator/cudnn_rnn-inl.h | 95 ++++++++++++++++++++++++++++++++++-- src/operator/rnn-inl.h | 8 +-- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 8c6eae9dc984..6a642f6428f8 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -80,7 +80,7 @@ class CuDNNRNNOp : public Operator { // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kStateIn].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor hy = out_data[rnn_enum::kStateOut].get(s); @@ -88,8 +88,8 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; if (param_.mode == rnn_enum::kLstm){ - cx_ptr = (in_data[rnn_enum::kCellStateIn].get(s)).dptr_; - cy_ptr = (in_data[rnn_enum::kCellStateOut].get(s)).dptr_; + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; } CHECK_EQ(x.CheckContiguous(), true); @@ -169,7 +169,94 @@ class CuDNNRNNOp : public Operator { size_t out_expected = param_.lstm_q_ ? 
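// A sketch of the backward sequence implemented below (cuDNN v5):
// cudnnRNNBackwardData first propagates dy/dhy/dcy back to dx/dhx/dcx using
// the reserve space written by cudnnRNNForwardTraining, then
// cudnnRNNBackwardWeights accumulates (adds) the weight gradient into dw
// from that same reserve space. Both calls therefore only make sense after
// a training-mode forward pass over the identical sequence.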
3 : 2; CHECK_EQ(in_data.size(), in_expected); CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_grad.size(), in_expected); + CHECK_EQ(out_grad.size(), out_expected); + + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor hy = in_data[rnn_enum::kStateOut].get(s); + Tensor dhy = out_grad[rnn_enum::kStateOut].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + + DType * cx_ptr = NULL; + // DType * cy_ptr = NULL; + DType * dcx_ptr = NULL; + DType * dcy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm){ + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + // cy_ptr = (in_data[rnn_enum::kStateCellOut].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + } + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(hy.CheckContiguous(), true); + + if(!init_cudnn_){ + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + + CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy.dptr_, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_ + ), CUDNN_STATUS_SUCCESS); } private: inline void Init(mshadow::Stream *s, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 98f8a5953d70..fd68fd628432 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -21,8 +21,8 @@ namespace mxnet { namespace op { namespace rnn_enum { - enum RNNOpInputs {kData, kParams, kStateIn, kCellStateIn}; - enum RNNOpOutputs {kOut, kStateOut, kCellStateOut}; + enum RNNOpInputs {kData, kParams, kState, kStateCell}; + enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; enum RNNOpResource {kTempSpace}; } @@ -195,11 +195,11 @@ class RNNProp : public OperatorProperty { int numDirections = param_.bidirectional ? 
2 : 1; int total_layers = numDirections * param_.num_layers_; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateIn, + rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size_)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kCellStateIn, + rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size_)); } // calculate parameter vector length From dc55e74bc324e7232c5b4089d6e96fb51d33ae74 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 13:16:50 +0200 Subject: [PATCH 26/36] - small fix for in/out names --- src/operator/rnn-inl.h | 8 ++++---- src/operator/rnn.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index fd68fd628432..137bebed5c06 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -144,7 +144,7 @@ class RNNProp : public OperatorProperty { public: std::vector ListArguments() const override { if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "cell_state"}; + return {"data", "parameters", "state", "state_cell"}; } else { return {"data", "parameters", "state"}; } @@ -164,9 +164,9 @@ class RNNProp : public OperatorProperty { return 2; } - int NumVisibleOutputs() const override { - return 1; - } + // int NumVisibleOutputs() const override { + // return 1; + // } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 5e3b2b8894af..337410c8ddc1 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -35,7 +35,7 @@ MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .add_argument("data", "Symbol", "Input data to RNN") .add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") .add_argument("state", "Symbol", "initial hidden state of the RNN") -.add_argument("cell_state", "Symbol", "initial cell state for LSTM networks (only for LSTM)") +.add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); } // namespace op } // namespace mxnet From 8bd215cd6c97b17eb4297500359a7e4011425585 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 14:24:57 +0200 Subject: [PATCH 27/36] - fixed bug: parameters can't have underscore --- src/operator/cudnn_rnn-inl.h | 10 +++++----- src/operator/rnn-inl.h | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 6a642f6428f8..d696ead26255 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -314,7 +314,7 @@ class CuDNNRNNOp : public Operator { strideA ), CUDNN_STATUS_SUCCESS); dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size_ * 2 : param_.state_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; dimA[2] = 1; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; @@ -339,9 +339,9 @@ class CuDNNRNNOp : public Operator { dy_desc_vec_ = dy_vec; // set the state tensors - dimA[0] = param_.num_layers_ * (param_.bidirectional ? 2 : 1); + dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size_; + dimA[2] = param_.state_size; strideA[0] = dimA[2] * dimA[1]; strideA[1] = dimA[2]; strideA[2] = 1; @@ -419,8 +419,8 @@ class CuDNNRNNOp : public Operator { // RNN descriptors CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size_, - param_.num_layers_, + param_.state_size, + param_.num_layers, dropout_desc_, input_mode_, direction_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 137bebed5c06..ed0cf0db84b1 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -67,8 +67,8 @@ inline int rnn_param_size(int layerNum, } struct RNNParam : public dmlc::Parameter { - uint32_t state_size_; - uint32_t num_layers_; + uint32_t state_size; + uint32_t num_layers; bool bidirectional; int mode; float p, pkeep_; @@ -76,10 +76,10 @@ struct RNNParam : public dmlc::Parameter { bool lstm_q_; // whether type is lstm DMLC_DECLARE_PARAMETER(RNNParam) { - DMLC_DECLARE_FIELD(state_size_) + DMLC_DECLARE_FIELD(state_size) .describe("size of the state for each layer"); - DMLC_DECLARE_FIELD(num_layers_) + DMLC_DECLARE_FIELD(num_layers) .describe("number of stacked layers"); DMLC_DECLARE_FIELD(bidirectional).set_default(false) @@ -193,29 +193,29 @@ class RNNProp : public OperatorProperty { int batch_size = dshape[1]; int input_size = dshape[2]; int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers_; // double for bidirectional + int total_layers = numDirections * param_.num_layers; // double for bidirectional SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); if (param_.mode == rnn_enum::kLstm){ SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size_)); + Shape3(total_layers, batch_size, param_.state_size)); } // calculate parameter vector length - int param_size = rnn_param_size(param_.num_layers_, + int param_size = rnn_param_size(param_.num_layers, input_size, - param_.state_size_, + param_.state_size, param_.bidirectional, param_.mode); SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); // output: [sequence len, batch, output size] TShape oshape = dshape; - oshape[2] = numDirections * param_.state_size_; + oshape[2] = numDirections * param_.state_size; TShape outStateShape = dshape; outStateShape[0] = total_layers; outStateShape[1] = batch_size; - outStateShape[2] = param_.state_size_; + outStateShape[2] = param_.state_size; out_shape->clear(); out_shape->push_back(oshape); From 2e333fcb54f5549faf6e10971de716b981cf3698 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Wed, 20 Jul 2016 16:36:07 +0200 Subject: [PATCH 28/36] - fixed off-by-two error in weight shape inference for bidirectional net - moved calculated param to cudnn_rnn-inl.h --- src/operator/cudnn_rnn-inl.h | 7 ++++++- src/operator/rnn-inl.h | 10 +++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index d696ead26255..1fd7afc90e3a 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -41,6 +41,12 @@ class CuDNNRNNOp : public Operator { } // RNN Direction direction_ = param_.bidirectional ? 
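// The off-by-two fix of this patch, sketched with the helpers from
// rnn-inl.h: each direction keeps its own full copy of the weights, and
// layers after the first consume the concatenated (2 * hiddenSize) outputs
// of both directions, so for the bidirectional case
//   size  = rnn_single_param_size(inputSize, hiddenSize, mode)
//         + (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode);
//   size *= 2;  // one copy per direction
// which is exactly the rnn_param_size change in the rnn-inl.h hunk of this
// patch.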
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Other + param_.pkeep_ = 1.0f - param_.p; + if(param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; } ~CuDNNRNNOp() { @@ -212,7 +218,6 @@ class CuDNNRNNOp : public Operator { Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); - CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, rnn_desc_, param_.seq_length_, diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index ed0cf0db84b1..b51216bf9d4d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -59,8 +59,10 @@ inline int rnn_param_size(int layerNum, // get size of first layer int size = rnn_single_param_size(inputSize, hiddenSize, mode); // get size of remaining layers - if(bidirectional) + if(bidirectional){ size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + size *= 2; + } else size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); return size; @@ -102,12 +104,6 @@ template class RNNOp : public Operator { public: explicit RNNOp(RNNParam p) { - // convert MBytes first to Bytes and then to elements. - param_.pkeep_ = 1.0f - param_.p; - if(param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; } virtual void Forward(const OpContext &ctx, From 430bd0365195d715bd123c097607b0d782482b9a Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:24:56 +0200 Subject: [PATCH 29/36] - added option to control num outputs --- src/operator/cudnn_rnn-inl.h | 5 ++++- src/operator/rnn-inl.h | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 1fd7afc90e3a..0c943bab7da0 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -7,9 +7,12 @@ #ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#include #include +#include +#include +#include #include "./rnn-inl.h" + namespace mxnet { namespace op { #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index b51216bf9d4d..d036e299e519 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -71,7 +71,7 @@ inline int rnn_param_size(int layerNum, struct RNNParam : public dmlc::Parameter { uint32_t state_size; uint32_t num_layers; - bool bidirectional; + bool bidirectional, state_outputs; int mode; float p, pkeep_; int seq_length_, batch_size_, input_size_; @@ -97,6 +97,10 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) .describe("Fraction of the input that gets dropped out at training time"); + + DMLC_DECLARE_FIELD(state_outputs).set_default(false) + .describe("Whether to have the states as symbol outputs."); + } }; @@ -160,9 +164,11 @@ class RNNProp : public OperatorProperty { return 2; } - // int NumVisibleOutputs() const override { - // return 1; - // } + int NumVisibleOutputs() const override { + int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; + int num_outputs = param_.state_outputs ? 
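// state_outputs controls how many of the symbol's outputs are exposed to
// the user. As a sketch of NumVisibleOutputs():
//   state_outputs = false           -> 1  (just "output")
//   state_outputs = true, RNN/GRU   -> 2  (output, state)
//   state_outputs = true, LSTM      -> 3  (output, state, state_cell)
// The hidden outputs still count toward NumOutputs(); they are simply not
// returned.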
(mode_num + 1) : 1; + return num_outputs; + } void Init(const std::vector >& kwargs) override { param_.Init(kwargs); @@ -193,11 +199,11 @@ class RNNProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kState, Shape3(total_layers, batch_size, param_.state_size)); - if (param_.mode == rnn_enum::kLstm){ + if (param_.mode == rnn_enum::kLstm) SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kStateCell, Shape3(total_layers, batch_size, param_.state_size)); - } + // calculate parameter vector length int param_size = rnn_param_size(param_.num_layers, input_size, @@ -217,7 +223,7 @@ class RNNProp : public OperatorProperty { out_shape->push_back(oshape); out_shape->push_back(outStateShape); // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) + if(param_.mode == rnn_enum::kLstm) out_shape->push_back(outStateShape); return true; } @@ -240,6 +246,7 @@ class RNNProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); out_type->push_back(dtype); + // Deal with lstm cell state if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype); return true; From 4dbe1367f9204517ee91e62e43db2ac4a8c87c58 Mon Sep 17 00:00:00 2001 From: Sebastian Bodenstein Date: Thu, 21 Jul 2016 13:52:17 +0200 Subject: [PATCH 30/36] - removed lint --- src/operator/cudnn_rnn-inl.h | 137 +++++++++++++++-------------------- src/operator/rnn-inl.h | 32 ++++---- src/operator/rnn.cc | 5 +- src/operator/rnn.cu | 2 +- 4 files changed, 77 insertions(+), 99 deletions(-) diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 0c943bab7da0..f3bfc1eac1fe 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -24,7 +24,7 @@ class CuDNNRNNOp : public Operator { init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet // RNN Mode switch (param_.mode) { case rnn_enum::kRnnRelu: @@ -46,7 +46,7 @@ class CuDNNRNNOp : public Operator { direction_ = param_.bidirectional ? 
From 4dbe1367f9204517ee91e62e43db2ac4a8c87c58 Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 13:52:17 +0200
Subject: [PATCH 30/36] - removed lint

---
 src/operator/cudnn_rnn-inl.h | 137 +++++++++++++++--------------------
 src/operator/rnn-inl.h       |  32 ++++----
 src/operator/rnn.cc          |   5 +-
 src/operator/rnn.cu          |   2 +-
 4 files changed, 77 insertions(+), 99 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 0c943bab7da0..f3bfc1eac1fe 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -24,7 +24,7 @@ class CuDNNRNNOp : public Operator {
     init_cudnn_ = false;
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
     // Defaults
-    input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet
+    input_mode_ = CUDNN_LINEAR_INPUT;  // Don't support this yet
     // RNN Mode
     switch (param_.mode) {
       case rnn_enum::kRnnRelu:
@@ -46,7 +46,7 @@ class CuDNNRNNOp : public Operator {
     direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
     // Other
     param_.pkeep_ = 1.0f - param_.p;
-    if(param_.mode == rnn_enum::kLstm)
+    if (param_.mode == rnn_enum::kLstm)
       param_.lstm_q_ = true;
     else
       param_.lstm_q_ = false;
@@ -54,7 +54,7 @@ class CuDNNRNNOp : public Operator {
 
   ~CuDNNRNNOp() {
     if (init_cudnn_) {
-      for(int i = 0; i < x_desc_vec_.size(); ++i){
+      for (int i = 0; i < x_desc_vec_.size(); ++i) {
         CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
         CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
         CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
@@ -63,18 +63,18 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); 
+      CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); 
+      CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS);
     }
   }
-  
+
   virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
                        const std::vector<TBlob> &aux_args) {
@@ -96,7 +96,7 @@ class CuDNNRNNOp : public Operator {
 
     DType * cx_ptr = NULL;
     DType * cy_ptr = NULL;
-    if (param_.mode == rnn_enum::kLstm){
+    if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
       cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
     }
@@ -107,9 +107,9 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(y.CheckContiguous(), true);
     CHECK_EQ(hy.CheckContiguous(), true);
 
-    if(!init_cudnn_){
+    if (!init_cudnn_) {
       Init(s, in_data, out_data);
-    } 
+    }
 
     // Get temp space
     int temp_size = workspace_size_;
@@ -117,8 +117,8 @@ class CuDNNRNNOp : public Operator {
     Tensor<gpu, 1, DType> temp_space =
       ctx.requested[rnn_enum::kTempSpace].get_space_typed<gpu, 1, DType>(
                               mshadow::Shape1(temp_size), s);
-    
-    if (ctx.is_train) { 
+
+    if (ctx.is_train) {
       CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_,
                        rnn_desc_,
                        param_.seq_length_,
@@ -139,8 +139,7 @@ class CuDNNRNNOp : public Operator {
                        temp_space.dptr_,
                        workspace_byte_,
                        temp_space.dptr_ + workspace_size_,
-                       reserve_space_byte_
-                       ), CUDNN_STATUS_SUCCESS);
+                       reserve_space_byte_), CUDNN_STATUS_SUCCESS);
     } else {
       // inference mode
       CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_,
@@ -161,11 +160,10 @@ class CuDNNRNNOp : public Operator {
                        cy_desc_,
                        cy_ptr,
                        temp_space.dptr_,
-                       workspace_byte_
-                       ), CUDNN_STATUS_SUCCESS);
+                       workspace_byte_), CUDNN_STATUS_SUCCESS);
     }
   }
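The forward path above requests one scratch allocation and carves it up: the first `workspace_size_` elements serve as the cuDNN workspace, and the tail is the training-only reserve space that must survive until the backward call. A hedged sketch of that bookkeeping, with float as a stand-in for DType; the real byte counts come from `cudnnGetRNNWorkspaceSize` and `cudnnGetRNNTrainingReserveSize`:

    #include <cstddef>

    // Split one contiguous buffer into the cuDNN workspace and the training
    // reserve space, as the Forward code above does.
    struct RnnScratch {
      float* workspace;   // first workspace_bytes/sizeof(float) elements
      float* reserve;     // immediately after the workspace; training only
    };

    inline RnnScratch split_scratch(float* base, size_t workspace_bytes,
                                    bool is_train) {
      // note: integer division rounds down, so callers should size the
      // buffer in whole elements, as the patch does with workspace_size_
      size_t workspace_elems = workspace_bytes / sizeof(float);
      RnnScratch s;
      s.workspace = base;
      s.reserve = is_train ? base + workspace_elems : nullptr;
      return s;
    }

At inference time only the workspace half is allocated at all, which is why `temp_size` adds `reserve_space_size_` conditionally on `ctx.is_train`.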
-  
+
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
                         const std::vector<TBlob> &in_grad,
                         const std::vector<TBlob> &aux_args) {
@@ -198,7 +196,7 @@ class CuDNNRNNOp : public Operator {
     // DType * cy_ptr = NULL;
     DType * dcx_ptr = NULL;
     DType * dcy_ptr = NULL;
-    if (param_.mode == rnn_enum::kLstm){
+    if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
       // cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
       dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
@@ -211,9 +209,9 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(y.CheckContiguous(), true);
     CHECK_EQ(hy.CheckContiguous(), true);
 
-    if(!init_cudnn_){
+    if (!init_cudnn_) {
       Init(s, in_data, out_data);
-    } 
+    }
 
     // Get temp space
     int temp_size = workspace_size_;
@@ -247,25 +245,24 @@ class CuDNNRNNOp : public Operator {
                     temp_space.dptr_,
                     workspace_byte_,
                     temp_space.dptr_ + workspace_size_,
-                    reserve_space_byte_
-                    ), CUDNN_STATUS_SUCCESS);
+                    reserve_space_byte_), CUDNN_STATUS_SUCCESS);
-    CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_,
-                    rnn_desc_,
-                    param_.seq_length_,
-                    x_desc_vec_.data(),
-                    x.dptr_,
+    CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_,
+                                     rnn_desc_,
+                                     param_.seq_length_,
+                                     x_desc_vec_.data(),
+                                     x.dptr_,
                                      hx_desc_,
-                    hx.dptr_,
-                    y_desc_vec_.data(),
+                                     hx.dptr_,
+                                     y_desc_vec_.data(),
                                      y.dptr_,
-                    temp_space.dptr_,
-                    workspace_byte_,
-                    dw_desc_,
+                                     temp_space.dptr_,
+                                     workspace_byte_,
+                                     dw_desc_,
                                      dw.dptr_,
-                    temp_space.dptr_ + workspace_size_,
-                    reserve_space_byte_
-                    ), CUDNN_STATUS_SUCCESS);
+                                     temp_space.dptr_ + workspace_size_,
+                                     reserve_space_byte_), CUDNN_STATUS_SUCCESS);
   }
+
  private:
   inline void Init(mshadow::Stream<gpu> *s,
                    const std::vector<TBlob> &in_data,
@@ -299,7 +296,7 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS);
-      
+
       dimA[0] = param_.batch_size_;
       dimA[1] = param_.input_size_;
       dimA[2] = 1;
       dimA[1] = param_.input_size_;
       strideA[0] = dimA[2] * dimA[1];
       strideA[1] = dimA[2];
-      strideA[2] = 1; 
+      strideA[2] = 1;
 
       CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i],
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i],
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
-      dimA[0] = param_.batch_size_; 
+      dimA[0] = param_.batch_size_;
       dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size;
       dimA[2] = 1;
       strideA[0] = dimA[2] * dimA[1];
@@ -332,21 +327,19 @@ class CuDNNRNNOp : public Operator {
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i],
                         dtype_,
                         3,
                         dimA,
-                        strideA
-                        ), CUDNN_STATUS_SUCCESS);
+                        strideA), CUDNN_STATUS_SUCCESS);
     }
     x_desc_vec_ = x_vec;
     y_desc_vec_ = y_vec;
     dx_desc_vec_ = dx_vec;
     dy_desc_vec_ = dy_vec;
-    // set the state tensors 
+    // set the state tensors
     dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1);
     dimA[1] = param_.batch_size_;
     dimA[2] = param_.state_size;
@@ -367,64 +360,55 @@ class CuDNNRNNOp : public Operator {
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_,
                       dtype_,
                       3,
                       dimA,
-                      strideA
-                      ), CUDNN_STATUS_SUCCESS);
+                      strideA), CUDNN_STATUS_SUCCESS);
 
     // Create Dropout descriptors
     CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
-    CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, 
-                      &dropout_byte_
-                      ), CUDNN_STATUS_SUCCESS);
+    CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
+                                       &dropout_byte_), CUDNN_STATUS_SUCCESS);
     dropout_size_ = dropout_byte_ / sizeof(DType);
     CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
                       s->dnn_handle_,
-                      param_.pkeep_, // keep probability
+                      param_.pkeep_,  // keep probability
                       NULL,
                       dropout_byte_,
                       seed_), CUDNN_STATUS_SUCCESS);
-    // RNN descriptors 
+    // RNN descriptors
     CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_,
                       param_.state_size,
                       param_.num_layers,
                       dropout_desc_,
                       input_mode_,
                       direction_,
                       mode_,
                       dtype_), CUDNN_STATUS_SUCCESS);
-    // Get temp space sizes 
+    // Get temp space sizes
     CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_,
                       rnn_desc_,
                       param_.seq_length_,
                       x_desc_vec_.data(),
-                      &workspace_byte_
-                      ), CUDNN_STATUS_SUCCESS);
+                      &workspace_byte_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_,
                       rnn_desc_,
                       param_.seq_length_,
                       x_desc_vec_.data(),
-                      &reserve_space_byte_
-                      ), CUDNN_STATUS_SUCCESS);
+                      &reserve_space_byte_), CUDNN_STATUS_SUCCESS);
     workspace_size_ = workspace_byte_ / sizeof(DType);
     reserve_space_size_ = reserve_space_byte_ / sizeof(DType);
@@ -467,15 +449,12 @@ class CuDNNRNNOp : public Operator {
                       dtype_,
                       format_,
                       3,
-                      dim_w
-                      ), CUDNN_STATUS_SUCCESS);
+                      dim_w), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_,
                       dtype_,
                       format_,
                       3,
-                      dim_w
-                      ), CUDNN_STATUS_SUCCESS);
-    
+                      dim_w), CUDNN_STATUS_SUCCESS);
   }
 }
@@ -486,7 +465,7 @@ class CuDNNRNNOp : public Operator {
   cudnnDirectionMode_t direction_;
   cudnnRNNInputMode_t input_mode_;
   cudnnDropoutDescriptor_t dropout_desc_;
-  unsigned long long seed_ = 4553;
+  unsigned long long seed_ = 1337ull;
   size_t workspace_byte_, reserve_space_byte_, dropout_byte_;
   int workspace_size_, reserve_space_size_, dropout_size_;
@@ -496,7 +475,7 @@ class CuDNNRNNOp : public Operator {
   cudnnTensorDescriptor_t dhx_desc_, dcx_desc_;
   cudnnTensorDescriptor_t dhy_desc_, dcy_desc_;
 
-  cudnnFilterDescriptor_t w_desc_, dw_desc_; 
+  cudnnFilterDescriptor_t w_desc_, dw_desc_;
 
 #if CUDNN_MAJOR == 5
   cudnnTensorFormat_t format_;
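The cuDNN 5 RNN API consumes a vector of per-time-step 3-D descriptors, and the Init code above relies on fully packed [batch, feature, 1] strides for each of them. A compact sketch of that dims-to-strides rule; it uses no cuDNN calls so it runs anywhere, and the example sizes are illustrative:

    #include <cassert>

    // Fully packed strides for a 3-D tensor: the innermost stride is 1, and
    // each outer stride is the product of the dims it skips over.
    static void packed_strides(const int dim[3], int stride[3]) {
      stride[2] = 1;
      stride[1] = dim[2];
      stride[0] = dim[2] * dim[1];
    }

    int main() {
      // one time step of input: [batch, input_size, 1], e.g. batch 32, input 128
      int dimA[3] = {32, 128, 1};
      int strideA[3];
      packed_strides(dimA, strideA);
      assert(strideA[0] == 128 && strideA[1] == 1 && strideA[2] == 1);
      // element (b, f, 0) lives at offset b*strideA[0] + f*strideA[1]
      return 0;
    }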
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index d036e299e519..2c7d20fe279c 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -30,13 +30,12 @@ namespace rnn_enum {
 // A utility function to calculate input size
 inline int rnn_single_param_size(int inputSize,
                                  int hiddenSize,
-                                 int mode){
+                                 int mode) {
   int size = hiddenSize * (hiddenSize + inputSize + 2);
   // Different RNN's have different num weights
-  switch(mode)
-  {
+  switch (mode) {
     case rnn_enum::kRnnRelu:
-      size *= 1 ;
+      size *= 1;
       break;
     case rnn_enum::kRnnTanh:
       size *= 1;
@@ -55,16 +54,16 @@ inline int rnn_param_size(int layerNum,
                           int inputSize,
                           int hiddenSize,
                           bool bidirectional,
-                          int mode){
+                          int mode) {
   // get size of first layer
   int size = rnn_single_param_size(inputSize, hiddenSize, mode);
   // get size of remaining layers
-  if(bidirectional){
+  if (bidirectional) {
     size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode);
     size *= 2;
+  } else {
+    size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode);
   }
-  else
-    size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode);
   return size;
 }
@@ -75,7 +74,7 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
   int mode;
   float p, pkeep_;
   int seq_length_, batch_size_, input_size_;
-  bool lstm_q_; // whether type is lstm
+  bool lstm_q_;  // whether type is lstm
 
   DMLC_DECLARE_PARAMETER(RNNParam) {
     DMLC_DECLARE_FIELD(state_size)
@@ -93,14 +92,13 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
     .add_enum("lstm", rnn_enum::kLstm)
     .add_enum("gru", rnn_enum::kGru)
     .describe("the type of RNN to compute");
-    
+
     DMLC_DECLARE_FIELD(p).set_default(0.)
     .set_range(0, 1)
     .describe("Fraction of the input that gets dropped out at training time");
 
     DMLC_DECLARE_FIELD(state_outputs).set_default(false)
     .describe("Whether to have the states as symbol outputs.");
-
   }
 };
@@ -117,7 +115,7 @@ class RNNOp : public Operator {
                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    // TODO: add MShadow implementation
+    // TODO(sbodenstein): add MShadow implementation
   }
 
   virtual void Backward(const OpContext &ctx,
@@ -129,7 +127,7 @@ class RNNOp : public Operator {
                         const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    // TODO: add MShadow implementation
+    // TODO(sbodenstein): add MShadow implementation
   }
 
  private:
@@ -153,14 +151,14 @@ class RNNProp : public OperatorProperty {
   std::vector<std::string> ListOutputs() const override {
     if (param_.mode == rnn_enum::kLstm)
       return {"output", "state", "state_cell"};
-    else 
+    else
       return {"output", "state"};
   }
 
   int NumOutputs() const override {
     if (param_.mode == rnn_enum::kLstm)
       return 3;
-    else 
+    else
       return 2;
   }
 
@@ -195,7 +193,7 @@ class RNNProp : public OperatorProperty {
     int batch_size = dshape[1];
     int input_size = dshape[2];
     int numDirections = param_.bidirectional ? 2 : 1;
-    int total_layers = numDirections * param_.num_layers; // double for bidirectional
+    int total_layers = numDirections * param_.num_layers;  // double for bidirectional
 
     SHAPE_ASSIGN_CHECK(*in_shape,
               rnn_enum::kState,
               Shape3(total_layers, batch_size, param_.state_size));
@@ -223,7 +221,7 @@ class RNNProp : public OperatorProperty {
     out_shape->push_back(oshape);
     out_shape->push_back(outStateShape);
     // Deal with lstm cell state
-    if(param_.mode == rnn_enum::kLstm)
+    if (param_.mode == rnn_enum::kLstm)
       out_shape->push_back(outStateShape);
     return true;
   }
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index 337410c8ddc1..3067c8e986c1 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -19,8 +19,9 @@ Operator *CreateOp<cpu>(RNNParam param, int dtype) {
   return op;
 }
 
-Operator *RNNProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-    std::vector<int> *in_type) const {
+Operator *RNNProp::CreateOperatorEx(Context ctx,
+                                    std::vector<TShape> *in_shape,
+                                    std::vector<int> *in_type) const {
   std::vector<TShape> out_shape, aux_shape;
   std::vector<int> out_type, aux_type;
   CHECK(InferType(in_type, &out_type, &aux_type));
diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu
index fb90daf19b41..bf914026019d 100644
--- a/src/operator/rnn.cu
+++ b/src/operator/rnn.cu
@@ -21,7 +21,7 @@ Operator* CreateOp<gpu>(RNNParam param, int dtype) {
     op = new CuDNNRNNOp<DType>(param);
   })
 #else
-  LOG(FATAL) << "RNN is only available for cuDNN at the moment."; 
+  LOG(FATAL) << "RNN is only available for cuDNN at the moment.";
 #endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
   return op;
 }
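The rnn.cc/rnn.cu split touched above follows the usual MXNet pattern: a dtype switch instantiates the concrete operator, and the GPU path is compiled in only when cuDNN 5 is present. A hedged sketch of the dispatch shape, using local stand-ins rather than the real `MSHADOW_REAL_TYPE_SWITCH` macro and operator classes:

    #include <cstdio>

    enum DTypeFlag { kFloat32 = 0, kFloat64 = 1 };  // stand-in type flags

    struct Op { virtual ~Op() {} };
    template <typename DType> struct RnnOp : Op {};  // stand-in operator

    // MSHADOW_REAL_TYPE_SWITCH expands to a switch of roughly this shape,
    // binding DType to the concrete C++ type for the requested flag.
    Op* create_op(int dtype) {
      Op* op = nullptr;
      switch (dtype) {
        case kFloat32: op = new RnnOp<float>(); break;
        case kFloat64: op = new RnnOp<double>(); break;
        default: std::fprintf(stderr, "unsupported dtype\n");
      }
      return op;
    }

    int main() {
      Op* op = create_op(kFloat32);
      delete op;
      return 0;
    }

The compile-time `#if MXNET_USE_CUDNN` guard plays the same role one level up: without cuDNN there is simply no GPU implementation to dispatch to, hence the LOG(FATAL).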
From 27b23d25f276b7670b96824bca57cee63595caa3 Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 15:47:41 +0200
Subject: [PATCH 31/36] - correct handling of backward dependencies

---
 src/operator/cudnn_rnn-inl.h | 27 +++++++++++++++------------
 src/operator/rnn-inl.h       | 20 ++++++++++++++++----
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index f3bfc1eac1fe..3f63bc4de0f5 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -187,27 +187,30 @@ class CuDNNRNNOp : public Operator {
     Tensor<gpu, 1, DType> dw = in_grad[rnn_enum::kParams].get<gpu, 1, DType>(s);
     Tensor<gpu, 3, DType> hx = in_data[rnn_enum::kState].get<gpu, 3, DType>(s);
     Tensor<gpu, 3, DType> dhx = in_grad[rnn_enum::kState].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> hy = in_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> dhy = out_grad[rnn_enum::kStateOut].get<gpu, 3, DType>(s);
     Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
     Tensor<gpu, 3, DType> dy = out_grad[rnn_enum::kOut].get<gpu, 3, DType>(s);
 
-    DType * cx_ptr = NULL;
-    // DType * cy_ptr = NULL;
-    DType * dcx_ptr = NULL;
-    DType * dcy_ptr = NULL;
-    if (param_.mode == rnn_enum::kLstm) {
+    // only need kStateOut grad if state_outputs is true
+    void * dhy_ptr = NULL;
+    if (param_.state_outputs)
+      dhy_ptr = out_grad[rnn_enum::kStateOut].get<gpu, 3, DType>(s).dptr_;
+
+    // Deal with lstm
+    void * dcx_ptr = NULL;
+    void * dcy_ptr = NULL;
+    void * cx_ptr = NULL;
+
+    if(param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
-      // cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
       dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
-      dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
     }
-    
+    if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs)
+      dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
+
     CHECK_EQ(x.CheckContiguous(), true);
     CHECK_EQ(w.CheckContiguous(), true);
     CHECK_EQ(hx.CheckContiguous(), true);
     CHECK_EQ(y.CheckContiguous(), true);
-    CHECK_EQ(hy.CheckContiguous(), true);
 
     if (!init_cudnn_) {
       Init(s, in_data, out_data);
@@ -227,7 +230,7 @@ class CuDNNRNNOp : public Operator {
                     dy_desc_vec_.data(),
                     dy.dptr_,
                     dhy_desc_,
-                    dhy.dptr_,
+                    dhy_ptr,
                     dcy_desc_,
                     dcy_ptr,
                     w_desc_,
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 2c7d20fe279c..91284074b5d4 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -264,10 +264,22 @@ class RNNProp : public OperatorProperty {
       const std::vector<int> &out_grad,
       const std::vector<int> &in_data,
       const std::vector<int> &out_data) const override {
-    if (param_.mode == rnn_enum::kLstm)
-      return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]};
-    else
-      return {out_grad[rnn_enum::kOut], in_data[rnn_enum::kData], in_data[rnn_enum::kParams]};
+    std::vector<int> dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams],
+        in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]};
+
+    if (param_.state_outputs) {
+      dep.push_back(out_data[rnn_enum::kStateOut]);
+      dep.push_back(out_grad[rnn_enum::kStateOut]);
+    }
+
+    if (param_.mode == rnn_enum::kLstm) {
+      dep.push_back(in_data[rnn_enum::kStateCell]);
+      if(param_.state_outputs) {
+        dep.push_back(out_data[rnn_enum::kStateCellOut]);
+        dep.push_back(out_grad[rnn_enum::kStateCellOut]);
+      }
+    }
+    return dep;
   }
 
   std::vector<ResourceRequest> ForwardResource(
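Getting this dependency list right matters: under-declaring lets the engine free buffers the backward pass still reads, while over-declaring pins memory needlessly. A small sketch that makes the resulting sets explicit for the four mode/state_outputs combinations; the string IDs are mine, standing in for the integer variable IDs the real method returns:

    #include <cassert>
    #include <string>
    #include <vector>

    enum Mode { kRnnTanh, kLstm };

    // Mirrors the rewritten DeclareBackwardDependency: backward always needs
    // the inputs, the forward output and its gradient; the state outputs and
    // the LSTM cell state add entries only when they actually exist.
    static std::vector<std::string> backward_deps(Mode mode, bool state_outputs) {
      std::vector<std::string> dep = {"data", "params", "state",
                                      "out", "grad_out"};
      if (state_outputs) {
        dep.push_back("state_out");
        dep.push_back("grad_state_out");
      }
      if (mode == kLstm) {
        dep.push_back("state_cell");
        if (state_outputs) {
          dep.push_back("cell_out");
          dep.push_back("grad_cell_out");
        }
      }
      return dep;
    }

    int main() {
      assert(backward_deps(kRnnTanh, false).size() == 5);
      assert(backward_deps(kLstm, true).size() == 10);
      return 0;
    }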
From 2b5f26d0666277e59ad56c7a2da3e1625a38ffea Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 15:55:56 +0200
Subject: [PATCH 32/36] - fix lint

---
 src/operator/cudnn_rnn-inl.h | 6 +++---
 src/operator/rnn-inl.h       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 3f63bc4de0f5..d5deca2af2f8 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -197,16 +197,16 @@ class CuDNNRNNOp : public Operator {
 
     // Deal with lstm
     void * dcx_ptr = NULL;
-    void * dcy_ptr = NULL; 
+    void * dcy_ptr = NULL;
     void * cx_ptr = NULL;
 
-    if(param_.mode == rnn_enum::kLstm) {
+    if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
       dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
     }
     if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs)
       dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
-    
+
     CHECK_EQ(x.CheckContiguous(), true);
     CHECK_EQ(w.CheckContiguous(), true);
     CHECK_EQ(hx.CheckContiguous(), true);
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 91284074b5d4..ad4d21736345 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -274,7 +274,7 @@ class RNNProp : public OperatorProperty {
 
     if (param_.mode == rnn_enum::kLstm) {
       dep.push_back(in_data[rnn_enum::kStateCell]);
-      if(param_.state_outputs) {
+      if (param_.state_outputs) {
         dep.push_back(out_data[rnn_enum::kStateCellOut]);
         dep.push_back(out_grad[rnn_enum::kStateCellOut]);
       }

From ccd7004307487c1f479545fe641b2bf6d00d53ba Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Thu, 21 Jul 2016 16:53:36 +0200
Subject: [PATCH 33/36] - fix type narrowing bug

---
 src/operator/cudnn_rnn-inl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index d5deca2af2f8..666c2a94e717 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -447,7 +447,8 @@ class CuDNNRNNOp : public Operator {
     // Set param descriptors
     CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS);
-    int dim_w[3] = {w.shape_[0], 1, 1};
+    int dim_w[3] = {1, 1, 1};
+    dim_w[0] = w.shape_[0];
    CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_,
                       dtype_,
                       format_,
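The "type narrowing bug" fixed above is a C++11 rule worth spelling out: `w.shape_[0]` is an unsigned `index_t`, and a braced initializer list rejects implicit narrowing conversions, so `int dim_w[3] = {w.shape_[0], 1, 1};` is ill-formed (a hard error under -Werror). Plain assignment after aggregate initialization sidesteps the rule, since narrowing is only banned inside the braces. A minimal reproduction, with a local stand-in for `index_t`:

    #include <cstdint>

    int main() {
      uint32_t n = 42;            // stand-in for mshadow::index_t w.shape_[0]
      // int bad[3] = {n, 1, 1};  // ill-formed: narrowing uint32_t -> int in {...}
      int dim_w[3] = {1, 1, 1};   // the patch's fix:
      dim_w[0] = n;               // plain assignment may narrow; no diagnostic
      return dim_w[0] == 42 ? 0 : 1;
    }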
From 8fd0d92e7b2c465c45ffba1edf3123cf7cd8cbef Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Sun, 24 Jul 2016 02:38:21 +0200
Subject: [PATCH 34/36] - fixed incorrect dropout parameter
 - added dropout states
 - fixed incorrect handling of variable outputs

---
 src/operator/cudnn_rnn-inl.h | 30 +++++++++++++-------
 src/operator/rnn-inl.h       | 54 ++++++++++++++++++++----------------
 2 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 666c2a94e717..e154a8af4740 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -45,7 +45,6 @@ class CuDNNRNNOp : public Operator {
     // RNN Direction
     direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
     // Other
-    param_.pkeep_ = 1.0f - param_.p;
     if (param_.mode == rnn_enum::kLstm)
       param_.lstm_q_ = true;
     else
@@ -72,6 +71,7 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudaFree(dropout_states_), CUDNN_STATUS_SUCCESS);
     }
   }
 
@@ -83,6 +83,9 @@ class CuDNNRNNOp : public Operator {
     using namespace mshadow;
     size_t in_expected = param_.lstm_q_ ? 4 : 3;
     size_t out_expected = param_.lstm_q_ ? 3 : 2;
+    if (!param_.state_outputs)
+      out_expected = 1;
+
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -90,9 +93,11 @@ class CuDNNRNNOp : public Operator {
     Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
     Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
     Tensor<gpu, 3, DType> hx = in_data[rnn_enum::kState].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
-    Tensor<gpu, 3, DType> hy = out_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
+
+    void * hy_ptr = NULL;
+    if (param_.state_outputs)
+      hy_ptr = out_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s).dptr_;
 
     DType * cx_ptr = NULL;
     DType * cy_ptr = NULL;
@@ -105,19 +110,16 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(w.CheckContiguous(), true);
     CHECK_EQ(hx.CheckContiguous(), true);
     CHECK_EQ(y.CheckContiguous(), true);
-    CHECK_EQ(hy.CheckContiguous(), true);
 
     if (!init_cudnn_) {
       Init(s, in_data, out_data);
     }
-
     // Get temp space
     int temp_size = workspace_size_;
     temp_size += ctx.is_train ? reserve_space_size_ : 0;
     Tensor<gpu, 1, DType> temp_space =
       ctx.requested[rnn_enum::kTempSpace].get_space_typed<gpu, 1, DType>(
                               mshadow::Shape1(temp_size), s);
-
     if (ctx.is_train) {
       CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_,
                        rnn_desc_,
@@ -133,7 +135,7 @@ class CuDNNRNNOp : public Operator {
                        y_desc_vec_.data(),
                        y.dptr_,
                        hy_desc_,
-                       hy.dptr_,
+                       hy_ptr,
                        cy_desc_,
                        cy_ptr,
                        temp_space.dptr_,
@@ -156,7 +158,7 @@ class CuDNNRNNOp : public Operator {
                        y_desc_vec_.data(),
                        y.dptr_,
                        hy_desc_,
-                       hy.dptr_,
+                       hy_ptr,
                        cy_desc_,
                        cy_ptr,
                        temp_space.dptr_,
@@ -174,6 +176,9 @@ class CuDNNRNNOp : public Operator {
     using namespace mshadow;
     size_t in_expected = param_.lstm_q_ ? 4 : 3;
     size_t out_expected = param_.lstm_q_ ? 3 : 2;
+    if (!param_.state_outputs)
+      out_expected = 1;
+
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
     CHECK_EQ(in_grad.size(), in_expected);
@@ -276,6 +281,9 @@ class CuDNNRNNOp : public Operator {
 #endif
     size_t in_expected = param_.lstm_q_ ? 4 : 3;
     size_t out_expected = param_.lstm_q_ ? 3 : 2;
+    if (!param_.state_outputs)
+      out_expected = 1;
+
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
     if (!init_cudnn_) {
@@ -405,10 +413,11 @@ class CuDNNRNNOp : public Operator {
     CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
                                        &dropout_byte_), CUDNN_STATUS_SUCCESS);
     dropout_size_ = dropout_byte_ / sizeof(DType);
+    CHECK_EQ(cudaMalloc(&dropout_states_, dropout_byte_), CUDNN_STATUS_SUCCESS);
     CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
                       s->dnn_handle_,
-                      param_.pkeep_,  // keep probability
-                      NULL,
+                      param_.p,  // keep probability
+                      dropout_states_,
                       dropout_byte_,
                       seed_), CUDNN_STATUS_SUCCESS);
     // RNN descriptors
@@ -469,6 +478,7 @@ class CuDNNRNNOp : public Operator {
   cudnnDirectionMode_t direction_;
   cudnnRNNInputMode_t input_mode_;
   cudnnDropoutDescriptor_t dropout_desc_;
+  void *dropout_states_;
   unsigned long long seed_ = 1337ull;
   size_t workspace_byte_, reserve_space_byte_, dropout_byte_;
   int workspace_size_, reserve_space_size_, dropout_size_;
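Two things changed in the dropout setup: the descriptor now receives a real device buffer for its RNG state (previously NULL), and the probability argument. cuDNN's `cudnnSetDropoutDescriptor` takes the probability of *dropping* an activation, so passing `param_.p` directly, rather than the old `pkeep_ = 1 - p`, is the substantive fix; the surviving "keep probability" comment in the diff is stale. A hedged CUDA sketch of the allocation and lifetime pattern the patch establishes (error handling trimmed; `handle` is assumed to be a valid cudnnHandle_t created elsewhere):

    #include <cudnn.h>
    #include <cuda_runtime.h>

    // The states buffer must stay alive for as long as the descriptor is in
    // use, and is freed alongside the descriptor (as the destructor now does).
    cudnnDropoutDescriptor_t make_dropout(cudnnHandle_t handle, float drop_prob,
                                          unsigned long long seed,
                                          void** states_out) {
      cudnnDropoutDescriptor_t desc;
      size_t state_bytes = 0;
      cudnnCreateDropoutDescriptor(&desc);
      cudnnDropoutGetStatesSize(handle, &state_bytes);  // per-handle state size
      cudaMalloc(states_out, state_bytes);              // device-side RNG state
      cudnnSetDropoutDescriptor(desc, handle,
                                drop_prob,              // probability to DROP
                                *states_out, state_bytes,
                                seed);
      return desc;
    }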
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index ad4d21736345..a70138adb7ce 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -149,20 +149,17 @@ class RNNProp : public OperatorProperty {
   }
 
   std::vector<std::string> ListOutputs() const override {
-    if (param_.mode == rnn_enum::kLstm)
-      return {"output", "state", "state_cell"};
+    std::vector<std::string> outputs = {"output"};
+    if (!param_.state_outputs)
+      return outputs;
     else
-      return {"output", "state"};
-  }
-
-  int NumOutputs() const override {
+      outputs.push_back("state");
     if (param_.mode == rnn_enum::kLstm)
-      return 3;
-    else
-      return 2;
+      outputs.push_back("state_cell");
+    return outputs;
   }
 
-  int NumVisibleOutputs() const override {
+  int NumOutputs() const override {
     int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1;
     int num_outputs = param_.state_outputs ? (mode_num + 1) : 1;
     return num_outputs;
@@ -209,21 +206,26 @@ class RNNProp : public OperatorProperty {
                     param_.bidirectional,
                     param_.mode);
     SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size));
+
+    out_shape->clear();
     // output: [sequence len, batch, output size]
     TShape oshape = dshape;
     oshape[2] = numDirections * param_.state_size;
-    TShape outStateShape = dshape;
-    outStateShape[0] = total_layers;
-    outStateShape[1] = batch_size;
-    outStateShape[2] = param_.state_size;
-
-    out_shape->clear();
     out_shape->push_back(oshape);
-    out_shape->push_back(outStateShape);
-    // Deal with lstm cell state
-    if (param_.mode == rnn_enum::kLstm)
+    if (!param_.state_outputs) {
+      return true;
+    } else {
+      // outStateShape: [layer_num, batch, state size]
+      TShape outStateShape = dshape;
+      outStateShape[0] = total_layers;
+      outStateShape[1] = batch_size;
+      outStateShape[2] = param_.state_size;
       out_shape->push_back(outStateShape);
-    return true;
+      // Deal with lstm cell state
+      if (param_.mode == rnn_enum::kLstm)
+        out_shape->push_back(outStateShape);
+      return true;
+    }
   }
 
   bool InferType(std::vector<int> *in_type,
@@ -243,11 +245,15 @@ class RNNProp : public OperatorProperty {
     }
     out_type->clear();
     out_type->push_back(dtype);
-    out_type->push_back(dtype);
-    // Deal with lstm cell state
-    if (param_.mode == rnn_enum::kLstm)
+    if (!param_.state_outputs) {
+      return true;
+    } else {
       out_type->push_back(dtype);
-    return true;
+      // Deal with lstm cell state
+      if (param_.mode == rnn_enum::kLstm)
+        out_type->push_back(dtype);
+      return true;
+    }
   }
 
   OperatorProperty* Copy() const override {
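The reshaped output list is easiest to verify with concrete numbers: `output` is [seq_len, batch, directions * state_size], while each emitted state is [layers * directions, batch, state_size]. A sketch of the shape computation mirroring InferShape above; the sizes in main() are illustrative only:

    #include <cassert>

    struct Shapes {
      int out[3];    // [seq_len, batch, directions * state_size]
      int state[3];  // [num_layers * directions, batch, state_size]
    };

    static Shapes rnn_output_shapes(int seq_len, int batch, int state_size,
                                    int num_layers, bool bidirectional) {
      int dirs = bidirectional ? 2 : 1;
      Shapes s;
      s.out[0] = seq_len;
      s.out[1] = batch;
      s.out[2] = dirs * state_size;        // fwd+bwd features concatenated
      s.state[0] = num_layers * dirs;      // one state slab per layer-direction
      s.state[1] = batch;
      s.state[2] = state_size;
      return s;
    }

    int main() {
      // seq 35, batch 20, hidden 200, 2 layers, bidirectional
      Shapes s = rnn_output_shapes(35, 20, 200, 2, true);
      assert(s.out[2] == 400);
      assert(s.state[0] == 4);
      return 0;
    }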
From 4f46668590822a1446e3b8cf6b390180f9fde200 Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Sun, 24 Jul 2016 11:05:49 +0200
Subject: [PATCH 35/36] - fix incorrect cell state forward handling

---
 src/operator/cudnn_rnn-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index e154a8af4740..69e092aa6d6a 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -103,7 +103,7 @@ class CuDNNRNNOp : public Operator {
     DType * cy_ptr = NULL;
     if (param_.mode == rnn_enum::kLstm) {
       cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
-      cy_ptr = (in_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
+      cy_ptr = (out_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
     }
 
     CHECK_EQ(x.CheckContiguous(), true);

From 3c50c5c25f23fb826489d761c282122f63753b8e Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein
Date: Sun, 24 Jul 2016 23:53:54 +0200
Subject: [PATCH 36/36] - fixed lint by replacing unsigned long long with
 uint64_t

---
 src/operator/cudnn_rnn-inl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 69e092aa6d6a..5707846a781f 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <cstdint>
 #include "./rnn-inl.h"
 
 namespace mxnet {
@@ -479,7 +480,7 @@ class CuDNNRNNOp : public Operator {
   cudnnRNNInputMode_t input_mode_;
   cudnnDropoutDescriptor_t dropout_desc_;
   void *dropout_states_;
-  unsigned long long seed_ = 1337ull;
+  uint64_t seed_ = 1337ull;
   size_t workspace_byte_, reserve_space_byte_, dropout_byte_;
   int workspace_size_, reserve_space_size_, dropout_size_;