NVIDIA
diff --git a/‎tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
Lines changed: 4 additions & 3 deletions b/‎tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
Lines changed: 4 additions & 3 deletions
diff --git a/‎tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
Lines changed: 2 additions & 1 deletion b/‎tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
Lines changed: 2 additions & 1 deletion
diff --git a/‎tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
Lines changed: 68 additions & 20 deletions b/‎tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
Lines changed: 68 additions & 20 deletions
diff --git a/‎tensorflow/python/kernel_tests/BUILD
Lines changed: 23 additions & 5 deletions b/‎tensorflow/python/kernel_tests/BUILD
Lines changed: 23 additions & 5 deletions
@@ -137,7 +137,7 @@ class SparseTensorDenseMatMulOp : public OpKernel {
   if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) {                        \
     Status functor_status = functor::SparseTensorDenseMatMulFunctor<       \
         Device, T, Tindices, ADJ_A,                                        \
-        ADJ_B>::Compute(ctx->eigen_device<Device>(), out->matrix<T>(),     \
+        ADJ_B>::Compute(ctx, out->matrix<T>(),     \
                         a_indices->matrix<Tindices>(), a_values->vec<T>(), \
                         b->matrix<T>());                                   \
     OP_REQUIRES_OK(ctx, functor_status);                                   \
@@ -183,7 +183,7 @@ namespace functor {
   template <>                                                             \
   Status SparseTensorDenseMatMulFunctor<                                  \
       GPUDevice, T, Tindices, ADJ_A,                                      \
-      ADJ_B>::Compute(const GPUDevice& d, typename TTypes<T>::Matrix out, \
+      ADJ_B>::Compute(OpKernelContext* ctx, typename TTypes<T>::Matrix out, \
                       TTypes<Tindices>::ConstMatrix a_indices,            \
                       typename TTypes<T>::ConstVec a_values,              \
                       typename TTypes<T>::ConstMatrix b);                 \
@@ -246,10 +246,11 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, Tindices, ADJ_A, ADJ_B> {
   // Vectorize certain operations above this size.
   static const std::size_t kNumVectorize = 32;
 
-  static Status Compute(const CPUDevice& d, typename TTypes<T>::Matrix out,
+  static Status Compute(OpKernelContext* context, typename TTypes<T>::Matrix out,
                         typename TTypes<Tindices>::ConstMatrix a_indices,
                         typename TTypes<T>::ConstVec a_values,
                         typename TTypes<T>::ConstMatrix b) {
+    const CPUDevice d = context->eigen_device<CPUDevice>();
     const std::size_t nnz = a_values.size();
     const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
     const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
 
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -29,7 +30,7 @@ template <typename Device, typename T, typename Tindices, bool ADJ_A,
           bool ADJ_B>
 struct SparseTensorDenseMatMulFunctor {
   static EIGEN_ALWAYS_INLINE Status Compute(
-      const Device& d, typename TTypes<T>::Matrix out,
+      OpKernelContext* context, typename TTypes<T>::Matrix out,
       typename TTypes<Tindices>::ConstMatrix a_indices,
       typename TTypes<T>::ConstVec a_values, typename TTypes<T>::ConstMatrix b);
 };
 
@@ -20,18 +20,29 @@ limitations under the License.
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
+#include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/gpu_device_functions.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
+// Narrows `size` double-precision values from `src` into single-precision
+// values in `dst`, one element at a time (dst[i] = float(src[i])).
+//
+// Launch layout: 1-D. Indices are produced by GPU_1D_KERNEL_LOOP over
+// [0, size); no shared memory is used.
+// Precondition: `src` and `dst` must not overlap, since both are declared
+// __restrict__.
+__global__ void DownCast(const int size, const double* __restrict__ src,
+                         float* __restrict__ dst) {
+  GPU_1D_KERNEL_LOOP(index, size) {
+    // Explicit narrowing conversion from double to float.
+    dst[index] = static_cast<float>(src[index]);
+  }
+}
+
+template <typename Tin, typename Tindices, typename Tout,
+          bool ADJ_A, bool ADJ_B>
 __global__ void SparseTensorDenseMatMulKernel(int nnz, int m, int b_rows,
                                               int b_cols, int p,
                                               const Tindices* a_indices,
-                                              const T* a_values, const T* b,
-                                              T* out) {
+                                              const Tin* a_values, const Tin* b,
+                                              Tout* out) {
   // out_{ij} = sum_k {a_ik b_kj}
   // out = A * B', out_{ij} = sum_k {a_ik (b')_kj}; b'_{kj} = b_{jk}
   const int n = (ADJ_B) ? b_cols : b_rows;
@@ -44,31 +55,42 @@ __global__ void SparseTensorDenseMatMulKernel(int nnz, int m, int b_rows,
       continue;  // Nowhere to signal an error :(
     }
     // out[i, j]
-    T* out_location = out + i * p + j;
+    Tout* out_location = out + i * p + j;
     if (!FastBoundsCheck(k, n)) {
-      GpuAtomicAdd(out_location, std::numeric_limits<T>::quiet_NaN());
+      GpuAtomicAdd(out_location, std::numeric_limits<Tout>::quiet_NaN());
       continue;
     }
 
     // a_value == (ADJ_A) ? a[k, i] : a[i, k]
-    const T a_value = ldg(a_values + a_ix);
+    const Tin a_value = ldg(a_values + a_ix);
 
     // b_value == (ADJ_B) ? b[j, k] : b[k, j]
-    const T b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j));
-    GpuAtomicAdd(out_location, a_value * b_value);
+    const Tin b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j));
+    GpuAtomicAdd(out_location, (Tout)(a_value * b_value));
   }
 }
 
 namespace functor {
 
-template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
-struct SparseTensorDenseMatMulFunctor<GPUDevice, T, Tindices, ADJ_A, ADJ_B> {
+// Returns whether the TF_DETERMINISTIC_OPS environment variable requests
+// deterministic op behaviour. The variable is read once, on the first call,
+// and the result is cached in a function-local static; the default when the
+// variable is unset is false.
+bool RequireDeterminism() {
+  static const bool kDeterminismRequired = []() -> bool {
+    bool env_value = false;
+    // TF_CHECK_OK is applied to the Status returned by the env-var reader.
+    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
+        "TF_DETERMINISTIC_OPS", /*default_val=*/false, &env_value));
+    return env_value;
+  }();
+  return kDeterminismRequired;
+}
+
+template <typename Tindices, bool ADJ_A, bool ADJ_B>
+struct SparseTensorDenseMatMulFunctor<GPUDevice, float, Tindices, ADJ_A, ADJ_B> {
   static EIGEN_ALWAYS_INLINE Status
-  Compute(const GPUDevice& d, typename TTypes<T>::Matrix out,
+  Compute(OpKernelContext* context, typename TTypes<float>::Matrix out,
           typename TTypes<Tindices>::ConstMatrix a_indices,
-          typename TTypes<T>::ConstVec a_values,
-          typename TTypes<T>::ConstMatrix b) {
-    out.device(d) = out.constant(T(0));
+          typename TTypes<float>::ConstVec a_values,
+          typename TTypes<float>::ConstMatrix b) {
+    const GPUDevice d = context->eigen_device<GPUDevice>();
     int nnz = a_values.size();
     // out = A * B, A is [m x n] and B is [n x p], out is [m x p]
     int m = out.dimension(0);
@@ -80,12 +102,38 @@ struct SparseTensorDenseMatMulFunctor<GPUDevice, T, Tindices, ADJ_A, ADJ_B> {
     // out.size()?  Perhaps p * nnz ?
     GpuLaunchConfig config = GetGpuLaunchConfig(p * nnz, d);
 
-    TF_CHECK_OK(GpuLaunchKernel(
-        SparseTensorDenseMatMulKernel<T, Tindices, ADJ_A, ADJ_B>,
-        config.block_count, config.thread_per_block, 0, d.stream(), nnz, m,
-        b_rows, b_cols, p, a_indices.data(), a_values.data(), b.data(),
-        out.data()));
-
+    // NOTE(review): `config` was computed for p * nnz work items, yet the
+    // SetZero and DownCast launches below process m * p elements with it.
+    // This presumes those kernels cover every element regardless of the
+    // launch size - confirm.
+    if (RequireDeterminism()) {
+      // Deterministic mode: accumulate into a double-precision scratch matrix
+      // of shape [m, p], then narrow the result into the float output.
+      // NOTE(review): the matmul kernel still accumulates with GpuAtomicAdd;
+      // confirm that double accumulation alone yields the run-to-run
+      // reproducibility this path is meant to provide.
+      Tensor temp_buffer;
+      const TensorShape accum_shape({m, p});
+
+      // TF_RETURN_IF_ERROR wraps just the Status-returning call, so an
+      // allocation failure is propagated to the caller.
+      TF_RETURN_IF_ERROR(
+          context->allocate_temp(DT_DOUBLE, accum_shape, &temp_buffer));
+      double* accum = temp_buffer.flat<double>().data();
+
+      // Clear the scratch accumulator before the atomic accumulation.
+      TF_CHECK_OK(GpuLaunchKernel(
+          SetZero<double>, config.block_count, config.thread_per_block, 0,
+          d.stream(), m * p, accum));
+
+      // float inputs, double accumulator.
+      TF_CHECK_OK(GpuLaunchKernel(
+          SparseTensorDenseMatMulKernel<float, Tindices, double, ADJ_A, ADJ_B>,
+          config.block_count, config.thread_per_block, 0, d.stream(),
+          nnz, m, b_rows, b_cols, p, a_indices.data(), a_values.data(),
+          b.data(), accum));
+
+      // Narrow the double accumulator into the float output.
+      TF_CHECK_OK(GpuLaunchKernel(
+          DownCast, config.block_count, config.thread_per_block,
+          0, d.stream(), m * p, accum, out.data()));
+    } else {
+      // Default mode: zero the float output and accumulate into it directly.
+      TF_CHECK_OK(GpuLaunchKernel(
+          SetZero<float>, config.block_count, config.thread_per_block, 0,
+          d.stream(), m * p, out.data()));
+
+      TF_CHECK_OK(GpuLaunchKernel(
+          SparseTensorDenseMatMulKernel<float, Tindices, float, ADJ_A, ADJ_B>,
+          config.block_count, config.thread_per_block, 0, d.stream(), nnz, m,
+          b_rows, b_cols, p, a_indices.data(), a_values.data(), b.data(),
+          out.data()));
+    }
     return Status::OK();
   }
 };
 
@@ -3267,11 +3267,10 @@ cuda_py_test(
     xla_enable_strict_auto_jit = True,
 )
 
-cuda_py_test(
-    name = "sparse_tensor_dense_matmul_op_test",
-    size = "medium",
-    srcs = ["sparse_tensor_dense_matmul_op_test.py"],
-    additional_deps = [
+py_library(
+    name = "sparse_tensor_dense_matmul_op_base",
+    srcs = ["sparse_tensor_dense_matmul_op_base.py"],
+    deps = [
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -3284,6 +3283,25 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_ops",
     ],
+)
+
+cuda_py_test(
+    name = "sparse_tensor_dense_matmul_op_test",
+    size = "medium",
+    srcs = ["sparse_tensor_dense_matmul_op_test.py"],
+    additional_deps = [
+        ":sparse_tensor_dense_matmul_op_base",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "sparse_tensor_dense_matmul_op_deterministic_test",
+    size = "small",
+    srcs = ["sparse_tensor_dense_matmul_op_deterministic_test.py"],
+    additional_deps = [
+        ":sparse_tensor_dense_matmul_op_base",
+    ],
     xla_enable_strict_auto_jit = True,
 )