Commit eb0d3a1

Luke Iwanski authored and benoitsteiner committed

OpenCL Improvements
* Registers Scatter and ScatterNd Ops for SYCL
* Registers Stack op for SYCL
* Fixes "No sycl buffer found" error for debug ops
* Registers MatMul and Transpose Ops to SYCL device for double
* Extends analyzer_cli_test.py test to cover SYCL
* Fixes Transpose Op for double when on SYCL
* Bumps Eigen version to fix double precision issue on SYCL
* Extends SessionDebugTestBase to cover SYCL
1 parent cbcdc6e commit eb0d3a1

15 files changed: 313 additions, 12 deletions

tensorflow/core/BUILD
Lines changed: 1 addition & 0 deletions

@@ -1597,6 +1597,7 @@ cc_library(
     hdrs = if_not_windows([
         "common_runtime/sycl/sycl_allocator.h",
         "common_runtime/sycl/sycl_device.h",
+        "common_runtime/sycl/sycl_util.h",
         "common_runtime/sycl/sycl_device_context.h",
     ]),
     copts = tf_copts(),
tensorflow/core/common_runtime/sycl/sycl_util.h (new file)
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if !TENSORFLOW_USE_SYCL
+#error This file must only be included when building TensorFlow with SYCL support
+#endif
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
+
+// For DMA helper
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+inline void* GetBase(const Tensor* src) {
+  return const_cast<void*>(DMAHelper::base(src));
+}
+
+inline void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
+}
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
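The two GetBase overloads return untyped base pointers for const and mutable tensors respectively, so both the source and destination of a device copy can be handled uniformly. A minimal usage sketch (my own illustration, assuming a SYCL-enabled build since this header #errors otherwise; the function name and shapes are made up):

    #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
    #include "tensorflow/core/framework/tensor.h"
    #include "tensorflow/core/framework/tensor_shape.h"

    namespace tensorflow {

    // Sketch: obtain raw base pointers for two same-shaped tensors.
    void ExampleGetBase() {
      Tensor src(DT_FLOAT, TensorShape({2, 3}));
      Tensor dst(DT_FLOAT, TensorShape({2, 3}));

      const Tensor* src_view = &src;
      const void* src_ptr = GetBase(src_view);  // const overload
      void* dst_ptr = GetBase(&dst);            // non-const overload

      // With a SYCL device in hand, these pointers would feed a device copy
      // such as eigen_sycl_device()->memcpy(dst_ptr, ..., size), mirroring
      // the debug_ops.h change below.
      (void)src_ptr;
      (void)dst_ptr;
    }

    }  // namespace tensorflow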

tensorflow/core/kernels/BUILD
Lines changed: 2 additions & 1 deletion

@@ -32,6 +32,7 @@ load(
     "tf_kernel_library",
     "cc_header_only_library",
 )
+load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load(

@@ -465,7 +466,7 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [
         "//tensorflow/core:gpu_runtime",
         "//tensorflow/core/debug:debug_io_utils",
-    ],
+    ] + if_sycl(["//tensorflow/core:sycl_runtime"]),
 )

 cc_library(

tensorflow/core/kernels/debug_ops.h
Lines changed: 17 additions & 0 deletions

@@ -17,6 +17,9 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_DEBUG_OP_H_

 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#ifdef TENSORFLOW_USE_SYCL
+#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
+#endif  // TENSORFLOW_USE_SYCL
 #include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"

@@ -63,6 +66,20 @@ class CopyOp : public OpKernel {
       // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
       *copied_tensor = tensor::DeepCopy(src_tensor);
     }
+#elif defined(TENSORFLOW_USE_SYCL)
+    Device* device = static_cast<Device*>(context->device());
+    // Determine if the input tensor is not on CPU (e.g., on GPU).
+    bool off_host_input = device->device_type() == DEVICE_SYCL &&
+                          !context->input_alloc_attr(0).on_host();
+    if (off_host_input) {
+      auto size = src_tensor.NumElements() * sizeof(src_tensor.dtype());
+      auto dst_ptr = GetBase(copied_tensor);
+      auto src_ptr = GetBase(&src_tensor);
+      typedef decltype(src_tensor.dtype()) ttype;
+      device->eigen_sycl_device()->memcpy(dst_ptr, static_cast<const ttype *>(src_ptr), size);
+    } else {
+      *copied_tensor = tensor::DeepCopy(src_tensor);
+    }
 #else
     *copied_tensor = tensor::DeepCopy(src_tensor);
 #endif
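A side note on the size expression in the SYCL branch: sizeof(src_tensor.dtype()) is the size of the DataType enum value rather than of one element. A hedged alternative sketch, not part of this commit, that derives the byte count from the tensor itself via Tensor::TotalBytes():

    #include "tensorflow/core/framework/tensor.h"

    // Sketch only: ask the Tensor for its total payload size, so no
    // per-element sizeof arithmetic is needed when sizing the device copy.
    inline size_t CopyBytes(const tensorflow::Tensor& src_tensor) {
      return src_tensor.TotalBytes();
    }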

tensorflow/core/kernels/matmul_op.cc
Lines changed: 1 addition & 0 deletions

@@ -344,6 +344,7 @@ TF_CALL_half(REGISTER_GPU);
           .Label("eigen"), \
       MatMulOp<SYCLDevice, T, false /* xxblas */>)
 TF_CALL_float(REGISTER_SYCL);
+TF_CALL_double(REGISTER_SYCL);

 #endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
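Since the TF_CALL_<type>(m) helpers simply apply the macro m to one concrete type, the single added line registers the existing SYCL MatMul kernel for double alongside float. A toy, self-contained sketch of that X-macro pattern (the MY_* names are stand-ins, not TensorFlow's actual definitions):

    #include <cstdio>

    // Stand-ins for the TF_CALL_* pattern: each helper applies the given
    // macro to one concrete type.
    #define MY_CALL_float(m) m(float)
    #define MY_CALL_double(m) m(double)

    // A toy "registration" macro in the spirit of REGISTER_SYCL(T).
    #define MY_REGISTER(T) std::printf("registered MatMul for %s\n", #T);

    int main() {
      MY_CALL_float(MY_REGISTER)   // prints: registered MatMul for float
      MY_CALL_double(MY_REGISTER)  // prints: registered MatMul for double
      return 0;
    }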

tensorflow/core/kernels/scatter_functor.h
Lines changed: 69 additions & 0 deletions

@@ -75,6 +75,50 @@ struct Assign<scatter_op::UpdateOp::DIV> {
   }
 };

+#ifdef TENSORFLOW_USE_SYCL
+template <scatter_op::UpdateOp Op>
+struct AssignSYCL {};
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::ASSIGN> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = u;
+  }
+};
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::ADD> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) += u;
+  }
+};
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::SUB> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) -= u;
+  }
+};
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::MUL> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = p * u;
+  }
+};
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::DIV> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = p / u;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace internal
 }  // namespace scatter_op

@@ -110,6 +154,31 @@ struct ScatterFunctorBase {
   }
 };

+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
+  Index operator()(OpKernelContext* c, const SYCLDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   typename TTypes<T>::ConstMatrix updates,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  An earlier version of the
+      // code checked it and then grabbed it from memory a second time, which
+      // was a security risk since it could have changed in between.
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Copy last Ndim-1 dimensions of updates[i] to params[index]
+      scatter_op::internal::AssignSYCL<op>::Run(d, params.template chip<0>(index),
+                                                updates.template chip<0>(i));
+    }
+    return -1;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 template <typename T, typename Index>
 struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
   Index operator()(OpKernelContext* c, const CPUDevice& d,
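The SYCL specializations route each per-row update through Eigen's device-expression form (p.device(d) = u) rather than a plain assignment, which is what dispatches the work to the target device. Below is a minimal standalone Eigen sketch of the same chip-and-assign pattern, run on Eigen's default host device purely for illustration; it is not TensorFlow code:

    #include <cstdio>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> params(4, 3);   // stand-in for TTypes<T>::Matrix
      Eigen::Tensor<float, 2> updates(2, 3);  // stand-in for ConstMatrix
      params.setZero();
      updates.setConstant(1.0f);

      Eigen::DefaultDevice d;  // a SYCL build would pass an Eigen::SyclDevice here

      // Equivalent of AssignSYCL<ADD>::Run(d, params.chip<0>(index),
      //                                    updates.chip<0>(i)) for index=2, i=0:
      params.chip<0>(2).device(d) += updates.chip<0>(0);

      std::printf("params(2,0) = %f\n", static_cast<double>(params(2, 0)));  // 1.0
      return 0;
    }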

tensorflow/core/kernels/scatter_nd_op.cc
Lines changed: 16 additions & 0 deletions

@@ -31,6 +31,9 @@ namespace tensorflow {

 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif  // TENSORFLOW_USE_SYCL

 // Check whether updates.shape = indices.shape[:batch_dim] +
 // params_shape[slice_dim:]

@@ -415,6 +418,19 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPECS);

 #endif  // GOOGLE_CUDA

+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SCATTER_ND_ADD_SUB_SYCL(type) \
+  REGISTER_SCATTER_ND_ADD_SUB(type, SYCL);
+
+#define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \
+  REGISTER_SCATTER_ND_UPDATE(type, SYCL);
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
+#undef REGISTER_SCATTER_ND_ADD_SUB_SYCL
+#undef REGISTER_SCATTER_ND_UPDATE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
+
 #undef REGISTER_SCATTER_ND_ADD
 #undef REGISTER_SCATTER_ND_ADD_SUB
 #undef REGISTER_SCATTER_ND_ADD_SUB_CPU

tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
Lines changed: 88 additions & 0 deletions

@@ -38,6 +38,9 @@ limitations under the License.
 namespace tensorflow {

 typedef Eigen::ThreadPoolDevice CPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif  // TENSORFLOW_USE_SYCL

 class OpKernelContext;

@@ -186,6 +189,91 @@ TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH)
 #undef REGISTER_SCATTER_ND_INDEX
 #undef REGISTER_SCATTER_ND_FULL

+#ifdef TENSORFLOW_USE_SYCL
+
+// Implementation of update functor for SYCL.
+template <typename T, typename Index, scatter_nd_op::UpdateOp OP, int IXDIM>
+struct ScatterNdFunctor<SYCLDevice, T, Index, OP, IXDIM> {
+  Index operator()(
+      const SYCLDevice& d, const Index slice_size,
+      const Eigen::array<Eigen::DenseIndex, IXDIM> output_shape_prefix,
+      typename TTypes<T, 2>::Tensor Tparams,
+      typename TTypes<Index, 2>::ConstTensor Tindices,
+      typename TTypes<T, 2>::ConstTensor Tupdates,
+      typename TTypes<T, 2>::Tensor Toutput) {
+    // error_loc is -1 if there's no out-of-bounds index,
+    // otherwise it is the location of an OOB index in Tindices.
+    Index error_loc = -1;
+
+    const Eigen::DenseIndex batch_size = Tindices.dimension(0);
+
+    Index batch_strides[IXDIM];
+    for (int dim = IXDIM - 1; dim >= 0; --dim) {
+      if (dim == IXDIM - 1) {
+        batch_strides[dim] = 1;
+      } else {
+        batch_strides[dim] =
+            batch_strides[dim + 1] * output_shape_prefix[dim + 1];
+      }
+    }
+
+    for (Eigen::DenseIndex loc = 0; loc < batch_size; ++loc) {
+      Index i = 0;
+      bool out_of_bounds = false;
+      for (int dim = 0; dim < IXDIM; ++dim) {
+        const Index ix_d = internal::SubtleMustCopy(Tindices(loc, dim));
+        out_of_bounds |= !FastBoundsCheck(ix_d, output_shape_prefix[dim]);
+        i += ix_d * batch_strides[dim];
+      }
+      if (TF_PREDICT_FALSE(out_of_bounds)) {
+        error_loc = loc;
+        break;
+      } else {
+        auto input_chip = Toutput.template chip<0>(i);
+        auto output_chip = input_chip.device(d);
+        auto update_chip = Tupdates.template chip<0>(loc);
+        update_executor::UpdateExecutor<
+            decltype(input_chip), decltype(update_chip), decltype(output_chip),
+            OP>::Execute(input_chip, update_chip, output_chip);
+      }
+    }
+
+    return error_loc;
+  }
+};
+
+#define REGISTER_SCATTER_ND_FULL_SYCL(T, Index, op) \
+  template Index \
+  ScatterNdFunctor<SYCLDevice, T, Index, op, CPU_PROVIDED_IXDIM>::operator()( \
+      const SYCLDevice& d, const Index slice_size, \
+      const Eigen::array<Eigen::DenseIndex, CPU_PROVIDED_IXDIM> \
+          output_shape_prefix, \
+      typename TTypes<T, 2>::Tensor Tparams, \
+      typename TTypes<Index, 2>::ConstTensor Tindices, \
+      typename TTypes<T, 2>::ConstTensor Tupdates, \
+      typename TTypes<T, 2>::Tensor Toutput)

+#define REGISTER_SCATTER_ND_INDEX_SYCL(type, op) \
+  REGISTER_SCATTER_ND_FULL_SYCL(type, int32, op); \
+  REGISTER_SCATTER_ND_FULL_SYCL(type, int64, op)
+
+#define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \
+  REGISTER_SCATTER_ND_INDEX_SYCL(type, scatter_nd_op::UpdateOp::ASSIGN);
+
+#define REGISTER_SCATTER_ND_MATH_SYCL(type) \
+  REGISTER_SCATTER_ND_INDEX_SYCL(type, scatter_nd_op::UpdateOp::ADD); \
+  REGISTER_SCATTER_ND_INDEX_SYCL(type, scatter_nd_op::UpdateOp::SUB);
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL)
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_MATH_SYCL)
+
+#undef REGISTER_SCATTER_ND_MATH_SYCL
+#undef REGISTER_SCATTER_ND_UPDATE_SYCL
+#undef REGISTER_SCATTER_ND_INDEX_SYCL
+#undef REGISTER_SCATTER_ND_FULL_SYCL
+
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace functor

 }  // namespace tensorflow
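The functor flattens each IXDIM-dimensional index into a single row offset with row-major strides (batch_strides) before applying the update chip. A small standalone sketch of just that stride and offset arithmetic, with made-up shapes:

    #include <array>
    #include <cstdio>

    int main() {
      // Suppose IXDIM == 3 and the scatter output's leading dimensions are 4x5x6.
      constexpr int IXDIM = 3;
      const std::array<long, IXDIM> output_shape_prefix = {4, 5, 6};

      // Same stride computation as in ScatterNdFunctor: innermost stride is 1,
      // each outer stride is the product of the inner dimension sizes.
      long batch_strides[IXDIM];
      for (int dim = IXDIM - 1; dim >= 0; --dim) {
        batch_strides[dim] =
            (dim == IXDIM - 1) ? 1
                               : batch_strides[dim + 1] * output_shape_prefix[dim + 1];
      }

      // Flatten one multi-index, e.g. Tindices(loc, :) == {2, 3, 1}.
      const std::array<long, IXDIM> index = {2, 3, 1};
      long i = 0;
      bool out_of_bounds = false;
      for (int dim = 0; dim < IXDIM; ++dim) {
        out_of_bounds |= (index[dim] < 0 || index[dim] >= output_shape_prefix[dim]);
        i += index[dim] * batch_strides[dim];
      }

      // Strides are {30, 6, 1}, so i == 2*30 + 3*6 + 1 == 79.
      std::printf("row offset = %ld, out_of_bounds = %d\n", i, out_of_bounds);
      return 0;
    }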
