
Commit ecd6f7a

Luke Iwanski authored and benoitsteiner committed
OpenCL improvements
- Bumps the Eigen version
- Refactors op registration
- Introduces a workaround for the Const op, needed because CUDA uses pointers while OpenCL uses buffers/accessors
- Extends memory types to cover DEVICE_SYCL as well
- Introduces a GetSYCLDevice() method that picks from Eigen's list of supported devices, giving GPU devices the highest priority (blacklisted devices are excluded)
- Renames ::internal::Transpose to tensorflow::internal::Transpose to avoid a reported compilation error
- Re-introduces the fix for a buggy string replacement (-c -> --include) that caused a lot of compilation warnings
- Adds sycl_runtime to bazel's ARRAY_DEPS
- Replicates TF_CALL_GPU_PROXY_TYPES for SYCL
1 parent 184dfd9 commit ecd6f7a

File tree: 92 files changed (808 additions, 837 deletions)

Note: large commits have some content hidden by default; only a subset of the 92 changed files is shown below.
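Most of the visible changes follow one pattern: SYCL-specific code is guarded with TENSORFLOW_USE_SYCL, and kernels are registered for DEVICE_SYCL next to the existing DEVICE_CPU/DEVICE_GPU registrations. A minimal sketch of that pattern ("SomeOp"/SomeOpKernel are placeholder names, not from this commit):

    #ifdef TENSORFLOW_USE_SYCL
    // Hypothetical registration: makes SomeOpKernel available on the SYCL
    // device, mirroring the CPU/GPU registrations elsewhere in the file.
    REGISTER_KERNEL_BUILDER(Name("SomeOp").Device(DEVICE_SYCL), SomeOpKernel);
    #endif  // TENSORFLOW_USE_SYCL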

tensorflow/core/common_runtime/direct_session_test.cc

Lines changed: 1 addition & 2 deletions
@@ -871,8 +871,6 @@ class BlockingOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
 REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc("");
 
-REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_SYCL), BlockingOp);
-
 static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
   FunctionDefLibrary library_graph_def;
   if (use_function_lib) {
@@ -910,6 +908,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
       ->set_opt_level(OptimizerOptions_Level_L0);
   (*options.config.mutable_device_count())["CPU"] = 2;
   (*options.config.mutable_device_count())["GPU"] = 0;
+  (*options.config.mutable_device_count())["SYCL"] = 0;
 
   options.config.add_session_inter_op_thread_pool();
   auto* p = options.config.add_session_inter_op_thread_pool();

tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc

Lines changed: 11 additions & 1 deletion
@@ -138,7 +138,8 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelWarmup) {
   DirectSession* ds = static_cast<DirectSession*>(session.get());
   CostModelManager::CostModelMap cost_models;
   ds->ExportCostModels(&cost_models);
-  CHECK_EQ(cost_models.size(), 1);
+  ASSERT_GE(2, cost_models.size());
+  ASSERT_LE(1, cost_models.size());
   const CostModel* cm = (*cost_models.begin()).second;
   EXPECT_EQ(measure_steps, cm->GetUpdateTimes());
 }
@@ -155,10 +156,16 @@ static void TestHWAccelerator(bool enableHWTrace) {
   test::FillValues<float>(&x_tensor, {1, 1});
   Node* x = test::graph::Constant(&graph, x_tensor);
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+#ifdef TENSORFLOW_USE_SYCL
+  x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
+#endif // TENSORFLOW_USE_SYCL
 
   // y = A * x
   Node* y = test::graph::Matmul(&graph, a, x, false, false);
   y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+#ifdef TENSORFLOW_USE_SYCL
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
+#endif // TENSORFLOW_USE_SYCL
 
   Node* y_neg = test::graph::Unary(&graph, "Neg", y);
   y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
@@ -169,6 +176,9 @@ static void TestHWAccelerator(bool enableHWTrace) {
   SessionOptions options;
   (*options.config.mutable_device_count())["CPU"] = 1;
   (*options.config.mutable_device_count())["GPU"] = 1;
+#ifdef TENSORFLOW_USE_SYCL
+  (*options.config.mutable_device_count())["SYCL"] = 1;
+#endif // TENSORFLOW_USE_SYCL
   options.config.set_allow_soft_placement(true);
   options.config.mutable_graph_options()->set_build_cost_model(1);
   std::unique_ptr<Session> session(NewSession(options));

tensorflow/core/common_runtime/memory_types.cc

Lines changed: 3 additions & 3 deletions
@@ -45,12 +45,12 @@ struct EndpointEq {
 static Status ProcessMemoryTypes(
     DeviceType device_type, const Graph* g,
     std::function<Status(const Edge*, MemoryType, MemoryType)> fn) {
-  if (device_type != DEVICE_GPU) {
-    // On non-GPU devices, HOST_MEMORY and DEVICE_MEMORY are always
+  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) {
+    // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always
     // compatible.
     return Status::OK();
   }
-  // For GPU device, HOST_MEMORY and DEVICE_MEMORY is not
+  // For GPU and SYCL device, HOST_MEMORY and DEVICE_MEMORY is not
   // compatible. I.e., a conversion/transfer must be done.
   //
   // {node id, slot id} -> memory type.
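To make the invariant concrete: for DEVICE_GPU and DEVICE_SYCL, every edge whose endpoints disagree on memory type needs a host/device transfer. A hypothetical caller of ProcessMemoryTypes could count such edges like this (illustrative only; the function is internal to this file):

    // Sketch: count edges that would require a host<->device transfer.
    int mismatches = 0;
    Status s = ProcessMemoryTypes(
        DeviceType(DEVICE_SYCL), graph,
        [&mismatches](const Edge* e, MemoryType src, MemoryType dst) {
          if (src != dst) ++mismatches;  // conversion/transfer needed here
          return Status::OK();
        });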

tensorflow/core/common_runtime/memory_types_test.cc

Lines changed: 18 additions & 0 deletions
@@ -34,6 +34,9 @@ TEST(MemoryTypeChecker, Int32OK) {
   // There is a kernel for adding two int32s on host memory.
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
 #endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+  TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
+#endif // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -53,6 +56,15 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/gpu:0", g));
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
 #endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+  // There is no kernel for casting int32/host memory to float/device
+  // memory.
+  EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_SYCL, g)));
+
+  // But we can insert _HostSend/_HostRecv to ensure the invariant.
+  TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g));
+  TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
+#endif // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -74,6 +86,12 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) {
   // int Switch's output on GPU has HOST_MEMORY constraint.
   EXPECT_EQ(memory_type, HOST_MEMORY);
 #endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+  auto si = test::graph::Switch(g, test::graph::Constant(g, vi), pred);
+  TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type));
+  // int Switch's output on GPU has HOST_MEMORY constraint.
+  EXPECT_EQ(memory_type, HOST_MEMORY);
+#endif // TENSORFLOW_USE_SYCL
   delete g;
 }
 

tensorflow/core/common_runtime/sycl/sycl_device_factory.cc

Lines changed: 3 additions & 1 deletion
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/sycl/sycl_device.h"
 
+#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
+
 namespace tensorflow {
 
 class SYCLDeviceFactory : public DeviceFactory {
@@ -34,7 +36,7 @@ class SYCLDeviceFactory : public DeviceFactory {
       devices->push_back(
           new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(),
                          SYCLDevice::GetShortDeviceDescription(),
-                         cl::sycl::gpu_selector(), cpu_allocator()));
+                         GetSYCLDevice(), cpu_allocator()));
     }
     return Status::OK();
   }
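The switch from cl::sycl::gpu_selector() to GetSYCLDevice() matters because a selector that matches no device makes construction throw, aborting device registration on GPU-less machines. A sketch of the old failure mode, assuming standard SYCL 1.2 selector behaviour:

    try {
      // Old behaviour: constructing from gpu_selector throws when no
      // OpenCL GPU is present, so no SYCL device gets registered at all.
      cl::sycl::device d{cl::sycl::gpu_selector{}};
    } catch (const cl::sycl::exception& e) {
      // GetSYCLDevice() (added in sycl_util.h below) instead falls back
      // to an OpenCL CPU device before giving up.
    }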

tensorflow/core/common_runtime/sycl/sycl_util.h

Lines changed: 22 additions & 0 deletions
@@ -30,6 +30,28 @@ namespace tensorflow {
 }
 
 inline void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
+
+inline cl::sycl::device GetSYCLDevice() {
+  // Obtain list of supported devices from Eigen
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    if (device.is_gpu()) {
+      // returns first found GPU
+      return device;
+    }
+  }
+
+  // Currently Intel GPU is not supported
+  LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, trying OpenCL CPU";
+
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    if (device.is_cpu()) {
+      // returns first found CPU
+      return device;
+    }
+  }
+  // Currently Intel GPU is not supported
+  LOG(FATAL) << "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
+}
 }
 
 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
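A typical use of the new helper (hypothetical snippet, not part of the commit) is to hand the chosen device to a standard SYCL queue and log what was picked:

    // GPU if one is supported, otherwise an OpenCL CPU device.
    cl::sycl::device d = GetSYCLDevice();
    cl::sycl::queue q(d);  // queue bound to the selected device
    LOG(INFO) << "Using OpenCL device: "
              << d.get_info<cl::sycl::info::device::name>();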

tensorflow/core/debug/debug_gateway.cc

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
   // Determine if the tensor is on device (GPU) or host (CPU).
   // The second part of the check is necessary because even an OpKernel on
   // GPU may have output tensors allocated on CPU.
-  if (device->name().find("gpu:") != string::npos &&
+  if ((device->name().find("gpu:") != string::npos || device->name().find("SYCL:") != string::npos) &&
       !ctx->output_alloc_attr(output_slot).on_host()) {
     // GPU tensors: Copy it to host (CPU).
     DeviceContext* device_ctxt = ctx->op_device_context();
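The condition is now long enough that a named predicate might read better; a refactoring sketch (IsDeviceTensor is a hypothetical helper, not in the commit):

    // Hypothetical helper: true if the output tensor lives in device memory
    // on a GPU or SYCL device rather than on the host.
    static bool IsDeviceTensor(const Device* device, OpKernelContext* ctx,
                               int output_slot) {
      const string& name = device->name();
      return (name.find("gpu:") != string::npos ||
              name.find("SYCL:") != string::npos) &&
             !ctx->output_alloc_attr(output_slot).on_host();
    }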

tensorflow/core/debug/debug_gateway_test.cc

Lines changed: 16 additions & 2 deletions
@@ -45,6 +45,8 @@ class SessionDebugMinusAXTest : public ::testing::Test {
 
 #if GOOGLE_CUDA
 const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+#elif defined(TENSORFLOW_USE_SYCL)
+const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
 const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
 #endif
@@ -302,6 +304,8 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) {
   // through RunMetadata, given whether GPU is involved.
 #if GOOGLE_CUDA
   ASSERT_EQ(2, run_metadata.partition_graphs().size());
+#elif defined(TENSORFLOW_USE_SYCL)
+  ASSERT_EQ(2, run_metadata.partition_graphs().size());
 #else
   ASSERT_EQ(1, run_metadata.partition_graphs().size());
 #endif
@@ -336,7 +340,7 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) {
   ASSERT_EQ(1, debug_nan_count_tensor_vals[0].scalar<int64>()());
 }
 
-#ifndef GOOGLE_CUDA
+#if !defined(GOOGLE_CUDA) && !defined(TENSORFLOW_USE_SYCL)
 // TODO(cais): Reinstate the following test for concurrent debugged runs on
 // a GPU once the root cause of the ~0.5% flakiness has been addressed.
 // (b/34081273)
@@ -499,6 +503,8 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
 
 #if GOOGLE_CUDA
 const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+#elif defined(TENSORFLOW_USE_SYCL)
+const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
 const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
 #endif
@@ -599,6 +605,8 @@ class SessionDebugVariableTest : public ::testing::Test {
 
 #if GOOGLE_CUDA
 const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+#elif defined(TENSORFLOW_USE_SYCL)
+const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
 const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
 #endif
@@ -818,6 +826,8 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) {
 
 #if GOOGLE_CUDA
   ASSERT_EQ(2, run_metadata.partition_graphs().size());
+#elif defined(TENSORFLOW_USE_SYCL)
+  ASSERT_EQ(2, run_metadata.partition_graphs().size());
 #else
   ASSERT_EQ(1, run_metadata.partition_graphs().size());
 #endif
@@ -855,13 +865,17 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) {
   ASSERT_EQ(2, debug_nan_count_tensor_vals[0].scalar<int64>()());
 }
 
-#if GOOGLE_CUDA
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_SYCL)
 class SessionDebugGPUSwitchTest : public ::testing::Test {
  public:
   void Initialize() {
     Graph graph(OpRegistry::Global());
 
+#ifdef GOOGLE_CUDA
     const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+#elif TENSORFLOW_USE_SYCL
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
+#endif
 
     Tensor vb(DT_BOOL, TensorShape({}));
     vb.scalar<bool>()() = true;

tensorflow/core/framework/op_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -94,9 +94,9 @@ OpKernel::OpKernel(OpKernelConstruction* context)
   OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
                                              context->graph_def_version()));
 
-  // Kernels executing on GPU tie very few resources on the CPU where the
+  // Kernels executing on GPU/SYCL tie very few resources on the CPU where the
   // scheduler runs: we consider them as inexpensive.
-  expensive_ = context->device_type() != DeviceType(DEVICE_GPU);
+  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL);
 }
 
 OpKernel::~OpKernel() {}

tensorflow/core/graph/testlib.cc

Lines changed: 4 additions & 0 deletions
@@ -36,6 +36,10 @@ namespace tensorflow {
 REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), HostConstantOp);
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), HostConstantOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
+#endif // TENSORFLOW_USE_SYCL
 
 // Register the HostConst Op
 // Returns a constant tensor on the host. Useful for writing C++ tests
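The commit message also mentions replicating TF_CALL_GPU_PROXY_TYPES for SYCL; that change sits in one of the files hidden from this view. A sketch of the shape such a macro takes, with the exact type list being an assumption rather than a quote from the commit:

    // Hypothetical: instantiate kernels for the proxy types used by
    // DEVICE_SYCL, mirroring TF_CALL_GPU_PROXY_TYPES. Type list may differ.
    #define TF_CALL_SYCL_PROXY_TYPES(m) \
      TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)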
