Merge pull request #5267 from benoitsteiner/master

Initial support for OpenCL
Authored by Benoit Steiner on 2016-11-04 10:10:48 -07:00, committed via GitHub
commit f179e0a16e
38 changed files with 1148 additions and 22 deletions
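After configuring with OpenCL enabled and building with the build:sycl config added at the end of this change (bazel build --config=sycl), the new device factory registers devices named /device:SYCL:<i>. The sketch below is illustration only, not part of the change: it assumes such a build and pins two of the ops that gain SYCL kernels in this commit (Const and Round) to the first SYCL device.

import tensorflow as tf

# Assumes TensorFlow was configured with TF_NEED_OPENCL=1 and built with
# `bazel build --config=sycl ...`; the device name below comes from the
# SYCLDeviceFactory added in this commit.
with tf.device('/device:SYCL:0'):
  x = tf.constant([1.4, 2.6], dtype=tf.float32)  # Const has a SYCL kernel
  y = tf.round(x)                                 # Round is registered for float on SYCL

config = tf.ConfigProto(log_device_placement=True,  # show where each op was placed
                        allow_soft_placement=True)  # fall back to CPU for anything unsupported
with tf.Session(config=config) as sess:
  print(sess.run(y))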

configure | 104 lines changed

@ -116,6 +116,17 @@ GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
chmod a+x ${GEN_GIT_SOURCE}
"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}"
## Set up SYCL-related environment settings
while [ "$TF_NEED_OPENCL" == "" ]; do
read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT
case $INPUT in
[Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;;
[Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
"" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
* ) echo "Invalid selection: " $INPUT;;
esac
done
## Set up Cuda-related environment settings
while [ "$TF_NEED_CUDA" == "" ]; do
@ -129,12 +140,14 @@ while [ "$TF_NEED_CUDA" == "" ]; do
done
export TF_NEED_CUDA
if [ "$TF_NEED_CUDA" == "0" ]; then
export TF_NEED_SYCL
if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
echo "Configuration finished"
bazel_clean_and_fetch
exit
fi
if [ "$TF_NEED_CUDA" == "1" ]; then
# Set up which gcc nvcc should use as the host compiler
while true; do
fromuser=""
@ -336,6 +349,95 @@ EOF
TF_CUDA_COMPUTE_CAPABILITIES=""
done
# end of if "$TF_NEED_CUDA" == "1"
fi
# OpenCL configuration
if [ "$TF_NEED_OPENCL" == "1" ]; then
# Determine which C++ compiler should be used as the host compiler
while true; do
fromuser=""
if [ -z "$HOST_CXX_COMPILER" ]; then
default_cxx_host_compiler=$(which g++|| true)
read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER
fromuser="1"
if [ -z "$HOST_CXX_COMPILER" ]; then
HOST_CXX_COMPILER=$default_cxx_host_compiler
fi
fi
if [ -e "$HOST_CXX_COMPILER" ]; then
export HOST_CXX_COMPILER
break
fi
echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2
if [ -z "$fromuser" ]; then
exit 1
fi
HOST_CXX_COMPILER=""
# Retry
done
# Determine which C compiler should be used as the host compiler
while true; do
fromuser=""
if [ -z "$HOST_C_COMPILER" ]; then
default_c_host_compiler=$(which gcc|| true)
read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER
fromuser="1"
if [ -z "$HOST_C_COMPILER" ]; then
HOST_C_COMPILER=$default_c_host_compiler
fi
fi
if [ -e "$HOST_C_COMPILER" ]; then
export HOST_C_COMPILER
break
fi
echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2
if [ -z "$fromuser" ]; then
exit 1
fi
HOST_C_COMPILER=""
# Retry
done
while true; do
# Configure the OPENCL version to use.
TF_OPENCL_VERSION="1.2"
# Point to ComputeCpp root
if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then
default_computecpp_toolkit_path=/usr/local/computecpp
read -p "Please specify the location where ComputeCpp $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_toolkit_path]: " COMPUTECPP_TOOLKIT_PATH
fromuser="1"
if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then
COMPUTECPP_TOOLKIT_PATH=$default_computecpp_toolkit_path
fi
fi
if [ "$OSNAME" == "Linux" ]; then
SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
fi
if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then
export COMPUTECPP_TOOLKIT_PATH
break
fi
echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
if [ -z "$fromuser" ]; then
exit 1
fi
# Retry
TF_OPENCL_VERSION=""
COMPUTECPP_TOOLKIT_PATH=""
done
export TF_NEED_OPENCL
# end of if "$TF_NEED_OPENCL" == "1"
fi
bazel_clean_and_fetch
echo "Configuration finished"


@ -510,6 +510,7 @@ cc_library(
deps = [
":core_cpu",
":gpu_runtime",
":sycl_runtime",
],
)
@ -1387,6 +1388,33 @@ tf_cuda_library(
alwayslink = 1,
)
cc_library(
name = "sycl_runtime",
srcs = if_not_windows([
"common_runtime/sycl/sycl_device.cc",
"common_runtime/sycl/sycl_device_context.cc",
"common_runtime/sycl/sycl_device_factory.cc",
]),
hdrs = if_not_windows([
"common_runtime/sycl/sycl_device.h",
"common_runtime/sycl/sycl_device_context.h",
]),
copts = tf_copts(),
linkstatic = 1,
deps = [
":core_cpu",
":core_cpu_internal",
":framework",
":framework_internal",
":lib",
":lib_internal",
":protos_all_cc",
"//third_party/eigen3",
"@local_config_sycl//sycl:sycl",
],
alwayslink = 1,
)
# -----------------------------------------------------------------------------
# Tests


@ -68,12 +68,17 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) {
(std::vector<DeviceType>{DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
types());
AddDevice("SYCL", "/job:a/replica:0/task:0/device:sycl:0");
EXPECT_EQ(
(std::vector<DeviceType>{DeviceType(DEVICE_SYCL), DeviceType(DEVICE_GPU),
DeviceType(DEVICE_CPU)}), types());
AddDevice("T1", "/job:a/replica:0/task:0/device:T1:0");
AddDevice("T1", "/job:a/replica:0/task:0/device:T1:1");
AddDevice("T2", "/job:a/replica:0/task:0/device:T2:0");
EXPECT_EQ(
(std::vector<DeviceType>{DeviceType("T1"), DeviceType("T2"),
DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
(std::vector<DeviceType>{DeviceType(DEVICE_SYCL), DeviceType("T1"),
DeviceType("T2"), DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
types());
}


@ -818,6 +818,8 @@ class BlockingOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc("");
REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_SYCL), BlockingOp);
static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
FunctionDefLibrary library_graph_def;
if (use_function_lib) {


@ -0,0 +1,88 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/sycl/sycl_device.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor.pb_text.h"
#include "tensorflow/core/platform/tracing.h"
namespace tensorflow {
cl::sycl::gpu_selector s;
cl::sycl::queue q(s);
SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
const string& physical_device_desc, Allocator* allocator)
: LocalDevice(options,
Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
locality, physical_device_desc),
allocator),
allocator_(allocator),
device_context_(new SYCLDeviceContext()),
device_(q) {
set_eigen_sycl_device(&device_);
}
SYCLDevice::~SYCLDevice() {
device_context_->Unref();
}
void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
assert(context);
if (port::Tracing::IsActive()) {
// TODO(pbar) We really need a useful identifier of the graph node.
const uint64 id = Hash64(op_kernel->name());
port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
id);
}
op_kernel->Compute(context);
}
Allocator* SYCLDevice::GetAllocator(AllocatorAttributes attr) {
return allocator_;
}
Status SYCLDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
const AllocatorAttributes alloc_attrs,
Tensor* tensor) {
Tensor parsed(tensor_proto.dtype());
if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
return errors::InvalidArgument("Cannot parse tensor from proto: ",
ProtoDebugString(tensor_proto));
}
*tensor = std::move(parsed);
return Status::OK();
}
Status SYCLDevice::FillContextMap(const Graph* graph,
DeviceContextMap* device_context_map) {
// Fill in the context map. It is OK for this map to contain
// duplicate DeviceContexts so long as we increment the refcount.
device_context_map->resize(graph->num_node_ids());
for (Node* n : graph->nodes()) {
device_context_->Ref();
(*device_context_map)[n->id()] = device_context_;
}
return Status::OK();
}
} // namespace tensorflow
#endif // TENSORFLOW_USE_SYCL


@ -0,0 +1,62 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if !TENSORFLOW_USE_SYCL
#error This file must only be included when building TensorFlow with SYCL support
#endif
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_
#define EIGEN_USE_SYCL
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
class SYCLDevice : public LocalDevice {
public:
SYCLDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
const string& physical_device_desc, Allocator* allocator);
~SYCLDevice() override;
void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
Allocator* GetAllocator(AllocatorAttributes attr) override;
Status MakeTensorFromProto(const TensorProto& tensor_proto,
const AllocatorAttributes alloc_attrs,
Tensor* tensor) override;
Status FillContextMap(const Graph* graph,
DeviceContextMap* device_context_map) override;
Status Sync() override { return Status::OK(); }
static string GetShortDeviceDescription(/*int device_id,
const DeviceDescription& desc*/) {
return strings::StrCat("device: 0, name SYCL, pci bus id: 0");
}
private:
Allocator* allocator_; // Not owned
SYCLDeviceContext* device_context_;
Eigen::SyclDevice device_;
};
} // namespace tensorflow
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_


@ -0,0 +1,46 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
namespace tensorflow {
void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
Tensor* device_tensor,
StatusCallback done) const {
const int64 total_bytes = cpu_tensor->TotalBytes();
if (total_bytes > 0) {
const void* src_ptr = DMAHelper::base(cpu_tensor);
void* dst_ptr = DMAHelper::base(device_tensor);
::memcpy(dst_ptr, src_ptr, total_bytes);
}
done(Status::OK());
}
void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name,
Device* device, Tensor* cpu_tensor,
StatusCallback done) {
const int64 total_bytes = device_tensor->TotalBytes();
if (total_bytes > 0) {
const void* src_ptr = DMAHelper::base(device_tensor);
void* dst_ptr = DMAHelper::base(cpu_tensor);
::memcpy(dst_ptr, src_ptr, total_bytes);
}
done(Status::OK());
}
} // namespace tensorflow


@ -0,0 +1,42 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
#define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/device_base.h"
namespace tensorflow {
class SYCLDeviceContext : public DeviceContext {
public:
SYCLDeviceContext() {}
~SYCLDeviceContext() override {}
void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
Tensor* device_tensor,
StatusCallback done) const override;
void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name,
Device* device, Tensor* cpu_tensor,
StatusCallback done) override;
};
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_


@ -0,0 +1,44 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/sycl/sycl_device.h"
namespace tensorflow {
class SYCLDeviceFactory : public DeviceFactory {
public:
Status CreateDevices(const SessionOptions& options, const string& name_prefix,
std::vector<Device*>* devices) override {
int n = 1;
auto iter = options.config.device_count().find("SYCL");
if (iter != options.config.device_count().end()) {
n = iter->second;
}
for (int i = 0; i < n; i++) {
string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
devices->push_back(new SYCLDevice(
options, name, Bytes(256 << 20), DeviceLocality(),
SYCLDevice::GetShortDeviceDescription(), cpu_allocator()));
}
return Status::OK();
}
};
REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory);
}
#endif // TENSORFLOW_USE_SYCL
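The factory creates a single SYCL device unless the session options request more through the device_count map, the same mechanism used for CPU and GPU counts. A Python sketch of requesting two SYCL devices (illustration only):

import tensorflow as tf

# SYCLDeviceFactory looks up device_count["SYCL"]; each device gets the
# 256 MB memory limit hard-coded above.
config = tf.ConfigProto(device_count={"SYCL": 2})
with tf.Session(config=config) as sess:
  pass  # the session now owns /device:SYCL:0 and /device:SYCL:1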


@ -30,6 +30,9 @@ limitations under the License.
namespace Eigen {
struct ThreadPoolDevice;
#ifdef TENSORFLOW_USE_SYCL
struct SyclDevice;
#endif
} // end namespace Eigen
namespace perftools {
@ -145,6 +148,10 @@ class DeviceBase {
eigen_cpu_device_ = d;
}
#ifdef TENSORFLOW_USE_SYCL
void set_eigen_sycl_device(Eigen::SyclDevice* d) { eigen_sycl_device_ = d; }
#endif
// Return the Allocator implementation to use based on the allocator
// attributes requested. See allocator.h for more details.
virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
@ -167,6 +174,13 @@ class DeviceBase {
return eigen_cpu_device_;
}
#ifdef TENSORFLOW_USE_SYCL
const Eigen::SyclDevice* eigen_sycl_device() const {
CHECK(eigen_sycl_device_ != nullptr);
return eigen_sycl_device_;
}
#endif
// Caller owns the return value. The OpKernelContext calls this even
// for devices that do not implement an eigen_gpu_device. Overridden
// by GPU devices to return a derived type.
@ -203,6 +217,9 @@ class DeviceBase {
CpuWorkerThreads* cpu_worker_threads_ = nullptr;
GpuDeviceInfo* gpu_device_info_ = nullptr;
Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr;
#ifdef TENSORFLOW_USE_SYCL
Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
#endif
};
} // namespace tensorflow


@ -949,6 +949,13 @@ const Eigen::GpuDevice& OpKernelContext::eigen_device() const {
return eigen_gpu_device();
}
#ifdef TENSORFLOW_USE_SYCL
template <>
const Eigen::SyclDevice& OpKernelContext::eigen_device() const {
return eigen_sycl_device();
}
#endif
void OpKernelConstruction::CtxFailure(Status s) {
VLOG(1) << s;
SetStatus(s);


@ -53,6 +53,7 @@ limitations under the License.
namespace Eigen {
struct ThreadPoolDevice;
struct GpuDevice;
struct SyclDevice;
} // end namespace Eigen
namespace tensorflow {
@ -891,6 +892,11 @@ class OpKernelContext {
const Eigen::GpuDevice& eigen_gpu_device() const {
return params_->eigen_gpu_device->device();
}
#ifdef TENSORFLOW_USE_SYCL
const Eigen::SyclDevice& eigen_sycl_device() const {
return *device()->eigen_sycl_device();
}
#endif
template <typename EigenDeviceType>
const EigenDeviceType& eigen_device() const;


@ -37,6 +37,7 @@ std::ostream& operator<<(std::ostream& os, const DeviceType& d) {
const char* const DEVICE_CPU = "CPU";
const char* const DEVICE_GPU = "GPU";
const char* const DEVICE_SYCL = "SYCL";
string DataTypeString(DataType dtype) {
if (IsRefType(dtype)) {


@ -68,8 +68,9 @@ class DeviceType {
std::ostream& operator<<(std::ostream& os, const DeviceType& d);
// Convenient constants that can be passed to a DeviceType constructor
extern const char* const DEVICE_CPU; // "CPU"
extern const char* const DEVICE_GPU; // "GPU"
extern const char* const DEVICE_CPU; // "CPU"
extern const char* const DEVICE_GPU; // "GPU"
extern const char* const DEVICE_SYCL; // "SYCL"
typedef gtl::InlinedVector<MemoryType, 4> MemoryTypeVector;
typedef gtl::ArraySlice<MemoryType> MemoryTypeSlice;


@ -25,6 +25,7 @@ namespace {
TEST(TypesTest, DeviceTypeName) {
EXPECT_EQ("CPU", DeviceTypeString(DeviceType(DEVICE_CPU)));
EXPECT_EQ("GPU", DeviceTypeString(DeviceType(DEVICE_GPU)));
EXPECT_EQ("SYCL", DeviceTypeString(DeviceType(DEVICE_SYCL)));
}
TEST(TypesTest, kDataTypeRefOffset) {


@ -51,6 +51,17 @@ ConstantOp::~ConstantOp() {}
REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Const") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("dtype"), \
ConstantOp);
TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
#endif
#if GOOGLE_CUDA
#define REGISTER_KERNEL(D, TYPE) \
REGISTER_KERNEL_BUILDER( \


@ -18,6 +18,14 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
int32, int64);
#ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Round", functor::round, float);
namespace functor {
DEFINE_UNARY1(round, float);
} // namespace functor
#endif
#if GOOGLE_CUDA
REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double,
int32, int64);


@ -20,6 +20,10 @@ limitations under the License.
#define EIGEN_USE_THREADS
#ifdef TENSORFLOW_USE_SYCL
#include "tensorflow/core/kernels/cwise_ops_sycl_common.h"
#endif
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/kernels/cwise_ops_gradients.h"
@ -33,6 +37,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif
class BinaryOpShared : public OpKernel {
public:
@ -96,45 +103,45 @@ class BinaryOp : public BinaryOpShared {
if (state.in1_num_elements == 1) {
// tensor op scalar
functor::BinaryFunctor<Device, Functor, 1>().Right(
eigen_device, out_flat, in0.flat<Tin>(), in1.scalar<Tin>(),
error_ptr);
eigen_device, out_flat, in0.template flat<Tin>(),
in1.template scalar<Tin>(), error_ptr);
} else if (state.in0_num_elements == 1) {
// scalar op tensor
functor::BinaryFunctor<Device, Functor, 1>().Left(
eigen_device, out_flat, in0.scalar<Tin>(), in1.flat<Tin>(),
error_ptr);
eigen_device, out_flat, in0.template scalar<Tin>(),
in1.template flat<Tin>(), error_ptr);
} else {
functor::BinaryFunctor<Device, Functor, 1>()(
eigen_device, out_flat, in0.flat<Tin>(), in1.flat<Tin>(),
error_ptr);
eigen_device, out_flat, in0.template flat<Tin>(),
in1.template flat<Tin>(), error_ptr);
}
} else if (ndims == 2) {
functor::BinaryFunctor<Device, Functor, 2>().BCast(
eigen_device, out->shaped<Tout, 2>(bcast->result_shape()),
in0.shaped<Tin, 2>(bcast->x_reshape()),
in0.template shaped<Tin, 2>(bcast->x_reshape()),
BCast::ToIndexArray<2>(bcast->x_bcast()),
in1.shaped<Tin, 2>(bcast->y_reshape()),
in1.template shaped<Tin, 2>(bcast->y_reshape()),
BCast::ToIndexArray<2>(bcast->y_bcast()), error_ptr);
} else if (ndims == 3) {
functor::BinaryFunctor<Device, Functor, 3>().BCast(
eigen_device, out->shaped<Tout, 3>(bcast->result_shape()),
in0.shaped<Tin, 3>(bcast->x_reshape()),
in0.template shaped<Tin, 3>(bcast->x_reshape()),
BCast::ToIndexArray<3>(bcast->x_bcast()),
in1.shaped<Tin, 3>(bcast->y_reshape()),
in1.template shaped<Tin, 3>(bcast->y_reshape()),
BCast::ToIndexArray<3>(bcast->y_bcast()), error_ptr);
} else if (ndims == 4) {
functor::BinaryFunctor<Device, Functor, 4>().BCast(
eigen_device, out->shaped<Tout, 4>(bcast->result_shape()),
in0.shaped<Tin, 4>(bcast->x_reshape()),
in0.template shaped<Tin, 4>(bcast->x_reshape()),
BCast::ToIndexArray<4>(bcast->x_bcast()),
in1.shaped<Tin, 4>(bcast->y_reshape()),
in1.template shaped<Tin, 4>(bcast->y_reshape()),
BCast::ToIndexArray<4>(bcast->y_bcast()), error_ptr);
} else if (ndims == 5) {
functor::BinaryFunctor<Device, Functor, 5>().BCast(
eigen_device, out->shaped<Tout, 5>(bcast->result_shape()),
in0.shaped<Tin, 5>(bcast->x_reshape()),
in0.template shaped<Tin, 5>(bcast->x_reshape()),
BCast::ToIndexArray<5>(bcast->x_bcast()),
in1.shaped<Tin, 5>(bcast->y_reshape()),
in1.template shaped<Tin, 5>(bcast->y_reshape()),
BCast::ToIndexArray<5>(bcast->y_bcast()), error_ptr);
} else {
SetUnimplementedError(ctx);


@ -0,0 +1,138 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if !TENSORFLOW_USE_SYCL
#error This file must only be included when building TensorFlow with SYCL support
#endif
#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_
#define EIGEN_USE_SYCL
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace functor {
typedef Eigen::SyclDevice SYCLDevice;
template <typename OUT, typename RHS>
void Assign(const SYCLDevice& d, OUT out, RHS rhs) {
out.device(d) = rhs;
}
// Partial specialization of UnaryFunctor<Device=SYCLDevice, Functor>.
template <typename Functor>
struct UnaryFunctor<SYCLDevice, Functor> {
void operator()(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in) {
To32Bit(out).device(d) = To32Bit(in).unaryExpr(typename Functor::func());
}
};
// Partial specialization of BinaryFunctor<Device=SYCLDevice, Functor>.
template <typename Functor, int NDIMS, bool has_errors>
struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
void operator()(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in0,
typename Functor::tin_type in1, bool* error) {
Assign(d, out, in0.binaryExpr(in1, typename Functor::func()));
}
void Left(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tscalar_type scalar,
typename Functor::tin_type in, bool* error) {
LOG(FATAL) << "BinaryFunctor::Left NOT IMPLEMENTED ! ";
}
void Right(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in,
typename Functor::tscalar_type scalar, bool* error) {
typedef typename Functor::out_type Tout;
typedef typename Functor::in_type Tin;
typedef typename Functor::func Binary;
typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
Assign(d, out, in.unaryExpr(Unary(scalar.data())));
}
void BCast(const SYCLDevice& d,
typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1,
bool* error) {
LOG(FATAL) << "BinaryFunctor::BCast NOT IMPLEMENTED ";
}
};
// Macros to explicitly instantiate kernels on the SYCL device for multiple
// types (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt).
#define DEFINE_UNARY1(F, T) template struct UnaryFunctor<SYCLDevice, F<T> >
#define DEFINE_UNARY2(F, T0, T1) \
DEFINE_UNARY1(F, T0); \
DEFINE_UNARY1(F, T1)
#define DEFINE_UNARY3(F, T0, T1, T2) \
DEFINE_UNARY2(F, T0, T1); \
DEFINE_UNARY1(F, T2)
#define DEFINE_UNARY4(F, T0, T1, T2, T3) \
DEFINE_UNARY2(F, T0, T1); \
DEFINE_UNARY2(F, T2, T3)
#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \
DEFINE_UNARY2(F, T0, T1); \
DEFINE_UNARY3(F, T2, T3, T4)
// Macros to explicitly instantiate kernels on the SYCL device for multiple
// types (T0, T1, etc.) for BinaryFunctor.
#define DEFINE_BINARY1(F, T) \
template struct BinaryFunctor<SYCLDevice, F<T>, 1>; \
template struct BinaryFunctor<SYCLDevice, F<T>, 2>; \
template struct BinaryFunctor<SYCLDevice, F<T>, 3>
#define DEFINE_BINARY2(F, T0, T1) \
DEFINE_BINARY1(F, T0); \
DEFINE_BINARY1(F, T1)
#define DEFINE_BINARY3(F, T0, T1, T2) \
DEFINE_BINARY2(F, T0, T1); \
DEFINE_BINARY1(F, T2)
#define DEFINE_BINARY4(F, T0, T1, T2, T3) \
DEFINE_BINARY2(F, T0, T1); \
DEFINE_BINARY2(F, T2, T3)
#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \
DEFINE_BINARY2(F, T0, T1); \
DEFINE_BINARY3(F, T2, T3, T4)
#define DEFINE_BINARY6(F, T0, T1, T2, T3, T4, T5) \
DEFINE_BINARY3(F, T0, T1, T2); \
DEFINE_BINARY3(F, T3, T4, T5)
#define DEFINE_BINARY7(F, T0, T1, T2, T3, T4, T5, T6) \
DEFINE_BINARY3(F, T0, T1, T2); \
DEFINE_BINARY4(F, T3, T4, T5, T6)
#define DEFINE_BINARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \
DEFINE_BINARY4(F, T0, T1, T2, T3); \
DEFINE_BINARY4(F, T4, T5, T6, T7)
#define DEFINE_BINARY9(F, T0, T1, T2, T3, T4, T5, T6, T7, T8) \
DEFINE_BINARY4(F, T0, T1, T2, T3); \
DEFINE_BINARY5(F, T4, T5, T6, T7, T8)
#define DEFINE_BINARY10(F, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
DEFINE_BINARY5(F, T0, T1, T2, T3, T4); \
DEFINE_BINARY5(F, T5, T6, T7, T8, T9)
} // end namespace functor
} // end namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_


@ -87,6 +87,29 @@ class RetvalOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp);
REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp);
#if TENSORFLOW_USE_SYCL
#define REGISTER(type) \
REGISTER_KERNEL_BUILDER( \
Name("_Arg").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ArgOp);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg")
.Device(DEVICE_GPU)
.HostMemory("output")
.TypeConstraint<int32>("T"),
ArgOp);
#undef REGISTER
#define REGISTER(type) \
REGISTER_KERNEL_BUILDER( \
Name("_Retval").Device(DEVICE_SYCL).TypeConstraint<type>("T"), RetvalOp);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval")
.Device(DEVICE_GPU)
.HostMemory("input")
.TypeConstraint<int32>("T"),
RetvalOp);
#undef REGISTER
#endif
#define REGISTER(type) \
REGISTER_KERNEL_BUILDER( \
Name("_Arg").Device(DEVICE_GPU).TypeConstraint<type>("T"), ArgOp);


@ -34,6 +34,24 @@ REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault").Device(DEVICE_CPU),
REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \
Name("Identity").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
IdentityOp); \
REGISTER_KERNEL_BUILDER( \
Name("RefIdentity").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
IdentityOp); \
REGISTER_KERNEL_BUILDER( \
Name("StopGradient").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
IdentityOp)
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(bfloat16);
#undef REGISTER_SYCL_KERNEL
#endif
#define REGISTER_GPU_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \
Name("Identity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
@ -50,6 +68,7 @@ REGISTER_GPU_KERNEL(bfloat16);
#undef REGISTER_GPU_KERNEL
#if GOOGLE_CUDA
// A special GPU kernel for int32 and bool.
// TODO(b/25387198): Also enable int32 in device memory. This kernel


@ -20,4 +20,8 @@ namespace tensorflow {
REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp);
REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp);
#if TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_SYCL), NoOp);
#endif
} // namespace tensorflow


@ -78,6 +78,10 @@ void SendOp::Compute(OpKernelContext* ctx) {
REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
#if TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
#endif
REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
REGISTER_KERNEL_BUILDER(
Name("_HostSend").Device(DEVICE_GPU).HostMemory("tensor"), SendOp);
@ -136,6 +140,10 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
#if TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
#endif
REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
REGISTER_KERNEL_BUILDER(
Name("_HostRecv").Device(DEVICE_GPU).HostMemory("tensor"), RecvOp);


@ -10,6 +10,7 @@ exports_files(["LICENSE"])
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
load("@local_config_cuda//cuda:platform.bzl", "cuda_library_path")
load("@local_config_sycl//sycl:platform.bzl", "sycl_library_path")
cc_library(
name = "gtest",
@ -143,6 +144,21 @@ cc_library(
],
)
cc_library(
name = "sycl",
data = [
"@local_config_sycl//sycl:{}".format(sycl_library_path("ComputeCpp")),
],
linkopts = select({
"//conditions:default": [
"-Wl,-rpath,../local_config_sycl/sycl/lib",
],
}),
deps = [
"@local_config_sycl//sycl:syclrt",
],
)
filegroup(
name = "android_srcs",
srcs = glob(["*.h"]),


@ -142,6 +142,7 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
progress = true;
}
// Handle legacy naming convention for cpu and gpu.
if (str_util::ConsumePrefix(&fullname, "/cpu:") ||
str_util::ConsumePrefix(&fullname, "/CPU:")) {
p->has_type = true;


@ -1,11 +1,14 @@
# TensorFlow external dependencies that can be loaded in WORKSPACE files.
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
# If TensorFlow is linked as a submodule.
# path_prefix and tf_repo_name are no longer used.
def tf_workspace(path_prefix = "", tf_repo_name = ""):
cuda_configure(name = "local_config_cuda")
sycl_configure(name = "local_config_sycl")
if path_prefix:
print("path_prefix was specified to tf_workspace but is no longer used and will be removed in the future.")
if tf_repo_name:
@ -14,8 +17,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
# These lines need to be changed when updating Eigen. They are parsed from
# this file by the cmake and make builds to determine the eigen version and
# hash.
eigen_version = "22b492048b2f"
eigen_sha256 = "8b9bd14a037c1a3fe37dc5e4a71504ebe48148cf2498fd8eb6848165a7a0538f"
eigen_version = "3f0fb403ec4c"
eigen_sha256 = "9ff8301c6af2640932c5ded77ecccee5786cec8c31315311220618b312e0472b"
native.new_http_archive(
name = "eigen_archive",


@ -23,5 +23,8 @@ cc_library(
"unsupported/Eigen/CXX11/FixedPoint",
],
visibility = ["//visibility:public"],
deps = ["@eigen_archive//:eigen"],
deps = [
"@eigen_archive//:eigen",
"@local_config_sycl//sycl:sycl",
],
)

third_party/sycl/BUILD | new empty file

third_party/sycl/crosstool/BUILD | new empty file

third_party/sycl/crosstool/BUILD.tpl | 29 lines (new executable file)

@ -0,0 +1,29 @@
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
cc_toolchain_suite(
name = "toolchain",
toolchains = {
"local|compiler": ":cc-compiler-local",
},
)
cc_toolchain(
name = "cc-compiler-local",
all_files = ":empty",
compiler_files = ":empty",
cpu = "local",
dwp_files = ":empty",
dynamic_runtime_libs = [":empty"],
linker_files = ":empty",
objcopy_files = ":empty",
static_runtime_libs = [":empty"],
strip_files = ":empty",
supports_param_files = 0,
)
filegroup(
name = "empty",
srcs = [],
)

third_party/sycl/crosstool/CROSSTOOL.tpl | 82 lines (new executable file)

@ -0,0 +1,82 @@
major_version: "local"
minor_version: ""
default_target_cpu: "same_as_host"
default_toolchain {
cpu: "k8"
toolchain_identifier: "local_linux"
}
toolchain {
abi_version: "local"
abi_libc_version: "local"
builtin_sysroot: ""
compiler: "compiler"
host_system_name: "local"
needsPic: true
supports_gold_linker: false
supports_incremental_linker: false
supports_fission: false
supports_interface_shared_objects: false
supports_normalizing_ar: false
supports_start_end_lib: false
supports_thin_archives: false
target_libc: "local"
target_cpu: "local"
target_system_name: "local"
toolchain_identifier: "local_linux"
tool_path { name: "ar" path: "/usr/bin/ar" }
tool_path { name: "compat-ld" path: "/usr/bin/ld" }
tool_path { name: "cpp" path: "/usr/bin/cpp" }
tool_path { name: "dwp" path: "/usr/bin/dwp" }
tool_path { name: "gcc" path: "computecpp" }
# Use "-std=c++11" for nvcc. For consistency, force both the host compiler
# and the device compiler to use "-std=c++11".
cxx_flag: "-std=c++11"
linker_flag: "-lstdc++"
linker_flag: "-B/usr/bin/"
# TODO(bazel-team): In theory, the path here ought to exactly match the path
# used by gcc. That works because bazel currently doesn't track files at
# absolute locations and has no remote execution, yet. However, this will need
# to be fixed, maybe with auto-detection?
cxx_builtin_include_directory: "/usr/lib/gcc/"
cxx_builtin_include_directory: "/usr/lib"
cxx_builtin_include_directory: "/usr/lib64"
cxx_builtin_include_directory: "/usr/local/include"
cxx_builtin_include_directory: "/usr/include"
cxx_builtin_include_directory: "%{computecpp_toolkit_path}"
tool_path { name: "gcov" path: "/usr/bin/gcov" }
# C(++) compiles invoke the compiler (as that is the one knowing where
# to find libraries), but we provide LD so other rules can invoke the linker.
tool_path { name: "ld" path: "/usr/bin/ld" }
tool_path { name: "nm" path: "/usr/bin/nm" }
tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
objcopy_embed_flag: "-I"
objcopy_embed_flag: "binary"
tool_path { name: "objdump" path: "/usr/bin/objdump" }
tool_path { name: "strip" path: "/usr/bin/strip" }
# Make C++ compilation deterministic. Use linkstamping instead of these
# compiler symbols.
unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
# All warnings are enabled. Maybe enable -Werror as well?
compiler_flag: "-Wall"
# Anticipated future default.
linker_flag: "-Wl,-no-as-needed"
# Stamp the binary with a unique identifier.
linker_flag: "-Wl,--build-id=md5"
linker_flag: "-Wl,--hash-style=gnu"
linking_mode_flags { mode: DYNAMIC }
}

third_party/sycl/crosstool/computecpp.tpl | 61 lines (new executable file)

@ -0,0 +1,61 @@
#!/usr/bin/env python

from argparse import ArgumentParser
import os
import subprocess
import re
import sys
import pipes

CPU_CXX_COMPILER = ('%{host_cxx_compiler}')
CPU_C_COMPILER = ('%{host_c_compiler}')

CURRENT_DIR = os.path.dirname(sys.argv[0])
COMPUTECPP_ROOT = CURRENT_DIR + "/../sycl/"
COMPUTECPP_DRIVER = COMPUTECPP_ROOT + "bin/compute++"
COMPUTECPP_INCLUDE = COMPUTECPP_ROOT + "include"

def main():
  computecpp_compiler_flags = [flag for flag in sys.argv[1:]]
  computecpp_compiler_flags = computecpp_compiler_flags + ["-D_GLIBCXX_USE_CXX11_ABI=0"]

  output_file_index = computecpp_compiler_flags.index("-o") + 1
  output_file_name = computecpp_compiler_flags[output_file_index]

  if output_file_index == 1:
    # we are linking
    return subprocess.call([CPU_CXX_COMPILER] + computecpp_compiler_flags)

  # find out what we are compiling
  compiling_cpp = 0
  if "-c" in computecpp_compiler_flags:
    compiled_file_index = computecpp_compiler_flags.index("-c") + 1
    compiled_file_name = computecpp_compiler_flags[compiled_file_index]
    if compiled_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx')):
      compiling_cpp = 1

  if compiling_cpp == 1:
    filename, file_extension = os.path.splitext(output_file_name)
    bc_out = filename + ".sycl"

    # First pass: run the ComputeCpp device compiler and emit the SYCL
    # integration output next to the object file.
    computecpp_compiler_flags = ['-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem',
        COMPUTECPP_INCLUDE, "-std=c++11", "-sycl", "-emit-llvm", "-no-serial-memop"] + computecpp_compiler_flags

    # Drop the dependency-file flags (-MF/-MD and *.d arguments) from the host
    # pass; they are already handled by the ComputeCpp invocation above.
    host_compiler_flags = [flag for flag in sys.argv[1:]
                           if not flag.startswith(('-MF', '-MD',))
                           if not ".d" in flag]

    x = subprocess.call([COMPUTECPP_DRIVER] + computecpp_compiler_flags)
    if x == 0:
      # Second pass: compile with the host compiler, force-including the
      # output of the device compilation.
      host_compiler_flags = ['-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, "--include", bc_out] + host_compiler_flags
      return subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
    return x
  else:
    # compile as C
    return subprocess.call([CPU_C_COMPILER] + computecpp_compiler_flags)

if __name__ == '__main__':
  sys.exit(main())

third_party/sycl/sycl/BUILD | new empty file

third_party/sycl/sycl/BUILD.tpl | 43 lines (new executable file)

@ -0,0 +1,43 @@
licenses(["notice"]) # Apache 2.0
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
load("platform", "sycl_library_path")
load("platform", "readlink_command")
package(default_visibility = ["//visibility:public"])
config_setting(
name = "using_sycl",
values = {
"define": "using_sycl=true",
},
)
cc_library(
name = "sycl_headers",
hdrs = glob([
"**/*.h",
]),
includes = [".", "include"],
)
cc_library(
name = "syclrt",
srcs = [
sycl_library_path("ComputeCpp")
],
data = [
sycl_library_path("ComputeCpp")
],
includes = ["include/"],
linkstatic = 1,
)
cc_library(
name = "sycl",
deps = if_sycl([
":sycl_headers",
":syclrt",
]),
)

third_party/sycl/sycl/build_defs.bzl.tpl | 13 lines (new executable file)

@ -0,0 +1,13 @@
# Macros for building SYCL code.
def if_sycl(if_true, if_false = []):
"""Shorthand for select()'ing on whether we're building with SYCL.
Returns a select statement which evaluates to if_true if we're building
with SYCL enabled. Otherwise, the select statement evaluates to if_false.
"""
return select({
"@local_config_sycl//sycl:using_sycl": if_true,
"//conditions:default": if_false
})
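The using_sycl config_setting in sycl/BUILD.tpl matches the --define=using_sycl=true that --config=sycl sets at the end of this change, so if_sycl lets a target pull in SYCL-only dependencies. A hypothetical BUILD target showing the intended use (the target and file names are made up for illustration):

load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")

# Hypothetical kernel library: the ComputeCpp runtime is linked only when
# building with --config=sycl (i.e. --define=using_sycl=true).
cc_library(
    name = "my_sycl_kernels",
    srcs = ["my_sycl_kernels.cc"],
    deps = [
        "//third_party/eigen3",
    ] + if_sycl(["@local_config_sycl//sycl:syclrt"]),
)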

third_party/sycl/sycl/platform.bzl.tpl | 5 lines (new executable file)

@ -0,0 +1,5 @@
def sycl_library_path(name):
return "lib/lib{}.so".format(name)
def readlink_command():
return "readlink"

third_party/sycl/sycl_configure.bzl | 197 lines (new file)

@ -0,0 +1,197 @@
# -*- Python -*-
"""SYCL autoconfiguration.
`sycl_configure` depends on the following environment variables:
* HOST_CXX_COMPILER: The host C++ compiler
* HOST_C_COMPILER: The host C compiler
* COMPUTECPP_TOOLKIT_PATH: The path to the ComputeCpp toolkit.
"""
_HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
_HOST_C_COMPILER = "HOST_C_COMPILER"
_COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
def _enable_sycl(repository_ctx):
if "TF_NEED_OPENCL" in repository_ctx.os.environ:
enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL"].strip()
return enable_sycl == "1"
return False
def auto_configure_fail(msg):
"""Output failure message when auto configuration fails."""
red = "\033[0;31m"
no_color = "\033[0m"
fail("\n%sAuto-Configuration Error:%s %s\n" % (red, no_color, msg))
# END cc_configure common functions (see TODO above).
def find_c(repository_ctx):
"""Find host C compiler."""
c_name = "gcc"
if _HOST_C_COMPILER in repository_ctx.os.environ:
c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
if c_name.startswith("/"):
return c_name
c = repository_ctx.which(c_name)
if c == None:
fail("Cannot find C compiler, please correct your path.")
return c
def find_cc(repository_ctx):
"""Find host C++ compiler."""
cc_name = "g++"
if _HOST_CXX_COMPILER in repository_ctx.os.environ:
cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
if cc_name.startswith("/"):
return cc_name
cc = repository_ctx.which(cc_name)
if cc == None:
fail("Cannot find C++ compiler, please correct your path.")
return cc
def find_computecpp_root(repository_ctx):
"""Find ComputeCpp compiler."""
sycl_name = ""
if _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ:
sycl_name = repository_ctx.os.environ[_COMPUTECPP_TOOLKIT_PATH].strip()
if sycl_name.startswith("/"):
return sycl_name
fail( "Cannot find SYCL compiler, please correct your path")
def _check_lib(repository_ctx, toolkit_path, lib):
"""Checks if lib exists under sycl_toolkit_path or fail if it doesn't.
Args:
repository_ctx: The repository context.
toolkit_path: The toolkit directory containing the libraries.
lib: The library to look for under toolkit_path.
"""
lib_path = toolkit_path + "/" + lib
if not repository_ctx.path(lib_path).exists:
auto_configure_fail("Cannot find %s" % lib_path)
def _check_dir(repository_ctx, directory):
"""Checks whether the directory exists and fail if it does not.
Args:
repository_ctx: The repository context.
directory: The directory to check the existence of.
"""
if not repository_ctx.path(directory).exists:
auto_configure_fail("Cannot find dir: %s" % directory)
def _symlink_dir(repository_ctx, src_dir, dest_dir):
"""Symlinks all the files in a directory.
Args:
repository_ctx: The repository context.
src_dir: The source directory.
dest_dir: The destination directory to create the symlinks in.
"""
files = repository_ctx.path(src_dir).readdir()
for src_file in files:
repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
def _tpl(repository_ctx, tpl, substitutions={}, out=None):
if not out:
out = tpl.replace(":", "/")
repository_ctx.template(
out,
Label("//third_party/sycl/%s.tpl" % tpl),
substitutions)
def _file(repository_ctx, label):
repository_ctx.template(
label.replace(":", "/"),
Label("//third_party/sycl/%s.tpl" % label),
{})
_DUMMY_CROSSTOOL_BZL_FILE = """
def error_sycl_disabled():
fail("ERROR: Building with --config=sycl but TensorFlow is not configured " +
"to build with SYCL support. Please re-run ./configure and enter 'Y' " +
"at the prompt to build with SYCL support.")
native.genrule(
name = "error_gen_crosstool",
outs = ["CROSSTOOL"],
cmd = "echo 'Should not be run.' && exit 1",
)
native.filegroup(
name = "crosstool",
srcs = [":CROSSTOOL"],
output_licenses = ["unencumbered"],
)
"""
_DUMMY_CROSSTOOL_BUILD_FILE = """
load("//crosstool:error_sycl_disabled.bzl", "error_sycl_disabled")
error_sycl_disabled()
"""
def _create_dummy_repository(repository_ctx):
# Set up BUILD file for sycl/.
_file(repository_ctx, "sycl:build_defs.bzl")
_file(repository_ctx, "sycl:BUILD")
_file(repository_ctx, "sycl:platform.bzl")
# Create dummy files for the SYCL toolkit since they are still required by
# tensorflow/core/platform/default/build_config:sycl.
repository_ctx.file("sycl/include/sycl.hpp", "")
repository_ctx.file("sycl/lib/libComputeCpp.so", "")
# If sycl_configure is not configured to build with SYCL support, and the user
# attempts to build with --config=sycl, add a dummy build rule to intercept
# this and fail with an actionable error message.
repository_ctx.file("crosstool/error_sycl_disabled.bzl",
_DUMMY_CROSSTOOL_BZL_FILE)
repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
def _sycl_autoconf_imp(repository_ctx):
"""Implementation of the sycl_autoconf rule."""
if not _enable_sycl(repository_ctx):
_create_dummy_repository(repository_ctx)
else:
# copy template files
_file(repository_ctx, "sycl:build_defs.bzl")
_file(repository_ctx, "sycl:BUILD")
_file(repository_ctx, "sycl:platform.bzl")
_file(repository_ctx, "crosstool:BUILD")
_tpl(repository_ctx, "crosstool:computecpp",
{
"%{host_cxx_compiler}" : find_cc(repository_ctx),
"%{host_c_compiler}" : find_c(repository_ctx),
})
computecpp_root = find_computecpp_root(repository_ctx);
_check_dir(repository_ctx, computecpp_root)
_tpl(repository_ctx, "crosstool:CROSSTOOL",
{
"%{computecpp_toolkit_path}" : computecpp_root,
})
# symlink libraries
_check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
_symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
_symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
_symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
sycl_configure = repository_rule(
implementation = _sycl_autoconf_imp,
local = True,
)
"""Detects and configures the SYCL toolchain.
Add the following to your WORKSPACE FILE:
```python
sycl_configure(name = "local_config_sycl")
```
Args:
name: A unique name for this workspace rule.
"""


@ -1,6 +1,9 @@
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl --define=using_sycl=true
build --force_python=py$PYTHON_MAJOR_VERSION
build --host_force_python=py$PYTHON_MAJOR_VERSION
build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY