From ace7c0e11cdc33c65fe5f6f650382d90fb30a901 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 3 Jun 2016 10:24:45 -0700
Subject: [PATCH 01/51] Pull the latest Eigen version that supports OpenCL

---
 eigen.BUILD                                       | 2 +-
 tensorflow/workspace.bzl                          | 4 ++--
 third_party/eigen3/Eigen/Cholesky                 | 2 +-
 third_party/eigen3/Eigen/Core                     | 2 +-
 third_party/eigen3/Eigen/Eigenvalues              | 2 +-
 third_party/eigen3/Eigen/LU                       | 2 +-
 third_party/eigen3/Eigen/QR                       | 2 +-
 third_party/eigen3/unsupported/Eigen/CXX11/Tensor | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/eigen.BUILD b/eigen.BUILD
index 79bafe65b62..e834e495151 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -1,6 +1,6 @@
 package(default_visibility = ["//visibility:public"])
 
-archive_dir = "eigen-eigen-d02e6a705c30"
+archive_dir = "benoitsteiner-opencl-9d4a08d57d0d"
 
 cc_library(
     name = "eigen",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 07f83651e02..e263c3658b4 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -6,8 +6,8 @@
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
     name = "eigen_archive",
-    url = "https://bitbucket.org/eigen/eigen/get/d02e6a705c30.tar.gz",
-    sha256 = "532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf",
+    url = "https://bitbucket.org/benoitsteiner/opencl/get/9d4a08d57d0d.tar.gz",
+    sha256 = "2b736059052affcfa1f9a645c5e3a655ea4ada6dc0ba2d97a9b61902156bbafc",
     build_file = path_prefix + "eigen.BUILD",
   )
 
diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky
index 56059bcc61c..bb3b599c957 100644
--- a/third_party/eigen3/Eigen/Cholesky
+++ b/third_party/eigen3/Eigen/Cholesky
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/Cholesky"
+#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Cholesky"
diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core
index c1d4a2e0f8c..95a48a30d84 100644
--- a/third_party/eigen3/Eigen/Core
+++ b/third_party/eigen3/Eigen/Core
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/Core"
+#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Core"
diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues
index 0a0731ba19b..21ffbb5202b 100644
--- a/third_party/eigen3/Eigen/Eigenvalues
+++ b/third_party/eigen3/Eigen/Eigenvalues
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/Eigenvalues"
+#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Eigenvalues"
diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU
index d6b39b8d235..3263cf2b828 100644
--- a/third_party/eigen3/Eigen/LU
+++ b/third_party/eigen3/Eigen/LU
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/LU"
+#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/LU"
diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR
index a5406e93bc6..cf138a470f1 100644
--- a/third_party/eigen3/Eigen/QR
+++ b/third_party/eigen3/Eigen/QR
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/QR"
+#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/QR"
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index 4f730236b78..33742177dd8 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/unsupported/Eigen/CXX11/Tensor"
+#include "benoitsteiner-opencl-9d4a08d57d0d/unsupported/Eigen/CXX11/Tensor"

From e3f884b496b8548d9f5d46c2f1c487a957d3f341 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 12 Oct 2016 22:52:59 -0700
Subject: [PATCH 02/51] Switch to the latest version of Eigen that supports
 OpenCL

---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2f9b648f1a4..ac2a22ee548 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -14,14 +14,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   # These lines need to be changed when updating Eigen. They are parsed from
   # this file by the cmake and make builds to determine the eigen version and
   # hash.
-  eigen_version = "97c1ebe6ccc2"
-  eigen_sha256 = "58ab9fa44391c850d783fe0867f42a00b5300293b7d73bbbbc8756c2e649fea2"
+  eigen_version = "aad63574941c"
+  eigen_sha256 = ""
 
   native.new_http_archive(
     name = "eigen_archive",
-    url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
+    url = "http://bitbucket.org/benoitsteiner/opencl/get/" + eigen_version + ".tar.gz",
     sha256 = eigen_sha256,
-    strip_prefix = "eigen-eigen-" + eigen_version,
+    strip_prefix = "benoitsteiner-opencl-" + eigen_version,
     build_file = str(Label("//:eigen.BUILD")),
   )
 

From 21f822cb5eb117ef2bd42fab060175ad7b98e505 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Fri, 7 Oct 2016 15:13:17 +0100
Subject: [PATCH 03/51] Build system that works with ComputeCpp CE.

---
 configure                             |  74 ++++++++++++-
 third_party/sycl/BUILD                |  44 ++++++++
 third_party/sycl/build_defs.bzl       |  10 ++
 third_party/sycl/crosstool/BUILD      |  28 +++++
 third_party/sycl/crosstool/CROSSTOOL  |  82 +++++++++++++++
 third_party/sycl/crosstool/computecpp |  61 +++++++++++
 third_party/sycl/platform.bzl         |  17 +++
 third_party/sycl/sycl_config.sh       | 143 ++++++++++++++++++++++++++
 tools/bazel.rc.template               |   3 +
 9 files changed, 461 insertions(+), 1 deletion(-)
 create mode 100755 third_party/sycl/BUILD
 create mode 100755 third_party/sycl/build_defs.bzl
 create mode 100755 third_party/sycl/crosstool/BUILD
 create mode 100755 third_party/sycl/crosstool/CROSSTOOL
 create mode 100755 third_party/sycl/crosstool/computecpp
 create mode 100755 third_party/sycl/platform.bzl
 create mode 100755 third_party/sycl/sycl_config.sh

diff --git a/configure b/configure
index 933bd573578..426071e48d0 100755
--- a/configure
+++ b/configure
@@ -126,6 +126,17 @@ GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
 chmod a+x ${GEN_GIT_SOURCE}
 "${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}"
 
+## Set up SYCL-related environment settings
+while [ "$TF_NEED_OPENCL" == "" ]; do
+  read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT
+  case $INPUT in
+    [Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;;
+    [Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
+    "" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
+    * ) echo "Invalid selection: " $INPUT;;
+  esac
+done
+
 ## Set up Cuda-related environment settings
 
 while [ "$TF_NEED_CUDA" == "" ]; do
@@ -139,12 +150,14 @@ while [ "$TF_NEED_CUDA" == "" ]; do
 done
 
 export TF_NEED_CUDA
-if [ "$TF_NEED_CUDA" == "0" ]; then
+export TF_NEED_SYCL
+if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
   echo "Configuration finished"
   bazel_clean_and_fetch
   exit
 fi
 
+if [ "$TF_NEED_CUDA" == "1" ]; then
 # Set up which gcc nvcc should use as the host compiler
 while true; do
   fromuser=""
@@ -346,6 +359,65 @@ EOF
   TF_CUDA_COMPUTE_CAPABILITIES=""
 done
 
+# end of if "$TF_NEED_CUDA" == "1"
+fi
+
+# OpenCL configuration
+
+if [ "$TF_NEED_OPENCL" == "1" ]; then
+while true; do
+  # Configure the OPENCL version to use.
+  TF_OPENCL_VERSION="1.2"
+
+  # Point to ComputeCPP root
+  if [ -z "$COMPUTECPP_PATH" ]; then
+    default_computecpp_path=/usr/local/computecpp
+    read -p "Please specify the location where ComputeCPP $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_path]: " COMPUTECPP_PATH
+    fromuser="1"
+    if [ -z "$COMPUTECPP_PATH" ]; then
+      COMPUTECPP_PATH=$default_computecpp_path
+    fi
+  fi
+
+  if [ "$OSNAME" == "Linux" ]; then
+    SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
+  fi
+
+  if [ -e "${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}" ]; then
+    break
+  fi
+  echo "Invalid path to SYCL $TF_OPENCL_VERSION library. ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
+
+  if [ -z "$fromuser" ]; then
+    exit 1
+  fi
+  # Retry
+  TF_OPENCL_VERSION=""
+  COMPUTECPP_PATH=""
+done
+
+cat > third_party/sycl/sycl.config <<EOF
+# COMPUTECPP_PATH refers to the ComputeCPP toolkit.
+COMPUTECPP_PATH="$COMPUTECPP_PATH"
+
+# The OpenCL version that should be used in this build
+TF_OPENCL_VERSION=$TF_OPENCL_VERSION
+
+EOF
+
+export WARNING=$DO_NOT_SUBMIT_WARNING
+perl -pi -e "s,#cxx_builtin_include_directory: {COMPUTECPP_INCLUDE},# \$ENV{WARNING}\ncxx_builtin_include_directory: \"${COMPUTECPP_PATH}\",s" third_party/sycl/crosstool/CROSSTOOL
+
+# Configure the platform name.
+perl -pi -e "s,PLATFORM = \".*\",PLATFORM = \"$OSNAME\",s" third_party/sycl/platform.bzl
+
+
+# Invoke the cuda_config.sh and set up the TensorFlow's canonical view of the Cuda libraries
+(cd third_party/sycl; ./sycl_config.sh;) || exit -1
+
+# end of if "$TF_NEED_OPENCL" == "1"
+fi
+
 bazel_clean_and_fetch
 
 echo "Configuration finished"
diff --git a/third_party/sycl/BUILD b/third_party/sycl/BUILD
new file mode 100755
index 00000000000..4aadc808302
--- /dev/null
+++ b/third_party/sycl/BUILD
@@ -0,0 +1,44 @@
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+load("//third_party/sycl:build_defs.bzl", "if_sycl")
+load("platform", "sycl_library_path")
+load("platform", "sycl_static_library_path")
+
+load("platform", "readlink_command")
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_sycl",
+    values = {
+        "define": "using_sycl=true",
+    },
+)
+
+cc_library(
+    name = "sycl_headers",
+    hdrs = glob([
+        "**/*.h",
+    ]),
+    includes = [".", "include"],
+)
+
+cc_library(
+    name = "syclrt",
+    srcs = [
+        sycl_library_path("ComputeCpp")
+    ],
+    data = [
+        sycl_library_path("ComputeCpp")
+    ],
+    includes = ["include/"],
+    linkstatic = 1,
+)
+
+cc_library(
+    name = "sycl",
+    deps = if_sycl([
+        ":sycl_headers",
+        ":syclrt",
+    ]),
+)
diff --git a/third_party/sycl/build_defs.bzl b/third_party/sycl/build_defs.bzl
new file mode 100755
index 00000000000..1aaeb741a21
--- /dev/null
+++ b/third_party/sycl/build_defs.bzl
@@ -0,0 +1,10 @@
+# Macros for building SYCL code.
+def if_sycl(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with SYCL.
+    Returns a select statement which evaluates to if_true if we're building
+    with SYCL enabled.  Otherwise, the select statement evaluates to if_false.
+    """
+    return select({
+        "//third_party/sycl:using_sycl": if_true,
+        "//conditions:default": if_false
+    })
diff --git a/third_party/sycl/crosstool/BUILD b/third_party/sycl/crosstool/BUILD
new file mode 100755
index 00000000000..eac4dc7fad8
--- /dev/null
+++ b/third_party/sycl/crosstool/BUILD
@@ -0,0 +1,28 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "crosstool",
+    srcs = ["CROSSTOOL"],
+    output_licenses = ["unencumbered"],
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
diff --git a/third_party/sycl/crosstool/CROSSTOOL b/third_party/sycl/crosstool/CROSSTOOL
new file mode 100755
index 00000000000..a96d33d522f
--- /dev/null
+++ b/third_party/sycl/crosstool/CROSSTOOL
@@ -0,0 +1,82 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  supports_thin_archives: false
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcc" path: "computecpp" }
+  # Use "-std=c++11" for nvcc. For consistency, force both the host compiler
+  # and the device compiler to use "-std=c++11".
+  cxx_flag: "-std=c++11"
+  linker_flag: "-lstdc++"
+  linker_flag: "-B/usr/bin/"
+
+  # TODO(bazel-team): In theory, the path here ought to exactly match the path
+  # used by gcc. That works because bazel currently doesn't track files at
+  # absolute locations and has no remote execution, yet. However, this will need
+  # to be fixed, maybe with auto-detection?
+  cxx_builtin_include_directory: "/usr/lib/gcc/"
+  cxx_builtin_include_directory: "/usr/lib"
+  cxx_builtin_include_directory: "/usr/lib64"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/include"
+
+  #cxx_builtin_include_directory: {COMPUTECPP_INCLUDE}
+
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+
+  # C(++) compiles invoke the compiler (as that is the one knowing where
+  # to find libraries), but we provide LD so other rules can invoke the linker.
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  # All warnings are enabled. Maybe enable -Werror as well?
+  compiler_flag: "-Wall"
+
+  # Anticipated future default.
+  linker_flag: "-Wl,-no-as-needed"
+  # Stamp the binary with a unique identifier.
+  linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--hash-style=gnu"
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/sycl/crosstool/computecpp b/third_party/sycl/crosstool/computecpp
new file mode 100755
index 00000000000..0b8427fd0ae
--- /dev/null
+++ b/third_party/sycl/crosstool/computecpp
@@ -0,0 +1,61 @@
+#!/usr/bin/env python2.7
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+CPU_CXX_COMPILER = ('/usr/bin/clang++-3.6')
+CPU_C_COMPILER = ('/usr/bin/clang-3.6')
+
+CURRENT_DIR = os.path.dirname(sys.argv[0])
+COMPUTECPP_ROOT = CURRENT_DIR +"/../"
+COMPUTECPP_DRIVER= COMPUTECPP_ROOT+"bin/compute++"
+COMPUTECPP_INCLUDE = COMPUTECPP_ROOT+"include"
+
+def main():
+  computecpp_compiler_flags = [""]
+  computecpp_compiler_flags = [flag for flag in sys.argv[1:]]
+  computecpp_compiler_flags = computecpp_compiler_flags + ["-D_GLIBCXX_USE_CXX11_ABI=0"]
+
+  output_file_index = computecpp_compiler_flags.index("-o") +1
+  output_file_name = computecpp_compiler_flags[output_file_index]
+
+  if(output_file_index == 1):
+    # we are linking
+    return subprocess.call([CPU_CXX_COMPILER] +computecpp_compiler_flags )
+
+  # find what we compile
+  compiling_cpp = 0
+  if("-c" in computecpp_compiler_flags):
+      compiled_file_index = computecpp_compiler_flags.index("-c") +1
+      compited_file_name = computecpp_compiler_flags[compiled_file_index]
+      if(compited_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))):
+          compiling_cpp = 1;
+
+  if(compiling_cpp == 1):
+      filename, file_extension = os.path.splitext(output_file_name)
+      bc_out = filename + ".sycl"
+
+      computecpp_compiler_flags = ['-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable','-I', COMPUTECPP_INCLUDE,'-isystem',
+      COMPUTECPP_INCLUDE, "-std=c++11", "-sycl", "-emit-llvm", "-no-serial-memop"] + computecpp_compiler_flags
+
+      # dont want that in case of compiling with computecpp first
+      host_compiler_flags = [""]
+      host_compiler_flags = [flag for flag in sys.argv[1:]
+                                if not flag.startswith(('-MF','-MD',))
+                                if not ".d" in flag]
+
+      x = subprocess.call([COMPUTECPP_DRIVER] +computecpp_compiler_flags )
+      if(x == 0):
+          host_compiler_flags = ['-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, "--include",bc_out] + host_compiler_flags
+          return subprocess.call([CPU_CXX_COMPILER] +host_compiler_flags )
+      return x
+  else:
+    # compile for C
+    return subprocess.call([CPU_C_COMPILER] +computecpp_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/sycl/platform.bzl b/third_party/sycl/platform.bzl
new file mode 100755
index 00000000000..8905b496b3b
--- /dev/null
+++ b/third_party/sycl/platform.bzl
@@ -0,0 +1,17 @@
+SYCL_VERSION = ""
+PLATFORM = ""
+
+def sycl_sdk_version():
+  return SYCL_VERSION
+
+def sycl_library_path(name, version = sycl_sdk_version()):
+  if not version:
+    return "lib/lib{}.so".format(name)
+  else:
+    return "lib/lib{}.so.{}".format(name, version)
+
+def sycl_static_library_path(name):
+  return "lib/lib{}_static.a".format(name)
+
+def readlink_command():
+    return "readlink"
diff --git a/third_party/sycl/sycl_config.sh b/third_party/sycl/sycl_config.sh
new file mode 100755
index 00000000000..7fa93110708
--- /dev/null
+++ b/third_party/sycl/sycl_config.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+# A simple script to configure the SYCL tree needed for the TensorFlow OpenCL
+# build. We need both COMPUTECPP toolkit $TF_OPENCL_VERSION.
+# Useage:
+#    * User edit sycl.config to point ComputeCPP toolkit to its local path
+#    * run sycl_config.sh to generate symbolic links in the source tree to reflect
+#    * the file organizations needed by TensorFlow.
+
+print_usage() {
+cat << EOF
+Usage: $0 [--check]
+  Configure TensorFlow's canonical view of SYCL libraries using sycl.config.
+Arguments:
+  --check: Only check that the proper SYCL dependencies has already been
+       properly configured in the source tree. It also creates symbolic links to
+       the files in the gen-tree to make bazel happy.
+EOF
+}
+
+CHECK_ONLY=0
+# Parse the arguments. Add more arguments as the "case" line when needed.
+while [[ $# -gt 0 ]]; do
+  argument="$1"
+  shift
+  case $argument in
+    --check)
+      CHECK_ONLY=1
+      ;;
+    *)
+      echo "Error: unknown arguments"
+      print_usage
+      exit -1
+      ;;
+  esac
+done
+
+source sycl.config || exit -1
+
+OUTPUTDIR=${OUTPUTDIR:-../..}
+COMPUTECPP_PATH=${COMPUTECPP_PATH:-/usr/local/computecpp}
+
+# An error message when the SYCL toolkit is not found
+function SYCLError {
+  echo ERROR: $1
+cat << EOF
+##############################################################################
+##############################################################################
+SYCL $TF_OPENCL_VERSION toolkit is missing.
+1. Download and install the ComputeCPP $TF_OPENCL_VERSION toolkit;
+2. Run configure from the root of the source tree, before rerunning bazel;
+Please refer to README.md for more details.
+##############################################################################
+##############################################################################
+EOF
+  exit -1
+}
+
+# Check that the SYCL libraries have already been properly configured in the source tree.
+# We still need to create links to the gen-tree to make bazel happy.
+function CheckAndLinkToSrcTree {
+  ERROR_FUNC=$1
+  FILE=$2
+  if test ! -e $FILE; then
+    $ERROR_FUNC "$PWD/$FILE cannot be found"
+  fi
+
+  # Link the output file to the source tree, avoiding self links if they are
+  # the same. This could happen if invoked from the source tree by accident.
+  if [ ! $($READLINK_CMD -f $PWD) == $($READLINK_CMD -f $OUTPUTDIR/third_party/sycl) ]; then
+    mkdir -p $(dirname $OUTPUTDIR/third_party/sycl/$FILE)
+    ln -sf $PWD/$FILE $OUTPUTDIR/third_party/sycl/$FILE
+  fi
+}
+
+OSNAME=`uname -s`
+if [ "$OSNAME" == "Linux" ]; then
+  SYCL_LIB_PATH="lib"
+  SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
+  SYCL_RT_LIB_STATIC_PATH="lib/libComputeCpp.a"
+  READLINK_CMD="readlink"
+fi
+
+if [ "$CHECK_ONLY" == "1" ]; then
+  CheckAndLinkToSrcTree SYCLError include/SYCL/sycl.h
+  CheckAndLinkToSrcTree SYCLError $SYCL_RT_LIB_STATIC_PATH
+  CheckAndLinkToSrcTree CudaError $SYCL_RT_LIB_PATH
+  exit 0
+fi
+
+# Actually configure the source tree for TensorFlow's canonical view of SYCL
+# libraries.
+
+if test ! -e ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}; then
+  SYCLError "cannot find ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}"
+fi
+
+# Helper function to build symbolic links for all files in a directory.
+function LinkOneDir {
+  SRC_PREFIX=$1
+  DST_PREFIX=$2
+  SRC_DIR=$3
+  DST_DIR=$(echo $SRC_DIR | sed "s,^$SRC_PREFIX,$DST_PREFIX,")
+  mkdir -p $DST_DIR
+  FILE_LIST=$(find -L $SRC_DIR -maxdepth 1 -type f)
+  if test "$FILE_LIST" != ""; then
+    ln -sf $FILE_LIST $DST_DIR/ || exit -1
+  fi
+}
+export -f LinkOneDir
+
+# Build links for all files in the directory, including subdirectories.
+function LinkAllFiles {
+  SRC_DIR=$1
+  DST_DIR=$2
+  find -L $SRC_DIR -type d | xargs -I {} bash -c "LinkOneDir $SRC_DIR $DST_DIR {}" || exit -1
+}
+
+# Set up the symbolic links for SYCL toolkit. We link at individual file level,
+# not at the directory level.
+# This is because the external library may have a different file layout from our desired structure.
+mkdir -p $OUTPUTDIR/third_party/sycl
+echo "Setting up SYCL include"
+LinkAllFiles ${COMPUTECPP_PATH}/include $OUTPUTDIR/third_party/sycl/include || exit -1
+echo "Setting up SYCL ${SYCL_LIB_PATH}"
+LinkAllFiles ${COMPUTECPP_PATH}/${SYCL_LIB_PATH} $OUTPUTDIR/third_party/sycl/${SYCL_LIB_PATH} || exit -1
+echo "Setting up SYCL bin"
+LinkAllFiles ${COMPUTECPP_PATH}/bin $OUTPUTDIR/third_party/sycl/bin || exit -1
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index 58dd7434a89..875a290215d 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -1,6 +1,9 @@
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
+build:sycl --crosstool_top=//third_party/sycl/crosstool
+build:sycl --define=using_sycl=true
+
 build --force_python=py$PYTHON_MAJOR_VERSION
 build --host_force_python=py$PYTHON_MAJOR_VERSION
 build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY

From b8b166429a80b038da1cd56f331bcbbebb95932d Mon Sep 17 00:00:00 2001
From: luke <luke@codeplay.com>
Date: Wed, 12 Oct 2016 14:41:09 +0100
Subject: [PATCH 04/51] Applied workaround for the ComputeCpp CE.

---
 tensorflow/core/common_runtime/bfc_allocator.h            | 5 ++---
 tensorflow/core/common_runtime/direct_session.h           | 3 ++-
 .../core/common_runtime/gpu/gpu_allocator_retry_test.cc   | 6 +++---
 tensorflow/core/common_runtime/gpu/pool_allocator.h       | 2 +-
 tensorflow/core/framework/op.h                            | 3 ++-
 tensorflow/core/framework/tracking_allocator.h            | 2 +-
 tensorflow/core/kernels/barrier_ops.cc                    | 8 +++++---
 tensorflow/core/kernels/conditional_accumulator.h         | 6 +++---
 tensorflow/core/kernels/conditional_accumulator_base.h    | 3 ++-
 tensorflow/core/kernels/conditional_accumulator_base_op.h | 2 +-
 tensorflow/core/kernels/cwise_ops_common.h                | 5 +++++
 tensorflow/core/kernels/queue_base.h                      | 2 +-
 tensorflow/core/kernels/queue_op.h                        | 2 +-
 tensorflow/core/kernels/sparse_conditional_accumulator.h  | 2 +-
 tensorflow/core/kernels/tensor_array.h                    | 3 +--
 tensorflow/core/lib/monitoring/collection_registry.cc     | 4 +++-
 tensorflow/core/lib/monitoring/collection_registry.h      | 5 ++---
 tensorflow/core/lib/monitoring/counter.h                  | 3 +--
 tensorflow/stream_executor/machine_manager.h              | 7 +++----
 19 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index c13f67ffcc7..8fd6597cb88 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -295,6 +295,8 @@ class BFCAllocator : public VisitableAllocator {
    private:
     std::vector<AllocationRegion> regions_;
   };
+  // Structures mutable after construction
+  mutable mutex lock_;
 
   // Returns 'bytes' rounded up to the next highest kMinAllocationSize.
   size_t RoundedBytes(size_t bytes);
@@ -389,9 +391,6 @@ class BFCAllocator : public VisitableAllocator {
 
   std::unique_ptr<SubAllocator> suballocator_;
   string name_;
-
-  // Structures mutable after construction
-  mutable mutex lock_;
   RegionManager region_manager_ GUARDED_BY(lock_);
 
   std::vector<Chunk> chunks_;
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index a4289112534..8fe4825aa6d 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -162,6 +162,8 @@ class DirectSession : public Session {
     protobuf::RepeatedPtrField<DebugTensorWatch> debug_tensor_watches;
   };
 
+  mutex graph_def_lock_;
+
   // Initializes the base execution state given the 'graph',
   // if not already initialized.
   Status MaybeInitializeExecutionState(const GraphDef& graph,
@@ -227,7 +229,6 @@ class DirectSession : public Session {
   string session_handle_;
   bool graph_created_ GUARDED_BY(graph_def_lock_) = false;
 
-  mutex graph_def_lock_;
   GraphDef graph_def_ GUARDED_BY(graph_def_lock_);
 
   // The thread-pools to use for running ops.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
index 2148f83fe57..423448773ae 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
@@ -28,6 +28,7 @@ namespace tensorflow {
 namespace {
 
 class FakeAllocator {
+  mutex mu_;
  public:
   FakeAllocator(size_t cap, int millis_to_wait)
       : memory_capacity_(cap), millis_to_wait_(millis_to_wait) {}
@@ -57,7 +58,6 @@ class FakeAllocator {
  private:
   AllocatorRetry retry_;
   void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
-  mutex mu_;
   size_t memory_capacity_ GUARDED_BY(mu_);
   int millis_to_wait_;
 };
@@ -72,6 +72,7 @@ class FakeAllocator {
 // interesting part of their interaction with the allocator.  This
 // class is the mechanism that imposes turn taking.
 class AlternatingBarrier {
+  mutex mu_;
  public:
   explicit AlternatingBarrier(int num_users)
       : num_users_(num_users), next_turn_(0), done_(num_users, false) {}
@@ -109,7 +110,6 @@ class AlternatingBarrier {
     }
   }
 
-  mutex mu_;
   condition_variable cv_;
   int num_users_;
   int next_turn_ GUARDED_BY(mu_);
@@ -118,6 +118,7 @@ class AlternatingBarrier {
 
 class GPUAllocatorRetryTest : public ::testing::Test {
  protected:
+  mutex mu_;
   GPUAllocatorRetryTest() {}
 
   void LaunchConsumerThreads(int num_consumers, int cap_needed) {
@@ -173,7 +174,6 @@ class GPUAllocatorRetryTest : public ::testing::Test {
   std::vector<Thread*> consumers_;
   std::vector<int> consumer_count_;
   Notification notifier_;
-  mutex mu_;
   bool has_failed_ GUARDED_BY(mu_) = false;
   int count_ GUARDED_BY(mu_) = 0;
 };
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index b2f0265145f..437fea91155 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -45,6 +45,7 @@ class RoundUpInterface {
 // Size-limited pool of memory buffers obtained from a SubAllocator
 // instance.  Pool eviction policy is LRU.
 class PoolAllocator : public VisitableAllocator {
+  mutex mutex_;
  public:
   // "pool_size_limit" is the maximum number of returned, re-usable
   // memory buffers to keep in the pool.  If pool_size_limit == 0, the
@@ -136,7 +137,6 @@ class PoolAllocator : public VisitableAllocator {
   size_t pool_size_limit_;
   std::unique_ptr<SubAllocator> allocator_;
   std::unique_ptr<RoundUpInterface> size_rounder_;
-  mutex mutex_;
   std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
   PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
   PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index f047ddb12a1..321ace9f465 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -125,6 +125,8 @@ class OpRegistry : public OpRegistryInterface {
   void ClearDeferredRegistrations();
 
  private:
+  mutable mutex mu_;
+
   // Ensures that all the functions in deferred_ get called, their OpDef's
   // registered, and returns with deferred_ empty.  Returns true the first
   // time it is called. Prints a fatal log if any op registration fails.
@@ -141,7 +143,6 @@ class OpRegistry : public OpRegistryInterface {
   Status RegisterAlreadyLocked(OpRegistrationDataFactory op_data_factory) const
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  mutable mutex mu_;
   // Functions in deferred_ may only be called with mu_ held.
   mutable std::vector<OpRegistrationDataFactory> deferred_ GUARDED_BY(mu_);
   // Values are owned.
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index bb19f5dca04..040bb03f819 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -74,11 +74,11 @@ class TrackingAllocator : public Allocator {
   std::pair<size_t, size_t> GetSizesAndUnRef();
 
  private:
+  mutex mu_;
   ~TrackingAllocator() override {}
   bool UnRef() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   Allocator* allocator_;  // not owned.
-  mutex mu_;
   // the number of calls to AllocateRaw that have not yet been matched
   // by a corresponding call to DeAllocateRaw, plus 1 if the Executor
   // has not yet read out the high watermark.
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 84f57517605..e91d9037cff 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -40,6 +40,7 @@ namespace tensorflow {
 namespace barrier {
 
 class Barrier : public ResourceBase {
+  mutex mu_;
  public:
   typedef std::vector<Tensor> Tuple;
   typedef std::function<void()> DoneCallback;
@@ -417,7 +418,6 @@ class Barrier : public ResourceBase {
 
  private:
   typedef std::vector<PersistentTensor> PersistentTuple;
-  mutex mu_;
   bool closed_ GUARDED_BY(mu_);
   bool queue_closed_ GUARDED_BY(mu_);
   bool queue_cancelled_ GUARDED_BY(mu_);
@@ -433,6 +433,7 @@ class Barrier : public ResourceBase {
 };
 
 class BarrierOp : public OpKernel {
+  mutex mu_;
  public:
   explicit BarrierOp(OpKernelConstruction* context)
       : OpKernel(context), barrier_handle_set_(false) {
@@ -511,7 +512,6 @@ class BarrierOp : public OpKernel {
   std::vector<TensorShape> value_component_shapes_;
   ContainerInfo cinfo_;
 
-  mutex mu_;
   PersistentTensor barrier_handle_ GUARDED_BY(mu_);
   bool barrier_handle_set_ GUARDED_BY(mu_);
 
@@ -611,7 +611,9 @@ class TakeManyOp : public BarrierOpKernel {
     DataTypeVector expected_inputs = {DT_STRING_REF, DT_INT32};
     // The first output is the insertion index, the second output is the key.
     DataTypeVector expected_outputs = {DT_INT64, DT_STRING};
-    for (DataType dt : barrier->component_types()) {
+    for (auto it  = barrier->component_types().begin(),
+              end = barrier->component_types().end(); it!= end; it++ ){
+      const DataType dt = *it;
       expected_outputs.push_back(dt);
     }
     OP_REQUIRES_OK_ASYNC(
diff --git a/tensorflow/core/kernels/conditional_accumulator.h b/tensorflow/core/kernels/conditional_accumulator.h
index f8c340a7691..4ee1601f342 100644
--- a/tensorflow/core/kernels/conditional_accumulator.h
+++ b/tensorflow/core/kernels/conditional_accumulator.h
@@ -65,7 +65,7 @@ class ConditionalAccumulator
   functor::SetZeroFunctor<Device, T> set_zero_functor_;
 
   Status ValidateShape(const Tensor* tensor)
-      EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     // Must be compatible with accumulated gradient if available
     if (counter_ > 0) {
       if (!accum_grad_->shape().IsSameSize(tensor->shape())) {
@@ -98,7 +98,7 @@ class ConditionalAccumulator
   }
 
   void DivideAccumGradByCounter(OpKernelContext* ctx) override
-      EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     Tensor c(DataTypeToEnum<T>::value, {});
     c.scalar<T>()() = TypeConverter<T, int>::ConvertUToT(this->counter_);
     this->accum_grad_->template flat<T>().device(
@@ -113,7 +113,7 @@ class ConditionalAccumulator
 
   bool GetAndValidateTensorInputForApplyGrad(OpKernelContext* ctx,
                                              const Tensor** tensor) override
-      EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     // Get input gradient tensor
     const Tensor* grad_tensor;
     OP_REQUIRES_OK_BOOLEAN(ctx, ctx->input("gradient", &grad_tensor));
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 05ee855daee..9992379640d 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -45,6 +45,8 @@ namespace tensorflow {
  * (3) the internal global_step value (current_global_step_) is incremented by 1
  */
 class ConditionalAccumulatorBase : public ResourceBase {
+ protected:
+  mutex mu_;
  public:
   // Args:
   //   dtype: The datatype of the gradients to be accumulated.
@@ -125,7 +127,6 @@ class ConditionalAccumulatorBase : public ResourceBase {
   const DataType dtype_;
   const PartialTensorShape shape_;
   const string name_;
-  mutex mu_;
   int counter_ GUARDED_BY(mu_);
   int64 current_global_step_ GUARDED_BY(mu_);
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base_op.h b/tensorflow/core/kernels/conditional_accumulator_base_op.h
index 33c2d596c8b..0a64a857cdb 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base_op.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base_op.h
@@ -43,6 +43,7 @@ namespace tensorflow {
  * ConditionalAccumulatorBase (via sub-class's Creator) and returns its handle.
  */
 class ConditionalAccumulatorBaseOp : public OpKernel {
+  mutex mu_;
  public:
   explicit ConditionalAccumulatorBaseOp(OpKernelConstruction* context)
       : OpKernel(context), accumulator_handle_set_(false) {
@@ -109,7 +110,6 @@ class ConditionalAccumulatorBaseOp : public OpKernel {
     return Status::OK();
   }
 
-  mutex mu_;
   PersistentTensor accumulator_handle_ GUARDED_BY(mu_);
   bool accumulator_handle_set_ GUARDED_BY(mu_);
 };
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index 5ad6b1fd4a1..3d1953d7f4c 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -82,8 +82,13 @@ class BinaryOp : public BinaryOpShared {
     if (!ctx->status().ok()) return;
     Tensor* out = state.out;
     BCast* bcast = &state.bcast;
+#if TENSORFLOW_USE_SYCL
+    decltype(state.in0) in0 = state.in0;
+    decltype(state.in1) in1 = state.in1;
+#else
     auto& in0 = state.in0;
     auto& in1 = state.in1;
+#endif
     if (state.out_num_elements == 0) {
       return;
     }
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index 79b479b44b5..6b2043e5a32 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -83,6 +83,7 @@ class QueueBase : public QueueInterface {
                                    int64 index);
 
  protected:
+  mutex mu_;
   enum Action { kEnqueue, kDequeue };
   enum RunResult { kNoProgress, kProgress, kComplete };
 
@@ -143,7 +144,6 @@ class QueueBase : public QueueInterface {
   const DataTypeVector component_dtypes_;
   const std::vector<TensorShape> component_shapes_;
   const string name_;
-  mutex mu_;
   bool closed_ GUARDED_BY(mu_);
 
   struct Attempt;
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index 7694827854c..a21be2c389d 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -34,6 +34,7 @@ namespace tensorflow {
 
 // Defines a QueueOp, an abstract class for Queue construction ops.
 class QueueOp : public OpKernel {
+  mutex mu_;
  public:
   QueueOp(OpKernelConstruction* context)
       : OpKernel(context), queue_handle_set_(false) {
@@ -94,7 +95,6 @@ class QueueOp : public OpKernel {
     return Status::OK();
   }
 
-  mutex mu_;
   PersistentTensor queue_handle_ GUARDED_BY(mu_);
   bool queue_handle_set_ GUARDED_BY(mu_);
 };
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator.h b/tensorflow/core/kernels/sparse_conditional_accumulator.h
index 89560094af6..73bd3b47e48 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator.h
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator.h
@@ -83,7 +83,7 @@ class SparseConditionalAccumulator
 
   Status ValidateShape(
       std::tuple<const Tensor*, const Tensor*, const Tensor*>* tensor,
-      bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
+      bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     const Tensor* tensor_idx = std::get<0>(*tensor);
     const Tensor* tensor_val = std::get<1>(*tensor);
     const Tensor* tensor_shape = std::get<2>(*tensor);
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 7835fd7bbc1..96bff2c95ed 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -123,6 +123,7 @@ TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU);
 //     multiple reads of that index in the forward phase.
 //
 class TensorArray : public ResourceBase {
+  mutex mu_;
  public:
   static std::atomic<int64> tensor_array_counter;
 
@@ -338,8 +339,6 @@ class TensorArray : public ResourceBase {
   const DataType dtype_;
   Tensor handle_;
 
-  mutex mu_;
-
   // Marks that the tensor_array_ has been cleared.
   bool closed_ GUARDED_BY(mu_);
 
diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc
index 47112279cff..01d643fbcca 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry.cc
@@ -45,7 +45,9 @@ void Collector::CollectMetricDescriptor(
   metric_descriptor->name = metric_def->name().ToString();
   metric_descriptor->description = metric_def->description().ToString();
 
-  for (const StringPiece label_name : metric_def->label_descriptions()) {
+  for (auto it  = metric_def->label_descriptions().begin(),
+            end = metric_def->label_descriptions().end() ; it!=end ;it++  ) {
+    const StringPiece label_name = *it;
     metric_descriptor->label_names.push_back(label_name.ToString());
   }
 
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index 3da2439238f..ed957b9ae45 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -121,6 +121,7 @@ class MetricCollectorGetter {
 //
 // This class is thread-safe.
 class CollectionRegistry {
+  mutable mutex mu_;
  public:
   ~CollectionRegistry() = default;
 
@@ -176,8 +177,6 @@ class CollectionRegistry {
   // TF environment, mainly used for timestamping.
   Env* const env_;
 
-  mutable mutex mu_;
-
   // Information required for collection.
   struct CollectionInfo {
     const AbstractMetricDef* const metric_def;
@@ -227,6 +226,7 @@ inline void CollectValue(const int64& value, Point* const point) {
 //
 // This class is thread-safe.
 class Collector {
+  mutable mutex mu_;
  public:
   Collector(const uint64 collection_time_millis)
       : collected_metrics_(new CollectedMetrics()),
@@ -260,7 +260,6 @@ class Collector {
       LOCKS_EXCLUDED(mu_);
 
  private:
-  mutable mutex mu_;
   std::unique_ptr<CollectedMetrics> collected_metrics_ GUARDED_BY(mu_);
   const uint64 collection_time_millis_;
 
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index e76057b980a..0ea50932dd9 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -78,6 +78,7 @@ class CounterCell {
 // This class is thread-safe.
 template <int NumLabels>
 class Counter {
+  mutable mutex mu_;
  public:
   ~Counter() {
     // Deleted here, before the metric_def is destroyed.
@@ -111,8 +112,6 @@ class Counter {
               }
             })) {}
 
-  mutable mutex mu_;
-
   // The metric definition. This will be used to identify the metric when we
   // register it for collection.
   const MetricDef<MetricKind::kCumulative, int64, NumLabels> metric_def_;
diff --git a/tensorflow/stream_executor/machine_manager.h b/tensorflow/stream_executor/machine_manager.h
index 65396dd1ff5..bf95bc74713 100644
--- a/tensorflow/stream_executor/machine_manager.h
+++ b/tensorflow/stream_executor/machine_manager.h
@@ -60,6 +60,9 @@ namespace gputools {
 //
 // Thread-safe.
 class MachineManager {
+  // Mutex that guards the initialization of the machine manager static
+  // variable.
+  static mutex mu_;
  public:
   // Inspects the host to determine the preferred GPU execution platform.
   // To force OpenCL from a build target on a machine that has both OpenCL and
@@ -171,10 +174,6 @@ class MachineManager {
   // Returns the NUMA node association for the StreamExecutor.
   int ExecutorToNumaNode(const StreamExecutor *stream_exec) const;
 
-  // Mutex that guards the initialization of the machine manager static
-  // variable.
-  static mutex mu_;
-
   // Singleton MachineManager value -- assignment to this is protected by a
   // static singleton guard clause.
   static MachineManager *singleton_ GUARDED_BY(mu_);

From 916663735e715856b9796ac6937f3a6565bfb3a7 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Fri, 14 Oct 2016 13:11:50 +0100
Subject: [PATCH 05/51] Turned filegroup into cc_toolchain_suite.

---
 third_party/sycl/crosstool/BUILD | 9 +++++----
 tools/bazel.rc.template          | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/third_party/sycl/crosstool/BUILD b/third_party/sycl/crosstool/BUILD
index eac4dc7fad8..ec0070e71da 100755
--- a/third_party/sycl/crosstool/BUILD
+++ b/third_party/sycl/crosstool/BUILD
@@ -2,10 +2,11 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "crosstool",
-    srcs = ["CROSSTOOL"],
-    output_licenses = ["unencumbered"],
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+    },
 )
 
 cc_toolchain(
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index 875a290215d..bdbc88ba395 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -1,7 +1,7 @@
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
-build:sycl --crosstool_top=//third_party/sycl/crosstool
+build:sycl --crosstool_top=//third_party/sycl/crosstool:toolchain
 build:sycl --define=using_sycl=true
 
 build --force_python=py$PYTHON_MAJOR_VERSION

From 656a72c0cd18cd17a7cecb2309a018f020525518 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Fri, 14 Oct 2016 16:50:23 +0100
Subject: [PATCH 06/51] Pointing to latest Eigen OpenCL version.

---
 tensorflow/workspace.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index ac2a22ee548..6e3def96902 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -14,7 +14,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   # These lines need to be changed when updating Eigen. They are parsed from
   # this file by the cmake and make builds to determine the eigen version and
   # hash.
-  eigen_version = "aad63574941c"
+  eigen_version = "ab6d16a84626"
   eigen_sha256 = ""
 
   native.new_http_archive(

From 1f88ec8eef88ec783a18a98816fbb15b21df6ee7 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Mon, 17 Oct 2016 21:14:25 +0100
Subject: [PATCH 07/51] Update to SYCL bazel toolchain.

---
 configure                                     |  85 +++++++----
 .../core/platform/default/build_config/BUILD  |  16 ++
 tensorflow/workspace.bzl                      |   3 +
 third_party/sycl/BUILD                        |  44 ------
 third_party/sycl/crosstool/BUILD              |  29 ----
 third_party/sycl/crosstool/BUILD.tpl          |  29 ++++
 .../crosstool/{CROSSTOOL => CROSSTOOL.tpl}    |   2 +-
 .../crosstool/{computecpp => computecpp.tpl}  |   8 +-
 third_party/sycl/platform.bzl                 |  17 ---
 third_party/sycl/sycl/BUILD                   |   0
 third_party/sycl/sycl/BUILD.tpl               |  43 ++++++
 .../build_defs.bzl.tpl}                       |   5 +-
 third_party/sycl/sycl/platform.bzl.tpl        |   5 +
 third_party/sycl/sycl_config.sh               | 143 -----------------
 third_party/sycl/sycl_configure.bzl           | 144 ++++++++++++++++++
 tools/bazel.rc.template                       |   2 +-
 16 files changed, 307 insertions(+), 268 deletions(-)
 mode change 100755 => 100644 third_party/sycl/BUILD
 mode change 100755 => 100644 third_party/sycl/crosstool/BUILD
 create mode 100755 third_party/sycl/crosstool/BUILD.tpl
 rename third_party/sycl/crosstool/{CROSSTOOL => CROSSTOOL.tpl} (97%)
 rename third_party/sycl/crosstool/{computecpp => computecpp.tpl} (93%)
 delete mode 100755 third_party/sycl/platform.bzl
 create mode 100644 third_party/sycl/sycl/BUILD
 create mode 100755 third_party/sycl/sycl/BUILD.tpl
 rename third_party/sycl/{build_defs.bzl => sycl/build_defs.bzl.tpl} (86%)
 create mode 100755 third_party/sycl/sycl/platform.bzl.tpl
 delete mode 100755 third_party/sycl/sycl_config.sh
 create mode 100644 third_party/sycl/sycl_configure.bzl

diff --git a/configure b/configure
index 08078d29d5c..759f317d0cb 100755
--- a/configure
+++ b/configure
@@ -365,17 +365,64 @@ fi
 # OpenCL configuration
 
 if [ "$TF_NEED_OPENCL" == "1" ]; then
+
+# Determine which C++ compiler should be used as the host compiler
+while true; do
+  fromuser=""
+  if [ -z "$HOST_CXX_COMPILER" ]; then
+    default_cxx_host_compiler=$(which g++|| true)
+    read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER
+    fromuser="1"
+    if [ -z "$HOST_CXX_COMPILER" ]; then
+      HOST_CXX_COMPILER=$default_cxx_host_compiler
+    fi
+  fi
+  if [ -e "$HOST_CXX_COMPILER" ]; then
+    export HOST_CXX_COMPILER
+    break
+  fi
+  echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2
+  if [ -z "$fromuser" ]; then
+    exit 1
+  fi
+  HOST_CXX_COMPILER=""
+  # Retry
+done
+
+# Determine which C compiler should be used as the host compiler
+while true; do
+  fromuser=""
+  if [ -z "$HOST_C_COMPILER" ]; then
+    default_c_host_compiler=$(which gcc|| true)
+    read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER
+    fromuser="1"
+    if [ -z "$HOST_C_COMPILER" ]; then
+      HOST_C_COMPILER=$default_c_host_compiler
+    fi
+  fi
+  if [ -e "$HOST_C_COMPILER" ]; then
+    export HOST_C_COMPILER
+    break
+  fi
+  echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2
+  if [ -z "$fromuser" ]; then
+    exit 1
+  fi
+  HOST_C_COMPILER=""
+  # Retry
+done
+
 while true; do
   # Configure the OPENCL version to use.
   TF_OPENCL_VERSION="1.2"
 
-  # Point to ComputeCPP root
-  if [ -z "$COMPUTECPP_PATH" ]; then
-    default_computecpp_path=/usr/local/computecpp
-    read -p "Please specify the location where ComputeCPP $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_path]: " COMPUTECPP_PATH
+  # Point to ComputeCpp root
+  if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then
+    default_computecpp_toolkit_path=/usr/local/computecpp
+    read -p "Please specify the location where ComputeCpp $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_toolkit_path]: " COMPUTECPP_TOOLKIT_PATH
     fromuser="1"
-    if [ -z "$COMPUTECPP_PATH" ]; then
-      COMPUTECPP_PATH=$default_computecpp_path
+    if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then
+      COMPUTECPP_TOOLKIT_PATH=$default_computecpp_toolkit_path
     fi
   fi
 
@@ -383,38 +430,20 @@ while true; do
     SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
   fi
 
-  if [ -e "${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}" ]; then
+  if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then
+    export COMPUTECPP_TOOLKIT_PATH
     break
   fi
-  echo "Invalid path to SYCL $TF_OPENCL_VERSION library. ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
+  echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
 
   if [ -z "$fromuser" ]; then
     exit 1
   fi
   # Retry
   TF_OPENCL_VERSION=""
-  COMPUTECPP_PATH=""
+  COMPUTECPP_TOOLKIT_PATH=""
 done
 
-cat > third_party/sycl/sycl.config <<EOF
-# COMPUTECPP_PATH refers to the ComputeCPP toolkit.
-COMPUTECPP_PATH="$COMPUTECPP_PATH"
-
-# The OpenCL version that should be used in this build
-TF_OPENCL_VERSION=$TF_OPENCL_VERSION
-
-EOF
-
-export WARNING=$DO_NOT_SUBMIT_WARNING
-perl -pi -e "s,#cxx_builtin_include_directory: {COMPUTECPP_INCLUDE},# \$ENV{WARNING}\ncxx_builtin_include_directory: \"${COMPUTECPP_PATH}\",s" third_party/sycl/crosstool/CROSSTOOL
-
-# Configure the platform name.
-perl -pi -e "s,PLATFORM = \".*\",PLATFORM = \"$OSNAME\",s" third_party/sycl/platform.bzl
-
-
-# Invoke the cuda_config.sh and set up the TensorFlow's canonical view of the Cuda libraries
-(cd third_party/sycl; ./sycl_config.sh;) || exit -1
-
 # end of if "$TF_NEED_OPENCL" == "1"
 fi
 
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index fe51a698d15..c8d73caa22f 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -10,6 +10,7 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("@local_config_cuda//cuda:platform.bzl", "cuda_library_path")
+load("@local_config_sycl//sycl:platform.bzl", "sycl_library_path")
 
 cc_library(
     name = "gtest",
@@ -127,6 +128,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "sycl",
+    data = [
+        "@local_config_sycl//sycl:{}".format(sycl_library_path("ComputeCpp")),
+    ],
+    linkopts = select({
+        "//conditions:default": [
+            "-Wl,-rpath,../local_config_sycl/sycl/lib",
+        ],
+    }),
+    deps = [
+        "@local_config_sycl//sycl:syclrt",
+    ],
+)
+
 filegroup(
     name = "android_srcs",
     srcs = glob(["*.h"]),
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6db0e40b8bb..62d303a8781 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,11 +1,14 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
+load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+
 
 # If TensorFlow is linked as a submodule.
 # path_prefix and tf_repo_name are no longer used.
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
   cuda_configure(name = "local_config_cuda")
+  sycl_configure(name = "local_config_sycl")
   if path_prefix:
     print("path_prefix was specified to tf_workspace but is no longer used and will be removed in the future.")
   if tf_repo_name:
diff --git a/third_party/sycl/BUILD b/third_party/sycl/BUILD
old mode 100755
new mode 100644
index 4aadc808302..e69de29bb2d
--- a/third_party/sycl/BUILD
+++ b/third_party/sycl/BUILD
@@ -1,44 +0,0 @@
-licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
-
-load("//third_party/sycl:build_defs.bzl", "if_sycl")
-load("platform", "sycl_library_path")
-load("platform", "sycl_static_library_path")
-
-load("platform", "readlink_command")
-
-package(default_visibility = ["//visibility:public"])
-
-config_setting(
-    name = "using_sycl",
-    values = {
-        "define": "using_sycl=true",
-    },
-)
-
-cc_library(
-    name = "sycl_headers",
-    hdrs = glob([
-        "**/*.h",
-    ]),
-    includes = [".", "include"],
-)
-
-cc_library(
-    name = "syclrt",
-    srcs = [
-        sycl_library_path("ComputeCpp")
-    ],
-    data = [
-        sycl_library_path("ComputeCpp")
-    ],
-    includes = ["include/"],
-    linkstatic = 1,
-)
-
-cc_library(
-    name = "sycl",
-    deps = if_sycl([
-        ":sycl_headers",
-        ":syclrt",
-    ]),
-)
diff --git a/third_party/sycl/crosstool/BUILD b/third_party/sycl/crosstool/BUILD
old mode 100755
new mode 100644
index ec0070e71da..e69de29bb2d
--- a/third_party/sycl/crosstool/BUILD
+++ b/third_party/sycl/crosstool/BUILD
@@ -1,29 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-cc_toolchain_suite(
-    name = "toolchain",
-    toolchains = {
-        "local|compiler": ":cc-compiler-local",
-    },
-)
-
-cc_toolchain(
-    name = "cc-compiler-local",
-    all_files = ":empty",
-    compiler_files = ":empty",
-    cpu = "local",
-    dwp_files = ":empty",
-    dynamic_runtime_libs = [":empty"],
-    linker_files = ":empty",
-    objcopy_files = ":empty",
-    static_runtime_libs = [":empty"],
-    strip_files = ":empty",
-    supports_param_files = 0,
-)
-
-filegroup(
-    name = "empty",
-    srcs = [],
-)
diff --git a/third_party/sycl/crosstool/BUILD.tpl b/third_party/sycl/crosstool/BUILD.tpl
new file mode 100755
index 00000000000..ec0070e71da
--- /dev/null
+++ b/third_party/sycl/crosstool/BUILD.tpl
@@ -0,0 +1,29 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
diff --git a/third_party/sycl/crosstool/CROSSTOOL b/third_party/sycl/crosstool/CROSSTOOL.tpl
similarity index 97%
rename from third_party/sycl/crosstool/CROSSTOOL
rename to third_party/sycl/crosstool/CROSSTOOL.tpl
index a96d33d522f..2108a5b9f08 100755
--- a/third_party/sycl/crosstool/CROSSTOOL
+++ b/third_party/sycl/crosstool/CROSSTOOL.tpl
@@ -47,7 +47,7 @@ toolchain {
   cxx_builtin_include_directory: "/usr/local/include"
   cxx_builtin_include_directory: "/usr/include"
 
-  #cxx_builtin_include_directory: {COMPUTECPP_INCLUDE}
+  cxx_builtin_include_directory: "%{computecpp_toolkit_path}"
 
   tool_path { name: "gcov" path: "/usr/bin/gcov" }
 
diff --git a/third_party/sycl/crosstool/computecpp b/third_party/sycl/crosstool/computecpp.tpl
similarity index 93%
rename from third_party/sycl/crosstool/computecpp
rename to third_party/sycl/crosstool/computecpp.tpl
index 0b8427fd0ae..4860e9f0a07 100755
--- a/third_party/sycl/crosstool/computecpp
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python
 
 from argparse import ArgumentParser
 import os
@@ -7,11 +7,11 @@ import re
 import sys
 import pipes
 
-CPU_CXX_COMPILER = ('/usr/bin/clang++-3.6')
-CPU_C_COMPILER = ('/usr/bin/clang-3.6')
+CPU_CXX_COMPILER = ('%{host_cxx_compiler}')
+CPU_C_COMPILER = ('%{host_c_compiler}')
 
 CURRENT_DIR = os.path.dirname(sys.argv[0])
-COMPUTECPP_ROOT = CURRENT_DIR +"/../"
+COMPUTECPP_ROOT = CURRENT_DIR +"/../sycl/"
 COMPUTECPP_DRIVER= COMPUTECPP_ROOT+"bin/compute++"
 COMPUTECPP_INCLUDE = COMPUTECPP_ROOT+"include"
 
diff --git a/third_party/sycl/platform.bzl b/third_party/sycl/platform.bzl
deleted file mode 100755
index 8905b496b3b..00000000000
--- a/third_party/sycl/platform.bzl
+++ /dev/null
@@ -1,17 +0,0 @@
-SYCL_VERSION = ""
-PLATFORM = ""
-
-def sycl_sdk_version():
-  return SYCL_VERSION
-
-def sycl_library_path(name, version = sycl_sdk_version()):
-  if not version:
-    return "lib/lib{}.so".format(name)
-  else:
-    return "lib/lib{}.so.{}".format(name, version)
-
-def sycl_static_library_path(name):
-  return "lib/lib{}_static.a".format(name)
-
-def readlink_command():
-    return "readlink"
diff --git a/third_party/sycl/sycl/BUILD b/third_party/sycl/sycl/BUILD
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
new file mode 100755
index 00000000000..c3879e8da54
--- /dev/null
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -0,0 +1,43 @@
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
+load("platform", "sycl_library_path")
+
+load("platform", "readlink_command")
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_sycl",
+    values = {
+        "define": "using_sycl=true",
+    },
+)
+
+cc_library(
+    name = "sycl_headers",
+    hdrs = glob([
+        "**/*.h",
+    ]),
+    includes = [".", "include"],
+)
+
+cc_library(
+    name = "syclrt",
+    srcs = [
+        sycl_library_path("ComputeCpp")
+    ],
+    data = [
+        sycl_library_path("ComputeCpp")
+    ],
+    includes = ["include/"],
+    linkstatic = 1,
+)
+
+cc_library(
+    name = "sycl",
+    deps = if_sycl([
+        ":sycl_headers",
+        ":syclrt",
+    ]),
+)
diff --git a/third_party/sycl/build_defs.bzl b/third_party/sycl/sycl/build_defs.bzl.tpl
similarity index 86%
rename from third_party/sycl/build_defs.bzl
rename to third_party/sycl/sycl/build_defs.bzl.tpl
index 1aaeb741a21..09bef0a6613 100755
--- a/third_party/sycl/build_defs.bzl
+++ b/third_party/sycl/sycl/build_defs.bzl.tpl
@@ -1,10 +1,13 @@
 # Macros for building SYCL code.
+
 def if_sycl(if_true, if_false = []):
     """Shorthand for select()'ing on whether we're building with SYCL.
+
     Returns a select statement which evaluates to if_true if we're building
     with SYCL enabled.  Otherwise, the select statement evaluates to if_false.
+
     """
     return select({
-        "//third_party/sycl:using_sycl": if_true,
+        "@local_config_sycl//sycl:using_sycl": if_true,
         "//conditions:default": if_false
     })
diff --git a/third_party/sycl/sycl/platform.bzl.tpl b/third_party/sycl/sycl/platform.bzl.tpl
new file mode 100755
index 00000000000..cb4b3356b22
--- /dev/null
+++ b/third_party/sycl/sycl/platform.bzl.tpl
@@ -0,0 +1,5 @@
+def sycl_library_path(name):
+    return "lib/lib{}.so".format(name)
+
+def readlink_command():
+    return "readlink"
diff --git a/third_party/sycl/sycl_config.sh b/third_party/sycl/sycl_config.sh
deleted file mode 100755
index 7fa93110708..00000000000
--- a/third_party/sycl/sycl_config.sh
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-
-# A simple script to configure the SYCL tree needed for the TensorFlow OpenCL
-# build. We need both COMPUTECPP toolkit $TF_OPENCL_VERSION.
-# Useage:
-#    * User edit sycl.config to point ComputeCPP toolkit to its local path
-#    * run sycl_config.sh to generate symbolic links in the source tree to reflect
-#    * the file organizations needed by TensorFlow.
-
-print_usage() {
-cat << EOF
-Usage: $0 [--check]
-  Configure TensorFlow's canonical view of SYCL libraries using sycl.config.
-Arguments:
-  --check: Only check that the proper SYCL dependencies has already been
-       properly configured in the source tree. It also creates symbolic links to
-       the files in the gen-tree to make bazel happy.
-EOF
-}
-
-CHECK_ONLY=0
-# Parse the arguments. Add more arguments as the "case" line when needed.
-while [[ $# -gt 0 ]]; do
-  argument="$1"
-  shift
-  case $argument in
-    --check)
-      CHECK_ONLY=1
-      ;;
-    *)
-      echo "Error: unknown arguments"
-      print_usage
-      exit -1
-      ;;
-  esac
-done
-
-source sycl.config || exit -1
-
-OUTPUTDIR=${OUTPUTDIR:-../..}
-COMPUTECPP_PATH=${COMPUTECPP_PATH:-/usr/local/computecpp}
-
-# An error message when the SYCL toolkit is not found
-function SYCLError {
-  echo ERROR: $1
-cat << EOF
-##############################################################################
-##############################################################################
-SYCL $TF_OPENCL_VERSION toolkit is missing.
-1. Download and install the ComputeCPP $TF_OPENCL_VERSION toolkit;
-2. Run configure from the root of the source tree, before rerunning bazel;
-Please refer to README.md for more details.
-##############################################################################
-##############################################################################
-EOF
-  exit -1
-}
-
-# Check that the SYCL libraries have already been properly configured in the source tree.
-# We still need to create links to the gen-tree to make bazel happy.
-function CheckAndLinkToSrcTree {
-  ERROR_FUNC=$1
-  FILE=$2
-  if test ! -e $FILE; then
-    $ERROR_FUNC "$PWD/$FILE cannot be found"
-  fi
-
-  # Link the output file to the source tree, avoiding self links if they are
-  # the same. This could happen if invoked from the source tree by accident.
-  if [ ! $($READLINK_CMD -f $PWD) == $($READLINK_CMD -f $OUTPUTDIR/third_party/sycl) ]; then
-    mkdir -p $(dirname $OUTPUTDIR/third_party/sycl/$FILE)
-    ln -sf $PWD/$FILE $OUTPUTDIR/third_party/sycl/$FILE
-  fi
-}
-
-OSNAME=`uname -s`
-if [ "$OSNAME" == "Linux" ]; then
-  SYCL_LIB_PATH="lib"
-  SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
-  SYCL_RT_LIB_STATIC_PATH="lib/libComputeCpp.a"
-  READLINK_CMD="readlink"
-fi
-
-if [ "$CHECK_ONLY" == "1" ]; then
-  CheckAndLinkToSrcTree SYCLError include/SYCL/sycl.h
-  CheckAndLinkToSrcTree SYCLError $SYCL_RT_LIB_STATIC_PATH
-  CheckAndLinkToSrcTree CudaError $SYCL_RT_LIB_PATH
-  exit 0
-fi
-
-# Actually configure the source tree for TensorFlow's canonical view of SYCL
-# libraries.
-
-if test ! -e ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}; then
-  SYCLError "cannot find ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}"
-fi
-
-# Helper function to build symbolic links for all files in a directory.
-function LinkOneDir {
-  SRC_PREFIX=$1
-  DST_PREFIX=$2
-  SRC_DIR=$3
-  DST_DIR=$(echo $SRC_DIR | sed "s,^$SRC_PREFIX,$DST_PREFIX,")
-  mkdir -p $DST_DIR
-  FILE_LIST=$(find -L $SRC_DIR -maxdepth 1 -type f)
-  if test "$FILE_LIST" != ""; then
-    ln -sf $FILE_LIST $DST_DIR/ || exit -1
-  fi
-}
-export -f LinkOneDir
-
-# Build links for all files in the directory, including subdirectories.
-function LinkAllFiles {
-  SRC_DIR=$1
-  DST_DIR=$2
-  find -L $SRC_DIR -type d | xargs -I {} bash -c "LinkOneDir $SRC_DIR $DST_DIR {}" || exit -1
-}
-
-# Set up the symbolic links for SYCL toolkit. We link at individual file level,
-# not at the directory level.
-# This is because the external library may have a different file layout from our desired structure.
-mkdir -p $OUTPUTDIR/third_party/sycl
-echo "Setting up SYCL include"
-LinkAllFiles ${COMPUTECPP_PATH}/include $OUTPUTDIR/third_party/sycl/include || exit -1
-echo "Setting up SYCL ${SYCL_LIB_PATH}"
-LinkAllFiles ${COMPUTECPP_PATH}/${SYCL_LIB_PATH} $OUTPUTDIR/third_party/sycl/${SYCL_LIB_PATH} || exit -1
-echo "Setting up SYCL bin"
-LinkAllFiles ${COMPUTECPP_PATH}/bin $OUTPUTDIR/third_party/sycl/bin || exit -1
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
new file mode 100644
index 00000000000..37766e90dc7
--- /dev/null
+++ b/third_party/sycl/sycl_configure.bzl
@@ -0,0 +1,144 @@
+# -*- Python -*-
+"""SYCL autoconfiguration.
+`sycl_configure` depends on the following environment variables:
+
+  * HOST_CXX_COMPILER:  The host C++ compiler
+  * HOST_C_COMPILER:    The host C compiler
+  * COMPUTECPP_TOOLKIT_PATH: The path to the ComputeCpp toolkit.
+"""
+
+_HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
+_HOST_C_COMPILER= "HOST_C_COMPILER"
+_COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
+
+def auto_configure_fail(msg):
+  """Output failure message when auto configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("\n%sAuto-Configuration Error:%s %s\n" % (red, no_color, msg))
+# END cc_configure common functions (see TODO above).
+
+def find_c(repository_ctx):
+  """Find host C compiler."""
+  c_name = ""
+  if _HOST_C_COMPILER in repository_ctx.os.environ:
+    c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
+  if c_name.startswith("/"):
+    return c_name
+  c = repository_ctx.which(c_name)
+  if c == None:
+    fail("Cannot find C compiler, please correct your path.")
+  return c
+
+def find_cc(repository_ctx):
+  """Find host C++ compiler."""
+  cc_name = ""
+  if _HOST_CXX_COMPILER in repository_ctx.os.environ:
+    cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
+  if cc_name.startswith("/"):
+    return cc_name
+  cc = repository_ctx.which(cc_name)
+  if cc == None:
+    fail("Cannot find C++ compiler, please correct your path.")
+  return cc
+
+def find_computecpp_root(repository_ctx):
+  """Find ComputeCpp compiler."""
+  sycl_name = ""
+  if _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ:
+    sycl_name = repository_ctx.os.environ[_COMPUTECPP_TOOLKIT_PATH].strip()
+  if sycl_name.startswith("/"):
+    return sycl_name
+  fail( "Cannot find SYCL compiler, please correct your path")
+
+def _check_lib(repository_ctx, toolkit_path, lib):
+  """Checks if lib exists under sycl_toolkit_path or fail if it doesn't.
+
+  Args:
+    repository_ctx: The repository context.
+    toolkit_path: The toolkit directory containing the libraries.
+    ib: The library to look for under toolkit_path.
+  """
+  lib_path = toolkit_path + "/" + lib
+  if not repository_ctx.path(lib_path).exists:
+    auto_configure_fail("Cannot find %s" % lib_path)
+
+def _check_dir(repository_ctx, directory):
+  """Checks whether the directory exists and fail if it does not.
+
+  Args:
+    repository_ctx: The repository context.
+    directory: The directory to check the existence of.
+  """
+  if not repository_ctx.path(directory).exists:
+    auto_configure_fail("Cannot find dir: %s" % directory)
+
+def _symlink_dir(repository_ctx, src_dir, dest_dir):
+  """Symlinks all the files in a directory.
+
+  Args:
+    repository_ctx: The repository context.
+    src_dir: The source directory.
+    dest_dir: The destination directory to create the symlinks in.
+  """
+  files = repository_ctx.path(src_dir).readdir()
+  for src_file in files:
+    repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
+
+def _tpl(repository_ctx, tpl, substitutions={}, out=None):
+  if not out:
+    out = tpl.replace(":", "/")
+  repository_ctx.template(
+      out,
+      Label("//third_party/sycl/%s.tpl" % tpl),
+      substitutions)
+
+def _file(repository_ctx, label):
+  repository_ctx.template(
+      label.replace(":", "/"),
+      Label("//third_party/sycl/%s.tpl" % label),
+      {})
+
+def _sycl_autoconf_imp(repository_ctx):
+  """Implementation of the sycl_autoconf rule."""
+
+  # copy template files
+  _file(repository_ctx, "sycl:build_defs.bzl")
+  _file(repository_ctx, "sycl:BUILD")
+  _file(repository_ctx, "sycl:platform.bzl")
+  _file(repository_ctx, "crosstool:BUILD")
+  _tpl(repository_ctx, "crosstool:computecpp",
+  {
+    "%{host_cxx_compiler}" : find_cc(repository_ctx),
+    "%{host_c_compiler}" : find_c(repository_ctx),
+  })
+
+  computecpp_root = find_computecpp_root(repository_ctx);
+  _check_dir(repository_ctx, computecpp_root)
+
+  _tpl(repository_ctx, "crosstool:CROSSTOOL",
+  {
+    "%{computecpp_toolkit_path}" : computecpp_root,
+  })
+
+  # symlink libraries
+  _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
+  _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
+  _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
+  _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+
+sycl_configure = repository_rule(
+  implementation = _sycl_autoconf_imp,
+  local = True,
+)
+"""Detects and configures the SYCL toolchain.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+sycl_configure(name = "local_config_sycl")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index bdbc88ba395..7a5c2aac808 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -1,7 +1,7 @@
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
-build:sycl --crosstool_top=//third_party/sycl/crosstool:toolchain
+build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl --define=using_sycl=true
 
 build --force_python=py$PYTHON_MAJOR_VERSION

From 1561686c88bce21b8de1350f87a5f878e9fe0b3c Mon Sep 17 00:00:00 2001
From: luke <luke@codeplay.com>
Date: Tue, 18 Oct 2016 16:25:14 +0100
Subject: [PATCH 08/51] Need to specify default C and C++ compilers to avoid
 bazel error when re-configuring.

---
 third_party/sycl/sycl_configure.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index 37766e90dc7..e3c732bbc73 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -20,7 +20,7 @@ def auto_configure_fail(msg):
 
 def find_c(repository_ctx):
   """Find host C compiler."""
-  c_name = ""
+  c_name = "gcc"
   if _HOST_C_COMPILER in repository_ctx.os.environ:
     c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
   if c_name.startswith("/"):
@@ -32,7 +32,7 @@ def find_c(repository_ctx):
 
 def find_cc(repository_ctx):
   """Find host C++ compiler."""
-  cc_name = ""
+  cc_name = "g++"
   if _HOST_CXX_COMPILER in repository_ctx.os.environ:
     cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
   if cc_name.startswith("/"):

From 455be86133dc31b9da350944f20f98f8a9250e40 Mon Sep 17 00:00:00 2001
From: luke <luke@codeplay.com>
Date: Tue, 18 Oct 2016 17:54:14 +0100
Subject: [PATCH 09/51] Added dummy repository creation for SYCL.

---
 configure                           |  1 +
 third_party/sycl/sycl_configure.bzl | 93 ++++++++++++++++++++++-------
 2 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/configure b/configure
index 759f317d0cb..2e8b86fdaf7 100755
--- a/configure
+++ b/configure
@@ -444,6 +444,7 @@ while true; do
   COMPUTECPP_TOOLKIT_PATH=""
 done
 
+export TF_NEED_OPENCL
 # end of if "$TF_NEED_OPENCL" == "1"
 fi
 
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index e3c732bbc73..2dd82198ffb 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -11,6 +11,12 @@ _HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
 _HOST_C_COMPILER= "HOST_C_COMPILER"
 _COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
 
+def _enable_sycl(repository_ctx):
+  if "TF_NEED_OPENCL" in repository_ctx.os.environ:
+    enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL"].strip()
+    return enable_sycl == "1"
+  return False
+  
 def auto_configure_fail(msg):
   """Output failure message when auto configuration fails."""
   red = "\033[0;31m"
@@ -99,33 +105,80 @@ def _file(repository_ctx, label):
       Label("//third_party/sycl/%s.tpl" % label),
       {})
 
-def _sycl_autoconf_imp(repository_ctx):
-  """Implementation of the sycl_autoconf rule."""
+_DUMMY_CROSSTOOL_BZL_FILE = """
+def error_sycl_disabled():
+  fail("ERROR: Building with --config=sycl but TensorFlow is not configured " +
+       "to build with SYCL support. Please re-run ./configure and enter 'Y' " +
+       "at the prompt to build with SYCL support.")
 
-  # copy template files
+  native.genrule(
+      name = "error_gen_crosstool",
+      outs = ["CROSSTOOL"],
+      cmd = "echo 'Should not be run.' && exit 1",
+  )
+
+  native.filegroup(
+      name = "crosstool",
+      srcs = [":CROSSTOOL"],
+      output_licenses = ["unencumbered"],
+  )
+"""
+
+
+_DUMMY_CROSSTOOL_BUILD_FILE = """
+load("//crosstool:error_sycl_disabled.bzl", "error_sycl_disabled")
+
+error_sycl_disabled()
+"""
+
+def _create_dummy_repository(repository_ctx):
+  # Set up BUILD file for sycl/.
   _file(repository_ctx, "sycl:build_defs.bzl")
   _file(repository_ctx, "sycl:BUILD")
   _file(repository_ctx, "sycl:platform.bzl")
-  _file(repository_ctx, "crosstool:BUILD")
-  _tpl(repository_ctx, "crosstool:computecpp",
-  {
-    "%{host_cxx_compiler}" : find_cc(repository_ctx),
-    "%{host_c_compiler}" : find_c(repository_ctx),
-  })
 
-  computecpp_root = find_computecpp_root(repository_ctx);
-  _check_dir(repository_ctx, computecpp_root)
+  # Create dummy files for the SYCL toolkit since they are still required by
+  # tensorflow/sycl/platform/default/build_config:sycl.
+  repository_ctx.file("sycl/include/sycl.hpp", "")
+  repository_ctx.file("sycl/lib/libComputeCpp.so", "")
 
-  _tpl(repository_ctx, "crosstool:CROSSTOOL",
-  {
-    "%{computecpp_toolkit_path}" : computecpp_root,
-  })
+  # If sycl_configure is not configured to build with SYCL support, and the user
+  # attempts to build with --config=sycl, add a dummy build rule to intercept
+  # this and fail with an actionable error message.
+  repository_ctx.file("crosstool/error_sycl_disabled.bzl",
+                      _DUMMY_CROSSTOOL_BZL_FILE)
+  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
-  # symlink libraries
-  _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
-  _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
-  _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
-  _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+
+def _sycl_autoconf_imp(repository_ctx):
+  """Implementation of the sycl_autoconf rule."""
+  if not _enable_sycl(repository_ctx):
+    _create_dummy_repository(repository_ctx)
+  else:
+    # copy template files
+    _file(repository_ctx, "sycl:build_defs.bzl")
+    _file(repository_ctx, "sycl:BUILD")
+    _file(repository_ctx, "sycl:platform.bzl")
+    _file(repository_ctx, "crosstool:BUILD")
+    _tpl(repository_ctx, "crosstool:computecpp",
+    {
+      "%{host_cxx_compiler}" : find_cc(repository_ctx),
+      "%{host_c_compiler}" : find_c(repository_ctx),
+    })
+
+    computecpp_root = find_computecpp_root(repository_ctx);
+    _check_dir(repository_ctx, computecpp_root)
+
+    _tpl(repository_ctx, "crosstool:CROSSTOOL",
+    {
+      "%{computecpp_toolkit_path}" : computecpp_root,
+    })
+
+    # symlink libraries
+    _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
+    _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
+    _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
+    _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
 
 sycl_configure = repository_rule(
   implementation = _sycl_autoconf_imp,

From 58db8411d93e277f129143a2647484e768128182 Mon Sep 17 00:00:00 2001
From: luke <luke@codeplay.com>
Date: Tue, 18 Oct 2016 16:16:01 +0100
Subject: [PATCH 10/51] Added template keyword for post fix expression where
 left hand side is a dependent expression.

---
 tensorflow/core/kernels/cwise_ops_common.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index 3d1953d7f4c..0297e65b09a 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -101,45 +101,45 @@ class BinaryOp : public BinaryOpShared {
       if (state.in1_num_elements == 1) {
         // tensor op scalar
         functor::BinaryFunctor<Device, Functor, 1>().Right(
-            eigen_device, out_flat, in0.flat<Tin>(), in1.scalar<Tin>(),
+            eigen_device, out_flat, in0.template flat<Tin>(), in1.template scalar<Tin>(),
             error_ptr);
       } else if (state.in0_num_elements == 1) {
         // scalar op tensor
         functor::BinaryFunctor<Device, Functor, 1>().Left(
-            eigen_device, out_flat, in0.scalar<Tin>(), in1.flat<Tin>(),
+            eigen_device, out_flat, in0.template scalar<Tin>(), in1.template flat<Tin>(),
             error_ptr);
       } else {
         functor::BinaryFunctor<Device, Functor, 1>()(
-            eigen_device, out_flat, in0.flat<Tin>(), in1.flat<Tin>(),
+            eigen_device, out_flat, in0.template flat<Tin>(), in1.template flat<Tin>(),
             error_ptr);
       }
     } else if (ndims == 2) {
       functor::BinaryFunctor<Device, Functor, 2>().BCast(
           eigen_device, out->shaped<Tout, 2>(bcast->result_shape()),
-          in0.shaped<Tin, 2>(bcast->x_reshape()),
+          in0.template shaped<Tin, 2>(bcast->x_reshape()),
           BCast::ToIndexArray<2>(bcast->x_bcast()),
-          in1.shaped<Tin, 2>(bcast->y_reshape()),
+          in1.template shaped<Tin, 2>(bcast->y_reshape()),
           BCast::ToIndexArray<2>(bcast->y_bcast()), error_ptr);
     } else if (ndims == 3) {
       functor::BinaryFunctor<Device, Functor, 3>().BCast(
           eigen_device, out->shaped<Tout, 3>(bcast->result_shape()),
-          in0.shaped<Tin, 3>(bcast->x_reshape()),
+          in0.template shaped<Tin, 3>(bcast->x_reshape()),
           BCast::ToIndexArray<3>(bcast->x_bcast()),
-          in1.shaped<Tin, 3>(bcast->y_reshape()),
+          in1.template shaped<Tin, 3>(bcast->y_reshape()),
           BCast::ToIndexArray<3>(bcast->y_bcast()), error_ptr);
     } else if (ndims == 4) {
       functor::BinaryFunctor<Device, Functor, 4>().BCast(
           eigen_device, out->shaped<Tout, 4>(bcast->result_shape()),
-          in0.shaped<Tin, 4>(bcast->x_reshape()),
+          in0.template shaped<Tin, 4>(bcast->x_reshape()),
           BCast::ToIndexArray<4>(bcast->x_bcast()),
-          in1.shaped<Tin, 4>(bcast->y_reshape()),
+          in1.template shaped<Tin, 4>(bcast->y_reshape()),
           BCast::ToIndexArray<4>(bcast->y_bcast()), error_ptr);
     } else if (ndims == 5) {
       functor::BinaryFunctor<Device, Functor, 5>().BCast(
           eigen_device, out->shaped<Tout, 5>(bcast->result_shape()),
-          in0.shaped<Tin, 5>(bcast->x_reshape()),
+          in0.template shaped<Tin, 5>(bcast->x_reshape()),
           BCast::ToIndexArray<5>(bcast->x_bcast()),
-          in1.shaped<Tin, 5>(bcast->y_reshape()),
+          in1.template shaped<Tin, 5>(bcast->y_reshape()),
           BCast::ToIndexArray<5>(bcast->y_bcast()), error_ptr);
     } else {
       SetUnimplementedError(ctx);

From befbe983abe59c6aa96c6dae495ab458299dc0cf Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 13 Oct 2016 20:20:39 +0100
Subject: [PATCH 11/51] Added SYCL type.

---
 tensorflow/core/framework/types.cc      | 1 +
 tensorflow/core/framework/types.h       | 5 +++--
 tensorflow/core/framework/types_test.cc | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index d1e4d572e47..235f551d0e2 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -37,6 +37,7 @@ std::ostream& operator<<(std::ostream& os, const DeviceType& d) {
 
 const char* const DEVICE_CPU = "CPU";
 const char* const DEVICE_GPU = "GPU";
+const char* const DEVICE_SYCL = "SYCL";
 
 string DataTypeString(DataType dtype) {
   if (IsRefType(dtype)) {
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index 09e5b07bed0..d8a1e160ced 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -67,8 +67,9 @@ class DeviceType {
 std::ostream& operator<<(std::ostream& os, const DeviceType& d);
 
 // Convenient constants that can be passed to a DeviceType constructor
-extern const char* const DEVICE_CPU;  // "CPU"
-extern const char* const DEVICE_GPU;  // "GPU"
+extern const char* const DEVICE_CPU;   // "CPU"
+extern const char* const DEVICE_GPU;   // "GPU"
+extern const char* const DEVICE_SYCL;  // "SYCL"
 
 typedef gtl::InlinedVector<MemoryType, 4> MemoryTypeVector;
 typedef gtl::ArraySlice<MemoryType> MemoryTypeSlice;
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 18e0ef9c398..bc57740469f 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -25,6 +25,7 @@ namespace {
 TEST(TypesTest, DeviceTypeName) {
   EXPECT_EQ("CPU", DeviceTypeString(DeviceType(DEVICE_CPU)));
   EXPECT_EQ("GPU", DeviceTypeString(DeviceType(DEVICE_GPU)));
+  EXPECT_EQ("SYCL", DeviceTypeString(DeviceType(DEVICE_SYCL)));
 }
 
 TEST(TypesTest, kDataTypeRefOffset) {

From 530823bcbb8540465863d58be9bc18bdbae56d63 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 13 Oct 2016 21:48:16 +0100
Subject: [PATCH 12/51] Added SYCL to the device_set_test and
 device_name_utils.

---
 tensorflow/core/common_runtime/device_set_test.cc |  9 +++++++--
 tensorflow/core/util/device_name_utils.cc         | 10 ++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index 21d3be2f613..2c4aa227462 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -68,12 +68,17 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) {
       (std::vector<DeviceType>{DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
       types());
 
+  AddDevice("SYCL", "/job:a/replica:0/task:0/sycl:0");
+  EXPECT_EQ(
+      (std::vector<DeviceType>{DeviceType(DEVICE_SYCL), DeviceType(DEVICE_GPU),
+      DeviceType(DEVICE_CPU)}), types());
+
   AddDevice("T1", "/job:a/replica:0/task:0/device:T1:0");
   AddDevice("T1", "/job:a/replica:0/task:0/device:T1:1");
   AddDevice("T2", "/job:a/replica:0/task:0/device:T2:0");
   EXPECT_EQ(
-      (std::vector<DeviceType>{DeviceType("T1"), DeviceType("T2"),
-                               DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
+      (std::vector<DeviceType>{DeviceType(DEVICE_SYCL), DeviceType("T1"),
+      DeviceType("T2"), DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
       types());
 }
 
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index c38b5758fa8..336b69b3aba 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -162,6 +162,16 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
       }
       progress = true;
     }
+    if (str_util::ConsumePrefix(&fullname, "/sycl:") ||
+        str_util::ConsumePrefix(&fullname, "/SYCL:")) {
+      p->has_type = true;
+      p->type = "SYCL";  // Treat '/sycl:..' as uppercase '/device:SYCL:...'
+      p->has_id = !str_util::ConsumePrefix(&fullname, "*");
+      if (p->has_id && !ConsumeNumber(&fullname, &p->id)) {
+        return false;
+      }
+      progress = true;
+    }
 
     if (!progress) {
       return false;

From 13157f1ab96e6641b1373891688578b8d6da8d60 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 19 Oct 2016 12:21:03 -0700
Subject: [PATCH 13/51] Switched to the latest version of Eigen

---
 eigen.BUILD | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/eigen.BUILD b/eigen.BUILD
index 8a699f6aa84..210d1523ea3 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -55,6 +55,8 @@ EIGEN_MPL2_HEADER_FILES = glob(
     ],
 )
 
+# archive_dir = "benoitsteiner-opencl-9d4a08d57d0d"
+
 cc_library(
     name = "eigen",
     hdrs = EIGEN_MPL2_HEADER_FILES,

From 4b0a8966a5000d2e292a950827f46834ee7ad646 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 19 Oct 2016 12:24:14 -0700
Subject: [PATCH 14/51] Upgraded to the latest version of Eigen

---
 tensorflow/workspace.bzl | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6e3def96902..2934bc221f5 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -14,8 +14,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   # These lines need to be changed when updating Eigen. They are parsed from
   # this file by the cmake and make builds to determine the eigen version and
   # hash.
-  eigen_version = "ab6d16a84626"
-  eigen_sha256 = ""
+  eigen_version = "c6f01fd8258e"
+  eigen_sha256 = "b5c110733c6bc5f12cb3ceb66806c34932a376850b2801eea4a59a16c0de9206"
 
   native.new_http_archive(
     name = "eigen_archive",
@@ -27,16 +27,16 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.http_archive(
     name = "com_googlesource_code_re2",
-    url = "http://github.com/google/re2/archive/7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5.tar.gz",
-    sha256 = "ef91af8850f734c8be65f2774747f4c2d8d81e556ba009faa79b4dd8b2759555",
-    strip_prefix = "re2-7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5",
+    url = "http://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+    sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
+    strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
   )
 
   native.http_archive(
     name = "gemmlowp",
-    url = "http://github.com/google/gemmlowp/archive/8b20dd2ce142115857220bd6a35e8a081b3e0829.tar.gz",
-    sha256 = "9cf5f1e3d64b3632dbae5c65efb79f4374ca9ac362d788fc61e086af937ff6d7",
-    strip_prefix = "gemmlowp-8b20dd2ce142115857220bd6a35e8a081b3e0829",
+    url = "http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz",
+    sha256 = "dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882",
+    strip_prefix = "gemmlowp-c0bacf11fb509a2cbe15a97362a2df067ffd57a2",
   )
 
   native.new_http_archive(
@@ -98,9 +98,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.http_archive(
     name = "protobuf",
-    url = "http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
-    sha256 = "0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
-    strip_prefix = "protobuf-3.1.0",
+    url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
+    sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
+    strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
   )
 
   native.new_http_archive(

From a51849e74d26cdaef6365ef4951d22fea53dcc69 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Wed, 19 Oct 2016 23:27:10 +0100
Subject: [PATCH 15/51] Appache 2.0 license.

---
 third_party/sycl/crosstool/BUILD.tpl | 2 +-
 third_party/sycl/sycl/BUILD.tpl      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/sycl/crosstool/BUILD.tpl b/third_party/sycl/crosstool/BUILD.tpl
index ec0070e71da..f539a376c85 100755
--- a/third_party/sycl/crosstool/BUILD.tpl
+++ b/third_party/sycl/crosstool/BUILD.tpl
@@ -1,4 +1,4 @@
-licenses(["restricted"])
+licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index c3879e8da54..9e83b1994cb 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -1,4 +1,4 @@
-licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+licenses(["notice"])  # Apache 2.0
 
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("platform", "sycl_library_path")

From 78d6d1a2628ed4979f22cebdfc762ba5f88f33e3 Mon Sep 17 00:00:00 2001
From: luke <luke@codeplay.com>
Date: Thu, 20 Oct 2016 19:04:19 +0100
Subject: [PATCH 16/51] Added sycl_device and sycl_device_factory.

---
 tensorflow/core/BUILD                         |  26 ++++
 .../core/common_runtime/device_factory.cc     |  11 +-
 .../core/common_runtime/sycl/sycl_device.cc   |  70 +++++++++
 .../core/common_runtime/sycl/sycl_device.h    |  53 +++++++
 .../sycl/sycl_device_factory.cc               |  43 ++++++
 tensorflow/core/framework/device_base.h       |  17 +++
 tensorflow/core/kernels/cwise_ops_common.h    |  19 ++-
 .../core/kernels/cwise_ops_sycl_common.h      | 136 ++++++++++++++++++
 tensorflow/workspace.bzl                      |   2 +-
 third_party/eigen3/BUILD                      |   4 +-
 10 files changed, 372 insertions(+), 9 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device.cc
 create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device.h
 create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
 create mode 100644 tensorflow/core/kernels/cwise_ops_sycl_common.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index f32cd0e6fc8..b19ac574b99 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -505,6 +505,7 @@ cc_library(
     deps = [
         ":core_cpu",
         ":gpu_runtime",
+        ":sycl_runtime",
     ],
 )
 
@@ -1414,6 +1415,31 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "sycl_runtime",
+    srcs = if_not_windows([
+        "common_runtime/sycl/sycl_device.cc",
+        "common_runtime/sycl/sycl_device_factory.cc",
+    ]),
+    hdrs = if_not_windows([
+        "common_runtime/sycl/sycl_device.h",
+    ]),
+    copts = tf_copts(),
+    linkstatic = 1,
+    deps = [
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":protos_all_cc",
+        "//third_party/eigen3",
+        "@local_config_sycl//sycl:sycl",
+    ],
+    alwayslink = 1,
+)
+
 # -----------------------------------------------------------------------------
 # Tests
 
diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index 8104f446366..15933a81992 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -97,11 +97,20 @@ Status DeviceFactory::AddDevices(const SessionOptions& options,
         gpu_factory->CreateDevices(options, name_prefix, devices));
   }
 
+  // Then SYCL.
+  auto sycl_factory = GetFactory("SYCL");
+
+  if (sycl_factory) {
+    TF_RETURN_IF_ERROR(
+        sycl_factory->CreateDevices(options, name_prefix, devices));
+  }
+
   // Then the rest.
   mutex_lock l(*get_device_factory_lock());
   for (auto& p : device_factories()) {
     auto factory = p.second.factory.get();
-    if (factory != cpu_factory && factory != gpu_factory) {
+    if (factory != cpu_factory && factory != gpu_factory &&
+        factory != sycl_factory) {
       TF_RETURN_IF_ERROR(factory->CreateDevices(options, name_prefix, devices));
     }
   }
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
new file mode 100644
index 00000000000..e13c34dd690
--- /dev/null
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -0,0 +1,70 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if TENSORFLOW_USE_SYCL
+
+#include "tensorflow/core/common_runtime/sycl/sycl_device.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/platform/tracing.h"
+
+namespace tensorflow {
+
+cl::sycl::gpu_selector s;
+cl::sycl::queue q(s);
+
+SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name,
+                       Bytes memory_limit, BusAdjacency bus_adjacency,
+                       Allocator* allocator)
+    : LocalDevice(options, Device::BuildDeviceAttributes(
+                               name, DEVICE_SYCL, memory_limit, bus_adjacency),
+                  allocator),
+      allocator_(allocator),
+      device_(q) {
+  set_eigen_sycl_device(&device_);
+}
+
+SYCLDevice::~SYCLDevice() {}
+
+void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
+  assert(context);
+  if (port::Tracing::IsActive()) {
+    // TODO(pbar) We really need a useful identifier of the graph node.
+    const uint64 id = Hash64(op_kernel->name());
+    port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
+                                         id);
+  }
+  op_kernel->Compute(context);
+}
+
+Allocator* SYCLDevice::GetAllocator(AllocatorAttributes attr) {
+  return allocator_;
+}
+
+Status SYCLDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
+                                       const AllocatorAttributes alloc_attrs,
+                                       Tensor* tensor) {
+  Tensor parsed(tensor_proto.dtype());
+  if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
+    return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                   ProtoDebugString(tensor_proto));
+  }
+  *tensor = std::move(parsed);
+  return Status::OK();
+}
+}
+
+#endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
new file mode 100644
index 00000000000..e43997cbf25
--- /dev/null
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -0,0 +1,53 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if !TENSORFLOW_USE_SYCL
+#error This file must only be included when building TensorFlow with SYCL support
+#endif
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_
+
+#define EIGEN_USE_SYCL
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+class SYCLDevice : public LocalDevice {
+ public:
+  SYCLDevice(const SessionOptions& options, const string& name,
+             Bytes memory_limit, BusAdjacency bus_adjacency,
+             Allocator* allocator);
+  ~SYCLDevice() override;
+
+  void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
+  Allocator* GetAllocator(AllocatorAttributes attr) override;
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override;
+
+  Status Sync() override { return Status::OK(); }
+
+ private:
+  Allocator* allocator_;  // Not owned
+  Eigen::SyclDevice device_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
new file mode 100644
index 00000000000..fe10412ab6c
--- /dev/null
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -0,0 +1,43 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if TENSORFLOW_USE_SYCL
+
+#include "sycl_device.h"
+
+namespace tensorflow {
+
+class SYCLDeviceFactory : public DeviceFactory {
+ public:
+  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
+                       std::vector<Device*>* devices) override {
+    int n = 1;
+    auto iter = options.config.device_count().find("SYCL");
+    if (iter != options.config.device_count().end()) {
+      n = iter->second;
+    }
+    for (int i = 0; i < n; i++) {
+      string name = strings::StrCat(name_prefix, "/sycl:", i);
+      devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20),
+                                        BUS_ANY, cpu_allocator()));
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory);
+}
+
+#endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index acba11657a3..eadba4c6c8d 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -30,6 +30,9 @@ limitations under the License.
 
 namespace Eigen {
 struct ThreadPoolDevice;
+#ifdef TENSORFLOW_USE_SYCL
+struct SyclDevice;
+#endif
 }  // end namespace Eigen
 
 namespace perftools {
@@ -145,6 +148,10 @@ class DeviceBase {
     eigen_cpu_device_ = d;
   }
 
+#ifdef TENSORFLOW_USE_SYCL
+  void set_eigen_sycl_device(Eigen::SyclDevice* d) { eigen_sycl_device_ = d; }
+#endif
+
   // Return the Allocator implementation to use based on the allocator
   // attributes requested.  See allocator.h for more details.
   virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
@@ -166,6 +173,13 @@ class DeviceBase {
     return eigen_cpu_device_;
   }
 
+#ifdef TENSORFLOW_USE_SYCL
+  const Eigen::SyclDevice* eigen_sycl_device() const {
+    CHECK(eigen_sycl_device_ != nullptr);
+    return eigen_sycl_device_;
+  }
+#endif
+
   // Caller owns the return value. The OpKernelContext calls this even
   // for devices that do not implement an eigen_gpu_device. Overridden
   // by GPU devices to return a derived type.
@@ -200,6 +214,9 @@ class DeviceBase {
   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
   Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr;
+#ifdef TENSORFLOW_USE_SYCL
+  Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
+#endif
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index 0297e65b09a..cea6c3d59df 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -20,6 +20,10 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#ifdef TENSORFLOW_USE_SYCL
+#include "tensorflow/core/kernels/cwise_ops_sycl_common.h"
+#endif
+
 #include "tensorflow/core/kernels/cwise_ops.h"
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
@@ -33,6 +37,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif
 
 class BinaryOpShared : public OpKernel {
  public:
@@ -101,17 +108,17 @@ class BinaryOp : public BinaryOpShared {
       if (state.in1_num_elements == 1) {
         // tensor op scalar
         functor::BinaryFunctor<Device, Functor, 1>().Right(
-            eigen_device, out_flat, in0.template flat<Tin>(), in1.template scalar<Tin>(),
-            error_ptr);
+            eigen_device, out_flat, in0.template flat<Tin>(),
+            in1.template scalar<Tin>(), error_ptr);
       } else if (state.in0_num_elements == 1) {
         // scalar op tensor
         functor::BinaryFunctor<Device, Functor, 1>().Left(
-            eigen_device, out_flat, in0.template scalar<Tin>(), in1.template flat<Tin>(),
-            error_ptr);
+            eigen_device, out_flat, in0.template scalar<Tin>(),
+            in1.template flat<Tin>(), error_ptr);
       } else {
         functor::BinaryFunctor<Device, Functor, 1>()(
-            eigen_device, out_flat, in0.template flat<Tin>(), in1.template flat<Tin>(),
-            error_ptr);
+            eigen_device, out_flat, in0.template flat<Tin>(),
+            in1.template flat<Tin>(), error_ptr);
       }
     } else if (ndims == 2) {
       functor::BinaryFunctor<Device, Functor, 2>().BCast(
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
new file mode 100644
index 00000000000..baba610d6de
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -0,0 +1,136 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if !TENSORFLOW_USE_SYCL
+#error This file must only be included when building TensorFlow with SYCL support
+#endif
+
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_
+
+#define EIGEN_USE_SYCL
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/cwise_ops.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::SyclDevice SYCLDevice;
+
+template <typename OUT, typename RHS>
+void Assign(const SYCLDevice& d, OUT out, RHS rhs) {
+  out.device(d) = rhs;
+}
+
+// Partial specialization of BinaryFunctor<Device=SYCLDevice, Functor>.
+template <typename Functor, int NDIMS, bool has_errors>
+struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
+  void operator()(const SYCLDevice& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in0,
+                  typename Functor::tin_type in1, bool* error) {
+    Assign(d, out, in0.binaryExpr(in1, typename Functor::func()));
+  }
+
+  void Left(const SYCLDevice& d, typename Functor::tout_type out,
+            typename Functor::tscalar_type scalar,
+            typename Functor::tin_type in, bool* error) {
+    // typedef typename Functor::out_type Tout;
+    // typedef typename Functor::in_type Tin;
+    // typedef typename Functor::func Binary;
+    // typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
+    // Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+    //  printf("BinaryFunctor::Left NOT IMPLEMENTED ! \n");
+    LOG(FATAL) << "BinaryFunctor::Left NOT IMPLEMENTED ! ";
+  }
+
+  void Right(const SYCLDevice& d, typename Functor::tout_type out,
+             typename Functor::tin_type in,
+             typename Functor::tscalar_type scalar, bool* error) {
+    typedef typename Functor::out_type Tout;
+    typedef typename Functor::in_type Tin;
+    typedef typename Functor::func Binary;
+    typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
+    Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+  }
+
+  void BCast(const SYCLDevice& d,
+             typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+             typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+             typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+             typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+             typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1,
+             bool* error) {
+    LOG(FATAL) << "BinaryFunctor::BCast NOT IMPLEMENTED ";
+    //    printf("BinaryFunctor::BCast NOT IMPLEMENTED ! \n");
+  }
+};
+
+// Macros to explicitly instantiate kernels on GPU for multiple types
+// (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt).
+#define DEFINE_UNARY1(F, T) template struct UnaryFunctor<SYCLDevice, F<T> >
+#define DEFINE_UNARY2(F, T0, T1) \
+  DEFINE_UNARY1(F, T0);          \
+  DEFINE_UNARY1(F, T1)
+#define DEFINE_UNARY3(F, T0, T1, T2) \
+  DEFINE_UNARY2(F, T0, T1);          \
+  DEFINE_UNARY1(F, T2)
+#define DEFINE_UNARY4(F, T0, T1, T2, T3) \
+  DEFINE_UNARY2(F, T0, T1);              \
+  DEFINE_UNARY2(F, T2, T3)
+#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \
+  DEFINE_UNARY2(F, T0, T1);                  \
+  DEFINE_UNARY3(F, T2, T3, T4)
+
+// Macros to explicitly instantiate kernels on GPU for multiple types
+// (T0, T1, etc.) for BinaryFunctor.
+#define DEFINE_BINARY1(F, T)                          \
+  template struct BinaryFunctor<SYCLDevice, F<T>, 1>; \
+  template struct BinaryFunctor<SYCLDevice, F<T>, 2>; \
+  template struct BinaryFunctor<SYCLDevice, F<T>, 3>
+#define DEFINE_BINARY2(F, T0, T1) \
+  DEFINE_BINARY1(F, T0);          \
+  DEFINE_BINARY1(F, T1)
+#define DEFINE_BINARY3(F, T0, T1, T2) \
+  DEFINE_BINARY2(F, T0, T1);          \
+  DEFINE_BINARY1(F, T2)
+#define DEFINE_BINARY4(F, T0, T1, T2, T3) \
+  DEFINE_BINARY2(F, T0, T1);              \
+  DEFINE_BINARY2(F, T2, T3)
+#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \
+  DEFINE_BINARY2(F, T0, T1);                  \
+  DEFINE_BINARY3(F, T2, T3, T4)
+#define DEFINE_BINARY6(F, T0, T1, T2, T3, T4, T5) \
+  DEFINE_BINARY3(F, T0, T1, T2);                  \
+  DEFINE_BINARY3(F, T3, T4, T5)
+#define DEFINE_BINARY7(F, T0, T1, T2, T3, T4, T5, T6) \
+  DEFINE_BINARY3(F, T0, T1, T2);                      \
+  DEFINE_BINARY4(F, T3, T4, T5, T6)
+#define DEFINE_BINARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \
+  DEFINE_BINARY4(F, T0, T1, T2, T3);                      \
+  DEFINE_BINARY4(F, T4, T5, T6, T7)
+#define DEFINE_BINARY9(F, T0, T1, T2, T3, T4, T5, T6, T7, T8) \
+  DEFINE_BINARY4(F, T0, T1, T2, T3);                          \
+  DEFINE_BINARY5(F, T4, T5, T6, T7, T8)
+#define DEFINE_BINARY10(F, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
+  DEFINE_BINARY5(F, T0, T1, T2, T3, T4);                           \
+  DEFINE_BINARY5(F, T5, T6, T7, T8, T9)
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 62d303a8781..e1b7b95e489 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -17,7 +17,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   # These lines need to be changed when updating Eigen. They are parsed from
   # this file by the cmake and make builds to determine the eigen version and
   # hash.
-  eigen_version = "ab6d16a84626"
+  eigen_version = "090e5709bbaa"
   eigen_sha256 = ""
 
   native.new_http_archive(
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index 9ab7aadf87a..b45af94e540 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -23,5 +23,7 @@ cc_library(
         "unsupported/Eigen/CXX11/FixedPoint",
     ],
     visibility = ["//visibility:public"],
-    deps = ["@eigen_archive//:eigen"],
+    deps = ["@eigen_archive//:eigen",
+            "@local_config_sycl//sycl:sycl",
+    ],
 )

From f25bc0098aaf93ecff2751e7d92df23c50e31fce Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:03:46 +0100
Subject: [PATCH 17/51] Revert "Applied workaround for the ComputeCpp CE."

This reverts commit b8b166429a80b038da1cd56f331bcbbebb95932d.
---
 tensorflow/core/common_runtime/bfc_allocator.h            | 5 +++--
 tensorflow/core/common_runtime/direct_session.h           | 3 +--
 .../core/common_runtime/gpu/gpu_allocator_retry_test.cc   | 6 +++---
 tensorflow/core/common_runtime/gpu/pool_allocator.h       | 2 +-
 tensorflow/core/framework/op.h                            | 3 +--
 tensorflow/core/framework/tracking_allocator.h            | 2 +-
 tensorflow/core/kernels/barrier_ops.cc                    | 8 +++-----
 tensorflow/core/kernels/conditional_accumulator.h         | 6 +++---
 tensorflow/core/kernels/conditional_accumulator_base.h    | 3 +--
 tensorflow/core/kernels/conditional_accumulator_base_op.h | 2 +-
 tensorflow/core/kernels/cwise_ops_common.h                | 5 -----
 tensorflow/core/kernels/queue_base.h                      | 2 +-
 tensorflow/core/kernels/queue_op.h                        | 2 +-
 tensorflow/core/kernels/sparse_conditional_accumulator.h  | 2 +-
 tensorflow/core/kernels/tensor_array.h                    | 3 ++-
 tensorflow/core/lib/monitoring/collection_registry.cc     | 4 +---
 tensorflow/core/lib/monitoring/collection_registry.h      | 5 +++--
 tensorflow/core/lib/monitoring/counter.h                  | 3 ++-
 tensorflow/stream_executor/machine_manager.h              | 7 ++++---
 19 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 3b3e4a134f6..0b528cb0c27 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -295,8 +295,6 @@ class BFCAllocator : public VisitableAllocator {
    private:
     std::vector<AllocationRegion> regions_;
   };
-  // Structures mutable after construction
-  mutable mutex lock_;
 
   // Returns 'bytes' rounded up to the next highest kMinAllocationSize.
   size_t RoundedBytes(size_t bytes);
@@ -395,6 +393,9 @@ class BFCAllocator : public VisitableAllocator {
 
   std::unique_ptr<SubAllocator> suballocator_;
   string name_;
+
+  // Structures mutable after construction
+  mutable mutex lock_;
   RegionManager region_manager_ GUARDED_BY(lock_);
 
   std::vector<Chunk> chunks_;
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 8fe4825aa6d..a4289112534 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -162,8 +162,6 @@ class DirectSession : public Session {
     protobuf::RepeatedPtrField<DebugTensorWatch> debug_tensor_watches;
   };
 
-  mutex graph_def_lock_;
-
   // Initializes the base execution state given the 'graph',
   // if not already initialized.
   Status MaybeInitializeExecutionState(const GraphDef& graph,
@@ -229,6 +227,7 @@ class DirectSession : public Session {
   string session_handle_;
   bool graph_created_ GUARDED_BY(graph_def_lock_) = false;
 
+  mutex graph_def_lock_;
   GraphDef graph_def_ GUARDED_BY(graph_def_lock_);
 
   // The thread-pools to use for running ops.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
index 423448773ae..2148f83fe57 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
@@ -28,7 +28,6 @@ namespace tensorflow {
 namespace {
 
 class FakeAllocator {
-  mutex mu_;
  public:
   FakeAllocator(size_t cap, int millis_to_wait)
       : memory_capacity_(cap), millis_to_wait_(millis_to_wait) {}
@@ -58,6 +57,7 @@ class FakeAllocator {
  private:
   AllocatorRetry retry_;
   void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
+  mutex mu_;
   size_t memory_capacity_ GUARDED_BY(mu_);
   int millis_to_wait_;
 };
@@ -72,7 +72,6 @@ class FakeAllocator {
 // interesting part of their interaction with the allocator.  This
 // class is the mechanism that imposes turn taking.
 class AlternatingBarrier {
-  mutex mu_;
  public:
   explicit AlternatingBarrier(int num_users)
       : num_users_(num_users), next_turn_(0), done_(num_users, false) {}
@@ -110,6 +109,7 @@ class AlternatingBarrier {
     }
   }
 
+  mutex mu_;
   condition_variable cv_;
   int num_users_;
   int next_turn_ GUARDED_BY(mu_);
@@ -118,7 +118,6 @@ class AlternatingBarrier {
 
 class GPUAllocatorRetryTest : public ::testing::Test {
  protected:
-  mutex mu_;
   GPUAllocatorRetryTest() {}
 
   void LaunchConsumerThreads(int num_consumers, int cap_needed) {
@@ -174,6 +173,7 @@ class GPUAllocatorRetryTest : public ::testing::Test {
   std::vector<Thread*> consumers_;
   std::vector<int> consumer_count_;
   Notification notifier_;
+  mutex mu_;
   bool has_failed_ GUARDED_BY(mu_) = false;
   int count_ GUARDED_BY(mu_) = 0;
 };
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 437fea91155..b2f0265145f 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -45,7 +45,6 @@ class RoundUpInterface {
 // Size-limited pool of memory buffers obtained from a SubAllocator
 // instance.  Pool eviction policy is LRU.
 class PoolAllocator : public VisitableAllocator {
-  mutex mutex_;
  public:
   // "pool_size_limit" is the maximum number of returned, re-usable
   // memory buffers to keep in the pool.  If pool_size_limit == 0, the
@@ -137,6 +136,7 @@ class PoolAllocator : public VisitableAllocator {
   size_t pool_size_limit_;
   std::unique_ptr<SubAllocator> allocator_;
   std::unique_ptr<RoundUpInterface> size_rounder_;
+  mutex mutex_;
   std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
   PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
   PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index 321ace9f465..f047ddb12a1 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -125,8 +125,6 @@ class OpRegistry : public OpRegistryInterface {
   void ClearDeferredRegistrations();
 
  private:
-  mutable mutex mu_;
-
   // Ensures that all the functions in deferred_ get called, their OpDef's
   // registered, and returns with deferred_ empty.  Returns true the first
   // time it is called. Prints a fatal log if any op registration fails.
@@ -143,6 +141,7 @@ class OpRegistry : public OpRegistryInterface {
   Status RegisterAlreadyLocked(OpRegistrationDataFactory op_data_factory) const
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  mutable mutex mu_;
   // Functions in deferred_ may only be called with mu_ held.
   mutable std::vector<OpRegistrationDataFactory> deferred_ GUARDED_BY(mu_);
   // Values are owned.
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index 040bb03f819..bb19f5dca04 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -74,11 +74,11 @@ class TrackingAllocator : public Allocator {
   std::pair<size_t, size_t> GetSizesAndUnRef();
 
  private:
-  mutex mu_;
   ~TrackingAllocator() override {}
   bool UnRef() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   Allocator* allocator_;  // not owned.
+  mutex mu_;
   // the number of calls to AllocateRaw that have not yet been matched
   // by a corresponding call to DeAllocateRaw, plus 1 if the Executor
   // has not yet read out the high watermark.
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index e91d9037cff..84f57517605 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -40,7 +40,6 @@ namespace tensorflow {
 namespace barrier {
 
 class Barrier : public ResourceBase {
-  mutex mu_;
  public:
   typedef std::vector<Tensor> Tuple;
   typedef std::function<void()> DoneCallback;
@@ -418,6 +417,7 @@ class Barrier : public ResourceBase {
 
  private:
   typedef std::vector<PersistentTensor> PersistentTuple;
+  mutex mu_;
   bool closed_ GUARDED_BY(mu_);
   bool queue_closed_ GUARDED_BY(mu_);
   bool queue_cancelled_ GUARDED_BY(mu_);
@@ -433,7 +433,6 @@ class Barrier : public ResourceBase {
 };
 
 class BarrierOp : public OpKernel {
-  mutex mu_;
  public:
   explicit BarrierOp(OpKernelConstruction* context)
       : OpKernel(context), barrier_handle_set_(false) {
@@ -512,6 +511,7 @@ class BarrierOp : public OpKernel {
   std::vector<TensorShape> value_component_shapes_;
   ContainerInfo cinfo_;
 
+  mutex mu_;
   PersistentTensor barrier_handle_ GUARDED_BY(mu_);
   bool barrier_handle_set_ GUARDED_BY(mu_);
 
@@ -611,9 +611,7 @@ class TakeManyOp : public BarrierOpKernel {
     DataTypeVector expected_inputs = {DT_STRING_REF, DT_INT32};
     // The first output is the insertion index, the second output is the key.
     DataTypeVector expected_outputs = {DT_INT64, DT_STRING};
-    for (auto it  = barrier->component_types().begin(),
-              end = barrier->component_types().end(); it!= end; it++ ){
-      const DataType dt = *it;
+    for (DataType dt : barrier->component_types()) {
       expected_outputs.push_back(dt);
     }
     OP_REQUIRES_OK_ASYNC(
diff --git a/tensorflow/core/kernels/conditional_accumulator.h b/tensorflow/core/kernels/conditional_accumulator.h
index 4ee1601f342..f8c340a7691 100644
--- a/tensorflow/core/kernels/conditional_accumulator.h
+++ b/tensorflow/core/kernels/conditional_accumulator.h
@@ -65,7 +65,7 @@ class ConditionalAccumulator
   functor::SetZeroFunctor<Device, T> set_zero_functor_;
 
   Status ValidateShape(const Tensor* tensor)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
     // Must be compatible with accumulated gradient if available
     if (counter_ > 0) {
       if (!accum_grad_->shape().IsSameSize(tensor->shape())) {
@@ -98,7 +98,7 @@ class ConditionalAccumulator
   }
 
   void DivideAccumGradByCounter(OpKernelContext* ctx) override
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
     Tensor c(DataTypeToEnum<T>::value, {});
     c.scalar<T>()() = TypeConverter<T, int>::ConvertUToT(this->counter_);
     this->accum_grad_->template flat<T>().device(
@@ -113,7 +113,7 @@ class ConditionalAccumulator
 
   bool GetAndValidateTensorInputForApplyGrad(OpKernelContext* ctx,
                                              const Tensor** tensor) override
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
     // Get input gradient tensor
     const Tensor* grad_tensor;
     OP_REQUIRES_OK_BOOLEAN(ctx, ctx->input("gradient", &grad_tensor));
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 9992379640d..05ee855daee 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -45,8 +45,6 @@ namespace tensorflow {
  * (3) the internal global_step value (current_global_step_) is incremented by 1
  */
 class ConditionalAccumulatorBase : public ResourceBase {
- protected:
-  mutex mu_;
  public:
   // Args:
   //   dtype: The datatype of the gradients to be accumulated.
@@ -127,6 +125,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
   const DataType dtype_;
   const PartialTensorShape shape_;
   const string name_;
+  mutex mu_;
   int counter_ GUARDED_BY(mu_);
   int64 current_global_step_ GUARDED_BY(mu_);
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base_op.h b/tensorflow/core/kernels/conditional_accumulator_base_op.h
index 0a64a857cdb..33c2d596c8b 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base_op.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base_op.h
@@ -43,7 +43,6 @@ namespace tensorflow {
  * ConditionalAccumulatorBase (via sub-class's Creator) and returns its handle.
  */
 class ConditionalAccumulatorBaseOp : public OpKernel {
-  mutex mu_;
  public:
   explicit ConditionalAccumulatorBaseOp(OpKernelConstruction* context)
       : OpKernel(context), accumulator_handle_set_(false) {
@@ -110,6 +109,7 @@ class ConditionalAccumulatorBaseOp : public OpKernel {
     return Status::OK();
   }
 
+  mutex mu_;
   PersistentTensor accumulator_handle_ GUARDED_BY(mu_);
   bool accumulator_handle_set_ GUARDED_BY(mu_);
 };
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index cea6c3d59df..c825a91fb16 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -89,13 +89,8 @@ class BinaryOp : public BinaryOpShared {
     if (!ctx->status().ok()) return;
     Tensor* out = state.out;
     BCast* bcast = &state.bcast;
-#if TENSORFLOW_USE_SYCL
-    decltype(state.in0) in0 = state.in0;
-    decltype(state.in1) in1 = state.in1;
-#else
     auto& in0 = state.in0;
     auto& in1 = state.in1;
-#endif
     if (state.out_num_elements == 0) {
       return;
     }
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index 6b2043e5a32..79b479b44b5 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -83,7 +83,6 @@ class QueueBase : public QueueInterface {
                                    int64 index);
 
  protected:
-  mutex mu_;
   enum Action { kEnqueue, kDequeue };
   enum RunResult { kNoProgress, kProgress, kComplete };
 
@@ -144,6 +143,7 @@ class QueueBase : public QueueInterface {
   const DataTypeVector component_dtypes_;
   const std::vector<TensorShape> component_shapes_;
   const string name_;
+  mutex mu_;
   bool closed_ GUARDED_BY(mu_);
 
   struct Attempt;
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index a21be2c389d..7694827854c 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -34,7 +34,6 @@ namespace tensorflow {
 
 // Defines a QueueOp, an abstract class for Queue construction ops.
 class QueueOp : public OpKernel {
-  mutex mu_;
  public:
   QueueOp(OpKernelConstruction* context)
       : OpKernel(context), queue_handle_set_(false) {
@@ -95,6 +94,7 @@ class QueueOp : public OpKernel {
     return Status::OK();
   }
 
+  mutex mu_;
   PersistentTensor queue_handle_ GUARDED_BY(mu_);
   bool queue_handle_set_ GUARDED_BY(mu_);
 };
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator.h b/tensorflow/core/kernels/sparse_conditional_accumulator.h
index 73bd3b47e48..89560094af6 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator.h
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator.h
@@ -83,7 +83,7 @@ class SparseConditionalAccumulator
 
   Status ValidateShape(
       std::tuple<const Tensor*, const Tensor*, const Tensor*>* tensor,
-      bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
     const Tensor* tensor_idx = std::get<0>(*tensor);
     const Tensor* tensor_val = std::get<1>(*tensor);
     const Tensor* tensor_shape = std::get<2>(*tensor);
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index ec651f72ecf..1fb1be6b628 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -124,7 +124,6 @@ TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU);
 //     multiple reads of that index in the forward phase.
 //
 class TensorArray : public ResourceBase {
-  mutex mu_;
  public:
   static std::atomic<int64> tensor_array_counter;
 
@@ -340,6 +339,8 @@ class TensorArray : public ResourceBase {
   const DataType dtype_;
   Tensor handle_;
 
+  mutex mu_;
+
   // Marks that the tensor_array_ has been cleared.
   bool closed_ GUARDED_BY(mu_);
 
diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc
index 01d643fbcca..47112279cff 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry.cc
@@ -45,9 +45,7 @@ void Collector::CollectMetricDescriptor(
   metric_descriptor->name = metric_def->name().ToString();
   metric_descriptor->description = metric_def->description().ToString();
 
-  for (auto it  = metric_def->label_descriptions().begin(),
-            end = metric_def->label_descriptions().end() ; it!=end ;it++  ) {
-    const StringPiece label_name = *it;
+  for (const StringPiece label_name : metric_def->label_descriptions()) {
     metric_descriptor->label_names.push_back(label_name.ToString());
   }
 
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index ed957b9ae45..3da2439238f 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -121,7 +121,6 @@ class MetricCollectorGetter {
 //
 // This class is thread-safe.
 class CollectionRegistry {
-  mutable mutex mu_;
  public:
   ~CollectionRegistry() = default;
 
@@ -177,6 +176,8 @@ class CollectionRegistry {
   // TF environment, mainly used for timestamping.
   Env* const env_;
 
+  mutable mutex mu_;
+
   // Information required for collection.
   struct CollectionInfo {
     const AbstractMetricDef* const metric_def;
@@ -226,7 +227,6 @@ inline void CollectValue(const int64& value, Point* const point) {
 //
 // This class is thread-safe.
 class Collector {
-  mutable mutex mu_;
  public:
   Collector(const uint64 collection_time_millis)
       : collected_metrics_(new CollectedMetrics()),
@@ -260,6 +260,7 @@ class Collector {
       LOCKS_EXCLUDED(mu_);
 
  private:
+  mutable mutex mu_;
   std::unique_ptr<CollectedMetrics> collected_metrics_ GUARDED_BY(mu_);
   const uint64 collection_time_millis_;
 
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index 0ea50932dd9..e76057b980a 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -78,7 +78,6 @@ class CounterCell {
 // This class is thread-safe.
 template <int NumLabels>
 class Counter {
-  mutable mutex mu_;
  public:
   ~Counter() {
     // Deleted here, before the metric_def is destroyed.
@@ -112,6 +111,8 @@ class Counter {
               }
             })) {}
 
+  mutable mutex mu_;
+
   // The metric definition. This will be used to identify the metric when we
   // register it for collection.
   const MetricDef<MetricKind::kCumulative, int64, NumLabels> metric_def_;
diff --git a/tensorflow/stream_executor/machine_manager.h b/tensorflow/stream_executor/machine_manager.h
index bf95bc74713..65396dd1ff5 100644
--- a/tensorflow/stream_executor/machine_manager.h
+++ b/tensorflow/stream_executor/machine_manager.h
@@ -60,9 +60,6 @@ namespace gputools {
 //
 // Thread-safe.
 class MachineManager {
-  // Mutex that guards the initialization of the machine manager static
-  // variable.
-  static mutex mu_;
  public:
   // Inspects the host to determine the preferred GPU execution platform.
   // To force OpenCL from a build target on a machine that has both OpenCL and
@@ -174,6 +171,10 @@ class MachineManager {
   // Returns the NUMA node association for the StreamExecutor.
   int ExecutorToNumaNode(const StreamExecutor *stream_exec) const;
 
+  // Mutex that guards the initialization of the machine manager static
+  // variable.
+  static mutex mu_;
+
   // Singleton MachineManager value -- assignment to this is protected by a
   // static singleton guard clause.
   static MachineManager *singleton_ GUARDED_BY(mu_);

From 0ad69511e18b6d87eef5ae76ff9332021d2cd955 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:12:20 +0100
Subject: [PATCH 18/51] Quick copy can be used for host buffers when using
 sycl.

---
 tensorflow/core/common_runtime/rendezvous_mgr.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 285ac7540c8..6b57e8a0e81 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -65,10 +65,10 @@ void IntraProcessRendezvous::SameWorkerRecvDone(
     StatusCallback done) {
   // Do a quick copy (sharing the underlying buffer) if both tensors
   // are on host memory.
-  const bool src_host =
-      (send_args.alloc_attrs.on_host() || parsed.src.type == "CPU");
-  const bool dst_host =
-      (recv_args.alloc_attrs.on_host() || parsed.dst.type == "CPU");
+  const bool src_host = (send_args.alloc_attrs.on_host() ||
+                         parsed.src.type == "CPU" || parsed.src.type == "SYCL");
+  const bool dst_host = (recv_args.alloc_attrs.on_host() ||
+                         parsed.dst.type == "CPU" || parsed.dst.type == "SYCL");
   if (src_host && dst_host) {
     *out = in;
     done(Status::OK());

From d4e0f8f654f558458577ce9c848bfd8c0239e2e7 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:18:23 +0100
Subject: [PATCH 19/51] Added eigen_device that returns Eigen::SyclDevice to
 op_kernel.

---
 tensorflow/core/framework/op_kernel.cc | 5 +++++
 tensorflow/core/framework/op_kernel.h  | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index a7a93cb69cf..d233690a2b3 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -949,6 +949,11 @@ const Eigen::GpuDevice& OpKernelContext::eigen_device() const {
   return eigen_gpu_device();
 }
 
+template <>
+const Eigen::SyclDevice& OpKernelContext::eigen_device() const {
+  return eigen_sycl_device();
+}
+
 void OpKernelConstruction::CtxFailure(Status s) {
   VLOG(1) << s;
   SetStatus(s);
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 4c14918ea70..55de40c4714 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -53,6 +53,7 @@ limitations under the License.
 namespace Eigen {
 struct ThreadPoolDevice;
 struct GpuDevice;
+struct SyclDevice;
 }  // end namespace Eigen
 
 namespace tensorflow {
@@ -891,6 +892,9 @@ class OpKernelContext {
   const Eigen::GpuDevice& eigen_gpu_device() const {
     return params_->eigen_gpu_device->device();
   }
+  const Eigen::SyclDevice& eigen_sycl_device() const {
+    return *device()->eigen_sycl_device();
+  }
   template <typename EigenDeviceType>
   const EigenDeviceType& eigen_device() const;
 

From 05cff7db89df7894dc9c0d40668a4915f41b65c6 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:20:29 +0100
Subject: [PATCH 20/51] Registered Const Op for SYCL device.

---
 tensorflow/core/kernels/constant_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index a743be66124..946661aa39d 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -50,6 +50,7 @@ void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); }
 ConstantOp::~ConstantOp() {}
 
 REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_SYCL), ConstantOp);
 
 #if GOOGLE_CUDA
 #define REGISTER_KERNEL(D, TYPE)                                      \

From 397972c4903d6d9fa1f266e1256bcc6ba786809f Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:23:19 +0100
Subject: [PATCH 21/51] Partial specialisation of UnaryFunctor for SYCLDevice
 has been added.

---
 tensorflow/core/kernels/cwise_ops_sycl_common.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index baba610d6de..868fe9b8270 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -37,6 +37,15 @@ void Assign(const SYCLDevice& d, OUT out, RHS rhs) {
   out.device(d) = rhs;
 }
 
+// Partial specialization of UnaryFunctor<Device=SYCLDevice, Functor>.
+template <typename Functor>
+struct UnaryFunctor<SYCLDevice, Functor> {
+  void operator()(const SYCLDevice& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in) {
+    To32Bit(out).device(d) = To32Bit(in).unaryExpr(typename Functor::func());
+  }
+};
+
 // Partial specialization of BinaryFunctor<Device=SYCLDevice, Functor>.
 template <typename Functor, int NDIMS, bool has_errors>
 struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {

From 41e439e629c4b8c647c3bc34f7155be90ea51029 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:25:05 +0100
Subject: [PATCH 22/51] _Arg and _Retval has been registered for SYCL device.

---
 tensorflow/core/kernels/function_ops.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 56253eb64a7..d348a0dff3b 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -86,6 +86,9 @@ class RetvalOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp);
 REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp);
 
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_SYCL), ArgOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_SYCL), RetvalOp);
+
 #define REGISTER_GPU_KERNELS(type)                                       \
   REGISTER_KERNEL_BUILDER(                                               \
       Name("_Arg").Device(DEVICE_GPU).TypeConstraint<type>("T"), ArgOp); \

From 115bc8e54d0d46d4a1e6515fb82101ec8a191112 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:26:24 +0100
Subject: [PATCH 23/51] Identity Op has been registered for SYCL device.

---
 tensorflow/core/kernels/identity_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index 459d329ba4a..ced8280e3bc 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_SYCL), IdentityOp);
 // StopGradient does the same thing as Identity, but has a different
 // gradient registered.
 REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp);

From 81e02084337f5fa5deceeb59fe86f1dcfbb4ab34 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:27:30 +0100
Subject: [PATCH 24/51] NoOp has been registered for SYCL device.

---
 tensorflow/core/kernels/no_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc
index 0ad05ee3235..e4c7f48e773 100644
--- a/tensorflow/core/kernels/no_op.cc
+++ b/tensorflow/core/kernels/no_op.cc
@@ -19,5 +19,6 @@ namespace tensorflow {
 
 REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp);
 REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp);
+REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_SYCL), NoOp);
 
 }  // namespace tensorflow

From d60cccf42951f9aff63f500d6441ef6f18edea52 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:30:23 +0100
Subject: [PATCH 25/51] _Send and _Recv Ops has been registered for SYCL
 device.

---
 tensorflow/core/kernels/sendrecv_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index c2a04ed0c4c..38b0bf5756e 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -77,6 +77,7 @@ void SendOp::Compute(OpKernelContext* ctx) {
 
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
+REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
 
 REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
@@ -135,6 +136,7 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
 
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
+REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
 
 REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(

From fe9a864aad6b2cafdb6cd379e4be0f123247fa65 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:33:32 +0100
Subject: [PATCH 26/51] Const op has been registered for SYCL device.

---
 tensorflow/core/kernels/cwise_op_round.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc
index 0457f3931d8..c87157f2d37 100644
--- a/tensorflow/core/kernels/cwise_op_round.cc
+++ b/tensorflow/core/kernels/cwise_op_round.cc
@@ -18,6 +18,12 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
           int32, int64);
+
+REGISTER(UnaryOp, SYCL, "Round", functor::round, float);
+namespace functor {
+DEFINE_UNARY1(round, float);
+}  // namespace functor
+
 #if GOOGLE_CUDA
 REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double,
           int32, int64);

From 32f53d9b064f1451690e0b75ea71845239bb8f3b Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 07:40:19 +0100
Subject: [PATCH 27/51] BlockingOp has been registered for SYCL device.

---
 tensorflow/core/common_runtime/direct_session_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 38dd627da0c..124688515bd 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -818,6 +818,8 @@ class BlockingOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
 REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc("");
 
+REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_SYCL), BlockingOp);
+
 static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
   FunctionDefLibrary library_graph_def;
   if (use_function_lib) {

From 9401110a9d8545c40fce638e4c63fcbd2668d0fc Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Thu, 27 Oct 2016 20:55:30 +0100
Subject: [PATCH 28/51] bus_adjacency -> const DeviceLocality& locality. And
 dummy GetShortDeviceDescription for SYCL device.

---
 tensorflow/core/common_runtime/sycl/sycl_device.cc     |  9 +++++----
 tensorflow/core/common_runtime/sycl/sycl_device.h      | 10 ++++++++--
 .../core/common_runtime/sycl/sycl_device_factory.cc    |  5 +++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index e13c34dd690..fbb4d148bf3 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -27,10 +27,11 @@ cl::sycl::gpu_selector s;
 cl::sycl::queue q(s);
 
 SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name,
-                       Bytes memory_limit, BusAdjacency bus_adjacency,
-                       Allocator* allocator)
-    : LocalDevice(options, Device::BuildDeviceAttributes(
-                               name, DEVICE_SYCL, memory_limit, bus_adjacency),
+                       Bytes memory_limit, const DeviceLocality& locality,
+                       const string& physical_device_desc, Allocator* allocator)
+    : LocalDevice(options,
+                  Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
+                                                locality, physical_device_desc),
                   allocator),
       allocator_(allocator),
       device_(q) {
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index e43997cbf25..660ccfa6cea 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -31,8 +31,8 @@ namespace tensorflow {
 class SYCLDevice : public LocalDevice {
  public:
   SYCLDevice(const SessionOptions& options, const string& name,
-             Bytes memory_limit, BusAdjacency bus_adjacency,
-             Allocator* allocator);
+             Bytes memory_limit, const DeviceLocality& locality,
+             const string& physical_device_desc, Allocator* allocator);
   ~SYCLDevice() override;
 
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
@@ -42,6 +42,12 @@ class SYCLDevice : public LocalDevice {
                              Tensor* tensor) override;
 
   Status Sync() override { return Status::OK(); }
+  static string GetShortDeviceDescription(/*int device_id,
+                                          const DeviceDescription& desc*/) {
+    return strings::StrCat("device: 0, name SYCL, pci bus id: 0");
+    // return strings::StrCat("device: ", device_id, ", name: ", desc.name(),
+    //                       ", pci bus id: ", desc.pci_bus_id());
+  }
 
  private:
   Allocator* allocator_;  // Not owned
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index fe10412ab6c..fba4a5f4018 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -30,8 +30,9 @@ class SYCLDeviceFactory : public DeviceFactory {
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/sycl:", i);
-      devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20),
-                                        BUS_ANY, cpu_allocator()));
+      devices->push_back(new SYCLDevice(
+          options, name, Bytes(256 << 20), DeviceLocality(),
+          SYCLDevice::GetShortDeviceDescription(), cpu_allocator()));
     }
     return Status::OK();
   }

From 0b4ce7841aa2d18c8546cfe75ef2c58b89864996 Mon Sep 17 00:00:00 2001
From: luke <luke@codeplay.com>
Date: Fri, 28 Oct 2016 11:27:29 +0100
Subject: [PATCH 29/51] Added #ifdef guard for SYCL related code.

---
 tensorflow/core/framework/op_kernel.cc    | 2 ++
 tensorflow/core/framework/op_kernel.h     | 2 ++
 tensorflow/core/kernels/cwise_op_round.cc | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index d233690a2b3..50520bb3fd5 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -949,10 +949,12 @@ const Eigen::GpuDevice& OpKernelContext::eigen_device() const {
   return eigen_gpu_device();
 }
 
+#ifdef TENSORFLOW_USE_SYCL
 template <>
 const Eigen::SyclDevice& OpKernelContext::eigen_device() const {
   return eigen_sycl_device();
 }
+#endif
 
 void OpKernelConstruction::CtxFailure(Status s) {
   VLOG(1) << s;
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 55de40c4714..432e2ad2f6e 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -892,9 +892,11 @@ class OpKernelContext {
   const Eigen::GpuDevice& eigen_gpu_device() const {
     return params_->eigen_gpu_device->device();
   }
+#ifdef TENSORFLOW_USE_SYCL
   const Eigen::SyclDevice& eigen_sycl_device() const {
     return *device()->eigen_sycl_device();
   }
+#endif
   template <typename EigenDeviceType>
   const EigenDeviceType& eigen_device() const;
 
diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc
index c87157f2d37..7a4482dbb2b 100644
--- a/tensorflow/core/kernels/cwise_op_round.cc
+++ b/tensorflow/core/kernels/cwise_op_round.cc
@@ -19,10 +19,12 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
           int32, int64);
 
+#ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Round", functor::round, float);
 namespace functor {
 DEFINE_UNARY1(round, float);
 }  // namespace functor
+#endif
 
 #if GOOGLE_CUDA
 REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double,

From 27e116b85cccb016a380f8977b89c4e3638d5b25 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 28 Oct 2016 13:10:01 -0700
Subject: [PATCH 30/51] Fixed a typo

---
 tensorflow/workspace.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c8ec9fb0ab1..2211f1a28c6 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -22,7 +22,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.new_http_archive(
     name = "eigen_archive",
-    url = "http://bitbucket.org/eigen/eigen/" + eigen_version + ".tar.gz",
+    url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
     sha256 = eigen_sha256,
     strip_prefix = "eigen-eigen-" + eigen_version,
     build_file = str(Label("//:eigen.BUILD")),

From 174c6edb7dd5f99b4e9b0f45a0eaf978b578e85c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 28 Oct 2016 13:42:37 -0700
Subject: [PATCH 31/51] Cleanup

---
 eigen.BUILD | 2 --
 1 file changed, 2 deletions(-)

diff --git a/eigen.BUILD b/eigen.BUILD
index 210d1523ea3..8a699f6aa84 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -55,8 +55,6 @@ EIGEN_MPL2_HEADER_FILES = glob(
     ],
 )
 
-# archive_dir = "benoitsteiner-opencl-9d4a08d57d0d"
-
 cc_library(
     name = "eigen",
     hdrs = EIGEN_MPL2_HEADER_FILES,

From cf33ec5e6a33fea15b719a0dd8c16d5b1a5c8b70 Mon Sep 17 00:00:00 2001
From: luke iwanski <luke@codeplay.com>
Date: Tue, 1 Nov 2016 16:47:00 +0000
Subject: [PATCH 32/51] Feedback from #5267 applied.

---
 tensorflow/core/common_runtime/sycl/sycl_device.h          | 2 --
 tensorflow/core/common_runtime/sycl/sycl_device_factory.cc | 2 +-
 tensorflow/core/kernels/constant_op.cc                     | 2 ++
 tensorflow/core/kernels/cwise_ops_sycl_common.h            | 7 -------
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index 660ccfa6cea..9299bb2af3d 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -45,8 +45,6 @@ class SYCLDevice : public LocalDevice {
   static string GetShortDeviceDescription(/*int device_id,
                                           const DeviceDescription& desc*/) {
     return strings::StrCat("device: 0, name SYCL, pci bus id: 0");
-    // return strings::StrCat("device: ", device_id, ", name: ", desc.name(),
-    //                       ", pci bus id: ", desc.pci_bus_id());
   }
 
  private:
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index fba4a5f4018..6de97715283 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #if TENSORFLOW_USE_SYCL
 
-#include "sycl_device.h"
+#include "tensorflow/core/common_runtime/sycl/sycl_device.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index afb1034f9fb..00b44555a3d 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -50,7 +50,9 @@ void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); }
 ConstantOp::~ConstantOp() {}
 
 REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
+#ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_SYCL), ConstantOp);
+#endif
 
 #if GOOGLE_CUDA
 #define REGISTER_KERNEL(D, TYPE)                                      \
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index 868fe9b8270..c66ae42c2d2 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -58,12 +58,6 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
   void Left(const SYCLDevice& d, typename Functor::tout_type out,
             typename Functor::tscalar_type scalar,
             typename Functor::tin_type in, bool* error) {
-    // typedef typename Functor::out_type Tout;
-    // typedef typename Functor::in_type Tin;
-    // typedef typename Functor::func Binary;
-    // typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
-    // Assign(d, out, in.unaryExpr(Unary(scalar.data())));
-    //  printf("BinaryFunctor::Left NOT IMPLEMENTED ! \n");
     LOG(FATAL) << "BinaryFunctor::Left NOT IMPLEMENTED ! ";
   }
 
@@ -85,7 +79,6 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
              typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1,
              bool* error) {
     LOG(FATAL) << "BinaryFunctor::BCast NOT IMPLEMENTED ";
-    //    printf("BinaryFunctor::BCast NOT IMPLEMENTED ! \n");
   }
 };
 

From f58374e88ddfbc7ead046c8b42a5cb8423603cac Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 3 Jun 2016 10:24:45 -0700
Subject: [PATCH 33/51] Pull the latest Eigen version that supports OpenCL

---
 third_party/eigen3/Eigen/Eigenvalues | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues
index bf739b9b850..be27849e3b8 100644
--- a/third_party/eigen3/Eigen/Eigenvalues
+++ b/third_party/eigen3/Eigen/Eigenvalues
@@ -1 +1,5 @@
+<<<<<<< d9a89a5cf63b2ecedf68d3eefdc24be3f519e503
 #include "Eigen/Eigenvalues"
+=======
+#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Eigenvalues"
+>>>>>>> Pull the latest Eigen version that supports OpenCL

From 04454bcaa89b5be5a8fd0273ee49c5405f919b5a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 12 Oct 2016 22:52:59 -0700
Subject: [PATCH 34/51] Switch to the latest version of Eigen that supports
 OpenCL

---
 tensorflow/workspace.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2211f1a28c6..d6c0aff9b1e 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -22,9 +22,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.new_http_archive(
     name = "eigen_archive",
-    url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
+    url = "http://bitbucket.org/benoitsteiner/opencl/get/" + eigen_version + ".tar.gz",
     sha256 = eigen_sha256,
-    strip_prefix = "eigen-eigen-" + eigen_version,
+    strip_prefix = "benoitsteiner-opencl-" + eigen_version,
     build_file = str(Label("//:eigen.BUILD")),
   )
 

From ac7584fac29543683bb418dc346b23b9e904b799 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 19 Oct 2016 12:24:14 -0700
Subject: [PATCH 35/51] Upgraded to the latest version of Eigen

---
 tensorflow/workspace.bzl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d6c0aff9b1e..61bd20808ab 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -37,9 +37,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.http_archive(
     name = "gemmlowp",
+<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074
     url = "http://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
     sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26",
     strip_prefix = "gemmlowp-a6f29d8ac48d63293f845f2253eccbf86bc28321",
+=======
+    url = "http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz",
+    sha256 = "dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882",
+    strip_prefix = "gemmlowp-c0bacf11fb509a2cbe15a97362a2df067ffd57a2",
+>>>>>>> Upgraded to the latest version of Eigen
   )
 
   native.new_http_archive(
@@ -101,9 +107,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.http_archive(
     name = "protobuf",
+<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074
     url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz",
     sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab",
     strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db",
+=======
+    url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
+    sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
+    strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
+>>>>>>> Upgraded to the latest version of Eigen
   )
 
   native.new_http_archive(

From f4325776061a31283f2a377ffa6f5de329b9903a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 28 Oct 2016 13:10:01 -0700
Subject: [PATCH 36/51] Fixed a typo

---
 tensorflow/workspace.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 61bd20808ab..b78534755c9 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -22,9 +22,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.new_http_archive(
     name = "eigen_archive",
-    url = "http://bitbucket.org/benoitsteiner/opencl/get/" + eigen_version + ".tar.gz",
+    url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
     sha256 = eigen_sha256,
-    strip_prefix = "benoitsteiner-opencl-" + eigen_version,
+    strip_prefix = "eigen-eigen-" + eigen_version,
     build_file = str(Label("//:eigen.BUILD")),
   )
 

From 8e06a6e655c1d97496fa29e1b4435f8f0fb913aa Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 1 Nov 2016 10:32:48 -0700
Subject: [PATCH 37/51] Fixed merge conflicts.

---
 tensorflow/workspace.bzl | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b78534755c9..2211f1a28c6 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -37,15 +37,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.http_archive(
     name = "gemmlowp",
-<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074
     url = "http://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
     sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26",
     strip_prefix = "gemmlowp-a6f29d8ac48d63293f845f2253eccbf86bc28321",
-=======
-    url = "http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz",
-    sha256 = "dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882",
-    strip_prefix = "gemmlowp-c0bacf11fb509a2cbe15a97362a2df067ffd57a2",
->>>>>>> Upgraded to the latest version of Eigen
   )
 
   native.new_http_archive(
@@ -107,15 +101,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   native.http_archive(
     name = "protobuf",
-<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074
     url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz",
     sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab",
     strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db",
-=======
-    url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
-    sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
-    strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
->>>>>>> Upgraded to the latest version of Eigen
   )
 
   native.new_http_archive(

From 4c85e1cd6c6d432265a566e058b7389f23ec4aa5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 1 Nov 2016 10:56:23 -0700
Subject: [PATCH 38/51] Fixed merge conflict

---
 third_party/eigen3/Eigen/Eigenvalues | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues
index be27849e3b8..bf739b9b850 100644
--- a/third_party/eigen3/Eigen/Eigenvalues
+++ b/third_party/eigen3/Eigen/Eigenvalues
@@ -1,5 +1 @@
-<<<<<<< d9a89a5cf63b2ecedf68d3eefdc24be3f519e503
 #include "Eigen/Eigenvalues"
-=======
-#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Eigenvalues"
->>>>>>> Pull the latest Eigen version that supports OpenCL

From f52f0f586e7ae72ca60241bf2f02a96923809723 Mon Sep 17 00:00:00 2001
From: Xiaoqiang Zheng <zhengxq@google.com>
Date: Fri, 28 Oct 2016 10:29:28 -0800
Subject: [PATCH 39/51] Merge changes from github. Change: 137532946

---
 tensorflow/core/ops/math_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index ff00214da3c..9e553694b92 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -976,7 +976,7 @@ REGISTER_OP("Select")
       c->set_output(0, data);
 
       return Status::OK();
-    })
+   })
     .Doc(R"doc(
 Selects elements from `t` or `e`, depending on `condition`.
 

From 7326f9e9c0cbcb2e1584dff551cbbc4c41eb103f Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 08:22:08 -0700
Subject: [PATCH 40/51] Improved the formatting of the BUILD file

---
 third_party/eigen3/BUILD | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index b45af94e540..f697866bde2 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -23,7 +23,8 @@ cc_library(
         "unsupported/Eigen/CXX11/FixedPoint",
     ],
     visibility = ["//visibility:public"],
-    deps = ["@eigen_archive//:eigen",
-            "@local_config_sycl//sycl:sycl",
+    deps = [
+        "@eigen_archive//:eigen",
+        "@local_config_sycl//sycl:sycl",
     ],
 )

From 810ade7a449b8e4c168de614c35d17dccd68f291 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 09:17:10 -0700
Subject: [PATCH 41/51] Deleted unecessary code

---
 .../core/common_runtime/device_factory.cc      | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index 15933a81992..84362d4b8ab 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -90,27 +90,11 @@ Status DeviceFactory::AddDevices(const SessionOptions& options,
     return errors::NotFound("No CPU devices are available in this process");
   }
 
-  // Then GPU.
-  auto gpu_factory = GetFactory("GPU");
-  if (gpu_factory) {
-    TF_RETURN_IF_ERROR(
-        gpu_factory->CreateDevices(options, name_prefix, devices));
-  }
-
-  // Then SYCL.
-  auto sycl_factory = GetFactory("SYCL");
-
-  if (sycl_factory) {
-    TF_RETURN_IF_ERROR(
-        sycl_factory->CreateDevices(options, name_prefix, devices));
-  }
-
   // Then the rest.
   mutex_lock l(*get_device_factory_lock());
   for (auto& p : device_factories()) {
     auto factory = p.second.factory.get();
-    if (factory != cpu_factory && factory != gpu_factory &&
-        factory != sycl_factory) {
+    if (factory != cpu_factory) {
       TF_RETURN_IF_ERROR(factory->CreateDevices(options, name_prefix, devices));
     }
   }

From 3f39dec966cc89cd3ff8f1c86995276e12ffd083 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 10:33:49 -0700
Subject: [PATCH 42/51] Placement of operations of SYCL devices needs to be
 done by using the /device:sycl string.

---
 tensorflow/core/util/device_name_utils.cc | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index 336b69b3aba..ac183004966 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -142,6 +142,7 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
       progress = true;
     }
 
+    // Handle legacy naming convention for cpu and gpu.
     if (str_util::ConsumePrefix(&fullname, "/cpu:") ||
         str_util::ConsumePrefix(&fullname, "/CPU:")) {
       p->has_type = true;
@@ -162,16 +163,6 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
       }
       progress = true;
     }
-    if (str_util::ConsumePrefix(&fullname, "/sycl:") ||
-        str_util::ConsumePrefix(&fullname, "/SYCL:")) {
-      p->has_type = true;
-      p->type = "SYCL";  // Treat '/sycl:..' as uppercase '/device:SYCL:...'
-      p->has_id = !str_util::ConsumePrefix(&fullname, "*");
-      if (p->has_id && !ConsumeNumber(&fullname, &p->id)) {
-        return false;
-      }
-      progress = true;
-    }
 
     if (!progress) {
       return false;

From d5511081f54788b4f7851f81ac9a058a3fc620f0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 18:39:51 -0700
Subject: [PATCH 43/51] Register the constant op on sycl devices for numeric
 types only: we don't want to process strings on an accelerator.

---
 tensorflow/core/kernels/constant_op.cc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 00b44555a3d..4a289e1800f 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -50,8 +50,16 @@ void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); }
 ConstantOp::~ConstantOp() {}
 
 REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_SYCL), ConstantOp);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+                          Name("Const")                               \
+                          .Device(DEVICE_SYCL)                        \
+                          .TypeConstraint<TYPE>("dtype"),             \
+			  ConstantOp);
+TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+#undef REGISTER_SYCL_KERNEL
 #endif
 
 #if GOOGLE_CUDA

From 90113e739cf0c6e93332bf02f8342df13950d4d1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 18:44:48 -0700
Subject: [PATCH 44/51] Added a device context for sycl

---
 tensorflow/core/BUILD                         |  2 +
 .../core/common_runtime/rendezvous_mgr.cc     |  4 +-
 .../core/common_runtime/sycl/sycl_device.cc   | 19 +++++++-
 .../core/common_runtime/sycl/sycl_device.h    |  5 ++
 .../sycl/sycl_device_context.cc               | 46 +++++++++++++++++++
 .../common_runtime/sycl/sycl_device_context.h | 42 +++++++++++++++++
 6 files changed, 115 insertions(+), 3 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device_context.cc
 create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device_context.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 90445ed4473..ff3fdb518fd 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1380,10 +1380,12 @@ cc_library(
     name = "sycl_runtime",
     srcs = if_not_windows([
         "common_runtime/sycl/sycl_device.cc",
+        "common_runtime/sycl/sycl_device_context.cc",
         "common_runtime/sycl/sycl_device_factory.cc",
     ]),
     hdrs = if_not_windows([
         "common_runtime/sycl/sycl_device.h",
+	"common_runtime/sycl/sycl_device_context.h",
     ]),
     copts = tf_copts(),
     linkstatic = 1,
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 6b57e8a0e81..bbdaa6d85b7 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -66,9 +66,9 @@ void IntraProcessRendezvous::SameWorkerRecvDone(
   // Do a quick copy (sharing the underlying buffer) if both tensors
   // are on host memory.
   const bool src_host = (send_args.alloc_attrs.on_host() ||
-                         parsed.src.type == "CPU" || parsed.src.type == "SYCL");
+                         parsed.src.type == "CPU");
   const bool dst_host = (recv_args.alloc_attrs.on_host() ||
-                         parsed.dst.type == "CPU" || parsed.dst.type == "SYCL");
+                         parsed.dst.type == "CPU");
   if (src_host && dst_host) {
     *out = in;
     done(Status::OK());
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index fbb4d148bf3..ae5b5fbb582 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -34,11 +34,14 @@ SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name,
                                                 locality, physical_device_desc),
                   allocator),
       allocator_(allocator),
+      device_context_(new SYCLDeviceContext()),
       device_(q) {
   set_eigen_sycl_device(&device_);
 }
 
-SYCLDevice::~SYCLDevice() {}
+SYCLDevice::~SYCLDevice() {
+  device_context_->Unref();
+}
 
 void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   assert(context);
@@ -66,6 +69,20 @@ Status SYCLDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
   *tensor = std::move(parsed);
   return Status::OK();
 }
+
+Status SYCLDevice::FillContextMap(const Graph* graph,
+				  DeviceContextMap* device_context_map) {
+  // Fill in the context map.  It is OK for this map to contain
+  // duplicate DeviceContexts so long as we increment the refcount.
+  device_context_map->resize(graph->num_node_ids());
+  for (Node* n : graph->nodes()) {
+    device_context_->Ref();
+    (*device_context_map)[n->id()] = device_context_;
+  }
+
+  return Status::OK();
 }
 
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index 9299bb2af3d..eaa9429b167 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -41,6 +42,9 @@ class SYCLDevice : public LocalDevice {
                              const AllocatorAttributes alloc_attrs,
                              Tensor* tensor) override;
 
+  Status FillContextMap(const Graph* graph,
+                        DeviceContextMap* device_context_map) override;
+
   Status Sync() override { return Status::OK(); }
   static string GetShortDeviceDescription(/*int device_id,
                                           const DeviceDescription& desc*/) {
@@ -49,6 +53,7 @@ class SYCLDevice : public LocalDevice {
 
  private:
   Allocator* allocator_;  // Not owned
+  SYCLDeviceContext* device_context_;
   Eigen::SyclDevice device_;
 };
 
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
new file mode 100644
index 00000000000..bbf241a22f0
--- /dev/null
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
@@ -0,0 +1,46 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+
+namespace tensorflow {
+
+void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
+					      Tensor* device_tensor,
+					      StatusCallback done) const {
+  const int64 total_bytes = cpu_tensor->TotalBytes();
+  if (total_bytes > 0) {
+    const void* src_ptr = DMAHelper::base(cpu_tensor);
+    void* dst_ptr = DMAHelper::base(device_tensor);
+    ::memcpy(dst_ptr, src_ptr, total_bytes);
+  }
+  done(Status::OK());
+}
+  
+void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name,
+					      Device* device, Tensor* cpu_tensor,
+					      StatusCallback done) {
+  const int64 total_bytes = device_tensor->TotalBytes();
+  if (total_bytes > 0) {
+    const void* src_ptr = DMAHelper::base(device_tensor);
+    void* dst_ptr = DMAHelper::base(cpu_tensor);
+    ::memcpy(dst_ptr, src_ptr, total_bytes);
+  }
+  done(Status::OK());
+}
+
+}  // namespace tensorflow
+
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.h b/tensorflow/core/common_runtime/sycl/sycl_device_context.h
new file mode 100644
index 00000000000..327de52eaa1
--- /dev/null
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.h
@@ -0,0 +1,42 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
+#define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/device_base.h"
+
+namespace tensorflow {
+
+class SYCLDeviceContext : public DeviceContext {
+ public:
+  SYCLDeviceContext() {}
+  
+  ~SYCLDeviceContext() override {}
+  
+  void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
+			     Tensor* device_tensor,
+			     StatusCallback done) const override;
+  
+  void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name,
+			     Device* device, Tensor* cpu_tensor,
+			     StatusCallback done) override;
+  
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_

From 8f97e2448dc1df374f5ec28457da047d6db3121b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 19:37:49 -0700
Subject: [PATCH 45/51] Register the sycl device under the '/device:SYCL:id'
 name so that its name follow the standard naming convention for devices

---
 tensorflow/core/common_runtime/sycl/sycl_device_factory.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 6de97715283..97c4c2c236e 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -29,7 +29,7 @@ class SYCLDeviceFactory : public DeviceFactory {
       n = iter->second;
     }
     for (int i = 0; i < n; i++) {
-      string name = strings::StrCat(name_prefix, "/sycl:", i);
+      string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
       devices->push_back(new SYCLDevice(
           options, name, Bytes(256 << 20), DeviceLocality(),
           SYCLDevice::GetShortDeviceDescription(), cpu_allocator()));

From d9037a06b4dacdd360db6eac7cd1338c48c0e385 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 19:39:36 -0700
Subject: [PATCH 46/51] Upgraded to the latest version of Eigen

---
 tensorflow/workspace.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2211f1a28c6..a485e25a9d2 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -17,8 +17,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   # These lines need to be changed when updating Eigen. They are parsed from
   # this file by the cmake and make builds to determine the eigen version and
   # hash.
-  eigen_version = "1d454915237a"
-  eigen_sha256 = "7e05dd4b9866ef0aa4498be34752a362596cc5db2f8439cee111e4ea54046b57"
+  eigen_version = "3f0fb403ec4c"
+  eigen_sha256 = "9ff8301c6af2640932c5ded77ecccee5786cec8c31315311220618b312e0472b"
 
   native.new_http_archive(
     name = "eigen_archive",

From fc9bde9c0675116490d204c21f81c764691503f9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 19:40:19 -0700
Subject: [PATCH 47/51] Only register the _Arg and _Retval kernel for POD types
 on sycl

---
 tensorflow/core/kernels/function_ops.cc | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 7d538d2924f..7cb9a3a6573 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -87,8 +87,28 @@ class RetvalOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp);
 REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp);
 
-REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_SYCL), ArgOp);
-REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_SYCL), RetvalOp);
+#if TENSORFLOW_USE_SYCL
+#define REGISTER(type)     \
+  REGISTER_KERNEL_BUILDER( \
+      Name("_Arg").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ArgOp);
+  TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
+  TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg")
+						 .Device(DEVICE_GPU)
+						 .HostMemory("output")
+						 .TypeConstraint<int32>("T"),
+						 ArgOp);
+#undef REGISTER
+#define REGISTER(type)     \
+  REGISTER_KERNEL_BUILDER( \
+      Name("_Retval").Device(DEVICE_SYCL).TypeConstraint<type>("T"), RetvalOp);
+  TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
+  TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval")
+						 .Device(DEVICE_GPU)
+						 .HostMemory("input")
+						 .TypeConstraint<int32>("T"),
+						 RetvalOp);
+#undef REGISTER
+#endif
 
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \

From 8d26ac88ef974ae5e5bd1d64e8da49ff6909dc76 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 20:38:51 -0700
Subject: [PATCH 48/51] Updated the name of the sycl device

---
 tensorflow/core/common_runtime/device_set_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index 2c4aa227462..550fbf568ed 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -68,7 +68,7 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) {
       (std::vector<DeviceType>{DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}),
       types());
 
-  AddDevice("SYCL", "/job:a/replica:0/task:0/sycl:0");
+  AddDevice("SYCL", "/job:a/replica:0/task:0/device:sycl:0");
   EXPECT_EQ(
       (std::vector<DeviceType>{DeviceType(DEVICE_SYCL), DeviceType(DEVICE_GPU),
       DeviceType(DEVICE_CPU)}), types());

From 3e678dc5f8c9d7fd0911705b886ffd792031770e Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 2 Nov 2016 20:53:57 -0700
Subject: [PATCH 49/51] Improved code formatting

---
 tensorflow/core/common_runtime/rendezvous_mgr.cc | 8 ++++----
 tensorflow/core/ops/math_ops.cc                  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index bbdaa6d85b7..285ac7540c8 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -65,10 +65,10 @@ void IntraProcessRendezvous::SameWorkerRecvDone(
     StatusCallback done) {
   // Do a quick copy (sharing the underlying buffer) if both tensors
   // are on host memory.
-  const bool src_host = (send_args.alloc_attrs.on_host() ||
-                         parsed.src.type == "CPU");
-  const bool dst_host = (recv_args.alloc_attrs.on_host() ||
-                         parsed.dst.type == "CPU");
+  const bool src_host =
+      (send_args.alloc_attrs.on_host() || parsed.src.type == "CPU");
+  const bool dst_host =
+      (recv_args.alloc_attrs.on_host() || parsed.dst.type == "CPU");
   if (src_host && dst_host) {
     *out = in;
     done(Status::OK());
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 9e553694b92..ff00214da3c 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -976,7 +976,7 @@ REGISTER_OP("Select")
       c->set_output(0, data);
 
       return Status::OK();
-   })
+    })
     .Doc(R"doc(
 Selects elements from `t` or `e`, depending on `condition`.
 

From 5ea3f1830ce723990ec64c3848d8171bfeaa7341 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 3 Nov 2016 08:22:02 -0700
Subject: [PATCH 50/51] Fixed formatting of the BUILD file

---
 tensorflow/core/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ff3fdb518fd..6c2fec6412a 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1385,7 +1385,7 @@ cc_library(
     ]),
     hdrs = if_not_windows([
         "common_runtime/sycl/sycl_device.h",
-	"common_runtime/sycl/sycl_device_context.h",
+        "common_runtime/sycl/sycl_device_context.h",
     ]),
     copts = tf_copts(),
     linkstatic = 1,

From e2694bd4bc2ae61075da98cad47f688b592d53bc Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 3 Nov 2016 10:31:00 -0700
Subject: [PATCH 51/51] Don't create SYCL kernels unless TENSORFLOW_USE_SYCL is
 set

---
 tensorflow/core/kernels/identity_op.cc  | 20 +++++++++++++++++++-
 tensorflow/core/kernels/no_op.cc        |  3 +++
 tensorflow/core/kernels/sendrecv_ops.cc |  6 ++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index ced8280e3bc..45d27dd19eb 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -24,7 +24,6 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp);
-REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_SYCL), IdentityOp);
 // StopGradient does the same thing as Identity, but has a different
 // gradient registered.
 REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp);
@@ -35,6 +34,24 @@ REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault").Device(DEVICE_CPU),
 
 REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp);
 
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("Identity").Device(DEVICE_SYCL).TypeConstraint<type>("T"),    \
+      IdentityOp);                                                       \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("RefIdentity").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      IdentityOp);                                                       \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("StopGradient").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+      IdentityOp)
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(bfloat16);
+
+#undef REGISTER_SYCL_KERNEL
+#endif
+
 #define REGISTER_GPU_KERNEL(type)                                        \
   REGISTER_KERNEL_BUILDER(                                               \
       Name("Identity").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
@@ -51,6 +68,7 @@ REGISTER_GPU_KERNEL(bfloat16);
 
 #undef REGISTER_GPU_KERNEL
 
+
 #if GOOGLE_CUDA
 // A special GPU kernel for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc
index e4c7f48e773..0993e6e1fcf 100644
--- a/tensorflow/core/kernels/no_op.cc
+++ b/tensorflow/core/kernels/no_op.cc
@@ -19,6 +19,9 @@ namespace tensorflow {
 
 REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp);
 REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp);
+
+#if TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_SYCL), NoOp);
+#endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 38b0bf5756e..9e9cdda3826 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -77,7 +77,10 @@ void SendOp::Compute(OpKernelContext* ctx) {
 
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
+
+#if TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
+#endif
 
 REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
@@ -136,7 +139,10 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
 
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
+
+#if TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
+#endif
 
 REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(