Merge pull request #2 from lukeiwanski/ComputeCpp

ComputeCpp CE compatibility: add an OpenCL/SYCL build path (configure prompts, third_party/sycl build rules and sycl_config.sh, a ComputeCpp crosstool and compiler wrapper) and move mutex member declarations ahead of the GUARDED_BY members that reference them in several headers.
This commit is contained in:
Benoit Steiner 2016-10-14 09:02:24 -07:00 committed by GitHub
commit 5ce656d4ba
29 changed files with 503 additions and 35 deletions

74
configure vendored
View File

@ -126,6 +126,17 @@ GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
chmod a+x ${GEN_GIT_SOURCE}
"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}"
## Set up SYCL-related environment settings
while [ "$TF_NEED_OPENCL" == "" ]; do
read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT
case $INPUT in
[Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;;
[Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
"" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
* ) echo "Invalid selection: " $INPUT;;
esac
done
## Set up Cuda-related environment settings
while [ "$TF_NEED_CUDA" == "" ]; do
@ -139,12 +150,14 @@ while [ "$TF_NEED_CUDA" == "" ]; do
done
export TF_NEED_CUDA
if [ "$TF_NEED_CUDA" == "0" ]; then
export TF_NEED_SYCL
if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
echo "Configuration finished"
bazel_clean_and_fetch
exit
fi
if [ "$TF_NEED_CUDA" == "1" ]; then
# Set up which gcc nvcc should use as the host compiler
while true; do
fromuser=""
@ -346,6 +359,65 @@ EOF
TF_CUDA_COMPUTE_CAPABILITIES=""
done
# end of if "$TF_NEED_CUDA" == "1"
fi
# OpenCL configuration
if [ "$TF_NEED_OPENCL" == "1" ]; then
while true; do
# Configure the OPENCL version to use.
TF_OPENCL_VERSION="1.2"
# Point to ComputeCPP root
if [ -z "$COMPUTECPP_PATH" ]; then
default_computecpp_path=/usr/local/computecpp
read -p "Please specify the location where ComputeCPP $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_path]: " COMPUTECPP_PATH
fromuser="1"
if [ -z "$COMPUTECPP_PATH" ]; then
COMPUTECPP_PATH=$default_computecpp_path
fi
fi
if [ "$OSNAME" == "Linux" ]; then
SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
fi
if [ -e "${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}" ]; then
break
fi
echo "Invalid path to SYCL $TF_OPENCL_VERSION library. ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
if [ -z "$fromuser" ]; then
exit 1
fi
# Retry
TF_OPENCL_VERSION=""
COMPUTECPP_PATH=""
done
cat > third_party/sycl/sycl.config <<EOF
# COMPUTECPP_PATH refers to the ComputeCPP toolkit.
COMPUTECPP_PATH="$COMPUTECPP_PATH"
# The OpenCL version that should be used in this build
TF_OPENCL_VERSION=$TF_OPENCL_VERSION
EOF
export WARNING=$DO_NOT_SUBMIT_WARNING
perl -pi -e "s,#cxx_builtin_include_directory: {COMPUTECPP_INCLUDE},# \$ENV{WARNING}\ncxx_builtin_include_directory: \"${COMPUTECPP_PATH}\",s" third_party/sycl/crosstool/CROSSTOOL
# Configure the platform name.
perl -pi -e "s,PLATFORM = \".*\",PLATFORM = \"$OSNAME\",s" third_party/sycl/platform.bzl
# Invoke sycl_config.sh and set up TensorFlow's canonical view of the SYCL libraries
(cd third_party/sycl; ./sycl_config.sh;) || exit -1
# end of if "$TF_NEED_OPENCL" == "1"
fi
bazel_clean_and_fetch
echo "Configuration finished"

View File

@ -295,6 +295,8 @@ class BFCAllocator : public VisitableAllocator {
private:
std::vector<AllocationRegion> regions_;
};
// Structures mutable after construction
mutable mutex lock_;
// Returns 'bytes' rounded up to the next highest kMinAllocationSize.
size_t RoundedBytes(size_t bytes);
@ -389,9 +391,6 @@ class BFCAllocator : public VisitableAllocator {
std::unique_ptr<SubAllocator> suballocator_;
string name_;
// Structures mutable after construction
mutable mutex lock_;
RegionManager region_manager_ GUARDED_BY(lock_);
std::vector<Chunk> chunks_;

View File

@ -162,6 +162,8 @@ class DirectSession : public Session {
protobuf::RepeatedPtrField<DebugTensorWatch> debug_tensor_watches;
};
mutex graph_def_lock_;
// Initializes the base execution state given the 'graph',
// if not already initialized.
Status MaybeInitializeExecutionState(const GraphDef& graph,
@ -227,7 +229,6 @@ class DirectSession : public Session {
string session_handle_;
bool graph_created_ GUARDED_BY(graph_def_lock_) = false;
mutex graph_def_lock_;
GraphDef graph_def_ GUARDED_BY(graph_def_lock_);
// The thread-pools to use for running ops.

View File

@ -28,6 +28,7 @@ namespace tensorflow {
namespace {
class FakeAllocator {
mutex mu_;
public:
FakeAllocator(size_t cap, int millis_to_wait)
: memory_capacity_(cap), millis_to_wait_(millis_to_wait) {}
@ -57,7 +58,6 @@ class FakeAllocator {
private:
AllocatorRetry retry_;
void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
mutex mu_;
size_t memory_capacity_ GUARDED_BY(mu_);
int millis_to_wait_;
};
@ -72,6 +72,7 @@ class FakeAllocator {
// interesting part of their interaction with the allocator. This
// class is the mechanism that imposes turn taking.
class AlternatingBarrier {
mutex mu_;
public:
explicit AlternatingBarrier(int num_users)
: num_users_(num_users), next_turn_(0), done_(num_users, false) {}
@ -109,7 +110,6 @@ class AlternatingBarrier {
}
}
mutex mu_;
condition_variable cv_;
int num_users_;
int next_turn_ GUARDED_BY(mu_);
@ -118,6 +118,7 @@ class AlternatingBarrier {
class GPUAllocatorRetryTest : public ::testing::Test {
protected:
mutex mu_;
GPUAllocatorRetryTest() {}
void LaunchConsumerThreads(int num_consumers, int cap_needed) {
@ -173,7 +174,6 @@ class GPUAllocatorRetryTest : public ::testing::Test {
std::vector<Thread*> consumers_;
std::vector<int> consumer_count_;
Notification notifier_;
mutex mu_;
bool has_failed_ GUARDED_BY(mu_) = false;
int count_ GUARDED_BY(mu_) = 0;
};

View File

@ -45,6 +45,7 @@ class RoundUpInterface {
// Size-limited pool of memory buffers obtained from a SubAllocator
// instance. Pool eviction policy is LRU.
class PoolAllocator : public VisitableAllocator {
mutex mutex_;
public:
// "pool_size_limit" is the maximum number of returned, re-usable
// memory buffers to keep in the pool. If pool_size_limit == 0, the
@ -136,7 +137,6 @@ class PoolAllocator : public VisitableAllocator {
size_t pool_size_limit_;
std::unique_ptr<SubAllocator> allocator_;
std::unique_ptr<RoundUpInterface> size_rounder_;
mutex mutex_;
std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;

View File

@ -125,6 +125,8 @@ class OpRegistry : public OpRegistryInterface {
void ClearDeferredRegistrations();
private:
mutable mutex mu_;
// Ensures that all the functions in deferred_ get called, their OpDef's
// registered, and returns with deferred_ empty. Returns true the first
// time it is called. Prints a fatal log if any op registration fails.
@ -141,7 +143,6 @@ class OpRegistry : public OpRegistryInterface {
Status RegisterAlreadyLocked(OpRegistrationDataFactory op_data_factory) const
EXCLUSIVE_LOCKS_REQUIRED(mu_);
mutable mutex mu_;
// Functions in deferred_ may only be called with mu_ held.
mutable std::vector<OpRegistrationDataFactory> deferred_ GUARDED_BY(mu_);
// Values are owned.

View File

@ -74,11 +74,11 @@ class TrackingAllocator : public Allocator {
std::pair<size_t, size_t> GetSizesAndUnRef();
private:
mutex mu_;
~TrackingAllocator() override {}
bool UnRef() EXCLUSIVE_LOCKS_REQUIRED(mu_);
Allocator* allocator_; // not owned.
mutex mu_;
// the number of calls to AllocateRaw that have not yet been matched
// by a corresponding call to DeAllocateRaw, plus 1 if the Executor
// has not yet read out the high watermark.

View File

@ -40,6 +40,7 @@ namespace tensorflow {
namespace barrier {
class Barrier : public ResourceBase {
mutex mu_;
public:
typedef std::vector<Tensor> Tuple;
typedef std::function<void()> DoneCallback;
@ -417,7 +418,6 @@ class Barrier : public ResourceBase {
private:
typedef std::vector<PersistentTensor> PersistentTuple;
mutex mu_;
bool closed_ GUARDED_BY(mu_);
bool queue_closed_ GUARDED_BY(mu_);
bool queue_cancelled_ GUARDED_BY(mu_);
@ -433,6 +433,7 @@ class Barrier : public ResourceBase {
};
class BarrierOp : public OpKernel {
mutex mu_;
public:
explicit BarrierOp(OpKernelConstruction* context)
: OpKernel(context), barrier_handle_set_(false) {
@ -511,7 +512,6 @@ class BarrierOp : public OpKernel {
std::vector<TensorShape> value_component_shapes_;
ContainerInfo cinfo_;
mutex mu_;
PersistentTensor barrier_handle_ GUARDED_BY(mu_);
bool barrier_handle_set_ GUARDED_BY(mu_);
@ -611,7 +611,9 @@ class TakeManyOp : public BarrierOpKernel {
DataTypeVector expected_inputs = {DT_STRING_REF, DT_INT32};
// The first output is the insertion index, the second output is the key.
DataTypeVector expected_outputs = {DT_INT64, DT_STRING};
for (DataType dt : barrier->component_types()) {
for (auto it = barrier->component_types().begin(),
end = barrier->component_types().end(); it != end; ++it) {
const DataType dt = *it;
expected_outputs.push_back(dt);
}
OP_REQUIRES_OK_ASYNC(

View File

@ -65,7 +65,7 @@ class ConditionalAccumulator
functor::SetZeroFunctor<Device, T> set_zero_functor_;
Status ValidateShape(const Tensor* tensor)
EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
// Must be compatible with accumulated gradient if available
if (counter_ > 0) {
if (!accum_grad_->shape().IsSameSize(tensor->shape())) {
@ -98,7 +98,7 @@ class ConditionalAccumulator
}
void DivideAccumGradByCounter(OpKernelContext* ctx) override
EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
Tensor c(DataTypeToEnum<T>::value, {});
c.scalar<T>()() = TypeConverter<T, int>::ConvertUToT(this->counter_);
this->accum_grad_->template flat<T>().device(
@ -113,7 +113,7 @@ class ConditionalAccumulator
bool GetAndValidateTensorInputForApplyGrad(OpKernelContext* ctx,
const Tensor** tensor) override
EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
// Get input gradient tensor
const Tensor* grad_tensor;
OP_REQUIRES_OK_BOOLEAN(ctx, ctx->input("gradient", &grad_tensor));

View File

@ -45,6 +45,8 @@ namespace tensorflow {
* (3) the internal global_step value (current_global_step_) is incremented by 1
*/
class ConditionalAccumulatorBase : public ResourceBase {
protected:
mutex mu_;
public:
// Args:
// dtype: The datatype of the gradients to be accumulated.
@ -125,7 +127,6 @@ class ConditionalAccumulatorBase : public ResourceBase {
const DataType dtype_;
const PartialTensorShape shape_;
const string name_;
mutex mu_;
int counter_ GUARDED_BY(mu_);
int64 current_global_step_ GUARDED_BY(mu_);

View File

@ -43,6 +43,7 @@ namespace tensorflow {
* ConditionalAccumulatorBase (via sub-class's Creator) and returns its handle.
*/
class ConditionalAccumulatorBaseOp : public OpKernel {
mutex mu_;
public:
explicit ConditionalAccumulatorBaseOp(OpKernelConstruction* context)
: OpKernel(context), accumulator_handle_set_(false) {
@ -109,7 +110,6 @@ class ConditionalAccumulatorBaseOp : public OpKernel {
return Status::OK();
}
mutex mu_;
PersistentTensor accumulator_handle_ GUARDED_BY(mu_);
bool accumulator_handle_set_ GUARDED_BY(mu_);
};

View File

@ -82,8 +82,13 @@ class BinaryOp : public BinaryOpShared {
if (!ctx->status().ok()) return;
Tensor* out = state.out;
BCast* bcast = &state.bcast;
#if TENSORFLOW_USE_SYCL
decltype(state.in0) in0 = state.in0;
decltype(state.in1) in1 = state.in1;
#else
auto& in0 = state.in0;
auto& in1 = state.in1;
#endif
if (state.out_num_elements == 0) {
return;
}

View File

@ -83,6 +83,7 @@ class QueueBase : public QueueInterface {
int64 index);
protected:
mutex mu_;
enum Action { kEnqueue, kDequeue };
enum RunResult { kNoProgress, kProgress, kComplete };
@ -143,7 +144,6 @@ class QueueBase : public QueueInterface {
const DataTypeVector component_dtypes_;
const std::vector<TensorShape> component_shapes_;
const string name_;
mutex mu_;
bool closed_ GUARDED_BY(mu_);
struct Attempt;

View File

@ -34,6 +34,7 @@ namespace tensorflow {
// Defines a QueueOp, an abstract class for Queue construction ops.
class QueueOp : public OpKernel {
mutex mu_;
public:
QueueOp(OpKernelConstruction* context)
: OpKernel(context), queue_handle_set_(false) {
@ -94,7 +95,6 @@ class QueueOp : public OpKernel {
return Status::OK();
}
mutex mu_;
PersistentTensor queue_handle_ GUARDED_BY(mu_);
bool queue_handle_set_ GUARDED_BY(mu_);
};

View File

@ -83,7 +83,7 @@ class SparseConditionalAccumulator
Status ValidateShape(
std::tuple<const Tensor*, const Tensor*, const Tensor*>* tensor,
bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {
bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
const Tensor* tensor_idx = std::get<0>(*tensor);
const Tensor* tensor_val = std::get<1>(*tensor);
const Tensor* tensor_shape = std::get<2>(*tensor);

View File

@ -123,6 +123,7 @@ TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU);
// multiple reads of that index in the forward phase.
//
class TensorArray : public ResourceBase {
mutex mu_;
public:
static std::atomic<int64> tensor_array_counter;
@ -338,8 +339,6 @@ class TensorArray : public ResourceBase {
const DataType dtype_;
Tensor handle_;
mutex mu_;
// Marks that the tensor_array_ has been cleared.
bool closed_ GUARDED_BY(mu_);

View File

@ -45,7 +45,9 @@ void Collector::CollectMetricDescriptor(
metric_descriptor->name = metric_def->name().ToString();
metric_descriptor->description = metric_def->description().ToString();
for (const StringPiece label_name : metric_def->label_descriptions()) {
for (auto it = metric_def->label_descriptions().begin(),
end = metric_def->label_descriptions().end(); it != end; ++it) {
const StringPiece label_name = *it;
metric_descriptor->label_names.push_back(label_name.ToString());
}

View File

@ -121,6 +121,7 @@ class MetricCollectorGetter {
//
// This class is thread-safe.
class CollectionRegistry {
mutable mutex mu_;
public:
~CollectionRegistry() = default;
@ -176,8 +177,6 @@ class CollectionRegistry {
// TF environment, mainly used for timestamping.
Env* const env_;
mutable mutex mu_;
// Information required for collection.
struct CollectionInfo {
const AbstractMetricDef* const metric_def;
@ -227,6 +226,7 @@ inline void CollectValue(const int64& value, Point* const point) {
//
// This class is thread-safe.
class Collector {
mutable mutex mu_;
public:
Collector(const uint64 collection_time_millis)
: collected_metrics_(new CollectedMetrics()),
@ -260,7 +260,6 @@ class Collector {
LOCKS_EXCLUDED(mu_);
private:
mutable mutex mu_;
std::unique_ptr<CollectedMetrics> collected_metrics_ GUARDED_BY(mu_);
const uint64 collection_time_millis_;

View File

@ -78,6 +78,7 @@ class CounterCell {
// This class is thread-safe.
template <int NumLabels>
class Counter {
mutable mutex mu_;
public:
~Counter() {
// Deleted here, before the metric_def is destroyed.
@ -111,8 +112,6 @@ class Counter {
}
})) {}
mutable mutex mu_;
// The metric definition. This will be used to identify the metric when we
// register it for collection.
const MetricDef<MetricKind::kCumulative, int64, NumLabels> metric_def_;

View File

@ -60,6 +60,9 @@ namespace gputools {
//
// Thread-safe.
class MachineManager {
// Mutex that guards the initialization of the machine manager static
// variable.
static mutex mu_;
public:
// Inspects the host to determine the preferred GPU execution platform.
// To force OpenCL from a build target on a machine that has both OpenCL and
@ -171,10 +174,6 @@ class MachineManager {
// Returns the NUMA node association for the StreamExecutor.
int ExecutorToNumaNode(const StreamExecutor *stream_exec) const;
// Mutex that guards the initialization of the machine manager static
// variable.
static mutex mu_;
// Singleton MachineManager value -- assignment to this is protected by a
// static singleton guard clause.
static MachineManager *singleton_ GUARDED_BY(mu_);

View File

@ -14,7 +14,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
# These lines need to be changed when updating Eigen. They are parsed from
# this file by the cmake and make builds to determine the eigen version and
# hash.
eigen_version = "aad63574941c"
eigen_version = "ab6d16a84626"
eigen_sha256 = ""
native.new_http_archive(

44
third_party/sycl/BUILD vendored Executable file
View File

@ -0,0 +1,44 @@
licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like
load("//third_party/sycl:build_defs.bzl", "if_sycl")
load("platform", "sycl_library_path")
load("platform", "sycl_static_library_path")
load("platform", "readlink_command")
package(default_visibility = ["//visibility:public"])
config_setting(
name = "using_sycl",
values = {
"define": "using_sycl=true",
},
)
cc_library(
name = "sycl_headers",
hdrs = glob([
"**/*.h",
]),
includes = [".", "include"],
)
cc_library(
name = "syclrt",
srcs = [
sycl_library_path("ComputeCpp")
],
data = [
sycl_library_path("ComputeCpp")
],
includes = ["include/"],
linkstatic = 1,
)
cc_library(
name = "sycl",
deps = if_sycl([
":sycl_headers",
":syclrt",
]),
)

10
third_party/sycl/build_defs.bzl vendored Executable file
View File

@ -0,0 +1,10 @@
# Macros for building SYCL code.
def if_sycl(if_true, if_false = []):
"""Shorthand for select()'ing on whether we're building with SYCL.
Returns a select statement which evaluates to if_true if we're building
with SYCL enabled. Otherwise, the select statement evaluates to if_false.
"""
return select({
"//third_party/sycl:using_sycl": if_true,
"//conditions:default": if_false
})
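As a hedged illustration, the if_true branch is chosen whenever the using_sycl define from the config_setting in third_party/sycl/BUILD above is set; the target name below is only an example.

# Example only: either invocation selects the if_true branch of if_sycl().
bazel build --define=using_sycl=true //tensorflow/core:framework
bazel build --config=sycl //tensorflow/core:framework   # shortcut defined in tools/bazel.rc in this change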

29
third_party/sycl/crosstool/BUILD vendored Executable file
View File

@ -0,0 +1,29 @@
licenses(["restricted"])
package(default_visibility = ["//visibility:public"])
cc_toolchain_suite(
name = "toolchain",
toolchains = {
"local|compiler": ":cc-compiler-local",
},
)
cc_toolchain(
name = "cc-compiler-local",
all_files = ":empty",
compiler_files = ":empty",
cpu = "local",
dwp_files = ":empty",
dynamic_runtime_libs = [":empty"],
linker_files = ":empty",
objcopy_files = ":empty",
static_runtime_libs = [":empty"],
strip_files = ":empty",
supports_param_files = 0,
)
filegroup(
name = "empty",
srcs = [],
)

82
third_party/sycl/crosstool/CROSSTOOL vendored Executable file
View File

@ -0,0 +1,82 @@
major_version: "local"
minor_version: ""
default_target_cpu: "same_as_host"
default_toolchain {
cpu: "k8"
toolchain_identifier: "local_linux"
}
toolchain {
abi_version: "local"
abi_libc_version: "local"
builtin_sysroot: ""
compiler: "compiler"
host_system_name: "local"
needsPic: true
supports_gold_linker: false
supports_incremental_linker: false
supports_fission: false
supports_interface_shared_objects: false
supports_normalizing_ar: false
supports_start_end_lib: false
supports_thin_archives: false
target_libc: "local"
target_cpu: "local"
target_system_name: "local"
toolchain_identifier: "local_linux"
tool_path { name: "ar" path: "/usr/bin/ar" }
tool_path { name: "compat-ld" path: "/usr/bin/ld" }
tool_path { name: "cpp" path: "/usr/bin/cpp" }
tool_path { name: "dwp" path: "/usr/bin/dwp" }
tool_path { name: "gcc" path: "computecpp" }
# Use "-std=c++11" for nvcc. For consistency, force both the host compiler
# and the device compiler to use "-std=c++11".
cxx_flag: "-std=c++11"
linker_flag: "-lstdc++"
linker_flag: "-B/usr/bin/"
# TODO(bazel-team): In theory, the path here ought to exactly match the path
# used by gcc. That works because bazel currently doesn't track files at
# absolute locations and has no remote execution, yet. However, this will need
# to be fixed, maybe with auto-detection?
cxx_builtin_include_directory: "/usr/lib/gcc/"
cxx_builtin_include_directory: "/usr/lib"
cxx_builtin_include_directory: "/usr/lib64"
cxx_builtin_include_directory: "/usr/local/include"
cxx_builtin_include_directory: "/usr/include"
#cxx_builtin_include_directory: {COMPUTECPP_INCLUDE}
tool_path { name: "gcov" path: "/usr/bin/gcov" }
# C(++) compiles invoke the compiler (as that is the one knowing where
# to find libraries), but we provide LD so other rules can invoke the linker.
tool_path { name: "ld" path: "/usr/bin/ld" }
tool_path { name: "nm" path: "/usr/bin/nm" }
tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
objcopy_embed_flag: "-I"
objcopy_embed_flag: "binary"
tool_path { name: "objdump" path: "/usr/bin/objdump" }
tool_path { name: "strip" path: "/usr/bin/strip" }
# Make C++ compilation deterministic. Use linkstamping instead of these
# compiler symbols.
unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
# All warnings are enabled. Maybe enable -Werror as well?
compiler_flag: "-Wall"
# Anticipated future default.
linker_flag: "-Wl,-no-as-needed"
# Stamp the binary with a unique identifier.
linker_flag: "-Wl,--build-id=md5"
linker_flag: "-Wl,--hash-style=gnu"
linking_mode_flags { mode: DYNAMIC }
}

61
third_party/sycl/crosstool/computecpp vendored Executable file
View File

@ -0,0 +1,61 @@
#!/usr/bin/env python2.7
from argparse import ArgumentParser
import os
import subprocess
import re
import sys
import pipes
CPU_CXX_COMPILER = ('/usr/bin/clang++-3.6')
CPU_C_COMPILER = ('/usr/bin/clang-3.6')
CURRENT_DIR = os.path.dirname(sys.argv[0])
COMPUTECPP_ROOT = CURRENT_DIR +"/../"
COMPUTECPP_DRIVER= COMPUTECPP_ROOT+"bin/compute++"
COMPUTECPP_INCLUDE = COMPUTECPP_ROOT+"include"
def main():
  computecpp_compiler_flags = [""]
  computecpp_compiler_flags = [flag for flag in sys.argv[1:]]
  computecpp_compiler_flags = computecpp_compiler_flags + ["-D_GLIBCXX_USE_CXX11_ABI=0"]

  output_file_index = computecpp_compiler_flags.index("-o") + 1
  output_file_name = computecpp_compiler_flags[output_file_index]

  if(output_file_index == 1):
    # we are linking
    return subprocess.call([CPU_CXX_COMPILER] + computecpp_compiler_flags)

  # find what we compile
  compiling_cpp = 0
  if("-c" in computecpp_compiler_flags):
    compiled_file_index = computecpp_compiler_flags.index("-c") + 1
    compiled_file_name = computecpp_compiler_flags[compiled_file_index]
    if(compiled_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))):
      compiling_cpp = 1

  if(compiling_cpp == 1):
    filename, file_extension = os.path.splitext(output_file_name)
    bc_out = filename + ".sycl"

    computecpp_compiler_flags = ['-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem',
                                 COMPUTECPP_INCLUDE, "-std=c++11", "-sycl", "-emit-llvm", "-no-serial-memop"] + computecpp_compiler_flags

    # Strip dependency-file flags (-MF/-MD, *.d) from the host pass; the
    # computecpp invocation below already receives them.
    host_compiler_flags = [""]
    host_compiler_flags = [flag for flag in sys.argv[1:]
                           if not flag.startswith(('-MF', '-MD',))
                           if not ".d" in flag]

    x = subprocess.call([COMPUTECPP_DRIVER] + computecpp_compiler_flags)
    if(x == 0):
      host_compiler_flags = ['-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, "--include", bc_out] + host_compiler_flags
      return subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
    return x
  else:
    # compile for C
    return subprocess.call([CPU_C_COMPILER] + computecpp_compiler_flags)

if __name__ == '__main__':
  sys.exit(main())
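Roughly, for a C++ compile the wrapper above turns one Bazel compiler invocation into two passes: a ComputeCpp device pass followed by a host pass that force-includes the generated integration header. A sketch of the equivalent commands, with flags abbreviated and foo.cc/foo.o as hypothetical names (the full flag lists are in the script above):

# Device pass (the wrapper assumes this leaves foo.sycl next to foo.o):
<COMPUTECPP_ROOT>/bin/compute++ -DTENSORFLOW_USE_SYCL -std=c++11 -sycl -emit-llvm -no-serial-memop -c foo.cc -o foo.o
# Host pass: the CPU compiler rebuilds foo.o, pulling in the integration header:
/usr/bin/clang++-3.6 -DTENSORFLOW_USE_SYCL --include foo.sycl -c foo.cc -o foo.o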

17
third_party/sycl/platform.bzl vendored Executable file
View File

@ -0,0 +1,17 @@
SYCL_VERSION = ""
PLATFORM = ""
def sycl_sdk_version():
return SYCL_VERSION
def sycl_library_path(name, version = sycl_sdk_version()):
if not version:
return "lib/lib{}.so".format(name)
else:
return "lib/lib{}.so.{}".format(name, version)
def sycl_static_library_path(name):
return "lib/lib{}_static.a".format(name)
def readlink_command():
return "readlink"

143
third_party/sycl/sycl_config.sh vendored Executable file
View File

@ -0,0 +1,143 @@
#!/usr/bin/env bash
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# A simple script to configure the SYCL tree needed for the TensorFlow OpenCL
# build. We need the ComputeCpp toolkit $TF_OPENCL_VERSION.
# Usage:
# * User edits sycl.config to point the ComputeCpp toolkit to its local path
# * run sycl_config.sh to generate symbolic links in the source tree to reflect
# * the file organization needed by TensorFlow.
print_usage() {
cat << EOF
Usage: $0 [--check]
Configure TensorFlow's canonical view of SYCL libraries using sycl.config.
Arguments:
--check: Only check that the proper SYCL dependencies have already been
properly configured in the source tree. It also creates symbolic links to
the files in the gen-tree to make bazel happy.
EOF
}
CHECK_ONLY=0
# Parse the arguments. Add more arguments as the "case" line when needed.
while [[ $# -gt 0 ]]; do
argument="$1"
shift
case $argument in
--check)
CHECK_ONLY=1
;;
*)
echo "Error: unknown arguments"
print_usage
exit -1
;;
esac
done
source sycl.config || exit -1
OUTPUTDIR=${OUTPUTDIR:-../..}
COMPUTECPP_PATH=${COMPUTECPP_PATH:-/usr/local/computecpp}
# An error message when the SYCL toolkit is not found
function SYCLError {
echo ERROR: $1
cat << EOF
##############################################################################
##############################################################################
SYCL $TF_OPENCL_VERSION toolkit is missing.
1. Download and install the ComputeCPP $TF_OPENCL_VERSION toolkit;
2. Run configure from the root of the source tree, before rerunning bazel;
Please refer to README.md for more details.
##############################################################################
##############################################################################
EOF
exit -1
}
# Check that the SYCL libraries have already been properly configured in the source tree.
# We still need to create links to the gen-tree to make bazel happy.
function CheckAndLinkToSrcTree {
ERROR_FUNC=$1
FILE=$2
if test ! -e $FILE; then
$ERROR_FUNC "$PWD/$FILE cannot be found"
fi
# Link the output file to the source tree, avoiding self links if they are
# the same. This could happen if invoked from the source tree by accident.
if [ ! $($READLINK_CMD -f $PWD) == $($READLINK_CMD -f $OUTPUTDIR/third_party/sycl) ]; then
mkdir -p $(dirname $OUTPUTDIR/third_party/sycl/$FILE)
ln -sf $PWD/$FILE $OUTPUTDIR/third_party/sycl/$FILE
fi
}
OSNAME=`uname -s`
if [ "$OSNAME" == "Linux" ]; then
SYCL_LIB_PATH="lib"
SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
SYCL_RT_LIB_STATIC_PATH="lib/libComputeCpp.a"
READLINK_CMD="readlink"
fi
if [ "$CHECK_ONLY" == "1" ]; then
CheckAndLinkToSrcTree SYCLError include/SYCL/sycl.h
CheckAndLinkToSrcTree SYCLError $SYCL_RT_LIB_STATIC_PATH
CheckAndLinkToSrcTree SYCLError $SYCL_RT_LIB_PATH
exit 0
fi
# Actually configure the source tree for TensorFlow's canonical view of SYCL
# libraries.
if test ! -e ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}; then
SYCLError "cannot find ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}"
fi
# Helper function to build symbolic links for all files in a directory.
function LinkOneDir {
SRC_PREFIX=$1
DST_PREFIX=$2
SRC_DIR=$3
DST_DIR=$(echo $SRC_DIR | sed "s,^$SRC_PREFIX,$DST_PREFIX,")
mkdir -p $DST_DIR
FILE_LIST=$(find -L $SRC_DIR -maxdepth 1 -type f)
if test "$FILE_LIST" != ""; then
ln -sf $FILE_LIST $DST_DIR/ || exit -1
fi
}
export -f LinkOneDir
# Build links for all files in the directory, including subdirectories.
function LinkAllFiles {
SRC_DIR=$1
DST_DIR=$2
find -L $SRC_DIR -type d | xargs -I {} bash -c "LinkOneDir $SRC_DIR $DST_DIR {}" || exit -1
}
# Set up the symbolic links for SYCL toolkit. We link at individual file level,
# not at the directory level.
# This is because the external library may have a different file layout from our desired structure.
mkdir -p $OUTPUTDIR/third_party/sycl
echo "Setting up SYCL include"
LinkAllFiles ${COMPUTECPP_PATH}/include $OUTPUTDIR/third_party/sycl/include || exit -1
echo "Setting up SYCL ${SYCL_LIB_PATH}"
LinkAllFiles ${COMPUTECPP_PATH}/${SYCL_LIB_PATH} $OUTPUTDIR/third_party/sycl/${SYCL_LIB_PATH} || exit -1
echo "Setting up SYCL bin"
LinkAllFiles ${COMPUTECPP_PATH}/bin $OUTPUTDIR/third_party/sycl/bin || exit -1
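A short usage sketch, assuming the default install path that configure writes into sycl.config:

# Run from third_party/sycl once configure has written sycl.config.
cd third_party/sycl
cat sycl.config
#   COMPUTECPP_PATH="/usr/local/computecpp"
#   TF_OPENCL_VERSION=1.2
./sycl_config.sh           # link ComputeCpp's include/, lib/ and bin/ into the source tree
./sycl_config.sh --check   # only verify the files and link them into the gen-tree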

View File

@ -1,6 +1,9 @@
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
build:sycl --crosstool_top=//third_party/sycl/crosstool:toolchain
build:sycl --define=using_sycl=true
build --force_python=py$PYTHON_MAJOR_VERSION
build --host_force_python=py$PYTHON_MAJOR_VERSION
build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY