From ace7c0e11cdc33c65fe5f6f650382d90fb30a901 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Jun 2016 10:24:45 -0700 Subject: [PATCH 01/51] Pull the latest Eigen version that supports OpenCL --- eigen.BUILD | 2 +- tensorflow/workspace.bzl | 4 ++-- third_party/eigen3/Eigen/Cholesky | 2 +- third_party/eigen3/Eigen/Core | 2 +- third_party/eigen3/Eigen/Eigenvalues | 2 +- third_party/eigen3/Eigen/LU | 2 +- third_party/eigen3/Eigen/QR | 2 +- third_party/eigen3/unsupported/Eigen/CXX11/Tensor | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/eigen.BUILD b/eigen.BUILD index 79bafe65b62..e834e495151 100644 --- a/eigen.BUILD +++ b/eigen.BUILD @@ -1,6 +1,6 @@ package(default_visibility = ["//visibility:public"]) -archive_dir = "eigen-eigen-d02e6a705c30" +archive_dir = "benoitsteiner-opencl-9d4a08d57d0d" cc_library( name = "eigen", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 07f83651e02..e263c3658b4 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -6,8 +6,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", - url = "https://bitbucket.org/eigen/eigen/get/d02e6a705c30.tar.gz", - sha256 = "532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf", + url = "https://bitbucket.org/benoitsteiner/opencl/get/9d4a08d57d0d.tar.gz", + sha256 = "2b736059052affcfa1f9a645c5e3a655ea4ada6dc0ba2d97a9b61902156bbafc", build_file = path_prefix + "eigen.BUILD", ) diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky index 56059bcc61c..bb3b599c957 100644 --- a/third_party/eigen3/Eigen/Cholesky +++ b/third_party/eigen3/Eigen/Cholesky @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/Cholesky" +#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Cholesky" diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core index c1d4a2e0f8c..95a48a30d84 100644 --- a/third_party/eigen3/Eigen/Core +++ b/third_party/eigen3/Eigen/Core @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/Core" +#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Core" diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues index 0a0731ba19b..21ffbb5202b 100644 --- a/third_party/eigen3/Eigen/Eigenvalues +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/Eigenvalues" +#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Eigenvalues" diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU index d6b39b8d235..3263cf2b828 100644 --- a/third_party/eigen3/Eigen/LU +++ b/third_party/eigen3/Eigen/LU @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/LU" +#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/LU" diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR index a5406e93bc6..cf138a470f1 100644 --- a/third_party/eigen3/Eigen/QR +++ b/third_party/eigen3/Eigen/QR @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/QR" +#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/QR" diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor index 4f730236b78..33742177dd8 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/unsupported/Eigen/CXX11/Tensor" +#include "benoitsteiner-opencl-9d4a08d57d0d/unsupported/Eigen/CXX11/Tensor" From e3f884b496b8548d9f5d46c2f1c487a957d3f341 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 12 Oct 2016 22:52:59 -0700 Subject: [PATCH 02/51] Switch to the latest version of Eigen that supports OpenCL --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 2f9b648f1a4..ac2a22ee548 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -14,14 +14,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): # These lines need to be changed when updating Eigen. They are parsed from # this file by the cmake and make builds to determine the eigen version and # hash. - eigen_version = "97c1ebe6ccc2" - eigen_sha256 = "58ab9fa44391c850d783fe0867f42a00b5300293b7d73bbbbc8756c2e649fea2" + eigen_version = "aad63574941c" + eigen_sha256 = "" native.new_http_archive( name = "eigen_archive", - url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz", + url = "http://bitbucket.org/benoitsteiner/opencl/get/" + eigen_version + ".tar.gz", sha256 = eigen_sha256, - strip_prefix = "eigen-eigen-" + eigen_version, + strip_prefix = "benoitsteiner-opencl-" + eigen_version, build_file = str(Label("//:eigen.BUILD")), ) From 21f822cb5eb117ef2bd42fab060175ad7b98e505 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Fri, 7 Oct 2016 15:13:17 +0100 Subject: [PATCH 03/51] Build system that works with ComputeCpp CE. --- configure | 74 ++++++++++++- third_party/sycl/BUILD | 44 ++++++++ third_party/sycl/build_defs.bzl | 10 ++ third_party/sycl/crosstool/BUILD | 28 +++++ third_party/sycl/crosstool/CROSSTOOL | 82 +++++++++++++++ third_party/sycl/crosstool/computecpp | 61 +++++++++++ third_party/sycl/platform.bzl | 17 +++ third_party/sycl/sycl_config.sh | 143 ++++++++++++++++++++++++++ tools/bazel.rc.template | 3 + 9 files changed, 461 insertions(+), 1 deletion(-) create mode 100755 third_party/sycl/BUILD create mode 100755 third_party/sycl/build_defs.bzl create mode 100755 third_party/sycl/crosstool/BUILD create mode 100755 third_party/sycl/crosstool/CROSSTOOL create mode 100755 third_party/sycl/crosstool/computecpp create mode 100755 third_party/sycl/platform.bzl create mode 100755 third_party/sycl/sycl_config.sh diff --git a/configure b/configure index 933bd573578..426071e48d0 100755 --- a/configure +++ b/configure @@ -126,6 +126,17 @@ GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py chmod a+x ${GEN_GIT_SOURCE} "${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}" +## Set up SYCL-related environment settings +while [ "$TF_NEED_OPENCL" == "" ]; do + read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT + case $INPUT in + [Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;; + [Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;; + "" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + ## Set up Cuda-related environment settings while [ "$TF_NEED_CUDA" == "" ]; do @@ -139,12 +150,14 @@ while [ "$TF_NEED_CUDA" == "" ]; do done export TF_NEED_CUDA -if [ "$TF_NEED_CUDA" == "0" ]; then +export TF_NEED_SYCL +if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then echo "Configuration finished" bazel_clean_and_fetch exit fi +if [ "$TF_NEED_CUDA" == "1" ]; then # Set up which gcc nvcc should use as the host compiler while true; do fromuser="" @@ -346,6 +359,65 @@ EOF TF_CUDA_COMPUTE_CAPABILITIES="" done +# end of if "$TF_NEED_CUDA" == "1" +fi + +# OpenCL configuration + +if [ "$TF_NEED_OPENCL" == "1" ]; then +while true; do + # Configure the OPENCL version to use. + TF_OPENCL_VERSION="1.2" + + # Point to ComputeCPP root + if [ -z "$COMPUTECPP_PATH" ]; then + default_computecpp_path=/usr/local/computecpp + read -p "Please specify the location where ComputeCPP $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_path]: " COMPUTECPP_PATH + fromuser="1" + if [ -z "$COMPUTECPP_PATH" ]; then + COMPUTECPP_PATH=$default_computecpp_path + fi + fi + + if [ "$OSNAME" == "Linux" ]; then + SYCL_RT_LIB_PATH="lib/libComputeCpp.so" + fi + + if [ -e "${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}" ]; then + break + fi + echo "Invalid path to SYCL $TF_OPENCL_VERSION library. ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH} cannot be found" + + if [ -z "$fromuser" ]; then + exit 1 + fi + # Retry + TF_OPENCL_VERSION="" + COMPUTECPP_PATH="" +done + +cat > third_party/sycl/sycl.config < Date: Wed, 12 Oct 2016 14:41:09 +0100 Subject: [PATCH 04/51] Applied workaround for the ComputeCpp CE. --- tensorflow/core/common_runtime/bfc_allocator.h | 5 ++--- tensorflow/core/common_runtime/direct_session.h | 3 ++- .../core/common_runtime/gpu/gpu_allocator_retry_test.cc | 6 +++--- tensorflow/core/common_runtime/gpu/pool_allocator.h | 2 +- tensorflow/core/framework/op.h | 3 ++- tensorflow/core/framework/tracking_allocator.h | 2 +- tensorflow/core/kernels/barrier_ops.cc | 8 +++++--- tensorflow/core/kernels/conditional_accumulator.h | 6 +++--- tensorflow/core/kernels/conditional_accumulator_base.h | 3 ++- tensorflow/core/kernels/conditional_accumulator_base_op.h | 2 +- tensorflow/core/kernels/cwise_ops_common.h | 5 +++++ tensorflow/core/kernels/queue_base.h | 2 +- tensorflow/core/kernels/queue_op.h | 2 +- tensorflow/core/kernels/sparse_conditional_accumulator.h | 2 +- tensorflow/core/kernels/tensor_array.h | 3 +-- tensorflow/core/lib/monitoring/collection_registry.cc | 4 +++- tensorflow/core/lib/monitoring/collection_registry.h | 5 ++--- tensorflow/core/lib/monitoring/counter.h | 3 +-- tensorflow/stream_executor/machine_manager.h | 7 +++---- 19 files changed, 40 insertions(+), 33 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index c13f67ffcc7..8fd6597cb88 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -295,6 +295,8 @@ class BFCAllocator : public VisitableAllocator { private: std::vector regions_; }; + // Structures mutable after construction + mutable mutex lock_; // Returns 'bytes' rounded up to the next highest kMinAllocationSize. size_t RoundedBytes(size_t bytes); @@ -389,9 +391,6 @@ class BFCAllocator : public VisitableAllocator { std::unique_ptr suballocator_; string name_; - - // Structures mutable after construction - mutable mutex lock_; RegionManager region_manager_ GUARDED_BY(lock_); std::vector chunks_; diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h index a4289112534..8fe4825aa6d 100644 --- a/tensorflow/core/common_runtime/direct_session.h +++ b/tensorflow/core/common_runtime/direct_session.h @@ -162,6 +162,8 @@ class DirectSession : public Session { protobuf::RepeatedPtrField debug_tensor_watches; }; + mutex graph_def_lock_; + // Initializes the base execution state given the 'graph', // if not already initialized. Status MaybeInitializeExecutionState(const GraphDef& graph, @@ -227,7 +229,6 @@ class DirectSession : public Session { string session_handle_; bool graph_created_ GUARDED_BY(graph_def_lock_) = false; - mutex graph_def_lock_; GraphDef graph_def_ GUARDED_BY(graph_def_lock_); // The thread-pools to use for running ops. diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc index 2148f83fe57..423448773ae 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc @@ -28,6 +28,7 @@ namespace tensorflow { namespace { class FakeAllocator { + mutex mu_; public: FakeAllocator(size_t cap, int millis_to_wait) : memory_capacity_(cap), millis_to_wait_(millis_to_wait) {} @@ -57,7 +58,6 @@ class FakeAllocator { private: AllocatorRetry retry_; void* good_ptr_ = reinterpret_cast(0xdeadbeef); - mutex mu_; size_t memory_capacity_ GUARDED_BY(mu_); int millis_to_wait_; }; @@ -72,6 +72,7 @@ class FakeAllocator { // interesting part of their interaction with the allocator. This // class is the mechanism that imposes turn taking. class AlternatingBarrier { + mutex mu_; public: explicit AlternatingBarrier(int num_users) : num_users_(num_users), next_turn_(0), done_(num_users, false) {} @@ -109,7 +110,6 @@ class AlternatingBarrier { } } - mutex mu_; condition_variable cv_; int num_users_; int next_turn_ GUARDED_BY(mu_); @@ -118,6 +118,7 @@ class AlternatingBarrier { class GPUAllocatorRetryTest : public ::testing::Test { protected: + mutex mu_; GPUAllocatorRetryTest() {} void LaunchConsumerThreads(int num_consumers, int cap_needed) { @@ -173,7 +174,6 @@ class GPUAllocatorRetryTest : public ::testing::Test { std::vector consumers_; std::vector consumer_count_; Notification notifier_; - mutex mu_; bool has_failed_ GUARDED_BY(mu_) = false; int count_ GUARDED_BY(mu_) = 0; }; diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h index b2f0265145f..437fea91155 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator.h +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -45,6 +45,7 @@ class RoundUpInterface { // Size-limited pool of memory buffers obtained from a SubAllocator // instance. Pool eviction policy is LRU. class PoolAllocator : public VisitableAllocator { + mutex mutex_; public: // "pool_size_limit" is the maximum number of returned, re-usable // memory buffers to keep in the pool. If pool_size_limit == 0, the @@ -136,7 +137,6 @@ class PoolAllocator : public VisitableAllocator { size_t pool_size_limit_; std::unique_ptr allocator_; std::unique_ptr size_rounder_; - mutex mutex_; std::multimap pool_ GUARDED_BY(mutex_); PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr; PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr; diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h index f047ddb12a1..321ace9f465 100644 --- a/tensorflow/core/framework/op.h +++ b/tensorflow/core/framework/op.h @@ -125,6 +125,8 @@ class OpRegistry : public OpRegistryInterface { void ClearDeferredRegistrations(); private: + mutable mutex mu_; + // Ensures that all the functions in deferred_ get called, their OpDef's // registered, and returns with deferred_ empty. Returns true the first // time it is called. Prints a fatal log if any op registration fails. @@ -141,7 +143,6 @@ class OpRegistry : public OpRegistryInterface { Status RegisterAlreadyLocked(OpRegistrationDataFactory op_data_factory) const EXCLUSIVE_LOCKS_REQUIRED(mu_); - mutable mutex mu_; // Functions in deferred_ may only be called with mu_ held. mutable std::vector deferred_ GUARDED_BY(mu_); // Values are owned. diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h index bb19f5dca04..040bb03f819 100644 --- a/tensorflow/core/framework/tracking_allocator.h +++ b/tensorflow/core/framework/tracking_allocator.h @@ -74,11 +74,11 @@ class TrackingAllocator : public Allocator { std::pair GetSizesAndUnRef(); private: + mutex mu_; ~TrackingAllocator() override {} bool UnRef() EXCLUSIVE_LOCKS_REQUIRED(mu_); Allocator* allocator_; // not owned. - mutex mu_; // the number of calls to AllocateRaw that have not yet been matched // by a corresponding call to DeAllocateRaw, plus 1 if the Executor // has not yet read out the high watermark. diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc index 84f57517605..e91d9037cff 100644 --- a/tensorflow/core/kernels/barrier_ops.cc +++ b/tensorflow/core/kernels/barrier_ops.cc @@ -40,6 +40,7 @@ namespace tensorflow { namespace barrier { class Barrier : public ResourceBase { + mutex mu_; public: typedef std::vector Tuple; typedef std::function DoneCallback; @@ -417,7 +418,6 @@ class Barrier : public ResourceBase { private: typedef std::vector PersistentTuple; - mutex mu_; bool closed_ GUARDED_BY(mu_); bool queue_closed_ GUARDED_BY(mu_); bool queue_cancelled_ GUARDED_BY(mu_); @@ -433,6 +433,7 @@ class Barrier : public ResourceBase { }; class BarrierOp : public OpKernel { + mutex mu_; public: explicit BarrierOp(OpKernelConstruction* context) : OpKernel(context), barrier_handle_set_(false) { @@ -511,7 +512,6 @@ class BarrierOp : public OpKernel { std::vector value_component_shapes_; ContainerInfo cinfo_; - mutex mu_; PersistentTensor barrier_handle_ GUARDED_BY(mu_); bool barrier_handle_set_ GUARDED_BY(mu_); @@ -611,7 +611,9 @@ class TakeManyOp : public BarrierOpKernel { DataTypeVector expected_inputs = {DT_STRING_REF, DT_INT32}; // The first output is the insertion index, the second output is the key. DataTypeVector expected_outputs = {DT_INT64, DT_STRING}; - for (DataType dt : barrier->component_types()) { + for (auto it = barrier->component_types().begin(), + end = barrier->component_types().end(); it!= end; it++ ){ + const DataType dt = *it; expected_outputs.push_back(dt); } OP_REQUIRES_OK_ASYNC( diff --git a/tensorflow/core/kernels/conditional_accumulator.h b/tensorflow/core/kernels/conditional_accumulator.h index f8c340a7691..4ee1601f342 100644 --- a/tensorflow/core/kernels/conditional_accumulator.h +++ b/tensorflow/core/kernels/conditional_accumulator.h @@ -65,7 +65,7 @@ class ConditionalAccumulator functor::SetZeroFunctor set_zero_functor_; Status ValidateShape(const Tensor* tensor) - EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + EXCLUSIVE_LOCKS_REQUIRED(mu_) { // Must be compatible with accumulated gradient if available if (counter_ > 0) { if (!accum_grad_->shape().IsSameSize(tensor->shape())) { @@ -98,7 +98,7 @@ class ConditionalAccumulator } void DivideAccumGradByCounter(OpKernelContext* ctx) override - EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + EXCLUSIVE_LOCKS_REQUIRED(mu_) { Tensor c(DataTypeToEnum::value, {}); c.scalar()() = TypeConverter::ConvertUToT(this->counter_); this->accum_grad_->template flat().device( @@ -113,7 +113,7 @@ class ConditionalAccumulator bool GetAndValidateTensorInputForApplyGrad(OpKernelContext* ctx, const Tensor** tensor) override - EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + EXCLUSIVE_LOCKS_REQUIRED(mu_) { // Get input gradient tensor const Tensor* grad_tensor; OP_REQUIRES_OK_BOOLEAN(ctx, ctx->input("gradient", &grad_tensor)); diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h index 05ee855daee..9992379640d 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base.h +++ b/tensorflow/core/kernels/conditional_accumulator_base.h @@ -45,6 +45,8 @@ namespace tensorflow { * (3) the internal global_step value (current_global_step_) is incremented by 1 */ class ConditionalAccumulatorBase : public ResourceBase { + protected: + mutex mu_; public: // Args: // dtype: The datatype of the gradients to be accumulated. @@ -125,7 +127,6 @@ class ConditionalAccumulatorBase : public ResourceBase { const DataType dtype_; const PartialTensorShape shape_; const string name_; - mutex mu_; int counter_ GUARDED_BY(mu_); int64 current_global_step_ GUARDED_BY(mu_); diff --git a/tensorflow/core/kernels/conditional_accumulator_base_op.h b/tensorflow/core/kernels/conditional_accumulator_base_op.h index 33c2d596c8b..0a64a857cdb 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base_op.h +++ b/tensorflow/core/kernels/conditional_accumulator_base_op.h @@ -43,6 +43,7 @@ namespace tensorflow { * ConditionalAccumulatorBase (via sub-class's Creator) and returns its handle. */ class ConditionalAccumulatorBaseOp : public OpKernel { + mutex mu_; public: explicit ConditionalAccumulatorBaseOp(OpKernelConstruction* context) : OpKernel(context), accumulator_handle_set_(false) { @@ -109,7 +110,6 @@ class ConditionalAccumulatorBaseOp : public OpKernel { return Status::OK(); } - mutex mu_; PersistentTensor accumulator_handle_ GUARDED_BY(mu_); bool accumulator_handle_set_ GUARDED_BY(mu_); }; diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h index 5ad6b1fd4a1..3d1953d7f4c 100644 --- a/tensorflow/core/kernels/cwise_ops_common.h +++ b/tensorflow/core/kernels/cwise_ops_common.h @@ -82,8 +82,13 @@ class BinaryOp : public BinaryOpShared { if (!ctx->status().ok()) return; Tensor* out = state.out; BCast* bcast = &state.bcast; +#if TENSORFLOW_USE_SYCL + decltype(state.in0) in0 = state.in0; + decltype(state.in1) in1 = state.in1; +#else auto& in0 = state.in0; auto& in1 = state.in1; +#endif if (state.out_num_elements == 0) { return; } diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h index 79b479b44b5..6b2043e5a32 100644 --- a/tensorflow/core/kernels/queue_base.h +++ b/tensorflow/core/kernels/queue_base.h @@ -83,6 +83,7 @@ class QueueBase : public QueueInterface { int64 index); protected: + mutex mu_; enum Action { kEnqueue, kDequeue }; enum RunResult { kNoProgress, kProgress, kComplete }; @@ -143,7 +144,6 @@ class QueueBase : public QueueInterface { const DataTypeVector component_dtypes_; const std::vector component_shapes_; const string name_; - mutex mu_; bool closed_ GUARDED_BY(mu_); struct Attempt; diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h index 7694827854c..a21be2c389d 100644 --- a/tensorflow/core/kernels/queue_op.h +++ b/tensorflow/core/kernels/queue_op.h @@ -34,6 +34,7 @@ namespace tensorflow { // Defines a QueueOp, an abstract class for Queue construction ops. class QueueOp : public OpKernel { + mutex mu_; public: QueueOp(OpKernelConstruction* context) : OpKernel(context), queue_handle_set_(false) { @@ -94,7 +95,6 @@ class QueueOp : public OpKernel { return Status::OK(); } - mutex mu_; PersistentTensor queue_handle_ GUARDED_BY(mu_); bool queue_handle_set_ GUARDED_BY(mu_); }; diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator.h b/tensorflow/core/kernels/sparse_conditional_accumulator.h index 89560094af6..73bd3b47e48 100644 --- a/tensorflow/core/kernels/sparse_conditional_accumulator.h +++ b/tensorflow/core/kernels/sparse_conditional_accumulator.h @@ -83,7 +83,7 @@ class SparseConditionalAccumulator Status ValidateShape( std::tuple* tensor, - bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(mu_) { const Tensor* tensor_idx = std::get<0>(*tensor); const Tensor* tensor_val = std::get<1>(*tensor); const Tensor* tensor_shape = std::get<2>(*tensor); diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h index 7835fd7bbc1..96bff2c95ed 100644 --- a/tensorflow/core/kernels/tensor_array.h +++ b/tensorflow/core/kernels/tensor_array.h @@ -123,6 +123,7 @@ TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); // multiple reads of that index in the forward phase. // class TensorArray : public ResourceBase { + mutex mu_; public: static std::atomic tensor_array_counter; @@ -338,8 +339,6 @@ class TensorArray : public ResourceBase { const DataType dtype_; Tensor handle_; - mutex mu_; - // Marks that the tensor_array_ has been cleared. bool closed_ GUARDED_BY(mu_); diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc index 47112279cff..01d643fbcca 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.cc +++ b/tensorflow/core/lib/monitoring/collection_registry.cc @@ -45,7 +45,9 @@ void Collector::CollectMetricDescriptor( metric_descriptor->name = metric_def->name().ToString(); metric_descriptor->description = metric_def->description().ToString(); - for (const StringPiece label_name : metric_def->label_descriptions()) { + for (auto it = metric_def->label_descriptions().begin(), + end = metric_def->label_descriptions().end() ; it!=end ;it++ ) { + const StringPiece label_name = *it; metric_descriptor->label_names.push_back(label_name.ToString()); } diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h index 3da2439238f..ed957b9ae45 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.h +++ b/tensorflow/core/lib/monitoring/collection_registry.h @@ -121,6 +121,7 @@ class MetricCollectorGetter { // // This class is thread-safe. class CollectionRegistry { + mutable mutex mu_; public: ~CollectionRegistry() = default; @@ -176,8 +177,6 @@ class CollectionRegistry { // TF environment, mainly used for timestamping. Env* const env_; - mutable mutex mu_; - // Information required for collection. struct CollectionInfo { const AbstractMetricDef* const metric_def; @@ -227,6 +226,7 @@ inline void CollectValue(const int64& value, Point* const point) { // // This class is thread-safe. class Collector { + mutable mutex mu_; public: Collector(const uint64 collection_time_millis) : collected_metrics_(new CollectedMetrics()), @@ -260,7 +260,6 @@ class Collector { LOCKS_EXCLUDED(mu_); private: - mutable mutex mu_; std::unique_ptr collected_metrics_ GUARDED_BY(mu_); const uint64 collection_time_millis_; diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h index e76057b980a..0ea50932dd9 100644 --- a/tensorflow/core/lib/monitoring/counter.h +++ b/tensorflow/core/lib/monitoring/counter.h @@ -78,6 +78,7 @@ class CounterCell { // This class is thread-safe. template class Counter { + mutable mutex mu_; public: ~Counter() { // Deleted here, before the metric_def is destroyed. @@ -111,8 +112,6 @@ class Counter { } })) {} - mutable mutex mu_; - // The metric definition. This will be used to identify the metric when we // register it for collection. const MetricDef metric_def_; diff --git a/tensorflow/stream_executor/machine_manager.h b/tensorflow/stream_executor/machine_manager.h index 65396dd1ff5..bf95bc74713 100644 --- a/tensorflow/stream_executor/machine_manager.h +++ b/tensorflow/stream_executor/machine_manager.h @@ -60,6 +60,9 @@ namespace gputools { // // Thread-safe. class MachineManager { + // Mutex that guards the initialization of the machine manager static + // variable. + static mutex mu_; public: // Inspects the host to determine the preferred GPU execution platform. // To force OpenCL from a build target on a machine that has both OpenCL and @@ -171,10 +174,6 @@ class MachineManager { // Returns the NUMA node association for the StreamExecutor. int ExecutorToNumaNode(const StreamExecutor *stream_exec) const; - // Mutex that guards the initialization of the machine manager static - // variable. - static mutex mu_; - // Singleton MachineManager value -- assignment to this is protected by a // static singleton guard clause. static MachineManager *singleton_ GUARDED_BY(mu_); From 916663735e715856b9796ac6937f3a6565bfb3a7 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Fri, 14 Oct 2016 13:11:50 +0100 Subject: [PATCH 05/51] Turned filegroup into cc_toolchain_suite. --- third_party/sycl/crosstool/BUILD | 9 +++++---- tools/bazel.rc.template | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/third_party/sycl/crosstool/BUILD b/third_party/sycl/crosstool/BUILD index eac4dc7fad8..ec0070e71da 100755 --- a/third_party/sycl/crosstool/BUILD +++ b/third_party/sycl/crosstool/BUILD @@ -2,10 +2,11 @@ licenses(["restricted"]) package(default_visibility = ["//visibility:public"]) -filegroup( - name = "crosstool", - srcs = ["CROSSTOOL"], - output_licenses = ["unencumbered"], +cc_toolchain_suite( + name = "toolchain", + toolchains = { + "local|compiler": ":cc-compiler-local", + }, ) cc_toolchain( diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template index 875a290215d..bdbc88ba395 100644 --- a/tools/bazel.rc.template +++ b/tools/bazel.rc.template @@ -1,7 +1,7 @@ build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true -build:sycl --crosstool_top=//third_party/sycl/crosstool +build:sycl --crosstool_top=//third_party/sycl/crosstool:toolchain build:sycl --define=using_sycl=true build --force_python=py$PYTHON_MAJOR_VERSION From 656a72c0cd18cd17a7cecb2309a018f020525518 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Fri, 14 Oct 2016 16:50:23 +0100 Subject: [PATCH 06/51] Pointing to latest Eigen OpenCL version. --- tensorflow/workspace.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index ac2a22ee548..6e3def96902 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -14,7 +14,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): # These lines need to be changed when updating Eigen. They are parsed from # this file by the cmake and make builds to determine the eigen version and # hash. - eigen_version = "aad63574941c" + eigen_version = "ab6d16a84626" eigen_sha256 = "" native.new_http_archive( From 1f88ec8eef88ec783a18a98816fbb15b21df6ee7 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Mon, 17 Oct 2016 21:14:25 +0100 Subject: [PATCH 07/51] Update to SYCL bazel toolchain. --- configure | 85 +++++++---- .../core/platform/default/build_config/BUILD | 16 ++ tensorflow/workspace.bzl | 3 + third_party/sycl/BUILD | 44 ------ third_party/sycl/crosstool/BUILD | 29 ---- third_party/sycl/crosstool/BUILD.tpl | 29 ++++ .../crosstool/{CROSSTOOL => CROSSTOOL.tpl} | 2 +- .../crosstool/{computecpp => computecpp.tpl} | 8 +- third_party/sycl/platform.bzl | 17 --- third_party/sycl/sycl/BUILD | 0 third_party/sycl/sycl/BUILD.tpl | 43 ++++++ .../build_defs.bzl.tpl} | 5 +- third_party/sycl/sycl/platform.bzl.tpl | 5 + third_party/sycl/sycl_config.sh | 143 ----------------- third_party/sycl/sycl_configure.bzl | 144 ++++++++++++++++++ tools/bazel.rc.template | 2 +- 16 files changed, 307 insertions(+), 268 deletions(-) mode change 100755 => 100644 third_party/sycl/BUILD mode change 100755 => 100644 third_party/sycl/crosstool/BUILD create mode 100755 third_party/sycl/crosstool/BUILD.tpl rename third_party/sycl/crosstool/{CROSSTOOL => CROSSTOOL.tpl} (97%) rename third_party/sycl/crosstool/{computecpp => computecpp.tpl} (93%) delete mode 100755 third_party/sycl/platform.bzl create mode 100644 third_party/sycl/sycl/BUILD create mode 100755 third_party/sycl/sycl/BUILD.tpl rename third_party/sycl/{build_defs.bzl => sycl/build_defs.bzl.tpl} (86%) create mode 100755 third_party/sycl/sycl/platform.bzl.tpl delete mode 100755 third_party/sycl/sycl_config.sh create mode 100644 third_party/sycl/sycl_configure.bzl diff --git a/configure b/configure index 08078d29d5c..759f317d0cb 100755 --- a/configure +++ b/configure @@ -365,17 +365,64 @@ fi # OpenCL configuration if [ "$TF_NEED_OPENCL" == "1" ]; then + +# Determine which C++ compiler should be used as the host compiler +while true; do + fromuser="" + if [ -z "$HOST_CXX_COMPILER" ]; then + default_cxx_host_compiler=$(which g++|| true) + read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER + fromuser="1" + if [ -z "$HOST_CXX_COMPILER" ]; then + HOST_CXX_COMPILER=$default_cxx_host_compiler + fi + fi + if [ -e "$HOST_CXX_COMPILER" ]; then + export HOST_CXX_COMPILER + break + fi + echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2 + if [ -z "$fromuser" ]; then + exit 1 + fi + HOST_CXX_COMPILER="" + # Retry +done + +# Determine which C compiler should be used as the host compiler +while true; do + fromuser="" + if [ -z "$HOST_C_COMPILER" ]; then + default_c_host_compiler=$(which gcc|| true) + read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER + fromuser="1" + if [ -z "$HOST_C_COMPILER" ]; then + HOST_C_COMPILER=$default_c_host_compiler + fi + fi + if [ -e "$HOST_C_COMPILER" ]; then + export HOST_C_COMPILER + break + fi + echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2 + if [ -z "$fromuser" ]; then + exit 1 + fi + HOST_C_COMPILER="" + # Retry +done + while true; do # Configure the OPENCL version to use. TF_OPENCL_VERSION="1.2" - # Point to ComputeCPP root - if [ -z "$COMPUTECPP_PATH" ]; then - default_computecpp_path=/usr/local/computecpp - read -p "Please specify the location where ComputeCPP $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_path]: " COMPUTECPP_PATH + # Point to ComputeCpp root + if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then + default_computecpp_toolkit_path=/usr/local/computecpp + read -p "Please specify the location where ComputeCpp $TF_OPENCL_VERSION is installed. Refer to README.md for more details. [Default is $default_computecpp_toolkit_path]: " COMPUTECPP_TOOLKIT_PATH fromuser="1" - if [ -z "$COMPUTECPP_PATH" ]; then - COMPUTECPP_PATH=$default_computecpp_path + if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then + COMPUTECPP_TOOLKIT_PATH=$default_computecpp_toolkit_path fi fi @@ -383,38 +430,20 @@ while true; do SYCL_RT_LIB_PATH="lib/libComputeCpp.so" fi - if [ -e "${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH}" ]; then + if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then + export COMPUTECPP_TOOLKIT_PATH break fi - echo "Invalid path to SYCL $TF_OPENCL_VERSION library. ${COMPUTECPP_PATH}/${SYCL_RT_LIB_PATH} cannot be found" + echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found" if [ -z "$fromuser" ]; then exit 1 fi # Retry TF_OPENCL_VERSION="" - COMPUTECPP_PATH="" + COMPUTECPP_TOOLKIT_PATH="" done -cat > third_party/sycl/sycl.config < Date: Tue, 18 Oct 2016 16:25:14 +0100 Subject: [PATCH 08/51] Need to specify default C and C++ compilers to avoid bazel error when re-configuring. --- third_party/sycl/sycl_configure.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl index 37766e90dc7..e3c732bbc73 100644 --- a/third_party/sycl/sycl_configure.bzl +++ b/third_party/sycl/sycl_configure.bzl @@ -20,7 +20,7 @@ def auto_configure_fail(msg): def find_c(repository_ctx): """Find host C compiler.""" - c_name = "" + c_name = "gcc" if _HOST_C_COMPILER in repository_ctx.os.environ: c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip() if c_name.startswith("/"): @@ -32,7 +32,7 @@ def find_c(repository_ctx): def find_cc(repository_ctx): """Find host C++ compiler.""" - cc_name = "" + cc_name = "g++" if _HOST_CXX_COMPILER in repository_ctx.os.environ: cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip() if cc_name.startswith("/"): From 455be86133dc31b9da350944f20f98f8a9250e40 Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Oct 2016 17:54:14 +0100 Subject: [PATCH 09/51] Added dummy repository creation for SYCL. --- configure | 1 + third_party/sycl/sycl_configure.bzl | 93 ++++++++++++++++++++++------- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/configure b/configure index 759f317d0cb..2e8b86fdaf7 100755 --- a/configure +++ b/configure @@ -444,6 +444,7 @@ while true; do COMPUTECPP_TOOLKIT_PATH="" done +export TF_NEED_OPENCL # end of if "$TF_NEED_OPENCL" == "1" fi diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl index e3c732bbc73..2dd82198ffb 100644 --- a/third_party/sycl/sycl_configure.bzl +++ b/third_party/sycl/sycl_configure.bzl @@ -11,6 +11,12 @@ _HOST_CXX_COMPILER = "HOST_CXX_COMPILER" _HOST_C_COMPILER= "HOST_C_COMPILER" _COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH" +def _enable_sycl(repository_ctx): + if "TF_NEED_OPENCL" in repository_ctx.os.environ: + enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL"].strip() + return enable_sycl == "1" + return False + def auto_configure_fail(msg): """Output failure message when auto configuration fails.""" red = "\033[0;31m" @@ -99,33 +105,80 @@ def _file(repository_ctx, label): Label("//third_party/sycl/%s.tpl" % label), {}) -def _sycl_autoconf_imp(repository_ctx): - """Implementation of the sycl_autoconf rule.""" +_DUMMY_CROSSTOOL_BZL_FILE = """ +def error_sycl_disabled(): + fail("ERROR: Building with --config=sycl but TensorFlow is not configured " + + "to build with SYCL support. Please re-run ./configure and enter 'Y' " + + "at the prompt to build with SYCL support.") - # copy template files + native.genrule( + name = "error_gen_crosstool", + outs = ["CROSSTOOL"], + cmd = "echo 'Should not be run.' && exit 1", + ) + + native.filegroup( + name = "crosstool", + srcs = [":CROSSTOOL"], + output_licenses = ["unencumbered"], + ) +""" + + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_sycl_disabled.bzl", "error_sycl_disabled") + +error_sycl_disabled() +""" + +def _create_dummy_repository(repository_ctx): + # Set up BUILD file for sycl/. _file(repository_ctx, "sycl:build_defs.bzl") _file(repository_ctx, "sycl:BUILD") _file(repository_ctx, "sycl:platform.bzl") - _file(repository_ctx, "crosstool:BUILD") - _tpl(repository_ctx, "crosstool:computecpp", - { - "%{host_cxx_compiler}" : find_cc(repository_ctx), - "%{host_c_compiler}" : find_c(repository_ctx), - }) - computecpp_root = find_computecpp_root(repository_ctx); - _check_dir(repository_ctx, computecpp_root) + # Create dummy files for the SYCL toolkit since they are still required by + # tensorflow/sycl/platform/default/build_config:sycl. + repository_ctx.file("sycl/include/sycl.hpp", "") + repository_ctx.file("sycl/lib/libComputeCpp.so", "") - _tpl(repository_ctx, "crosstool:CROSSTOOL", - { - "%{computecpp_toolkit_path}" : computecpp_root, - }) + # If sycl_configure is not configured to build with SYCL support, and the user + # attempts to build with --config=sycl, add a dummy build rule to intercept + # this and fail with an actionable error message. + repository_ctx.file("crosstool/error_sycl_disabled.bzl", + _DUMMY_CROSSTOOL_BZL_FILE) + repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE) - # symlink libraries - _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" ) - _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib") - _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include") - _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin") + +def _sycl_autoconf_imp(repository_ctx): + """Implementation of the sycl_autoconf rule.""" + if not _enable_sycl(repository_ctx): + _create_dummy_repository(repository_ctx) + else: + # copy template files + _file(repository_ctx, "sycl:build_defs.bzl") + _file(repository_ctx, "sycl:BUILD") + _file(repository_ctx, "sycl:platform.bzl") + _file(repository_ctx, "crosstool:BUILD") + _tpl(repository_ctx, "crosstool:computecpp", + { + "%{host_cxx_compiler}" : find_cc(repository_ctx), + "%{host_c_compiler}" : find_c(repository_ctx), + }) + + computecpp_root = find_computecpp_root(repository_ctx); + _check_dir(repository_ctx, computecpp_root) + + _tpl(repository_ctx, "crosstool:CROSSTOOL", + { + "%{computecpp_toolkit_path}" : computecpp_root, + }) + + # symlink libraries + _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" ) + _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib") + _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include") + _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin") sycl_configure = repository_rule( implementation = _sycl_autoconf_imp, From 58db8411d93e277f129143a2647484e768128182 Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Oct 2016 16:16:01 +0100 Subject: [PATCH 10/51] Added template keyword for post fix expression where left hand side is a dependent expression. --- tensorflow/core/kernels/cwise_ops_common.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h index 3d1953d7f4c..0297e65b09a 100644 --- a/tensorflow/core/kernels/cwise_ops_common.h +++ b/tensorflow/core/kernels/cwise_ops_common.h @@ -101,45 +101,45 @@ class BinaryOp : public BinaryOpShared { if (state.in1_num_elements == 1) { // tensor op scalar functor::BinaryFunctor().Right( - eigen_device, out_flat, in0.flat(), in1.scalar(), + eigen_device, out_flat, in0.template flat(), in1.template scalar(), error_ptr); } else if (state.in0_num_elements == 1) { // scalar op tensor functor::BinaryFunctor().Left( - eigen_device, out_flat, in0.scalar(), in1.flat(), + eigen_device, out_flat, in0.template scalar(), in1.template flat(), error_ptr); } else { functor::BinaryFunctor()( - eigen_device, out_flat, in0.flat(), in1.flat(), + eigen_device, out_flat, in0.template flat(), in1.template flat(), error_ptr); } } else if (ndims == 2) { functor::BinaryFunctor().BCast( eigen_device, out->shaped(bcast->result_shape()), - in0.shaped(bcast->x_reshape()), + in0.template shaped(bcast->x_reshape()), BCast::ToIndexArray<2>(bcast->x_bcast()), - in1.shaped(bcast->y_reshape()), + in1.template shaped(bcast->y_reshape()), BCast::ToIndexArray<2>(bcast->y_bcast()), error_ptr); } else if (ndims == 3) { functor::BinaryFunctor().BCast( eigen_device, out->shaped(bcast->result_shape()), - in0.shaped(bcast->x_reshape()), + in0.template shaped(bcast->x_reshape()), BCast::ToIndexArray<3>(bcast->x_bcast()), - in1.shaped(bcast->y_reshape()), + in1.template shaped(bcast->y_reshape()), BCast::ToIndexArray<3>(bcast->y_bcast()), error_ptr); } else if (ndims == 4) { functor::BinaryFunctor().BCast( eigen_device, out->shaped(bcast->result_shape()), - in0.shaped(bcast->x_reshape()), + in0.template shaped(bcast->x_reshape()), BCast::ToIndexArray<4>(bcast->x_bcast()), - in1.shaped(bcast->y_reshape()), + in1.template shaped(bcast->y_reshape()), BCast::ToIndexArray<4>(bcast->y_bcast()), error_ptr); } else if (ndims == 5) { functor::BinaryFunctor().BCast( eigen_device, out->shaped(bcast->result_shape()), - in0.shaped(bcast->x_reshape()), + in0.template shaped(bcast->x_reshape()), BCast::ToIndexArray<5>(bcast->x_bcast()), - in1.shaped(bcast->y_reshape()), + in1.template shaped(bcast->y_reshape()), BCast::ToIndexArray<5>(bcast->y_bcast()), error_ptr); } else { SetUnimplementedError(ctx); From befbe983abe59c6aa96c6dae495ab458299dc0cf Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 13 Oct 2016 20:20:39 +0100 Subject: [PATCH 11/51] Added SYCL type. --- tensorflow/core/framework/types.cc | 1 + tensorflow/core/framework/types.h | 5 +++-- tensorflow/core/framework/types_test.cc | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc index d1e4d572e47..235f551d0e2 100644 --- a/tensorflow/core/framework/types.cc +++ b/tensorflow/core/framework/types.cc @@ -37,6 +37,7 @@ std::ostream& operator<<(std::ostream& os, const DeviceType& d) { const char* const DEVICE_CPU = "CPU"; const char* const DEVICE_GPU = "GPU"; +const char* const DEVICE_SYCL = "SYCL"; string DataTypeString(DataType dtype) { if (IsRefType(dtype)) { diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h index 09e5b07bed0..d8a1e160ced 100644 --- a/tensorflow/core/framework/types.h +++ b/tensorflow/core/framework/types.h @@ -67,8 +67,9 @@ class DeviceType { std::ostream& operator<<(std::ostream& os, const DeviceType& d); // Convenient constants that can be passed to a DeviceType constructor -extern const char* const DEVICE_CPU; // "CPU" -extern const char* const DEVICE_GPU; // "GPU" +extern const char* const DEVICE_CPU; // "CPU" +extern const char* const DEVICE_GPU; // "GPU" +extern const char* const DEVICE_SYCL; // "SYCL" typedef gtl::InlinedVector MemoryTypeVector; typedef gtl::ArraySlice MemoryTypeSlice; diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc index 18e0ef9c398..bc57740469f 100644 --- a/tensorflow/core/framework/types_test.cc +++ b/tensorflow/core/framework/types_test.cc @@ -25,6 +25,7 @@ namespace { TEST(TypesTest, DeviceTypeName) { EXPECT_EQ("CPU", DeviceTypeString(DeviceType(DEVICE_CPU))); EXPECT_EQ("GPU", DeviceTypeString(DeviceType(DEVICE_GPU))); + EXPECT_EQ("SYCL", DeviceTypeString(DeviceType(DEVICE_SYCL))); } TEST(TypesTest, kDataTypeRefOffset) { From 530823bcbb8540465863d58be9bc18bdbae56d63 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 13 Oct 2016 21:48:16 +0100 Subject: [PATCH 12/51] Added SYCL to the device_set_test and device_name_utils. --- tensorflow/core/common_runtime/device_set_test.cc | 9 +++++++-- tensorflow/core/util/device_name_utils.cc | 10 ++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc index 21d3be2f613..2c4aa227462 100644 --- a/tensorflow/core/common_runtime/device_set_test.cc +++ b/tensorflow/core/common_runtime/device_set_test.cc @@ -68,12 +68,17 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) { (std::vector{DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), types()); + AddDevice("SYCL", "/job:a/replica:0/task:0/sycl:0"); + EXPECT_EQ( + (std::vector{DeviceType(DEVICE_SYCL), DeviceType(DEVICE_GPU), + DeviceType(DEVICE_CPU)}), types()); + AddDevice("T1", "/job:a/replica:0/task:0/device:T1:0"); AddDevice("T1", "/job:a/replica:0/task:0/device:T1:1"); AddDevice("T2", "/job:a/replica:0/task:0/device:T2:0"); EXPECT_EQ( - (std::vector{DeviceType("T1"), DeviceType("T2"), - DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), + (std::vector{DeviceType(DEVICE_SYCL), DeviceType("T1"), + DeviceType("T2"), DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), types()); } diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc index c38b5758fa8..336b69b3aba 100644 --- a/tensorflow/core/util/device_name_utils.cc +++ b/tensorflow/core/util/device_name_utils.cc @@ -162,6 +162,16 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) { } progress = true; } + if (str_util::ConsumePrefix(&fullname, "/sycl:") || + str_util::ConsumePrefix(&fullname, "/SYCL:")) { + p->has_type = true; + p->type = "SYCL"; // Treat '/sycl:..' as uppercase '/device:SYCL:...' + p->has_id = !str_util::ConsumePrefix(&fullname, "*"); + if (p->has_id && !ConsumeNumber(&fullname, &p->id)) { + return false; + } + progress = true; + } if (!progress) { return false; From 13157f1ab96e6641b1373891688578b8d6da8d60 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Oct 2016 12:21:03 -0700 Subject: [PATCH 13/51] Switched to the latest version of Eigen --- eigen.BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eigen.BUILD b/eigen.BUILD index 8a699f6aa84..210d1523ea3 100644 --- a/eigen.BUILD +++ b/eigen.BUILD @@ -55,6 +55,8 @@ EIGEN_MPL2_HEADER_FILES = glob( ], ) +# archive_dir = "benoitsteiner-opencl-9d4a08d57d0d" + cc_library( name = "eigen", hdrs = EIGEN_MPL2_HEADER_FILES, From 4b0a8966a5000d2e292a950827f46834ee7ad646 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Oct 2016 12:24:14 -0700 Subject: [PATCH 14/51] Upgraded to the latest version of Eigen --- tensorflow/workspace.bzl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 6e3def96902..2934bc221f5 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -14,8 +14,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): # These lines need to be changed when updating Eigen. They are parsed from # this file by the cmake and make builds to determine the eigen version and # hash. - eigen_version = "ab6d16a84626" - eigen_sha256 = "" + eigen_version = "c6f01fd8258e" + eigen_sha256 = "b5c110733c6bc5f12cb3ceb66806c34932a376850b2801eea4a59a16c0de9206" native.new_http_archive( name = "eigen_archive", @@ -27,16 +27,16 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.http_archive( name = "com_googlesource_code_re2", - url = "http://github.com/google/re2/archive/7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5.tar.gz", - sha256 = "ef91af8850f734c8be65f2774747f4c2d8d81e556ba009faa79b4dd8b2759555", - strip_prefix = "re2-7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5", + url = "http://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz", + sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f", + strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c", ) native.http_archive( name = "gemmlowp", - url = "http://github.com/google/gemmlowp/archive/8b20dd2ce142115857220bd6a35e8a081b3e0829.tar.gz", - sha256 = "9cf5f1e3d64b3632dbae5c65efb79f4374ca9ac362d788fc61e086af937ff6d7", - strip_prefix = "gemmlowp-8b20dd2ce142115857220bd6a35e8a081b3e0829", + url = "http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz", + sha256 = "dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882", + strip_prefix = "gemmlowp-c0bacf11fb509a2cbe15a97362a2df067ffd57a2", ) native.new_http_archive( @@ -98,9 +98,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.http_archive( name = "protobuf", - url = "http://github.com/google/protobuf/archive/v3.1.0.tar.gz", - sha256 = "0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7", - strip_prefix = "protobuf-3.1.0", + url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz", + sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb", + strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e", ) native.new_http_archive( From a51849e74d26cdaef6365ef4951d22fea53dcc69 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Wed, 19 Oct 2016 23:27:10 +0100 Subject: [PATCH 15/51] Appache 2.0 license. --- third_party/sycl/crosstool/BUILD.tpl | 2 +- third_party/sycl/sycl/BUILD.tpl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/sycl/crosstool/BUILD.tpl b/third_party/sycl/crosstool/BUILD.tpl index ec0070e71da..f539a376c85 100755 --- a/third_party/sycl/crosstool/BUILD.tpl +++ b/third_party/sycl/crosstool/BUILD.tpl @@ -1,4 +1,4 @@ -licenses(["restricted"]) +licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl index c3879e8da54..9e83b1994cb 100755 --- a/third_party/sycl/sycl/BUILD.tpl +++ b/third_party/sycl/sycl/BUILD.tpl @@ -1,4 +1,4 @@ -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like +licenses(["notice"]) # Apache 2.0 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") load("platform", "sycl_library_path") From 78d6d1a2628ed4979f22cebdfc762ba5f88f33e3 Mon Sep 17 00:00:00 2001 From: luke Date: Thu, 20 Oct 2016 19:04:19 +0100 Subject: [PATCH 16/51] Added sycl_device and sycl_device_factory. --- tensorflow/core/BUILD | 26 ++++ .../core/common_runtime/device_factory.cc | 11 +- .../core/common_runtime/sycl/sycl_device.cc | 70 +++++++++ .../core/common_runtime/sycl/sycl_device.h | 53 +++++++ .../sycl/sycl_device_factory.cc | 43 ++++++ tensorflow/core/framework/device_base.h | 17 +++ tensorflow/core/kernels/cwise_ops_common.h | 19 ++- .../core/kernels/cwise_ops_sycl_common.h | 136 ++++++++++++++++++ tensorflow/workspace.bzl | 2 +- third_party/eigen3/BUILD | 4 +- 10 files changed, 372 insertions(+), 9 deletions(-) create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device.cc create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device.h create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device_factory.cc create mode 100644 tensorflow/core/kernels/cwise_ops_sycl_common.h diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index f32cd0e6fc8..b19ac574b99 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -505,6 +505,7 @@ cc_library( deps = [ ":core_cpu", ":gpu_runtime", + ":sycl_runtime", ], ) @@ -1414,6 +1415,31 @@ tf_cuda_library( alwayslink = 1, ) +cc_library( + name = "sycl_runtime", + srcs = if_not_windows([ + "common_runtime/sycl/sycl_device.cc", + "common_runtime/sycl/sycl_device_factory.cc", + ]), + hdrs = if_not_windows([ + "common_runtime/sycl/sycl_device.h", + ]), + copts = tf_copts(), + linkstatic = 1, + deps = [ + ":core_cpu", + ":core_cpu_internal", + ":framework", + ":framework_internal", + ":lib", + ":lib_internal", + ":protos_all_cc", + "//third_party/eigen3", + "@local_config_sycl//sycl:sycl", + ], + alwayslink = 1, +) + # ----------------------------------------------------------------------------- # Tests diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc index 8104f446366..15933a81992 100644 --- a/tensorflow/core/common_runtime/device_factory.cc +++ b/tensorflow/core/common_runtime/device_factory.cc @@ -97,11 +97,20 @@ Status DeviceFactory::AddDevices(const SessionOptions& options, gpu_factory->CreateDevices(options, name_prefix, devices)); } + // Then SYCL. + auto sycl_factory = GetFactory("SYCL"); + + if (sycl_factory) { + TF_RETURN_IF_ERROR( + sycl_factory->CreateDevices(options, name_prefix, devices)); + } + // Then the rest. mutex_lock l(*get_device_factory_lock()); for (auto& p : device_factories()) { auto factory = p.second.factory.get(); - if (factory != cpu_factory && factory != gpu_factory) { + if (factory != cpu_factory && factory != gpu_factory && + factory != sycl_factory) { TF_RETURN_IF_ERROR(factory->CreateDevices(options, name_prefix, devices)); } } diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc new file mode 100644 index 00000000000..e13c34dd690 --- /dev/null +++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc @@ -0,0 +1,70 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if TENSORFLOW_USE_SYCL + +#include "tensorflow/core/common_runtime/sycl/sycl_device.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +#include "tensorflow/core/framework/tensor.pb_text.h" +#include "tensorflow/core/platform/tracing.h" + +namespace tensorflow { + +cl::sycl::gpu_selector s; +cl::sycl::queue q(s); + +SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, BusAdjacency bus_adjacency, + Allocator* allocator) + : LocalDevice(options, Device::BuildDeviceAttributes( + name, DEVICE_SYCL, memory_limit, bus_adjacency), + allocator), + allocator_(allocator), + device_(q) { + set_eigen_sycl_device(&device_); +} + +SYCLDevice::~SYCLDevice() {} + +void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { + assert(context); + if (port::Tracing::IsActive()) { + // TODO(pbar) We really need a useful identifier of the graph node. + const uint64 id = Hash64(op_kernel->name()); + port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute, + id); + } + op_kernel->Compute(context); +} + +Allocator* SYCLDevice::GetAllocator(AllocatorAttributes attr) { + return allocator_; +} + +Status SYCLDevice::MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) { + Tensor parsed(tensor_proto.dtype()); + if (!parsed.FromProto(cpu_allocator(), tensor_proto)) { + return errors::InvalidArgument("Cannot parse tensor from proto: ", + ProtoDebugString(tensor_proto)); + } + *tensor = std::move(parsed); + return Status::OK(); +} +} + +#endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h new file mode 100644 index 00000000000..e43997cbf25 --- /dev/null +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -0,0 +1,53 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !TENSORFLOW_USE_SYCL +#error This file must only be included when building TensorFlow with SYCL support +#endif + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_ + +#define EIGEN_USE_SYCL + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class SYCLDevice : public LocalDevice { + public: + SYCLDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, BusAdjacency bus_adjacency, + Allocator* allocator); + ~SYCLDevice() override; + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + Allocator* GetAllocator(AllocatorAttributes attr) override; + Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + Status Sync() override { return Status::OK(); } + + private: + Allocator* allocator_; // Not owned + Eigen::SyclDevice device_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_DEVICE_H_ diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc new file mode 100644 index 00000000000..fe10412ab6c --- /dev/null +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -0,0 +1,43 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if TENSORFLOW_USE_SYCL + +#include "sycl_device.h" + +namespace tensorflow { + +class SYCLDeviceFactory : public DeviceFactory { + public: + Status CreateDevices(const SessionOptions& options, const string& name_prefix, + std::vector* devices) override { + int n = 1; + auto iter = options.config.device_count().find("SYCL"); + if (iter != options.config.device_count().end()) { + n = iter->second; + } + for (int i = 0; i < n; i++) { + string name = strings::StrCat(name_prefix, "/sycl:", i); + devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20), + BUS_ANY, cpu_allocator())); + } + return Status::OK(); + } +}; + +REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory); +} + +#endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h index acba11657a3..eadba4c6c8d 100644 --- a/tensorflow/core/framework/device_base.h +++ b/tensorflow/core/framework/device_base.h @@ -30,6 +30,9 @@ limitations under the License. namespace Eigen { struct ThreadPoolDevice; +#ifdef TENSORFLOW_USE_SYCL +struct SyclDevice; +#endif } // end namespace Eigen namespace perftools { @@ -145,6 +148,10 @@ class DeviceBase { eigen_cpu_device_ = d; } +#ifdef TENSORFLOW_USE_SYCL + void set_eigen_sycl_device(Eigen::SyclDevice* d) { eigen_sycl_device_ = d; } +#endif + // Return the Allocator implementation to use based on the allocator // attributes requested. See allocator.h for more details. virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) { @@ -166,6 +173,13 @@ class DeviceBase { return eigen_cpu_device_; } +#ifdef TENSORFLOW_USE_SYCL + const Eigen::SyclDevice* eigen_sycl_device() const { + CHECK(eigen_sycl_device_ != nullptr); + return eigen_sycl_device_; + } +#endif + // Caller owns the return value. The OpKernelContext calls this even // for devices that do not implement an eigen_gpu_device. Overridden // by GPU devices to return a derived type. @@ -200,6 +214,9 @@ class DeviceBase { CpuWorkerThreads* cpu_worker_threads_ = nullptr; GpuDeviceInfo* gpu_device_info_ = nullptr; Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr; +#ifdef TENSORFLOW_USE_SYCL + Eigen::SyclDevice* eigen_sycl_device_ = nullptr; +#endif }; } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h index 0297e65b09a..cea6c3d59df 100644 --- a/tensorflow/core/kernels/cwise_ops_common.h +++ b/tensorflow/core/kernels/cwise_ops_common.h @@ -20,6 +20,10 @@ limitations under the License. #define EIGEN_USE_THREADS +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/kernels/cwise_ops_sycl_common.h" +#endif + #include "tensorflow/core/kernels/cwise_ops.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h" @@ -33,6 +37,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif class BinaryOpShared : public OpKernel { public: @@ -101,17 +108,17 @@ class BinaryOp : public BinaryOpShared { if (state.in1_num_elements == 1) { // tensor op scalar functor::BinaryFunctor().Right( - eigen_device, out_flat, in0.template flat(), in1.template scalar(), - error_ptr); + eigen_device, out_flat, in0.template flat(), + in1.template scalar(), error_ptr); } else if (state.in0_num_elements == 1) { // scalar op tensor functor::BinaryFunctor().Left( - eigen_device, out_flat, in0.template scalar(), in1.template flat(), - error_ptr); + eigen_device, out_flat, in0.template scalar(), + in1.template flat(), error_ptr); } else { functor::BinaryFunctor()( - eigen_device, out_flat, in0.template flat(), in1.template flat(), - error_ptr); + eigen_device, out_flat, in0.template flat(), + in1.template flat(), error_ptr); } } else if (ndims == 2) { functor::BinaryFunctor().BCast( diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h new file mode 100644 index 00000000000..baba610d6de --- /dev/null +++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h @@ -0,0 +1,136 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !TENSORFLOW_USE_SYCL +#error This file must only be included when building TensorFlow with SYCL support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_ + +#define EIGEN_USE_SYCL + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::SyclDevice SYCLDevice; + +template +void Assign(const SYCLDevice& d, OUT out, RHS rhs) { + out.device(d) = rhs; +} + +// Partial specialization of BinaryFunctor. +template +struct BinaryFunctor { + void operator()(const SYCLDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1, bool* error) { + Assign(d, out, in0.binaryExpr(in1, typename Functor::func())); + } + + void Left(const SYCLDevice& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in, bool* error) { + // typedef typename Functor::out_type Tout; + // typedef typename Functor::in_type Tin; + // typedef typename Functor::func Binary; + // typedef typename Eigen::internal::scalar_left Unary; + // Assign(d, out, in.unaryExpr(Unary(scalar.data()))); + // printf("BinaryFunctor::Left NOT IMPLEMENTED ! \n"); + LOG(FATAL) << "BinaryFunctor::Left NOT IMPLEMENTED ! "; + } + + void Right(const SYCLDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_right Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data()))); + } + + void BCast(const SYCLDevice& d, + typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename Eigen::array bcast0, + typename TTypes::ConstTensor in1, + typename Eigen::array bcast1, + bool* error) { + LOG(FATAL) << "BinaryFunctor::BCast NOT IMPLEMENTED "; + // printf("BinaryFunctor::BCast NOT IMPLEMENTED ! \n"); + } +}; + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt). +#define DEFINE_UNARY1(F, T) template struct UnaryFunctor > +#define DEFINE_UNARY2(F, T0, T1) \ + DEFINE_UNARY1(F, T0); \ + DEFINE_UNARY1(F, T1) +#define DEFINE_UNARY3(F, T0, T1, T2) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY1(F, T2) +#define DEFINE_UNARY4(F, T0, T1, T2, T3) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY2(F, T2, T3) +#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY3(F, T2, T3, T4) + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for BinaryFunctor. +#define DEFINE_BINARY1(F, T) \ + template struct BinaryFunctor, 1>; \ + template struct BinaryFunctor, 2>; \ + template struct BinaryFunctor, 3> +#define DEFINE_BINARY2(F, T0, T1) \ + DEFINE_BINARY1(F, T0); \ + DEFINE_BINARY1(F, T1) +#define DEFINE_BINARY3(F, T0, T1, T2) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY1(F, T2) +#define DEFINE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY2(F, T2, T3) +#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY3(F, T2, T3, T4) +#define DEFINE_BINARY6(F, T0, T1, T2, T3, T4, T5) \ + DEFINE_BINARY3(F, T0, T1, T2); \ + DEFINE_BINARY3(F, T3, T4, T5) +#define DEFINE_BINARY7(F, T0, T1, T2, T3, T4, T5, T6) \ + DEFINE_BINARY3(F, T0, T1, T2); \ + DEFINE_BINARY4(F, T3, T4, T5, T6) +#define DEFINE_BINARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \ + DEFINE_BINARY4(F, T0, T1, T2, T3); \ + DEFINE_BINARY4(F, T4, T5, T6, T7) +#define DEFINE_BINARY9(F, T0, T1, T2, T3, T4, T5, T6, T7, T8) \ + DEFINE_BINARY4(F, T0, T1, T2, T3); \ + DEFINE_BINARY5(F, T4, T5, T6, T7, T8) +#define DEFINE_BINARY10(F, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \ + DEFINE_BINARY5(F, T0, T1, T2, T3, T4); \ + DEFINE_BINARY5(F, T5, T6, T7, T8, T9) + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_ diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 62d303a8781..e1b7b95e489 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -17,7 +17,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): # These lines need to be changed when updating Eigen. They are parsed from # this file by the cmake and make builds to determine the eigen version and # hash. - eigen_version = "ab6d16a84626" + eigen_version = "090e5709bbaa" eigen_sha256 = "" native.new_http_archive( diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD index 9ab7aadf87a..b45af94e540 100644 --- a/third_party/eigen3/BUILD +++ b/third_party/eigen3/BUILD @@ -23,5 +23,7 @@ cc_library( "unsupported/Eigen/CXX11/FixedPoint", ], visibility = ["//visibility:public"], - deps = ["@eigen_archive//:eigen"], + deps = ["@eigen_archive//:eigen", + "@local_config_sycl//sycl:sycl", + ], ) From f25bc0098aaf93ecff2751e7d92df23c50e31fce Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:03:46 +0100 Subject: [PATCH 17/51] Revert "Applied workaround for the ComputeCpp CE." This reverts commit b8b166429a80b038da1cd56f331bcbbebb95932d. --- tensorflow/core/common_runtime/bfc_allocator.h | 5 +++-- tensorflow/core/common_runtime/direct_session.h | 3 +-- .../core/common_runtime/gpu/gpu_allocator_retry_test.cc | 6 +++--- tensorflow/core/common_runtime/gpu/pool_allocator.h | 2 +- tensorflow/core/framework/op.h | 3 +-- tensorflow/core/framework/tracking_allocator.h | 2 +- tensorflow/core/kernels/barrier_ops.cc | 8 +++----- tensorflow/core/kernels/conditional_accumulator.h | 6 +++--- tensorflow/core/kernels/conditional_accumulator_base.h | 3 +-- tensorflow/core/kernels/conditional_accumulator_base_op.h | 2 +- tensorflow/core/kernels/cwise_ops_common.h | 5 ----- tensorflow/core/kernels/queue_base.h | 2 +- tensorflow/core/kernels/queue_op.h | 2 +- tensorflow/core/kernels/sparse_conditional_accumulator.h | 2 +- tensorflow/core/kernels/tensor_array.h | 3 ++- tensorflow/core/lib/monitoring/collection_registry.cc | 4 +--- tensorflow/core/lib/monitoring/collection_registry.h | 5 +++-- tensorflow/core/lib/monitoring/counter.h | 3 ++- tensorflow/stream_executor/machine_manager.h | 7 ++++--- 19 files changed, 33 insertions(+), 40 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 3b3e4a134f6..0b528cb0c27 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -295,8 +295,6 @@ class BFCAllocator : public VisitableAllocator { private: std::vector regions_; }; - // Structures mutable after construction - mutable mutex lock_; // Returns 'bytes' rounded up to the next highest kMinAllocationSize. size_t RoundedBytes(size_t bytes); @@ -395,6 +393,9 @@ class BFCAllocator : public VisitableAllocator { std::unique_ptr suballocator_; string name_; + + // Structures mutable after construction + mutable mutex lock_; RegionManager region_manager_ GUARDED_BY(lock_); std::vector chunks_; diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h index 8fe4825aa6d..a4289112534 100644 --- a/tensorflow/core/common_runtime/direct_session.h +++ b/tensorflow/core/common_runtime/direct_session.h @@ -162,8 +162,6 @@ class DirectSession : public Session { protobuf::RepeatedPtrField debug_tensor_watches; }; - mutex graph_def_lock_; - // Initializes the base execution state given the 'graph', // if not already initialized. Status MaybeInitializeExecutionState(const GraphDef& graph, @@ -229,6 +227,7 @@ class DirectSession : public Session { string session_handle_; bool graph_created_ GUARDED_BY(graph_def_lock_) = false; + mutex graph_def_lock_; GraphDef graph_def_ GUARDED_BY(graph_def_lock_); // The thread-pools to use for running ops. diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc index 423448773ae..2148f83fe57 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc @@ -28,7 +28,6 @@ namespace tensorflow { namespace { class FakeAllocator { - mutex mu_; public: FakeAllocator(size_t cap, int millis_to_wait) : memory_capacity_(cap), millis_to_wait_(millis_to_wait) {} @@ -58,6 +57,7 @@ class FakeAllocator { private: AllocatorRetry retry_; void* good_ptr_ = reinterpret_cast(0xdeadbeef); + mutex mu_; size_t memory_capacity_ GUARDED_BY(mu_); int millis_to_wait_; }; @@ -72,7 +72,6 @@ class FakeAllocator { // interesting part of their interaction with the allocator. This // class is the mechanism that imposes turn taking. class AlternatingBarrier { - mutex mu_; public: explicit AlternatingBarrier(int num_users) : num_users_(num_users), next_turn_(0), done_(num_users, false) {} @@ -110,6 +109,7 @@ class AlternatingBarrier { } } + mutex mu_; condition_variable cv_; int num_users_; int next_turn_ GUARDED_BY(mu_); @@ -118,7 +118,6 @@ class AlternatingBarrier { class GPUAllocatorRetryTest : public ::testing::Test { protected: - mutex mu_; GPUAllocatorRetryTest() {} void LaunchConsumerThreads(int num_consumers, int cap_needed) { @@ -174,6 +173,7 @@ class GPUAllocatorRetryTest : public ::testing::Test { std::vector consumers_; std::vector consumer_count_; Notification notifier_; + mutex mu_; bool has_failed_ GUARDED_BY(mu_) = false; int count_ GUARDED_BY(mu_) = 0; }; diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h index 437fea91155..b2f0265145f 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator.h +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -45,7 +45,6 @@ class RoundUpInterface { // Size-limited pool of memory buffers obtained from a SubAllocator // instance. Pool eviction policy is LRU. class PoolAllocator : public VisitableAllocator { - mutex mutex_; public: // "pool_size_limit" is the maximum number of returned, re-usable // memory buffers to keep in the pool. If pool_size_limit == 0, the @@ -137,6 +136,7 @@ class PoolAllocator : public VisitableAllocator { size_t pool_size_limit_; std::unique_ptr allocator_; std::unique_ptr size_rounder_; + mutex mutex_; std::multimap pool_ GUARDED_BY(mutex_); PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr; PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr; diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h index 321ace9f465..f047ddb12a1 100644 --- a/tensorflow/core/framework/op.h +++ b/tensorflow/core/framework/op.h @@ -125,8 +125,6 @@ class OpRegistry : public OpRegistryInterface { void ClearDeferredRegistrations(); private: - mutable mutex mu_; - // Ensures that all the functions in deferred_ get called, their OpDef's // registered, and returns with deferred_ empty. Returns true the first // time it is called. Prints a fatal log if any op registration fails. @@ -143,6 +141,7 @@ class OpRegistry : public OpRegistryInterface { Status RegisterAlreadyLocked(OpRegistrationDataFactory op_data_factory) const EXCLUSIVE_LOCKS_REQUIRED(mu_); + mutable mutex mu_; // Functions in deferred_ may only be called with mu_ held. mutable std::vector deferred_ GUARDED_BY(mu_); // Values are owned. diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h index 040bb03f819..bb19f5dca04 100644 --- a/tensorflow/core/framework/tracking_allocator.h +++ b/tensorflow/core/framework/tracking_allocator.h @@ -74,11 +74,11 @@ class TrackingAllocator : public Allocator { std::pair GetSizesAndUnRef(); private: - mutex mu_; ~TrackingAllocator() override {} bool UnRef() EXCLUSIVE_LOCKS_REQUIRED(mu_); Allocator* allocator_; // not owned. + mutex mu_; // the number of calls to AllocateRaw that have not yet been matched // by a corresponding call to DeAllocateRaw, plus 1 if the Executor // has not yet read out the high watermark. diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc index e91d9037cff..84f57517605 100644 --- a/tensorflow/core/kernels/barrier_ops.cc +++ b/tensorflow/core/kernels/barrier_ops.cc @@ -40,7 +40,6 @@ namespace tensorflow { namespace barrier { class Barrier : public ResourceBase { - mutex mu_; public: typedef std::vector Tuple; typedef std::function DoneCallback; @@ -418,6 +417,7 @@ class Barrier : public ResourceBase { private: typedef std::vector PersistentTuple; + mutex mu_; bool closed_ GUARDED_BY(mu_); bool queue_closed_ GUARDED_BY(mu_); bool queue_cancelled_ GUARDED_BY(mu_); @@ -433,7 +433,6 @@ class Barrier : public ResourceBase { }; class BarrierOp : public OpKernel { - mutex mu_; public: explicit BarrierOp(OpKernelConstruction* context) : OpKernel(context), barrier_handle_set_(false) { @@ -512,6 +511,7 @@ class BarrierOp : public OpKernel { std::vector value_component_shapes_; ContainerInfo cinfo_; + mutex mu_; PersistentTensor barrier_handle_ GUARDED_BY(mu_); bool barrier_handle_set_ GUARDED_BY(mu_); @@ -611,9 +611,7 @@ class TakeManyOp : public BarrierOpKernel { DataTypeVector expected_inputs = {DT_STRING_REF, DT_INT32}; // The first output is the insertion index, the second output is the key. DataTypeVector expected_outputs = {DT_INT64, DT_STRING}; - for (auto it = barrier->component_types().begin(), - end = barrier->component_types().end(); it!= end; it++ ){ - const DataType dt = *it; + for (DataType dt : barrier->component_types()) { expected_outputs.push_back(dt); } OP_REQUIRES_OK_ASYNC( diff --git a/tensorflow/core/kernels/conditional_accumulator.h b/tensorflow/core/kernels/conditional_accumulator.h index 4ee1601f342..f8c340a7691 100644 --- a/tensorflow/core/kernels/conditional_accumulator.h +++ b/tensorflow/core/kernels/conditional_accumulator.h @@ -65,7 +65,7 @@ class ConditionalAccumulator functor::SetZeroFunctor set_zero_functor_; Status ValidateShape(const Tensor* tensor) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { + EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { // Must be compatible with accumulated gradient if available if (counter_ > 0) { if (!accum_grad_->shape().IsSameSize(tensor->shape())) { @@ -98,7 +98,7 @@ class ConditionalAccumulator } void DivideAccumGradByCounter(OpKernelContext* ctx) override - EXCLUSIVE_LOCKS_REQUIRED(mu_) { + EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { Tensor c(DataTypeToEnum::value, {}); c.scalar()() = TypeConverter::ConvertUToT(this->counter_); this->accum_grad_->template flat().device( @@ -113,7 +113,7 @@ class ConditionalAccumulator bool GetAndValidateTensorInputForApplyGrad(OpKernelContext* ctx, const Tensor** tensor) override - EXCLUSIVE_LOCKS_REQUIRED(mu_) { + EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { // Get input gradient tensor const Tensor* grad_tensor; OP_REQUIRES_OK_BOOLEAN(ctx, ctx->input("gradient", &grad_tensor)); diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h index 9992379640d..05ee855daee 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base.h +++ b/tensorflow/core/kernels/conditional_accumulator_base.h @@ -45,8 +45,6 @@ namespace tensorflow { * (3) the internal global_step value (current_global_step_) is incremented by 1 */ class ConditionalAccumulatorBase : public ResourceBase { - protected: - mutex mu_; public: // Args: // dtype: The datatype of the gradients to be accumulated. @@ -127,6 +125,7 @@ class ConditionalAccumulatorBase : public ResourceBase { const DataType dtype_; const PartialTensorShape shape_; const string name_; + mutex mu_; int counter_ GUARDED_BY(mu_); int64 current_global_step_ GUARDED_BY(mu_); diff --git a/tensorflow/core/kernels/conditional_accumulator_base_op.h b/tensorflow/core/kernels/conditional_accumulator_base_op.h index 0a64a857cdb..33c2d596c8b 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base_op.h +++ b/tensorflow/core/kernels/conditional_accumulator_base_op.h @@ -43,7 +43,6 @@ namespace tensorflow { * ConditionalAccumulatorBase (via sub-class's Creator) and returns its handle. */ class ConditionalAccumulatorBaseOp : public OpKernel { - mutex mu_; public: explicit ConditionalAccumulatorBaseOp(OpKernelConstruction* context) : OpKernel(context), accumulator_handle_set_(false) { @@ -110,6 +109,7 @@ class ConditionalAccumulatorBaseOp : public OpKernel { return Status::OK(); } + mutex mu_; PersistentTensor accumulator_handle_ GUARDED_BY(mu_); bool accumulator_handle_set_ GUARDED_BY(mu_); }; diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h index cea6c3d59df..c825a91fb16 100644 --- a/tensorflow/core/kernels/cwise_ops_common.h +++ b/tensorflow/core/kernels/cwise_ops_common.h @@ -89,13 +89,8 @@ class BinaryOp : public BinaryOpShared { if (!ctx->status().ok()) return; Tensor* out = state.out; BCast* bcast = &state.bcast; -#if TENSORFLOW_USE_SYCL - decltype(state.in0) in0 = state.in0; - decltype(state.in1) in1 = state.in1; -#else auto& in0 = state.in0; auto& in1 = state.in1; -#endif if (state.out_num_elements == 0) { return; } diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h index 6b2043e5a32..79b479b44b5 100644 --- a/tensorflow/core/kernels/queue_base.h +++ b/tensorflow/core/kernels/queue_base.h @@ -83,7 +83,6 @@ class QueueBase : public QueueInterface { int64 index); protected: - mutex mu_; enum Action { kEnqueue, kDequeue }; enum RunResult { kNoProgress, kProgress, kComplete }; @@ -144,6 +143,7 @@ class QueueBase : public QueueInterface { const DataTypeVector component_dtypes_; const std::vector component_shapes_; const string name_; + mutex mu_; bool closed_ GUARDED_BY(mu_); struct Attempt; diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h index a21be2c389d..7694827854c 100644 --- a/tensorflow/core/kernels/queue_op.h +++ b/tensorflow/core/kernels/queue_op.h @@ -34,7 +34,6 @@ namespace tensorflow { // Defines a QueueOp, an abstract class for Queue construction ops. class QueueOp : public OpKernel { - mutex mu_; public: QueueOp(OpKernelConstruction* context) : OpKernel(context), queue_handle_set_(false) { @@ -95,6 +94,7 @@ class QueueOp : public OpKernel { return Status::OK(); } + mutex mu_; PersistentTensor queue_handle_ GUARDED_BY(mu_); bool queue_handle_set_ GUARDED_BY(mu_); }; diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator.h b/tensorflow/core/kernels/sparse_conditional_accumulator.h index 73bd3b47e48..89560094af6 100644 --- a/tensorflow/core/kernels/sparse_conditional_accumulator.h +++ b/tensorflow/core/kernels/sparse_conditional_accumulator.h @@ -83,7 +83,7 @@ class SparseConditionalAccumulator Status ValidateShape( std::tuple* tensor, - bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + bool has_known_shape) EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { const Tensor* tensor_idx = std::get<0>(*tensor); const Tensor* tensor_val = std::get<1>(*tensor); const Tensor* tensor_shape = std::get<2>(*tensor); diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h index ec651f72ecf..1fb1be6b628 100644 --- a/tensorflow/core/kernels/tensor_array.h +++ b/tensorflow/core/kernels/tensor_array.h @@ -124,7 +124,6 @@ TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); // multiple reads of that index in the forward phase. // class TensorArray : public ResourceBase { - mutex mu_; public: static std::atomic tensor_array_counter; @@ -340,6 +339,8 @@ class TensorArray : public ResourceBase { const DataType dtype_; Tensor handle_; + mutex mu_; + // Marks that the tensor_array_ has been cleared. bool closed_ GUARDED_BY(mu_); diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc index 01d643fbcca..47112279cff 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.cc +++ b/tensorflow/core/lib/monitoring/collection_registry.cc @@ -45,9 +45,7 @@ void Collector::CollectMetricDescriptor( metric_descriptor->name = metric_def->name().ToString(); metric_descriptor->description = metric_def->description().ToString(); - for (auto it = metric_def->label_descriptions().begin(), - end = metric_def->label_descriptions().end() ; it!=end ;it++ ) { - const StringPiece label_name = *it; + for (const StringPiece label_name : metric_def->label_descriptions()) { metric_descriptor->label_names.push_back(label_name.ToString()); } diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h index ed957b9ae45..3da2439238f 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.h +++ b/tensorflow/core/lib/monitoring/collection_registry.h @@ -121,7 +121,6 @@ class MetricCollectorGetter { // // This class is thread-safe. class CollectionRegistry { - mutable mutex mu_; public: ~CollectionRegistry() = default; @@ -177,6 +176,8 @@ class CollectionRegistry { // TF environment, mainly used for timestamping. Env* const env_; + mutable mutex mu_; + // Information required for collection. struct CollectionInfo { const AbstractMetricDef* const metric_def; @@ -226,7 +227,6 @@ inline void CollectValue(const int64& value, Point* const point) { // // This class is thread-safe. class Collector { - mutable mutex mu_; public: Collector(const uint64 collection_time_millis) : collected_metrics_(new CollectedMetrics()), @@ -260,6 +260,7 @@ class Collector { LOCKS_EXCLUDED(mu_); private: + mutable mutex mu_; std::unique_ptr collected_metrics_ GUARDED_BY(mu_); const uint64 collection_time_millis_; diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h index 0ea50932dd9..e76057b980a 100644 --- a/tensorflow/core/lib/monitoring/counter.h +++ b/tensorflow/core/lib/monitoring/counter.h @@ -78,7 +78,6 @@ class CounterCell { // This class is thread-safe. template class Counter { - mutable mutex mu_; public: ~Counter() { // Deleted here, before the metric_def is destroyed. @@ -112,6 +111,8 @@ class Counter { } })) {} + mutable mutex mu_; + // The metric definition. This will be used to identify the metric when we // register it for collection. const MetricDef metric_def_; diff --git a/tensorflow/stream_executor/machine_manager.h b/tensorflow/stream_executor/machine_manager.h index bf95bc74713..65396dd1ff5 100644 --- a/tensorflow/stream_executor/machine_manager.h +++ b/tensorflow/stream_executor/machine_manager.h @@ -60,9 +60,6 @@ namespace gputools { // // Thread-safe. class MachineManager { - // Mutex that guards the initialization of the machine manager static - // variable. - static mutex mu_; public: // Inspects the host to determine the preferred GPU execution platform. // To force OpenCL from a build target on a machine that has both OpenCL and @@ -174,6 +171,10 @@ class MachineManager { // Returns the NUMA node association for the StreamExecutor. int ExecutorToNumaNode(const StreamExecutor *stream_exec) const; + // Mutex that guards the initialization of the machine manager static + // variable. + static mutex mu_; + // Singleton MachineManager value -- assignment to this is protected by a // static singleton guard clause. static MachineManager *singleton_ GUARDED_BY(mu_); From 0ad69511e18b6d87eef5ae76ff9332021d2cd955 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:12:20 +0100 Subject: [PATCH 18/51] Quick copy can be used for host buffers when using sycl. --- tensorflow/core/common_runtime/rendezvous_mgr.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc index 285ac7540c8..6b57e8a0e81 100644 --- a/tensorflow/core/common_runtime/rendezvous_mgr.cc +++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc @@ -65,10 +65,10 @@ void IntraProcessRendezvous::SameWorkerRecvDone( StatusCallback done) { // Do a quick copy (sharing the underlying buffer) if both tensors // are on host memory. - const bool src_host = - (send_args.alloc_attrs.on_host() || parsed.src.type == "CPU"); - const bool dst_host = - (recv_args.alloc_attrs.on_host() || parsed.dst.type == "CPU"); + const bool src_host = (send_args.alloc_attrs.on_host() || + parsed.src.type == "CPU" || parsed.src.type == "SYCL"); + const bool dst_host = (recv_args.alloc_attrs.on_host() || + parsed.dst.type == "CPU" || parsed.dst.type == "SYCL"); if (src_host && dst_host) { *out = in; done(Status::OK()); From d4e0f8f654f558458577ce9c848bfd8c0239e2e7 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:18:23 +0100 Subject: [PATCH 19/51] Added eigen_device that returns Eigen::SyclDevice to op_kernel. --- tensorflow/core/framework/op_kernel.cc | 5 +++++ tensorflow/core/framework/op_kernel.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index a7a93cb69cf..d233690a2b3 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -949,6 +949,11 @@ const Eigen::GpuDevice& OpKernelContext::eigen_device() const { return eigen_gpu_device(); } +template <> +const Eigen::SyclDevice& OpKernelContext::eigen_device() const { + return eigen_sycl_device(); +} + void OpKernelConstruction::CtxFailure(Status s) { VLOG(1) << s; SetStatus(s); diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 4c14918ea70..55de40c4714 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -53,6 +53,7 @@ limitations under the License. namespace Eigen { struct ThreadPoolDevice; struct GpuDevice; +struct SyclDevice; } // end namespace Eigen namespace tensorflow { @@ -891,6 +892,9 @@ class OpKernelContext { const Eigen::GpuDevice& eigen_gpu_device() const { return params_->eigen_gpu_device->device(); } + const Eigen::SyclDevice& eigen_sycl_device() const { + return *device()->eigen_sycl_device(); + } template const EigenDeviceType& eigen_device() const; From 05cff7db89df7894dc9c0d40668a4915f41b65c6 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:20:29 +0100 Subject: [PATCH 20/51] Registered Const Op for SYCL device. --- tensorflow/core/kernels/constant_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index a743be66124..946661aa39d 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -50,6 +50,7 @@ void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); } ConstantOp::~ConstantOp() {} REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); +REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_SYCL), ConstantOp); #if GOOGLE_CUDA #define REGISTER_KERNEL(D, TYPE) \ From 397972c4903d6d9fa1f266e1256bcc6ba786809f Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:23:19 +0100 Subject: [PATCH 21/51] Partial specialisation of UnaryFunctor for SYCLDevice has been added. --- tensorflow/core/kernels/cwise_ops_sycl_common.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h index baba610d6de..868fe9b8270 100644 --- a/tensorflow/core/kernels/cwise_ops_sycl_common.h +++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h @@ -37,6 +37,15 @@ void Assign(const SYCLDevice& d, OUT out, RHS rhs) { out.device(d) = rhs; } +// Partial specialization of UnaryFunctor. +template +struct UnaryFunctor { + void operator()(const SYCLDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in) { + To32Bit(out).device(d) = To32Bit(in).unaryExpr(typename Functor::func()); + } +}; + // Partial specialization of BinaryFunctor. template struct BinaryFunctor { From 41e439e629c4b8c647c3bc34f7155be90ea51029 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:25:05 +0100 Subject: [PATCH 22/51] _Arg and _Retval has been registered for SYCL device. --- tensorflow/core/kernels/function_ops.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index 56253eb64a7..d348a0dff3b 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -86,6 +86,9 @@ class RetvalOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp); REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp); +REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_SYCL), ArgOp); +REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_SYCL), RetvalOp); + #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("_Arg").Device(DEVICE_GPU).TypeConstraint("T"), ArgOp); \ From 115bc8e54d0d46d4a1e6515fb82101ec8a191112 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:26:24 +0100 Subject: [PATCH 23/51] Identity Op has been registered for SYCL device. --- tensorflow/core/kernels/identity_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc index 459d329ba4a..ced8280e3bc 100644 --- a/tensorflow/core/kernels/identity_op.cc +++ b/tensorflow/core/kernels/identity_op.cc @@ -24,6 +24,7 @@ limitations under the License. namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp); +REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_SYCL), IdentityOp); // StopGradient does the same thing as Identity, but has a different // gradient registered. REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp); From 81e02084337f5fa5deceeb59fe86f1dcfbb4ab34 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:27:30 +0100 Subject: [PATCH 24/51] NoOp has been registered for SYCL device. --- tensorflow/core/kernels/no_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc index 0ad05ee3235..e4c7f48e773 100644 --- a/tensorflow/core/kernels/no_op.cc +++ b/tensorflow/core/kernels/no_op.cc @@ -19,5 +19,6 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp); REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp); +REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_SYCL), NoOp); } // namespace tensorflow From d60cccf42951f9aff63f500d6441ef6f18edea52 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:30:23 +0100 Subject: [PATCH 25/51] _Send and _Recv Ops has been registered for SYCL device. --- tensorflow/core/kernels/sendrecv_ops.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index c2a04ed0c4c..38b0bf5756e 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -77,6 +77,7 @@ void SendOp::Compute(OpKernelContext* ctx) { REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); +REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp); REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER( @@ -135,6 +136,7 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); +REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp); REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER( From fe9a864aad6b2cafdb6cd379e4be0f123247fa65 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:33:32 +0100 Subject: [PATCH 26/51] Const op has been registered for SYCL device. --- tensorflow/core/kernels/cwise_op_round.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc index 0457f3931d8..c87157f2d37 100644 --- a/tensorflow/core/kernels/cwise_op_round.cc +++ b/tensorflow/core/kernels/cwise_op_round.cc @@ -18,6 +18,12 @@ limitations under the License. namespace tensorflow { REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double, int32, int64); + +REGISTER(UnaryOp, SYCL, "Round", functor::round, float); +namespace functor { +DEFINE_UNARY1(round, float); +} // namespace functor + #if GOOGLE_CUDA REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double, int32, int64); From 32f53d9b064f1451690e0b75ea71845239bb8f3b Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 07:40:19 +0100 Subject: [PATCH 27/51] BlockingOp has been registered for SYCL device. --- tensorflow/core/common_runtime/direct_session_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 38dd627da0c..124688515bd 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -818,6 +818,8 @@ class BlockingOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp); REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc(""); +REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_SYCL), BlockingOp); + static void TestSessionInterOpThreadsImpl(bool use_function_lib) { FunctionDefLibrary library_graph_def; if (use_function_lib) { From 9401110a9d8545c40fce638e4c63fcbd2668d0fc Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Thu, 27 Oct 2016 20:55:30 +0100 Subject: [PATCH 28/51] bus_adjacency -> const DeviceLocality& locality. And dummy GetShortDeviceDescription for SYCL device. --- tensorflow/core/common_runtime/sycl/sycl_device.cc | 9 +++++---- tensorflow/core/common_runtime/sycl/sycl_device.h | 10 ++++++++-- .../core/common_runtime/sycl/sycl_device_factory.cc | 5 +++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc index e13c34dd690..fbb4d148bf3 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc @@ -27,10 +27,11 @@ cl::sycl::gpu_selector s; cl::sycl::queue q(s); SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name, - Bytes memory_limit, BusAdjacency bus_adjacency, - Allocator* allocator) - : LocalDevice(options, Device::BuildDeviceAttributes( - name, DEVICE_SYCL, memory_limit, bus_adjacency), + Bytes memory_limit, const DeviceLocality& locality, + const string& physical_device_desc, Allocator* allocator) + : LocalDevice(options, + Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit, + locality, physical_device_desc), allocator), allocator_(allocator), device_(q) { diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h index e43997cbf25..660ccfa6cea 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -31,8 +31,8 @@ namespace tensorflow { class SYCLDevice : public LocalDevice { public: SYCLDevice(const SessionOptions& options, const string& name, - Bytes memory_limit, BusAdjacency bus_adjacency, - Allocator* allocator); + Bytes memory_limit, const DeviceLocality& locality, + const string& physical_device_desc, Allocator* allocator); ~SYCLDevice() override; void Compute(OpKernel* op_kernel, OpKernelContext* context) override; @@ -42,6 +42,12 @@ class SYCLDevice : public LocalDevice { Tensor* tensor) override; Status Sync() override { return Status::OK(); } + static string GetShortDeviceDescription(/*int device_id, + const DeviceDescription& desc*/) { + return strings::StrCat("device: 0, name SYCL, pci bus id: 0"); + // return strings::StrCat("device: ", device_id, ", name: ", desc.name(), + // ", pci bus id: ", desc.pci_bus_id()); + } private: Allocator* allocator_; // Not owned diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index fe10412ab6c..fba4a5f4018 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -30,8 +30,9 @@ class SYCLDeviceFactory : public DeviceFactory { } for (int i = 0; i < n; i++) { string name = strings::StrCat(name_prefix, "/sycl:", i); - devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20), - BUS_ANY, cpu_allocator())); + devices->push_back(new SYCLDevice( + options, name, Bytes(256 << 20), DeviceLocality(), + SYCLDevice::GetShortDeviceDescription(), cpu_allocator())); } return Status::OK(); } From 0b4ce7841aa2d18c8546cfe75ef2c58b89864996 Mon Sep 17 00:00:00 2001 From: luke Date: Fri, 28 Oct 2016 11:27:29 +0100 Subject: [PATCH 29/51] Added #ifdef guard for SYCL related code. --- tensorflow/core/framework/op_kernel.cc | 2 ++ tensorflow/core/framework/op_kernel.h | 2 ++ tensorflow/core/kernels/cwise_op_round.cc | 2 ++ 3 files changed, 6 insertions(+) diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index d233690a2b3..50520bb3fd5 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -949,10 +949,12 @@ const Eigen::GpuDevice& OpKernelContext::eigen_device() const { return eigen_gpu_device(); } +#ifdef TENSORFLOW_USE_SYCL template <> const Eigen::SyclDevice& OpKernelContext::eigen_device() const { return eigen_sycl_device(); } +#endif void OpKernelConstruction::CtxFailure(Status s) { VLOG(1) << s; diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 55de40c4714..432e2ad2f6e 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -892,9 +892,11 @@ class OpKernelContext { const Eigen::GpuDevice& eigen_gpu_device() const { return params_->eigen_gpu_device->device(); } +#ifdef TENSORFLOW_USE_SYCL const Eigen::SyclDevice& eigen_sycl_device() const { return *device()->eigen_sycl_device(); } +#endif template const EigenDeviceType& eigen_device() const; diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc index c87157f2d37..7a4482dbb2b 100644 --- a/tensorflow/core/kernels/cwise_op_round.cc +++ b/tensorflow/core/kernels/cwise_op_round.cc @@ -19,10 +19,12 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double, int32, int64); +#ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Round", functor::round, float); namespace functor { DEFINE_UNARY1(round, float); } // namespace functor +#endif #if GOOGLE_CUDA REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double, From 27e116b85cccb016a380f8977b89c4e3638d5b25 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 28 Oct 2016 13:10:01 -0700 Subject: [PATCH 30/51] Fixed a typo --- tensorflow/workspace.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c8ec9fb0ab1..2211f1a28c6 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -22,7 +22,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", - url = "http://bitbucket.org/eigen/eigen/" + eigen_version + ".tar.gz", + url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz", sha256 = eigen_sha256, strip_prefix = "eigen-eigen-" + eigen_version, build_file = str(Label("//:eigen.BUILD")), From 174c6edb7dd5f99b4e9b0f45a0eaf978b578e85c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 28 Oct 2016 13:42:37 -0700 Subject: [PATCH 31/51] Cleanup --- eigen.BUILD | 2 -- 1 file changed, 2 deletions(-) diff --git a/eigen.BUILD b/eigen.BUILD index 210d1523ea3..8a699f6aa84 100644 --- a/eigen.BUILD +++ b/eigen.BUILD @@ -55,8 +55,6 @@ EIGEN_MPL2_HEADER_FILES = glob( ], ) -# archive_dir = "benoitsteiner-opencl-9d4a08d57d0d" - cc_library( name = "eigen", hdrs = EIGEN_MPL2_HEADER_FILES, From cf33ec5e6a33fea15b719a0dd8c16d5b1a5c8b70 Mon Sep 17 00:00:00 2001 From: luke iwanski Date: Tue, 1 Nov 2016 16:47:00 +0000 Subject: [PATCH 32/51] Feedback from #5267 applied. --- tensorflow/core/common_runtime/sycl/sycl_device.h | 2 -- tensorflow/core/common_runtime/sycl/sycl_device_factory.cc | 2 +- tensorflow/core/kernels/constant_op.cc | 2 ++ tensorflow/core/kernels/cwise_ops_sycl_common.h | 7 ------- 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h index 660ccfa6cea..9299bb2af3d 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -45,8 +45,6 @@ class SYCLDevice : public LocalDevice { static string GetShortDeviceDescription(/*int device_id, const DeviceDescription& desc*/) { return strings::StrCat("device: 0, name SYCL, pci bus id: 0"); - // return strings::StrCat("device: ", device_id, ", name: ", desc.name(), - // ", pci bus id: ", desc.pci_bus_id()); } private: diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index fba4a5f4018..6de97715283 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -15,7 +15,7 @@ limitations under the License. #if TENSORFLOW_USE_SYCL -#include "sycl_device.h" +#include "tensorflow/core/common_runtime/sycl/sycl_device.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index afb1034f9fb..00b44555a3d 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -50,7 +50,9 @@ void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); } ConstantOp::~ConstantOp() {} REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); +#ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_SYCL), ConstantOp); +#endif #if GOOGLE_CUDA #define REGISTER_KERNEL(D, TYPE) \ diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h index 868fe9b8270..c66ae42c2d2 100644 --- a/tensorflow/core/kernels/cwise_ops_sycl_common.h +++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h @@ -58,12 +58,6 @@ struct BinaryFunctor { void Left(const SYCLDevice& d, typename Functor::tout_type out, typename Functor::tscalar_type scalar, typename Functor::tin_type in, bool* error) { - // typedef typename Functor::out_type Tout; - // typedef typename Functor::in_type Tin; - // typedef typename Functor::func Binary; - // typedef typename Eigen::internal::scalar_left Unary; - // Assign(d, out, in.unaryExpr(Unary(scalar.data()))); - // printf("BinaryFunctor::Left NOT IMPLEMENTED ! \n"); LOG(FATAL) << "BinaryFunctor::Left NOT IMPLEMENTED ! "; } @@ -85,7 +79,6 @@ struct BinaryFunctor { typename Eigen::array bcast1, bool* error) { LOG(FATAL) << "BinaryFunctor::BCast NOT IMPLEMENTED "; - // printf("BinaryFunctor::BCast NOT IMPLEMENTED ! \n"); } }; From f58374e88ddfbc7ead046c8b42a5cb8423603cac Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Jun 2016 10:24:45 -0700 Subject: [PATCH 33/51] Pull the latest Eigen version that supports OpenCL --- third_party/eigen3/Eigen/Eigenvalues | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues index bf739b9b850..be27849e3b8 100644 --- a/third_party/eigen3/Eigen/Eigenvalues +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -1 +1,5 @@ +<<<<<<< d9a89a5cf63b2ecedf68d3eefdc24be3f519e503 #include "Eigen/Eigenvalues" +======= +#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Eigenvalues" +>>>>>>> Pull the latest Eigen version that supports OpenCL From 04454bcaa89b5be5a8fd0273ee49c5405f919b5a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 12 Oct 2016 22:52:59 -0700 Subject: [PATCH 34/51] Switch to the latest version of Eigen that supports OpenCL --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 2211f1a28c6..d6c0aff9b1e 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -22,9 +22,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", - url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz", + url = "http://bitbucket.org/benoitsteiner/opencl/get/" + eigen_version + ".tar.gz", sha256 = eigen_sha256, - strip_prefix = "eigen-eigen-" + eigen_version, + strip_prefix = "benoitsteiner-opencl-" + eigen_version, build_file = str(Label("//:eigen.BUILD")), ) From ac7584fac29543683bb418dc346b23b9e904b799 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Oct 2016 12:24:14 -0700 Subject: [PATCH 35/51] Upgraded to the latest version of Eigen --- tensorflow/workspace.bzl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index d6c0aff9b1e..61bd20808ab 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -37,9 +37,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.http_archive( name = "gemmlowp", +<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074 url = "http://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz", sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26", strip_prefix = "gemmlowp-a6f29d8ac48d63293f845f2253eccbf86bc28321", +======= + url = "http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz", + sha256 = "dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882", + strip_prefix = "gemmlowp-c0bacf11fb509a2cbe15a97362a2df067ffd57a2", +>>>>>>> Upgraded to the latest version of Eigen ) native.new_http_archive( @@ -101,9 +107,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.http_archive( name = "protobuf", +<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074 url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz", sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab", strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db", +======= + url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz", + sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb", + strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e", +>>>>>>> Upgraded to the latest version of Eigen ) native.new_http_archive( From f4325776061a31283f2a377ffa6f5de329b9903a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 28 Oct 2016 13:10:01 -0700 Subject: [PATCH 36/51] Fixed a typo --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 61bd20808ab..b78534755c9 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -22,9 +22,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", - url = "http://bitbucket.org/benoitsteiner/opencl/get/" + eigen_version + ".tar.gz", + url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz", sha256 = eigen_sha256, - strip_prefix = "benoitsteiner-opencl-" + eigen_version, + strip_prefix = "eigen-eigen-" + eigen_version, build_file = str(Label("//:eigen.BUILD")), ) From 8e06a6e655c1d97496fa29e1b4435f8f0fb913aa Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 1 Nov 2016 10:32:48 -0700 Subject: [PATCH 37/51] Fixed merge conflicts. --- tensorflow/workspace.bzl | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index b78534755c9..2211f1a28c6 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -37,15 +37,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.http_archive( name = "gemmlowp", -<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074 url = "http://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz", sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26", strip_prefix = "gemmlowp-a6f29d8ac48d63293f845f2253eccbf86bc28321", -======= - url = "http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz", - sha256 = "dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882", - strip_prefix = "gemmlowp-c0bacf11fb509a2cbe15a97362a2df067ffd57a2", ->>>>>>> Upgraded to the latest version of Eigen ) native.new_http_archive( @@ -107,15 +101,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.http_archive( name = "protobuf", -<<<<<<< a6125b624cf1757b4cc2751fe764b34fbc8c9074 url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz", sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab", strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db", -======= - url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz", - sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb", - strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e", ->>>>>>> Upgraded to the latest version of Eigen ) native.new_http_archive( From 4c85e1cd6c6d432265a566e058b7389f23ec4aa5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 1 Nov 2016 10:56:23 -0700 Subject: [PATCH 38/51] Fixed merge conflict --- third_party/eigen3/Eigen/Eigenvalues | 4 ---- 1 file changed, 4 deletions(-) diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues index be27849e3b8..bf739b9b850 100644 --- a/third_party/eigen3/Eigen/Eigenvalues +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -1,5 +1 @@ -<<<<<<< d9a89a5cf63b2ecedf68d3eefdc24be3f519e503 #include "Eigen/Eigenvalues" -======= -#include "benoitsteiner-opencl-9d4a08d57d0d/Eigen/Eigenvalues" ->>>>>>> Pull the latest Eigen version that supports OpenCL From f52f0f586e7ae72ca60241bf2f02a96923809723 Mon Sep 17 00:00:00 2001 From: Xiaoqiang Zheng Date: Fri, 28 Oct 2016 10:29:28 -0800 Subject: [PATCH 39/51] Merge changes from github. Change: 137532946 --- tensorflow/core/ops/math_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index ff00214da3c..9e553694b92 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -976,7 +976,7 @@ REGISTER_OP("Select") c->set_output(0, data); return Status::OK(); - }) + }) .Doc(R"doc( Selects elements from `t` or `e`, depending on `condition`. From 7326f9e9c0cbcb2e1584dff551cbbc4c41eb103f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 08:22:08 -0700 Subject: [PATCH 40/51] Improved the formatting of the BUILD file --- third_party/eigen3/BUILD | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD index b45af94e540..f697866bde2 100644 --- a/third_party/eigen3/BUILD +++ b/third_party/eigen3/BUILD @@ -23,7 +23,8 @@ cc_library( "unsupported/Eigen/CXX11/FixedPoint", ], visibility = ["//visibility:public"], - deps = ["@eigen_archive//:eigen", - "@local_config_sycl//sycl:sycl", + deps = [ + "@eigen_archive//:eigen", + "@local_config_sycl//sycl:sycl", ], ) From 810ade7a449b8e4c168de614c35d17dccd68f291 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 09:17:10 -0700 Subject: [PATCH 41/51] Deleted unecessary code --- .../core/common_runtime/device_factory.cc | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc index 15933a81992..84362d4b8ab 100644 --- a/tensorflow/core/common_runtime/device_factory.cc +++ b/tensorflow/core/common_runtime/device_factory.cc @@ -90,27 +90,11 @@ Status DeviceFactory::AddDevices(const SessionOptions& options, return errors::NotFound("No CPU devices are available in this process"); } - // Then GPU. - auto gpu_factory = GetFactory("GPU"); - if (gpu_factory) { - TF_RETURN_IF_ERROR( - gpu_factory->CreateDevices(options, name_prefix, devices)); - } - - // Then SYCL. - auto sycl_factory = GetFactory("SYCL"); - - if (sycl_factory) { - TF_RETURN_IF_ERROR( - sycl_factory->CreateDevices(options, name_prefix, devices)); - } - // Then the rest. mutex_lock l(*get_device_factory_lock()); for (auto& p : device_factories()) { auto factory = p.second.factory.get(); - if (factory != cpu_factory && factory != gpu_factory && - factory != sycl_factory) { + if (factory != cpu_factory) { TF_RETURN_IF_ERROR(factory->CreateDevices(options, name_prefix, devices)); } } From 3f39dec966cc89cd3ff8f1c86995276e12ffd083 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 10:33:49 -0700 Subject: [PATCH 42/51] Placement of operations of SYCL devices needs to be done by using the /device:sycl string. --- tensorflow/core/util/device_name_utils.cc | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc index 336b69b3aba..ac183004966 100644 --- a/tensorflow/core/util/device_name_utils.cc +++ b/tensorflow/core/util/device_name_utils.cc @@ -142,6 +142,7 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) { progress = true; } + // Handle legacy naming convention for cpu and gpu. if (str_util::ConsumePrefix(&fullname, "/cpu:") || str_util::ConsumePrefix(&fullname, "/CPU:")) { p->has_type = true; @@ -162,16 +163,6 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) { } progress = true; } - if (str_util::ConsumePrefix(&fullname, "/sycl:") || - str_util::ConsumePrefix(&fullname, "/SYCL:")) { - p->has_type = true; - p->type = "SYCL"; // Treat '/sycl:..' as uppercase '/device:SYCL:...' - p->has_id = !str_util::ConsumePrefix(&fullname, "*"); - if (p->has_id && !ConsumeNumber(&fullname, &p->id)) { - return false; - } - progress = true; - } if (!progress) { return false; From d5511081f54788b4f7851f81ac9a058a3fc620f0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 18:39:51 -0700 Subject: [PATCH 43/51] Register the constant op on sycl devices for numeric types only: we don't want to process strings on an accelerator. --- tensorflow/core/kernels/constant_op.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 00b44555a3d..4a289e1800f 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -50,8 +50,16 @@ void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); } ConstantOp::~ConstantOp() {} REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); -#ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_SYCL), ConstantOp); + +#if TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Const") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("dtype"), \ + ConstantOp); +TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL); +#undef REGISTER_SYCL_KERNEL #endif #if GOOGLE_CUDA From 90113e739cf0c6e93332bf02f8342df13950d4d1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 18:44:48 -0700 Subject: [PATCH 44/51] Added a device context for sycl --- tensorflow/core/BUILD | 2 + .../core/common_runtime/rendezvous_mgr.cc | 4 +- .../core/common_runtime/sycl/sycl_device.cc | 19 +++++++- .../core/common_runtime/sycl/sycl_device.h | 5 ++ .../sycl/sycl_device_context.cc | 46 +++++++++++++++++++ .../common_runtime/sycl/sycl_device_context.h | 42 +++++++++++++++++ 6 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device_context.cc create mode 100644 tensorflow/core/common_runtime/sycl/sycl_device_context.h diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 90445ed4473..ff3fdb518fd 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1380,10 +1380,12 @@ cc_library( name = "sycl_runtime", srcs = if_not_windows([ "common_runtime/sycl/sycl_device.cc", + "common_runtime/sycl/sycl_device_context.cc", "common_runtime/sycl/sycl_device_factory.cc", ]), hdrs = if_not_windows([ "common_runtime/sycl/sycl_device.h", + "common_runtime/sycl/sycl_device_context.h", ]), copts = tf_copts(), linkstatic = 1, diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc index 6b57e8a0e81..bbdaa6d85b7 100644 --- a/tensorflow/core/common_runtime/rendezvous_mgr.cc +++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc @@ -66,9 +66,9 @@ void IntraProcessRendezvous::SameWorkerRecvDone( // Do a quick copy (sharing the underlying buffer) if both tensors // are on host memory. const bool src_host = (send_args.alloc_attrs.on_host() || - parsed.src.type == "CPU" || parsed.src.type == "SYCL"); + parsed.src.type == "CPU"); const bool dst_host = (recv_args.alloc_attrs.on_host() || - parsed.dst.type == "CPU" || parsed.dst.type == "SYCL"); + parsed.dst.type == "CPU"); if (src_host && dst_host) { *out = in; done(Status::OK()); diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc index fbb4d148bf3..ae5b5fbb582 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc @@ -34,11 +34,14 @@ SYCLDevice::SYCLDevice(const SessionOptions& options, const string& name, locality, physical_device_desc), allocator), allocator_(allocator), + device_context_(new SYCLDeviceContext()), device_(q) { set_eigen_sycl_device(&device_); } -SYCLDevice::~SYCLDevice() {} +SYCLDevice::~SYCLDevice() { + device_context_->Unref(); +} void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { assert(context); @@ -66,6 +69,20 @@ Status SYCLDevice::MakeTensorFromProto(const TensorProto& tensor_proto, *tensor = std::move(parsed); return Status::OK(); } + +Status SYCLDevice::FillContextMap(const Graph* graph, + DeviceContextMap* device_context_map) { + // Fill in the context map. It is OK for this map to contain + // duplicate DeviceContexts so long as we increment the refcount. + device_context_map->resize(graph->num_node_ids()); + for (Node* n : graph->nodes()) { + device_context_->Ref(); + (*device_context_map)[n->id()] = device_context_; + } + + return Status::OK(); } +} // namespace tensorflow + #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h index 9299bb2af3d..eaa9429b167 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { @@ -41,6 +42,9 @@ class SYCLDevice : public LocalDevice { const AllocatorAttributes alloc_attrs, Tensor* tensor) override; + Status FillContextMap(const Graph* graph, + DeviceContextMap* device_context_map) override; + Status Sync() override { return Status::OK(); } static string GetShortDeviceDescription(/*int device_id, const DeviceDescription& desc*/) { @@ -49,6 +53,7 @@ class SYCLDevice : public LocalDevice { private: Allocator* allocator_; // Not owned + SYCLDeviceContext* device_context_; Eigen::SyclDevice device_; }; diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc new file mode 100644 index 00000000000..bbf241a22f0 --- /dev/null +++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc @@ -0,0 +1,46 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h" +#include "tensorflow/core/common_runtime/dma_helper.h" + +namespace tensorflow { + +void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, + StatusCallback done) const { + const int64 total_bytes = cpu_tensor->TotalBytes(); + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(cpu_tensor); + void* dst_ptr = DMAHelper::base(device_tensor); + ::memcpy(dst_ptr, src_ptr, total_bytes); + } + done(Status::OK()); +} + +void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name, + Device* device, Tensor* cpu_tensor, + StatusCallback done) { + const int64 total_bytes = device_tensor->TotalBytes(); + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(device_tensor); + void* dst_ptr = DMAHelper::base(cpu_tensor); + ::memcpy(dst_ptr, src_ptr, total_bytes); + } + done(Status::OK()); +} + +} // namespace tensorflow + diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.h b/tensorflow/core/common_runtime/sycl/sycl_device_context.h new file mode 100644 index 00000000000..327de52eaa1 --- /dev/null +++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.h @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_base.h" + +namespace tensorflow { + +class SYCLDeviceContext : public DeviceContext { + public: + SYCLDeviceContext() {} + + ~SYCLDeviceContext() override {} + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, + StatusCallback done) const override; + + void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece edge_name, + Device* device, Tensor* cpu_tensor, + StatusCallback done) override; + +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_ From 8f97e2448dc1df374f5ec28457da047d6db3121b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 19:37:49 -0700 Subject: [PATCH 45/51] Register the sycl device under the '/device:SYCL:id' name so that its name follow the standard naming convention for devices --- tensorflow/core/common_runtime/sycl/sycl_device_factory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index 6de97715283..97c4c2c236e 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -29,7 +29,7 @@ class SYCLDeviceFactory : public DeviceFactory { n = iter->second; } for (int i = 0; i < n; i++) { - string name = strings::StrCat(name_prefix, "/sycl:", i); + string name = strings::StrCat(name_prefix, "/device:SYCL:", i); devices->push_back(new SYCLDevice( options, name, Bytes(256 << 20), DeviceLocality(), SYCLDevice::GetShortDeviceDescription(), cpu_allocator())); From d9037a06b4dacdd360db6eac7cd1338c48c0e385 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 19:39:36 -0700 Subject: [PATCH 46/51] Upgraded to the latest version of Eigen --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 2211f1a28c6..a485e25a9d2 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -17,8 +17,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): # These lines need to be changed when updating Eigen. They are parsed from # this file by the cmake and make builds to determine the eigen version and # hash. - eigen_version = "1d454915237a" - eigen_sha256 = "7e05dd4b9866ef0aa4498be34752a362596cc5db2f8439cee111e4ea54046b57" + eigen_version = "3f0fb403ec4c" + eigen_sha256 = "9ff8301c6af2640932c5ded77ecccee5786cec8c31315311220618b312e0472b" native.new_http_archive( name = "eigen_archive", From fc9bde9c0675116490d204c21f81c764691503f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 19:40:19 -0700 Subject: [PATCH 47/51] Only register the _Arg and _Retval kernel for POD types on sycl --- tensorflow/core/kernels/function_ops.cc | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index 7d538d2924f..7cb9a3a6573 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -87,8 +87,28 @@ class RetvalOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp); REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp); -REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_SYCL), ArgOp); -REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_SYCL), RetvalOp); +#if TENSORFLOW_USE_SYCL +#define REGISTER(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_Arg").Device(DEVICE_SYCL).TypeConstraint("T"), ArgOp); + TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER) + TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg") + .Device(DEVICE_GPU) + .HostMemory("output") + .TypeConstraint("T"), + ArgOp); +#undef REGISTER +#define REGISTER(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_Retval").Device(DEVICE_SYCL).TypeConstraint("T"), RetvalOp); + TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER) + TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval") + .Device(DEVICE_GPU) + .HostMemory("input") + .TypeConstraint("T"), + RetvalOp); +#undef REGISTER +#endif #define REGISTER(type) \ REGISTER_KERNEL_BUILDER( \ From 8d26ac88ef974ae5e5bd1d64e8da49ff6909dc76 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 20:38:51 -0700 Subject: [PATCH 48/51] Updated the name of the sycl device --- tensorflow/core/common_runtime/device_set_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc index 2c4aa227462..550fbf568ed 100644 --- a/tensorflow/core/common_runtime/device_set_test.cc +++ b/tensorflow/core/common_runtime/device_set_test.cc @@ -68,7 +68,7 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) { (std::vector{DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), types()); - AddDevice("SYCL", "/job:a/replica:0/task:0/sycl:0"); + AddDevice("SYCL", "/job:a/replica:0/task:0/device:sycl:0"); EXPECT_EQ( (std::vector{DeviceType(DEVICE_SYCL), DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}), types()); From 3e678dc5f8c9d7fd0911705b886ffd792031770e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 2 Nov 2016 20:53:57 -0700 Subject: [PATCH 49/51] Improved code formatting --- tensorflow/core/common_runtime/rendezvous_mgr.cc | 8 ++++---- tensorflow/core/ops/math_ops.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc index bbdaa6d85b7..285ac7540c8 100644 --- a/tensorflow/core/common_runtime/rendezvous_mgr.cc +++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc @@ -65,10 +65,10 @@ void IntraProcessRendezvous::SameWorkerRecvDone( StatusCallback done) { // Do a quick copy (sharing the underlying buffer) if both tensors // are on host memory. - const bool src_host = (send_args.alloc_attrs.on_host() || - parsed.src.type == "CPU"); - const bool dst_host = (recv_args.alloc_attrs.on_host() || - parsed.dst.type == "CPU"); + const bool src_host = + (send_args.alloc_attrs.on_host() || parsed.src.type == "CPU"); + const bool dst_host = + (recv_args.alloc_attrs.on_host() || parsed.dst.type == "CPU"); if (src_host && dst_host) { *out = in; done(Status::OK()); diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 9e553694b92..ff00214da3c 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -976,7 +976,7 @@ REGISTER_OP("Select") c->set_output(0, data); return Status::OK(); - }) + }) .Doc(R"doc( Selects elements from `t` or `e`, depending on `condition`. From 5ea3f1830ce723990ec64c3848d8171bfeaa7341 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Nov 2016 08:22:02 -0700 Subject: [PATCH 50/51] Fixed formatting of the BUILD file --- tensorflow/core/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ff3fdb518fd..6c2fec6412a 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1385,7 +1385,7 @@ cc_library( ]), hdrs = if_not_windows([ "common_runtime/sycl/sycl_device.h", - "common_runtime/sycl/sycl_device_context.h", + "common_runtime/sycl/sycl_device_context.h", ]), copts = tf_copts(), linkstatic = 1, From e2694bd4bc2ae61075da98cad47f688b592d53bc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Nov 2016 10:31:00 -0700 Subject: [PATCH 51/51] Don't create SYCL kernels unless TENSORFLOW_USE_SYCL is set --- tensorflow/core/kernels/identity_op.cc | 20 +++++++++++++++++++- tensorflow/core/kernels/no_op.cc | 3 +++ tensorflow/core/kernels/sendrecv_ops.cc | 6 ++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc index ced8280e3bc..45d27dd19eb 100644 --- a/tensorflow/core/kernels/identity_op.cc +++ b/tensorflow/core/kernels/identity_op.cc @@ -24,7 +24,6 @@ limitations under the License. namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp); -REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_SYCL), IdentityOp); // StopGradient does the same thing as Identity, but has a different // gradient registered. REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp); @@ -35,6 +34,24 @@ REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp); +#if TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Identity").Device(DEVICE_SYCL).TypeConstraint("T"), \ + IdentityOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefIdentity").Device(DEVICE_SYCL).TypeConstraint("T"), \ + IdentityOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("StopGradient").Device(DEVICE_SYCL).TypeConstraint("T"),\ + IdentityOp) + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); +REGISTER_SYCL_KERNEL(bfloat16); + +#undef REGISTER_SYCL_KERNEL +#endif + #define REGISTER_GPU_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ Name("Identity").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -51,6 +68,7 @@ REGISTER_GPU_KERNEL(bfloat16); #undef REGISTER_GPU_KERNEL + #if GOOGLE_CUDA // A special GPU kernel for int32 and bool. // TODO(b/25387198): Also enable int32 in device memory. This kernel diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc index e4c7f48e773..0993e6e1fcf 100644 --- a/tensorflow/core/kernels/no_op.cc +++ b/tensorflow/core/kernels/no_op.cc @@ -19,6 +19,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp); REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp); + +#if TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_SYCL), NoOp); +#endif } // namespace tensorflow diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 38b0bf5756e..9e9cdda3826 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -77,7 +77,10 @@ void SendOp::Compute(OpKernelContext* ctx) { REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); + +#if TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp); +#endif REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER( @@ -136,7 +139,10 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); + +#if TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp); +#endif REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER(