OpenCL Improvements (#7596)

* OpenCL improvements

Added double support for the Tile, Transpose and Range ops on the SYCL device.
Moved gpu_device_name() to test_util.py so that it can be used by force_gpu to
select either a GPU or a SYCL device, depending on what is available on the
system.

* SYCL support improvements (#31)

* Improvements to the SYCL device support

This commit reduces the number of failing tests when TensorFlow is compiled
with OpenCL support.

 - Registration of the type traits required for the strided slice op
 - Registration of ConcatOffset, _ListToArray, _ArrayToList,
   Pad, Reverse (CPU), ReverseV2 (CPU), Size, ExpandDims,
   Squeeze, StridedSlice, StridedSliceGrad, StridedSliceAssign,
   TileGrad, InvertPermutation, Transpose
 - Registration of SYCL kernels only for essential data types
 - floor_div_real has been disabled for the SYCL device
 - The device name in control_flow_ops_py_test.py needed to be lowercased

* Fixes & Version bump (#33)

* Fix Unbuntu typo. (#38)

unbuntu -> ubuntu

* Add problem descriptions and solutions (#35)

* Add ComputeCpp lib folder to LD_LIBRARY_PATH

* Add ImportError problem + solution

If you get the error message "ImportError: libComputeCpp.so: cannot open shared
object file: No such file or directory", make sure you have added the
path to ComputeCpp's lib folder to your `LD_LIBRARY_PATH`.
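For example, something along these lines (the exact path depends on where
ComputeCpp is installed on your machine):

export LD_LIBRARY_PATH=/path/to/ComputeCpp/lib:$LD_LIBRARY_PATH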

* Add another ImportError problem + solution

If you get the error message "ImportError: cannot import name
'pywrap_tensorflow'", you are probably running Python from inside the
TensorFlow source directory.
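A quick fix is to change to any other directory before starting Python, so
that the installed tensorflow package is imported instead of the source tree.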

* Improvements to the SYCL device support

* Registers the FloorDiv, FloorMod and SoftMax ops for the SYCL device

* Workaround for 0 bytes allocation for SYCL device (#42)
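
A minimal sketch of the workaround (the full change appears in the
SYCLAllocator::AllocateRaw hunk further down): a zero-byte request is served
with a one-byte allocation, so the caller always receives a valid pointer.

void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  assert(device_);
  if (num_bytes == 0) {
    // Never ask the underlying Eigen SYCL queue for zero bytes.
    return device_->allocate(1);
  }
  return device_->allocate(num_bytes);
}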

* Sycl improvements (#44)

 - Eigen version bump
 - Extends the Cast and Cwise ops benchmarks to cover the SYCL device
 - Extends device_lib_test.py to cover the SYCL device
 - Registers int32, string and ResourceHandle to run on the host for the
   Enter and RefEnter SYCL ops
 - Enables the ReduceMax op for SYCL since the Eigen implementation is ready
 - Registers the Less op for the SYCL device

* Improved the formatting of the SYCL code

* Fixed a compilation error.

* Made sure that using test sessions with force_gpu=True forces placement on
a GPU device even if none is detected.
Benoit Steiner, 2017-02-21 11:00:19 -08:00 (committed by Vijay Vasudevan)
parent 43c71a0338
commit 2c8d0dca97
102 changed files with 1451 additions and 221 deletions

@ -116,6 +116,8 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
# -----------------------------------------------------------------------------
# Public targets
@ -729,7 +731,7 @@ cc_library(
"//tensorflow/core/kernels:ops_testutil",
"//tensorflow/core/kernels:ops_util",
"//tensorflow/core/platform/default/build_config:gtest",
],
] + if_sycl([":sycl_runtime"]),
)
# This is a link-only library to provide a DirectSession


@ -25,6 +25,9 @@ string SYCLAllocator::Name() { return "device:SYCL"; }
void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
assert(device_);
if (num_bytes == 0) {
return device_->allocate(1);
}
auto p = device_->allocate(num_bytes);
return p;
}
@ -42,6 +45,6 @@ void SYCLAllocator::EnterLameDuckMode() {
}
}
} // namespace tensorflow
} // namespace tensorflow
#endif // TENSORFLOW_USE_SYCL
#endif // TENSORFLOW_USE_SYCL


@ -27,8 +27,8 @@ limitations under the License.
namespace tensorflow {
class SYCLAllocator : public Allocator {
public:
SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {}
public:
SYCLAllocator(Eigen::QueueInterface *device) : device_(device) {}
virtual ~SYCLAllocator() override;
string Name() override;
void *AllocateRaw(size_t alignment, size_t num_bytes) override;
@ -36,11 +36,12 @@ public:
void EnterLameDuckMode();
virtual bool ShouldAllocateEmptyTensors() override final { return true; }
private:
private:
Eigen::QueueInterface *device_; // not owned
TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
};
} // namespace tensorflow
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_


@ -23,7 +23,7 @@ limitations under the License.
namespace tensorflow {
static std::unordered_set<SYCLDevice*> live_devices;
static std::unordered_set<SYCLDevice *> live_devices;
static bool first_time = true;
void ShutdownSycl() {


@ -34,10 +34,11 @@ class SYCLDevice : public LocalDevice {
Bytes memory_limit, const DeviceLocality &locality,
const string &physical_device_desc, SYCLSelector sycl_selector,
Allocator *cpu_allocator)
: LocalDevice(options, Device::BuildDeviceAttributes(
name, DEVICE_SYCL, memory_limit, locality,
physical_device_desc),
nullptr),
: LocalDevice(
options,
Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
locality, physical_device_desc),
nullptr),
cpu_allocator_(cpu_allocator),
sycl_queue_(new Eigen::QueueInterface(sycl_selector)),
sycl_device_(new Eigen::SyclDevice(sycl_queue_)),


@ -17,8 +17,8 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
namespace tensorflow {
@ -31,68 +31,68 @@ void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor,
const void *src_ptr = DMAHelper::base(cpu_tensor);
void *dst_ptr = DMAHelper::base(device_tensor);
switch (cpu_tensor->dtype()) {
case DT_FLOAT:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
total_bytes);
break;
case DT_DOUBLE:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
total_bytes);
break;
case DT_INT32:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
total_bytes);
break;
case DT_INT64:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
total_bytes);
break;
case DT_HALF:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<Eigen::half *>(dst_ptr),
static_cast<const Eigen::half *>(src_ptr), total_bytes);
break;
case DT_COMPLEX64:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<std::complex<float> *>(dst_ptr),
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
break;
case DT_COMPLEX128:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<std::complex<double> *>(dst_ptr),
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
break;
case DT_INT8:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
total_bytes);
break;
case DT_INT16:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
total_bytes);
break;
case DT_UINT8:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
total_bytes);
break;
case DT_UINT16:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
total_bytes);
break;
case DT_BOOL:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
total_bytes);
break;
default:
assert(false && "unsupported type");
case DT_FLOAT:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
total_bytes);
break;
case DT_DOUBLE:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<double *>(dst_ptr),
static_cast<const double *>(src_ptr), total_bytes);
break;
case DT_INT32:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
total_bytes);
break;
case DT_INT64:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
total_bytes);
break;
case DT_HALF:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<Eigen::half *>(dst_ptr),
static_cast<const Eigen::half *>(src_ptr), total_bytes);
break;
case DT_COMPLEX64:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<std::complex<float> *>(dst_ptr),
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
break;
case DT_COMPLEX128:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<std::complex<double> *>(dst_ptr),
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
break;
case DT_INT8:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
total_bytes);
break;
case DT_INT16:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
total_bytes);
break;
case DT_UINT8:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
total_bytes);
break;
case DT_UINT16:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<uint16 *>(dst_ptr),
static_cast<const uint16 *>(src_ptr), total_bytes);
break;
case DT_BOOL:
device->eigen_sycl_device()->memcpyHostToDevice(
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
total_bytes);
break;
default:
assert(false && "unsupported type");
}
}
device->eigen_sycl_device()->synchronize();
@ -106,71 +106,71 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
StatusCallback done) {
const int64 total_bytes = device_tensor->TotalBytes();
if (total_bytes > 0) {
const void* src_ptr = DMAHelper::base(device_tensor);
void* dst_ptr = DMAHelper::base(cpu_tensor);
const void *src_ptr = DMAHelper::base(device_tensor);
void *dst_ptr = DMAHelper::base(cpu_tensor);
switch (device_tensor->dtype()) {
case DT_FLOAT:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
total_bytes);
break;
case DT_DOUBLE:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
total_bytes);
break;
case DT_INT32:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
total_bytes);
break;
case DT_INT64:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
total_bytes);
break;
case DT_HALF:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<Eigen::half *>(dst_ptr),
static_cast<const Eigen::half *>(src_ptr), total_bytes);
break;
case DT_COMPLEX64:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<std::complex<float> *>(dst_ptr),
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
break;
case DT_COMPLEX128:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<std::complex<double> *>(dst_ptr),
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
break;
case DT_INT8:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
total_bytes);
break;
case DT_INT16:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
total_bytes);
break;
case DT_UINT8:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
total_bytes);
break;
case DT_UINT16:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
total_bytes);
break;
case DT_BOOL:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
total_bytes);
break;
default:
assert(false && "unsupported type");
case DT_FLOAT:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
total_bytes);
break;
case DT_DOUBLE:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<double *>(dst_ptr),
static_cast<const double *>(src_ptr), total_bytes);
break;
case DT_INT32:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
total_bytes);
break;
case DT_INT64:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
total_bytes);
break;
case DT_HALF:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<Eigen::half *>(dst_ptr),
static_cast<const Eigen::half *>(src_ptr), total_bytes);
break;
case DT_COMPLEX64:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<std::complex<float> *>(dst_ptr),
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
break;
case DT_COMPLEX128:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<std::complex<double> *>(dst_ptr),
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
break;
case DT_INT8:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
total_bytes);
break;
case DT_INT16:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
total_bytes);
break;
case DT_UINT8:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
total_bytes);
break;
case DT_UINT16:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<uint16 *>(dst_ptr),
static_cast<const uint16 *>(src_ptr), total_bytes);
break;
case DT_BOOL:
device->eigen_sycl_device()->memcpyDeviceToHost(
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
total_bytes);
break;
default:
assert(false && "unsupported type");
}
}
device->eigen_sycl_device()->synchronize();
@ -178,4 +178,4 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
}
} // namespace tensorflow
#endif // TENSORFLOW_USE_SYCL
#endif // TENSORFLOW_USE_SYCL


@ -26,7 +26,7 @@ limitations under the License.
namespace tensorflow {
class SYCLDeviceContext : public DeviceContext {
public:
public:
SYCLDeviceContext() {}
~SYCLDeviceContext() override {}
@ -40,6 +40,6 @@ public:
StatusCallback done) override;
};
} // namespace tensorflow
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_


@ -21,7 +21,7 @@ limitations under the License.
namespace tensorflow {
class SYCLDeviceFactory : public DeviceFactory {
public:
public:
Status CreateDevices(const SessionOptions &options, const string &name_prefix,
std::vector<Device *> *devices) override {
int n = 1;
@ -31,10 +31,10 @@ public:
}
for (int i = 0; i < n; i++) {
string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20),
DeviceLocality(),
SYCLDevice::GetShortDeviceDescription(),
cl::sycl::gpu_selector(), cpu_allocator()));
devices->push_back(
new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(),
SYCLDevice::GetShortDeviceDescription(),
cl::sycl::gpu_selector(), cpu_allocator()));
}
return Status::OK();
}
@ -43,4 +43,4 @@ public:
REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200);
}
#endif // TENSORFLOW_USE_SYCL
#endif // TENSORFLOW_USE_SYCL


@ -21,6 +21,10 @@ limitations under the License.
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/platform/types.h"
@ -66,6 +70,17 @@ struct proxy_type_pod<GPUDevice, 2> {
typedef Eigen::half type;
};
#ifdef TENSORFLOW_USE_SYCL
template <>
struct proxy_type_pod<SYCLDevice, 8> {
typedef double type;
};
template <>
struct proxy_type_pod<SYCLDevice, 4> {
typedef float type;
};
#endif // TENSORFLOW_USE_SYCL
/// If POD we use proxy_type_pod, otherwise this maps to identity.
template <typename Device, typename T>
struct proxy_type {
@ -81,6 +96,10 @@ struct proxy_type {
TF_CALL_int8(m) TF_CALL_complex128(m)
#define TF_CALL_GPU_PROXY_TYPES(m) \
TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m)
#ifdef TENSORFLOW_USE_SYCL
#define TF_CALL_SYCL_PROXY_TYPES(m) \
TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
#endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
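
As an illustration of what these registrations provide (a hypothetical
compile-time check, not part of the diff, assuming proxy_type exposes the
usual ::type member it dispatches through): for POD types, proxy_type resolves
to the same-sized proxy registered above, so on SYCL a double maps to itself:

static_assert(
    std::is_same<proxy_type<SYCLDevice, double>::type, double>::value,
    "8-byte PODs use the double proxy on the SYCL device");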


@ -34,6 +34,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
#define CURRY_TYPES2(FN, arg0) \
FN(arg0, bool); \
@ -206,6 +209,52 @@ REGISTER_CAST_GPU(bfloat16, float);
#undef REGISTER_CAST_GPU
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
class SyclCastOp : public CastOpBase {
public:
explicit SyclCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
OP_REQUIRES_OK(ctx, Prepare());
}
private:
Status Prepare() {
if (src_dtype_ == dst_dtype_) {
work_ = nullptr; // Identity
return Status::OK();
}
if (src_dtype_ == DT_BOOL) {
work_ = GetSyclCastFromBool(dst_dtype_);
} else if (src_dtype_ == DT_INT32) {
work_ = GetSyclCastFromInt32(dst_dtype_);
} else if (src_dtype_ == DT_INT64) {
work_ = GetSyclCastFromInt64(dst_dtype_);
} else if (src_dtype_ == DT_FLOAT) {
work_ = GetSyclCastFromFloat(dst_dtype_);
} else if (src_dtype_ == DT_DOUBLE) {
work_ = GetSyclCastFromDouble(dst_dtype_);
}
return work_ == nullptr ? Unimplemented() : Status::OK();
}
};
#define REGISTER_CAST_SYCL(srctype, dsttype) \
REGISTER_KERNEL_BUILDER(Name("Cast") \
.TypeConstraint<srctype>("SrcT") \
.TypeConstraint<dsttype>("DstT") \
.Device(DEVICE_SYCL), \
SyclCastOp)
CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
CURRY_TYPES2(REGISTER_CAST_SYCL, int64);
CURRY_TYPES2(REGISTER_CAST_SYCL, float);
CURRY_TYPES2(REGISTER_CAST_SYCL, double);
#undef REGISTER_CAST_SYCL
#endif // TENSORFLOW_USE_SYCL
#undef CURRY_TYPES2
// HostCast differs from Cast in that its input and output are in host memory.
@ -213,5 +262,10 @@ REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp);
REGISTER_KERNEL_BUILDER(
Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"),
CpuCastOp);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(
Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
CpuCastOp);
#endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow
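
To make the registration macro concrete: expanding
CURRY_TYPES2(REGISTER_CAST_SYCL, bool) emits one kernel registration per
destination type, so (assuming float is among the types CURRY_TYPES2 covers)
the bool -> float instance is roughly:

REGISTER_KERNEL_BUILDER(Name("Cast")
                            .TypeConstraint<bool>("SrcT")
                            .TypeConstraint<float>("DstT")
                            .Device(DEVICE_SYCL),
                        SyclCastOp);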


@ -33,6 +33,16 @@ struct CastFunctor<Eigen::ThreadPoolDevice, O, I> {
}
};
#ifdef TENSORFLOW_USE_SYCL
template <typename O, typename I>
struct CastFunctor<Eigen::SyclDevice, O, I> {
void operator()(const Eigen::SyclDevice& d, typename TTypes<O>::Flat o,
typename TTypes<I>::ConstFlat i) {
o.device(d) = i.template cast<O>();
}
};
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
#define CURRY_TYPES3(FN, arg0, arg1) \
@ -140,6 +150,25 @@ GetGpuCastFromBfloat(DataType dst_dtype);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromBool(DataType dst_dtype);
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromInt32(DataType dst_dtype);
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromInt64(DataType dst_dtype);
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromFloat(DataType dst_dtype);
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromDouble(DataType dst_dtype);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_


@ -34,4 +34,14 @@ GetGpuCastFromBool(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromBool(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, SYCLDevice, bool);
return nullptr;
}
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -34,4 +34,14 @@ GetGpuCastFromDouble(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromDouble(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, SYCLDevice, double);
return nullptr;
}
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -49,4 +49,14 @@ GetGpuCastFromFloat(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromFloat(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, SYCLDevice, float);
return nullptr;
}
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -34,4 +34,14 @@ GetGpuCastFromInt32(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromInt32(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, SYCLDevice, int32);
return nullptr;
}
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -19,6 +19,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetCpuCastFromInt64(DataType dst_dtype) {
@ -34,4 +37,13 @@ GetGpuCastFromInt64(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetSyclCastFromInt64(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, SYCLDevice, int64);
return nullptr;
}
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(int64)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(bool) + sizeof(float)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
#endif // GOOGLE_CUDA
}
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
#endif // GOOGLE_CUDA
}
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);


@ -38,6 +38,14 @@ void ConcatGPU(
Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d,
const std::vector<
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
typename TTypes<T, 2>::Matrix* output);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_


@ -74,4 +74,23 @@ REGISTER(qint16)
REGISTER(qint32)
REGISTER(bfloat16)
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d,
const std::vector<
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
typename TTypes<T, 2>::Matrix* output) {
ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
output);
}
#define REGISTER_SYCL(T) \
template void ConcatSYCL<T>( \
const Eigen::SyclDevice&, \
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
typename TTypes<T, 2>::Matrix* output);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL)
#undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -126,4 +126,39 @@ void ConcatCPUImpl(
cost_per_unit, work);
}
#ifdef TENSORFLOW_USE_SYCL
template <typename T, typename ElementCopier>
void ConcatSYCLImpl(
const Eigen::SyclDevice& d,
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
int64 cost_per_unit, ElementCopier copier,
typename TTypes<T, 2>::Matrix* output) {
size_t num_inputs = inputs.size();
std::vector<ptrdiff_t> sizes;
sizes.reserve(num_inputs);
int64 row_size = 0;
for (const auto& input : inputs) {
sizes.push_back(input->dimension(1));
row_size += sizes.back();
}
T* out = &(*output)(0, 0);
std::vector<const T*> inp;
inp.reserve(num_inputs);
for (const auto& input : inputs) {
inp.push_back(&(*input)(0, 0));
}
const int64 dim0 = output->dimension(0);
for (int64 i = 0; i < dim0; ++i) {
for (int64 j = 0; j < num_inputs; ++j) {
auto size = sizes[j];
d.memcpy(out, inp[j], size * sizeof(T));
out += size;
inp[j] += size;
}
}
}
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
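
For intuition about the copy loop in ConcatSYCLImpl above: concatenating, say,
a 2x3 and a 2x2 matrix along dimension 1 issues, for each of the two output
rows, a 3-element memcpy from the first input followed by a 2-element memcpy
from the second, so every 5-element output row is assembled contiguously.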


@ -35,6 +35,9 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
#if GOOGLE_CUDA
typedef Eigen::GpuDevice GPUDevice;
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
@ -134,6 +137,12 @@ class ConcatBaseOp : public OpKernel {
return;
}
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
if (std::is_same<Device, SYCLDevice>::value) {
ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
return;
}
#endif // TENSORFLOW_USE_SYCL
ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
}
}
@ -207,6 +216,39 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type) \
REGISTER_KERNEL_BUILDER(Name("Concat") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.HostMemory("concat_dim"), \
ConcatOp<SYCLDevice, type>) \
REGISTER_KERNEL_BUILDER(Name("ConcatV2") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tidx") \
.HostMemory("axis"), \
ConcatV2Op<SYCLDevice, type>)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL);
REGISTER_KERNEL_BUILDER(Name("Concat")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.HostMemory("concat_dim")
.HostMemory("values")
.HostMemory("output"),
ConcatOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ConcatV2")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tidx")
.HostMemory("values")
.HostMemory("axis")
.HostMemory("output"),
ConcatV2Op<CPUDevice, int32>);
#undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL
class ConcatOffsetOp : public OpKernel {
public:
explicit ConcatOffsetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@ -293,4 +335,12 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
.HostMemory("offset"),
ConcatOffsetOp);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
.Device(DEVICE_SYCL)
.HostMemory("concat_dim")
.HostMemory("shape")
.HostMemory("offset"),
ConcatOffsetOp);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -57,7 +57,10 @@ REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
REGISTER_KERNEL_BUILDER( \
Name("Const").Device(DEVICE_SYCL).TypeConstraint<TYPE>("dtype"), \
ConstantOp);
TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
REGISTER_SYCL_KERNEL(bool);
REGISTER_SYCL_KERNEL(int64);
#undef REGISTER_SYCL_KERNEL
#endif
@ -112,6 +115,17 @@ REGISTER_KERNEL_BUILDER(Name("Const")
HostConstantOp);
#endif
#ifdef TENSORFLOW_USE_SYCL
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Const")
.Device(DEVICE_SYCL)
.HostMemory("output")
.TypeConstraint<int32>("dtype"),
HostConstantOp);
#endif // TENSORFLOW_USE_SYCL
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
@ -186,6 +200,7 @@ REGISTER_KERNEL(CPU, quint8);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL(SYCL, float)
REGISTER_KERNEL(SYCL, double)
REGISTER_KERNEL_BUILDER(Name("Fill")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
@ -245,6 +260,7 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CPU);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL(float, SYCL);
REGISTER_KERNEL(bool, SYCL);
REGISTER_KERNEL_BUILDER(Name("ZerosLike")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")


@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
#undef REGISTER_SYCL_KERNEL
#undef REGISTER_SYCL_REF_KERNEL
#define REGISTER_SYCL_HOST_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Enter") \
.Device(DEVICE_SYCL) \
.HostMemory("data") \
.HostMemory("output") \
.TypeConstraint<type>("T"), \
EnterOp)
#define REGISTER_SYCL_HOST_REF_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("RefEnter") \
.Device(DEVICE_SYCL) \
.HostMemory("data") \
.HostMemory("output") \
.TypeConstraint<type>("T"), \
EnterOp)
REGISTER_SYCL_HOST_KERNEL(int32);
REGISTER_SYCL_HOST_REF_KERNEL(int32);
REGISTER_SYCL_HOST_KERNEL(string);
REGISTER_SYCL_HOST_REF_KERNEL(string);
REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
#undef REGISTER_SYCL_HOST_KERNEL
#undef REGISTER_SYCL_HOST_REF_KERNEL
#endif
// Special GPU kernels for int32 and string.


@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::acos<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -18,7 +18,7 @@ limitations under the License.
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
int64);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@ -26,10 +26,19 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::add<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
REGISTER_KERNEL_BUILDER(Name("Add")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::add<int32>>);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);


@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::asin<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::atan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::ceil<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::cos<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -37,8 +37,18 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::div<TYPE>>);
REGISTER_SYCL_KERNEL(float)
REGISTER_SYCL_KERNEL(int32)
REGISTER_SYCL_KERNEL(double)
#undef REGISTER_SYCL_KERNEL
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Div")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_div<int32>>);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,


@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Equal")
BinaryOp<CPUDevice, functor::equal_to<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(BinaryOp, SYCL, "Equal", functor::equal_to, float, double);
REGISTER_KERNEL_BUILDER(Name("Equal")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::equal_to<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -21,4 +21,7 @@ REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Expm1", functor::expm1, float);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::floor<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -21,17 +21,6 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("FloorDiv") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::floor_div_real<TYPE>>);
REGISTER_SYCL_KERNEL(float)
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,
int64);
@ -51,4 +40,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("FloorDiv")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -31,4 +31,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("FloorMod")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -33,5 +33,19 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(BinaryOp, SYCL, "Greater", functor::greater, float);
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Greater")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -34,4 +34,15 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
BinaryOp<CPUDevice, functor::greater_equal<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float);
REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater_equal<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -27,6 +27,7 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isfinite<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isinf<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isnan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);
REGISTER_KERNEL_BUILDER(Name("Less")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
BinaryOp<CPUDevice, functor::less_equal<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(BinaryOp, SYCL, "LessEqual", functor::less_equal, float);
REGISTER_KERNEL_BUILDER(Name("LessEqual")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less_equal<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::log<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::log1p<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -34,4 +34,19 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
BinaryOp<CPUDevice, functor::maximum<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(BinaryOp, SYCL, "Maximum", functor::maximum, float);
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Maximum")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::maximum<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
BinaryOp<CPUDevice, functor::minimum<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(BinaryOp, SYCL, "Minimum", functor::minimum, float);
REGISTER_KERNEL_BUILDER(Name("Minimum")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::minimum<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -28,7 +28,15 @@ REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::mul<TYPE>>);
REGISTER_SYCL_KERNEL(float)
REGISTER_SYCL_KERNEL(double)
#undef REGISTER_SYCL_KERNEL
REGISTER_KERNEL_BUILDER(Name("Mul")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.HostMemory("z")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::mul<int32>>);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,


@ -27,6 +27,7 @@ REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::pow<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -36,6 +36,9 @@ REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, int64);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double, complex64, complex128);
@ -43,4 +46,7 @@ REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -20,9 +20,9 @@ REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
int32, int64);
#ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Round", functor::round, float);
REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double);
namespace functor {
DEFINE_UNARY1(round, float);
DEFINE_UNARY2(round, float, double);
} // namespace functor
#endif


@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::rsqrt<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -28,6 +28,10 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
class SelectOp : public OpKernel {
public:
@ -163,12 +167,24 @@ REGISTER_SELECT_GPU(complex128);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
// Registration of the SYCL implementations.
#define REGISTER_SELECT_SYCL(type) \
REGISTER_KERNEL_BUILDER( \
Name("Select").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
SelectOp<SYCLDevice, type>);
REGISTER_SELECT_SYCL(float);
REGISTER_SELECT_SYCL(int32);
#undef REGISTER_SELECT_SYCL
#endif // TENSORFLOW_USE_SYCL
namespace functor {
// CPU Specializations of Select functors.
template <typename T>
struct SelectFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
template <typename Device, typename T>
struct SelectFunctorBase {
void operator()(const Device& d, typename TTypes<T>::Flat out,
typename TTypes<bool>::ConstFlat cond_flat,
typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) {
@ -176,10 +192,18 @@ struct SelectFunctor<CPUDevice, T> {
}
};
// CPU Specializations of Select functors with scalar
template <typename T>
struct SelectScalarFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
struct SelectFunctor<CPUDevice, T>
: SelectFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct SelectFunctor<SYCLDevice, T>
: SelectFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
struct SelectScalarFunctorBase {
void operator()(const Device& d, typename TTypes<T>::Flat out,
TTypes<bool>::ConstScalar cond,
typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) {
@ -187,9 +211,19 @@ struct SelectScalarFunctor<CPUDevice, T> {
}
};
// CPU Specializations of Select functors with scalar
template <typename T>
struct BatchSelectFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d,
struct SelectScalarFunctor<CPUDevice, T>
: SelectScalarFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct SelectScalarFunctor<SYCLDevice, T>
: SelectScalarFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
struct BatchSelectFunctorBase {
void operator()(const Device& d,
typename TTypes<T>::Matrix output_flat_outer_dims,
TTypes<bool>::ConstVec cond_vec,
typename TTypes<T>::ConstMatrix then_flat_outer_dims,
@ -214,6 +248,15 @@ struct BatchSelectFunctor<CPUDevice, T> {
}
};
template <typename T>
struct BatchSelectFunctor<CPUDevice, T>
: BatchSelectFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct BatchSelectFunctor<SYCLDevice, T>
: BatchSelectFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
} // namespace tensorflow
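
The refactoring pattern here is worth noting: the former CPU-only functors
become device-templated *FunctorBase implementations, and the CPU and SYCL
specializations are reduced to empty subclasses, so the SYCL build reuses the
existing element-wise code instead of duplicating it.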


@ -23,6 +23,9 @@ REGISTER5(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, Eigen::half, double,
REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
double);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
Eigen::half, double, complex64, complex128);
@ -30,5 +33,8 @@ REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
Eigen::half, double);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -33,4 +33,17 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
UnaryOp<CPUDevice, functor::sign<int32>>);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Sign", functor::sign, float);
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Sign")
.Device(DEVICE_SYCL)
.HostMemory("x")
.HostMemory("y")
.TypeConstraint<int32>("T"),
UnaryOp<CPUDevice, functor::sign<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sin<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -27,8 +27,9 @@ REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sqrt<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);


@ -27,6 +27,7 @@ REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::square<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::tan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -28,6 +28,7 @@ REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::tanh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -171,6 +171,21 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
}
};
#ifdef TENSORFLOW_USE_SYCL
// Partial specialization of BinaryFunctor for SYCL devices
typedef Eigen::SyclDevice SYCLDevice;
template <typename Functor>
struct SimpleBinaryFunctor<SYCLDevice, Functor> {
void operator()(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in0,
typename Functor::tin_type in1) {
out.device(d) = in0.binaryExpr(in1, typename Functor::func());
}
};
#endif // TENSORFLOW_USE_SYCL
template <typename T>
struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
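
Like the CPU specialization of SimpleBinaryFunctor earlier in the file, the
SYCL variant evaluates in0.binaryExpr(in1, func()) and assigns the result
through out.device(d), so existing binary functors run on the SYCL device
without functor-specific changes.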


@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); }
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
BM_UNARY(cpu, Floor, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_UNARY(gpu, Floor, float, DT_FLOAT);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_UNARY(sycl, Floor, float, DT_FLOAT);
#endif // TENSORFLOW_USE_SYCL
BM_UNARY(cpu, Floor, double, DT_DOUBLE);
#if GOOGLE_CUDA
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_UNARY(sycl, Floor, double, DT_DOUBLE);
#endif // TENSORFLOW_USE_SYCL
BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
#if GOOGLE_CUDA
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
#endif // GOOGLE_CUDA
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
#if GOOGLE_CUDA
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
#endif // GOOGLE_CUDA
BM_UNARY(cpu, Rint, double, DT_DOUBLE);
#if GOOGLE_CUDA
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
#endif // GOOGLE_CUDA
BM_UNARY(cpu, Rint, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_UNARY(gpu, Rint, float, DT_FLOAT);
#endif // GOOGLE_CUDA
// data func scalar.
static Graph* BinaryScalar(int num, const string& func) {
@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) {
->Arg(1048576);
BM_BINARY_SCALAR(cpu, Less);
#if GOOGLE_CUDA
BM_BINARY_SCALAR(gpu, Less);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BINARY_SCALAR(sycl, Less);
#endif // TENSORFLOW_USE_SYCL
BM_BINARY_SCALAR(cpu, Add);
#if GOOGLE_CUDA
BM_BINARY_SCALAR(gpu, Add);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BINARY_SCALAR(sycl, Add);
#endif // TENSORFLOW_USE_SYCL
#undef BM_BINARY_SCALAR
template <class T>
@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) {
using Eigen::half;
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
#endif // GOOGLE_CUDA
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
#if GOOGLE_CUDA
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
#endif // GOOGLE_CUDA
#undef BM_BIAS_ADD_ALL
#undef BM_BIAS_ADD
@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
using Eigen::half;
#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
#endif // GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
#endif // GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
#endif // GOOGLE_CUDA
#undef BM_BIAS_ADD_GRAD_ALL
#undef BM_BIAS_ADD_GRAD
@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) {
BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
BM_BCAST_ADD_ROW_ALL(cpu);
#if GOOGLE_CUDA
BM_BCAST_ADD_ROW_ALL(gpu);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BCAST_ADD_ROW_ALL(sycl);
#endif // TENSORFLOW_USE_SYCL
#undef BM_BCAST_ADD_ROW_ALL
#undef BM_BCAST_ADD_ROW
@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu);
BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
BM_BCAST_ADD_COL(DEVICE, 4096, 512);
BM_BCAST_ADD_COL_ALL(cpu);
#if GOOGLE_CUDA
BM_BCAST_ADD_COL_ALL(gpu);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
BM_BCAST_ADD_COL_ALL(sycl);
#endif // TENSORFLOW_USE_SYCL
#undef BM_BCAST_ADD_COL_ALL
#undef BM_BCAST_ADD_COL


@ -97,6 +97,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
.TypeConstraint<type>("T"), \
DebugNanCountOp<type>);
REGISTER_GPU_DEBUG_NAN_COUNT(float);
REGISTER_GPU_DEBUG_NAN_COUNT(double);
#endif // TENSORFLOW_USE_SYCL
// Register debug numeric summary ops.
@ -129,6 +130,7 @@ REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double);
.TypeConstraint<type>("T"), \
DebugNumericSummaryOp<type>);
REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(float);
REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -152,6 +152,7 @@ typedef Eigen::SyclDevice SYCLDevice;
DenseUpdateOp<SYCLDevice, type, DenseUpdateType::SUB>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif


@ -62,6 +62,8 @@ void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
#define DEFINE_SETZERO_SYCL(T) \
template struct SetZeroFunctor<Eigen::SyclDevice, T>;
DEFINE_SETZERO_SYCL(float);
DEFINE_SETZERO_SYCL(bool);
DEFINE_SETZERO_SYCL(double);
#undef DEFINE_SETZERO_SYCL
#endif // TENSORFLOW_USE_SYCL


@ -185,6 +185,34 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
.TypeConstraint<int32>("T"),
PassOn);
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
PassOn); \
REGISTER_KERNEL_BUILDER( \
Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
PassOn);
REGISTER_SYCL_KERNELS(float);
REGISTER_SYCL_KERNELS(double);
#undef REGISTER_SYCL_KERNELS
REGISTER_KERNEL_BUILDER(Name("_ListToArray")
.Device(DEVICE_SYCL)
.HostMemory("input")
.HostMemory("output")
.TypeConstraint<int32>("T"),
PassOn);
REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
.Device(DEVICE_SYCL)
.HostMemory("input")
.HostMemory("output")
.TypeConstraint<int32>("T"),
PassOn);
#endif // TENSORFLOW_USE_SYCL
class SymbolicGradientOp : public AsyncOpKernel {
public:
SymbolicGradientOp(OpKernelConstruction* ctx)


@ -46,6 +46,9 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, bool USE_CUBLAS>
struct LaunchMatMul;
@ -118,27 +121,42 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>(
return false;
}
// On CPUs, we ignore USE_CUBLAS
template <typename T>
struct LaunchMatMulCPU {
template <typename Device, typename T>
struct LaunchMatMulBase {
static void launch(
OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
Tensor* out) {
#ifndef TENSORFLOW_USE_SYCL
// An explicit vector-matrix multiply is much better optimized than an
// implicit one and this is a bottleneck during non-batched inference.
bool was_vector = ExplicitVectorMatrixOptimization<T>(a, b, dim_pair, out);
if (!was_vector) {
functor::MatMulFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(),
#endif // TENSORFLOW_USE_SYCL
functor::MatMulFunctor<Device, T>()(ctx->eigen_device<Device>(),
out->matrix<T>(), a.matrix<T>(),
b.matrix<T>(), dim_pair);
#ifndef TENSORFLOW_USE_SYCL
}
#endif // TENSORFLOW_USE_SYCL
}
};
// On CPUs, we ignore USE_CUBLAS
template <typename T>
struct LaunchMatMulCPU : LaunchMatMulBase<CPUDevice, T> {};
template <typename T, bool USE_CUBLAS>
struct LaunchMatMul<CPUDevice, T, USE_CUBLAS> : public LaunchMatMulCPU<T> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct LaunchMatMulSYCL : LaunchMatMulBase<SYCLDevice, T> {};
template <typename T, bool USE_CUBLAS>
struct LaunchMatMul<SYCLDevice, T, USE_CUBLAS> : public LaunchMatMulSYCL<T> {};
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
template <typename T>
@ -256,6 +274,20 @@ struct MatMulFunctor<CPUDevice, T> {
}
};
#ifdef TENSORFLOW_USE_SYCL
// Partial specialization MatMulFunctor<Device=SYCLDevice, T>.
template <typename T>
struct MatMulFunctor<SYCLDevice, T> {
void operator()(
const SYCLDevice& d, typename MatMulTypes<T>::out_type out,
typename MatMulTypes<T>::in_type in0,
typename MatMulTypes<T>::in_type in1,
const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
MatMul<SYCLDevice>(d, out, in0, in1, dim_pair);
}
};
#endif // TENSORFLOW_USE_SYCL
} // end namespace functor
#define REGISTER_CPU(T) \
@ -294,4 +326,17 @@ TF_CALL_half(REGISTER_GPU);
#endif
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(T) \
REGISTER_KERNEL_BUILDER( \
Name("MatMul").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
MatMulOp<SYCLDevice, T, false /* xxblas */>); \
REGISTER_KERNEL_BUILDER(Name("MatMul") \
.Device(DEVICE_SYCL) \
.TypeConstraint<T>("T") \
.Label("eigen"), \
MatMulOp<SYCLDevice, T, false /* xxblas */>)
TF_CALL_float(REGISTER_SYCL);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
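
The `MatMul<SYCLDevice>(d, out, in0, in1, dim_pair)` helper above is essentially an Eigen tensor contraction. A standalone host-only sketch of the same computation in plain Eigen, without TensorFlow's device abstraction:

```c++
#include <unsupported/Eigen/CXX11/Tensor>

#include <iostream>

int main() {
  Eigen::Tensor<float, 2> a(2, 3);
  Eigen::Tensor<float, 2> b(3, 2);
  a.setRandom();
  b.setRandom();

  // Contract dimension 1 of `a` against dimension 0 of `b`:
  // an ordinary (2x3) * (3x2) matrix product.
  Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
      Eigen::IndexPair<Eigen::DenseIndex>(1, 0)};
  Eigen::Tensor<float, 2> c = a.contract(b, dim_pair);

  std::cout << c << std::endl;  // 2x2 result
  return 0;
}
```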


@ -167,6 +167,7 @@ REGISTER_KERNEL_BUILDER(Name("Pack")
PackOp<SYCLDevice, type>)
REGISTER_SYCL(float);
REGISTER_SYCL(double);
#undef REGISTER_SYCL
// A special GPU kernel for int32.


@ -38,6 +38,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
class PadOp : public OpKernel {
@ -199,4 +202,30 @@ REGISTER_KERNEL_BUILDER(Name("Pad")
PadOp<CPUDevice, int32>);
#endif
#ifdef TENSORFLOW_USE_SYCL
// Registration of the SYCL implementations.
#define REGISTER_SYCL_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("Pad") \
.Device(DEVICE_SYCL) \
.TypeConstraint<T>("T") \
.TypeConstraint<int32>("Tpaddings") \
.HostMemory("paddings"), \
PadOp<SYCLDevice, T>)
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Pad")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tpaddings")
.HostMemory("input")
.HostMemory("paddings")
.HostMemory("output"),
PadOp<CPUDevice, int32>);
#endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow


@ -268,6 +268,31 @@ struct ReduceFunctor<CPUDevice, Reducer>
template <typename Reducer>
struct ReduceFunctor<SYCLDevice, Reducer>
: ReduceFunctorBase<SYCLDevice, Reducer>{};
template <typename T>
struct ReduceFunctor<SYCLDevice, Eigen::internal::MeanReducer<T> > {
template <typename OUT_T, typename IN_T, typename ReductionAxes>
static void Reduce(const SYCLDevice& d, OUT_T out, IN_T in,
const ReductionAxes& reduction_axes,
const Eigen::internal::MeanReducer<T>& reducer) {
typedef typename IN_T::Index Index;
// Eigen sum reductions are much faster on GPU than mean reductions:
// Simply trigger them by computing the sum of the weighted inputs.
Index num_coeffs_to_reduce = 1;
for (int i = 0; i < Eigen::internal::array_size<ReductionAxes>::value;
++i) {
num_coeffs_to_reduce *= in.dimension(reduction_axes[i]);
}
T scale = T(1.0) / num_coeffs_to_reduce;
out.device(d) = (in * scale).sum(reduction_axes);
}
template <typename OUT_T>
static void FillIdentity(const SYCLDevice& d, OUT_T out,
const Eigen::internal::MeanReducer<T>& reducer) {
FillIdentityEigenImpl(d, out, reducer);
}
};
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
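
The `MeanReducer` specialization above rewrites a mean as a scaled sum because Eigen's sum reduction is the better-optimized path on accelerator devices. The identity it relies on is easy to verify in isolation; a host-only Eigen sketch, not SYCL code:

```c++
#include <unsupported/Eigen/CXX11/Tensor>

#include <cassert>
#include <cmath>

int main() {
  Eigen::Tensor<float, 2> in(4, 5);
  in.setRandom();

  // Reduce over dimension 0, mirroring the scaled-sum trick above.
  Eigen::array<Eigen::DenseIndex, 1> axes = {0};
  const float scale = 1.0f / in.dimension(0);

  Eigen::Tensor<float, 1> mean = in.mean(axes);
  Eigen::Tensor<float, 1> scaled_sum = (in * scale).sum(axes);

  for (Eigen::DenseIndex i = 0; i < mean.dimension(0); ++i) {
    assert(std::abs(mean(i) - scaled_sum(i)) < 1e-5f);
  }
  return 0;
}
```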


@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER(
#endif
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Max") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tidx") \
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
REGISTER_SYCL_KERNELS(float);
#undef REGISTER_SYCL_KERNELS
REGISTER_KERNEL_BUILDER(
Name("Max")
.Device(DEVICE_SYCL)
.HostMemory("reduction_indices")
.HostMemory("input")
.HostMemory("output")
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tidx"),
ReductionOp<CPUDevice, int32, Eigen::internal::MaxReducer<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -44,4 +44,17 @@ REGISTER_GPU_KERNELS(double);
#endif
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Mean") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tidx") \
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::MeanReducer<type>>);
REGISTER_SYCL_KERNELS(float);
#undef REGISTER_SYCL_KERNELS
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER(
#endif
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Min") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tidx") \
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::MinReducer<type>>);
REGISTER_SYCL_KERNELS(float);
#undef REGISTER_SYCL_KERNELS
REGISTER_KERNEL_BUILDER(
Name("Min")
.Device(DEVICE_SYCL)
.HostMemory("reduction_indices")
.HostMemory("input")
.HostMemory("output")
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tidx"),
ReductionOp<CPUDevice, int32, Eigen::internal::MinReducer<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -45,4 +45,28 @@ REGISTER_GPU_KERNELS(double);
#endif
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Prod") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tidx") \
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::ProdReducer<type>>);
REGISTER_SYCL_KERNELS(float);
REGISTER_SYCL_KERNELS(double);
#undef REGISTER_SYCL_KERNELS
REGISTER_KERNEL_BUILDER(
Name("Prod")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tidx")
.HostMemory("input")
.HostMemory("output")
.HostMemory("reduction_indices"),
ReductionOp<CPUDevice, int32, Eigen::internal::ProdReducer<int32>>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -74,7 +74,6 @@ REGISTER_KERNEL_BUILDER(
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::SumReducer<type>>);
REGISTER_SYCL_KERNELS(float);
REGISTER_SYCL_KERNELS(double);
#undef REGISTER_SYCL_KERNELS
// A special GPU kernel for int32.


@ -29,6 +29,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
#define REGISTER_RELU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
@ -131,4 +134,30 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
// Registration of the SYCL implementations.
#define REGISTER_SYCL_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
ReluOp<SYCLDevice, type>); \
REGISTER_KERNEL_BUILDER( \
Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
ReluGradOp<SYCLDevice, type>); \
REGISTER_KERNEL_BUILDER( \
Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
Relu6Op<SYCLDevice, type>); \
REGISTER_KERNEL_BUILDER( \
Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
Relu6GradOp<SYCLDevice, type>); \
REGISTER_KERNEL_BUILDER( \
Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
EluOp<SYCLDevice, type>); \
REGISTER_KERNEL_BUILDER( \
Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
EluGradOp<SYCLDevice, type>)
REGISTER_SYCL_KERNELS(float);
#undef REGISTER_SYCL_KERNELS
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
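
For reference, the scalar definitions behind the six kernels registered above; a plain sketch, not the Eigen-vectorized functors TensorFlow actually uses (TF's `EluGrad` is written in terms of the activation value, as mirrored here):

```c++
#include <algorithm>
#include <cmath>
#include <iostream>

float Relu(float x) { return std::max(x, 0.0f); }
float ReluGrad(float g, float x) { return x > 0.0f ? g : 0.0f; }

float Relu6(float x) { return std::min(std::max(x, 0.0f), 6.0f); }
float Relu6Grad(float g, float x) { return (x > 0.0f && x < 6.0f) ? g : 0.0f; }

float Elu(float x) { return x < 0.0f ? std::exp(x) - 1.0f : x; }
// Gradient expressed via the activation a = Elu(x): for x < 0,
// d/dx elu(x) = exp(x) = a + 1.
float EluGrad(float g, float a) { return a < 0.0f ? g * (a + 1.0f) : g; }

int main() {
  std::cout << Relu(-2.0f) << " " << Relu6(7.5f) << " " << Elu(-1.0f) << "\n";
  return 0;
}
```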


@ -175,6 +175,10 @@ void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
} // namespace tensorflow
#ifdef TENSORFLOW_USE_SYCL
#undef EIGEN_USE_SYCL
#endif // TENSORFLOW_USE_SYCL
#undef EIGEN_USE_THREADS
#endif // TENSORFLOW_KERNELS_RELU_OP_H_


@ -33,6 +33,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
namespace {
@ -351,4 +354,36 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
ReverseV2Op<CPUDevice, int32>);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("Reverse") \
.Device(DEVICE_SYCL) \
.TypeConstraint<T>("T") \
.HostMemory("dims"), \
ReverseOp<SYCLDevice, T>) \
REGISTER_KERNEL_BUILDER(Name("ReverseV2") \
.Device(DEVICE_SYCL) \
.TypeConstraint<T>("T") \
.TypeConstraint<int32>("Tidx") \
.HostMemory("axis"), \
ReverseV2Op<SYCLDevice, T>)
TF_CALL_float(REGISTER_SYCL_KERNELS);
REGISTER_KERNEL_BUILDER(Name("Reverse")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.HostMemory("tensor")
.HostMemory("dims")
.HostMemory("output"),
ReverseOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tidx")
.HostMemory("tensor")
.HostMemory("axis")
.HostMemory("output"),
ReverseV2Op<CPUDevice, int32>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -180,8 +180,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
#define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL);
REGISTER_SCATTER_ARITHEMTIC_SYCL(float);
REGISTER_SCATTER_UPDATE_SYCL(float);
TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL);
TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL);
#undef REGISTER_SCATTER_ARITHEMTIC_SYCL
#undef REGISTER_SCATTER_UPDATE_SYCL


@ -92,9 +92,11 @@ class RangeOp : public OpKernel {
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T)
TF_CALL_float(REGISTER_SYCL_KERNEL);
TF_CALL_double(REGISTER_SYCL_KERNEL);
TF_CALL_int32(REGISTER_SYCL_KERNEL);
TF_CALL_int64(REGISTER_SYCL_KERNEL);
#endif // TENSORFLOW_USE_SYCL
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
@ -170,4 +172,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T)
TF_CALL_float(REGISTER_SYCL_KERNEL);
TF_CALL_double(REGISTER_SYCL_KERNEL);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
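
Range materializes the arithmetic sequence start, start + delta, ... stopping strictly before limit. A scalar sketch of the semantics; the ceil-based size formula below is the usual definition, written from memory rather than copied from this file:

```c++
#include <cassert>
#include <cmath>
#include <vector>

// Range: start, start + delta, ... stopping strictly before limit.
// Assumes delta > 0 for brevity.
template <typename T>
std::vector<T> Range(T start, T limit, T delta) {
  const long long n = static_cast<long long>(
      std::ceil((limit - start) / static_cast<double>(delta)));
  std::vector<T> out;
  if (n > 0) out.reserve(static_cast<size_t>(n));
  T v = start;
  for (long long i = 0; i < n; ++i, v += delta) out.push_back(v);
  return out;
}

int main() {
  std::vector<int> r = Range(0, 10, 3);  // {0, 3, 6, 9}
  assert(r.size() == 4 && r.back() == 9);
  return 0;
}
```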


@ -201,6 +201,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
.HostMemory("output"), \
RankOp);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
// A special GPU kernel for int32 and bool.
@ -297,6 +298,43 @@ REGISTER_KERNEL_BUILDER(Name("Size")
SizeOp<int64>);
#endif
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Size") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("out_type") \
.HostMemory("output"), \
SizeOp<int32>); \
REGISTER_KERNEL_BUILDER(Name("Size") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_type") \
.HostMemory("output"), \
SizeOp<int64>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Size")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("out_type")
.HostMemory("input")
.HostMemory("output"),
SizeOp<int32>);
REGISTER_KERNEL_BUILDER(Name("Size")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int64>("out_type")
.HostMemory("input")
.HostMemory("output"),
SizeOp<int64>);
#endif // TENSORFLOW_USE_SYCL
// ExpandDims ------------------------------------
REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.Device(DEVICE_CPU)
@ -323,7 +361,30 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.HostMemory("dim")
.HostMemory("output"),
ExpandDimsOp);
#endif
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("ExpandDims") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tdim") \
.HostMemory("dim"), \
ExpandDimsOp);
REGISTER_SYCL_KERNEL(float)
REGISTER_SYCL_KERNEL(double)
#undef REGISTER_SYCL_KERNEL
REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Tdim")
.HostMemory("input")
.HostMemory("dim")
.HostMemory("output"),
ExpandDimsOp);
#endif // TENSORFLOW_USE_SYCL
// Squeeze ---------------------------------------
REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
@ -347,4 +408,24 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze")
SqueezeOp);
#endif
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \
Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
SqueezeOp);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Squeeze")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.HostMemory("input")
.HostMemory("output"),
SqueezeOp);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
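
Size, ExpandDims and Squeeze only inspect or rewrite shape metadata, which is why every variant above can keep its inputs or outputs in host memory. Their effect on a shape vector, as a small self-contained sketch (not TensorFlow code; Squeeze here models the default behavior with no squeeze_dims attr):

```c++
#include <cassert>
#include <cstdint>
#include <vector>

using Shape = std::vector<int64_t>;

// ExpandDims: insert a dimension of size 1 at `axis` (negative axes
// count from the end, as in TensorFlow).
Shape ExpandDims(Shape s, int axis) {
  if (axis < 0) axis += static_cast<int>(s.size()) + 1;
  s.insert(s.begin() + axis, 1);
  return s;
}

// Squeeze: drop all dimensions of size 1.
Shape Squeeze(const Shape& s) {
  Shape out;
  for (int64_t d : s)
    if (d != 1) out.push_back(d);
  return out;
}

int main() {
  Shape s = {2, 3};
  assert((ExpandDims(s, 0) == Shape{1, 2, 3}));
  assert((ExpandDims(s, -1) == Shape{2, 3, 1}));
  assert((Squeeze({1, 2, 1, 3}) == Shape{2, 3}));
  return 0;
}
```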


@ -28,17 +28,27 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
// Partial specialization for a CPUDevice, that uses the Eigen implementation
// from SoftmaxEigenImpl.
namespace functor {
template <typename T>
struct SoftmaxFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
template <typename Device, typename T>
struct SoftmaxFunctorBase {
void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::Matrix softmax, const bool log) {
SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax, log);
SoftmaxEigenImpl<Device, T>::Compute(d, logits, softmax, log);
}
};
template <typename T>
struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
#define REGISTER_CPU(T) \
@ -76,4 +86,10 @@ REGISTER_KERNEL_BUILDER(
SoftmaxOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(
Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
SoftmaxOp<SYCLDevice, float>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
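
The refactor above (and the identical one applied to `XentFunctor` later in this diff) follows a simple pattern: write the Eigen implementation once against a `Device` template parameter, then make each supported device an empty specialization of the public functor. Stripped of TensorFlow specifics, a sketch:

```c++
#include <iostream>

struct CPUDevice {};
struct SYCLDevice {};

// The implementation is written once against a generic Device...
template <typename Device, typename T>
struct ScaleFunctorBase {
  T operator()(const Device&, T x) const { return x * T(2); }
};

// ...and the primary template is left undefined, so only devices with
// an explicit (empty) specialization compile.
template <typename Device, typename T>
struct ScaleFunctor;

template <typename T>
struct ScaleFunctor<CPUDevice, T> : ScaleFunctorBase<CPUDevice, T> {};

template <typename T>
struct ScaleFunctor<SYCLDevice, T> : ScaleFunctorBase<SYCLDevice, T> {};

int main() {
  CPUDevice cpu;
  std::cout << ScaleFunctor<CPUDevice, float>()(cpu, 21.0f) << "\n";  // 42
  return 0;
}
```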


@ -99,6 +99,9 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
#endif // TENSORFLOW_USE_SYCL
class UnstageOp : public OpKernel {
public:
@ -126,5 +129,8 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
#endif
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -450,4 +450,71 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type) \
REGISTER_KERNEL_BUILDER(Name("StridedSlice") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.HostMemory("begin") \
.HostMemory("end") \
.HostMemory("strides") \
.TypeConstraint<int32>("Index"), \
StridedSliceOp<SYCLDevice, type>) \
REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.HostMemory("shape") \
.HostMemory("begin") \
.HostMemory("end") \
.HostMemory("strides") \
.TypeConstraint<int32>("Index"), \
StridedSliceGradOp<SYCLDevice, type>)\
REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.HostMemory("begin") \
.HostMemory("end") \
.HostMemory("strides") \
.TypeConstraint<int32>("Index"), \
StridedSliceAssignOp<SYCLDevice, type>)
REGISTER_SYCL(float);
REGISTER_SYCL(double);
// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("StridedSlice")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Index")
.HostMemory("input")
.HostMemory("begin")
.HostMemory("end")
.HostMemory("strides")
.HostMemory("output"),
StridedSliceOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Index")
.HostMemory("shape")
.HostMemory("begin")
.HostMemory("end")
.HostMemory("strides")
.HostMemory("dy")
.HostMemory("output"),
StridedSliceGradOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.TypeConstraint<int32>("Index")
.HostMemory("ref")
.HostMemory("begin")
.HostMemory("end")
.HostMemory("strides"),
StridedSliceAssignOp<CPUDevice, int32>)
#undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -285,6 +285,20 @@ DECLARE_FOR_N_GPU(int32);
TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
DECLARE_FOR_N_CPU(bfloat16);
#ifdef TENSORFLOW_USE_SYCL
#define PREVENT_FOR_N_SYCL(T) \
PREVENT_INSTANTIATE(T, STRIDED_SLICE_INSTANTIATE_DIM)
#define DECLARE_FOR_N_SYCL(T) \
INSTANTIATE(SYCLDevice, T, STRIDED_SLICE_INSTANTIATE_DIM)
TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_SYCL);
DECLARE_FOR_N_SYCL(int32);
#undef DECLARE_FOR_N_SYCL
#endif // TENSORFLOW_USE_SYCL
#undef INSTANTIATE
#undef DECLARE_FOR_N_CPU
#undef DECLARE_FOR_N_GPU


@ -260,6 +260,8 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#ifdef TENSORFLOW_USE_SYCL
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
#endif // TENSORFLOW_USE_SYCL
#undef HANDLE_TYPE_NAME_CPU
@ -506,6 +508,16 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif // GOOGLE_CUDA
#if TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T) \
HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
#undef HANDLE_TYPE_NAME_SYCL
#endif // TENSORFLOW_USE_SYCL
#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#undef HANDLE_CASE_DIM
@ -605,6 +617,25 @@ REGISTER_KERNEL_BUILDER(Name("Tile")
.TypeConstraint<int32>("Tmultiples")
.HostMemory("multiples"),
TileOp<SYCLDevice>);
REGISTER_KERNEL_BUILDER(Name("Tile")
.Device(DEVICE_SYCL)
.TypeConstraint<double>("T")
.TypeConstraint<int32>("Tmultiples")
.HostMemory("multiples"),
TileOp<SYCLDevice>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
.Device(DEVICE_SYCL)
.TypeConstraint<float>("T")
.TypeConstraint<int32>("Tmultiples")
.HostMemory("multiples"),
TileGradientOp<SYCLDevice>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
.Device(DEVICE_SYCL)
.TypeConstraint<double>("T")
.TypeConstraint<int32>("Tmultiples")
.HostMemory("multiples"),
TileGradientOp<SYCLDevice>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow


@ -70,6 +70,8 @@ typedef Eigen::SyclDevice SYCLDevice;
#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
TF_CALL_float(DEFINE_TYPE);
TF_CALL_double(DEFINE_TYPE);
TF_CALL_int32(DEFINE_TYPE);
#undef DEFINE_DIM
#undef DEFINE_TYPE
@ -81,6 +83,8 @@ TF_CALL_float(DEFINE_TYPE);
#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
TF_CALL_float(DEFINE_TYPE);
TF_CALL_double(DEFINE_TYPE);
TF_CALL_int32(DEFINE_TYPE);
#undef DEFINE_DIM
#undef DEFINE_TYPE


@ -423,6 +423,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
TF_CALL_float(REGISTER_SYCL_KERNELS);
TF_CALL_double(REGISTER_SYCL_KERNELS);
#undef REGISTER_SYCL_KERNELS
#endif
@ -2355,6 +2356,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
TF_CALL_float(REGISTER_SYCL_KERNELS);
TF_CALL_double(REGISTER_SYCL_KERNELS);
#endif
#if GOOGLE_CUDA


@ -127,6 +127,7 @@ Status DoTranspose<SYCLDevice>(const SYCLDevice& d, const Tensor& in,
switch (in.dtype()) {
case DT_FLOAT:
case DT_DOUBLE:
case DT_INT32:
internal::Transpose<SYCLDevice, uint32>(d, in, perm, out);
break;
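
Folding several dtypes into a single `internal::Transpose<SYCLDevice, uint32>` instantiation works because a transpose only relocates whole elements, so any 32-bit element can be moved by its bit pattern (8-byte types such as double would need the analogous 64-bit instantiation). A host-side sketch of the idea, not TensorFlow code:

```c++
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Transpose a rows x cols matrix of 32-bit elements, treating each
// element as an opaque 4-byte bit pattern.
void Transpose32(const void* in, void* out, int rows, int cols) {
  const unsigned char* src = static_cast<const unsigned char*>(in);
  unsigned char* dst = static_cast<unsigned char*>(out);
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      std::memcpy(dst + (c * rows + r) * 4, src + (r * cols + c) * 4, 4);
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // 2x3
  std::vector<float> out(6);
  Transpose32(in.data(), out.data(), 2, 3);
  for (float v : out) std::cout << v << " ";  // 1 4 2 5 3 6
  std::cout << "\n";
  return 0;
}
```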


@ -82,6 +82,15 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
.HostMemory("y"),
InvertPermutationOp);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
.HostMemory("x")
.HostMemory("y"),
InvertPermutationOp);
#endif // TENSORFLOW_USE_SYCL
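
`InvertPermutation` is registered with `HostMemory("x")` and `HostMemory("y")`, so even under `DEVICE_SYCL` it runs on the host; the computation itself is a single scatter. Its semantics, sketched outside TensorFlow:

```c++
#include <cassert>
#include <cstdint>
#include <vector>

// InvertPermutation: for a permutation p of 0..n-1, produce q with
// q[p[i]] = i, so that q(p(x)) == x.
std::vector<int32_t> InvertPermutation(const std::vector<int32_t>& p) {
  std::vector<int32_t> q(p.size());
  for (int32_t i = 0; i < static_cast<int32_t>(p.size()); ++i) {
    q[p[i]] = i;
  }
  return q;
}

int main() {
  std::vector<int32_t> p = {2, 0, 3, 1};
  std::vector<int32_t> q = InvertPermutation(p);  // {1, 3, 0, 2}
  for (size_t i = 0; i < p.size(); ++i) {
    assert(q[p[i]] == static_cast<int32_t>(i));
  }
  return 0;
}
```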
// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
// shuffles the dimensions of the input tensor according to permutation.
@ -201,4 +210,24 @@ TF_CALL_POD_TYPES(REGISTER);
#undef REGISTER
#endif
#ifdef TENSORFLOW_USE_SYCL
Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
gtl::ArraySlice<int32> perm, Tensor* out) {
typedef Eigen::SyclDevice SYCLDevice;
return ::tensorflow::DoTranspose(ctx->eigen_device<SYCLDevice>(), in, perm,
out);
}
#define REGISTER(T) \
REGISTER_KERNEL_BUILDER(Name("Transpose") \
.Device(DEVICE_SYCL) \
.TypeConstraint<T>("T") \
.TypeConstraint<int32>("Tperm") \
.HostMemory("perm"), \
TransposeSyclOp);
REGISTER(float);
REGISTER(bool);
REGISTER(int32);
#undef REGISTER
#endif
} // namespace tensorflow


@ -50,6 +50,17 @@ class TransposeGpuOp : public TransposeOp {
gtl::ArraySlice<int32> perm, Tensor* out) override;
};
#ifdef TENSORFLOW_USE_SYCL
class TransposeSyclOp : public TransposeOp {
public:
explicit TransposeSyclOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
protected:
Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
gtl::ArraySlice<int32> perm, Tensor* out) override;
};
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_


@ -160,6 +160,7 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
UnpackOp<SYCLDevice, type>)
REGISTER_SYCL(float);
REGISTER_SYCL(double);
#undef REGISTER_SYCL
// A special SYCL kernel for int32.


@ -58,8 +58,9 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
IsVariableInitializedOp);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
// Only register 'Variable' on GPU for the subset of types also supported by


@ -28,6 +28,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
class SoftmaxXentWithLogitsOp : public OpKernel {
@ -74,17 +77,25 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
// Partial specialization for a CPUDevice, that uses the Eigen implementation
// from XentEigenImpl.
namespace functor {
template <typename T>
struct XentFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
template <typename Device, typename T>
struct XentFunctorBase {
void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::ConstMatrix labels,
typename TTypes<T>::Matrix scratch,
typename TTypes<T>::Vec loss,
typename TTypes<T>::Matrix backprop) {
XentEigenImpl<CPUDevice, T>::Compute(d, logits, labels, scratch, loss,
XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
backprop);
}
};
template <typename T>
struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
#define REGISTER_CPU(T) \
@ -111,4 +122,11 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
SoftmaxXentWithLogitsOp<GPUDevice, double>);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_SYCL)
.TypeConstraint<float>("T"),
SoftmaxXentWithLogitsOp<SYCLDevice, float>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
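
As with Softmax above, only the float kernel is registered for SYCL. For reference, a per-row scalar sketch of what `XentEigenImpl` computes: max-subtracted softmax for numerical stability, the cross-entropy loss, and backprop = softmax - labels. This is an illustration, not the vectorized Eigen code:

```c++
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Softmax cross-entropy for one row of logits.
float XentRow(const std::vector<float>& logits,
              const std::vector<float>& labels,
              std::vector<float>* backprop) {
  float mx = logits[0];
  for (float l : logits) mx = std::max(mx, l);
  float sum = 0.0f;
  for (float l : logits) sum += std::exp(l - mx);
  float loss = 0.0f;
  backprop->resize(logits.size());
  for (size_t j = 0; j < logits.size(); ++j) {
    const float p = std::exp(logits[j] - mx) / sum;  // softmax
    loss -= labels[j] * std::log(p);
    (*backprop)[j] = p - labels[j];
  }
  return loss;
}

int main() {
  std::vector<float> bp;
  float loss = XentRow({1.0f, 2.0f, 3.0f}, {0.0f, 0.0f, 1.0f}, &bp);
  std::printf("loss=%f bp0=%f\n", loss, bp[0]);
  return 0;
}
```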


@ -390,7 +390,7 @@ class TestOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_CPU), TestOp);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_SYCL), TestOp);
#endif // TENSORFLOW_USE_SYCL
#endif // TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, Error_Reporting) {
auto x = test::AsTensor<float>({-3.f});
@ -707,6 +707,8 @@ TEST_F(MathGradTest, Pow) {
}
}
// TODO(lukeiwanski): Implement Complex Pow for SYCL
#ifndef TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, ComplexPow) {
auto x = test::AsTensor<complex64>({0.f, 2.f, -2.f}, TensorShape({3}));
auto y = test::AsTensor<complex64>({2.f, 2.f, 2.f}, TensorShape({3}));
@ -725,6 +727,7 @@ TEST_F(MathGradTest, ComplexPow) {
dy, test::AsTensor<complex64>({h(0.f, 2.f), h(2.f, 2.f), h(-2.f, 2.f)},
TensorShape({3})));
}
#endif // TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, Maximum) {
auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
@ -886,6 +889,8 @@ TEST_F(MathGradTest, MatMul_11) {
test::ExpectClose(dy, MatMul(dz, true, x, true));
}
// TODO(lukeiwanski): Implement BatchMatMul for SYCL
#ifndef TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, BatchMatMul_00) {
auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
TensorShape({1, 2, 3}));
@ -933,6 +938,7 @@ TEST_F(MathGradTest, BatchMatMul_11) {
test::ExpectClose(dx, BatchMatMul(y, true, dz, true));
test::ExpectClose(dy, BatchMatMul(dz, true, x, true));
}
#endif // TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, Sum_dim0) {
auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},


@ -706,7 +706,7 @@ for production (though it will mature over time).
##### Download and install OpenCL drivers
The exact steps required for a functional OpenCL installation will depend on
your environment. For Unbuntu 14.04, the following steps are known to work:
your environment. For Ubuntu 14.04, the following steps are known to work:
```bash
sudo apt-get install ocl-icd-opencl-dev opencl-headers
@ -727,12 +727,17 @@ and copy the files into e.g. `/usr/local/computecpp`:
```bash
tar -xvzf ComputeCpp-CE-0.1.1-Ubuntu.14.04-64bit.tar.gz
sudo mkdir /usr/local/computecpp
sudo cp -R ComputeCpp-CE-0.1.1-Linux /usr/local/computecpp
sudo chmod -R a+r /usr/local/computecpp/
sudo chmod -R a+x /usr/local/computecpp/bin
```
Add ComputeCpp's `lib` folder to your `LD_LIBRARY_PATH` so that Python can find `libComputeCpp.so`, for example by adding the following line to your `~/.bash_profile`:
```bash
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/computecpp/lib"
```
### Prepare environment for Mac OS X
We recommend using [homebrew](http://brew.sh) to install the bazel dependency,
@ -1305,6 +1310,12 @@ This is typically because you have the Xcode build tools installed, but you stil
When importing tensorflow, you may see an "ImportError" raised. Below
are some possible examples and solutions:
```
ImportError: cannot import name 'pywrap_tensorflow'
```
This can occur if you try to import tensorflow while your current working directory is the TensorFlow source directory. If that is the case, change the working directory (e.g. `cd` in bash or `os.chdir` in Python) to a folder outside the TensorFlow directory and try importing tensorflow again.
```
ImportError: /lib64/libc.so.6: version `GLIBC_2.16' not found (required by ..._pywrap_tensorflow.so)
```
@ -1323,3 +1334,8 @@ directory, and so tries to import directly from the source code instead of
your installed tensorflow package. Solution: don't import tensorflow
from the tensorflow source code root directory.
```
ImportError: libComputeCpp.so: cannot open shared object file: No such file or directory
```
Make sure you have added the path to ComputeCpp's `lib` folder to your `LD_LIBRARY_PATH` (as mentioned above).


@ -34,7 +34,7 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
# GPU test
if test.is_gpu_available():
self.assertGreater(len(devices), 1)
self.assertTrue("GPU" in [d.device_type for d in devices])
self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
if __name__ == "__main__":


@ -44,7 +44,14 @@ from tensorflow.python.platform import googletest
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util.protobuf import compare
from tensorflow.python.client import device_lib
def gpu_device_name():
"""Returns the name of a GPU device if available or the empty string."""
for x in device_lib.list_local_devices():
if x.device_type == 'GPU' or x.device_type == 'SYCL':
return x.name
return ''
def assert_ops_in_graph(expected_ops, graph):
"""Assert all expected operations are found.
@ -301,7 +308,12 @@ class TensorFlowTestCase(googletest.TestCase):
sess = self._cached_session
with sess.graph.as_default(), sess.as_default():
if force_gpu:
with sess.graph.device("/gpu:0"):
# Use the name of an actual device if one is detected, or '/gpu:0'
# otherwise
gpu_name = gpu_device_name()
if len(gpu_name) == 0:
gpu_name = '/gpu:0'
with sess.graph.device(gpu_name):
yield sess
elif use_gpu:
yield sess
@ -311,7 +323,12 @@ class TensorFlowTestCase(googletest.TestCase):
else:
with session.Session(graph=graph, config=prepare_config(config)) as sess:
if force_gpu:
with sess.graph.device("/gpu:0"):
# Use the name of an actual device if one is detected, or '/gpu:0'
# otherwise
gpu_name = gpu_device_name()
if len(gpu_name) == 0:
gpu_name = '/gpu:0'
with sess.graph.device(gpu_name):
yield sess
elif use_gpu:
yield sess


@ -1342,7 +1342,7 @@ class ControlFlowTest(test.TestCase):
def _testWhileGrad_ColocateGradients(self, colocate):
gpu_dev_name = test.gpu_device_name() if test.is_gpu_available() else "/gpu:0"
gpu_short_name = gpu_dev_name.split('/')[-1]
gpu_short_name = gpu_dev_name.split('/')[-1].lower()
with self.test_session(graph=ops.Graph()) as sess:
v = constant_op.constant(2.0, name="v")


@ -31,7 +31,7 @@ class StageTest(test.TestCase):
with ops.device('/cpu:0'):
x = array_ops.placeholder(dtypes.float32)
v = 2. * (array_ops.zeros([1024, 1024]) + x)
with ops.device('/gpu:0'):
with ops.device(test.gpu_device_name()):
stager = data_flow_ops.StagingArea([dtypes.float32])
stage = stager.put([v])
y = stager.get()
@ -46,7 +46,7 @@ class StageTest(test.TestCase):
with ops.device('/cpu:0'):
x = array_ops.placeholder(dtypes.float32)
v = 2. * (array_ops.zeros([128, 128]) + x)
with ops.device('/gpu:0'):
with ops.device(test.gpu_device_name()):
stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32])
stage = stager.put([x, v])
z, y = stager.get()
@ -62,7 +62,7 @@ class StageTest(test.TestCase):
with ops.device('/cpu:0'):
x = array_ops.placeholder(dtypes.float32)
v = 2. * (array_ops.zeros([128, 128]) + x)
with ops.device('/gpu:0'):
with ops.device(test.gpu_device_name()):
stager = data_flow_ops.StagingArea(
[dtypes.float32, dtypes.float32],
shapes=[[], [128, 128]],


@ -40,6 +40,7 @@ from tensorflow.python.util.all_util import remove_undocumented
# pylint: disable=unused-import
from tensorflow.python.framework.test_util import assert_equal_graph_def
from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase
from tensorflow.python.framework.test_util import gpu_device_name
from tensorflow.python.ops.gradient_checker import compute_gradient_error
from tensorflow.python.ops.gradient_checker import compute_gradient
@ -105,15 +106,6 @@ def is_gpu_available(cuda_only=False):
return any((x.device_type == 'GPU' or x.device_type == 'SYCL')
for x in _device_lib.list_local_devices())
def gpu_device_name():
"""Returns the name of a GPU device if available or the empty string."""
for x in _device_lib.list_local_devices():
if x.device_type == 'GPU' or x.device_type == 'SYCL':
return x.name
return ''
_allowed_symbols = [
# We piggy-back googletest documentation.
'Benchmark',

Some files were not shown because too many files have changed in this diff.