Merge pull request #26457 from ROCmSoftwarePlatform:google_upstream_softplus_op

PiperOrigin-RevId: 238465926
This commit is contained in:
TensorFlower Gardener 2019-03-14 10:21:17 -07:00
commit 500d747dad
9 changed files with 60 additions and 49 deletions

View File

@ -23,6 +23,10 @@ limitations under the License.
#include "cuda/include/cudnn.h" #include "cuda/include/cudnn.h"
#endif #endif
#if TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_runtime.h"
#endif
#ifdef EIGEN_USE_LIBXSMM #ifdef EIGEN_USE_LIBXSMM
#include "include/libxsmm.h" #include "include/libxsmm.h"
#endif #endif
@ -109,6 +113,36 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
strings::StrCat(properties.major, ".", properties.minor); strings::StrCat(properties.major, ".", properties.minor);
(*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION); (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
(*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION); (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
#elif TENSORFLOW_USE_ROCM
hipDeviceProp_t properties;
hipError_t error =
hipGetDeviceProperties(&properties, platform_gpu_id.value());
if (error != hipSuccess) {
device.set_type("UNKNOWN");
LOG(ERROR) << "Failed to get device properties, error code: " << error;
return device;
}
// ROCM TODO review if numbers here are valid
device.set_vendor("Advanced Micro Devices, Inc");
device.set_model(properties.name);
device.set_frequency(properties.clockRate * 1e-3);
device.set_num_cores(properties.multiProcessorCount);
device.set_num_registers(properties.regsPerBlock);
device.set_l1_cache_size(16 * 1024);
device.set_l2_cache_size(properties.l2CacheSize);
device.set_l3_cache_size(0);
device.set_shared_memory_size_per_multiprocessor(
properties.maxSharedMemoryPerMultiProcessor);
device.set_memory_size(properties.totalGlobalMem);
// 8 is the number of bits per byte. 2 is accounted for
// double data rate (DDR).
device.set_bandwidth(properties.memoryBusWidth / 8 *
properties.memoryClockRate * 2);
(*device.mutable_environment())["architecture"] =
strings::StrCat("gfx", properties.gcnArch);
#endif #endif
return device; return device;

View File

@ -87,7 +87,7 @@ void SoftplusGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
TF_CALL_FLOAT_TYPES(REGISTER_KERNELS); TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
#undef REGISTER_KERNELS #undef REGISTER_KERNELS
#if GOOGLE_CUDA #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Forward declarations of the functor specializations for GPU. // Forward declarations of the functor specializations for GPU.
namespace functor { namespace functor {
#define DECLARE_GPU_SPEC(T) \ #define DECLARE_GPU_SPEC(T) \
@ -119,6 +119,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS #undef REGISTER_GPU_KERNELS
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow } // namespace tensorflow

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#if GOOGLE_CUDA #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
@ -37,4 +37,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
} // end namespace tensorflow } // end namespace tensorflow
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -20,6 +20,8 @@ limitations under the License.
#if TENSORFLOW_USE_ROCM #if TENSORFLOW_USE_ROCM
#define __HIP_DISABLE_CPP_FUNCTIONS__
#include "rocm/include/hip/hip_complex.h" #include "rocm/include/hip/hip_complex.h"
#include "rocm/include/hip/hip_runtime.h" #include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h" #include "rocm/include/hiprand/hiprand.h"

View File

@ -158,27 +158,27 @@ port::StatusOr<void*> GetCudnnDsoHandle() {
port::StatusOr<void*> GetRocblasDsoHandle() { port::StatusOr<void*> GetRocblasDsoHandle() {
static auto result = new auto(DsoLoader::GetRocblasDsoHandle()); static auto result = new auto(DsoLoader::GetRocblasDsoHandle());
return result; return *result;
} }
port::StatusOr<void*> GetMiopenDsoHandle() { port::StatusOr<void*> GetMiopenDsoHandle() {
static auto result = new auto(DsoLoader::GetMiopenDsoHandle()); static auto result = new auto(DsoLoader::GetMiopenDsoHandle());
return result; return *result;
} }
port::StatusOr<void*> GetRocfftDsoHandle() { port::StatusOr<void*> GetRocfftDsoHandle() {
static auto result = new auto(DsoLoader::GetRocfftDsoHandle()); static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
return result; return *result;
} }
port::StatusOr<void*> GetRocrandDsoHandle() { port::StatusOr<void*> GetRocrandDsoHandle() {
static auto result = new auto(DsoLoader::GetRocrandDsoHandle()); static auto result = new auto(DsoLoader::GetRocrandDsoHandle());
return result; return *result;
} }
port::StatusOr<void*> GetHipDsoHandle() { port::StatusOr<void*> GetHipDsoHandle() {
static auto result = new auto(DsoLoader::GetHipDsoHandle()); static auto result = new auto(DsoLoader::GetHipDsoHandle());
return result; return *result;
} }
} // namespace CachedDsoLoader } // namespace CachedDsoLoader

View File

@ -232,7 +232,6 @@ cc_library(
"//tensorflow/core:lib_internal", "//tensorflow/core:lib_internal",
"//tensorflow/stream_executor:dnn", "//tensorflow/stream_executor:dnn",
"//tensorflow/stream_executor:event", "//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:logging_proto_cc",
"//tensorflow/stream_executor:plugin_registry", "//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:scratch_allocator", "//tensorflow/stream_executor:scratch_allocator",
"//tensorflow/stream_executor:stream_executor_pimpl_header", "//tensorflow/stream_executor:stream_executor_pimpl_header",

View File

@ -500,7 +500,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
hipDeviceptr_t location, hipDeviceptr_t location,
uint8 value, size_t size) { uint8 value, size_t size) {
ScopedActivateContext activation{context}; ScopedActivateContext activation{context};
hipError_t res = tensorflow::wrap::hipMemset(location, value, size); hipError_t res = tensorflow::wrap::hipMemsetD8(location, value, size);
if (res != hipSuccess) { if (res != hipSuccess) {
LOG(ERROR) << "failed to memset memory: " << ToString(res); LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false; return false;
@ -514,15 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
size_t uint32_count) { size_t uint32_count) {
ScopedActivateContext activation{context}; ScopedActivateContext activation{context};
void* pointer = absl::bit_cast<void*>(location); void* pointer = absl::bit_cast<void*>(location);
unsigned char valueC = static_cast<unsigned char>(value); hipError_t res = tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count);
uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
if (value32 != value) {
// mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
LOG(ERROR) << "failed to memset memory";
return false;
}
hipError_t res = tensorflow::wrap::hipMemset(pointer, static_cast<int>(value),
uint32_count * 4);
if (res != hipSuccess) { if (res != hipSuccess) {
LOG(ERROR) << "failed to memset memory: " << ToString(res); LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false; return false;
@ -553,17 +545,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
GpuStreamHandle stream) { GpuStreamHandle stream) {
ScopedActivateContext activation{context}; ScopedActivateContext activation{context};
void* pointer = absl::bit_cast<void*>(location); void* pointer = absl::bit_cast<void*>(location);
hipError_t res =
// FIXME - need to set a 32-bit value here tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream);
unsigned char valueC = static_cast<unsigned char>(value);
uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
if (value32 != value) {
// mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
LOG(ERROR) << "failed to memset memory";
return false;
}
hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value,
uint32_count * 4, stream);
if (res != hipSuccess) { if (res != hipSuccess) {
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
return false; return false;
@ -671,7 +654,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
uint64 bytes) { uint64 bytes) {
ScopedActivateContext activated{context}; ScopedActivateContext activated{context};
hipDeviceptr_t result = 0; hipDeviceptr_t result = 0;
hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes); hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
if (res != hipSuccess) { if (res != hipSuccess) {
LOG(ERROR) << "failed to allocate " LOG(ERROR) << "failed to allocate "
<< port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@ -717,8 +700,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
ScopedActivateContext activation{context}; ScopedActivateContext activation{context};
void* host_mem = nullptr; void* host_mem = nullptr;
// "Portable" memory is visible to all ROCM contexts. Safe for our use model. // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
hipError_t res = tensorflow::wrap::hipHostMallocVanilla( hipError_t res =
&host_mem, bytes, hipHostMallocPortable); tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
if (res != hipSuccess) { if (res != hipSuccess) {
LOG(ERROR) << "failed to alloc " << bytes LOG(ERROR) << "failed to alloc " << bytes
<< " bytes on host: " << ToString(res); << " bytes on host: " << ToString(res);

View File

@ -20,6 +20,8 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_ #ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_ #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
#define __HIP_DISABLE_CPP_FUNCTIONS__
#include "rocm/include/hip/hip_runtime.h" #include "rocm/include/hip/hip_runtime.h"
#include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/platform/dso_loader.h" #include "tensorflow/stream_executor/platform/dso_loader.h"
@ -48,21 +50,6 @@ namespace wrap {
#define TO_STR_(x) #x #define TO_STR_(x) #x
#define TO_STR(x) TO_STR_(x) #define TO_STR(x) TO_STR_(x)
// hipMalloc and hipHostMalloc are defined as funtion templates in the
// HIP header files, and hence their names get mangled and the attempt
// to resolve their name when trying to dynamically load them will fail
// Updating the HIP header files to make them C functions is underway.
// Until that change flows through, we will workaround the issue by
// creating dummy wrappers for them here
hipError_t hipMallocVanilla(void** ptr, size_t size) {
return hipErrorNotInitialized;
}
hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
return hipErrorNotInitialized;
}
#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName) \ #define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName) \
template <typename... Args> \ template <typename... Args> \
auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \ auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
@ -107,9 +94,11 @@ hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
__macro(hipGetDeviceCount) \ __macro(hipGetDeviceCount) \
__macro(hipGetDeviceProperties) \ __macro(hipGetDeviceProperties) \
__macro(hipHostFree) \ __macro(hipHostFree) \
__macro(hipHostMalloc) \
__macro(hipHostRegister) \ __macro(hipHostRegister) \
__macro(hipHostUnregister) \ __macro(hipHostUnregister) \
__macro(hipInit) \ __macro(hipInit) \
__macro(hipMalloc) \
__macro(hipMemGetAddressRange) \ __macro(hipMemGetAddressRange) \
__macro(hipMemGetInfo) \ __macro(hipMemGetInfo) \
__macro(hipMemcpyDtoD) \ __macro(hipMemcpyDtoD) \
@ -119,7 +108,10 @@ hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
__macro(hipMemcpyHtoD) \ __macro(hipMemcpyHtoD) \
__macro(hipMemcpyHtoDAsync) \ __macro(hipMemcpyHtoDAsync) \
__macro(hipMemset) \ __macro(hipMemset) \
__macro(hipMemsetD32) \
__macro(hipMemsetD8) \
__macro(hipMemsetAsync) \ __macro(hipMemsetAsync) \
__macro(hipMemsetD32Async) \
__macro(hipModuleGetFunction) \ __macro(hipModuleGetFunction) \
__macro(hipModuleGetGlobal) \ __macro(hipModuleGetGlobal) \
__macro(hipModuleLaunchKernel) \ __macro(hipModuleLaunchKernel) \

View File

@ -1234,6 +1234,7 @@ def tf_gpu_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
"@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cuda_headers",
]) + if_rocm_is_configured(cuda_deps + [ ]) + if_rocm_is_configured(cuda_deps + [
# rocm_header placeholder # rocm_header placeholder
# rocm_header placeholder
]), ]),
copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"])), copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
**kwargs **kwargs