From af54994072bda083229fd11cb2b1d58e2cd38ab0 Mon Sep 17 00:00:00 2001
From: Eugene Kuznetsov
Date: Wed, 15 Jan 2020 17:12:25 -0800
Subject: [PATCH] Implementing GpuManagedAllocator for ROCm

Enabling several common runtime unit tests for ROCm
---
 .../common_runtime/gpu/gpu_device_test.cc     |  2 +-
 .../common_runtime/gpu/gpu_event_mgr_test.cc  |  4 ++--
 .../gpu/gpu_managed_allocator.cc              | 18 +++++++++++++---
 .../common_runtime/gpu/pool_allocator_test.cc | 21 ++++++++++++-------
 .../hierarchical_tree_broadcaster_test.cc     |  8 +++----
 .../core/common_runtime/memory_types_test.cc  | 12 +++++------
 tensorflow/core/common_runtime/placer_test.cc |  2 +-
 .../process_function_library_runtime_test.cc  |  6 +++---
 .../core/common_runtime/ring_gatherer_test.cc |  8 +++----
 .../core/common_runtime/ring_reducer_test.cc  |  8 +++----
 10 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index c7627e07b4f..fd77aa379b1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index 966956dd5ae..680aec1ab29 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 
@@ -859,4 +859,4 @@ BENCHMARK(BM_chain_1M_100_true)->Arg(8);
 }  // namespace
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
index d0f68987a5c..695a06c972d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
@@ -13,30 +13,42 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
 #define EIGEN_USE_GPU
 #endif
 
+#if TENSORFLOW_USE_ROCM
+#include "rocm/include/hip/hip_runtime.h"
+#define EIGEN_USE_GPU
+#endif
+
 #include "tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h"
 
 namespace tensorflow {
 
 void* GpuManagedAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* ptr = nullptr;
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA
   CUdeviceptr result = 0;
   CHECK_EQ(cuMemAllocManaged(&result, num_bytes, CU_MEM_ATTACH_GLOBAL),
            CUDA_SUCCESS);
   ptr = reinterpret_cast<void*>(result);
+#elif TENSORFLOW_USE_ROCM
+  void** result = 0;
+  CHECK_EQ(hipHostMalloc(&result, num_bytes, 0),
+           0);
+  ptr = reinterpret_cast<void*>(result);
 #endif
   CHECK(!(reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)));
   return ptr;
 }
 
 void GpuManagedAllocator::DeallocateRaw(void* ptr) {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA
   CHECK_EQ(cudaFree(ptr), cudaSuccess);
+#elif TENSORFLOW_USE_ROCM
+  CHECK_EQ(hipFree(ptr), hipSuccess);
 #endif
 }
 
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 42b5ed959c9..f792787c696 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -13,20 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 
 #include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
-
+#include "gpu_init.h"
 namespace tensorflow {
 namespace {
 
 TEST(PoolAllocatorTest, ZeroSizeBuffers) {
   se::Platform* platform =
-      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+      se::MultiPlatformManager::PlatformWithName(GpuPlatformName())
+          .ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new GpuHostAllocator(
@@ -45,7 +46,8 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
 
 TEST(PoolAllocatorTest, ZeroSizePool) {
   se::Platform* platform =
-      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+      se::MultiPlatformManager::PlatformWithName(GpuPlatformName())
+          .ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new GpuHostAllocator(
@@ -79,7 +81,8 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
 
 TEST(PoolAllocatorTest, Alignment) {
   se::Platform* platform =
-      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+      se::MultiPlatformManager::PlatformWithName(GpuPlatformName())
+          .ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new GpuHostAllocator(
@@ -141,7 +144,8 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
     free_size += size;
   };
   se::Platform* platform =
-      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+      se::MultiPlatformManager::PlatformWithName(GpuPlatformName())
+          .ValueOrDie();
   GpuHostAllocator* sub_allocator = new GpuHostAllocator(
       platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
           .ValueOrDie(),
@@ -244,7 +248,8 @@ TEST(PoolAllocatorTest, Pow2Rounder) {
 
 TEST(PoolAllocatorTest, Name) {
   se::Platform* platform =
-      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+      se::MultiPlatformManager::PlatformWithName(GpuPlatformName())
+          .ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new GpuHostAllocator(
@@ -258,4 +263,4 @@ TEST(PoolAllocatorTest, Name) {
 }  // namespace
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index 9fcf75fb4d3..92139e2426f 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -201,7 +201,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
     if (col_exec_) col_exec_->Unref();
   }
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   void InitGPUDevices() {
     auto device_factory = DeviceFactory::GetFactory("GPU");
     CHECK(device_factory);
@@ -214,7 +214,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
 
   void Init(int num_workers, int num_devices_per_worker, DataType dtype,
             const DeviceType& device_type, int fail_after) {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     InitGPUDevices();
 #endif
     VLOG(2) << "num_workers=" << num_workers
@@ -871,7 +871,7 @@ TEST_F(HierarchicalTreeBroadcasterTest, InitializeParams4TasksVariableGPU) {
   }                                                                          \
   }
 
-#ifndef GOOGLE_CUDA
+#if !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
 //       B  T  W  D  L  A  F
 DEF_TEST(FLOAT, CPU, 1, 2, 1, 0, false)
 DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0, true)
@@ -889,7 +889,7 @@ DEF_TEST(FLOAT, CPU, 2, 4, 128, 1, true)
 DEF_TEST(FLOAT, CPU, 2, 4, 128, 5, false)
 #endif
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Can only set W=1 for GPU tests.
 //       B  T  W  D  L  A  F
 DEF_TEST(FLOAT, GPU, 1, 2, 1, 0, true)
diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc
index a0935855719..e2ed7aadd9c 100644
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@@ -30,10 +30,10 @@ TEST(MemoryTypeChecker, Int32OK) {
   auto in1 = test::graph::Constant(g, v);
   test::graph::Add(g, in0, in1);
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_CPU, g));
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   // There is a kernel for adding two int32s on host memory.
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
 #endif  // TENSORFLOW_USE_SYCL
@@ -47,7 +47,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   auto x = test::graph::Constant(g, v);
   test::graph::Cast(g, x, DT_FLOAT);
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_CPU, g));
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   // There is no kernel for casting int32/host memory to float/device
   // memory.
   EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_GPU, g)));
@@ -55,7 +55,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   // But we can insert _HostSend/_HostRecv to ensure the invariant.
   TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/device:GPU:0", g));
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
   // There is no kernel for casting int32/host memory to float/device
   // memory.
@@ -80,12 +80,12 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) {
   TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_CPU, g, sf, 0, &memory_type));
   // float Switch's output on CPU doesn't have HOST_MEMORY constraint.
   EXPECT_EQ(memory_type, DEVICE_MEMORY);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   auto si = test::graph::Switch(g, test::graph::Constant(g, vi), pred);
   TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_GPU, g, si, 0, &memory_type));
   // int Switch's output on GPU has HOST_MEMORY constraint.
   EXPECT_EQ(memory_type, HOST_MEMORY);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
   auto si = test::graph::Switch(g, test::graph::Constant(g, vi), pred);
   TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type));
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 2047e39b8ee..33d1bb5413a 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -1724,7 +1724,7 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
   EXPECT_TRUE(absl::StrContains(s.error_message(), "but available devices"));
 }
 
-#if !GOOGLE_CUDA
+#if !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
 // Test that we inform the user if they appear to be explicitly placing nodes
 // on a GPU when CUDA is not available
 TEST_F(PlacerTest, TestUseGpuWithNoCuda) {
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 55bc408f9c5..0739744c2fe 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -153,7 +153,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return cpu_tensor;
 #else
     CHECK(false);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   }
 
   Tensor CPUToGPU(const Tensor& cpu_tensor) {
@@ -178,7 +178,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return device_tensor;
 #else
     CHECK(false);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   }
 
   Status RunWithRuntime(
@@ -479,7 +479,7 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, ClusterFLRParallelTest) {
 }
 
 bool IsCUDATensor(const Tensor& t) {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA
   cudaPointerAttributes attributes;
   cudaError_t err =
       cudaPointerGetAttributes(&attributes, t.tensor_data().data());
diff --git a/tensorflow/core/common_runtime/ring_gatherer_test.cc b/tensorflow/core/common_runtime/ring_gatherer_test.cc
index 87a493c39b9..e5dda048ea4 100644
--- a/tensorflow/core/common_runtime/ring_gatherer_test.cc
+++ b/tensorflow/core/common_runtime/ring_gatherer_test.cc
@@ -116,7 +116,7 @@ class RingGathererTest : public ::testing::Test {
  protected:
   RingGathererTest() : device_type_(DEVICE_CPU) {}
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   void InitGPUDevices() {
     auto device_factory = DeviceFactory::GetFactory("GPU");
     CHECK(device_factory);
@@ -135,7 +135,7 @@ class RingGathererTest : public ::testing::Test {
 
   void Init(int num_workers, int num_devices, DataType dtype,
             const DeviceType& device_type, int num_subdivs, int fail_after) {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     InitGPUDevices();
 #endif
     device_type_ = device_type;
@@ -603,7 +603,7 @@ TEST_F(RingGathererTest, InitializeParams) {
   }                                                                          \
   }
 
-#ifndef GOOGLE_CUDA
+#if !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
 // Success tests
 DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
 DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
@@ -628,7 +628,7 @@ DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
 DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 11)
 #endif
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // GPU tests. So long as the device names are all in a single tasks we
 // bypass inter-worker routing code and can fake multiple GPUs with a single
 // GPU, from the perspective of the RingGatherer logic. So these tests
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 32454853d60..2a5db7989c8 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -138,7 +138,7 @@ class RingReducerTest : public ::testing::Test {
  protected:
   RingReducerTest() : device_type_(DEVICE_CPU) {}
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   void InitGPUDevices() {
     auto device_factory = DeviceFactory::GetFactory("GPU");
     CHECK(device_factory);
@@ -157,7 +157,7 @@ class RingReducerTest : public ::testing::Test {
 
   void Init(int num_workers, int num_devices, DataType dtype,
             const DeviceType& device_type, int num_subdivs, int fail_after) {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     InitGPUDevices();
 #endif
     device_type_ = device_type;
@@ -683,7 +683,7 @@ TEST_F(RingReducerTest, AutomaticSubdivUpperBound) {
   }                                                                          \
   }
 
-#ifndef GOOGLE_CUDA
+#if !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
 // Success tests
 DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
 DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
@@ -710,7 +710,7 @@ DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
 DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
 #endif
 
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // GPU tests. So long as the device names are all in a single tasks we
 // bypass inter-worker routing code and can fake multiple GPUs with a single
 // GPU, from the perspective of the RingReducer logic. So these tests