From d280d3d9b951063bf9e02ac79e248d729d33a29c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 Mar 2019 06:19:53 -0800 Subject: [PATCH] PR #25676: [ROCm] Adding ROCm support for code in the tensorflow/core/common_runtime/gpu directory Imported from GitHub PR #25676 This PR contains commits to add ROCm support for code within the directory `tensorflow/core/common_runtime/gpu` Prior to this PR the gpu implementation specific code within that dir is for CUDA only, and this PR adds the ROCm equivalent. Some files/variables have been renamed from *cuda* to *gpu* to make code more generic The PR is broken down into several commits for ease-of-review. I can squash them into one prior to the merge if need be. @tatianashp. Adding you to the cc-list just as an FYI @whchung Copybara import of the project: - c62c032d92e17a775db791ad03ae3502ea6d8e8e adding ROCm support in tensorflow/core/common_runtime/gpu... by Deven Desai - e94e3f408d9956dfa3aa7849a4ff41c584a6e0f0 adding ROCm support in tensorflow/core/common_runtime/gpu... by Deven Desai - 96e522ddf21fa675308a38b91d0a95efd771033e adding ROCm support in tensorflow/core/common_runtime/gpu... by Deven Desai - 929bbc2d4c620e31b2f7786600f4d7a271de0dba renaming cuda_host_allocator.h to gpu_host_allocator.h by Deven Desai - 4e255e01d26f74e003c11a894ebf39fcaf64ca49 misc changes by Deven Desai - a11b8117f1b27ba7f1abd43c4849e29d21c610d2 changes requested in the code review by Deven Desai - 07b49644d27aeac2e8646609640206faf5908ca5 using string instead int std::string + removing the #defi... by Deven Desai - 46bc499856a769cd00e9ddcdf5c922e3c17f0603 Merge 07b49644d27aeac2e8646609640206faf5908ca5 into afab5... by Deven Desai <36858332+deven-amd@users.noreply.github.com> COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/tensorflow/pull/25676 from ROCmSoftwarePlatform:google_upstream_common_runtime_changes 07b49644d27aeac2e8646609640206faf5908ca5 PiperOrigin-RevId: 237437557 --- tensorflow/contrib/gdr/gdr_memory_manager.cc | 7 +- tensorflow/contrib/verbs/rdma.cc | 4 +- tensorflow/contrib/verbs/rdma_mgr.cc | 4 +- tensorflow/core/BUILD | 6 +- .../gpu/gpu_bfc_allocator_test.cc | 4 +- .../gpu/gpu_debug_allocator_test.cc | 4 +- .../core/common_runtime/gpu/gpu_device.cc | 162 ++++++++++++++++-- .../core/common_runtime/gpu/gpu_device.h | 10 +- .../common_runtime/gpu/gpu_device_factory.cc | 8 +- ..._host_allocator.h => gpu_host_allocator.h} | 20 +-- .../core/common_runtime/gpu/gpu_init.cc | 16 +- tensorflow/core/common_runtime/gpu/gpu_init.h | 8 +- .../common_runtime/gpu/gpu_process_state.cc | 123 ++++++------- .../common_runtime/gpu/gpu_process_state.h | 20 +-- .../core/common_runtime/gpu/gpu_util.cc | 2 +- .../common_runtime/gpu/pool_allocator_test.cc | 12 +- .../core/common_runtime/process_state.h | 2 +- .../core/platform/default/build_config/BUILD | 5 +- tensorflow/core/platform/rocm.h | 22 +++ tensorflow/stream_executor/rocm/BUILD | 14 ++ .../stream_executor/rocm/rocm_activation.h | 39 +++++ 21 files changed, 359 insertions(+), 133 deletions(-) rename tensorflow/core/common_runtime/gpu/{cuda_host_allocator.h => gpu_host_allocator.h} (74%) create mode 100644 tensorflow/core/platform/rocm.h create mode 100644 tensorflow/stream_executor/rocm/rocm_activation.h diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index 7321e973191..9b8e832fd96 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -250,10 +250,9 @@ Status 
GdrMemoryManager::Init() { LOG(INFO) << "Instrumenting CPU allocator(s)"; for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) { - GPUProcessState::singleton()->AddCUDAHostAllocVisitor(numa_idx, - alloc_visitor); - GPUProcessState::singleton()->AddCUDAHostFreeVisitor(numa_idx, - free_visitor); + GPUProcessState::singleton()->AddGpuHostAllocVisitor(numa_idx, + alloc_visitor); + GPUProcessState::singleton()->AddGpuHostFreeVisitor(numa_idx, free_visitor); } if (IsGDRAvailable()) { diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc index 9db80f6b573..b4b6b705f4b 100644 --- a/tensorflow/contrib/verbs/rdma.cc +++ b/tensorflow/contrib/verbs/rdma.cc @@ -1086,7 +1086,7 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed, // The tensor must be copied from GPU to CPU, because either: // 1. The tensor is located on a non GDR compatible GPU. // 2. The tensor's meta-data has changed. - Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0); + Allocator* alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0); copy = Tensor(alloc, in.dtype(), in.shape()); CountCopies(rm_.name_, (void*)DMAHelper::base(&in), (void*)DMAHelper::base(©), in.TotalBytes(), true); @@ -1543,7 +1543,7 @@ bool RdmaTensorRequest::AllocateTensors() { if (mr_ == nullptr) { // Can't RDMA directly to result. Use a proxy. proxy_tensor_ = - new Tensor(GPUProcessState::singleton()->GetCUDAHostAllocator(0), + new Tensor(GPUProcessState::singleton()->GetGpuHostAllocator(0), result_tensor_->dtype(), result_tensor_->shape()); rdma_addr_ = DMAHelper::base(proxy_tensor_); mr_ = diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc index 2f237542786..5ac9f46447c 100644 --- a/tensorflow/contrib/verbs/rdma_mgr.cc +++ b/tensorflow/contrib/verbs/rdma_mgr.cc @@ -277,8 +277,8 @@ void RdmaMgr::InitAllocators() { ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); #if GOOGLE_CUDA - GPUProcessState::singleton()->AddCUDAHostAllocVisitor(0, alloc_visitor); - GPUProcessState::singleton()->AddCUDAHostFreeVisitor(0, free_visitor); + GPUProcessState::singleton()->AddGpuHostAllocVisitor(0, alloc_visitor); + GPUProcessState::singleton()->AddGpuHostFreeVisitor(0, free_visitor); if (IsGDRAvailable()) { // Note we don't free allocated GPU memory so there is no free visitor diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a54ae7cd15a..70f0f438d79 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1705,6 +1705,7 @@ filegroup( "platform/**/logger.cc", "platform/default/test_benchmark.*", "platform/cuda.h", + "platform/rocm.h", "platform/google/**/*", "platform/hadoop/**/*", "platform/gif.h", @@ -2259,6 +2260,7 @@ LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob( "platform/jpeg.h", "platform/png.h", "platform/**/cuda.h", + "platform/**/rocm.h", "platform/**/stream_executor.h", ], ) @@ -2371,6 +2373,7 @@ cc_library( "**/*test*", "platform/**/cuda.h", "platform/**/cuda_libdevice_path.cc", + "platform/**/rocm.h", "platform/**/stream_executor.h", "platform/**/env_time.cc", "platform/**/device_tracer.cc", @@ -2866,6 +2869,7 @@ tf_cuda_library( srcs = ["platform/stream_executor.h"], hdrs = [ "platform/cuda.h", + "platform/rocm.h", "platform/stream_executor.h", ], deps = [ @@ -3301,7 +3305,7 @@ cc_library( ) GPU_RUNTIME_HEADERS = [ - "common_runtime/gpu/cuda_host_allocator.h", + "common_runtime/gpu/gpu_host_allocator.h", "common_runtime/gpu/gpu_bfc_allocator.h", 
"common_runtime/gpu/gpu_cudamalloc_allocator.h", "common_runtime/gpu/gpu_debug_allocator.h", diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 9c0abd97a18..2e44a37c68f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" @@ -580,4 +580,4 @@ TEST_F(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) { } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc index aca08a7e33d..a0728c017e1 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h" @@ -249,4 +249,4 @@ TEST(GPUDebugAllocatorTest, AllocatedVsRequested) { } // namespace } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 607193a0013..5332ff3582d 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -15,7 +15,11 @@ limitations under the License. // TODO(opensource): Use a more generic sounding preprocessor name than // GOOGLE_CUDA -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#if TENSORFLOW_USE_ROCM +#include "rocm/include/hip/hip_runtime.h" +#endif #define EIGEN_USE_GPU @@ -55,7 +59,11 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#if GOOGLE_CUDA #include "tensorflow/core/platform/cuda.h" +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm.h" +#endif #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor.h" @@ -67,18 +75,36 @@ limitations under the License. #include "tensorflow/core/util/stream_executor_util.h" #if !defined(PLATFORM_GOOGLE) +#if GOOGLE_CUDA #include "cuda/cuda_config.h" #endif +#endif namespace tensorflow { +#if GOOGLE_CUDA + +typedef cudaStream_t gpuStream_t; +typedef cudaDeviceProp gpuDeviceProp_t; +#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize) +using se::cuda::ScopedActivateExecutorContext; + +#elif TENSORFLOW_USE_ROCM + +typedef hipStream_t gpuStream_t; +typedef hipDeviceProp_t gpuDeviceProp_t; +#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize) +using se::rocm::ScopedActivateExecutorContext; + +#endif + // Eigen Ops directly allocate memory only for temporary buffers used // during OpKernel::Compute(). 
The recommended way of allocating such // memory is via OpKernelContext::allocate_temp(). However, Eigen Ops // don't have access to OpKernelContext, instead they get access to // memory directly through the device allocator. As an Open Source // project, Eigen assumes allocator semantics similar to those of the -// CUDA memory allocator, and may not work correctly due to race +// CUDA or ROCm memory allocator, and may not work correctly due to race // conditions if used with some other allocator. For safety, we need // to delay deallocation calls out of Eigen until all events on the // corresponding stream have completed. The following two classes @@ -91,7 +117,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface { Eigen::initializeDeviceProp(); } ~EigenGpuStreamDevice() override {} - void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, + void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream, TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc, char* scratch) { if (LogMemory::IsEnabled()) { @@ -102,15 +128,15 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface { scratch_ = scratch; semaphore_ = reinterpret_cast(scratch + Eigen::kGpuScratchSize); - stream_ = cuda_stream; + stream_ = gpu_stream; allocator_ = alloc; PlatformGpuId platform_gpu_id; TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()]; } - const cudaStream_t& stream() const override { return *stream_; } - const cudaDeviceProp& deviceProperties() const override { + const gpuStream_t& stream() const override { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const override { return *device_prop_; } @@ -140,8 +166,13 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface { } AsyncFreeData* afData = new AsyncFreeData(allocator_, buffer, operation_, step_id_); +#if GOOGLE_CUDA cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0); CHECK_EQ(err, cudaSuccess); +#elif TENSORFLOW_USE_ROCM + hipError_t err = hipStreamAddCallback(*stream_, asyncFree, afData, 0); + CHECK_EQ(err, hipSuccess); +#endif } // Return a pointer to a per stream scratchpad of 1024 bytes residing @@ -165,8 +196,12 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface { const int64 step_id_; }; - static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status, +#if GOOGLE_CUDA + static void CUDART_CB asyncFree(gpuStream_t stream, cudaError_t status, void* userData) { +#elif TENSORFLOW_USE_ROCM + static void asyncFree(gpuStream_t stream, hipError_t status, void* userData) { +#endif AsyncFreeData* data = static_cast(userData); if (LogMemory::IsEnabled()) { LogMemory::RecordRawDeallocation(data->operation_, data->step_id_, @@ -178,8 +213,8 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface { string operation_; int64 step_id_; - const cudaStream_t* stream_; // Not owned. - const cudaDeviceProp* device_prop_; // Not owned. + const gpuStream_t* stream_; // Not owned. + const gpuDeviceProp_t* device_prop_; // Not owned. ::tensorflow::Allocator* allocator_; // Not owned. mutable char* scratch_; mutable unsigned int* semaphore_; @@ -454,7 +489,7 @@ Status BaseGPUDevice::FillContextMap(const Graph* graph, void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { // NOTE(tucker): We need to discriminate between Eigen GPU // operations and all others. 
If an operation is Eigen - // implemented (or otherwise tries to launch a cuda kernel + // implemented (or otherwise tries to launch a GPU kernel // directly), we need to establish a stacked-scoped environment // that directs it to execute on the proper device. Otherwise we // expect the Op to use StreamExecutor directly and correctly. The @@ -530,7 +565,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel, DCHECK(kernel_tracker_); kernel_tracker_->PauseWhilePendingExceeds(pending_cap_); } - se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()}; + ScopedActivateExecutorContext scoped_activation{stream->parent()}; op_kernel->Compute(context); if (context->status().ok()) { if (sync_every_op_) { @@ -596,7 +631,7 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel, // activity is simple enough that its overhead is negligible. tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), op_kernel->IsExpensive()); - se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()}; + ScopedActivateExecutorContext scoped_activation{stream->parent()}; op_kernel->ComputeAsync(context, done); } @@ -715,10 +750,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice { public: ConcretePerOpGpuDevice() : device_(&stream_device_) {} - void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, + void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream, TfGpuId tf_gpu_id, Allocator* base_allocator, char* scratch) { - stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator, + stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator, scratch); } @@ -898,9 +933,9 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context, ConcretePerOpGpuDevice* concrete_device = static_cast(device); DCHECK(concrete_device); - const cudaStream_t* cuda_stream = reinterpret_cast( + const gpuStream_t* gpu_stream = reinterpret_cast( streams_[stream_id]->compute->implementation()->GpuStreamMemberHack()); - concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator, + concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator, scratch_[stream_id]); } @@ -977,14 +1012,24 @@ Status BaseGPUDeviceFactory::CreateDevices( if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; +#if GOOGLE_CUDA cudaError_t err = cudaGetDevice(&original_device); if (err != cudaSuccess) { return errors::Internal("cudaGetDevice() failed. Status: ", cudaGetErrorString(err)); } +#elif TENSORFLOW_USE_ROCM + hipError_t err = hipGetDevice(&original_device); + if (err != hipSuccess) { + return errors::Internal("hipGetDevice() failed. Status: ", + hipGetErrorString(err)); + } +#endif + // Force to implicitly initialize CUDA runtime on each valid GPU before // CreateGPUDevice(). for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) { +#if GOOGLE_CUDA err = cudaSetDevice(platform_gpu_id.value()); if (err != cudaSuccess) { return errors::Internal( @@ -997,13 +1042,35 @@ Status BaseGPUDeviceFactory::CreateDevices( platform_gpu_id.value(), " failed. Status: ", cudaGetErrorString(err)); } +#elif TENSORFLOW_USE_ROCM + err = hipSetDevice(platform_gpu_id.value()); + if (err != hipSuccess) { + return errors::Internal( + "hipSetDevice() on GPU:", platform_gpu_id.value(), + " failed. 
Status: ", hipGetErrorString(err)); + } + err = hipFree(nullptr); + if (err != hipSuccess) { + return errors::Internal("ROCm runtime implicit initialization on GPU:", + platform_gpu_id.value(), + " failed. Status: ", hipGetErrorString(err)); + } +#endif } // Reset to the original device. +#if GOOGLE_CUDA err = cudaSetDevice(original_device); if (err != cudaSuccess) { return errors::Internal("cudaSetDevice() on GPU:", original_device, " failed. Status: ", cudaGetErrorString(err)); } +#elif TENSORFLOW_USE_ROCM + err = hipSetDevice(original_device); + if (err != hipSuccess) { + return errors::Internal("hipSetDevice() on GPU:", original_device, + " failed. Status: ", hipGetErrorString(err)); + } +#endif } std::vector interconnect_maps; @@ -1093,6 +1160,7 @@ Status BaseGPUDeviceFactory::CreateDevices( static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id, const se::DeviceDescription& desc) { +#if GOOGLE_CUDA int cc_major; int cc_minor; if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) { @@ -1104,6 +1172,11 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id, desc.name(), ", pci bus id: ", desc.pci_bus_id(), ", compute capability: ", cc_major, ".", cc_minor); // LINT.ThenChange(//tensorflow/python/platform/test.py) +#elif TENSORFLOW_USE_ROCM + return strings::StrCat("device: ", platform_gpu_id.value(), + ", name: ", desc.name(), + ", pci bus id: ", desc.pci_bus_id()); +#endif } Status BaseGPUDeviceFactory::CreateGPUDevice( @@ -1329,6 +1402,7 @@ static int GetMinGPUMultiprocessorCount( namespace { +#if GOOGLE_CUDA struct CudaVersion { // Initialize from version_name in the form of "3.5" explicit CudaVersion(const std::string& version_name) { @@ -1380,6 +1454,15 @@ std::vector GetSupportedCudaComputeCapabilities() { #endif return cuda_caps; } +#endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +std::vector supported_amdgpu_isa_versions = {803, 900, 906}; + +std::vector GetSupportedAMDGPUISAVersions() { + return supported_amdgpu_isa_versions; +} +#endif // TENSORFLOW_USE_ROCM Status EnablePeerAccess(se::Platform* platform, const std::vector& visible_gpu_order) { @@ -1457,6 +1540,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( total_bytes = 0; } const auto& description = stream_exec->GetDeviceDescription(); +#if GOOGLE_CUDA int cc_major; int cc_minor; if (!description.cuda_compute_capability(&cc_major, &cc_minor)) { @@ -1471,6 +1555,21 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: " << strings::HumanReadableNumBytes(total_bytes) << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes); +#elif TENSORFLOW_USE_ROCM + int isa_version; + if (!description.rocm_amdgpu_isa_version(&isa_version)) { + // Logs internally on failure. + isa_version = 0; + } + LOG(INFO) << "Found device " << i << " with properties: " + << "\nname: " << description.name() << "\nAMDGPU ISA: gfx" + << isa_version << "\nmemoryClockRate (GHz) " + << description.clock_rate_ghz() << "\npciBusID " + << description.pci_bus_id() << "\nTotal memory: " + << strings::HumanReadableNumBytes(total_bytes) + << "\nFree memory: " + << strings::HumanReadableNumBytes(free_bytes); +#endif } // Checking peering and shows matrix if more than one gpu found. 
if (new_gpu_found && visible_gpu_order.size() > 1) { @@ -1478,6 +1577,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order)); } +#if GOOGLE_CUDA auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities(); if (cuda_supported_capabilities.empty()) { return errors::FailedPrecondition( @@ -1485,6 +1585,15 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( } CudaVersion min_supported_capability = *std::min_element( cuda_supported_capabilities.begin(), cuda_supported_capabilities.end()); +#elif TENSORFLOW_USE_ROCM + auto rocm_supported_isas = GetSupportedAMDGPUISAVersions(); + if (rocm_supported_isas.empty()) { + return errors::FailedPrecondition( + "No supported rocm capabilities in binary."); + } + int min_supported_isa = + *std::min_element(rocm_supported_isas.begin(), rocm_supported_isas.end()); +#endif int min_gpu_core_count = GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); @@ -1502,6 +1611,8 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( } se::StreamExecutor* se = exec_status.ValueOrDie(); const se::DeviceDescription& desc = se->GetDeviceDescription(); + +#if GOOGLE_CUDA CudaVersion device_capability; if (!desc.cuda_compute_capability(&device_capability.major_part, &device_capability.minor_part)) { @@ -1522,6 +1633,23 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( << min_supported_capability << "."; continue; } +#elif TENSORFLOW_USE_ROCM + int device_isa; + if (!desc.rocm_amdgpu_isa_version(&device_isa)) { + continue; + } + // Only GPUs with no less than the minimum supported compute capability is + // accepted. + if (device_isa < min_supported_isa) { + LOG(INFO) << "Ignoring visible gpu device " + << "(" << GetShortDeviceDescription(visible_gpu_id, desc) + << ") " + << "with AMDGPU ISA gfx" << device_isa + << ". The minimum required AMDGPU ISA is gfx" + << min_supported_isa << "."; + continue; + } +#endif // Filter out slow GPUs. By default, GPUs with a lower multiprocessor // count than the fastest GPU are filtered out, unless they have 8 or more @@ -1531,7 +1659,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( LOG(INFO) << "Ignoring visible gpu device " << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ") " - << "with Cuda multiprocessor count: " << desc.core_count() + << "with core count: " << desc.core_count() << ". The minimum required count is " << min_gpu_core_count << ". You can adjust this requirement with the env var " "TF_MIN_GPU_MULTIPROCESSOR_COUNT."; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index f8f2a2e2f32..377133043f7 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if !GOOGLE_CUDA -#error This file must only be included when building with Cuda support +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support #endif #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ @@ -98,7 +98,7 @@ class BaseGPUDevice : public LocalDevice { Allocator* allocator) override; // Returns the platform GPU id of this device within the native driver system; - // e.g., for CUDA this is the ordinal of the GPU within the system. 
+ // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system. int gpu_id() const { PlatformGpuId platform_gpu_id; TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); @@ -311,8 +311,8 @@ class BaseGPUDeviceFactory : public DeviceFactory { // Returns into 'ids' the list of valid platform GPU ids, in the order that // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc, // based upon 'visible_gpu_order' which was generated by parsing - // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU - // ids. + // GPUOptions::visible_device_list which is a comma-separated list of CUDA or + // ROCm GPU ids. Status GetValidDeviceIds(const std::vector& visible_gpu_order, std::vector* ids); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc index 962891894ad..99243aa28b5 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -45,7 +45,7 @@ class GPUDevice : public BaseGPUDevice { if (attr.on_host()) { if (attr.gpu_compatible() || force_gpu_compatible_) { GPUProcessState* ps = GPUProcessState::singleton(); - return ps->GetCUDAHostAllocator(0); + return ps->GetGpuHostAllocator(0); } else { return cpu_allocator_; } @@ -94,7 +94,7 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice { Allocator* GetAllocator(AllocatorAttributes attr) override { GPUProcessState* ps = GPUProcessState::singleton(); if (attr.gpu_compatible() || force_gpu_compatible_) { - return ps->GetCUDAHostAllocator(numa_node_); + return ps->GetGpuHostAllocator(numa_node_); } else { // Call the parent's implementation. return ThreadPoolDevice::GetAllocator(attr); @@ -136,4 +136,4 @@ REGISTER_LOCAL_DEVICE_FACTORY("CPU", GPUCompatibleCPUDeviceFactory, 70); } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_host_allocator.h similarity index 74% rename from tensorflow/core/common_runtime/gpu/cuda_host_allocator.h rename to tensorflow/core/common_runtime/gpu/gpu_host_allocator.h index 6bd29ef775f..3ac579112f9 100644 --- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_host_allocator.h @@ -13,28 +13,28 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_CUDA_HOST_ALLOCATOR_H_ -#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_CUDA_HOST_ALLOCATOR_H_ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_HOST_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_HOST_ALLOCATOR_H_ #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor.h" namespace tensorflow { -// Allocator for pinned CPU RAM that is made known to CUDA for the +// Allocator for pinned CPU RAM that is made known to GPU for the // purpose of efficient DMA with a GPU. 
-class CUDAHostAllocator : public SubAllocator { +class GpuHostAllocator : public SubAllocator { public: // Note: stream_exec cannot be null. - explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node, - const std::vector& alloc_visitors, - const std::vector& free_visitors) + explicit GpuHostAllocator(se::StreamExecutor* stream_exec, int numa_node, + const std::vector& alloc_visitors, + const std::vector& free_visitors) : SubAllocator(alloc_visitors, free_visitors), stream_exec_(stream_exec), numa_node_(numa_node) { CHECK(stream_exec_ != nullptr); } - ~CUDAHostAllocator() override {} + ~GpuHostAllocator() override {} void* Alloc(size_t alignment, size_t num_bytes) override { void* ptr = nullptr; @@ -61,8 +61,8 @@ class CUDAHostAllocator : public SubAllocator { se::StreamExecutor* stream_exec_; // not owned, non-null const int numa_node_; - TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator); + TF_DISALLOW_COPY_AND_ASSIGN(GpuHostAllocator); }; } // namespace tensorflow -#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_CUDA_HOST_ALLOCATOR_H_ +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_HOST_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc index e0ec93a98e2..b6672c35712 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_init.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc @@ -29,17 +29,27 @@ limitations under the License. namespace tensorflow { Status ValidateGPUMachineManager() { - return se::MultiPlatformManager::PlatformWithName("CUDA").status(); + return se::MultiPlatformManager::PlatformWithName(GpuPlatformName()).status(); } se::Platform* GPUMachineManager() { - auto result = se::MultiPlatformManager::PlatformWithName("CUDA"); + auto result = se::MultiPlatformManager::PlatformWithName(GpuPlatformName()); if (!result.ok()) { - LOG(FATAL) << "Could not find Platform with name CUDA"; + LOG(FATAL) << "Could not find Platform with name " << GpuPlatformName(); return nullptr; } return result.ValueOrDie(); } +string GpuPlatformName() { +#if TENSORFLOW_USE_ROCM + return "ROCM"; +#else + // This function will return "CUDA" even when building TF without GPU support + // This is done to preserve existing functionality + return "CUDA"; +#endif +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h index 4e1f06ac838..4c8f0868df0 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_init.h +++ b/tensorflow/core/common_runtime/gpu/gpu_init.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_ +#include #include "tensorflow/core/lib/core/status.h" namespace stream_executor { @@ -24,7 +25,7 @@ class Platform; namespace tensorflow { -// Initializes the CUDA platform and returns OK if the CUDA +// Initializes the GPU platform and returns OK if the GPU // platform could be initialized. Status ValidateGPUMachineManager(); @@ -34,6 +35,11 @@ Status ValidateGPUMachineManager(); // in the process (e.g., ValidateGPUMachineManager() returns OK). stream_executor::Platform* GPUMachineManager(); +// Returns the string describing the name of the GPU platform in use. 
+// This value is "CUDA" by default, and +// "ROCM" when TF is built with `--config==rocm` +string GpuPlatformName(); + } // namespace tensorflow #endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc index 39883d34f9e..ea45bfaef53 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc @@ -18,10 +18,10 @@ limitations under the License. #include #include -#include "tensorflow/core/common_runtime/gpu/cuda_host_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h" +#include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" @@ -81,7 +81,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes) { CHECK(process_state_); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM const string& allocator_type = options.allocator_type(); mutex_lock lock(mu_); GpuIdUtil::CheckValidTfGpuId(tf_gpu_id); @@ -155,14 +155,15 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, return allocator_parts.allocator.get(); } #else - LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda."; + LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or " + "--config=rocm."; return nullptr; -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) { DCHECK(process_state_); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM GpuIdUtil::CheckValidTfGpuId(tf_gpu_id); mutex_lock l(mu_); if (tf_gpu_id.value() >= static_cast(gpu_allocators_.size())) { @@ -173,37 +174,37 @@ SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) { return allocator_parts.counter.get(); #else return nullptr; -#endif +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } -Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { +Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) { CHECK(process_state_); if (!HasGPUDevice() || - !process_state_->ProcessState::FLAGS_brain_mem_reg_cuda_dma) { + !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) { return process_state_->GetCPUAllocator(numa_node); } if (numa_node == port::kNUMANoAffinity) { numa_node = 0; } { - // Here we optimize the most common use case where cuda_host_allocators_ - // and cuda_al_ have already been populated and since we're only reading + // Here we optimize the most common use case where gpu_host_allocators_ + // have already been populated and since we're only reading // these vectors, we can get by with a shared lock. In the slower case, // we take a unique lock and populate these vectors. 
tf_shared_lock lock(mu_); if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types && - !cuda_host_allocators_.empty() && - cuda_host_allocators_[0].recording_allocator != nullptr) { - return cuda_host_allocators_[0].recording_allocator.get(); + !gpu_host_allocators_.empty() && + gpu_host_allocators_[0].recording_allocator != nullptr) { + return gpu_host_allocators_[0].recording_allocator.get(); } - if (static_cast(cuda_host_allocators_.size()) > numa_node) { - return cuda_host_allocators_[0].allocator.get(); + if (static_cast(gpu_host_allocators_.size()) > numa_node) { + return gpu_host_allocators_[0].allocator.get(); } } mutex_lock lock(mu_); - // Find the first valid StreamExecutor to request CUDA host memory + // Find the first valid StreamExecutor to request CUDA or ROCm host memory // through, since any will work. // // This search isn't super clean, and it would be nice to use a @@ -220,39 +221,39 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { CHECK_NE(nullptr, se); - while (static_cast(cuda_host_allocators_.size()) <= numa_node) { - while (cuda_host_alloc_visitors_.size() <= numa_node) { - cuda_host_alloc_visitors_.push_back({}); + while (static_cast(gpu_host_allocators_.size()) <= numa_node) { + while (gpu_host_alloc_visitors_.size() <= numa_node) { + gpu_host_alloc_visitors_.push_back({}); } - while (cuda_host_free_visitors_.size() <= numa_node) { - cuda_host_free_visitors_.push_back({}); + while (gpu_host_free_visitors_.size() <= numa_node) { + gpu_host_free_visitors_.push_back({}); } - SubAllocator* sub_allocator = new CUDAHostAllocator( - se, numa_node, cuda_host_alloc_visitors_[numa_node], - cuda_host_free_visitors_[numa_node]); + SubAllocator* sub_allocator = + new GpuHostAllocator(se, numa_node, gpu_host_alloc_visitors_[numa_node], + gpu_host_free_visitors_[numa_node]); // TODO(zheng-xq): evaluate whether 64GB by default is the best choice. - int64 cuda_host_mem_limit_in_mb = -1; - Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB", + int64 gpu_host_mem_limit_in_mb = -1; + Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB", 1LL << 16 /*64GB max by default*/, - &cuda_host_mem_limit_in_mb); + &gpu_host_mem_limit_in_mb); if (!status.ok()) { - LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message(); + LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message(); } - int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20); + int64 gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20); Allocator* allocator = - new BFCAllocator(sub_allocator, cuda_host_mem_limit, - true /*allow_growth*/, "cuda_host_bfc" /*name*/); + new BFCAllocator(sub_allocator, gpu_host_mem_limit, + true /*allow_growth*/, "gpu_host_bfc" /*name*/); if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. 
allocator = new TrackingAllocator(allocator, true); } - cuda_host_allocators_.push_back({std::unique_ptr(allocator), - std::unique_ptr(nullptr), - sub_allocator, - std::unique_ptr(nullptr)}); - AllocatorParts& allocator_parts = cuda_host_allocators_.back(); + gpu_host_allocators_.push_back({std::unique_ptr(allocator), + std::unique_ptr(nullptr), + sub_allocator, + std::unique_ptr(nullptr)}); + AllocatorParts& allocator_parts = gpu_host_allocators_.back(); if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { ProcessState::MemDesc md; md.loc = ProcessState::MemDesc::CPU; @@ -266,15 +267,15 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { } } if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { - return cuda_host_allocators_[0].recording_allocator.get(); + return gpu_host_allocators_[0].recording_allocator.get(); } else { - return cuda_host_allocators_[0].allocator.get(); + return gpu_host_allocators_[0].allocator.get(); } } void GPUProcessState::AddGPUAllocVisitor(int bus_id, const SubAllocator::Visitor& visitor) { -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM mutex_lock lock(mu_); CHECK(gpu_allocators_.empty()) // Crash OK << "AddGPUAllocVisitor must be called before " @@ -284,35 +285,35 @@ void GPUProcessState::AddGPUAllocVisitor(int bus_id, gpu_visitors_.push_back(std::vector()); } gpu_visitors_[bus_id].push_back(visitor); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } -void GPUProcessState::AddCUDAHostAllocVisitor( +void GPUProcessState::AddGpuHostAllocVisitor( int numa_node, const SubAllocator::Visitor& visitor) { -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM mutex_lock lock(mu_); - CHECK(cuda_host_allocators_.empty()) // Crash OK - << "AddCUDAHostAllocVisitor must be called before " - "first call to GetCUDAHostAllocator."; - while (numa_node >= static_cast(cuda_host_alloc_visitors_.size())) { - cuda_host_alloc_visitors_.push_back(std::vector()); + CHECK(gpu_host_allocators_.empty()) // Crash OK + << "AddGpuHostAllocVisitor must be called before " + "first call to GetGpuHostAllocator."; + while (numa_node >= static_cast(gpu_host_alloc_visitors_.size())) { + gpu_host_alloc_visitors_.push_back(std::vector()); } - cuda_host_alloc_visitors_[numa_node].push_back(visitor); -#endif // GOOGLE_CUDA + gpu_host_alloc_visitors_[numa_node].push_back(visitor); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } -void GPUProcessState::AddCUDAHostFreeVisitor( +void GPUProcessState::AddGpuHostFreeVisitor( int numa_node, const SubAllocator::Visitor& visitor) { -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM mutex_lock lock(mu_); - CHECK(cuda_host_allocators_.empty()) // Crash OK - << "AddCUDAHostFreeVisitor must be called before " - "first call to GetCUDAHostAllocator."; - while (numa_node >= static_cast(cuda_host_free_visitors_.size())) { - cuda_host_free_visitors_.push_back(std::vector()); + CHECK(gpu_host_allocators_.empty()) // Crash OK + << "AddGpuHostFreeVisitor must be called before " + "first call to GetGpuHostAllocator."; + while (numa_node >= static_cast(gpu_host_free_visitors_.size())) { + gpu_host_free_visitors_.push_back(std::vector()); } - cuda_host_free_visitors_[numa_node].push_back(visitor); -#endif // GOOGLE_CUDA + gpu_host_free_visitors_[numa_node].push_back(visitor); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } void GPUProcessState::TestOnlyReset() { @@ -324,9 +325,9 @@ void GPUProcessState::TestOnlyReset() { gpu_device_enabled_ = false; gpu_allocators_.clear(); 
gpu_visitors_.clear(); - cuda_host_allocators_.clear(); - cuda_host_alloc_visitors_.clear(); - cuda_host_free_visitors_.clear(); + gpu_host_allocators_.clear(); + gpu_host_alloc_visitors_.clear(); + gpu_host_free_visitors_.clear(); } } diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h index 861157ca0c6..09e5575f04f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h @@ -83,7 +83,7 @@ class GPUProcessState { virtual Allocator* GetGPUAllocator(const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes); - virtual Allocator* GetCUDAHostAllocator(int numa_node); + virtual Allocator* GetGpuHostAllocator(int numa_node); // Registers a Visitor to be invoked on new chunks of memory allocated by the // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor @@ -98,14 +98,14 @@ class GPUProcessState { const SubAllocator::Visitor& visitor); // Registers a Visitor to be invoked on new chunks of memory allocated by - // the SubAllocator of the CUDAHostAllocator for the given numa_node. - virtual void AddCUDAHostAllocVisitor(int numa_node, - const SubAllocator::Visitor& visitor); + // the SubAllocator of the GpuHostAllocator for the given numa_node. + virtual void AddGpuHostAllocVisitor(int numa_node, + const SubAllocator::Visitor& visitor); // Registers a Visitor to be invoked on each chunk handed back for freeing to - // the SubAllocator of the CUDAHostAllocator for the given numa_node. - virtual void AddCUDAHostFreeVisitor(int numa_node, - const SubAllocator::Visitor& visitor); + // the SubAllocator of the GpuHostAllocator for the given numa_node. + virtual void AddGpuHostFreeVisitor(int numa_node, + const SubAllocator::Visitor& visitor); // Returns bus_id for the given GPU id. virtual int BusIdForGPU(TfGpuId tf_gpu_id); @@ -143,10 +143,10 @@ class GPUProcessState { std::vector gpu_allocators_ GUARDED_BY(mu_); std::vector> gpu_visitors_ GUARDED_BY(mu_); - std::vector cuda_host_allocators_ GUARDED_BY(mu_); - std::vector> cuda_host_alloc_visitors_ + std::vector gpu_host_allocators_ GUARDED_BY(mu_); + std::vector> gpu_host_alloc_visitors_ GUARDED_BY(mu_); - std::vector> cuda_host_free_visitors_ + std::vector> gpu_host_free_visitors_ GUARDED_BY(mu_); }; diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc index 5851360cab7..56f68c8a8ed 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -150,7 +150,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes(); if (total_bytes > 0) { tracing::ScopedAnnotation annotation("SetProtoFromGPU"); - alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0); + alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0); buf = alloc->Allocate(total_bytes); if (LogMemory::IsEnabled()) { LogMemory::RecordRawAllocation("SetProtoFromGPU", diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc index 6b2f6547b07..42b5ed959c9 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/pool_allocator.h" -#include "tensorflow/core/common_runtime/gpu/cuda_host_allocator.h" +#include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/test.h" @@ -29,7 +29,7 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); PoolAllocator pool( 2 /*pool_size_limit*/, false /*auto_resize*/, - new CUDAHostAllocator( + new GpuHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) .ValueOrDie(), 0 /*numa_node*/, {}, {}), @@ -48,7 +48,7 @@ TEST(PoolAllocatorTest, ZeroSizePool) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); PoolAllocator pool( 0 /*pool_size_limit*/, false /*auto_resize*/, - new CUDAHostAllocator( + new GpuHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) .ValueOrDie(), 0 /*numa_node*/, {}, {}), @@ -82,7 +82,7 @@ TEST(PoolAllocatorTest, Alignment) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); PoolAllocator pool( 0 /*pool_size_limit*/, false /*auto_resize*/, - new CUDAHostAllocator( + new GpuHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) .ValueOrDie(), 0 /*numa_node*/, {}, {}), @@ -142,7 +142,7 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { }; se::Platform* platform = se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); - CUDAHostAllocator* sub_allocator = new CUDAHostAllocator( + GpuHostAllocator* sub_allocator = new GpuHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) .ValueOrDie(), 0 /*numa_node*/, {alloc_visitor}, {free_visitor}); @@ -247,7 +247,7 @@ TEST(PoolAllocatorTest, Name) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); PoolAllocator pool( 2 /*pool_size_limit*/, false /*auto_resize*/, - new CUDAHostAllocator( + new GpuHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) .ValueOrDie(), 0 /*numa_node*/, {}, {}), diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h index bc877db99e4..8f3cc5c4446 100644 --- a/tensorflow/core/common_runtime/process_state.h +++ b/tensorflow/core/common_runtime/process_state.h @@ -83,7 +83,7 @@ class ProcessState : public ProcessStateInterface { // If these flags need to be runtime configurable consider adding // them to ConfigProto. 
- static const bool FLAGS_brain_mem_reg_cuda_dma = true; + static const bool FLAGS_brain_mem_reg_gpu_dma = true; static const bool FLAGS_brain_gpu_record_mem_types = false; // Helper method for unit tests to reset the ProcessState singleton by diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD index 845fe0ec047..3ad58794823 100644 --- a/tensorflow/core/platform/default/build_config/BUILD +++ b/tensorflow/core/platform/default/build_config/BUILD @@ -34,7 +34,10 @@ cc_library( tf_cuda_library( name = "stream_executor", - cuda_deps = ["//tensorflow/stream_executor/cuda:cuda_activation"], + cuda_deps = [ + "//tensorflow/stream_executor/cuda:cuda_activation", + "//tensorflow/stream_executor/rocm:rocm_activation", + ], deps = [ "//tensorflow/stream_executor", "//tensorflow/stream_executor:dnn", diff --git a/tensorflow/core/platform/rocm.h b/tensorflow/core/platform/rocm.h new file mode 100644 index 00000000000..1896cc3d84c --- /dev/null +++ b/tensorflow/core/platform/rocm.h @@ -0,0 +1,22 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ROCM_H_ +#define TENSORFLOW_CORE_PLATFORM_ROCM_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/stream_executor/rocm/rocm_activation.h" + +#endif // TENSORFLOW_CORE_PLATFORM_ROCM_H_ diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD index cb39690ad1c..8c452a31d36 100644 --- a/tensorflow/stream_executor/rocm/BUILD +++ b/tensorflow/stream_executor/rocm/BUILD @@ -62,6 +62,20 @@ cc_library( ]), ) +cc_library( + name = "rocm_activation", + srcs = [], + hdrs = if_rocm_is_configured(["rocm_activation.h"]), + deps = if_rocm_is_configured([ + ":rocm_driver", + "@local_config_rocm//rocm:rocm_headers", + "//tensorflow/stream_executor", + "//tensorflow/stream_executor:stream_executor_internal", + "//tensorflow/stream_executor/gpu:gpu_activation", + "//tensorflow/stream_executor/platform", + ]), +) + cc_library( name = "rocm_event", srcs = if_rocm_is_configured(["rocm_event.cc"]), diff --git a/tensorflow/stream_executor/rocm/rocm_activation.h b/tensorflow/stream_executor/rocm/rocm_activation.h new file mode 100644 index 00000000000..03a0beaece4 --- /dev/null +++ b/tensorflow/stream_executor/rocm/rocm_activation.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file contains APIs that assume a StreamExecutor is backed by ROCM. +// It reaches into the ROCM implementation to activate an underlying ROCM +// context. +// +// Having this file separate from rocm/rocm_gpu_executor.h means that dependent +// code does not also have to depend on rocm.h. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_ACTIVATION_H_ +#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_ACTIVATION_H_ + +#include "tensorflow/stream_executor/gpu/gpu_activation.h" + +namespace stream_executor { + +class StreamExecutor; + +namespace rocm { + +using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext; + +} // namespace rocm +} // namespace stream_executor + +#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_ACTIVATION_H_
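Illustrative sketch (not part of the diff above): the core portability pattern this PR applies in tensorflow/core/common_runtime/gpu/gpu_device.cc is to select backend-specific types and runtime calls once, behind GOOGLE_CUDA / TENSORFLOW_USE_ROCM guards, and to write the rest of the file against neutral gpu* aliases. The standalone helper names below (GpuErrorString, GpuGetDevice) are hypothetical and exist only for this example; the sketch assumes the build defines exactly one of the two macros (--config=cuda or --config=rocm), just as the guarded code in the patch does, and uses the standard <hip/hip_runtime.h> include path rather than the TF-internal "rocm/include/..." one.

#if GOOGLE_CUDA
#include <cuda_runtime.h>
// CUDA build: map the neutral aliases onto the CUDA runtime types.
typedef cudaStream_t gpuStream_t;
typedef cudaError_t gpuError_t;
#elif TENSORFLOW_USE_ROCM
#include <hip/hip_runtime.h>
// ROCm build: the same aliases map onto the HIP runtime types.
typedef hipStream_t gpuStream_t;
typedef hipError_t gpuError_t;
#endif

#include <string>

// Hypothetical helper: readable error text for either backend, mirroring the
// cudaGetErrorString()/hipGetErrorString() branches used in the patch.
inline std::string GpuErrorString(gpuError_t err) {
#if GOOGLE_CUDA
  return cudaGetErrorString(err);
#elif TENSORFLOW_USE_ROCM
  return hipGetErrorString(err);
#endif
}

// Hypothetical helper: backend-neutral "get current device", mirroring the
// cudaGetDevice()/hipGetDevice() branches in BaseGPUDeviceFactory::CreateDevices().
inline gpuError_t GpuGetDevice(int* device) {
#if GOOGLE_CUDA
  return cudaGetDevice(device);
#elif TENSORFLOW_USE_ROCM
  return hipGetDevice(device);
#endif
}

The patch uses the same approach for the Eigen stream device (gpuStream_t, gpuDeviceProp_t), for ScopedActivateExecutorContext (se::cuda:: vs. se::rocm::), and for the implicit runtime initialization loop in BaseGPUDeviceFactory::CreateDevices(), so that device-facing call sites compile unchanged for either backend.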