PR #25011: [GPU][ROCm][CUDA] StreamExecutor logic for ROCm / CUDA platform (PR 20709 / 22669 / 24156 continued)

Please approve this CL. It will be submitted automatically, and its GitHub pull request will be marked as merged.

Imported from GitHub PR #25011

New PR to continue the efforts started by @deven-amd in #20709 / #22669 / #24156.

This PR aims to refactor the StreamExecutor GPU interfaces so they can be shared between CUDA and ROCm. It is the first in a series of PRs.

Based on @timshen91's input, I've refactored the logic in #24156 so that it:

- only contains changes in stream_executor/....
- does not remove any stream_executor/cuda/*.h, so that code outside of stream_executor doesn't break. All types and functions in namespace cuda are now aliases of their namespace gpu counterparts, for example namespace cuda { using CUDADriver = gpu::GpuDriver; } (a fuller sketch follows this list).
- restricts all stream_executor/gpu/BUILD targets to be visible only to //third_party/tensorflow/stream_executor:__subpackages__.
- ensures a target stream_executor/gpu:X is used only by stream_executor/cuda:cuda_X or stream_executor/rocm:rocm_X, never by some other cuda_Y. For example, cuda:cuda_platform should depend on cuda:cuda_driver, not on gpu:gpu_driver.
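As a concrete illustration of the aliasing pattern above, here is a minimal sketch of what a header such as stream_executor/cuda/cuda_driver.h looks like after the refactor. The CUDADriver alias is the example quoted above; the stream_executor/gpu/gpu_driver.h include and the shared gpu::GpuDriver class are the ones this PR introduces, and everything else is illustrative rather than an exact copy of the changed file.

    // cuda_driver.h (sketch): the CUDA-specific header now only forwards to
    // the shared GPU implementation instead of declaring its own types.
    #include "tensorflow/stream_executor/gpu/gpu_driver.h"

    namespace stream_executor {
    namespace cuda {

    // Keep the old CUDA-named type so callers outside stream_executor that
    // still spell cuda::CUDADriver continue to compile unchanged.
    using CUDADriver = gpu::GpuDriver;

    }  // namespace cuda
    }  // namespace stream_executor

The same pattern is visible further down in this diff for cuda_activation.h (ScopedActivateExecutorContext) and cuda_diagnostics.h (DriverVersion, Diagnostician).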

Copybara import of the project:

  - 267affbb73df9164baf4e62142fe7201e6a305ee [ROCm][CUDA] StreamExecutor logic for ROCm / CUDA platform by Wen-Heng (Jack) Chung <whchung@gmail.com>
  - 04fac5bf358059bdb2cd4a3e092e52dc982ea7b0 Merge 267affbb73df9164baf4e62142fe7201e6a305ee into 5f8ea... by Wen-Heng (Jack) Chung <whchung@gmail.com>

COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/tensorflow/pull/25011 from ROCmSoftwarePlatform:google-upstream-pr-stream-executor-alt 267affbb73df9164baf4e62142fe7201e6a305ee
PiperOrigin-RevId: 231250990
commit aba83497f5 (parent 56c3ac7d23)
Tim Shen, 2019-01-28 11:00:39 -08:00, committed by TensorFlower Gardener
71 changed files with 6790 additions and 2324 deletions


@ -343,6 +343,13 @@ config_setting(
},
)
config_setting(
name = "using_rocm_hipcc",
define_values = {
"using_rocm_hipcc": "true",
},
)
config_setting(
name = "with_mpi_support",
values = {"define": "with_mpi_support=true"},


@ -1964,6 +1964,14 @@ cc_library(
],
)
cc_library(
name = "rocm",
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core/platform/default/build_config:rocm",
],
)
# -----------------------------------------------------------------------------
# Clif-related proto libraries.


@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "if_windows")
load("//tensorflow:tensorflow.bzl", "if_not_windows")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load(
"//third_party/mkl:build_defs.bzl",
"if_mkl_ml",
@ -735,6 +736,11 @@ def tf_additional_binary_deps():
"//tensorflow/stream_executor:cuda_platform",
"//tensorflow/core/platform/default/build_config:cuda",
],
) + if_rocm(
[
"//tensorflow/stream_executor:rocm_platform",
"//tensorflow/core/platform/default/build_config:rocm",
],
) + [
# TODO(allenl): Split these out into their own shared objects (they are
# here because they are shared between contrib/ op shared objects and


@ -8,6 +8,7 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
load("//tensorflow:tensorflow.bzl", "if_cuda")
load("//tensorflow:tensorflow.bzl", "if_rocm")
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
@ -42,6 +43,7 @@ tf_cuda_library(
"//tensorflow/stream_executor/cuda:cuda_platform_id",
"//tensorflow/stream_executor/host:host_platform_id",
"//tensorflow/stream_executor/platform:dso_loader",
"//tensorflow/stream_executor/rocm:rocm_platform_id",
] + select({
"@local_config_cuda//cuda:darwin": ["IOKit"],
"//conditions:default": [],
@ -50,6 +52,7 @@ tf_cuda_library(
"//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
"//tensorflow:using_cuda_clang_with_dynamic_build": [],
"//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
"//tensorflow:using_rocm_hipcc": ["//tensorflow/stream_executor/rocm:all_runtime"],
"//conditions:default": [],
}),
)
@ -67,6 +70,18 @@ cc_library(
}),
)
cc_library(
name = "stream_executor_rocm",
deps = [
":stream_executor_no_cuda",
":rocm",
] + if_static(
["//tensorflow/stream_executor/rocm:all_runtime"],
) + select({
"//conditions:default": [],
}),
)
cc_library(
name = "stream_executor_no_cuda",
deps = [
@ -79,6 +94,7 @@ cc_library(
"//tensorflow/stream_executor/host:host_platform",
"//tensorflow/stream_executor/host:host_platform_id",
"//tensorflow/stream_executor/platform:dso_loader",
"//tensorflow/stream_executor/rocm:rocm_platform_id",
],
)
@ -267,6 +283,17 @@ cc_library(
],
)
cc_library(
name = "rocm",
data = [],
linkopts = select({
"//conditions:default": [
"-Wl,-rpath,../local_config_rocm/rocm/rocm/lib",
],
}),
deps = [],
)
cc_library(
name = "sycl",
data = if_ccpp([


@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"


@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"


@ -654,3 +654,8 @@ alias(
name = "cuda_platform",
actual = "//tensorflow/stream_executor/cuda:all_runtime",
)
alias(
name = "rocm_platform",
actual = "//tensorflow/stream_executor/rocm:all_runtime",
)


@ -66,6 +66,7 @@ cc_library(
deps = if_cuda_is_configured([
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -85,6 +86,7 @@ cc_library(
"@com_google_absl//absl/strings",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:device_options",
"//tensorflow/stream_executor/gpu:gpu_driver_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -97,18 +99,22 @@ cc_library(
name = "cuda_activation_header",
hdrs = ["cuda_activation.h"],
visibility = ["//visibility:public"],
deps = ["//tensorflow/stream_executor/platform"],
deps = [
"//tensorflow/stream_executor/gpu:gpu_activation_header",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "cuda_activation",
srcs = if_cuda_is_configured(["cuda_activation.cc"]),
srcs = [],
hdrs = if_cuda_is_configured(["cuda_activation.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/gpu:gpu_activation",
"//tensorflow/stream_executor/platform",
]),
)
@ -120,6 +126,7 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_kernel",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -133,10 +140,10 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_activation",
":cuda_gpu_executor",
":cuda_helpers",
":cuda_platform_id",
":cuda_stream",
":cuda_timer",
":cuda_helpers",
"@com_google_absl//absl/strings",
"//third_party/eigen3",
"@local_config_cuda//cuda:cuda_headers",
@ -147,6 +154,7 @@ cc_library(
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:scratch_allocator",
"//tensorflow/stream_executor:timer",
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -162,14 +170,15 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_activation_header",
":cuda_gpu_executor_header",
":cuda_helpers",
":cuda_platform_id",
":cuda_stream",
":cuda_helpers",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:fft",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:scratch_allocator",
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -223,13 +232,15 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_activation",
":cuda_gpu_executor",
":cuda_helpers",
":cuda_platform_id",
":cuda_stream",
":cuda_helpers",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:rng",
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
"//tensorflow/stream_executor/gpu:gpu_rng_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -239,12 +250,14 @@ cc_library(
cc_library(
name = "cuda_kernel",
srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
hdrs = if_cuda_is_configured(["cuda_kernel.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor/gpu:gpu_kernel_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -254,6 +267,9 @@ cc_library(
cc_library(
name = "cuda_helpers",
textual_hdrs = if_cuda_is_configured(["cuda_helpers.h"]),
deps = if_cuda_is_configured([
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
]),
)
cc_library(
@ -265,19 +281,22 @@ cc_library(
":cuda_gpu_executor_header",
":cuda_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/gpu:gpu_event",
"//tensorflow/stream_executor/gpu:gpu_stream_header",
"//tensorflow/stream_executor/lib",
]),
)
cc_library(
name = "cuda_stream",
srcs = if_cuda_is_configured(["cuda_stream.cc"]),
srcs = [],
hdrs = if_cuda_is_configured(["cuda_stream.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
":cuda_gpu_executor_header",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor:stream_header",
"//tensorflow/stream_executor/gpu:gpu_stream",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -285,18 +304,18 @@ cc_library(
cc_library(
name = "cuda_timer",
srcs = if_cuda_is_configured(["cuda_timer.cc"]),
srcs = [],
hdrs = if_cuda_is_configured(["cuda_timer.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
":cuda_gpu_executor_header",
":cuda_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/gpu:gpu_timer",
"//tensorflow/stream_executor/lib",
]),
)
# It implements :cuda_gpu_executor_header
cc_library(
name = "cuda_gpu_executor",
srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
@ -316,6 +335,7 @@ cc_library(
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor:timer",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",


@ -17,13 +17,13 @@ limitations under the License.
// It reaches into the CUDA implementation to activate an underlying CUDA
// context.
//
// Having this file separate from cuda_gpu_executor.h means that dependent
// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
// code does not also have to depend on cuda.h.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
namespace stream_executor {
@ -31,29 +31,7 @@ class StreamExecutor;
namespace cuda {
class CUDAExecutor;
class ScopedActivateContext;
// Activates a CUDA context within an enclosing scope.
class ScopedActivateExecutorContext {
public:
// Form that takes a CUDA executor implementation.
explicit ScopedActivateExecutorContext(CUDAExecutor* cuda_exec);
// Form that takes a pImpl executor and extracts a CUDA implementation --
// fatal failure if it is not CUDA inside.
explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
~ScopedActivateExecutorContext();
private:
// The cuda.h-using datatype that we wrap.
ScopedActivateContext* driver_scoped_activate_context_;
SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
};
using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;
} // namespace cuda
} // namespace stream_executor

File diff suppressed because it is too large.


@ -33,26 +33,26 @@ namespace stream_executor {
class Stream;
namespace cuda {
namespace gpu {
// Opaque and unique identifier for the cuBLAS plugin.
extern const PluginId kCuBlasPlugin;
class CUDAExecutor;
class GpuExecutor;
// BLAS plugin for CUDA platform via cuBLAS library.
//
// This satisfies the platform-agnostic BlasSupport interface.
//
// Note that the cuBLAS handle that this encapsulates is implicitly tied to the
// context (and, as a result, the device) that the parent CUDAExecutor is tied
// context (and, as a result, the device) that the parent GpuExecutor is tied
// to. This simply happens as an artifact of creating the cuBLAS handle when a
// CUDA context is active.
//
// Thread-safe post-initialization.
class CUDABlas : public blas::BlasSupport {
public:
explicit CUDABlas(CUDAExecutor *parent);
explicit CUDABlas(GpuExecutor *parent);
// Allocates a cuBLAS handle.
bool Init();
@ -145,9 +145,9 @@ class CUDABlas : public blas::BlasSupport {
// mutex that guards the cuBLAS handle for this device.
mutex mu_;
// CUDAExecutor which instantiated this CUDABlas.
// GpuExecutor which instantiated this CUDABlas.
// Immutable post-initialization.
CUDAExecutor *parent_;
GpuExecutor *parent_;
// cuBLAS library handle on the device.
cublasHandle_t blas_ GUARDED_BY(mu_);
@ -155,7 +155,7 @@ class CUDABlas : public blas::BlasSupport {
SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
};
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_


@ -52,13 +52,6 @@ limitations under the License.
namespace stream_executor {
namespace cuda {
#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
string DriverVersionToString(DriverVersion version) {
return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
}
@ -112,6 +105,18 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
return result;
}
} // namespace cuda
} // namespace stream_executor
namespace stream_executor {
namespace gpu {
#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
// -- class Diagnostician
string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@ -190,11 +195,11 @@ void Diagnostician::LogDiagnosticInformation() {
}
port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
LOG(INFO) << "libcuda reported version is: "
<< DriverVersionStatusToString(dso_version);
<< cuda::DriverVersionStatusToString(dso_version);
port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
LOG(INFO) << "kernel reported version is: "
<< DriverVersionStatusToString(kernel_version);
<< cuda::DriverVersionStatusToString(kernel_version);
#endif
// OS X kernel driver does not report version accurately
@ -232,7 +237,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
}
const size_t length = suffix_pos - start;
const string version = path.substr(start, length);
result = StringToDriverVersion(version);
result = cuda::StringToDriverVersion(version);
}
#else
#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
@ -260,7 +265,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
*result = StringToDriverVersion(stripped_dso_version);
*result = cuda::StringToDriverVersion(stripped_dso_version);
return 1;
}
return 0;
@ -292,7 +297,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_kernel_version =
port::StripSuffixString(kernel_version, ".ld64");
return StringToDriverVersion(stripped_kernel_version);
return cuda::StringToDriverVersion(stripped_kernel_version);
}
void Diagnostician::WarnOnDsoKernelMismatch(
@ -301,12 +306,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
if (kernel_version.ok() && dso_version.ok() &&
dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
LOG(INFO) << "kernel version seems to match DSO: "
<< DriverVersionToString(kernel_version.ValueOrDie());
<< cuda::DriverVersionToString(kernel_version.ValueOrDie());
} else {
LOG(ERROR) << "kernel version "
<< DriverVersionStatusToString(kernel_version)
<< cuda::DriverVersionStatusToString(kernel_version)
<< " does not match DSO version "
<< DriverVersionStatusToString(dso_version)
<< cuda::DriverVersionStatusToString(dso_version)
<< " -- cannot find working devices in this configuration";
}
}
@ -336,9 +341,9 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
// see
// https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
if (version == NULL) {
return StringToDriverVersion("");
return cuda::StringToDriverVersion("");
}
return StringToDriverVersion(version);
return cuda::StringToDriverVersion(version);
}
CFRelease(kext_infos);
auto status = port::Status(
@ -387,6 +392,5 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
#endif
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -16,17 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#include "tensorflow/stream_executor/platform/port.h"
#include <tuple>
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
namespace stream_executor {
namespace cuda {
// e.g. DriverVersion{346, 3, 4}
using DriverVersion = std::tuple<int, int, int>;
using DriverVersion = gpu::DriverVersion;
// Converts a parsed driver version to string form.
string DriverVersionToString(DriverVersion version);
@ -35,61 +31,9 @@ string DriverVersionToString(DriverVersion version);
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
class Diagnostician {
public:
// Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
// not initializing).
//
// Note: if we're running on a machine that has no GPUs, we don't want to
// produce very much log spew beyond saying, "looks like there's no CUDA
// kernel
// module running".
//
// Note: we use non-Google-File:: API here because we may be called before
// InitGoogle has completed.
static void LogDiagnosticInformation();
// Given the driver version file contents, finds the kernel module version and
// returns it as a string.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static port::StatusOr<DriverVersion> FindKernelModuleVersion(
const string &driver_version_file_contents);
// Extracts the kernel driver version from the current host.
static port::StatusOr<DriverVersion> FindKernelDriverVersion();
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a string.
static port::StatusOr<DriverVersion> FindDsoVersion();
// Logs information about the kernel driver version and userspace driver
// library version.
static void LogDriverVersionInformation();
private:
// Given the DSO version number and the driver version file contents, extracts
// the driver version and compares, warning the user in the case of
// incompatibility.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static void WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> dso_version,
port::StatusOr<DriverVersion> kernel_version);
// Logs information about the dev nodes present on this machine: their
// existence, permissions, accessibility from this uid/gid.
static void LogDevNodeDiagnosticInformation();
static string GetDevNodePath(int dev_node_ordinal);
SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};
using Diagnostician = gpu::Diagnostician;
} // namespace cuda
} // namespace stream_executor


@ -58,7 +58,7 @@ limitations under the License.
#pragma clang diagnostic warning "-Wmismatched-tags"
namespace stream_executor {
namespace cuda {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
@ -137,7 +137,7 @@ class CudnnHandle {
public:
// Takes ownership of the executor context and the lock to access cuDNN
// using handle.
CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
cudnnHandle_t handle)
: context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
@ -146,7 +146,7 @@ class CudnnHandle {
cudnnHandle_t handle() const { return handle_; }
private:
cuda::ScopedActivateExecutorContext context_;
gpu::ScopedActivateExecutorContext context_;
mutex_lock lock_;
cudnnHandle_t handle_; // Not owned.
};
@ -334,10 +334,10 @@ class CudnnAccess {
// The legacy default stream synchronizes with all other streams and it is
// therefore a bad idea (performance wise) to call any cuDNN APIs that
// enqueue work in the stream.
CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
mutex_lock lock(mutex_);
cuda::ScopedActivateExecutorContext context(executor);
CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
gpu::ScopedActivateExecutorContext context(executor);
CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
auto status = cudnnSetStream(handle_, cu_stream);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
return CudnnHandle(std::move(context), std::move(lock), handle_);
@ -448,7 +448,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
} // namespace
CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {}
port::Status CudnnSupport::Init() {
ScopedActivateExecutorContext context(parent_);
@ -481,14 +481,14 @@ port::Status CudnnSupport::Init() {
CHECK_EQ(cudnn_handle, nullptr);
LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
if (status == CUDNN_STATUS_NOT_INITIALIZED) {
auto result = cuda::Diagnostician::FindKernelDriverVersion();
auto result = gpu::Diagnostician::FindKernelDriverVersion();
if (!result.ok()) {
LOG(ERROR) << "Error retrieving driver version: "
<< DriverVersionStatusToString(result);
<< cuda::DriverVersionStatusToString(result);
} else {
const auto& version = result.ValueOrDie();
LOG(ERROR) << "Possibly insufficient driver version: "
<< DriverVersionToString(version);
<< cuda::DriverVersionToString(version);
}
}
@ -1151,7 +1151,7 @@ class CudnnRnnParamsDescriptor {
} // namespace
class CudnnRnnDescriptor : public dnn::RnnDescriptor {
CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc,
PersistentRnnPlan rnn_plan, int num_layers,
int hidden_size, int input_size, int batch_size,
cudnnRNNInputMode_t input_mode,
@ -1191,7 +1191,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
CudnnDropoutDescriptor dropout_desc,
CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
gpu::RnnDescriptor rnn_desc = CreateRnnDescriptor();
cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
// TODO: allow the user to choose an algorithm.
@ -1282,7 +1282,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
}
private:
cuda::RnnDescriptor rnn_desc_;
gpu::RnnDescriptor rnn_desc_;
PersistentRnnPlan rnn_plan_;
int num_layers_;
int hidden_size_;
@ -1401,15 +1401,14 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
class CudnnRnnSequenceTensorDescriptor
: public dnn::RnnSequenceTensorDescriptor {
CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int max_seq_length,
CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length,
int batch_size, int data_size,
cudnnDataType_t data_type,
#if CUDNN_VERSION >= 7201
RNNDataDescriptor data_handle,
#endif
TensorDescriptor handle)
: parent_(parent),
max_seq_length_(max_seq_length),
: max_seq_length_(max_seq_length),
batch_size_(batch_size),
data_size_(data_size),
data_type_(data_type),
@ -1425,7 +1424,7 @@ class CudnnRnnSequenceTensorDescriptor
default;
static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
cudnnDataType_t data_type) {
CHECK_GT(max_seq_length, 0);
int dims[] = {batch_size, data_size, 1};
@ -1444,7 +1443,7 @@ class CudnnRnnSequenceTensorDescriptor
}
static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
const absl::Span<const int>& seq_lengths, cudnnDataType_t data_type) {
#if CUDNN_VERSION >= 7201
CHECK_GT(max_seq_length, 0);
@ -1496,7 +1495,6 @@ class CudnnRnnSequenceTensorDescriptor
}
private:
CUDAExecutor* parent_;
int max_seq_length_;
int batch_size_;
int data_size_;
@ -1511,11 +1509,10 @@ class CudnnRnnSequenceTensorDescriptor
class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
public:
CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
CudnnRnnStateTensorDescriptor(GpuExecutor* parent, int num_layers,
int batch_size, int data_size,
cudnnDataType_t data_type)
: parent_(parent),
handle_(CreateTensorDescriptor()),
: handle_(CreateTensorDescriptor()),
num_layers_(num_layers),
batch_size_(batch_size),
data_size_(data_size),
@ -1535,7 +1532,6 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
int data_size() const { return data_size_; }
private:
CUDAExecutor* parent_;
TensorDescriptor handle_;
int num_layers_;
int batch_size_;
@ -1699,14 +1695,14 @@ port::Status CudnnSupport::DoRnnForwardImpl(
}
}
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
const bool is_profiling = output_profile_result != nullptr;
if (is_profiling) {
timer.reset(new CUDATimer(parent_));
timer.reset(new GpuTimer(parent_));
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -1791,7 +1787,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
}
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@ -1842,14 +1838,14 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
workspace_allocator));
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
const bool is_profiling = output_profile_result != nullptr;
if (is_profiling) {
timer.reset(new CUDATimer(parent_));
timer.reset(new GpuTimer(parent_));
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -1948,7 +1944,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
}
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@ -2915,13 +2911,13 @@ port::Status CudnnSupport::DoConvolve(
const bool is_profiling = output_profile_result != nullptr;
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (is_profiling) {
timer.reset(new CUDATimer(parent_)); // NOLINT
timer.reset(new GpuTimer(parent_)); // NOLINT
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -3110,7 +3106,7 @@ port::Status CudnnSupport::DoConvolve(
}
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
output_profile_result->set_algorithm(algorithm_desc);
@ -3175,13 +3171,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
output_nd, scratch_allocator, &scratch));
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (is_profiling) {
timer.reset(new CUDATimer(parent_)); // NOLINT
timer.reset(new GpuTimer(parent_)); // NOLINT
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -3234,7 +3230,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
/*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
output_profile_result->set_algorithm(algo_desc);
@ -4339,22 +4335,22 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
return IsStatusOk(status, /*report_error=*/true);
}
} // namespace cuda
} // namespace gpu
void initialize_cudnn() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
cuda::kCudaPlatformId, gpu::kCuDnnPlugin, "cuDNN",
[](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
cuda::CUDAExecutor* cuda_executor =
dynamic_cast<cuda::CUDAExecutor*>(parent);
gpu::GpuExecutor* cuda_executor =
dynamic_cast<gpu::GpuExecutor*>(parent);
if (cuda_executor == nullptr) {
LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
<< "support library with a non-CUDA StreamExecutor";
return nullptr;
}
cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
gpu::CudnnSupport* dnn = new gpu::CudnnSupport(cuda_executor);
if (!dnn->Init().ok()) {
// Note: Init() will log a more specific error.
delete dnn;
@ -4369,7 +4365,7 @@ void initialize_cudnn() {
}
PluginRegistry::Instance()->SetDefaultFactory(
cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
cuda::kCudaPlatformId, PluginKind::kDnn, gpu::kCuDnnPlugin);
}
} // namespace stream_executor


@ -28,9 +28,9 @@ limitations under the License.
#include "tensorflow/stream_executor/temporary_device_memory.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
class CUDAExecutor;
class GpuExecutor;
class CudnnRnnDescriptor;
class CudnnRnnSequenceTensorDescriptor;
class CudnnRnnStateTensorDescriptor;
@ -42,7 +42,7 @@ extern const PluginId kCuDnnPlugin;
// functions, see dnn.h.
class CudnnSupport : public dnn::DnnSupport {
public:
explicit CudnnSupport(CUDAExecutor* parent);
explicit CudnnSupport(GpuExecutor* parent);
port::Status Init() override;
port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@ -552,7 +552,7 @@ class CudnnSupport : public dnn::DnnSupport {
DeviceMemoryBase* output_data) override;
private:
CUDAExecutor* parent_; // Parent executor object. Not owned.
GpuExecutor* parent_; // Parent executor object. Not owned.
// Provides access to the cuDNN handle.
std::unique_ptr<class CudnnAccess> cudnn_;
@ -667,7 +667,7 @@ class CudnnSupport : public dnn::DnnSupport {
SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
};
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_


@ -45,21 +45,20 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyCudaContext = false;
constexpr bool kVerifyGpuContext = false;
namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {
// Manages the singleton map of contexts that we've created, mapping
// from the CUcontext to the CudaContext* that we pass around internally.
// This also manages assignment of unique ids to CudaContexts, to allow
// from the CUcontext to the GpuContext* that we pass around internally.
// This also manages assignment of unique ids to GpuContexts, to allow
// for fast comparison of a context against the current context.
//
// CUDA-runtime-created contexts are avoided, if triple angle
// brace launches are required, by using the scoped activations in
// cuda_activation.h.
// gpu/gpu_activation.h.
class CreatedContexts {
public:
// Returns whether context is a member of the live set.
@ -69,14 +68,14 @@ class CreatedContexts {
}
// Adds context to the live set, or returns it if it's already present.
static CudaContext* Add(CUcontext context) {
static GpuContext* Add(CUcontext context) {
CHECK(context != nullptr);
mutex_lock lock(mu_);
auto insert_result = Live()->insert(std::make_pair(context, nullptr));
auto it = insert_result.first;
if (insert_result.second) {
// context was not present in the map. Add it.
it->second = MakeUnique<CudaContext>(context, next_id_++);
it->second = MakeUnique<GpuContext>(context, next_id_++);
}
return it->second.get();
}
@ -92,9 +91,9 @@ class CreatedContexts {
private:
// Returns the live map singleton.
static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
static auto singleton =
new std::map<CUcontext, std::unique_ptr<CudaContext>>;
new std::map<CUcontext, std::unique_ptr<GpuContext>>;
return singleton;
}
@ -123,7 +122,7 @@ string ToString(CUresult result) {
// created by StreamExecutor (to ensure that the CUDA runtime didn't create a
// context behind our backs).
CUcontext CurrentContext() {
CUcontext current = CUDADriver::CurrentContextOrDie();
CUcontext current = cuda::CurrentContextOrDie();
if (current != nullptr && !CreatedContexts::Has(current)) {
LOG(FATAL) << "current context was not created by the StreamExecutor "
"cuda_driver API: "
@ -177,7 +176,7 @@ void SynchronizeOrDie() {
struct ThreadLocalData {
int64 id;
CudaContext* context; // Only valid if id == a known good context.
GpuContext* context; // Only valid if id == a known good context.
int depth;
};
@ -185,13 +184,13 @@ SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
} // namespace
ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
auto* tls = &tls_data.get();
tls->depth++;
if (tls->id == cuda_context->id()) {
if (kVerifyCudaContext) {
if (kVerifyGpuContext) {
CHECK_EQ(CurrentContext(), cuda_context->context());
}
DCHECK_EQ(CurrentContext(), cuda_context->context());
@ -215,8 +214,8 @@ ScopedActivateContext::~ScopedActivateContext() {
auto* tls = &tls_data.get();
if (kVerifyCudaContext) {
// Note that if kVerifyCudaContext is used, and contexts are deleted, it's
if (kVerifyGpuContext) {
// Note that if kVerifyGpuContext is used, and contexts are deleted, it's
// possible this could fail in the CurrentContext() call.
CHECK_EQ(CurrentContext(),
tls->context == nullptr ? nullptr : tls->context->context());
@ -242,7 +241,7 @@ namespace {
// logging purposes. Returns "?" if the device could not be successfully
// queried.
string CUDAPointerToDeviceString(CUdeviceptr pointer) {
auto value = CUDADriver::GetPointerDevice(pointer);
auto value = GpuDriver::GetPointerDevice(pointer);
if (value.ok()) {
return absl::StrCat(value.ValueOrDie());
}
@ -254,7 +253,7 @@ string CUDAPointerToDeviceString(CUdeviceptr pointer) {
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
auto value = CUDADriver::GetPointerMemorySpace(pointer);
auto value = GpuDriver::GetPointerMemorySpace(pointer);
if (value.ok()) {
return MemorySpaceString(value.ValueOrDie());
}
@ -267,20 +266,20 @@ string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
auto from_context = CUDADriver::GetPointerContext(from);
auto from_context = GpuDriver::GetPointerContext(from);
if (!from_context.ok()) {
LOG(ERROR) << "could not retrieve source pointer's context: "
<< from_context.status();
return "error";
}
auto to_context = CUDADriver::GetPointerContext(to);
auto to_context = GpuDriver::GetPointerContext(to);
if (!to_context.ok()) {
LOG(ERROR) << "could not retrieve destination pointer's context: "
<< to_context.status();
return "error";
}
return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
to_context.ValueOrDie())
return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
to_context.ValueOrDie())
? "true"
: "false";
}
@ -308,9 +307,9 @@ static port::Status InternalInit() {
} // namespace
/* static */ port::Status CUDADriver::Init() {
/* static */ port::Status GpuDriver::Init() {
// Cached return value from calling InternalInit(), as cuInit need only be
// called once, but CUDADriver::Init may be called many times.
// called once, but GpuDriver::Init may be called many times.
static port::Status init_retval;
static bool set = false;
static mutex *init_mu = new mutex;
@ -324,8 +323,8 @@ static port::Status InternalInit() {
return init_retval;
}
/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
CUdevice *device) {
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
CUdevice* device) {
CUresult res = tensorflow::wrap::cuDeviceGet(device, device_ordinal);
if (res == CUDA_SUCCESS) {
return port::Status::OK();
@ -336,8 +335,8 @@ static port::Status InternalInit() {
absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
}
/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
string *device_name) {
/* static */ bool GpuDriver::GetDeviceName(CUdevice device,
string* device_name) {
static const size_t kCharLimit = 64;
absl::InlinedVector<char, 4> chars(kCharLimit);
CUresult res =
@ -376,9 +375,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
return true;
}
/* static */ port::Status CUDADriver::CreateContext(
CUdevice device, const DeviceOptions &device_options,
CudaContext **context) {
/* static */ port::Status GpuDriver::CreateContext(
int device_ordinal, CUdevice device, const DeviceOptions& device_options,
GpuContext** context) {
*context = nullptr;
int flags = 0;
@ -407,7 +406,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
}
}
former_context = CUDADriver::CurrentContextOrDie();
former_context = cuda::CurrentContextOrDie();
res = tensorflow::wrap::cuDevicePrimaryCtxRetain(&new_context, device);
if (former_context != nullptr) {
CUdevice former_device;
@ -454,7 +453,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
return port::Status(port::error::INTERNAL, message);
}
/* static */ void CUDADriver::DestroyContext(CudaContext* context) {
/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
if (context == nullptr) {
return;
}
@ -473,9 +472,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
CreatedContexts::Remove(context->context());
}
/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
CUfunction func,
int *attribute_value) {
/* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
CUfunction func,
int* attribute_value) {
CUresult res =
tensorflow::wrap::cuFuncGetAttribute(attribute_value, attribute, func);
if (res != CUDA_SUCCESS) {
@ -486,8 +485,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
return true;
}
/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config) {
/* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config) {
CUresult res = tensorflow::wrap::cuFuncSetCacheConfig(function, cache_config);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
@ -499,7 +498,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
}
/* static */ port::StatusOr<CUsharedconfig>
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
CUsharedconfig shared_mem_config;
ScopedActivateContext activation(context);
CUresult result =
@ -517,8 +516,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return shared_mem_config;
}
/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config) {
/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
GpuContext* context, CUsharedconfig shared_mem_config) {
ScopedActivateContext activation(context);
CUresult result =
tensorflow::wrap::cuCtxSetSharedMemConfig(shared_mem_config);
@ -536,12 +535,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::LaunchKernel(
CudaContext* context, CUfunction function, unsigned int grid_dim_x,
/* static */ bool GpuDriver::LaunchKernel(
GpuContext* context, CUfunction function, unsigned int grid_dim_x,
unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
void **extra) {
unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
void** extra) {
ScopedActivateContext activation(context);
VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
<< " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
@ -559,9 +558,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
const char *cubin_bytes,
CUmodule *module) {
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
const char* cubin_bytes,
CUmodule* module) {
ScopedActivateContext activation(context);
CUresult result =
tensorflow::wrap::cuModuleLoadFatBinary(module, cubin_bytes);
@ -573,9 +572,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::LoadPtx(CudaContext* context,
const char *ptx_contents,
CUmodule *module) {
/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
const char* ptx_contents,
CUmodule* module) {
port::Notification notification;
bool ret = true;
GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
@ -643,9 +642,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return ret;
}
/* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location,
uint8 value, size_t size) {
/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
const char* hsaco_contents,
CUmodule* module) {
LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
return false;
}
/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
CUdeviceptr location,
uint8 value, size_t size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemsetD8(location, value, size);
if (res != CUDA_SUCCESS) {
@ -655,10 +661,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count) {
/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemsetD32(location, value, uint32_count);
if (res != CUDA_SUCCESS) {
@ -668,11 +674,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location,
uint8 value,
size_t uint32_count,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
CUdeviceptr location,
uint8 value,
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemsetD8Async(location, value, uint32_count, stream);
@ -684,11 +690,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemsetD32Async(location, value, uint32_count, stream);
@ -700,10 +706,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
CUstream stream,
StreamCallback callback,
void *data) {
/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
CUstream stream,
StreamCallback callback,
void* data) {
// Note: flags param is required to be zero according to CUDA 6.0.
CUresult res = tensorflow::wrap::cuStreamAddCallback(stream, callback, data,
0 /* = flags */);
@ -714,10 +720,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
CUmodule module,
const char *kernel_name,
CUfunction *function) {
/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
CUmodule module,
const char* kernel_name,
CUfunction* function) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && kernel_name != nullptr);
CUresult res =
@ -731,11 +737,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
CUmodule module,
const char *symbol_name,
CUdeviceptr *dptr,
size_t *bytes) {
/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
CUmodule module,
const char* symbol_name,
CUdeviceptr* dptr, size_t* bytes) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && symbol_name != nullptr &&
(dptr != nullptr || bytes != nullptr));
@ -752,8 +757,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ void CUDADriver::UnloadModule(CudaContext *context,
CUmodule module) {
/* static */ void GpuDriver::UnloadModule(GpuContext* context,
CUmodule module) {
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuModuleUnload(module);
if (res != CUDA_SUCCESS) {
@ -762,8 +767,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
CudaContext* context) {
/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
GpuContext* context) {
ScopedActivateContext activated{context};
CUdevice device = -1;
CUresult result = tensorflow::wrap::cuCtxGetDevice(&device);
@ -776,26 +781,26 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
absl::StrCat("failed to get device for context: ", ToString(result)));
}
/* static */ bool CUDADriver::CreateStream(CudaContext *context,
CUstream *out) {
/* static */ bool GpuDriver::CreateStream(GpuContext* context,
CUstream* stream) {
// TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
// up synchronization with respect to memsets and any other things that have
// to occur on the default stream?
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuStreamCreate(out, 0);
CUresult res = tensorflow::wrap::cuStreamCreate(stream, 0);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not allocate CUDA stream for context "
<< context->context() << ": " << ToString(res);
return false;
}
VLOG(2) << "successfully created stream " << *out << " for context "
VLOG(2) << "successfully created stream " << *stream << " for context "
<< context->context() << " on thread";
return true;
}
/* static */ void CUDADriver::DestroyStream(CudaContext* context,
CUstream *stream) {
/* static */ void GpuDriver::DestroyStream(GpuContext* context,
CUstream* stream) {
if (*stream == nullptr) {
return;
}
@ -812,8 +817,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
uint64 bytes) {
/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
uint64 bytes) {
ScopedActivateContext activated{context};
CUdeviceptr result = 0;
CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
@ -829,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return ptr;
}
/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
void *location) {
/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
CUresult res = tensorflow::wrap::cuMemFree(pointer);
@ -843,8 +848,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
uint64 bytes) {
/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
uint64 bytes) {
ScopedActivateContext activation(context);
CUdeviceptr result = 0;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
@ -861,8 +866,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return ptr;
}
/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
void *location) {
/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
CUresult res = tensorflow::wrap::cuMemFree(pointer);
@ -875,8 +880,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
uint64 bytes) {
/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
ScopedActivateContext activation(context);
void *host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
@ -889,8 +893,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return host_mem;
}
/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
void *location) {
/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemFreeHost(location);
if (res != CUDA_SUCCESS) {
@ -899,8 +903,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
uint64 bytes) {
/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
uint64 bytes) {
ScopedActivateContext activation(context);
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
CUresult res = tensorflow::wrap::cuMemHostRegister(
@ -913,8 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
void *location) {
/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemHostUnregister(location);
if (res != CUDA_SUCCESS) {
@ -925,8 +929,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
CUevent *event) {
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
CUevent* event) {
if (*event == nullptr) {
return port::Status(port::error::INVALID_ARGUMENT,
"input event cannot be null");
@ -953,9 +957,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
CUevent event,
CUstream stream) {
/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
CUevent event,
CUstream stream) {
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuEventRecord(event, stream);
switch (res) {
@ -975,8 +979,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
CudaContext *context, CUevent event) {
/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
CUevent event) {
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuEventQuery(event);
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
@ -988,9 +992,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return res;
}
/* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
float *elapsed_milliseconds,
CUevent start, CUevent stop) {
/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
float* elapsed_milliseconds,
CUevent start, CUevent stop) {
ScopedActivateContext activated{context};
// The stop event must have completed in order for cuEventElapsedTime to
// work.
@ -1009,9 +1013,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
CUstream stream,
CUevent event) {
/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
CUstream stream, CUevent event) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuStreamWaitEvent(stream, event, 0 /* = flags */);
@ -1023,7 +1026,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
@ -1035,8 +1038,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
CUstream stream) {
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
CUresult res = tensorflow::wrap::cuStreamSynchronize(stream);
@ -1051,8 +1054,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
CUstream stream) {
/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
CUresult res = tensorflow::wrap::cuStreamQuery(stream);
@ -1066,10 +1069,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return false;
}
/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
void *host_dst,
CUdeviceptr gpu_src,
uint64 size) {
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
void* host_dst,
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemcpyDtoH(host_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
@ -1084,10 +1087,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
CUdeviceptr gpu_dst,
const void *host_src,
uint64 size) {
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
CUdeviceptr gpu_dst,
const void* host_src,
uint64 size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemcpyHtoD(gpu_dst, host_src, size);
if (res != CUDA_SUCCESS) {
@ -1101,10 +1104,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size) {
/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemcpyDtoD(gpu_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
@ -1118,11 +1121,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
void *host_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
void* host_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
@ -1140,11 +1143,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
CUdeviceptr gpu_dst,
const void *host_src,
uint64 size,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
CUdeviceptr gpu_dst,
const void* host_src,
uint64 size,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
@ -1161,11 +1164,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult result =
tensorflow::wrap::cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
@ -1189,9 +1192,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
CUevent *result,
EventFlags flags) {
/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
CUevent* result,
EventFlags flags) {
int cuflags;
switch (flags) {
case EventFlags::kDefault:
@ -1219,7 +1222,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ int CUDADriver::GetDeviceCount() {
/* static */ int GpuDriver::GetDeviceCount() {
int device_count = 0;
CUresult res = tensorflow::wrap::cuDeviceGetCount(&device_count);
if (res != CUDA_SUCCESS) {
@ -1233,9 +1236,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return device_count;
}
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
CUdeviceptr pointer) {
CudaContext* context = nullptr;
GpuContext* context = nullptr;
CUresult result = tensorflow::wrap::cuPointerGetAttribute(
&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
if (result == CUDA_SUCCESS) {
@ -1249,7 +1252,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ToString(result)));
}
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
CUdeviceptr pointer) {
unsigned int value;
CUresult result = tensorflow::wrap::cuPointerGetAttribute(
@ -1273,9 +1276,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ToString(result)));
}
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr *base,
size_t *size) {
/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr* base,
size_t* size) {
CUresult result = tensorflow::wrap::cuMemGetAddressRange(base, size, dptr);
if (result == CUDA_SUCCESS) {
return port::Status::OK();
@ -1295,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
reinterpret_cast<void *>(dptr), ToString(result).c_str()));
}
/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
CUdeviceptr pointer) {
auto result = GetPointerContext(pointer);
if (!result.ok()) {
@ -1305,9 +1308,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return DeviceFromContext(result.ValueOrDie());
}
/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
int *cc_minor,
CUdevice device) {
/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
int* cc_minor,
CUdevice device) {
*cc_major = 0;
*cc_minor = 0;
@ -1334,6 +1337,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
CUdevice device) {
return port::Status{
port::error::INTERNAL,
"Feature not supported on CUDA platform (GetGpuISAVersion)"};
}
// Helper function that turns the integer output of cuDeviceGetAttribute to type
// T and wraps it in a StatusOr.
template <typename T>
@ -1352,49 +1362,49 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return converted;
}
/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
CUdevice device) {
return GetSimpleAttribute<int>(device,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
CUdevice device) {
return GetSimpleAttribute<int64>(
device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
CUdevice device) {
return GetSimpleAttribute<int64>(
device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
CUdevice device) {
return GetSimpleAttribute<int64>(
device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
CUdevice device) {
return GetSimpleAttribute<int64>(device,
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
CUdevice device) {
return GetSimpleAttribute<int64>(device,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
}
/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
CUdevice device) {
return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
}
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
CUdevice device) {
/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
CUdevice device) {
int value;
CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
@ -1422,7 +1432,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
CUresult res = tensorflow::wrap::cuDriverGetVersion(driver_version);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query driver version: " << ToString(res);
@ -1432,7 +1442,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
int device_ordinal) {
CUresult res = tensorflow::wrap::cuDeviceGetProperties(device_properties,
device_ordinal);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query device properties: " << ToString(res);
return false;
}
return true;
}
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
CUdevice_attribute attribute, CUdevice device) {
int val;
CUresult res =
@ -1446,7 +1468,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return val;
}
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
int value = -1;
CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
@ -1459,9 +1481,9 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
int64 *free_out,
int64 *total_out) {
/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
int64* free_out,
int64* total_out) {
ScopedActivateContext activation(context);
size_t free = 0;
size_t total = 0;
@ -1476,8 +1498,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
uint64 *result) {
/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
uint64* result) {
size_t value = -1;
CUresult res = tensorflow::wrap::cuDeviceTotalMem(&value, device);
if (res != CUDA_SUCCESS) {
@ -1489,7 +1511,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
/* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
string pci_bus_id;
static const int kBufferSize = 64;
absl::InlinedVector<char, 4> chars(kBufferSize);
@ -1504,8 +1526,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return pci_bus_id;
}
/* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
CudaContext* to) {
/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
GpuContext* to) {
if (from == to) {
return true; // A context can always access its own memory.
}
@ -1533,8 +1555,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return can_access_peer;
}
/* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
CudaContext* to) {
/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
GpuContext* to) {
if (from == to) {
return port::Status::OK(); // A context can always access its own memory.
}
@ -1553,8 +1575,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return port::Status::OK();
}
/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
CudaContext* context, CUfunction kernel, int threads_per_block,
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
GpuContext* context, CUfunction kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes) {
ScopedActivateContext activation(context);
@ -1572,11 +1594,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return max_blocks;
}
/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
} // namespace gpu
namespace cuda {
CUcontext CurrentContextOrDie() {
CUcontext current = nullptr;
CUresult result = tensorflow::wrap::cuCtxGetCurrent(&current);
if (result != CUDA_SUCCESS) {
LOG(FATAL) << "failed to query current context: " << ToString(result);
LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
}
return current;
}
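The changes to cuda_driver.cc above are largely a mechanical rename of CUDADriver/CudaContext to GpuDriver/GpuContext, plus a new cuDeviceGetProperties wrapper and a GetGpuISAVersion stub that reports the feature as unsupported on the CUDA platform. As a minimal sketch of how a call site reads after the rename (the helper name CopyDeviceToHost and its error handling are illustrative assumptions, not part of the change):

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

namespace stream_executor {

// Hypothetical helper, for illustration only: copy `size` bytes from a device
// pointer back to host memory through the renamed wrapper.
port::Status CopyDeviceToHost(gpu::GpuContext* context, CUdeviceptr gpu_src,
                              void* host_dst, uint64 size) {
  // Same behavior as the old CUDADriver::SynchronousMemcpyD2H; only the class
  // and context type names have changed.
  return gpu::GpuDriver::SynchronousMemcpyD2H(context, host_dst, gpu_src, size);
}

}  // namespace stream_executor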

View File

@ -18,495 +18,45 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
#include <stddef.h>
#include "tensorflow/stream_executor/platform/port.h"
#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
namespace stream_executor {
namespace cuda {
// Identifies the memory space where an allocation resides. See
// CUDADriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };
// Returns a casual string, such as "host" for the provided memory space.
string MemorySpaceString(MemorySpace memory_space);
class CudaContext;
// CUDADriver contains wrappers for calls to the userspace library driver. It's
// useful to isolate these calls and put basic wrappers around them to separate
// userspace library driver behaviors from the rest of the program.
//
// At the moment it's simply used as a namespace.
//
// The calls log any specific errors internally and return whether the operation
// was successful to the caller.
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class CUDADriver {
public:
// Wraps a call to cuInit with logging to help indicate what has gone wrong in
// the case of failure. Safe to call multiple times; will be fast on all calls
// after the first.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
static port::Status Init();
// Returns the device associated with the given context.
// device is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);
// Creates a new CUDA stream associated with the given context via
// cuStreamCreate.
// stream is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
static bool CreateStream(CudaContext* context, CUstream* stream);
// Destroys a CUDA stream associated with the given context.
// stream is owned by the caller, must not be null, and *stream is set to null
// if the stream is successfully destroyed.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
static void DestroyStream(CudaContext* context, CUstream* stream);
// CUDA events can explicitly disable event TSC retrieval for some presumed
// performance improvement if timing is unnecessary.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
enum class EventFlags { kDefault, kDisableTiming };
// Creates a new event associated with the given context.
// result is an outparam owned by the caller and must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
static port::Status CreateEvent(CudaContext* context, CUevent* result,
EventFlags flags);
// Destroys *event and turns it into a nullptr. event may not be null, but
// *event may be, via cuEventDestroy
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
static port::Status DestroyEvent(CudaContext* context, CUevent* event);
// Allocates a GPU memory space of size bytes associated with the given
// context via cuMemAlloc.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
static void* DeviceAllocate(CudaContext* context, uint64 bytes);
// Deallocates a GPU memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(CudaContext* context, void* location);
// Allocates a unified memory space of size bytes associated with the given
// context via cuMemAllocManaged.
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
// Deallocates a unified memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
static void* HostAllocate(CudaContext* context, uint64 bytes);
// Deallocates a location created by HostAllocate, via cuMemFreeHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
static void HostDeallocate(CudaContext* context, void* location);
// Registers a memory region at location of size bytes via cuMemHostRegister.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
static bool HostRegister(CudaContext* context, void* location, uint64 bytes);
// Unregisters a memory region that was previously registered at location via
// cuMemHostUnregister.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
//
// TODO(leary) verify an error will be returned if the location wasn't
// previously registered.
static bool HostUnregister(CudaContext* context, void* location);
// Given a device ordinal, returns a device handle into the device outparam,
// which must not be null.
//
// N.B. these device handles do not have a corresponding destroy function in
// the CUDA driver API.
static port::Status GetDevice(int device_ordinal, CUdevice* device);
// Given a device handle, returns the name reported by the driver for the
// device.
static bool GetDeviceName(CUdevice device, string* name_out);
// Given a device to create a context for, returns a context handle into the
// context outparam, which must not be null.
//
// N.B. CUDA contexts are weird. They are implicitly associated with the
// calling thread. Current documentation on contexts and their influence on
// userspace processes is given here:
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
static port::Status CreateContext(CUdevice device,
const DeviceOptions& device_options,
CudaContext** context);
// Destroys the provided context via cuCtxDestroy.
// Don't do this while clients could still be using the context, per the docs
// bad things will happen.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
static void DestroyContext(CudaContext* context);
// Queries the runtime for the specified attribute of the specified function.
// cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
// in terms of integer-sized values, so there's no potential for overrun (as
// of CUDA 5.5).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
static bool FuncGetAttribute(CUfunction_attribute attribute,
CUfunction function, int* attribute_value);
// Sets the preferred cache configuration for the specified function.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
static bool FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config);
// Gets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
CudaContext* context);
// Sets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
static port::Status ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config);
// Launches a CUDA kernel via cuLaunchKernel.
// TODO(leary) describe the structure of kernel_params and extra in a readable
// way.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
static bool LaunchKernel(CudaContext* context, CUfunction function,
unsigned int grid_dim_x, unsigned int grid_dim_y,
unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes, CUstream stream,
void** kernel_params, void** extra);
// Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
// handle in "module". Any error logs that are produced are logged internally.
static bool LoadPtx(CudaContext* context, const char* ptx_contents,
CUmodule* module);
// Loads cubin_bytes with the CUDA driver's blob loading interface and stores
// the resulting handle in "module".
static port::Status LoadCubin(CudaContext* context, const char* cubin_bytes,
CUmodule* module);
// Retrieves a named kernel from a loaded module, and places the resulting
// handle into function (outparam) on success. Neither kernel_name nor
// function may be null. No ownership is taken of kernel_name.
static bool GetModuleFunction(CudaContext* context, CUmodule module,
const char* kernel_name, CUfunction* function);
// Retrieves a named global/constant symbol from a loaded module, and returns
// a device pointer and size of the symbol on success. symbol_name may not be
// null. At least one of dptr or bytes should not be null. No ownership is
// taken of symbol_name.
static bool GetModuleSymbol(CudaContext* context, CUmodule module,
const char* symbol_name, CUdeviceptr* dptr,
size_t* bytes);
// Unloads module from the current context via cuModuleUnload.
// TODO(leary) the documentation doesn't say what kind of disasters happen
// if you try to unload a module while its CUfunctions are in use.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
static void UnloadModule(CudaContext* context, CUmodule module);
// Performs a synchronous memset of the device memory segment via cuMemsetD8.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
uint8 value, size_t size);
// Performs a synchronous memset of the device memory segment via cuMemsetD32.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
static bool SynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location, uint32 value,
size_t uint32_count);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD8Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
static bool AsynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location, uint8 value,
size_t uint32_count, CUstream stream);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD32Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
static bool AsynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location, uint32 value,
size_t uint32_count, CUstream stream);
// -- Synchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
CUdeviceptr gpu_src, uint64 size);
static port::Status SynchronousMemcpyH2D(CudaContext* context,
CUdeviceptr gpu_dst,
const void* host_src, uint64 size);
static port::Status SynchronousMemcpyD2D(CudaContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src, uint64 size);
// -- Asynchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
static bool AsynchronousMemcpyD2H(CudaContext* context, void* host_dst,
CUdeviceptr gpu_src, uint64 size,
CUstream stream);
static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
const void* host_src, uint64 size,
CUstream stream);
static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
CUdeviceptr gpu_src, uint64 size,
CUstream stream);
// The CUDA stream callback type signature.
// The data passed to AddStreamCallback is subsequently passed to this
// callback when it fires.
//
// Some notable things:
// * Callbacks must not make any CUDA API calls.
// * Callbacks from independent streams execute in an undefined order and may
// be serialized.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
typedef void (*StreamCallback)(CUstream stream, CUresult status, void* data);
// Enqueues a callback operation into stream.
// See StreamCallback above and the NVIDIA documentation for additional
// details.
static bool AddStreamCallback(CudaContext* context, CUstream stream,
StreamCallback callback, void* data);
// Causes stream to wait for event to trigger before proceeding via
// cuStreamWaitEvent.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
CUevent event);
// Blocks the calling thread until the operations enqueued onto stream have
// been completed, via cuStreamSynchronize.
//
// TODO(leary) if a pathological thread enqueues operations onto the stream
// while another thread blocks like this, can you wind up waiting an unbounded
// amount of time?
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
// Blocks the calling thread until the operations associated with the context
// have been completed, via cuCtxSynchronize.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
static bool SynchronizeContext(CudaContext* context);
// Returns true if all stream tasks have completed at time of the call. Note
// the potential for races around this call (if another thread adds work to
// the stream immediately after this returns).
static bool IsStreamIdle(CudaContext* context, CUstream stream);
// Returns whether code in the from context can access memory in the to
// context via cuDeviceCanAccessPeer.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);
// Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);
// Returns the elapsed milliseconds between start and stop via
// cuEventElapsedTime.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
static bool GetEventElapsedTime(CudaContext* context,
float* elapsed_milliseconds, CUevent start,
CUevent stop);
// Records that an event occurred when execution reaches the current point in
// the stream via cuEventRecord.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
static port::Status RecordEvent(CudaContext* context, CUevent event,
CUstream stream);
// Polls (without blocking) to determine the status of an event - pending or
// complete (or an error status).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
CUevent event);
// -- Pointer-specific calls.
// Returns the context in which pointer was allocated or registered.
static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);
// Returns the device associated with the context from GetPointerContext().
static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
// Returns the memory space addressed by pointer.
static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
// Returns the base address and size of the device pointer dptr.
static port::Status GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr* base, size_t* size);
// -- Device-specific calls.
// Returns the compute capability for the device, e.g. (3, 5).
// This is currently done via the deprecated device API.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
CUdevice device);
// Returns the number of multiprocessors on the device (note that the device
// may be multi-GPU-per-board).
static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
// Returns the limit on number of threads that can be resident in a single
// multiprocessor.
static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
// Returns the limit on number of threads which may be resident for a single
// block (cooperative thread array).
static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
// Returns the amount of shared memory available on a single GPU core (i.e.
// SM on NVIDIA devices).
static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
// Returns the amount of shared memory available for a single block
// (cooperative thread array).
static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
// Returns the maximum supported number of registers per block.
static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
// Returns the number of threads per warp.
static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
// Queries the grid limits for device with cuDeviceGetAttribute calls.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool GetGridLimits(int* x, int* y, int* z, CUdevice device);
// Gets a specific integer-valued property about the given device.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
CUdevice device);
// Returns whether ECC is enabled for the given CUdevice via
// cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool IsEccEnabled(CUdevice device, bool* result);
// Returns the total amount of memory available for allocation by the CUDA
// context, in bytes, via cuDeviceTotalMem.
static bool GetDeviceTotalMemory(CUdevice device, uint64* result);
// Returns the free amount of memory and total amount of memory, as reported
// by cuMemGetInfo.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
int64* total);
// Returns a PCI bus id string for the device.
// [domain]:[bus]:[device].[function]
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
static string GetPCIBusID(CUdevice device);
// -- Context- and device-independent calls.
// Returns the number of visible CUDA device via cuDeviceGetCount.
// This should correspond to the set of device ordinals available.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
static int GetDeviceCount();
// Returns the driver version number via cuDriverGetVersion.
// This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
// instead, the CUDA toolkit release number that this driver is compatible
// with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
// compatible driver).
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
static bool GetDriverVersion(int* driver_version);
// -- Other calls
// Returns the maximum number of blocks (per multiprocessor) occupied by the
// specified kernel/CUfunction when launched with the specified parameters.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
CudaContext* context, CUfunction kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes);
// Returns the current context set in CUDA. This is done by calling the cuda
// driver (e.g., this value is not our cached view of the current context).
static CUcontext CurrentContextOrDie();
// Seam for injecting an error at CUDA initialization time for testing
// purposes.
static bool driver_inject_init_error_;
};
// Ensures a context is activated within a scope.
class ScopedActivateContext {
public:
// Activates the context via cuCtxSetCurrent, if it is not the currently
// active context (a la cuCtxGetCurrent). Note the alternative push/pop
// mechanism is said by NVIDIA to be relatively slow and deprecated.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
explicit ScopedActivateContext(CudaContext* context);
// Checks that the context has remained activated for the duration of the
// scope.
~ScopedActivateContext();
private:
CudaContext* to_restore_ = nullptr;
};
// CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
namespace gpu {
// GpuContext wraps a CUDA CUcontext handle, and includes a unique id. The
// unique id is positive, and ids are not repeated within the process.
class CudaContext {
class GpuContext {
public:
CudaContext(CUcontext context, int64 id) : context_(context), id_(id) {}
GpuContext(CUcontext context, int64 id) : context_(context), id_(id) {}
CUcontext context() const { return context_; }
int64 id() const { return id_; }
// Disallow copying and moving.
CudaContext(CudaContext&&) = delete;
CudaContext(const CudaContext&) = delete;
CudaContext& operator=(CudaContext&&) = delete;
CudaContext& operator=(const CudaContext&) = delete;
GpuContext(GpuContext&&) = delete;
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(GpuContext&&) = delete;
GpuContext& operator=(const GpuContext&) = delete;
private:
CUcontext const context_;
const int64 id_;
};
inline CUcontext CurrentContextOrDie() {
return CUDADriver::CurrentContextOrDie();
}
} // namespace gpu
namespace cuda {
using MemorySpace = gpu::MemorySpace;
using CUDADriver = gpu::GpuDriver;
using ScopedActivateContext = gpu::ScopedActivateContext;
using CudaContext = gpu::GpuContext;
// Returns the current context set in CUDA. This is done by calling the cuda
// driver (e.g., this value is not our cached view of the current context).
CUcontext CurrentContextOrDie();
} // namespace cuda
} // namespace stream_executor
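The tail of cuda_driver.h keeps the old names alive: MemorySpace, CUDADriver, ScopedActivateContext, and CudaContext are now aliases into namespace gpu, so code outside stream_executor that still spells the cuda:: names keeps compiling. A minimal sketch of such a call site (the function SyncThroughLegacyNames is hypothetical):

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

namespace stream_executor {

// Hypothetical pre-existing call site, untouched by this change: the cuda::
// spellings below resolve to gpu::GpuContext and gpu::GpuDriver through the
// using-declarations at the end of the header.
bool SyncThroughLegacyNames(cuda::CudaContext* context) {
  return cuda::CUDADriver::SynchronizeContext(context);
}

}  // namespace stream_executor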

View File

@ -78,6 +78,7 @@ namespace wrap {
__macro(cuDeviceGetCount) \
__macro(cuDeviceGetName) \
__macro(cuDeviceGetPCIBusId) \
__macro(cuDeviceGetProperties) \
__macro(cuDevicePrimaryCtxGetState) \
__macro(cuDevicePrimaryCtxRelease) \
__macro(cuDevicePrimaryCtxRetain) \

View File

@ -20,30 +20,11 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/statusor.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
CUDAEvent::CUDAEvent(CUDAExecutor* parent)
: parent_(parent), cuda_event_(nullptr) {}
CUDAEvent::~CUDAEvent() {}
port::Status CUDAEvent::Init() {
return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_,
CUDADriver::EventFlags::kDisableTiming);
}
port::Status CUDAEvent::Destroy() {
return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_);
}
port::Status CUDAEvent::Record(CUDAStream* stream) {
return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_,
stream->cuda_stream());
}
Event::Status CUDAEvent::PollForStatus() {
Event::Status GpuEvent::PollForStatus() {
port::StatusOr<CUresult> status =
CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_);
GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
if (!status.ok()) {
LOG(ERROR) << "Error polling for event status: "
<< status.status().error_message();
@ -62,9 +43,5 @@ Event::Status CUDAEvent::PollForStatus() {
}
}
const CUevent& CUDAEvent::cuda_event() {
return cuda_event_;
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
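Only the CUDA-specific piece of the event implementation stays in this file: GpuEvent::PollForStatus, which translates the driver's event query into the platform-independent Event::Status. A sketch of the underlying driver call it relies on, assuming a recorded CUevent and its owning GpuContext (the helper EventIsComplete is hypothetical):

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

namespace stream_executor {

// Hypothetical helper: true only once the event has fired. cuEventQuery
// reports CUDA_ERROR_NOT_READY while work is still pending, and QueryEvent
// passes that result through as a value rather than as an error status.
bool EventIsComplete(gpu::GpuContext* context, CUevent event) {
  port::StatusOr<CUresult> res = gpu::GpuDriver::QueryEvent(context, event);
  if (!res.ok()) {
    return false;  // the query itself failed
  }
  return res.ValueOrDie() == CUDA_SUCCESS;
}

}  // namespace stream_executor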

View File

@ -16,45 +16,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
namespace stream_executor {
namespace cuda {
// CUDAEvent wraps a CUevent in the platform-independent EventInterface
// interface.
class CUDAEvent : public internal::EventInterface {
public:
explicit CUDAEvent(CUDAExecutor* parent);
~CUDAEvent() override;
// Populates the CUDA-platform-specific elements of this object.
port::Status Init();
// Deallocates any platform-specific elements of this object. This is broken
// out (not part of the destructor) to allow for error reporting.
port::Status Destroy();
// Inserts the event at the current position into the specified stream.
port::Status Record(CUDAStream* stream);
// Polls the CUDA platform for the event's current status.
Event::Status PollForStatus();
// The underlying CUDA event element.
const CUevent& cuda_event();
private:
// The Executor used to which this object and CUevent are bound.
CUDAExecutor* parent_;
// The underlying CUDA event element.
CUevent cuda_event_;
};
using CUDAEvent = gpu::GpuEvent;
} // namespace cuda
} // namespace stream_executor

View File

@ -33,7 +33,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
@ -45,13 +45,13 @@ namespace wrap {
// manner on first use. This dynamic loading technique is used to avoid DSO
// dependencies on vendor libraries which may or may not be available in the
// deployed binary environment.
#define STREAM_EXECUTOR_CUFFT_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
cufftResult operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
#define STREAM_EXECUTOR_CUFFT_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
cufftResult operator()(GpuExecutor *parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
} __name;
#else
@ -77,8 +77,8 @@ namespace wrap {
return f; \
} \
template <typename... Args> \
cufftResult operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
cufftResult operator()(GpuExecutor *parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
} __name; \
@ -145,8 +145,8 @@ cufftType CUDAFftType(fft::Type type) {
}
// Associates the given stream with the given cuFFT plan.
bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
auto ret = wrap::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
bool SetStream(GpuExecutor *parent, cufftHandle plan, Stream *stream) {
auto ret = wrap::cufftSetStream(parent, plan, AsGpuStreamValue(stream));
if (ret != CUFFT_SUCCESS) {
LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
return false;
@ -157,7 +157,7 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
} // namespace
port::Status CUDAFftPlan::Initialize(
CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
uint64 *input_embed, uint64 input_stride, uint64 input_distance,
uint64 *output_embed, uint64 output_stride, uint64 output_distance,
fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
@ -317,7 +317,7 @@ port::Status CUDAFftPlan::Initialize(
return port::Status::OK();
}
port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
int rank, uint64 *elem_count,
fft::Type type,
ScratchAllocator *scratch_allocator) {
@ -549,8 +549,8 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
}
auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
CUDAComplex(CUDAMemoryMutable(output)));
GpuComplex(const_cast<InputT *>(GpuMemory(input))),
GpuComplex(GpuMemoryMutable(output)));
if (ret != CUFFT_SUCCESS) {
LOG(ERROR) << "failed to run cuFFT routine: " << ret;
@ -576,8 +576,8 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
}
auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
CUDAComplex(CUDAMemoryMutable(output)),
GpuComplex(const_cast<InputT *>(GpuMemory(input))),
GpuComplex(GpuMemoryMutable(output)),
cuda_fft_plan->GetFftDirection());
if (ret != CUFFT_SUCCESS) {
@ -614,22 +614,22 @@ STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
#undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
} // namespace cuda
} // namespace gpu
void initialize_cufft() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
cuda::kCudaPlatformId, gpu::kCuFftPlugin, "cuFFT",
[](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
cuda::CUDAExecutor *cuda_executor =
dynamic_cast<cuda::CUDAExecutor *>(parent);
gpu::GpuExecutor *cuda_executor =
dynamic_cast<gpu::GpuExecutor *>(parent);
if (cuda_executor == nullptr) {
LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
<< "support library with a non-CUDA StreamExecutor";
return nullptr;
}
return new cuda::CUDAFft(cuda_executor);
return new gpu::CUDAFft(cuda_executor);
});
if (!status.ok()) {
LOG(ERROR) << "Unable to register cuFFT factory: "
@ -637,7 +637,7 @@ void initialize_cufft() {
}
PluginRegistry::Instance()->SetDefaultFactory(
cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
cuda::kCudaPlatformId, PluginKind::kFft, gpu::kCuFftPlugin);
}
} // namespace stream_executor
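For reference, the cuFFT shim macro touched above now threads a GpuExecutor through instead of a CUDAExecutor; roughly what STREAM_EXECUTOR_CUFFT_WRAP(cufftSetStream) expands to in the statically linked branch (schematic only; in the real file this sits inside namespace wrap):

// Schematic expansion of STREAM_EXECUTOR_CUFFT_WRAP(cufftSetStream). The only
// difference from the previous revision is the GpuExecutor parameter and the
// gpu:: qualification of ScopedActivateExecutorContext.
struct WrapperShim__cufftSetStream {
  template <typename... Args>
  cufftResult operator()(gpu::GpuExecutor *parent, Args... args) {
    gpu::ScopedActivateExecutorContext sac{parent};  // make parent's context current
    return ::cufftSetStream(args...);  // forward to the real cuFFT entry point
  }
} cufftSetStream;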

View File

@ -30,9 +30,9 @@ namespace stream_executor {
class Stream;
namespace cuda {
namespace gpu {
class CUDAExecutor;
class GpuExecutor;
// Opaque and unique identifier for the cuFFT plugin.
extern const PluginId kCuFftPlugin;
@ -64,17 +64,17 @@ class CUDAFftPlan : public fft::Plan {
}
// Initialize function for batched plan
port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
uint64 *elem_count, uint64 *input_embed,
port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
uint64* elem_count, uint64* input_embed,
uint64 input_stride, uint64 input_distance,
uint64 *output_embed, uint64 output_stride,
uint64* output_embed, uint64 output_stride,
uint64 output_distance, fft::Type type,
int batch_count, ScratchAllocator *scratch_allocator);
int batch_count, ScratchAllocator* scratch_allocator);
// Initialize function for 1d, 2d, and 3d plans
port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
uint64 *elem_count, fft::Type type,
ScratchAllocator *scratch_allocator);
port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
uint64* elem_count, fft::Type type,
ScratchAllocator* scratch_allocator);
port::Status UpdateScratchAllocator(Stream *stream,
ScratchAllocator *scratch_allocator);
@ -83,7 +83,7 @@ class CUDAFftPlan : public fft::Plan {
bool IsInitialized() const { return is_initialized_; }
private:
CUDAExecutor *parent_;
GpuExecutor* parent_;
cufftHandle plan_;
fft::Type fft_type_;
DeviceMemory<uint8> scratch_;
@ -96,7 +96,7 @@ class CUDAFftPlan : public fft::Plan {
// This satisfies the platform-agnostic FftSupport interface.
//
// Note that the cuFFT handle that this encapsulates is implicitly tied to the
// context (and, as a result, the device) that the parent CUDAExecutor is tied
// context (and, as a result, the device) that the parent GpuExecutor is tied
// to. This simply happens as an artifact of creating the cuFFT handle when a
// CUDA context is active.
//
@ -104,13 +104,13 @@ class CUDAFftPlan : public fft::Plan {
// context of parent_, so all context is explicit.
class CUDAFft : public fft::FftSupport {
public:
explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {}
explicit CUDAFft(GpuExecutor* parent) : parent_(parent) {}
~CUDAFft() override {}
TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
private:
CUDAExecutor *parent_;
GpuExecutor* parent_;
// Two helper functions that execute dynload::cufftExec?2?.
@ -131,7 +131,7 @@ class CUDAFft : public fft::FftSupport {
SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
};
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_

View File

@ -72,7 +72,7 @@ extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;
namespace stream_executor {
namespace cuda {
namespace gpu {
// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
@ -84,17 +84,16 @@ namespace cuda {
// variable with extern linkage and populate it from another translation unit.
std::function<string(const string &)> g_cubinate;
static CUDAEvent *AsCUDAEvent(Event *event) {
static GpuEvent* AsGpuEvent(Event* event) {
DCHECK(event != nullptr);
return static_cast<CUDAEvent *>(event->implementation());
return static_cast<GpuEvent*>(event->implementation());
}
// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static CUDATimer *AsCUDATimer(Timer *timer) {
static GpuTimer* AsGpuTimer(Timer* timer) {
DCHECK(timer != nullptr);
return static_cast<CUDATimer *>(timer->implementation());
return static_cast<GpuTimer*>(timer->implementation());
}
// Given const GPU memory, returns a libcuda device pointer datatype, suitable
@ -112,48 +111,49 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
return AsCudaDevicePtr(*gpu_mem);
}
CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
CHECK(cuda_exec != nullptr);
return cuda_exec->cuda_context();
return cuda_exec->gpu_context();
}
CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
return static_cast<CUDAExecutor *>(stream_exec->implementation());
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
return static_cast<GpuExecutor*>(stream_exec->implementation());
}
CUDAExecutor::~CUDAExecutor() {
CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
GpuExecutor::~GpuExecutor() {
CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
if (context_ != nullptr) {
CUDADriver::DestroyContext(context_);
GpuDriver::DestroyContext(context_);
}
}
port::Status CUDAExecutor::Init(int device_ordinal,
DeviceOptions device_options) {
port::Status GpuExecutor::Init(int device_ordinal,
DeviceOptions device_options) {
device_ordinal_ = device_ordinal;
auto status = CUDADriver::Init();
auto status = GpuDriver::Init();
if (!status.ok()) {
return status;
}
status = CUDADriver::GetDevice(device_ordinal_, &device_);
status = GpuDriver::GetDevice(device_ordinal_, &device_);
if (!status.ok()) {
return status;
}
status = CUDADriver::CreateContext(device_, device_options, &context_);
status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
&context_);
if (!status.ok()) {
return status;
}
return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}
bool CUDAExecutor::FindOnDiskForComputeCapability(
bool GpuExecutor::FindOnDiskForComputeCapability(
absl::string_view filename, absl::string_view canonical_suffix,
string *found_filename) const {
string* found_filename) const {
if (cc_major_ == 0 && cc_minor_ == 0) {
return false;
}
@ -177,6 +177,13 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
return false;
}
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const {
LOG(ERROR)
<< "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
@ -211,12 +218,12 @@ static string GetBinaryDir(bool strip_exe) {
return exe_path;
}
bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
uint64_t module_refcount;
std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
if (*module == nullptr) {
auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
if (!load_status.ok()) {
LOG(ERROR) << "failed to load CUBIN: " << load_status;
return false;
@ -233,12 +240,12 @@ bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
return true;
}
bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
uint64_t module_refcount;
std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
if (*module == nullptr) {
if (!CUDADriver::LoadPtx(context_, ptx, module)) {
if (!GpuDriver::LoadPtx(context_, ptx, module)) {
return false;
}
VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
@ -253,9 +260,14 @@ bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
return true;
}
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
return false;
}
bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
KernelBase* kernel) {
GpuKernel* cuda_kernel = AsGpuKernel(kernel);
CUmodule module;
const string *kernelname;
@ -295,8 +307,8 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return false;
}
VLOG(2) << "getting function " << *kernelname << " from module " << module;
if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
cuda_kernel->cuda_function_ptr())) {
if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
cuda_kernel->gpu_function_ptr())) {
return false;
}
@ -313,7 +325,7 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return true;
}
bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
auto module_it = gpu_binary_to_module_.find(gpu_binary);
if (gpu_binary_to_module_.end() == module_it) {
VLOG(3) << "No loaded CUDA module for " << gpu_binary;
@ -324,13 +336,13 @@ bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
if (--refcount == 0) {
VLOG(3) << "Unloading CUDA module " << module;
CUDADriver::UnloadModule(context_, module);
GpuDriver::UnloadModule(context_, module);
gpu_binary_to_module_.erase(module_it);
}
return true;
}
void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
mutex_lock lock{in_memory_modules_mu_};
@ -346,9 +358,9 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
kernel_to_gpu_binary_.erase(gpu_binary_it);
}
bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
ModuleHandle *module_handle) {
// In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
ModuleHandle* module_handle) {
// In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
// ModuleHandle::id().
CUmodule cu_module;
if (spec.has_cuda_cubin_in_memory()) {
@ -382,25 +394,23 @@ bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
return false;
}
bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
mutex_lock lock{in_memory_modules_mu_};
return UnloadGpuBinary(gpu_binary);
}
bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata) {
bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
KernelMetadata* kernel_metadata) {
int value;
if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
*cuda_kernel->cuda_function_ptr(),
&value)) {
if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
*cuda_kernel->gpu_function_ptr(), &value)) {
return false;
}
kernel_metadata->set_registers_per_thread(value);
if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
*cuda_kernel->cuda_function_ptr(),
&value)) {
if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
*cuda_kernel->gpu_function_ptr(), &value)) {
return false;
}
kernel_metadata->set_shared_memory_bytes(value);
@ -408,13 +418,13 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
return true;
}
bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &kernel,
const KernelArgsArrayBase &args) {
bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
const BlockDim& block_dims, const KernelBase& kernel,
const KernelArgsArrayBase& args) {
CHECK_EQ(kernel.Arity(), args.number_of_arguments());
CUstream custream = AsCUDAStreamValue(stream);
const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
CUstream custream = AsGpuStreamValue(stream);
const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
// Only perform/print the occupancy check once. Even just checking to see
// whether we've done an occupancy check on this kernel before isn't free
@ -431,16 +441,16 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
if (cuda_kernel->GetPreferredCacheConfig() !=
KernelCacheConfig::kNoPreference) {
CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
}
void **kernel_params = const_cast<void **>(args.argument_addresses().data());
if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
block_dims.z, thread_dims.x, thread_dims.y,
thread_dims.z, args.number_of_shared_bytes(),
custream, kernel_params,
nullptr /* = extra */)) {
if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
block_dims.z, thread_dims.x, thread_dims.y,
thread_dims.z, args.number_of_shared_bytes(),
custream, kernel_params,
nullptr /* = extra */)) {
LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
<< args.number_of_arguments()
<< " args; thread dim: " << thread_dims.ToString()
@ -454,9 +464,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
const ThreadDim &thread_dims,
const BlockDim &block_dims) {
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
const ThreadDim& thread_dims,
const BlockDim& block_dims) {
VLOG(2) << "Computing kernel occupancy for kernel "
<< kernel.demangled_name();
VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
@ -475,8 +485,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
const DeviceDescription &device_description =
kernel.parent()->GetDeviceDescription();
const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
smem_per_block, thread_dims, cufunc);
@ -496,10 +506,11 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block. If unable to compute occupancy, zero is returned.
int CUDAExecutor::CalculateOccupancy(
const DeviceDescription &device_description, uint64 registers_per_thread,
uint64 shared_memory_per_block, const ThreadDim &thread_dims,
CUfunction func) {
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
@ -511,12 +522,12 @@ int CUDAExecutor::CalculateOccupancy(
// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int CUDAExecutor::CompareOccupancy(int *initial_blocks,
const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim &thread_dims,
CUfunction func) {
int GpuExecutor::CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
@ -531,88 +542,87 @@ int CUDAExecutor::CompareOccupancy(int *initial_blocks,
}
}
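// Sketch, not part of this change: the raw CUDA driver call behind the two
// occupancy helpers above. Parameter names follow the driver API; 'func' is
// any loaded CUfunction and the shared-memory figure is illustrative.
static void OccupancySketch(CUfunction func, size_t dynamic_smem_bytes) {
  int min_grid_size = 0;      // blocks needed to saturate the device
  int suggested_threads = 0;  // block size the driver recommends
  CUresult res = cuOccupancyMaxPotentialBlockSize(
      &min_grid_size, &suggested_threads, func,
      /*blockSizeToDynamicSMemSize=*/nullptr,
      /*dynamicSMemSize=*/dynamic_smem_bytes,
      /*blockSizeLimit=*/0);
  if (res == CUDA_SUCCESS) {
    // CompareOccupancy reports headroom by comparing suggested_threads with
    // the thread dims the caller actually requested.
    VLOG(2) << "suggested threads/block: " << suggested_threads
            << ", min grid size: " << min_grid_size;
  }
}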
void *CUDAExecutor::Allocate(uint64 size) {
return CUDADriver::DeviceAllocate(context_, size);
void* GpuExecutor::Allocate(uint64 size) {
return GpuDriver::DeviceAllocate(context_, size);
}
void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem,
uint64 offset_bytes, uint64 size_bytes) {
void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
uint64 size_bytes) {
// offset and size are in bytes, so char* works as the pointer type.
return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
}
void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) {
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
// CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
if (!mem->is_sub_buffer()) {
CUDADriver::DeviceDeallocate(context_, mem->opaque());
GpuDriver::DeviceDeallocate(context_, mem->opaque());
}
}
bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) {
bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
if (location == nullptr || size == 0) {
LOG(WARNING) << "attempting to register null or zero-sized memory: "
<< location << "; size " << size;
}
VLOG(2) << "registering " << location << " size " << size;
return CUDADriver::HostRegister(context_, location, size);
return GpuDriver::HostRegister(context_, location, size);
}
bool CUDAExecutor::HostMemoryUnregister(void *location) {
bool GpuExecutor::HostMemoryUnregister(void* location) {
VLOG(2) << "unregistering " << location;
return CUDADriver::HostUnregister(context_, location);
return GpuDriver::HostUnregister(context_, location);
}
bool CUDAExecutor::SynchronizeAllActivity() {
return CUDADriver::SynchronizeContext(context_);
bool GpuExecutor::SynchronizeAllActivity() {
return GpuDriver::SynchronizeContext(context_);
}
bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) {
bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return CUDADriver::SynchronousMemsetUint32(
return GpuDriver::SynchronousMemsetUint32(
context_, AsCudaDevicePtr(location), 0x0, size / 4);
}
return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
0x0, size);
return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
0x0, size);
}
bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
uint64 size) {
bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
// cudaMemset reinterprets "value" as a uint8.
uint8 byte_value = static_cast<uint8>(value);
uint32 pattern = (byte_value << 24) | (byte_value << 16) |
(byte_value << 8) | byte_value;
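// e.g. value 0xAB expands to pattern 0xABABABAB, so the 4-byte memset writes
// the same bytes the 1-byte memset would, just in word-sized chunks.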
return CUDADriver::SynchronousMemsetUint32(
return GpuDriver::SynchronousMemsetUint32(
context_, AsCudaDevicePtr(location), pattern, size / 4);
}
return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
value, size);
return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
value, size);
}
port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const void *host_src,
uint64 size) {
return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size);
port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size);
}
port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) {
return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size);
port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size);
}
port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size);
port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size);
}
bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
uint64 size) {
bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return Memset32(stream, location, 0x0, size);
@ -621,88 +631,87 @@ bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
}
}
bool CUDAExecutor::Memset(Stream *stream, DeviceMemoryBase *location,
uint8 pattern, uint64 size) {
bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
uint8 pattern, uint64 size) {
VLOG(2) << "enqueueing memset8 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
return CUDADriver::AsynchronousMemsetUint8(
context_, AsCudaDevicePtr(location), pattern, size,
AsCUDAStreamValue(stream));
return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
pattern, size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
uint32 pattern, uint64 size) {
bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
uint32 pattern, uint64 size) {
VLOG(2) << "enqueueing memset32 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0);
return CUDADriver::AsynchronousMemsetUint32(
return GpuDriver::AsynchronousMemsetUint32(
context_, AsCudaDevicePtr(location), pattern, size / 4,
AsCUDAStreamValue(stream));
AsGpuStreamValue(stream));
}
bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst,
const DeviceMemoryBase &gpu_src, uint64 size) {
return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size,
AsCUDAStreamValue(stream));
bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) {
return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size,
AsCUDAStreamValue(stream));
bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) {
return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size,
AsCUDAStreamValue(stream));
bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::HostCallback(Stream *stream,
std::function<port::Status()> callback) {
bool GpuExecutor::HostCallback(Stream* stream,
std::function<port::Status()> callback) {
auto callback_ptr = new std::function<void()>([callback]() {
port::Status s = callback();
if (!s.ok()) {
LOG(WARNING) << "Host callback failed: " << s;
}
});
return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
InternalHostCallback, callback_ptr);
return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
InternalHostCallback, callback_ptr);
}
/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
CUresult status,
void *data) {
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
CUresult status,
void* data) {
std::function<void()> *callback =
reinterpret_cast<std::function<void()> *>(data);
(*callback)();
delete callback;
}
port::Status CUDAExecutor::AllocateEvent(Event *event) {
return AsCUDAEvent(event)->Init();
port::Status GpuExecutor::AllocateEvent(Event* event) {
return AsGpuEvent(event)->Init();
}
port::Status CUDAExecutor::DeallocateEvent(Event *event) {
return AsCUDAEvent(event)->Destroy();
port::Status GpuExecutor::DeallocateEvent(Event* event) {
return AsGpuEvent(event)->Destroy();
}
port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
return AsGpuEvent(event)->Record(AsGpuStream(stream));
}
port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
if (CUDADriver::WaitStreamOnEvent(context_,
AsCUDAStream(stream)->cuda_stream(),
AsCUDAEvent(event)->cuda_event())) {
port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
AsGpuEvent(event)->gpu_event())) {
return port::Status::OK();
} else {
return port::Status(
@ -712,61 +721,61 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
}
}
Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
return AsCUDAEvent(event)->PollForStatus();
Event::Status GpuExecutor::PollForEventStatus(Event* event) {
return AsGpuEvent(event)->PollForStatus();
}
bool CUDAExecutor::AllocateStream(Stream *stream) {
return AsCUDAStream(stream)->Init();
bool GpuExecutor::AllocateStream(Stream* stream) {
return AsGpuStream(stream)->Init();
}
void CUDAExecutor::DeallocateStream(Stream *stream) {
CUDAStream *cuda_stream = AsCUDAStream(stream);
void GpuExecutor::DeallocateStream(Stream* stream) {
GpuStream* cuda_stream = AsGpuStream(stream);
if (!cuda_stream->IsIdle()) {
LOG(ERROR) << "Deallocating stream with pending work";
}
cuda_stream->Destroy();
}
bool CUDAExecutor::AllocateTimer(Timer *timer) {
return AsCUDATimer(timer)->Init();
bool GpuExecutor::AllocateTimer(Timer* timer) {
return AsGpuTimer(timer)->Init();
}
void CUDAExecutor::DeallocateTimer(Timer *timer) {
AsCUDATimer(timer)->Destroy();
void GpuExecutor::DeallocateTimer(Timer* timer) {
AsGpuTimer(timer)->Destroy();
}
bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
CUevent other_completed_event = *AsCUDAStream(other)->completed_event();
bool ok = CUDADriver::RecordEvent(context_, other_completed_event,
AsCUDAStreamValue(other))
.ok();
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
CUevent other_completed_event = *AsGpuStream(other)->completed_event();
bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
AsGpuStreamValue(other))
.ok();
if (!ok) {
LOG(ERROR) << "failed to record completion event; "
"therefore, failed to create inter-stream dependency";
return false;
}
return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
other_completed_event);
return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
other_completed_event);
}
bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}
bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}
port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}
blas::BlasSupport *CUDAExecutor::CreateBlas() {
blas::BlasSupport* GpuExecutor::CreateBlas() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::BlasFactory> status =
registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
plugin_config_.blas());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve BLAS factory: "
@ -777,10 +786,10 @@ blas::BlasSupport *CUDAExecutor::CreateBlas() {
return status.ValueOrDie()(this);
}
dnn::DnnSupport *CUDAExecutor::CreateDnn() {
dnn::DnnSupport* GpuExecutor::CreateDnn() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::DnnFactory> status =
registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
plugin_config_.dnn());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve DNN factory: "
@ -791,10 +800,10 @@ dnn::DnnSupport *CUDAExecutor::CreateDnn() {
return status.ValueOrDie()(this);
}
fft::FftSupport *CUDAExecutor::CreateFft() {
fft::FftSupport* GpuExecutor::CreateFft() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::FftFactory> status =
registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
plugin_config_.fft());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve FFT factory: "
@ -805,10 +814,10 @@ fft::FftSupport *CUDAExecutor::CreateFft() {
return status.ValueOrDie()(this);
}
rng::RngSupport *CUDAExecutor::CreateRng() {
rng::RngSupport* GpuExecutor::CreateRng() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::RngFactory> status =
registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
plugin_config_.rng());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve RNG factory: "
@ -820,23 +829,21 @@ rng::RngSupport *CUDAExecutor::CreateRng() {
}
// TODO(rspringer): Remove in b/18544742.
bool CUDAExecutor::SupportsDnn() const {
return true;
bool GpuExecutor::SupportsDnn() const { return true; }
bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}
bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) {
CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_);
port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}
port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) {
CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
return CUDADriver::EnablePeerAccess(context_, cuda_other->context_);
}
SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
port::StatusOr<CUsharedconfig> cuda_config =
CUDADriver::ContextGetSharedMemConfig(context_);
GpuDriver::ContextGetSharedMemConfig(context_);
if (!cuda_config.ok()) {
// Don't log; the failed call will log necessary output.
return SharedMemoryConfig::kDefault;
@ -855,7 +862,7 @@ SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
}
}
port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
SharedMemoryConfig config) {
CUsharedconfig cuda_config;
switch (config) {
@ -872,21 +879,21 @@ port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
LOG(FATAL) << "Invalid shared memory configuration specified: "
<< static_cast<int>(config);
}
return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
}
bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
bool CUDAExecutor::GetSymbol(const string &symbol_name,
ModuleHandle module_handle, void **mem,
size_t *bytes) {
bool GpuExecutor::GetSymbol(const string& symbol_name,
ModuleHandle module_handle, void** mem,
size_t* bytes) {
auto lookup_in_module = [&](CUmodule module) {
CHECK(module != nullptr);
return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
reinterpret_cast<CUdeviceptr *>(mem),
bytes);
return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
reinterpret_cast<CUdeviceptr*>(mem),
bytes);
};
{ // give limited scope to mutex_lock
@ -908,13 +915,13 @@ bool CUDAExecutor::GetSymbol(const string &symbol_name,
return false;
}
bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
// The BlockDim name is a mismatch against these GRID_DIM_* queries because
// we use BlockDims to express the dimensions of blocks within a grid
// (as opposed to ThreadDim which expresses the dimensions of threads
// within a block).
int x, y, z;
if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
return false;
}
@ -924,35 +931,35 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
return true;
}
bool CUDAExecutor::SupportsBlas() const { return true; }
bool GpuExecutor::SupportsBlas() const { return true; }
bool CUDAExecutor::SupportsFft() const { return true; }
bool GpuExecutor::SupportsFft() const { return true; }
bool CUDAExecutor::SupportsRng() const { return true; }
bool GpuExecutor::SupportsRng() const { return true; }
std::unique_ptr<internal::EventInterface>
CUDAExecutor::CreateEventImplementation() {
return std::unique_ptr<internal::EventInterface>(new CUDAEvent(this));
GpuExecutor::CreateEventImplementation() {
return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}
std::unique_ptr<internal::KernelInterface>
CUDAExecutor::CreateKernelImplementation() {
return std::unique_ptr<internal::KernelInterface>(new CUDAKernel());
GpuExecutor::CreateKernelImplementation() {
return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}
std::unique_ptr<internal::StreamInterface>
CUDAExecutor::GetStreamImplementation() {
return std::unique_ptr<internal::StreamInterface>(new CUDAStream(this));
GpuExecutor::GetStreamImplementation() {
return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}
std::unique_ptr<internal::TimerInterface>
CUDAExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
GpuExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}
void *CUDAExecutor::GpuContextHack() { return context_; }
void* GpuExecutor::GpuContextHack() { return context_; }
CudaContext* CUDAExecutor::cuda_context() { return context_; }
GpuContext* GpuExecutor::gpu_context() { return context_; }
// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
@ -1019,21 +1026,21 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
#endif
}
DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
internal::DeviceDescriptionBuilder builder;
{
int driver_version = 0;
(void)CUDADriver::GetDriverVersion(&driver_version);
(void)GpuDriver::GetDriverVersion(&driver_version);
string augmented_driver_version = port::Printf(
"%d (%s)", driver_version,
DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
.c_str());
builder.set_driver_version(augmented_driver_version);
}
{
string pci_bus_id = CUDADriver::GetPCIBusID(device_);
string pci_bus_id = GpuDriver::GetPCIBusID(device_);
// Lower the hex characters to match sysfs.
pci_bus_id = port::Lowercase(pci_bus_id);
@ -1046,43 +1053,43 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
{
builder.set_threads_per_block_limit(
CUDADriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device_)
GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
device_)
.ValueOrDie());
ThreadDim thread_dim_limit;
thread_dim_limit.x = CUDADriver::GetDeviceAttribute(
thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
.ValueOrDie();
thread_dim_limit.y = CUDADriver::GetDeviceAttribute(
thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
.ValueOrDie();
thread_dim_limit.z = CUDADriver::GetDeviceAttribute(
thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
.ValueOrDie();
builder.set_thread_dim_limit(thread_dim_limit);
int clock_rate =
CUDADriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
.ValueOrDie();
builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
}
{
bool ecc_enabled = false;
(void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
(void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
builder.set_ecc_enabled(ecc_enabled);
}
{
uint64 device_memory_size = -1;
(void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
(void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
builder.set_device_memory_size(device_memory_size);
}
port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
// Times 2 because HBM is DDR memory; it gets two data bits per each data
@ -1100,7 +1107,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
{
string device_name;
(void)CUDADriver::GetDeviceName(device_, &device_name);
(void)GpuDriver::GetDeviceName(device_, &device_name);
builder.set_name(device_name);
}
@ -1114,19 +1121,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
builder.set_device_vendor("NVIDIA Corporation");
builder.set_cuda_compute_capability(cc_major_, cc_minor_);
builder.set_shared_memory_per_core(
CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
builder.set_shared_memory_per_block(
CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
builder.set_core_count(
CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
builder.set_threads_per_core_limit(
CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
builder.set_registers_per_block_limit(
CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
builder.set_threads_per_warp(
CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
builder.set_registers_per_core_limit(
CUDADriver::GetDeviceAttribute(
GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
.ValueOrDie());
@ -1134,11 +1141,11 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
return built.release();
}
} // namespace cuda
} // namespace gpu
void initialize_cuda_gpu_executor() {
*internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
return new cuda::CUDAExecutor{config};
*internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
return new gpu::GpuExecutor{config};
};
}

View File

@ -22,289 +22,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
#include <set>
#include <unordered_map>
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
namespace stream_executor {
namespace cuda {
// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class CUDAExecutor : public internal::StreamExecutorInterface {
public:
// sub_platform indicates the subplatform used in this executor; it must
// be a CUDA type.
explicit CUDAExecutor(const PluginConfig &plugin_config)
: device_(0),
context_(nullptr),
device_ordinal_(0),
cc_major_(0),
cc_minor_(0),
plugin_config_(plugin_config) {}
// See the corresponding StreamExecutor methods for method comments on the
// following overrides.
~CUDAExecutor() override;
port::Status Init(int device_ordinal, DeviceOptions device_options) override;
bool GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) override;
void UnloadKernel(const KernelBase *kernel) override;
bool LoadModule(const MultiModuleLoaderSpec &spec,
ModuleHandle *module_handle) override;
bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
int CalculateOccupancy(const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim &thread_dims, CUfunction func);
int CompareOccupancy(int *initial_blocks,
const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim &thread_dims, CUfunction func);
void *Allocate(uint64 size) override;
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
uint64 size_bytes) override;
void Deallocate(DeviceMemoryBase *mem) override;
void *UnifiedMemoryAllocate(uint64 size) override {
return CUDADriver::UnifiedMemoryAllocate(context_, size);
}
void UnifiedMemoryDeallocate(void *location) override {
return CUDADriver::UnifiedMemoryDeallocate(context_, location);
}
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA
// settings.
void *HostMemoryAllocate(uint64 size) override {
return CUDADriver::HostAllocate(context_, size);
}
void HostMemoryDeallocate(void *location) override {
return CUDADriver::HostDeallocate(context_, location);
}
bool HostMemoryRegister(void *location, uint64 size) override;
bool HostMemoryUnregister(void *location) override;
bool SynchronizeAllActivity() override;
bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
bool SynchronousMemSet(DeviceMemoryBase *location, int value,
uint64 size) override;
port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) override;
port::Status SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) override;
port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) override;
bool MemZero(Stream *stream, DeviceMemoryBase *location,
uint64 size) override;
bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
uint64 size) override;
bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
uint64 size) override;
bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
uint64 size) override;
bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
uint64 size) override;
bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) override;
bool HostCallback(Stream *stream,
std::function<port::Status()> callback) override;
bool AllocateStream(Stream *stream) override;
void DeallocateStream(Stream *stream) override;
bool CreateStreamDependency(Stream *dependent, Stream *other) override;
bool AllocateTimer(Timer *timer) override;
void DeallocateTimer(Timer *timer) override;
bool StartTimer(Stream *stream, Timer *timer) override;
bool StopTimer(Stream *stream, Timer *timer) override;
port::Status AllocateEvent(Event *event) override;
port::Status DeallocateEvent(Event *event) override;
port::Status RecordEvent(Stream *stream, Event *event) override;
port::Status WaitForEvent(Stream *stream, Event *event) override;
Event::Status PollForEventStatus(Event *event) override;
port::Status BlockHostUntilDone(Stream *stream) override;
int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
void **mem, size_t *bytes) override;
DeviceDescription *PopulateDeviceDescription() const override;
// Populates the block_dim_limit by querying the device driver API. If an
// error occurs at any point while asking the driver for block dim limits, it
// will be only partially populated as a result, and an error will be logged.
bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
bool SupportsBlas() const override;
blas::BlasSupport *CreateBlas() override;
bool SupportsFft() const override;
fft::FftSupport *CreateFft() override;
bool SupportsRng() const override;
rng::RngSupport *CreateRng() override;
bool SupportsDnn() const override;
dnn::DnnSupport *CreateDnn() override;
std::unique_ptr<internal::EventInterface> CreateEventImplementation()
override;
std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
override;
std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
void *GpuContextHack() override;
CudaContext* cuda_context();
private:
// Attempts to find a more specific version of the file indicated by
// filename by looking for compute-capability-specific suffixed versions; i.e.
// looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
// we're on a compute capability 3.0 machine.
bool FindOnDiskForComputeCapability(absl::string_view filename,
absl::string_view canonical_suffix,
string *found_filename) const;
// Host callback landing routine invoked by CUDA.
// data: User-provided callback provided to HostCallback() above, captured
// as a std::function<void()>. Allocated/initialized inside
// HostCallback() and owned and deleted by this call.
static void InternalHostCallback(CUstream stream, CUresult status,
void *data);
// Collects metadata for the specified kernel.
bool GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata);
// Prints to VLOG(2) information about the kernel's occupancy and how it might
// be improved.
void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
const BlockDim &block_dims);
bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
bool UnloadGpuBinary(const void *gpu_binary)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;
// Kernel -> loaded GPU binary. Many kernels may load the same binary.
std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
GUARDED_BY(in_memory_modules_mu_);
// GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
std::unordered_map<const void *, std::pair<CUmodule, uint64>>
gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
// Guards the launched kernel set.
mutex launched_kernels_mu_;
// Keeps track of the set of launched kernels. Currently used to suppress the
// occupancy check on subsequent launches.
std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
// Handle for the CUDA device being operated on. Immutable
// post-initialization.
CUdevice device_;
// Handle for session with the library/driver. Immutable post-initialization.
CudaContext* context_;
// The device ordinal value that this executor was initialized with; recorded
// for use in getting device metadata. Immutable post-initialization.
int device_ordinal_;
// The major version of the compute capability for device_.
int cc_major_;
// The minor version of the compute capability for device_.
int cc_minor_;
// The plugin configuration associated with this instance.
PluginConfig plugin_config_;
SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
};
using CUDAExecutor = gpu::GpuExecutor;
} // namespace cuda
} // namespace stream_executor
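The alias on the line above is what keeps pre-existing call sites compiling unchanged. A minimal sketch with a hypothetical call site (ConfigureExecutor is illustrative, not a TensorFlow function):
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
// Pre-existing code keeps using the old spelling...
void ConfigureExecutor(stream_executor::cuda::CUDAExecutor* exec);
// ...while new code can name the shared implementation directly; both
// declarations refer to the same function because the types are identical.
void ConfigureExecutor(stream_executor::gpu::GpuExecutor* exec);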

View File

@ -17,88 +17,9 @@ limitations under the License.
//
// These are typically placed here for use by multiple source components (for
// example, BLAS and executor components).
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
#include <stddef.h>
#include <complex>
#include "cuda/include/cuComplex.h"
namespace stream_executor {
template <typename ElemT>
class DeviceMemory;
namespace cuda {
// Converts a const DeviceMemory reference to its underlying typed pointer in
// CUDA
// device memory.
template <typename T>
const T *CUDAMemory(const DeviceMemory<T> &mem) {
return static_cast<const T *>(mem.opaque());
}
// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
// pointer in CUDA device memory.
template <typename T>
T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
return static_cast<T *>(mem->opaque());
}
static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
"std::complex<float> and cuComplex should have the same size");
static_assert(offsetof(cuComplex, x) == 0,
"The real part of cuComplex should appear first.");
static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
"std::complex<double> and cuDoubleComplex should have the same "
"size");
static_assert(offsetof(cuDoubleComplex, x) == 0,
"The real part of cuDoubleComplex should appear first.");
// Type traits to get CUDA complex types from std::complex<>.
template <typename T>
struct CUDAComplexT {
typedef T type;
};
template <>
struct CUDAComplexT<std::complex<float>> {
typedef cuComplex type;
};
template <>
struct CUDAComplexT<std::complex<double>> {
typedef cuDoubleComplex type;
};
// Converts pointers of std::complex<> to pointers of
// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
template <typename T>
inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
}
template <typename T>
inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
}
// Converts values of std::complex<float/double> to values of
// cuComplex/cuDoubleComplex.
inline cuComplex CUDAComplexValue(std::complex<float> val) {
return {val.real(), val.imag()};
}
inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
return {val.real(), val.imag()};
}
} // namespace cuda
} // namespace stream_executor
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
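The helpers that used to live here (CUDAMemory, CUDAMemoryMutable, CUDAComplex, CUDAComplexValue and the layout static_asserts) now come from gpu/gpu_helpers.h; the rest of the diff uses the gpu:: spellings (e.g. GpuMemoryMutable in cuda_rng.cc below). A short usage sketch based on the definitions removed above, assuming the cuda:: spellings stay reachable through this header after the move:
#include <complex>
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
void ComplexHelperSketch() {
  std::complex<float> host_alpha(1.0f, -2.0f);
  // Value conversion: builds a cuComplex from the real/imaginary parts.
  cuComplex by_value = stream_executor::cuda::CUDAComplexValue(host_alpha);
  // Pointer reinterpret: valid because the static_asserts guarantee that
  // std::complex<float> and cuComplex share size and layout.
  const cuComplex* as_cu = stream_executor::cuda::CUDAComplex(&host_alpha);
  (void)by_value;
  (void)as_cu;
}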

View File

@ -0,0 +1,38 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
namespace stream_executor {
namespace gpu {
CUfunc_cache GpuKernel::GetGpuCacheConfig() const {
switch (preferred_cache_config_) {
case KernelCacheConfig::kNoPreference:
return CU_FUNC_CACHE_PREFER_NONE;
case KernelCacheConfig::kPreferShared:
return CU_FUNC_CACHE_PREFER_SHARED;
case KernelCacheConfig::kPreferL1:
return CU_FUNC_CACHE_PREFER_L1;
case KernelCacheConfig::kPreferEqual:
return CU_FUNC_CACHE_PREFER_EQUAL;
default:
LOG(FATAL) << "Unknown KernelCacheConfig"
<< static_cast<int32>(preferred_cache_config_);
}
}
} // namespace gpu
} // namespace stream_executor

View File

@ -22,104 +22,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "cuda/include/cuda.h"
#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
"No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif
#ifdef __CUDA_RUNTIME_H__
#error \
"CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
namespace stream_executor {
namespace cuda {
// Wraps a CUfunction to implement the platform-independent KernelInterface.
class CUDAKernel : public internal::KernelInterface {
public:
CUDAKernel() : cuda_function_(nullptr), arity_(0),
preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
// Note that the function is unloaded when the module is unloaded, and the
// module that the function is contained in is owned by the CUDAExecutor.
~CUDAKernel() override {}
// As arity cannot be reflected upon using the CUDA API, the arity is
// explicitly set during the CUDAExecutor::GetKernel initialization process.
void set_arity(unsigned arity) { arity_ = arity; }
unsigned Arity() const override { return arity_; }
// Returns the CUfunction value for passing to the CUDA API.
CUfunction AsCUDAFunctionValue() const {
DCHECK(cuda_function_ != nullptr);
return const_cast<CUfunction>(cuda_function_);
}
// Returns the slot that the CUfunction is stored within for this object,
// for the CUDA API which wants to load into a CUfunction*.
CUfunction *cuda_function_ptr() { return &cuda_function_; }
// CUDA supports setting the preferred cache configuration of a CUfunction
// (more-or-less equivalent to a CUDAKernel). We support this via the below
// functions; users can set a preference, and that is applied when the kernel
// is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
// load the kernel & set the preference when the user calls the setter below;
// either approach is valid.
// Sets the current kernel cache configuration preference.
void SetPreferredCacheConfig(KernelCacheConfig config) override {
preferred_cache_config_ = config;
}
// Returns the current kernel cache configuration preference.
KernelCacheConfig GetPreferredCacheConfig() const override {
return preferred_cache_config_;
}
// Returns the current kernel cache configuration preference as a
// CUfunc_cache.
CUfunc_cache GetCUDACacheConfig() const {
switch (preferred_cache_config_) {
case KernelCacheConfig::kNoPreference:
return CU_FUNC_CACHE_PREFER_NONE;
case KernelCacheConfig::kPreferShared:
return CU_FUNC_CACHE_PREFER_SHARED;
case KernelCacheConfig::kPreferL1:
return CU_FUNC_CACHE_PREFER_L1;
case KernelCacheConfig::kPreferEqual:
return CU_FUNC_CACHE_PREFER_EQUAL;
default:
LOG(FATAL) << "Unknown KernelCacheConfig"
<< static_cast<int32>(preferred_cache_config_);
}
}
private:
CUfunction cuda_function_; // Wrapped CUDA kernel handle.
unsigned arity_; // Number of formal parameters the kernel takes.
// Preferred (but not required) cache configuration for this kernel.
KernelCacheConfig preferred_cache_config_;
};
// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
return static_cast<const CUDAKernel *>(kernel->implementation());
}
// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
return static_cast<CUDAKernel *>(kernel->implementation());
}
using CUDAKernel = gpu::GpuKernel;
} // namespace cuda
} // namespace stream_executor
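The cache-config mechanism described in the removed comment is unchanged in behavior: a caller records a preference on the kernel object, and GpuExecutor::Launch applies it lazily by translating it to a CUfunc_cache (GetGpuCacheConfig, now in cuda_kernel.cc) and calling GpuDriver::FuncSetCacheConfig. A minimal sketch of that flow; 'kernel' stands for any loaded KernelBase and the shared-memory preference is illustrative:
#include "tensorflow/stream_executor/kernel.h"
void PreferSharedMemory(stream_executor::KernelBase* kernel) {
  // Record a preference; nothing reaches the driver yet.
  kernel->SetPreferredCacheConfig(
      stream_executor::KernelCacheConfig::kPreferShared);
  // Later, inside GpuExecutor::Launch (earlier hunk), the preference is
  // translated via GetGpuCacheConfig() and pushed to the driver:
  //   GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
}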

View File

@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/stringprintf.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {
// Synchronize with spinlocks.
@ -129,16 +129,16 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
port::Printf("Executor for bus %d not found.", bus_ordinal));
}
Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }
int CudaPlatform::VisibleDeviceCount() const {
// Throw away the result - it logs internally, and this [containing] function
// isn't in the path of user control. It's safe to call this > 1x.
if (!cuda::CUDADriver::Init().ok()) {
if (!gpu::GpuDriver::Init().ok()) {
return -1;
}
return CUDADriver::GetDeviceCount();
return GpuDriver::GetDeviceCount();
}
const string& CudaPlatform::Name() const { return name_; }
@ -169,7 +169,7 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
port::StatusOr<std::unique_ptr<StreamExecutor>>
CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
auto executor = MakeUnique<StreamExecutor>(
this, MakeUnique<CUDAExecutor>(config.plugin_config));
this, MakeUnique<GpuExecutor>(config.plugin_config));
auto init_status = executor->Init(config.ordinal, config.device_options);
if (!init_status.ok()) {
return port::Status(
@ -191,13 +191,13 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
LOG(FATAL) << "not yet implemented: unregister CUDA trace listener";
}
} // namespace cuda
} // namespace gpu
static void InitializeCudaPlatform() {
// Disabling leak checking, MultiPlatformManager does not destroy its
// registered platforms.
std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
std::unique_ptr<gpu::CudaPlatform> platform(new gpu::CudaPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}

View File

@ -32,7 +32,7 @@ limitations under the License.
#include "tensorflow/stream_executor/trace_listener.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
// Opaque and unique identifier for the CUDA platform plugin.
// This is needed so that plugins can refer to/identify this platform without
@ -102,6 +102,12 @@ class CudaPlatform : public Platform {
SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
};
} // namespace gpu
namespace cuda {
using CudaPlatform = gpu::CudaPlatform;
} // namespace cuda
} // namespace stream_executor

View File

@ -58,33 +58,33 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
}
namespace stream_executor {
namespace cuda {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
namespace wrap {
#ifdef PLATFORM_GOOGLE
#define STREAM_EXECUTOR_CURAND_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
#define STREAM_EXECUTOR_CURAND_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
curandStatus_t operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
} __name;
#else
#define STREAM_EXECUTOR_CURAND_WRAP(__name) \
struct DynLoadShim__##__name { \
static const char *kName; \
static const char* kName; \
using FuncPtrT = std::add_pointer<decltype(::__name)>::type; \
static void *GetDsoHandle() { \
static void* GetDsoHandle() { \
auto s = internal::CachedDsoLoader::GetCurandDsoHandle(); \
return s.ValueOrDie(); \
} \
static FuncPtrT LoadOrDie() { \
void *f; \
void* f; \
auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
kName, &f); \
CHECK(s.ok()) << "could not find " << kName \
@ -96,12 +96,12 @@ namespace wrap {
return f; \
} \
template <typename... Args> \
curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
curandStatus_t operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
} __name; \
const char *DynLoadShim__##__name::kName = #__name;
const char* DynLoadShim__##__name::kName = #__name;
#endif
STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
@ -116,38 +116,15 @@ STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
} // namespace wrap
template <typename T>
string TypeString();
GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
template <>
string TypeString<float>() {
return "float";
}
template <>
string TypeString<double>() {
return "double";
}
template <>
string TypeString<std::complex<float>>() {
return "std::complex<float>";
}
template <>
string TypeString<std::complex<double>>() {
return "std::complex<double>";
}
CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {}
CUDARng::~CUDARng() {
GpuRng::~GpuRng() {
if (rng_ != nullptr) {
wrap::curandDestroyGenerator(parent_, rng_);
}
}
bool CUDARng::Init() {
bool GpuRng::Init() {
mutex_lock lock(mu_);
CHECK(rng_ == nullptr);
@ -162,9 +139,9 @@ bool CUDARng::Init() {
return true;
}
bool CUDARng::SetStream(Stream *stream) {
bool GpuRng::SetStream(Stream* stream) {
curandStatus_t ret =
wrap::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream));
wrap::curandSetStream(parent_, rng_, AsGpuStreamValue(stream));
if (ret != CURAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for random generation: " << ret;
return false;
@ -182,8 +159,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
}
template <typename T>
bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
DeviceMemory<T> *v) {
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
mutex_lock lock(mu_);
static_assert(ComplexIsConsecutiveFloats(),
"std::complex values are not stored as consecutive values");
@ -203,11 +179,11 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
if (std::is_same<T, float>::value ||
std::is_same<T, std::complex<float>>::value) {
ret = wrap::curandGenerateUniform(
parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)),
parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
element_count);
} else {
ret = wrap::curandGenerateUniformDouble(
parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)),
parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
element_count);
}
if (ret != CURAND_STATUS_SUCCESS) {
@ -220,29 +196,29 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
return true;
}
bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool CUDARng::DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<float>> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<float>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool CUDARng::DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<double>> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<double>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
template <typename ElemT, typename FuncT>
bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
ElemT stddev,
DeviceMemory<ElemT> *v,
FuncT func) {
bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
ElemT stddev,
DeviceMemory<ElemT>* v,
FuncT func) {
mutex_lock lock(mu_);
if (!SetStream(stream)) {
@ -251,7 +227,7 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
uint64 element_count = v->ElementCount();
curandStatus_t ret =
func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev);
func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
if (ret != CURAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
@ -262,19 +238,19 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
return true;
}
bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
DeviceMemory<float> *v) {
bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
DeviceMemory<float>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::curandGenerateNormal);
}
bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
DeviceMemory<double> *v) {
bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
DeviceMemory<double>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::curandGenerateNormalDouble);
}
bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
mutex_lock lock(mu_);
CHECK(rng_ != nullptr);
@ -303,15 +279,15 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
return true;
}
} // namespace cuda
} // namespace gpu
void initialize_curand() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
[](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
cuda::CUDAExecutor *cuda_executor =
dynamic_cast<cuda::CUDAExecutor *>(parent);
cuda::kCudaPlatformId, gpu::kGpuRandPlugin, "cuRAND",
[](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
gpu::GpuExecutor* cuda_executor =
dynamic_cast<gpu::GpuExecutor*>(parent);
if (cuda_executor == nullptr) {
LOG(ERROR)
<< "Attempting to initialize an instance of the cuRAND "
@ -319,7 +295,7 @@ void initialize_curand() {
return nullptr;
}
cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
gpu::GpuRng* rng = new gpu::GpuRng(cuda_executor);
if (!rng->Init()) {
// Note: Init() will log a more specific error.
delete rng;
@ -334,7 +310,7 @@ void initialize_curand() {
}
PluginRegistry::Instance()->SetDefaultFactory(
cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
cuda::kCudaPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
}
} // namespace stream_executor
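
For orientation, a rough client-side sketch (not part of this change; it assumes the standard StreamExecutor headers, a CUDA build, and the StatusOr/ValueOrDie convention) of how the plugin registered above is reached: enqueuing uniform random generation on a Stream routes through the factory to gpu::GpuRng.

#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"

void FillUniformOnDevice() {
  namespace se = stream_executor;
  // Look up the CUDA platform and its first device.
  se::Platform* platform =
      se::MultiPlatformManager::PlatformWithName("CUDA").ValueOrDie();
  se::StreamExecutor* executor = platform->ExecutorForDevice(0).ValueOrDie();

  // Allocate device memory and enqueue RNG work; this ends up in
  // gpu::GpuRng::DoPopulateRandUniform via the factory registered in
  // initialize_curand().
  se::DeviceMemory<float> data = executor->AllocateArray<float>(1024);
  se::Stream stream(executor);
  stream.Init();
  stream.ThenPopulateRandUniform(&data);
  stream.BlockHostUntilDone();
  executor->Deallocate(&data);
}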


@ -16,85 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rng.h"
typedef struct curandGenerator_st *curandGenerator_t;
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
namespace stream_executor {
class Stream;
template <typename ElemT>
class DeviceMemory;
namespace cuda {
// Opaque and unique identifier for the cuRAND plugin.
extern const PluginId kCuRandPlugin;
class CUDAExecutor;
// CUDA-platform implementation of the random number generation support
// interface.
//
// Thread-safe post-initialization.
class CUDARng : public rng::RngSupport {
public:
explicit CUDARng(CUDAExecutor *parent);
// Retrieves a curand library generator handle. This is necessary for
// enqueuing random number generation work onto the device.
// TODO(leary) provide a way for users to select the RNG algorithm.
bool Init();
// Releases a curand library generator handle, if one was acquired.
~CUDARng() override;
// See rng::RngSupport for details on the following overrides.
bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
bool DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<float>> *v) override;
bool DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<double>> *v) override;
bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
DeviceMemory<float> *v) override;
bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
DeviceMemory<double> *v) override;
bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;
private:
// Actually performs the work of generating random numbers - the public
// methods are thin wrappers to this interface.
template <typename T>
bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
template <typename ElemT, typename FuncT>
bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
DeviceMemory<ElemT> *v, FuncT func);
// Sets the stream for the internal curand generator.
//
// This is a stateful operation, as the handle can only have one stream set at
// a given time, so it is usually performed right before enqueuing work to do
// with random number generation.
bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
// mutex that guards the cuRAND handle for this device.
mutex mu_;
// CUDAExecutor which instantiated this CUDARng.
// Immutable post-initialization.
CUDAExecutor *parent_;
// cuRAND library handle on the device.
curandGenerator_t rng_ GUARDED_BY(mu_);
SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
};
using CUDARng = gpu::GpuRng;
} // namespace cuda
} // namespace stream_executor
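
A minimal sketch of what the alias means for existing call sites (a hand-written example, assuming the usual gpu_executor.h header): cuda::CUDARng and gpu::GpuRng now name the same type, so code written against the CUDA spelling keeps compiling unchanged.

#include "tensorflow/stream_executor/cuda/cuda_rng.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"

namespace se = stream_executor;

// The returned pointer can be used wherever either spelling is expected.
se::cuda::CUDARng* MakeRng(se::gpu::GpuExecutor* executor) {
  return new se::gpu::GpuRng(executor);
}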


@ -13,79 +13,22 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the CUDAStream type - the CUDA-specific implementation of the generic
// Defines the GpuStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
namespace stream_executor {
namespace cuda {
class CUDAExecutor;
using CUDAStream = gpu::GpuStream;
// Wraps a CUstream in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class CUDAStream : public internal::StreamInterface {
public:
explicit CUDAStream(CUDAExecutor *parent)
: parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}
// Note: teardown is handled by a parent's call to DeallocateStream.
~CUDAStream() override {}
void *GpuStreamHack() override { return cuda_stream_; }
void **GpuStreamMemberHack() override {
return reinterpret_cast<void **>(&cuda_stream_);
}
// Explicitly initialize the CUDA resources associated with this stream, used
// by StreamExecutor::AllocateStream().
bool Init();
// Explicitly destroy the CUDA resources associated with this stream, used by
// StreamExecutor::DeallocateStream().
void Destroy();
// Returns true if no work is pending or executing on the stream.
bool IsIdle() const;
// Retrieves an event which indicates that all work enqueued into the stream
// has completed. Ownership of the event is not transferred to the caller, the
// event is owned by this stream.
CUevent* completed_event() { return &completed_event_; }
// Returns the CUstream value for passing to the CUDA API.
//
// Precond: this CUDAStream has been allocated (otherwise passing a nullptr
// into the NVIDIA library causes difficult-to-understand faults).
CUstream cuda_stream() const {
DCHECK(cuda_stream_ != nullptr);
return const_cast<CUstream>(cuda_stream_);
}
CUDAExecutor *parent() const { return parent_; }
private:
CUDAExecutor *parent_; // Executor that spawned this stream.
CUstream cuda_stream_; // Wrapped CUDA stream handle.
// Event that indicates this stream has completed.
CUevent completed_event_ = nullptr;
};
// Helper functions to simplify extremely common flows.
// Converts a Stream to the underlying CUDAStream implementation.
CUDAStream *AsCUDAStream(Stream *stream);
// Extracts a CUstream from a CUDAStream-backed Stream object.
CUstream AsCUDAStreamValue(Stream *stream);
inline CUDAStream* AsCUDAStream(Stream* stream) {
return gpu::AsGpuStream(stream);
}
} // namespace cuda
} // namespace stream_executor
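
A hedged interop sketch (assuming the AsGpuStream/gpu_stream() accessors from gpu_stream.h, as used elsewhere in this change): recovering the raw platform stream handle from a generic Stream.

#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/stream.h"

namespace se = stream_executor;

// Extract the underlying GpuStreamHandle (CUstream on the CUDA platform) so it
// can be handed to a library call that needs the raw handle.
se::gpu::GpuStreamHandle RawGpuStreamHandle(se::Stream* stream) {
  return se::gpu::AsGpuStream(stream)->gpu_stream();
}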


@ -13,76 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the CUDATimer type - the CUDA-specific implementation of the generic
// Defines the GpuTimer type - the CUDA-specific implementation of the generic
// StreamExecutor Timer interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
namespace stream_executor {
namespace cuda {
class CUDAExecutor;
class CUDAStream;
// Wraps a pair of CUevents in order to satisfy the platform-independent
// TimerInterface -- both a start and a stop event are present which may be
// recorded in a stream.
class CUDATimer : public internal::TimerInterface {
public:
explicit CUDATimer(CUDAExecutor *parent)
: parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
// Note: teardown needs to be explicitly handled in this API by a call to
// StreamExecutor::DeallocateTimer(), which invokes Destroy().
// TODO(csigg): Change to RAII.
~CUDATimer() override {}
// Allocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::AllocateTimer().
bool Init();
// Deallocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::DeallocateTimer().
void Destroy();
// Records the "timer start" event at the current point in the stream.
bool Start(CUDAStream *stream);
// Records the "timer stop" event at the current point in the stream.
bool Stop(CUDAStream *stream);
// Returns the elapsed time, in milliseconds, between the start and stop
// events.
float GetElapsedMilliseconds() const;
// See Timer::Microseconds().
// TODO(leary) make this into an error code interface...
uint64 Microseconds() const override {
return GetElapsedMilliseconds() * 1e3;
}
// See Timer::Nanoseconds().
uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
private:
CUDAExecutor *parent_;
CUevent start_event_; // Event recorded to indicate the "start" timestamp
// executing in a stream.
CUevent stop_event_; // Event recorded to indicate the "stop" timestamp
// executing in a stream.
};
struct TimerDeleter {
void operator()(CUDATimer *t) {
t->Destroy();
delete t;
}
};
using CUDATimer = gpu::GpuTimer;
} // namespace cuda
} // namespace stream_executor


@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cudnn_version.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
CudnnVersion loaded_version) {
@ -36,5 +36,5 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
loaded_version.minor_version >= source_version.minor_version));
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/core/lib/strings/strcat.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
struct CudnnVersion {
CudnnVersion() = default;
@ -44,7 +44,7 @@ struct CudnnVersion {
bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
CudnnVersion loaded_version);
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_


@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/platform/test.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {
TEST(CuDNNVersion, ToString) {
@ -68,5 +68,5 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
}
} // namespace
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
clock_rate_ghz_(-1.0),
cuda_compute_capability_major_(-1),
cuda_compute_capability_minor_(-1),
rocm_amdgpu_isa_version_(-1),
numa_node_(-1),
core_count_(-1),
ecc_enabled_(false) {}
@ -112,6 +113,15 @@ bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
return cuda_compute_capability_major_ != 0;
}
bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
bool status = false;
if (rocm_amdgpu_isa_version_ > 0) {
*version = rocm_amdgpu_isa_version_;
status = true;
}
return status;
}
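
The new query mirrors cuda_compute_capability(): both fill an outparam and return false when the information is unavailable. A caller-side sketch (a hypothetical helper, assuming the usual logging header), showing how platform-specific code can branch on whichever query succeeds:

#include "tensorflow/stream_executor/device_description.h"
#include "tensorflow/stream_executor/platform/logging.h"

void LogGpuArchitecture(const stream_executor::DeviceDescription& desc) {
  int cc_major, cc_minor, isa_version;
  if (desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    LOG(INFO) << "CUDA compute capability " << cc_major << "." << cc_minor;
  } else if (desc.rocm_amdgpu_isa_version(&isa_version)) {
    LOG(INFO) << "AMDGPU ISA version " << isa_version;
  } else {
    LOG(INFO) << "no GPU architecture information available";
  }
}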
bool ThreadDimOk(const DeviceDescription &device_description,
const ThreadDim &thread_dim) {
auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;


@ -133,6 +133,11 @@ class DeviceDescription {
// zero, and the return value will be false.
bool cuda_compute_capability(int *major, int *minor) const;
// Returns the AMDGPU ISA version if we're running on the ROCm platform.
// If the information is not available, the version is not modified,
// and the return value will be false.
bool rocm_amdgpu_isa_version(int *version) const;
// Returns the maximum amount of shared memory present on a single core
// (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
// devices). Note that some devices, such as NVIDIA's have a configurable
@ -195,6 +200,9 @@ class DeviceDescription {
int cuda_compute_capability_major_;
int cuda_compute_capability_minor_;
// ROCM AMDGPU ISA version, 0 if not available.
int rocm_amdgpu_isa_version_;
int numa_node_;
int core_count_;
bool ecc_enabled_;
@ -280,6 +288,10 @@ class DeviceDescriptionBuilder {
device_description_->cuda_compute_capability_minor_ = minor;
}
void set_rocm_amdgpu_isa_version(int version) {
device_description_->rocm_amdgpu_isa_version_ = version;
}
void set_numa_node(int value) { device_description_->numa_node_ = value; }
void set_core_count(int value) { device_description_->core_count_ = value; }
void set_ecc_enabled(bool value) {


@ -0,0 +1,209 @@
# Description:
# GPU-platform specific StreamExecutor support code.
licenses(["notice"]) # Apache 2.0
load(
"//tensorflow/stream_executor:build_defs.bzl",
"stream_executor_friends",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
package_group(
name = "friends",
packages = stream_executor_friends(),
)
package(
default_visibility = [":friends"],
)
# Filegroup used to collect source files for the dependency check.
filegroup(
name = "c_srcs",
data = glob([
"**/*.cc",
"**/*.h",
]),
)
cc_library(
name = "gpu_activation_header",
hdrs = ["gpu_activation.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = ["//tensorflow/stream_executor/platform"],
)
cc_library(
name = "gpu_activation",
srcs = ["gpu_activation.cc"],
hdrs = ["gpu_activation.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_activation_header",
":gpu_driver_header",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_diagnostics_header",
hdrs = ["gpu_diagnostics.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_driver_header",
hdrs = ["gpu_driver.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_types_header",
"//tensorflow/stream_executor:device_options",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"@local_config_cuda//cuda:cuda_headers",
],
)
cc_library(
name = "gpu_event_header",
hdrs = ["gpu_event.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_stream_header",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor/lib",
],
)
cc_library(
name = "gpu_event",
srcs = ["gpu_event.cc"],
hdrs = ["gpu_event.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
":gpu_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/lib",
],
)
cc_library(
name = "gpu_executor_header",
hdrs = ["gpu_executor.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_kernel_header",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:platform",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "gpu_helpers_header",
hdrs = ["gpu_helpers.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [":gpu_types_header"],
)
cc_library(
name = "gpu_kernel_header",
hdrs = ["gpu_kernel.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_rng_header",
hdrs = ["gpu_rng.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_types_header",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:rng",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_stream_header",
hdrs = ["gpu_stream.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_stream",
srcs = ["gpu_stream.cc"],
hdrs = ["gpu_stream.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor:stream_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_timer_header",
hdrs = ["gpu_timer.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
"//tensorflow/stream_executor:stream_executor_internal",
],
)
cc_library(
name = "gpu_timer",
srcs = ["gpu_timer.cc"],
hdrs = ["gpu_timer.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
":gpu_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/lib",
],
)
cc_library(
name = "gpu_types_header",
hdrs = ["gpu_types.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
"//tensorflow/stream_executor/platform",
] + if_cuda_is_configured([
"@local_config_cuda//cuda:cuda_headers",
]) + if_rocm_is_configured([
"@local_config_rocm//rocm:rocm_headers",
]),
)


@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,36 +13,36 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
CUDAExecutor *cuda_exec):
driver_scoped_activate_context_(
new ScopedActivateContext{ExtractCudaContext(cuda_exec)}) { }
GpuExecutor* gpu_exec)
: driver_scoped_activate_context_(
new ScopedActivateContext{ExtractGpuContext(gpu_exec)}) {}
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
StreamExecutor *stream_exec)
: ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec)) {}
StreamExecutor* stream_exec)
: ScopedActivateExecutorContext(ExtractGpuExecutor(stream_exec)) {}
ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
}
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
ScopedActivateExecutorContext &&other)
ScopedActivateExecutorContext&& other)
: driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
other.driver_scoped_activate_context_ = nullptr;
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -0,0 +1,61 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This file contains APIs that assume a StreamExecutor is backed by CUDA.
// It reaches into the CUDA implementation to activate an underlying CUDA
// context.
//
// Having this file separate from gpu/gpu_executor.h means that dependent
// code does not also have to depend on cuda.h.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
#include "tensorflow/stream_executor/platform/port.h"
namespace stream_executor {
class StreamExecutor;
namespace gpu {
class GpuExecutor;
class ScopedActivateContext;
// Activates a CUDA context within an enclosing scope.
class ScopedActivateExecutorContext {
public:
// Form that takes a CUDA executor implementation.
explicit ScopedActivateExecutorContext(GpuExecutor* gpu_exec);
// Form that takes a pImpl executor and extracts a CUDA implementation --
// fatal failure if it is not CUDA inside.
explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
~ScopedActivateExecutorContext();
private:
// The cuda.h-using datatype that we wrap.
ScopedActivateContext* driver_scoped_activate_context_;
SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
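
A brief usage sketch (hypothetical call site, using only the constructors declared above): the RAII wrapper makes the executor's context current for the scope and restores the previous one on destruction.

#include "tensorflow/stream_executor/gpu/gpu_activation.h"

namespace se = stream_executor;

void DoDriverWork(se::StreamExecutor* stream_exec) {
  // Activates the underlying GPU context; fatal if stream_exec is not backed
  // by the CUDA/ROCm implementation.
  se::gpu::ScopedActivateExecutorContext activation(stream_exec);
  // ... raw driver-level calls that require the context to be current ...
}  // Previous context is restored here.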


@ -0,0 +1,99 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
#include <tuple>
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
namespace stream_executor {
namespace gpu {
// e.g. DriverVersion{346, 3, 4}
using DriverVersion = std::tuple<int, int, int>;
// FIXME: These functions are in the stream_executor::cuda namespace for now.
// They will move to the stream_executor::gpu namespace in the near future.
//
//// Converts a parsed driver version to string form.
// string DriverVersionToString(DriverVersion version);
//
//// Converts a parsed driver version or status value to natural string form.
// string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
//
//// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
// port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
class Diagnostician {
public:
// Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
// not initializing).
//
// Note: if we're running on a machine that has no GPUs, we don't want to
// produce very much log spew beyond saying, "looks like there's no CUDA
// kernel module running".
//
// Note: we use non-Google-File:: API here because we may be called before
// InitGoogle has completed.
static void LogDiagnosticInformation();
// Given the driver version file contents, finds the kernel module version and
// returns it as a string.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static port::StatusOr<DriverVersion> FindKernelModuleVersion(
const string& driver_version_file_contents);
// Extracts the kernel driver version from the current host.
static port::StatusOr<DriverVersion> FindKernelDriverVersion();
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a string.
static port::StatusOr<DriverVersion> FindDsoVersion();
// Logs information about the kernel driver version and userspace driver
// library version.
static void LogDriverVersionInformation();
private:
// Given the DSO version number and the driver version file contents, extracts
// the driver version and compares, warning the user in the case of
// incompatibility.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static void WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> dso_version,
port::StatusOr<DriverVersion> kernel_version);
// Logs information about the dev nodes present on this machine: their
// existence, permissions, accessibility from this uid/gid.
static void LogDevNodeDiagnosticInformation();
static string GetDevNodePath(int dev_node_ordinal);
SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
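
A short, hypothetical sketch of how these hooks tend to be used when GPU initialization fails (DriverVersion is a std::tuple, so direct comparison works; the StatusOr accessors follow the ok()/ValueOrDie() convention):

#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/platform/logging.h"

namespace se = stream_executor;

void ReportDriverState() {
  // One-stop logging of DSO version, kernel module version and dev nodes.
  se::gpu::Diagnostician::LogDiagnosticInformation();

  // Or inspect the pieces individually.
  auto dso_version = se::gpu::Diagnostician::FindDsoVersion();
  auto kernel_version = se::gpu::Diagnostician::FindKernelDriverVersion();
  if (dso_version.ok() && kernel_version.ok() &&
      dso_version.ValueOrDie() != kernel_version.ValueOrDie()) {
    LOG(WARNING) << "userspace driver / kernel module version mismatch";
  }
}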


@ -0,0 +1,525 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// CUDA userspace driver library wrapper functionality.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
#include <stddef.h>
#include "tensorflow/stream_executor/platform/port.h"
#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
namespace gpu {
// Identifies the memory space where an allocation resides. See
// GpuDriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };
// Returns a casual string, such as "host" for the provided memory space.
string MemorySpaceString(MemorySpace memory_space);
class GpuContext;
// GpuDriver contains wrappers for calls to the userspace library driver. It's
// useful to isolate these calls and put basic wrappers around them to separate
// userspace library driver behaviors from the rest of the program.
//
// At the moment it's simply used as a namespace.
//
// The calls log any specific errors internally and return whether the operation
// was successful to the caller.
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class GpuDriver {
public:
// Wraps a call to cuInit with logging to help indicate what has gone wrong in
// the case of failure. Safe to call multiple times; will be fast on all calls
// after the first.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
static port::Status Init();
// Returns the device associated with the given context.
// device is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);
// Creates a new CUDA stream associated with the given context via
// cuStreamCreate.
// stream is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);
// Destroys a CUDA stream associated with the given context.
// stream is owned by the caller, must not be null, and *stream is set to null
// if the stream is successfully destroyed.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);
// CUDA events can explicitly disable event TSC retrieval for some presumed
// performance improvement if timing is unnecessary.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
enum class EventFlags { kDefault, kDisableTiming };
// Creates a new event associated with the given context.
// result is an outparam owned by the caller and must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
EventFlags flags);
// Destroys *event and turns it into a nullptr. event may not be null, but
// *event may be, via cuEventDestroy
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);
// Allocates a GPU memory space of size bytes associated with the given
// context via cuMemAlloc.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
static void* DeviceAllocate(GpuContext* context, uint64 bytes);
// Deallocates a GPU memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(GpuContext* context, void* location);
// Allocates a unified memory space of size bytes associated with the given
// context via cuMemAllocManaged.
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
// (supported on CUDA only)
static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);
// Deallocates a unified memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
// (supported on CUDA only)
static void UnifiedMemoryDeallocate(GpuContext* context, void* location);
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
static void* HostAllocate(GpuContext* context, uint64 bytes);
// Deallocates a location created by HostAllocate, via cuMemFreeHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
static void HostDeallocate(GpuContext* context, void* location);
// Registers a memory region at location of size bytes via cuMemHostRegister.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
static bool HostRegister(GpuContext* context, void* location, uint64 bytes);
// Unregisters a memory region that was previously registered at location via
// cuMemHostUnregister.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
//
// TODO(leary) verify an error will be returned if the location wasn't
// previously registered.
static bool HostUnregister(GpuContext* context, void* location);
// Given a device ordinal, returns a device handle into the device outparam,
// which must not be null.
//
// N.B. these device handles do not have a corresponding destroy function in
// the CUDA driver API.
static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);
// Given a device handle, returns the name reported by the driver for the
// device.
static bool GetDeviceName(GpuDeviceHandle device, string* device_name);
// Given a device to create a context for, returns a context handle into the
// context outparam, which must not be null.
//
// N.B. CUDA contexts are weird. They are implicitly associated with the
// calling thread. Current documentation on contexts and their influence on
// userspace processes is given here:
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
const DeviceOptions& device_options,
GpuContext** context);
// Destroys the provided context via cuCtxDestroy.
// Don't do this while clients could still be using the context, per the docs
// bad things will happen.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
static void DestroyContext(GpuContext* context);
// Queries the runtime for the specified attribute of the specified function.
// cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
// in terms of integer-sized values, so there's no potential for overrun (as
// of CUDA 5.5).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
static bool FuncGetAttribute(GpuFunctionAttribute attribute,
GpuFunctionHandle function,
int* attribute_value);
// Sets the preferred cache configuration for the specified function.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
static bool FuncSetCacheConfig(GpuFunctionHandle function,
GpuFuncCachePreference cache_config);
// Gets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
GpuContext* context);
// Sets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
static port::Status ContextSetSharedMemConfig(
GpuContext* context, GpuSharedMemConfig shared_mem_config);
// Launches a CUDA kernel via cuLaunchKernel.
// TODO(leary) describe the structure of kernel_params and extra in a readable
// way.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
unsigned int grid_dim_x, unsigned int grid_dim_y,
unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes,
GpuStreamHandle stream, void** kernel_params,
void** extra);
// Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
// handle in "module". Any error logs that are produced are logged internally.
// (supported on CUDA only)
static bool LoadPtx(GpuContext* context, const char* ptx_contents,
GpuModuleHandle* module);
// Loads cubin_bytes with the CUDA driver's blob loading interface and stores
// the resulting handle in "module".
// (supported on CUDA only)
static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
GpuModuleHandle* module);
// Loads HSACO with the ROCM runtime and stores the resulting handle in
// "module". Any error logs that are produced are logged internally.
// (supported on ROCm only)
static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
GpuModuleHandle* module);
// Retrieves a named kernel from a loaded module, and places the resulting
// handle into function (outparam) on success. Neither kernel_name nor
// function may be null. No ownership is taken of kernel_name.
static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
const char* kernel_name,
GpuFunctionHandle* function);
// Retrieves a named global/constant symbol from a loaded module, and returns
// a device pointer and size of the symbol on success. symbol_name may not be
// null. At least one of dptr or bytes should not be null. No ownership is
// taken of symbol_name.
static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
const char* symbol_name, GpuDevicePtr* dptr,
size_t* bytes);
// Unloads module from the current context via cuModuleUnload.
// TODO(leary) the documentation doesn't say what kind of disasters happen
// if you try to unload a module while its GpuFunctionHandles are in use.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
static void UnloadModule(GpuContext* context, GpuModuleHandle module);
// Performs a synchronous memset of the device memory segment via cuMemsetD8.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
uint8 value, size_t size);
// Performs a synchronous memset of the device memory segment via cuMemsetD32.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
static bool SynchronousMemsetUint32(GpuContext* context,
GpuDevicePtr location, uint32 value,
size_t uint32_count);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD8Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
static bool AsynchronousMemsetUint8(GpuContext* context,
GpuDevicePtr location, uint8 value,
size_t uint32_count,
GpuStreamHandle stream);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD32Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
static bool AsynchronousMemsetUint32(GpuContext* context,
GpuDevicePtr location, uint32 value,
size_t uint32_count,
GpuStreamHandle stream);
// -- Synchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
GpuDevicePtr gpu_src, uint64 size);
static port::Status SynchronousMemcpyH2D(GpuContext* context,
GpuDevicePtr gpu_dst,
const void* host_src, uint64 size);
static port::Status SynchronousMemcpyD2D(GpuContext* context,
GpuDevicePtr gpu_dst,
GpuDevicePtr gpu_src, uint64 size);
// -- Asynchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
GpuDevicePtr gpu_src, uint64 size,
GpuStreamHandle stream);
static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
const void* host_src, uint64 size,
GpuStreamHandle stream);
static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
GpuDevicePtr gpu_src, uint64 size,
GpuStreamHandle stream);
// The CUDA stream callback type signature.
// The data passed to AddStreamCallback is subsequently passed to this
// callback when it fires.
//
// Some notable things:
// * Callbacks must not make any CUDA API calls.
// * Callbacks from independent streams execute in an undefined order and may
// be serialized.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
void* data);
// Enqueues a callback operation into stream.
// See StreamCallback above and the NVIDIA documentation for additional
// details.
static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
StreamCallback callback, void* data);
// Causes stream to wait for event to trigger before proceeding via
// cuStreamWaitEvent.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
GpuEventHandle event);
// Blocks the calling thread until the operations enqueued onto stream have
// been completed, via cuStreamSynchronize.
//
// TODO(leary) if a pathological thread enqueues operations onto the stream
// while another thread blocks like this, can you wind up waiting an unbounded
// amount of time?
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
static port::Status SynchronizeStream(GpuContext* context,
GpuStreamHandle stream);
// Blocks the calling thread until the operations associated with the context
// have been completed, via cuCtxSynchronize.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
static bool SynchronizeContext(GpuContext* context);
// Returns true if all stream tasks have completed at time of the call. Note
// the potential for races around this call (if another thread adds work to
// the stream immediately after this returns).
static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);
// Returns whether code in the from context can access memory in the to
// context via cuDeviceCanAccessPeer.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
// Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
// Returns the elapsed milliseconds between start and stop via
// cuEventElapsedTime.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
static bool GetEventElapsedTime(GpuContext* context,
float* elapsed_milliseconds,
GpuEventHandle start, GpuEventHandle stop);
// Records that an event occurred when execution reaches the current point in
// the stream via cuEventRecord.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
GpuStreamHandle stream);
// Polls (without blocking) to determine the status of an event - pending or
// complete (or an error status).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
GpuEventHandle event);
// -- Pointer-specific calls.
// Returns the context in which pointer was allocated or registered.
static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);
// Returns the device associated with the context from GetPointerContext().
static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);
// Returns the memory space addressed by pointer.
static port::StatusOr<MemorySpace> GetPointerMemorySpace(
GpuDevicePtr pointer);
// Returns the base address and size of the device pointer dptr.
static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
GpuDevicePtr* base, size_t* size);
// -- Device-specific calls.
// Returns the compute capability for the device, e.g. (3, 5).
// This is currently done via the deprecated device API.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
// (supported on CUDA only)
static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
GpuDeviceHandle device);
// Returns the GPU ISA version for the device, e.g. 803, 900.
// (supported on ROCm only)
static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);
// Returns the number of multiprocessors on the device (note that the device
// may be multi-GPU-per-board).
static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);
// Returns the limit on number of threads that can be resident in a single
// multiprocessor.
static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
GpuDeviceHandle device);
// Returns the limit on number of threads which may be resident for a single
// block (cooperative thread array).
static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);
// Returns the amount of shared memory available on a single GPU core (i.e.
// SM on NVIDIA devices).
static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
GpuDeviceHandle device);
// Returns the amount of shared memory available for a single block
// (cooperative thread array).
static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
GpuDeviceHandle device);
// Returns the maximum supported number of registers per block.
static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);
// Returns the number of threads per warp.
static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);
// Queries the grid limits for device with cuDeviceGetAttribute calls.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);
// Returns a grab-bag of device properties in a caller-owned device_properties
// structure for device_ordinal via cuDeviceGetProperties.
//
// This call is deprecated in the NVIDIA driver API; its replacement is
// GetDeviceAttribute
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
int device_ordinal);
// Gets a specific integer-valued property about the given device.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
GpuDeviceHandle device);
// Returns whether ECC is enabled for the given GpuDeviceHandle via
// cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool IsEccEnabled(GpuDeviceHandle device, bool* result);
// Returns the total amount of memory available for allocation by the CUDA
// context, in bytes, via cuDeviceTotalMem.
static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);
// Returns the free amount of memory and total amount of memory, as reported
// by cuMemGetInfo.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
int64* total);
// Returns a PCI bus id string for the device.
// [domain]:[bus]:[device].[function]
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
static string GetPCIBusID(GpuDeviceHandle device);
// -- Context- and device-independent calls.
// Returns the number of visible CUDA devices via cuDeviceGetCount.
// This should correspond to the set of device ordinals available.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
static int GetDeviceCount();
// Returns the driver version number via cuDriverGetVersion.
// This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
// instead, the CUDA toolkit release number that this driver is compatible
// with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
// compatible driver).
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
static bool GetDriverVersion(int* driver_version);
// -- Other calls
// Returns the maximum number of blocks (per multiprocessor) occupied by the
// specified kernel/GpuFunctionHandle when launched with the specified
// parameters.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes);
// Seam for injecting an error at CUDA initialization time for testing
// purposes.
static bool driver_inject_init_error_;
};
// Ensures a context is activated within a scope.
class ScopedActivateContext {
public:
// Activates the context via cuCtxSetCurrent, if it is not the currently
// active context (a la cuCtxGetCurrent). Note the alternative push/pop
// mechanism is said by NVIDIA to be relatively slow and deprecated.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
explicit ScopedActivateContext(GpuContext* context);
// Checks that the context has remained activated for the duration of the
// scope.
~ScopedActivateContext();
private:
GpuContext* to_restore_ = nullptr;
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
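
To make the wrapper's calling conventions concrete, a rough end-to-end sketch (a hand-written example, not from this change; it assumes DeviceOptions::Default() and the status-returning signatures declared above): copy a host buffer into freshly allocated device memory, then clean up.

#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"

namespace stream_executor {
namespace gpu {

port::Status CopyToNewDeviceBuffer(const void* host_src, size_t size) {
  port::Status status = GpuDriver::Init();
  if (!status.ok()) return status;

  GpuDeviceHandle device;
  status = GpuDriver::GetDevice(/*device_ordinal=*/0, &device);
  if (!status.ok()) return status;

  GpuContext* context = nullptr;
  status = GpuDriver::CreateContext(/*device_ordinal=*/0, device,
                                    DeviceOptions::Default(), &context);
  if (!status.ok()) return status;

  {
    ScopedActivateContext activation(context);  // make the context current
    // (A production version would also check the allocation for nullptr.)
    void* dst = GpuDriver::DeviceAllocate(context, size);
    status = GpuDriver::SynchronousMemcpyH2D(
        context, reinterpret_cast<GpuDevicePtr>(dst), host_src, size);
    GpuDriver::DeviceDeallocate(context, dst);
  }
  GpuDriver::DestroyContext(context);
  return status;
}

}  // namespace gpu
}  // namespace stream_executor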


@ -0,0 +1,47 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/statusor.h"
namespace stream_executor {
namespace gpu {
GpuEvent::GpuEvent(GpuExecutor* parent)
: parent_(parent), gpu_event_(nullptr) {}
GpuEvent::~GpuEvent() {}
port::Status GpuEvent::Init() {
return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
GpuDriver::EventFlags::kDisableTiming);
}
port::Status GpuEvent::Destroy() {
return GpuDriver::DestroyEvent(parent_->gpu_context(), &gpu_event_);
}
port::Status GpuEvent::Record(GpuStream* stream) {
return GpuDriver::RecordEvent(parent_->gpu_context(), gpu_event_,
stream->gpu_stream());
}
GpuEventHandle GpuEvent::gpu_event() { return gpu_event_; }
} // namespace gpu
} // namespace stream_executor


@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/status.h"
namespace stream_executor {
namespace gpu {
// GpuEvent wraps a GpuEventHandle in the platform-independent EventInterface
// interface.
class GpuEvent : public internal::EventInterface {
public:
explicit GpuEvent(GpuExecutor* parent);
~GpuEvent() override;
// Populates the CUDA-platform-specific elements of this object.
port::Status Init();
// Deallocates any platform-specific elements of this object. This is broken
// out (not part of the destructor) to allow for error reporting.
port::Status Destroy();
// Inserts the event at the current position into the specified stream.
port::Status Record(GpuStream* stream);
// Polls the CUDA platform for the event's current status.
Event::Status PollForStatus();
// The underlying CUDA event element.
GpuEventHandle gpu_event();
private:
// The Executor to which this object and GpuEventHandle are bound.
GpuExecutor* parent_;
// The underlying CUDA event element.
GpuEventHandle gpu_event_;
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
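
A hypothetical lifecycle sketch tying the pieces together (it assumes a live GpuExecutor/GpuStream pair; Init/Record/Destroy all report failures through port::Status):

#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"

namespace stream_executor {
namespace gpu {

// Records an event at the stream's current position; callers can later poll it
// (PollForStatus) to learn whether that position has been reached.
bool MarkStreamPosition(GpuExecutor* executor, GpuStream* stream) {
  GpuEvent event(executor);
  if (!event.Init().ok()) return false;
  bool recorded = event.Record(stream).ok();
  port::Status destroy_status = event.Destroy();
  return recorded && destroy_status.ok();
}

}  // namespace gpu
}  // namespace stream_executor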


@ -0,0 +1,347 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#include <set>
#include <unordered_map>
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
public:
// sub_platform indicates the subplatform used in this executor; it must
// be a CUDA type.
explicit GpuExecutor(const PluginConfig& plugin_config)
: device_(0),
context_(nullptr),
device_ordinal_(0),
cc_major_(0),
cc_minor_(0),
version_(0),
plugin_config_(plugin_config) {}
// See the corresponding StreamExecutor methods for method comments on the
// following overrides.
~GpuExecutor() override;
port::Status Init(int device_ordinal, DeviceOptions device_options) override;
bool GetKernel(const MultiKernelLoaderSpec& spec,
KernelBase* kernel) override;
// (supported on CUDA only)
void UnloadKernel(const KernelBase* kernel) override;
bool LoadModule(const MultiModuleLoaderSpec& spec,
ModuleHandle* module_handle) override;
bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream* stream, const ThreadDim& thread_dims,
const BlockDim& block_dims, const KernelBase& k,
const KernelArgsArrayBase& args) override;
// (supported on CUDA only)
int CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, GpuFunctionHandle func);
// (supported on CUDA only)
int CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, GpuFunctionHandle func);
void* Allocate(uint64 size) override;
void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
uint64 size_bytes) override;
void Deallocate(DeviceMemoryBase* mem) override;
void* UnifiedMemoryAllocate(uint64 size) override {
return GpuDriver::UnifiedMemoryAllocate(context_, size);
}
void UnifiedMemoryDeallocate(void* location) override {
return GpuDriver::UnifiedMemoryDeallocate(context_, location);
}
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA
// settings.
void* HostMemoryAllocate(uint64 size) override {
return GpuDriver::HostAllocate(context_, size);
}
void HostMemoryDeallocate(void* location) override {
return GpuDriver::HostDeallocate(context_, location);
}
bool HostMemoryRegister(void* location, uint64 size) override;
bool HostMemoryUnregister(void* location) override;
bool SynchronizeAllActivity() override;
bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
bool SynchronousMemSet(DeviceMemoryBase* location, int value,
uint64 size) override;
port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) override;
port::Status SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) override;
port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) override;
bool MemZero(Stream* stream, DeviceMemoryBase* location,
uint64 size) override;
bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
uint64 size) override;
bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
uint64 size) override;
bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
uint64 size) override;
bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
uint64 size) override;
bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) override;
bool HostCallback(Stream* stream,
std::function<port::Status()> callback) override;
bool AllocateStream(Stream* stream) override;
void DeallocateStream(Stream* stream) override;
bool CreateStreamDependency(Stream* dependent, Stream* other) override;
bool AllocateTimer(Timer* timer) override;
void DeallocateTimer(Timer* timer) override;
bool StartTimer(Stream* stream, Timer* timer) override;
bool StopTimer(Stream* stream, Timer* timer) override;
port::Status AllocateEvent(Event* event) override;
port::Status DeallocateEvent(Event* event) override;
port::Status RecordEvent(Stream* stream, Event* event) override;
port::Status WaitForEvent(Stream* stream, Event* event) override;
Event::Status PollForEventStatus(Event* event) override;
port::Status BlockHostUntilDone(Stream* stream) override;
int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
bool DeviceMemoryUsage(int64* free, int64* total) const override;
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
void** mem, size_t* bytes) override;
DeviceDescription* PopulateDeviceDescription() const override;
// Populates the block_dim_limit by querying the device driver API. If an
// error occurs at any point while asking the driver for block dim limits, it
// will be only partially populated as a result, and an error will be logged.
bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
bool SupportsBlas() const override;
blas::BlasSupport* CreateBlas() override;
bool SupportsFft() const override;
fft::FftSupport* CreateFft() override;
bool SupportsRng() const override;
rng::RngSupport* CreateRng() override;
bool SupportsDnn() const override;
dnn::DnnSupport* CreateDnn() override;
std::unique_ptr<internal::EventInterface> CreateEventImplementation()
override;
std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
override;
std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
void* GpuContextHack() override;
GpuContext* gpu_context();
private:
// Attempts to find a more specific version of the file indicated by
// filename by looking for compute-capability-specific suffixed versions; e.g.
// looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
// we're on a compute capability 3.0 machine.
// (supported on CUDA only)
bool FindOnDiskForComputeCapability(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const;
// Attempts to find a more specific version of the file indicated by
// filename by looking for AMDGPU ISA-specific suffixed versions.
// (supported on ROCm only)
bool FindOnDiskForISAVersion(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const;
// Host callback landing routine invoked by CUDA.
// data: User-provided callback provided to HostCallback() above, captured
// as a std::function<void()>. Allocated/initialized inside
// HostCallback() and owned and deleted by this call.
static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
void* data);
// Collects metadata for the specified kernel.
bool GetKernelMetadata(GpuKernel* cuda_kernel,
KernelMetadata* kernel_metadata);
// Prints to VLOG(2) information about the kernel's occupancy and how it might
// be improved.
void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
const BlockDim& block_dims);
// (supported on CUDA only)
bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
// (supported on CUDA only)
bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// (supported on ROCm only)
bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
bool UnloadGpuBinary(const void* gpu_binary)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Guards the on-disk-module mapping.
mutex disk_modules_mu_;
// Mapping from filename to GpuModuleHandle, if it was already retrieved.
// Multiple GpuFunctionHandles are usually obtained from a single
// GpuModuleHandle, so we attempt to hit this mapping first before
// retrieving it.
std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;
std::map<const char*, GpuModuleHandle> in_memory_modules_
GUARDED_BY(in_memory_modules_mu_);
// Kernel -> loaded GPU binary. Many kernels may load the same binary.
std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
GUARDED_BY(in_memory_modules_mu_);
// GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
// Guards the launched kernel set.
mutex launched_kernels_mu_;
// Keeps track of the set of launched kernels. Currently used to suppress the
// occupancy check on subsequent launches.
std::set<GpuFunctionHandle> launched_kernels_
GUARDED_BY(launched_kernels_mu_);
// Handle for the CUDA device being operated on. Immutable
// post-initialization.
GpuDeviceHandle device_;
// Handle for session with the library/driver. Immutable post-initialization.
GpuContext* context_;
// The device ordinal value that this executor was initialized with; recorded
// for use in getting device metadata. Immutable post-initialization.
int device_ordinal_;
// The major version of the compute capability for device_.
int cc_major_;
// The minor version of the compute capability for device_.
int cc_minor_;
// GPU ISA version for device_.
int version_;
// The plugin configuration associated with this instance.
PluginConfig plugin_config_;
SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
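
A hedged sketch of the synchronous allocate/copy path declared above (Allocate, SynchronousMemcpy, Deallocate). Production code reaches these overrides through the public StreamExecutor wrapper rather than the interface class directly; the helper name below is hypothetical.

// Hypothetical sketch; assumes a GpuExecutor that has already passed Init().
#include <vector>
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/error.h"
namespace stream_executor {
namespace gpu {
port::Status RoundTripCopy(GpuExecutor* executor) {
  std::vector<float> host(1024, 1.0f);
  const uint64 bytes = host.size() * sizeof(float);
  void* opaque = executor->Allocate(bytes);
  if (opaque == nullptr) {
    return port::Status{port::error::RESOURCE_EXHAUSTED,
                        "device allocation failed"};
  }
  DeviceMemoryBase device(opaque, bytes);
  // Blocking host->device copy followed by a blocking device->host copy.
  port::Status status = executor->SynchronousMemcpy(&device, host.data(), bytes);
  if (status.ok()) {
    status = executor->SynchronousMemcpy(host.data(), device, bytes);
  }
  executor->Deallocate(&device);
  return status;
}
}  // namespace gpu
}  // namespace stream_executor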

View File

@ -0,0 +1,107 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Common helper functions used for dealing with CUDA API datatypes.
//
// These are typically placed here for use by multiple source components (for
// example, BLAS and executor components).
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
#include <stddef.h>
#include <complex>
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
template <typename ElemT>
class DeviceMemory;
namespace gpu {
// Converts a const DeviceMemory reference to its underlying typed pointer in
// CUDA device memory.
template <typename T>
const T* GpuMemory(const DeviceMemory<T>& mem) {
return static_cast<const T*>(mem.opaque());
}
// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
// pointer in CUDA device memory.
template <typename T>
T* GpuMemoryMutable(DeviceMemory<T>* mem) {
return static_cast<T*>(mem->opaque());
}
static_assert(
sizeof(std::complex<float>) == sizeof(GpuComplexType),
"std::complex<float> and GpuComplexType should have the same size");
static_assert(offsetof(GpuComplexType, x) == 0,
"The real part of GpuComplexType should appear first.");
static_assert(
sizeof(std::complex<double>) == sizeof(GpuDoubleComplexType),
"std::complex<double> and GpuDoubleComplexType should have the same "
"size");
static_assert(offsetof(GpuDoubleComplexType, x) == 0,
"The real part of GpuDoubleComplexType should appear first.");
// Type traits to get CUDA complex types from std::complex<>.
template <typename T>
struct GpuComplexT {
typedef T type;
};
template <>
struct GpuComplexT<std::complex<float>> {
typedef GpuComplexType type;
};
template <>
struct GpuComplexT<std::complex<double>> {
typedef GpuDoubleComplexType type;
};
// Converts pointers of std::complex<> to pointers of
// GpuComplexType/GpuDoubleComplexType. No type conversion for non-complex
// types.
template <typename T>
inline const typename GpuComplexT<T>::type* GpuComplex(const T* p) {
return reinterpret_cast<const typename GpuComplexT<T>::type*>(p);
}
template <typename T>
inline typename GpuComplexT<T>::type* GpuComplex(T* p) {
return reinterpret_cast<typename GpuComplexT<T>::type*>(p);
}
// Converts values of std::complex<float/double> to values of
// GpuComplexType/GpuDoubleComplexType.
inline GpuComplexType GpuComplexValue(std::complex<float> val) {
return {val.real(), val.imag()};
}
inline GpuDoubleComplexType GpuComplexValue(std::complex<double> val) {
return {val.real(), val.imag()};
}
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
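
A short illustrative sketch of how these helpers are combined when forwarding complex DeviceMemory to a vendor BLAS-style entry point; vendor_caxpy below is a placeholder, not a real API.

// Hypothetical sketch; vendor_caxpy stands in for a cuBLAS/rocBLAS-style call.
#include <complex>
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
namespace stream_executor {
namespace gpu {
void CallVendorAxpy(int n, std::complex<float> alpha,
                    const DeviceMemory<std::complex<float>>& x,
                    DeviceMemory<std::complex<float>>* y) {
  // Scalars are converted by value; device buffers by pointer reinterpretation.
  GpuComplexType gpu_alpha = GpuComplexValue(alpha);
  const GpuComplexType* gpu_x = GpuComplex(GpuMemory(x));
  GpuComplexType* gpu_y = GpuComplex(GpuMemoryMutable(y));
  // vendor_caxpy(n, &gpu_alpha, gpu_x, /*incx=*/1, gpu_y, /*incy=*/1);
  (void)n;
  (void)gpu_alpha;
  (void)gpu_x;
  (void)gpu_y;
}
}  // namespace gpu
}  // namespace stream_executor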

View File

@ -0,0 +1,105 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
// Wraps a GpuFunctionHandle to implement the platform-independent
// KernelInterface.
class GpuKernel : public internal::KernelInterface {
public:
GpuKernel()
: gpu_function_(nullptr),
arity_(0),
preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
// Note that the function is unloaded when the module is unloaded, and the
// module that the function is contained in is owned by the GpuExecutor.
~GpuKernel() override {}
// As arity cannot be reflected upon using the CUDA API, the arity is
// explicitly set during the GpuExecutor::GetKernel initialization process.
void set_arity(unsigned arity) { arity_ = arity; }
unsigned Arity() const override { return arity_; }
// Returns the GpuFunctionHandle value for passing to the CUDA API.
GpuFunctionHandle AsGpuFunctionHandle() const {
DCHECK(gpu_function_ != nullptr);
return const_cast<GpuFunctionHandle>(gpu_function_);
}
// Returns the slot that the GpuFunctionHandle is stored within for this
// object, for the CUDA API which wants to load into a GpuFunctionHandle*.
GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }
// CUDA supports setting the preferred cache configuration of a
// GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support this
// via the below functions; users can set a preference, and that is applied
// when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The alternative
// would be to load the kernel & set the preference when the user calls the
// setter below; either approach is valid.
// Sets the current kernel cache configuration preference.
void SetPreferredCacheConfig(KernelCacheConfig config) override {
preferred_cache_config_ = config;
}
// Returns the current kernel cache configuration preference.
KernelCacheConfig GetPreferredCacheConfig() const override {
return preferred_cache_config_;
}
// Returns the current kernel cache configuration preference as a
// CUfunc_cache.
GpuFuncCachePreference GetGpuCacheConfig() const;
private:
GpuFunctionHandle gpu_function_; // Wrapped CUDA kernel handle.
unsigned arity_; // Number of formal parameters the kernel takes.
// Preferred (but not required) cache configuration for this kernel.
KernelCacheConfig preferred_cache_config_;
};
// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
return static_cast<const GpuKernel*>(kernel->implementation());
}
// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
return static_cast<GpuKernel*>(kernel->implementation());
}
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
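
An illustrative sketch of the lazy cache-config flow described above: the preference is recorded on the platform-independent KernelBase and only applied to the wrapped handle when GpuExecutor launches the kernel. The helper name is hypothetical.

// Hypothetical sketch; assumes `kernel` was loaded via GpuExecutor::GetKernel().
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/kernel.h"
namespace stream_executor {
namespace gpu {
GpuFuncCachePreference PreferSharedMemory(KernelBase* kernel) {
  // Record the preference; it is applied lazily at launch time, not here.
  kernel->SetPreferredCacheConfig(KernelCacheConfig::kPreferShared);
  // Inside the CUDA/ROCm launch path the implementation is recovered like this
  // and its cache preference handed to the driver API.
  return AsGpuKernel(kernel)->GetGpuCacheConfig();
}
}  // namespace gpu
}  // namespace stream_executor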

View File

@ -0,0 +1,125 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
class Stream;
template <typename ElemT>
class DeviceMemory;
namespace gpu {
// Opaque and unique identifier for the GPU RNG plugin.
extern const PluginId kGpuRandPlugin;
class GpuExecutor;
// GPU-platform implementation of the random number generation support
// interface.
//
// Thread-safe post-initialization.
class GpuRng : public rng::RngSupport {
public:
explicit GpuRng(GpuExecutor* parent);
// Retrieves a gpu rng library generator handle. This is necessary for
// enqueuing random number generation work onto the device.
// TODO(leary) provide a way for users to select the RNG algorithm.
bool Init();
// Releases a gpu rng library generator handle, if one was acquired.
~GpuRng() override;
// See rng::RngSupport for details on the following overrides.
bool DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) override;
bool DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) override;
bool DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<float>>* v) override;
bool DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<double>>* v) override;
bool DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
DeviceMemory<float>* v) override;
bool DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
DeviceMemory<double>* v) override;
bool SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) override;
private:
// Actually performs the work of generating random numbers - the public
// methods are thin wrappers to this interface.
template <typename T>
bool DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v);
template <typename ElemT, typename FuncT>
bool DoPopulateRandGaussianInternal(Stream* stream, ElemT mean, ElemT stddev,
DeviceMemory<ElemT>* v, FuncT func);
// Sets the stream for the internal gpu rng generator.
//
// This is a stateful operation, as the handle can only have one stream set at
// a given time, so it is usually performed right before enqueuing work to do
// with random number generation.
bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
// mutex that guards the gpu rng library handle for this device.
mutex mu_;
// GpuExecutor which instantiated this GpuRng.
// Immutable post-initialization.
GpuExecutor* parent_;
// gpu rng library handle on the device.
GpuRngHandle rng_ GUARDED_BY(mu_);
SE_DISALLOW_COPY_AND_ASSIGN(GpuRng);
};
template <typename T>
string TypeString();
template <>
string TypeString<float>() {
return "float";
}
template <>
string TypeString<double>() {
return "double";
}
template <>
string TypeString<std::complex<float>>() {
return "std::complex<float>";
}
template <>
string TypeString<std::complex<double>>() {
return "std::complex<double>";
}
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
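
An illustrative sketch of the flow above: Init() acquires the library generator handle, SetSeed seeds it, and the population calls bind the stream internally before enqueuing generation. The helper and its seed value are hypothetical.

// Hypothetical sketch; assumes an initialized GpuExecutor, a valid Stream, and
// an allocated DeviceMemory<float> destination buffer.
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
namespace stream_executor {
namespace gpu {
bool FillUniform(GpuExecutor* executor, Stream* stream,
                 DeviceMemory<float>* out) {
  GpuRng rng(executor);
  if (!rng.Init()) {
    return false;  // could not acquire a curand/hiprand generator handle
  }
  uint64 seed = 42;  // illustrative; callers should pick a real seed source
  if (!rng.SetSeed(stream, reinterpret_cast<const uint8*>(&seed),
                   sizeof(seed))) {
    return false;
  }
  return rng.DoPopulateRandUniform(stream, out);
}
}  // namespace gpu
}  // namespace stream_executor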

View File

@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,49 +13,49 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/stream.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
bool CUDAStream::Init() {
if (!CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_)) {
bool GpuStream::Init() {
if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
return false;
}
return CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
CUDADriver::EventFlags::kDisableTiming)
return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
GpuDriver::EventFlags::kDisableTiming)
.ok();
}
void CUDAStream::Destroy() {
void GpuStream::Destroy() {
if (completed_event_ != nullptr) {
port::Status status =
CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
GpuDriver::DestroyEvent(parent_->gpu_context(), &completed_event_);
if (!status.ok()) {
LOG(ERROR) << status.error_message();
}
}
CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
GpuDriver::DestroyStream(parent_->gpu_context(), &gpu_stream_);
}
bool CUDAStream::IsIdle() const {
return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
bool GpuStream::IsIdle() const {
return GpuDriver::IsStreamIdle(parent_->gpu_context(), gpu_stream_);
}
CUDAStream *AsCUDAStream(Stream *stream) {
GpuStream* AsGpuStream(Stream* stream) {
DCHECK(stream != nullptr);
return static_cast<CUDAStream *>(stream->implementation());
return static_cast<GpuStream*>(stream->implementation());
}
CUstream AsCUDAStreamValue(Stream *stream) {
GpuStreamHandle AsGpuStreamValue(Stream* stream) {
DCHECK(stream != nullptr);
return AsCUDAStream(stream)->cuda_stream();
return AsGpuStream(stream)->gpu_stream();
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,96 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the GpuStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
class GpuExecutor;
// Wraps a GpuStreamHandle in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class GpuStream : public internal::StreamInterface {
public:
explicit GpuStream(GpuExecutor* parent)
: parent_(parent), gpu_stream_(nullptr), completed_event_(nullptr) {}
// Note: teardown is handled by a parent's call to DeallocateStream.
~GpuStream() override {}
void* GpuStreamHack() override { return gpu_stream_; }
void** GpuStreamMemberHack() override {
return reinterpret_cast<void**>(&gpu_stream_);
}
// Explicitly initialize the CUDA resources associated with this stream, used
// by StreamExecutor::AllocateStream().
bool Init();
// Explicitly destroy the CUDA resources associated with this stream, used by
// StreamExecutor::DeallocateStream().
void Destroy();
// Returns true if no work is pending or executing on the stream.
bool IsIdle() const;
// Retrieves an event which indicates that all work enqueued into the stream
// has completed. Ownership of the event is not transferred to the caller, the
// event is owned by this stream.
GpuEventHandle* completed_event() { return &completed_event_; }
// Returns the GpuStreamHandle value for passing to the CUDA API.
//
// Precond: this GpuStream has been allocated (otherwise passing a nullptr
// into the NVIDIA library causes difficult-to-understand faults).
GpuStreamHandle gpu_stream() const {
DCHECK(gpu_stream_ != nullptr);
return const_cast<GpuStreamHandle>(gpu_stream_);
}
// TODO(timshen): Migrate away and remove this function.
GpuStreamHandle cuda_stream() const { return gpu_stream(); }
GpuExecutor* parent() const { return parent_; }
private:
GpuExecutor* parent_; // Executor that spawned this stream.
GpuStreamHandle gpu_stream_; // Wrapped CUDA stream handle.
// Event that indicates this stream has completed.
GpuEventHandle completed_event_ = nullptr;
};
// Helper functions to simplify extremely common flows.
// Converts a Stream to the underlying GpuStream implementation.
GpuStream* AsGpuStream(Stream* stream);
// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
GpuStreamHandle AsGpuStreamValue(Stream* stream);
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
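
An illustrative sketch of the helper pair declared above: a platform-independent Stream is narrowed with AsGpuStream, and AsGpuStreamValue yields the native handle expected by driver or vendor-library calls. The callee is a placeholder.

// Hypothetical sketch; enqueue_vendor_work stands in for a library call that
// takes the raw GpuStreamHandle (e.g. a BLAS SetStream-style entry point).
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/stream.h"
namespace stream_executor {
namespace gpu {
bool EnqueueVendorWork(Stream* stream) {
  if (!AsGpuStream(stream)->IsIdle()) {
    VLOG(1) << "enqueueing behind work already pending on this stream";
  }
  // Precondition: the stream has been allocated; see gpu_stream() above.
  GpuStreamHandle handle = AsGpuStreamValue(stream);
  // return enqueue_vendor_work(handle) == 0;  // placeholder vendor call
  (void)handle;
  return true;
}
}  // namespace gpu
}  // namespace stream_executor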

View File

@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,31 +13,31 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/status.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
bool CUDATimer::Init() {
bool GpuTimer::Init() {
CHECK(start_event_ == nullptr && stop_event_ == nullptr);
CudaContext* context = parent_->cuda_context();
port::Status status = CUDADriver::CreateEvent(
context, &start_event_, CUDADriver::EventFlags::kDefault);
GpuContext* context = parent_->gpu_context();
port::Status status = GpuDriver::CreateEvent(context, &start_event_,
GpuDriver::EventFlags::kDefault);
if (!status.ok()) {
LOG(ERROR) << status;
return false;
}
status = CUDADriver::CreateEvent(context, &stop_event_,
CUDADriver::EventFlags::kDefault);
status = GpuDriver::CreateEvent(context, &stop_event_,
GpuDriver::EventFlags::kDefault);
if (!status.ok()) {
LOG(ERROR) << status;
status = CUDADriver::DestroyEvent(context, &start_event_);
status = GpuDriver::DestroyEvent(context, &start_event_);
if (!status.ok()) {
LOG(ERROR) << status;
}
@ -48,47 +48,46 @@ bool CUDATimer::Init() {
return true;
}
void CUDATimer::Destroy() {
CudaContext* context = parent_->cuda_context();
port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
void GpuTimer::Destroy() {
GpuContext* context = parent_->gpu_context();
port::Status status = GpuDriver::DestroyEvent(context, &start_event_);
if (!status.ok()) {
LOG(ERROR) << status;
}
status = CUDADriver::DestroyEvent(context, &stop_event_);
status = GpuDriver::DestroyEvent(context, &stop_event_);
if (!status.ok()) {
LOG(ERROR) << status;
}
}
float CUDATimer::GetElapsedMilliseconds() const {
float GpuTimer::GetElapsedMilliseconds() const {
CHECK(start_event_ != nullptr && stop_event_ != nullptr);
// TODO(leary) provide a way to query timer resolution?
// CUDA docs say a resolution of about 0.5us
float elapsed_milliseconds = NAN;
(void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
&elapsed_milliseconds, start_event_,
stop_event_);
(void)GpuDriver::GetEventElapsedTime(
parent_->gpu_context(), &elapsed_milliseconds, start_event_, stop_event_);
return elapsed_milliseconds;
}
bool CUDATimer::Start(CUDAStream* stream) {
port::Status status = CUDADriver::RecordEvent(
parent_->cuda_context(), start_event_, stream->cuda_stream());
bool GpuTimer::Start(GpuStream* stream) {
port::Status status = GpuDriver::RecordEvent(
parent_->gpu_context(), start_event_, stream->gpu_stream());
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDATimer::Stop(CUDAStream* stream) {
port::Status status = CUDADriver::RecordEvent(
parent_->cuda_context(), stop_event_, stream->cuda_stream());
bool GpuTimer::Stop(GpuStream* stream) {
port::Status status = GpuDriver::RecordEvent(
parent_->gpu_context(), stop_event_, stream->gpu_stream());
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the GpuTimer type - the CUDA-specific implementation of the generic
// StreamExecutor Timer interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
class GpuExecutor;
class GpuStream;
// Wraps a pair of GpuEventHandles in order to satisfy the platform-independent
// TimerInterface -- both a start and a stop event are present which may be
// recorded in a stream.
class GpuTimer : public internal::TimerInterface {
public:
explicit GpuTimer(GpuExecutor* parent)
: parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
// Note: teardown needs to be explicitly handled in this API by a call to
// StreamExecutor::DeallocateTimer(), which invokes Destroy().
// TODO(csigg): Change to RAII.
~GpuTimer() override {}
// Allocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::AllocateTimer().
bool Init();
// Deallocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::DeallocateTimer().
void Destroy();
// Records the "timer start" event at the current point in the stream.
bool Start(GpuStream* stream);
// Records the "timer stop" event at the current point in the stream.
bool Stop(GpuStream* stream);
// Returns the elapsed time, in milliseconds, between the start and stop
// events.
float GetElapsedMilliseconds() const;
// See Timer::Microseconds().
// TODO(leary) make this into an error code interface...
uint64 Microseconds() const override {
return GetElapsedMilliseconds() * 1e3;
}
// See Timer::Nanoseconds().
uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
private:
GpuExecutor* parent_;
GpuEventHandle start_event_; // Event recorded to indicate the "start"
// timestamp executing in a stream.
GpuEventHandle stop_event_; // Event recorded to indicate the "stop"
// timestamp executing in a stream.
};
struct GpuTimerDeleter {
void operator()(GpuTimer* t) {
t->Destroy();
delete t;
}
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
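
An illustrative sketch of GpuTimerDeleter in use: the timer is created and initialized explicitly, Start/Stop bracket the enqueued work, and Destroy() runs through the deleter. It assumes the caller synchronizes the stream before reading the elapsed time; the helper is hypothetical.

// Hypothetical sketch; assumes an initialized GpuExecutor and GpuStream, and
// that the caller blocks on the stream before trusting the returned value.
#include <memory>
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
namespace stream_executor {
namespace gpu {
float TimeEnqueuedWork(GpuExecutor* executor, GpuStream* stream) {
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer(new GpuTimer(executor));
  if (!timer->Init() || !timer->Start(stream)) {
    return -1.0f;
  }
  // ... enqueue the work to be measured on `stream` here ...
  if (!timer->Stop(stream)) {
    return -1.0f;
  }
  return timer->GetElapsedMilliseconds();  // meaningful once the stop event completes
}
}  // namespace gpu
}  // namespace stream_executor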

View File

@ -0,0 +1,84 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// GPU (ROCm / CUDA) specific type handle resolution
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
#if TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_complex.h"
#include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h"
#else // CUDA
#include "cuda/include/cuComplex.h"
#include "cuda/include/cuda.h"
// cannot include curand.h here
// because it triggers the #error in cuda/cuda_gpu_executor.cc
// (because curand.h includes cuda_runtime.h)
// so explicitly adding the lone typedef we need from that file
typedef struct curandGenerator_st* curandGenerator_t;
#endif
namespace stream_executor {
namespace gpu {
#if TENSORFLOW_USE_ROCM
using GpuStreamHandle = hipStream_t;
using GpuEventHandle = hipEvent_t;
using GpuFunctionHandle = hipFunction_t;
using GpuFunctionAttribute = hipDeviceAttribute_t; // not a typo!
using GpuDeviceHandle = hipDevice_t;
using GpuDevicePtr = hipDeviceptr_t;
using GpuDeviceAttribute = hipDeviceAttribute_t;
using GpuDeviceProperty = hipDeviceProp_t;
using GpuModuleHandle = hipModule_t;
using GpuStatus = hipError_t;
using GpuFuncCachePreference = hipFuncCache_t;
using GpuSharedMemConfig = hipSharedMemConfig;
using GpuComplexType = hipComplex;
using GpuDoubleComplexType = hipDoubleComplex;
using GpuRngHandle = hiprandGenerator_t;
#else // CUDA
using GpuStreamHandle = CUstream;
using GpuEventHandle = CUevent;
using GpuFunctionHandle = CUfunction;
using GpuFunctionAttribute = CUfunction_attribute;
using GpuDeviceHandle = CUdevice;
using GpuDevicePtr = CUdeviceptr;
using GpuDeviceAttribute = CUdevice_attribute;
using GpuDeviceProperty = CUdevprop;
using GpuModuleHandle = CUmodule;
using GpuStatus = CUresult;
using GpuFuncCachePreference = CUfunc_cache;
using GpuSharedMemConfig = CUsharedconfig;
using GpuComplexType = cuComplex;
using GpuDoubleComplexType = cuDoubleComplex;
using GpuRngHandle = curandGenerator_t;
#endif
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
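
The aliases above are what let the shared gpu/ sources compile unchanged for either backend; a small illustrative helper (not in this change) showing the pattern, where only the literal status value differs per platform.

// Hypothetical sketch: code written against the Gpu* aliases stays
// platform-neutral, with backend divergence confined to #if blocks like the
// ones in this header.
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
namespace gpu {
inline bool IsGpuSuccess(GpuStatus status) {
#if TENSORFLOW_USE_ROCM
  return status == hipSuccess;    // GpuStatus is hipError_t
#else
  return status == CUDA_SUCCESS;  // GpuStatus is CUresult
#endif
}
}  // namespace gpu
}  // namespace stream_executor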

View File

@ -28,6 +28,8 @@ string PlatformKindString(PlatformKind kind) {
switch (kind) {
case PlatformKind::kCuda:
return "CUDA";
case PlatformKind::kROCm:
return "ROCm";
case PlatformKind::kOpenCL:
return "OpenCL";
case PlatformKind::kHost:
@ -52,6 +54,7 @@ PlatformKind PlatformKindFromString(string kind) {
bool PlatformIsRunnable(PlatformKind kind) {
switch (kind) {
case PlatformKind::kCuda:
case PlatformKind::kROCm:
case PlatformKind::kOpenCL:
case PlatformKind::kHost:
return true;
@ -63,6 +66,7 @@ bool PlatformIsRunnable(PlatformKind kind) {
bool PlatformIsRunnableOnDevice(PlatformKind kind) {
switch (kind) {
case PlatformKind::kCuda:
case PlatformKind::kROCm:
case PlatformKind::kOpenCL:
return true;
default:

View File

@ -40,6 +40,7 @@ class StreamExecutor;
enum class PlatformKind {
kInvalid,
kCuda,
kROCm,
kOpenCL,
kHost,
kMock,

View File

@ -0,0 +1,267 @@
# Description:
# ROCm-platform specific StreamExecutor support code.
licenses(["notice"]) # Apache 2.0
load("//tensorflow:tensorflow.bzl", "tf_cc_test")
load(
"//tensorflow/stream_executor:build_defs.bzl",
"stream_executor_friends",
)
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
package_group(
name = "friends",
packages = stream_executor_friends(),
)
package(
default_visibility = [":friends"],
)
# Filegroup used to collect source files for the dependency check.
filegroup(
name = "c_srcs",
data = glob([
"**/*.cc",
"**/*.h",
]),
)
cc_library(
name = "rocm_diagnostics",
srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
)
cc_library(
name = "rocm_driver",
srcs = if_rocm_is_configured(["rocm_driver.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
":rocm_diagnostics",
"@com_google_absl//absl/base",
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor:device_options",
"//tensorflow/stream_executor/gpu:gpu_driver_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"@local_config_rocm//rocm:rocm_headers",
]),
)
cc_library(
name = "rocm_event",
srcs = if_rocm_is_configured(["rocm_event.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
":rocm_driver",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/gpu:gpu_event_header",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"//tensorflow/stream_executor/gpu:gpu_stream_header",
"//tensorflow/stream_executor/lib",
]),
)
cc_library(
name = "rocm_gpu_executor",
srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
":rocm_diagnostics",
":rocm_driver",
":rocm_event",
":rocm_kernel",
":rocm_platform_id",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor:timer",
"//tensorflow/stream_executor/gpu:gpu_activation_header",
"//tensorflow/stream_executor/gpu:gpu_event",
"//tensorflow/stream_executor/gpu:gpu_kernel_header",
"//tensorflow/stream_executor/gpu:gpu_stream",
"//tensorflow/stream_executor/gpu:gpu_timer",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
]),
alwayslink = True,
)
cc_library(
name = "rocm_kernel",
srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
hdrs = [],
visibility = ["//visibility:public"],
deps = if_rocm_is_configured([
"//tensorflow/stream_executor/gpu:gpu_kernel_header",
]),
alwayslink = True,
)
cc_library(
name = "rocm_platform",
srcs = if_rocm_is_configured(["rocm_platform.cc"]),
hdrs = if_rocm_is_configured(["rocm_platform.h"]),
visibility = ["//visibility:public"],
deps = if_rocm_is_configured([
":rocm_driver",
":rocm_gpu_executor",
":rocm_platform_id",
"//tensorflow/stream_executor", # buildcleaner: keep
"//tensorflow/stream_executor:executor_cache",
"//tensorflow/stream_executor:multi_platform_manager",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
alwayslink = True, # Registers itself with the MultiPlatformManager.
)
cc_library(
name = "rocm_platform_id",
srcs = ["rocm_platform_id.cc"],
hdrs = ["rocm_platform_id.h"],
deps = ["//tensorflow/stream_executor:platform"],
)
# FIXME: enable in future PRs
#cc_library(
# name = "rocblas_plugin",
# srcs = ["rocm_blas.cc"],
# hdrs = ["rocm_blas.h"],
# visibility = ["//visibility:public"],
# deps = [
# ":rocm_gpu_executor",
# ":rocm_platform_id",
# "//third_party/eigen3",
# "//tensorflow/core:lib_internal",
# "//tensorflow/stream_executor",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:host_or_device_scalar",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:scratch_allocator",
# "//tensorflow/stream_executor:timer",
# "//tenosrflow/stream_executor/gpu:gpu_activation_header",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tenosrflow/stream_executor/gpu:gpu_timer_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# "@com_google_absl//absl/strings",
# "@local_config_rocm//rocm:rocm_headers",
# ] + if_static(["@local_config_rocm//rocm:rocblas"]),
# alwayslink = True,
#)
# FIXME: enable in future PRs
#cc_library(
# name = "rocfft_plugin",
# srcs = ["rocm_fft.cc"],
# hdrs = [],
# visibility = ["//visibility:public"],
# deps = [
# ":rocm_platform_id",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:fft",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:scratch_allocator",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# "@local_config_rocm//rocm:rocm_headers",
# ] + if_static(["@local_config_rocm//rocm:rocfft"]),
# alwayslink = True,
#)
# FIXME: enable in future PRs
#cc_library(
# name = "miopen_plugin",
# srcs = ["rocm_dnn.cc"],
# hdrs = [],
# copts = [
# # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
# # setting of template depth 256
# "-ftemplate-depth-512",
# ],
# visibility = ["//visibility:public"],
# deps = [
# ":rocm_diagnostics",
# ":rocm_driver",
# ":rocm_gpu_executor",
# ":rocm_platform_id",
# "//third_party/eigen3",
# "//tensorflow/core:lib",
# "//tensorflow/core:lib_internal",
# "//tensorflow/core:logger",
# "//tensorflow/stream_executor:dnn",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:logging_proto_cc",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:scratch_allocator",
# "//tensorflow/stream_executor:stream_executor_pimpl_header",
# "//tensorflow/stream_executor:temporary_device_memory",
# "//tenosrflow/stream_executor/gpu:gpu_activation_header",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tenosrflow/stream_executor/gpu:gpu_timer_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# "@com_google_absl//absl/strings",
# "@local_config_rocm//rocm:rocm_headers",
# ] + tf_additional_miopen_plugin_deps() + if_static(["@local_config_rocm//rocm:miopen"]),
# alwayslink = True,
#)
# FIXME: enable in future PRs
#cc_library(
# name = "rocrand_plugin",
# srcs = ["rocm_rng.cc"],
# hdrs = [],
# deps = [
# ":rocm_gpu_executor",
# ":rocm_platform_id",
# "@local_config_rocm//rocm:rocm_headers",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:rng",
# "//tenosrflow/stream_executor/gpu:gpu_activation_header",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# ] + if_static(["@local_config_rocm//rocm:curand"]),
# alwayslink = True,
#)
cc_library(
name = "all_runtime",
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = if_rocm_is_configured([
# FIXME: enable in future PRs
#":miopen_plugin",
#":rocfft_plugin",
#":rocblas_plugin",
#":rocrand_plugin",
":rocm_driver",
":rocm_platform",
]),
alwayslink = 1,
)
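
Since :rocm_platform is alwayslink and registers itself with the MultiPlatformManager, a binary that depends on it (or on :all_runtime) can look the platform up at run time. A hedged C++ sketch; the platform name string "ROCM" is an assumption here, not taken from this change.

// Hypothetical sketch; assumes the binary links :rocm_platform so that static
// registration with MultiPlatformManager has run, and that the platform is
// registered under the name "ROCM".
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
stream_executor::port::StatusOr<stream_executor::StreamExecutor*>
GetRocmExecutor(int device_ordinal) {
  auto platform = stream_executor::MultiPlatformManager::PlatformWithName("ROCM");
  if (!platform.ok()) {
    return platform.status();
  }
  return platform.ValueOrDie()->ExecutorForDevice(device_ordinal);
}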

View File

@ -0,0 +1,234 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <dirent.h>
#include <limits.h>
#include <link.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysmacros.h>
#include <unistd.h>
#include <algorithm>
#include <memory>
#include <vector>
#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform/logging.h"
namespace stream_executor {
namespace gpu {
string DriverVersionToString(DriverVersion version) {
return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
std::get<2>(version));
}
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
if (!version.ok()) {
return version.status().ToString();
}
return DriverVersionToString(version.ValueOrDie());
}
port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
std::vector<string> pieces = port::Split(value, '.');
if (pieces.size() != 2 && pieces.size() != 3) {
return port::Status{port::error::INVALID_ARGUMENT,
absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
"for driver version; got \"%s\"",
value.c_str())};
}
int major;
int minor;
int patch = 0;
if (!port::safe_strto32(pieces[0], &major)) {
return port::Status{
port::error::INVALID_ARGUMENT,
absl::StrFormat("could not parse major version number \"%s\" as an "
"integer from string \"%s\"",
pieces[0].c_str(), value.c_str())};
}
if (!port::safe_strto32(pieces[1], &minor)) {
return port::Status{
port::error::INVALID_ARGUMENT,
absl::StrFormat("could not parse minor version number \"%s\" as an "
"integer from string \"%s\"",
pieces[1].c_str(), value.c_str())};
}
if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
return port::Status{
port::error::INVALID_ARGUMENT,
absl::StrFormat("could not parse patch version number \"%s\" as an "
"integer from string \"%s\"",
pieces[2].c_str(), value.c_str())};
}
DriverVersion result{major, minor, patch};
VLOG(2) << "version string \"" << value << "\" made value "
<< DriverVersionToString(result);
return result;
}
// -- class Diagnostician
string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
return absl::StrCat("/dev/kfd", dev_node_ordinal);
}
void Diagnostician::LogDiagnosticInformation() {
LOG(INFO) << "retrieving ROCM diagnostic information for host: "
<< port::Hostname();
LogDriverVersionInformation();
}
/* static */ void Diagnostician::LogDriverVersionInformation() {
LOG(INFO) << "hostname: " << port::Hostname();
if (VLOG_IS_ON(1)) {
const char* value = getenv("LD_LIBRARY_PATH");
string library_path = value == nullptr ? "" : value;
VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
std::vector<string> pieces = port::Split(library_path, ':');
for (const auto& piece : pieces) {
if (piece.empty()) {
continue;
}
DIR* dir = opendir(piece.c_str());
if (dir == nullptr) {
VLOG(1) << "could not open \"" << piece << "\"";
continue;
}
while (dirent* entity = readdir(dir)) {
VLOG(1) << piece << " :: " << entity->d_name;
}
closedir(dir);
}
}
port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
LOG(INFO) << "librocm reported version is: "
<< DriverVersionStatusToString(dso_version);
port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
LOG(INFO) << "kernel reported version is: "
<< DriverVersionStatusToString(kernel_version);
if (kernel_version.ok() && dso_version.ok()) {
WarnOnDsoKernelMismatch(dso_version, kernel_version);
}
}
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a DriverVersion.
port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
port::StatusOr<DriverVersion> result{port::Status{
port::error::NOT_FOUND,
"was unable to find librocm.so DSO loaded into this program"}};
// Callback used when iterating through DSOs. Looks for the driver-interfacing
// DSO and yields its version number into the callback data, when found.
auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
void* data) -> int {
if (strstr(info->dlpi_name, "librocm.so.1")) {
VLOG(1) << "found DLL info with name: " << info->dlpi_name;
char resolved_path[PATH_MAX] = {0};
if (realpath(info->dlpi_name, resolved_path) == nullptr) {
return 0;
}
VLOG(1) << "found DLL info with resolved path: " << resolved_path;
const char* slash = rindex(resolved_path, '/');
if (slash == nullptr) {
return 0;
}
const char* so_suffix = ".so.";
const char* dot = strstr(slash, so_suffix);
if (dot == nullptr) {
return 0;
}
string dso_version = dot + strlen(so_suffix);
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
*result = StringToDriverVersion(stripped_dso_version);
return 1;
}
return 0;
};
dl_iterate_phdr(iterate_phdr, &result);
return result;
}
port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
const string& driver_version_file_contents) {
static const char* kDriverFilePrelude = "Kernel Module ";
size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
if (offset == string::npos) {
return port::Status{
port::error::NOT_FOUND,
absl::StrCat("could not find kernel module information in "
"driver version file contents: \"",
driver_version_file_contents, "\"")};
}
string version_and_rest = driver_version_file_contents.substr(
offset + strlen(kDriverFilePrelude), string::npos);
size_t space_index = version_and_rest.find(" ");
auto kernel_version = version_and_rest.substr(0, space_index);
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_kernel_version =
port::StripSuffixString(kernel_version, ".ld64");
return StringToDriverVersion(stripped_kernel_version);
}
void Diagnostician::WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> dso_version,
port::StatusOr<DriverVersion> kernel_version) {
if (kernel_version.ok() && dso_version.ok() &&
dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
LOG(INFO) << "kernel version seems to match DSO: "
<< DriverVersionToString(kernel_version.ValueOrDie());
} else {
LOG(ERROR) << "kernel version "
<< DriverVersionStatusToString(kernel_version)
<< " does not match DSO version "
<< DriverVersionStatusToString(dso_version)
<< " -- cannot find working devices in this configuration";
}
}
port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
auto status = port::Status{port::error::UNIMPLEMENTED,
"kernel reported driver version not implemented"};
return status;
}
} // namespace gpu
} // namespace stream_executor
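
An illustrative round trip through the parsing helpers above; the version string is made up.

// Hypothetical sketch of the version helpers defined in this file.
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/platform/logging.h"
void LogParsedDriverVersion() {
  namespace gpu = stream_executor::gpu;
  // Accepts "major.minor" or "major.minor.patch"; a missing patch defaults to 0.
  auto version = gpu::StringToDriverVersion("2.6.0");
  if (version.ok()) {
    LOG(INFO) << "parsed: " << gpu::DriverVersionToString(version.ValueOrDie());
  } else {
    LOG(ERROR) << version.status();
  }
}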

File diff suppressed because it is too large

View File

@ -0,0 +1,46 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/statusor.h"
namespace stream_executor {
namespace gpu {
Event::Status GpuEvent::PollForStatus() {
port::StatusOr<hipError_t> status =
GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
if (!status.ok()) {
LOG(ERROR) << "Error polling for event status: "
<< status.status().error_message();
return Event::Status::kError;
}
switch (status.ValueOrDie()) {
case hipSuccess:
return Event::Status::kComplete;
case hipErrorNotReady:
return Event::Status::kPending;
default:
LOG(INFO) << "Error condition returned for event status: "
<< status.ValueOrDie();
return Event::Status::kError;
}
}
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,976 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <unistd.h>
#include "absl/base/casts.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"
#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
#error \
"No driver calls in this file, wrap driver functionality in rocm_driver.cc."
#endif
#ifdef __ROCM_RUNTIME_H__
#error \
"ROCM runtime being included into ROCM GPU executor; should be driver only."
#endif
namespace stream_executor {
namespace gpu {
static GpuEvent* AsGpuEvent(Event* event) {
DCHECK(event != nullptr);
return static_cast<GpuEvent*>(event->implementation());
}
// Given a platform-independent timer datatype, returns the internal ROCM
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
DCHECK(timer != nullptr);
return static_cast<GpuTimer*>(timer->implementation());
}
// Given const GPU memory, returns a librocm device pointer datatype, suitable
// for passing directly to librocm APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// librocm APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to librocm functions which will honor constness.
static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
}
// See description on const version above.
static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
return AsROCmDevicePtr(*gpu_mem);
}
static GpuContext* GetGpuContext(Stream* stream) {
return static_cast<GpuExecutor*>(stream->parent()->implementation())
->gpu_context();
}
GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
CHECK(rocm_exec != nullptr);
return rocm_exec->gpu_context();
}
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
return static_cast<GpuExecutor*>(stream_exec->implementation());
}
GpuExecutor::~GpuExecutor() {
for (auto& it : disk_modules_) {
GpuDriver::UnloadModule(context_, it.second);
}
for (auto& it : in_memory_modules_) {
GpuDriver::UnloadModule(context_, it.second);
}
if (context_ != nullptr) {
GpuDriver::DestroyContext(context_);
}
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
mutex_lock lock{in_memory_modules_mu_};
return UnloadGpuBinary(gpu_binary);
}
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
auto module_it = gpu_binary_to_module_.find(gpu_binary);
if (gpu_binary_to_module_.end() == module_it) {
VLOG(3) << "No loaded HSACO module for " << gpu_binary;
return false;
}
auto& module = module_it->second.first;
auto& refcount = module_it->second.second;
VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
if (--refcount == 0) {
VLOG(3) << "Unloading HSACO module " << module;
GpuDriver::UnloadModule(context_, module);
gpu_binary_to_module_.erase(module_it);
}
return true;
}
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
LOG(FATAL) << "Feature not supported on ROCM platform (UnloadKernel)";
}
port::Status GpuExecutor::Init(int device_ordinal,
DeviceOptions device_options) {
device_ordinal_ = device_ordinal;
auto status = GpuDriver::Init();
if (!status.ok()) {
return status;
}
status = GpuDriver::GetDevice(device_ordinal_, &device_);
if (!status.ok()) {
return status;
}
status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
&context_);
if (!status.ok()) {
return status;
}
return GpuDriver::GetGpuISAVersion(&version_, device_);
}
bool GpuExecutor::FindOnDiskForComputeCapability(
absl::string_view filename, absl::string_view canonical_suffix,
string* found_filename) const {
LOG(FATAL) << "Feature not supported on ROCM platform "
"(FindOnDiskForComputeCapability)";
return false;
}
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const {
if (version_ == 0) {
return false;
}
string cc_specific =
absl::StrCat(filename, ".cc", version_, canonical_suffix);
if (port::FileExists(cc_specific).ok()) {
VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
<< cc_specific;
*found_filename = cc_specific;
return true;
}
VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
<< cc_specific;
if (port::FileExists(string(filename)).ok()) {
*found_filename = string(filename);
return true;
}
return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
// returned string. Example: calling this from /usr/bin/foo
// would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
char exe_path[PATH_MAX] = {0};
CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
// Make sure it's null-terminated:
exe_path[sizeof(exe_path) - 1] = 0;
if (strip_exe) {
// The exe is the last component of the path, so remove one component.
string ret = exe_path;
std::vector<string> components = port::Split(exe_path, '/');
components.pop_back();
return port::Join(components, "/");
}
return exe_path;
}
bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
KernelBase* kernel) {
GpuKernel* rocm_kernel = AsGpuKernel(kernel);
hipModule_t module = nullptr;
const string* kernelname;
const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
bool has_cubin = spec.has_cuda_cubin_on_disk();
if (has_cubin) {
on_disk_spec = &spec.cuda_cubin_on_disk();
}
if (on_disk_spec != nullptr) {
LOG(WARNING) << "loading ROCM kernel from disk is not supported";
return false;
} else if (spec.has_cuda_cubin_in_memory()) {
kernelname = &spec.cuda_cubin_in_memory().kernelname();
const char* hsaco = spec.cuda_cubin_in_memory().bytes();
mutex_lock lock{in_memory_modules_mu_};
module = in_memory_modules_[hsaco];
if (module == nullptr) {
if (!GpuDriver::LoadHsaco(context_, hsaco, &module)) {
LOG(ERROR) << "failed to load HSACO\n";
return false;
}
in_memory_modules_[hsaco] = module;
}
} else {
LOG(WARNING) << "no method of loading ROCM kernel provided";
return false;
}
VLOG(2) << "getting function " << *kernelname << " from module " << module;
if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
rocm_kernel->gpu_function_ptr())) {
return false;
}
// We have to trust the kernel loader spec arity because there doesn't appear
// to be a way to reflect on the number of expected arguments w/the ROCM API.
rocm_kernel->set_arity(spec.arity());
KernelMetadata kernel_metadata;
if (!GetKernelMetadata(rocm_kernel, &kernel_metadata)) {
LOG(WARNING) << "Unable to get metadata for kernel " << kernelname;
}
kernel->set_metadata(kernel_metadata);
kernel->set_name(*kernelname);
return true;
}
bool GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
KernelMetadata* kernel_metadata) {
int value = 0;
// TODO(ROCm) implement this feature in HIP
kernel_metadata->set_registers_per_thread(value);
// TODO(ROCm) implement this feature in HIP
kernel_metadata->set_shared_memory_bytes(value);
return true;
}
bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
const BlockDim& block_dims, const KernelBase& kernel,
const KernelArgsArrayBase& args) {
CHECK_EQ(kernel.Arity(), args.number_of_arguments());
GpuStreamHandle hipstream = AsGpuStreamValue(stream);
const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();
// Only perform/print the occupancy check once. Even just checking to see
// whether we've done an occupancy check on this kernel before isn't free
// (because we have to synchronize), so we only do this at -v 2+.
if (VLOG_IS_ON(2)) {
mutex_lock lock(launched_kernels_mu_);
if (!launched_kernels_.count(hipfunc)) {
VlogOccupancyInfo(kernel, thread_dims, block_dims);
// TODO(rspringer): Remove elements from launched_kernels_...if we ever
// expose a kernel/module deallocation method.
launched_kernels_.insert(hipfunc);
}
}
if (rocm_kernel->GetPreferredCacheConfig() !=
KernelCacheConfig::kNoPreference) {
GpuDriver::FuncSetCacheConfig(hipfunc, rocm_kernel->GetGpuCacheConfig());
}
  // Prepare kernargs.
  // KernelArgsArrayBase keeps pointers to the arguments; dereference them here.
std::vector<void*> kernargs;
KernelArgIterator iter = args.arg_iterator();
while (iter.has_next()) {
KernelArg arg = iter.next();
VLOG(2) << "*(arg.address): "
<< reinterpret_cast<void*>(
*static_cast<const uint64_t*>(arg.address));
kernargs.push_back(
reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
}
size_t size = sizeof(void*) * kernargs.size();
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
if (!GpuDriver::LaunchKernel(
GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y,
block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config)) {
LOG(ERROR) << "failed to launch ROCM kernel with args: "
<< args.number_of_arguments()
<< "; thread dim: " << thread_dims.ToString()
<< "; block dim: " << block_dims.ToString();
return false;
}
return true;
}
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
GpuFunctionHandle func) {
LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
return 0;
}
int GpuExecutor::CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
GpuFunctionHandle func) {
LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
return 0;
}
bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
ModuleHandle* module_handle) {
// In GpuExecutor we store the pointer to the HSACO binary as
// ModuleHandle::id().
hipModule_t hip_module = nullptr;
// TODO(ROCm): Need generic term instead of cubin/cuda/ptx
if (spec.has_cuda_cubin_in_memory()) {
mutex_lock lock{in_memory_modules_mu_};
if (!LoadModuleFromHsaco(
reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
&hip_module)) {
return false;
}
*module_handle = ModuleHandle(const_cast<void*>(
static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
return true;
} else {
LOG(ERROR) << "No HSACO binary found \n";
return false;
}
}
bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, hipModule_t* module) {
LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
return false;
}
bool GpuExecutor::LoadModuleFromPtx(const char* ptx, hipModule_t* module) {
LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
return false;
}
bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, hipModule_t* module) {
uint64_t module_refcount;
std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];
if (*module == nullptr) {
if (!GpuDriver::LoadHsaco(context_, hsaco, module)) {
LOG(ERROR) << "failed to load : HSACO \n";
return false;
}
module_refcount = 1;
VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
<< " as module " << *module;
} else {
++module_refcount;
VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
<< " is already loaded as module " << *module;
}
gpu_binary_to_module_[hsaco] = {*module, module_refcount};
return true;
}
// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
const ThreadDim& thread_dims,
const BlockDim& block_dims) {
// TODO(ROCm) implement this feature in HIP
}
void* GpuExecutor::Allocate(uint64 size) {
return GpuDriver::DeviceAllocate(context_, size);
}
void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
uint64 size_bytes) {
// offset and size are in bytes, so char* works as the pointer type.
return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
// ROCM "sub-buffers" are just pointer + offset, so no dealloc is necessary.
if (!mem->is_sub_buffer()) {
GpuDriver::DeviceDeallocate(context_, mem->opaque());
}
}
bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
if (location == nullptr || size == 0) {
LOG(WARNING) << "attempting to register null or zero-sized memory: "
<< location << "; size " << size;
}
VLOG(2) << "registering " << location << " size " << size;
return GpuDriver::HostRegister(context_, location, size);
}
bool GpuExecutor::HostMemoryUnregister(void* location) {
VLOG(2) << "unregistering " << location;
return GpuDriver::HostUnregister(context_, location);
}
bool GpuExecutor::SynchronizeAllActivity() {
return GpuDriver::SynchronizeContext(context_);
}
bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return GpuDriver::SynchronousMemsetUint32(
context_, AsROCmDevicePtr(location), 0x0, size / 4);
}
return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
0x0, size);
}
bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
// hipMemset reinterprets "value" as a uint8.
uint8 byte_value = static_cast<uint8>(value);
uint32 pattern = (byte_value << 24) | (byte_value << 16) |
(byte_value << 8) | byte_value;
return GpuDriver::SynchronousMemsetUint32(
context_, AsROCmDevicePtr(location), pattern, size / 4);
}
return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
value, size);
}
port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
host_src, size);
}
port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
AsROCmDevicePtr(gpu_src), size);
}
port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
AsROCmDevicePtr(gpu_src), size);
}
bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return Memset32(stream, location, 0x0, size);
} else {
return Memset(stream, location, 0x0, size);
}
}
bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
uint8 pattern, uint64 size) {
VLOG(2) << "enqueueing memset8 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
pattern, size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
uint32 pattern, uint64 size) {
VLOG(2) << "enqueueing memset32 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0);
return GpuDriver::AsynchronousMemsetUint32(
context_, AsROCmDevicePtr(location), pattern, size / 4,
AsGpuStreamValue(stream));
}
bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
AsROCmDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
host_src, size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
AsROCmDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::HostCallback(Stream* stream,
std::function<port::Status()> callback) {
auto callback_ptr = new std::function<void()>([callback]() {
port::Status s = callback();
if (!s.ok()) {
LOG(WARNING) << "Host callback failed: " << s;
}
});
return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
InternalHostCallback, callback_ptr);
}
/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
hipError_t status,
void* data) {
std::function<void()>* callback =
reinterpret_cast<std::function<void()>*>(data);
(*callback)();
delete callback;
}
port::Status GpuExecutor::AllocateEvent(Event* event) {
return AsGpuEvent(event)->Init();
}
port::Status GpuExecutor::DeallocateEvent(Event* event) {
return AsGpuEvent(event)->Destroy();
}
port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
return AsGpuEvent(event)->Record(AsGpuStream(stream));
}
port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
AsGpuEvent(event)->gpu_event())) {
return port::Status::OK();
} else {
return port::Status{
port::error::INTERNAL,
absl::StrFormat("error recording waiting for ROCM event on stream %p",
stream)};
}
}
Event::Status GpuExecutor::PollForEventStatus(Event* event) {
return AsGpuEvent(event)->PollForStatus();
}
bool GpuExecutor::AllocateStream(Stream* stream) {
return AsGpuStream(stream)->Init();
}
void GpuExecutor::DeallocateStream(Stream* stream) {
GpuStream* rocm_stream = AsGpuStream(stream);
if (!rocm_stream->IsIdle()) {
LOG(ERROR) << "Deallocating stream with pending work";
}
rocm_stream->Destroy();
}
bool GpuExecutor::AllocateTimer(Timer* timer) {
return AsGpuTimer(timer)->Init();
}
void GpuExecutor::DeallocateTimer(Timer* timer) {
AsGpuTimer(timer)->Destroy();
}
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
AsGpuStreamValue(other))
.ok();
if (!ok) {
LOG(ERROR) << "failed to record completion event; "
"therefore, failed to create inter-stream dependency";
return false;
}
return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
other_completed_event);
}
bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}
bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}
port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}
blas::BlasSupport* GpuExecutor::CreateBlas() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::BlasFactory> status =
registry->GetFactory<PluginRegistry::BlasFactory>(kROCmPlatformId,
plugin_config_.blas());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve BLAS factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
dnn::DnnSupport* GpuExecutor::CreateDnn() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::DnnFactory> status =
registry->GetFactory<PluginRegistry::DnnFactory>(kROCmPlatformId,
plugin_config_.dnn());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve DNN factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
fft::FftSupport* GpuExecutor::CreateFft() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::FftFactory> status =
registry->GetFactory<PluginRegistry::FftFactory>(kROCmPlatformId,
plugin_config_.fft());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve FFT factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
rng::RngSupport* GpuExecutor::CreateRng() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::RngFactory> status =
registry->GetFactory<PluginRegistry::RngFactory>(kROCmPlatformId,
plugin_config_.rng());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve RNG factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }
bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
}
port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
}
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
port::StatusOr<hipSharedMemConfig> rocm_config =
GpuDriver::ContextGetSharedMemConfig(context_);
if (!rocm_config.ok()) {
// Don't log; the failed call will log necessary output.
return SharedMemoryConfig::kDefault;
}
switch (rocm_config.ValueOrDie()) {
case hipSharedMemBankSizeDefault:
return SharedMemoryConfig::kDefault;
case hipSharedMemBankSizeFourByte:
return SharedMemoryConfig::kFourByte;
case hipSharedMemBankSizeEightByte:
return SharedMemoryConfig::kEightByte;
default:
LOG(FATAL) << "Invalid shared memory configuration returned: "
<< rocm_config.ValueOrDie();
}
}
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
SharedMemoryConfig config) {
hipSharedMemConfig rocm_config;
switch (config) {
case SharedMemoryConfig::kDefault:
rocm_config = hipSharedMemBankSizeDefault;
break;
case SharedMemoryConfig::kFourByte:
rocm_config = hipSharedMemBankSizeFourByte;
break;
case SharedMemoryConfig::kEightByte:
rocm_config = hipSharedMemBankSizeEightByte;
break;
default:
LOG(FATAL) << "Invalid shared memory configuration specified: "
<< static_cast<int>(config);
}
return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
}
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
bool GpuExecutor::GetSymbol(const string& symbol_name,
ModuleHandle module_handle, void** mem,
size_t* bytes) {
{ // give limited scope to mutex_lock
mutex_lock lock{disk_modules_mu_};
for (auto& it : disk_modules_) {
if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem),
bytes)) {
return true;
}
}
}
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
for (auto& it : in_memory_modules_) {
if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem),
bytes)) {
return true;
}
}
}
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
if (static_cast<bool>(module_handle)) {
auto it = gpu_binary_to_module_.find(module_handle.id());
CHECK(it != gpu_binary_to_module_.end());
if (GpuDriver::GetModuleSymbol(
context_, it->second.first, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
return true;
}
}
for (auto& it : gpu_binary_to_module_) {
if (GpuDriver::GetModuleSymbol(
context_, it.second.first, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
return true;
}
}
}
LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
return false;
}
bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
// The BlockDim name is a mismatch against these GRID_DIM_* queries because
// we use BlockDims to express the dimensions of blocks within a grid
// (as opposed to ThreadDim which expresses the dimensions of threads
// within a block).
int x, y, z;
if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
return false;
}
block_dim_limit->x = x;
block_dim_limit->y = y;
block_dim_limit->z = z;
return true;
}
bool GpuExecutor::SupportsBlas() const { return true; }
bool GpuExecutor::SupportsFft() const { return true; }
bool GpuExecutor::SupportsRng() const { return true; }
std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}
std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}
std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}
std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}
void* GpuExecutor::GpuContextHack() { return context_; }
GpuContext* GpuExecutor::gpu_context() { return context_; }
// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
// TODO(ROCm) implement this feature in HIP
return 1;
}
DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
internal::DeviceDescriptionBuilder builder;
{
int driver_version = 0;
(void)GpuDriver::GetDriverVersion(&driver_version);
string augmented_driver_version = absl::StrFormat(
"%d (%s)", driver_version,
DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
builder.set_driver_version(augmented_driver_version);
}
{
string pci_bus_id = GpuDriver::GetPCIBusID(device_);
// Lower the hex characters to match sysfs.
pci_bus_id = port::Lowercase(pci_bus_id);
builder.set_pci_bus_id(pci_bus_id);
// Read the NUMA node corresponding to the PCI bus ID out of sysfs.
int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
builder.set_numa_node(numa_node);
}
hipDeviceProp_t prop;
if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
ThreadDim thread_dim_limit;
thread_dim_limit.x = prop.maxThreadsDim[0];
thread_dim_limit.y = prop.maxThreadsDim[1];
thread_dim_limit.z = prop.maxThreadsDim[2];
builder.set_thread_dim_limit(thread_dim_limit);
float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
builder.set_clock_rate_ghz(clock_rate_ghz);
}
{
bool ecc_enabled = false;
(void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
builder.set_ecc_enabled(ecc_enabled);
}
{
uint64 device_memory_size = -1;
(void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
builder.set_device_memory_size(device_memory_size);
}
{
BlockDim block_dim_limit;
FillBlockDimLimit(&block_dim_limit);
builder.set_block_dim_limit(block_dim_limit);
}
{
string device_name;
(void)GpuDriver::GetDeviceName(device_, &device_name);
builder.set_name(device_name);
}
builder.set_platform_version(
absl::StrCat("AMDGPU ISA version: gfx", version_));
// TODO(leary) should be a way to query this from the driver, but this is
// unlikely to change for us any time soon.
builder.set_device_address_bits(64);
builder.set_device_vendor("Advanced Micro Devices, Inc");
builder.set_rocm_amdgpu_isa_version(version_);
builder.set_shared_memory_per_core(
GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
builder.set_shared_memory_per_block(
GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
builder.set_core_count(
GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
builder.set_threads_per_core_limit(
GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
builder.set_registers_per_block_limit(
GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
builder.set_threads_per_warp(
GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
builder.set_registers_per_core_limit(64 * 1024);
auto built = builder.Build();
return built.release();
}
} // namespace gpu
void initialize_rocm_gpu_executor() {
*internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
return new gpu::GpuExecutor{config};
};
}
} // namespace stream_executor
REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
stream_executor::initialize_rocm_gpu_executor();
});
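For context (illustrative, not part of this change): the kernarg packing in GpuExecutor::Launch relies on HIP's "extra" launch-parameter convention, which GpuDriver::LaunchKernel is expected to forward to hipModuleLaunchKernel roughly as sketched below. The helper name and parameters are assumptions for illustration.
// Illustrative sketch of launching a module-loaded kernel with a packed
// argument buffer instead of per-argument kernelParams.
#include "rocm/include/hip/hip_runtime.h"
hipError_t LaunchWithPackedArgs(hipFunction_t func, hipStream_t stream,
                                void* arg_buffer, size_t arg_buffer_size,
                                dim3 grid, dim3 block,
                                unsigned int shared_bytes) {
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, arg_buffer,
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_buffer_size,
                    HIP_LAUNCH_PARAM_END};
  // kernelParams is null; all arguments travel through the packed buffer.
  return hipModuleLaunchKernel(func, grid.x, grid.y, grid.z, block.x, block.y,
                               block.z, shared_bytes, stream,
                               /*kernelParams=*/nullptr, config);
}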

View File

@ -0,0 +1,38 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
namespace stream_executor {
namespace gpu {
hipFuncCache_t GpuKernel::GetGpuCacheConfig() const {
switch (preferred_cache_config_) {
case KernelCacheConfig::kNoPreference:
return hipFuncCachePreferNone;
case KernelCacheConfig::kPreferShared:
return hipFuncCachePreferShared;
case KernelCacheConfig::kPreferL1:
return hipFuncCachePreferL1;
case KernelCacheConfig::kPreferEqual:
return hipFuncCachePreferEqual;
default:
LOG(FATAL) << "Unknown KernelCacheConfig"
<< static_cast<int32>(preferred_cache_config_);
}
}
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,180 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/rocm/rocm_platform.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
namespace stream_executor {
namespace gpu {
ROCmPlatform::ROCmPlatform()
: name_("ROCM"), min_numa_node_(0), limit_numa_node_(0) {}
ROCmPlatform::~ROCmPlatform() {}
// Due to legacy issues in user code, we can't currently call InspectNumaNodes
// at module initialization time, because non-GPU programs still include this
// plugin via various methods, so instead it has to be init-on-reference.
void ROCmPlatform::InspectNumaNodes() {
// To get NUMA node information, we need to create all executors, so we can
// examine their device descriptions to see their bus assignments.
static bool initialized = false;
static mutex numa_mutex(LINKER_INITIALIZED);
mutex_lock lock(numa_mutex);
if (initialized) {
return;
}
StreamExecutorConfig config;
for (int i = 0; i < VisibleDeviceCount(); i++) {
config.ordinal = i;
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
if (i == 0) {
// NUMA nodes may not start at 0, so set the minimum node based on the
// first executor we see.
min_numa_node_ = exec->GetDeviceDescription().numa_node();
limit_numa_node_ = min_numa_node_ + 1;
} else {
min_numa_node_ =
std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
limit_numa_node_ = std::max(limit_numa_node_,
exec->GetDeviceDescription().numa_node() + 1);
}
}
initialized = true;
}
int ROCmPlatform::BusCount() {
InspectNumaNodes();
return limit_numa_node_ - min_numa_node_;
}
int ROCmPlatform::DeviceToBus(int device_ordinal) {
StreamExecutorConfig config;
config.ordinal = device_ordinal;
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
return exec->GetDeviceDescription().numa_node() - min_numa_node_;
}
port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
int bus_ordinal) {
InspectNumaNodes();
CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range";
for (int i = 0; i < VisibleDeviceCount(); i++) {
if (DeviceToBus(i) == bus_ordinal) {
StreamExecutorConfig config;
config.ordinal = i;
return GetExecutor(config).ValueOrDie();
}
}
return port::Status{
port::error::NOT_FOUND,
absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
}
Platform::Id ROCmPlatform::id() const { return kROCmPlatformId; }
int ROCmPlatform::VisibleDeviceCount() const {
// Throw away the result - it logs internally, and this [containing] function
// isn't in the path of user control. It's safe to call this > 1x.
if (!gpu::GpuDriver::Init().ok()) {
return -1;
}
return GpuDriver::GetDeviceCount();
}
const string& ROCmPlatform::Name() const { return name_; }
port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
StreamExecutorConfig config;
config.ordinal = ordinal;
config.plugin_config = PluginConfig();
config.device_options = DeviceOptions::Default();
return GetExecutor(config);
}
port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDeviceWithPluginConfig(
int device_ordinal, const PluginConfig& plugin_config) {
StreamExecutorConfig config;
config.ordinal = device_ordinal;
config.plugin_config = plugin_config;
config.device_options = DeviceOptions::Default();
return GetExecutor(config);
}
port::StatusOr<StreamExecutor*> ROCmPlatform::GetExecutor(
const StreamExecutorConfig& config) {
return executor_cache_.GetOrCreate(
config, [&]() { return GetUncachedExecutor(config); });
}
port::StatusOr<std::unique_ptr<StreamExecutor>>
ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
auto executor = MakeUnique<StreamExecutor>(
this, MakeUnique<GpuExecutor>(config.plugin_config));
auto init_status = executor->Init(config.ordinal, config.device_options);
if (!init_status.ok()) {
return port::Status{
port::error::INTERNAL,
absl::StrFormat(
"failed initializing StreamExecutor for ROCM device ordinal %d: %s",
config.ordinal, init_status.ToString().c_str())};
}
return std::move(executor);
}
void ROCmPlatform::RegisterTraceListener(
std::unique_ptr<TraceListener> listener) {
LOG(FATAL) << "not yet implemented: register ROCM trace listener";
}
void ROCmPlatform::UnregisterTraceListener(TraceListener* listener) {
LOG(FATAL) << "not yet implemented: unregister ROCM trace listener";
}
} // namespace gpu
static void InitializeROCmPlatform() {
// Disabling leak checking, MultiPlatformManager does not destroy its
// registered platforms.
auto status = MultiPlatformManager::PlatformWithName("ROCM");
if (!status.ok()) {
std::unique_ptr<gpu::ROCmPlatform> platform(new gpu::ROCmPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}
}
} // namespace stream_executor
REGISTER_MODULE_INITIALIZER(rocm_platform,
stream_executor::InitializeROCmPlatform());
DECLARE_MODULE_INITIALIZER(multi_platform_manager);
// Note that module initialization sequencing is not supported in the
// open-source project, so this will be a no-op there.
REGISTER_MODULE_INITIALIZER_SEQUENCE(rocm_platform, multi_platform_manager);
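For orientation (illustrative only, not part of this diff): once the module initializer above has registered the platform, client code can typically reach a ROCm StreamExecutor through MultiPlatformManager as sketched below; the helper name is an assumption and error handling is trimmed.
// Illustrative lookup of the ROCm platform and an executor for one device.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
stream_executor::StreamExecutor* GetRocmExecutorOrDie(int ordinal) {
  namespace se = stream_executor;
  // "ROCM" matches the name_ set in ROCmPlatform's constructor above.
  se::Platform* platform =
      se::MultiPlatformManager::PlatformWithName("ROCM").ValueOrDie();
  return platform->ExecutorForDevice(ordinal).ValueOrDie();
}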

View File

@ -0,0 +1,110 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
#include <memory>
#include <vector>
#include "tensorflow/stream_executor/executor_cache.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/trace_listener.h"
namespace stream_executor {
namespace gpu {
// Opaque and unique identifier for the ROCM platform plugin.
// This is needed so that plugins can refer to/identify this platform without
// instantiating a ROCmPlatform object.
extern const Platform::Id kROCmPlatformId;
// ROCm-specific platform plugin, registered as a singleton value via module
// initializer.
class ROCmPlatform : public Platform {
public:
ROCmPlatform();
~ROCmPlatform() override;
// ROCmPlatform-specific functionality
// Returns the number of distinct buses / NUMA nodes on the machine.
int BusCount();
// Returns the bus/NUMA node for the specified device ordinal.
int DeviceToBus(int device_ordinal);
// Returns the lowest-ordinal-number StreamExecutor on the specified bus.
port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal);
// Platform interface implementation:
  // Returns the same value as kROCmPlatformId above.
Platform::Id id() const override;
// Returns -1 as a sentinel on internal failure (and logs the error).
int VisibleDeviceCount() const override;
const string& Name() const override;
port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
int ordinal, const PluginConfig& config) override;
port::StatusOr<StreamExecutor*> GetExecutor(
const StreamExecutorConfig& config) override;
port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
const StreamExecutorConfig& config) override;
void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
void UnregisterTraceListener(TraceListener* listener) override;
private:
  // Determines the number of NUMA nodes and the assignment of executors to each.
void InspectNumaNodes();
// This platform's name.
string name_;
// mutex that guards internal state.
mutable mutex mu_;
// Cache of created executors.
ExecutorCache executor_cache_;
// The smallest NUMA node value for any device managed by this machine
// manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense.
int min_numa_node_;
// Larger than the NUMA node value for any device managed by this machine
// manager.
int limit_numa_node_;
SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_

View File

@ -0,0 +1,24 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
namespace stream_executor {
namespace gpu {
PLATFORM_DEFINE_ID(kROCmPlatformId);
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,34 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
#include "tensorflow/stream_executor/platform.h"
namespace stream_executor {
namespace gpu {
// Opaque and unique identifier for the ROCm platform.
// This is needed so that plugins can refer to/identify this platform without
// instantiating a ROCmPlatform object.
// This is broken out here to avoid a circular dependency between ROCmPlatform
// and GpuExecutor.
extern const Platform::Id kROCmPlatformId;
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_

View File

@ -0,0 +1,284 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "rocm/include/hiprand/hiprand.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
// Formats hiprandStatus_t to output prettified values into a log stream.
std::ostream& operator<<(std::ostream& in, const hiprandStatus_t& status) {
#define OSTREAM_HIPRAND_STATUS(__name) \
case HIPRAND_STATUS_##__name: \
in << "HIPRAND_STATUS_" #__name; \
return in;
switch (status) {
OSTREAM_HIPRAND_STATUS(SUCCESS)
OSTREAM_HIPRAND_STATUS(VERSION_MISMATCH)
OSTREAM_HIPRAND_STATUS(NOT_INITIALIZED)
OSTREAM_HIPRAND_STATUS(ALLOCATION_FAILED)
OSTREAM_HIPRAND_STATUS(TYPE_ERROR)
OSTREAM_HIPRAND_STATUS(OUT_OF_RANGE)
OSTREAM_HIPRAND_STATUS(LENGTH_NOT_MULTIPLE)
OSTREAM_HIPRAND_STATUS(LAUNCH_FAILURE)
OSTREAM_HIPRAND_STATUS(PREEXISTING_FAILURE)
OSTREAM_HIPRAND_STATUS(INITIALIZATION_FAILED)
OSTREAM_HIPRAND_STATUS(ARCH_MISMATCH)
OSTREAM_HIPRAND_STATUS(INTERNAL_ERROR)
default:
in << "hiprandStatus_t(" << static_cast<int>(status) << ")";
return in;
}
}
namespace stream_executor {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
namespace wrap {
#define PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
} __name;
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandCreateGenerator);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandDestroyGenerator);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetStream);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniform);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniformDouble);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetGeneratorOffset);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormal);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormalDouble);
} // namespace wrap
GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
GpuRng::~GpuRng() {
if (rng_ != nullptr) {
wrap::hiprandDestroyGenerator(parent_, rng_);
}
}
bool GpuRng::Init() {
mutex_lock lock{mu_};
CHECK(rng_ == nullptr);
hiprandStatus_t ret =
wrap::hiprandCreateGenerator(parent_, &rng_, HIPRAND_RNG_PSEUDO_DEFAULT);
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to create random number generator: " << ret;
return false;
}
CHECK(rng_ != nullptr);
return true;
}
bool GpuRng::SetStream(Stream* stream) {
hiprandStatus_t ret =
wrap::hiprandSetStream(parent_, rng_, AsGpuStreamValue(stream));
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for random generation: " << ret;
return false;
}
return true;
}
// Returns true if std::complex stores its contents as two consecutive
// elements. Tests int, float and double, as the last two are independent
// specializations.
constexpr bool ComplexIsConsecutiveFloats() {
return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 &&
sizeof(std::complex<double>) == 16;
}
template <typename T>
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
mutex_lock lock{mu_};
static_assert(ComplexIsConsecutiveFloats(),
"std::complex values are not stored as consecutive values");
if (!SetStream(stream)) {
return false;
}
// std::complex<T> is currently implemented as two consecutive T variables.
uint64 element_count = v->ElementCount();
if (std::is_same<T, std::complex<float>>::value ||
std::is_same<T, std::complex<double>>::value) {
element_count *= 2;
}
hiprandStatus_t ret;
if (std::is_same<T, float>::value ||
std::is_same<T, std::complex<float>>::value) {
ret = wrap::hiprandGenerateUniform(
parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
element_count);
} else {
ret = wrap::hiprandGenerateUniformDouble(
parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
element_count);
}
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
<< " " << TypeString<T>() << "s at " << v->opaque() << ": "
<< ret;
return false;
}
return true;
}
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<float>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<double>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
template <typename ElemT, typename FuncT>
bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
ElemT stddev,
DeviceMemory<ElemT>* v,
FuncT func) {
mutex_lock lock{mu_};
if (!SetStream(stream)) {
return false;
}
uint64 element_count = v->ElementCount();
hiprandStatus_t ret =
func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
<< " floats at " << v->opaque() << ": " << ret;
return false;
}
return true;
}
bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
DeviceMemory<float>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::hiprandGenerateNormal);
}
bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
DeviceMemory<double>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::hiprandGenerateNormalDouble);
}
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
mutex_lock lock{mu_};
CHECK(rng_ != nullptr);
if (!CheckSeed(seed, seed_bytes)) {
return false;
}
if (!SetStream(stream)) {
return false;
}
// Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
// (which itself requires 16 for API consistency with host RNG fallbacks).
hiprandStatus_t ret = wrap::hiprandSetPseudoRandomGeneratorSeed(
parent_, rng_, *(reinterpret_cast<const uint64*>(seed)));
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set rng seed: " << ret;
return false;
}
ret = wrap::hiprandSetGeneratorOffset(parent_, rng_, 0);
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to reset rng position: " << ret;
return false;
}
return true;
}
} // namespace gpu
} // namespace stream_executor
namespace se = ::stream_executor;
REGISTER_MODULE_INITIALIZER(register_hiprand, {
se::port::Status status =
se::PluginRegistry::Instance()
->RegisterFactory<se::PluginRegistry::RngFactory>(
se::gpu::kROCmPlatformId, se::gpu::kGpuRandPlugin, "hipRAND",
[](se::internal::StreamExecutorInterface* parent)
-> se::rng::RngSupport* {
se::gpu::GpuExecutor* rocm_executor =
dynamic_cast<se::gpu::GpuExecutor*>(parent);
if (rocm_executor == nullptr) {
LOG(ERROR)
<< "Attempting to initialize an instance of the hipRAND "
<< "support library with a non-ROCM StreamExecutor";
return nullptr;
}
se::gpu::GpuRng* rng = new se::gpu::GpuRng(rocm_executor);
if (!rng->Init()) {
// Note: Init() will log a more specific error.
delete rng;
return nullptr;
}
return rng;
});
if (!status.ok()) {
LOG(ERROR) << "Unable to register hipRAND factory: "
<< status.error_message();
}
se::PluginRegistry::Instance()->SetDefaultFactory(
se::gpu::kROCmPlatformId, se::PluginKind::kRng, se::gpu::kGpuRandPlugin);
});
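For reference (illustrative, not part of this change): the wrap:: shims above forward to the plain hipRAND C API; in isolation the sequence used by GpuRng::Init, SetStream, and DoPopulateRandUniform looks roughly like this. The helper name, device pointer, and count are placeholders.
// Illustrative direct hipRAND usage: create a generator, bind it to a
// stream, fill a device buffer with uniform floats, destroy the generator.
#include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h"
bool FillUniform(hipStream_t stream, float* device_ptr, size_t count) {
  hiprandGenerator_t gen;
  if (hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT) !=
      HIPRAND_STATUS_SUCCESS) {
    return false;
  }
  bool ok = hiprandSetStream(gen, stream) == HIPRAND_STATUS_SUCCESS &&
            hiprandGenerateUniform(gen, device_ptr, count) ==
                HIPRAND_STATUS_SUCCESS;
  hiprandDestroyGenerator(gen);
  return ok;
}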

View File

@ -25,6 +25,13 @@ StreamExecutorFactory* MakeCUDAExecutorImplementation() {
return &instance;
}
// -- ROCm
StreamExecutorFactory* MakeROCMExecutorImplementation() {
static StreamExecutorFactory instance;
return &instance;
}
// -- OpenCL
StreamExecutorFactory* MakeOpenCLExecutorImplementation() {

View File

@ -374,9 +374,11 @@ using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
using KernelFactory = std::function<KernelInterface*()>;
StreamExecutorFactory* MakeCUDAExecutorImplementation();
StreamExecutorFactory *MakeCUDAExecutorImplementation();
StreamExecutorFactory* MakeOpenCLExecutorImplementation();
StreamExecutorFactory *MakeROCMExecutorImplementation();
StreamExecutorFactory *MakeOpenCLExecutorImplementation();
extern StreamExecutorFactory MakeHostExecutorImplementation;

View File

@ -71,6 +71,9 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
case PlatformKind::kCuda:
factory = *internal::MakeCUDAExecutorImplementation();
break;
case PlatformKind::kROCm:
factory = *internal::MakeROCMExecutorImplementation();
break;
case PlatformKind::kOpenCL:
factory = *internal::MakeOpenCLExecutorImplementation();
break;
@ -188,6 +191,8 @@ StreamExecutor::StreamExecutor(
memory_limit_bytes_(GetMemoryLimitBytes()) {
if (port::Lowercase(platform_->Name()) == "cuda") {
platform_kind_ = PlatformKind::kCuda;
} else if (port::Lowercase(platform_->Name()) == "rocm") {
platform_kind_ = PlatformKind::kROCm;
} else if (port::Lowercase(platform_->Name()) == "opencl") {
platform_kind_ = PlatformKind::kOpenCL;
} else if (port::Lowercase(platform_->Name()) == "host") {

View File

@ -18,6 +18,7 @@ cc_library(
includes = [
".",
"rocm/include",
"rocm/include/rocrand",
],
visibility = ["//visibility:public"],
)