From d236afda36626f3dd6dfea234413b3b4d62fc9a0 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Tue, 22 Dec 2020 16:17:25 +0000 Subject: [PATCH] Adding hooks in the Stream Executor API to get/set the AMDGPU gcnArchName device property --- tensorflow/core/common_runtime/gpu/gpu_device.cc | 8 ++------ tensorflow/stream_executor/cuda/cuda_driver.cc | 7 +++++++ tensorflow/stream_executor/device_description.cc | 3 +++ tensorflow/stream_executor/device_description.h | 14 ++++++++++++++ tensorflow/stream_executor/gpu/gpu_driver.h | 6 ++++++ tensorflow/stream_executor/rocm/rocm_driver.cc | 15 +++++++++++++++ .../stream_executor/rocm/rocm_gpu_executor.cc | 10 +++++++++- tensorflow/stream_executor/tpu/c_api_decl.h | 1 + 8 files changed, 57 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 50647b8cd76..1fc99c5a520 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -1770,15 +1770,11 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( << strings::HumanReadableNumBytes(description->memory_bandwidth()) << "/s"; #elif TENSORFLOW_USE_ROCM - int isa_version; - if (!description->rocm_amdgpu_isa_version(&isa_version)) { - // Logs internally on failure. 
- isa_version = 0; - } + std::string gcn_arch_name = description->rocm_amdgpu_gcn_arch_name(); LOG(INFO) << "Found device " << i << " with properties: " << "\npciBusID: " << description->pci_bus_id() << " name: " << description->name() - << " ROCm AMD GPU ISA: gfx" << isa_version + << " ROCm AMDGPU Arch: " << gcn_arch_name << "\ncoreClock: " << description->clock_rate_ghz() << "GHz" << " coreCount: " << description->core_count() << " deviceMemorySize: " diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index 42db563a0bd..2b64db17197 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -1388,6 +1388,13 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) { "Feature not supported on CUDA platform (GetGpuISAVersion)"}; } +/* static */ port::Status GpuDriver::GetGpuGCNArchName( + CUdevice device, std::string* gcnArchName) { + return port::Status{ + port::error::INTERNAL, + "Feature not supported on CUDA platform (GetGpuGCNArchName)"}; +} + // Helper function that turns the integer output of cuDeviceGetAttribute to type // T and wraps it in a StatusOr. 
template diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc index 130e2e638e5..4ffe02e7d96 100644 --- a/tensorflow/stream_executor/device_description.cc +++ b/tensorflow/stream_executor/device_description.cc @@ -51,6 +51,7 @@ DeviceDescription::DeviceDescription() cuda_compute_capability_major_(-1), cuda_compute_capability_minor_(-1), rocm_amdgpu_isa_version_(-1), + rocm_amdgpu_gcn_arch_name_(kUndefinedString), numa_node_(-1), core_count_(-1), ecc_enabled_(false) {} @@ -95,6 +96,8 @@ std::unique_ptr> DeviceDescription::ToMap() result["CUDA Compute Capability"] = absl::StrCat( cuda_compute_capability_major_, ".", cuda_compute_capability_minor_); + result["AMDGPU GCN Arch Name"] = absl::StrCat(rocm_amdgpu_gcn_arch_name_); + result["NUMA Node"] = absl::StrCat(numa_node()); result["Core Count"] = absl::StrCat(core_count()); result["ECC Enabled"] = absl::StrCat(ecc_enabled()); diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h index fa7426eb04b..fef4be435ad 100644 --- a/tensorflow/stream_executor/device_description.h +++ b/tensorflow/stream_executor/device_description.h @@ -138,6 +138,13 @@ class DeviceDescription { // and the return value will be false. bool rocm_amdgpu_isa_version(int *version) const; + // Returns the + // * AMDGPU GCN Architecture Name if we're running on the ROCm platform. + // * kUndefinedString otherwise + const std::string rocm_amdgpu_gcn_arch_name() const { + return rocm_amdgpu_gcn_arch_name_; + } + // Returns the maximum amount of shared memory present on a single core // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL // devices). Note that some devices, such as NVIDIA's have a configurable @@ -203,6 +210,9 @@ class DeviceDescription { // ROCM AMDGPU ISA version, 0 if not available. int rocm_amdgpu_isa_version_; + // ROCm AMDGPU GCN Architecture name, kUndefinedString if not available. 
+ std::string rocm_amdgpu_gcn_arch_name_; + int numa_node_; int core_count_; bool ecc_enabled_; @@ -294,6 +304,10 @@ class DeviceDescriptionBuilder { device_description_->rocm_amdgpu_isa_version_ = version; } + void set_rocm_amdgpu_gcn_arch_name(const std::string& gcn_arch_name) { + device_description_->rocm_amdgpu_gcn_arch_name_ = gcn_arch_name; + } + void set_numa_node(int value) { device_description_->numa_node_ = value; } void set_core_count(int value) { device_description_->core_count_ = value; } void set_ecc_enabled(bool value) { diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h index 3cd13dcc013..955ed59926a 100644 --- a/tensorflow/stream_executor/gpu/gpu_driver.h +++ b/tensorflow/stream_executor/gpu/gpu_driver.h @@ -460,6 +460,12 @@ class GpuDriver { // (supported on ROCm only) static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device); + // Return the full GCN Architecture Name for the device + // e.g. amdgcn-amd-amdhsa--gfx908:sramecc+:xnack- + // (supported on ROCm only) + static port::Status GetGpuGCNArchName(GpuDeviceHandle device, + std::string* gcnArchName); + // Returns the number of multiprocessors on the device (note that the device // may be multi-GPU-per-board). 
static port::StatusOr GetMultiprocessorCount(GpuDeviceHandle device); diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index a070979e71d..f7be297c3d7 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -1080,6 +1080,21 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { device)}; } +/* static */ port::Status GpuDriver::GetGpuGCNArchName( + hipDevice_t device, std::string* gcnArchName) { + hipDeviceProp_t props; + hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device); + if (result == hipSuccess) { + *gcnArchName = props.gcnArchName; + return port::Status::OK(); + } + *gcnArchName = ""; + return port::Status{ + port::error::INTERNAL, + absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d", + device)}; +} + // Helper function that turns the integer output of hipDeviceGetAttribute to // type T and wraps it in a StatusOr. template diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index dbab0304d82..3926aeeb288 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -820,6 +820,12 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) { return status; } + string gcn_arch_name; + status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name); + if (!status.ok()) { + return status; + } + internal::DeviceDescriptionBuilder builder; { @@ -888,7 +894,7 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) { } builder.set_platform_version( - absl::StrCat("AMDGPU ISA version: gfx", version)); + absl::StrCat("AMDGPU ISA version: ", gcn_arch_name)); // TODO(leary) should be a way to query this from the driver, but this is // unlikely to change for us any time soon. 
@@ -896,6 +902,8 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) { builder.set_device_vendor("Advanced Micro Devices, Inc"); builder.set_rocm_amdgpu_isa_version(version); + builder.set_rocm_amdgpu_gcn_arch_name(gcn_arch_name); + builder.set_shared_memory_per_core( GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie()); builder.set_shared_memory_per_block( diff --git a/tensorflow/stream_executor/tpu/c_api_decl.h b/tensorflow/stream_executor/tpu/c_api_decl.h index 71a725f5886..95af7303e77 100644 --- a/tensorflow/stream_executor/tpu/c_api_decl.h +++ b/tensorflow/stream_executor/tpu/c_api_decl.h @@ -140,6 +140,7 @@ typedef struct SE_DeviceDescription { int cuda_compute_capability_minor; int rocm_amdgpu_isa_version; + char* rocm_amdgpu_gcn_arch_name; int numa_node; int core_count;