From 2bfa43b081aafc803708a98a6cce83606aedc300 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 21 Jan 2020 20:49:05 -0800
Subject: [PATCH] Add GPU device capabilities and compute max FMA throughput.

PiperOrigin-RevId: 290881456
Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad
---
 .../profiler/protobuf/hardware_types.proto    | 13 ++++
 tensorflow/core/profiler/utils/BUILD          | 10 +++
 .../profiler/utils/hardware_type_utils.cc     | 76 +++++++++++++++++++
 .../core/profiler/utils/hardware_type_utils.h | 31 ++++++++
 4 files changed, 130 insertions(+)
 create mode 100644 tensorflow/core/profiler/utils/hardware_type_utils.cc
 create mode 100644 tensorflow/core/profiler/utils/hardware_type_utils.h

diff --git a/tensorflow/core/profiler/protobuf/hardware_types.proto b/tensorflow/core/profiler/protobuf/hardware_types.proto
index fe04d583d48..0538ee0b056 100644
--- a/tensorflow/core/profiler/protobuf/hardware_types.proto
+++ b/tensorflow/core/profiler/protobuf/hardware_types.proto
@@ -15,3 +15,16 @@ enum HardwareType {
   // TPU.
   TPU = 3;
 }
+
+message CudaComputeCapability {
+  uint32 major = 1;  // Major version, e.g. the 6 in compute capability 6.1.
+  uint32 minor = 2;  // Minor version, e.g. the 1 in compute capability 6.1.
+}
+
+message DeviceCapabilities {
+  double clock_rate_in_ghz = 1;  // Device clock rate, in GHz.
+  uint32 num_cores = 2;  // NOTE(review): presumably the SM count on GPUs; confirm.
+  uint64 memory_size_in_bytes = 3;  // Device memory size, in bytes.
+  uint64 memory_bandwidth = 4;  // NOTE(review): unit is not stated here; confirm.
+  CudaComputeCapability compute_capability = 5;  // CUDA major.minor version.
+}
diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD
index 41e1fa26159..ff38e825e95 100644
--- a/tensorflow/core/profiler/utils/BUILD
+++ b/tensorflow/core/profiler/utils/BUILD
@@ -24,6 +24,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hardware_type_utils",  # Peak GPU throughput helpers.
+    srcs = ["hardware_type_utils.cc"],
+    hdrs = ["hardware_type_utils.h"],
+    deps = [
+        "//tensorflow/core:lib",  # Presumably supplies platform/types.h and LOG; confirm.
+        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",  # DeviceCapabilities.
+    ],
+)
+
 cc_library(
     name = "math_utils",
     hdrs = ["math_utils.h"],
diff --git a/tensorflow/core/profiler/utils/hardware_type_utils.cc b/tensorflow/core/profiler/utils/hardware_type_utils.cc
new file mode 100644
index 00000000000..db797502c27
--- /dev/null
+++ b/tensorflow/core/profiler/utils/hardware_type_utils.cc
@@ -0,0 +1,78 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace {
+
+// Returns the theoretical upper bound of single precision FMA throughput of
+// one GPU streaming multiprocessor (SM), in FMAs per cycle: the number of FP32
+// cores for the device's compute capability plus 64 FMAs for each TensorCore.
+// Logs an error and returns 0 if the major compute capability is not handled.
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
+uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
+  const uint32 major_cap = device_cap.compute_capability().major();
+  const uint32 minor_cap = device_cap.compute_capability().minor();
+  uint32 n_fp32_cores = 0;
+  uint32 n_tc_cores = 0;
+  switch (major_cap) {
+    case 2:
+      // Fermi: 32 FP32 cores per SM on SM20, 48 on SM21.
+      n_fp32_cores = (minor_cap > 0) ? 48 : 32;
+      break;
+    case 3:
+      // Kepler
+      n_fp32_cores = 192;
+      break;
+    case 5:
+      // Maxwell
+      n_fp32_cores = 128;
+      break;
+    case 6:
+      // Pascal: 64 FP32 cores per SM on SM60, 128 on SM61/62.
+      n_fp32_cores = (minor_cap > 0) ? 128 : 64;
+      break;
+    case 7:
+      // Volta and Turing
+      n_fp32_cores = 64;
+      n_tc_cores = 8;
+      break;
+    default:
+      // Unhandled architecture: leave both counts at 0 so the result is 0.
+      LOG(ERROR) << "Invalid GPU compute capability.";
+      break;
+  }
+  // GPU TensorCore can execute 64 FMAs per cycle.
+  // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
+  return n_fp32_cores + n_tc_cores * 64;
+}
+
+}  // namespace
+
+// Returns the peak throughput of one SM in GFLOPS, computed as
+// FMAs per cycle * 2 FLOPs per FMA * clock rate in GHz.
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
+  // One FMA = 2 floating point operations, one multiply and one add.
+  return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
+         device_cap.clock_rate_in_ghz();
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/utils/hardware_type_utils.h b/tensorflow/core/profiler/utils/hardware_type_utils.h
new file mode 100644
index 00000000000..9d4b8b73eaf
--- /dev/null
+++ b/tensorflow/core/profiler/utils/hardware_type_utils.h
@@ -0,0 +1,31 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+
+#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Returns the theoretical peak throughput, in GFLOPS, of one GPU streaming
+// multiprocessor (FP32 cores plus TensorCores); 0 if capability is unhandled.
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_