Add GPU device capabilities and compute max FMA throughput.

PiperOrigin-RevId: 290881456
Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad

commit 2bfa43b081 (parent b55bd3a8ca)
@@ -15,3 +15,16 @@ enum HardwareType {
   // TPU.
   TPU = 3;
 }
+
+message CudaComputeCapability {
+  uint32 major = 1;
+  uint32 minor = 2;
+}
+
+message DeviceCapabilities {
+  double clock_rate_in_ghz = 1;
+  uint32 num_cores = 2;
+  uint64 memory_size_in_bytes = 3;
+  uint64 memory_bandwidth = 4;
+  CudaComputeCapability compute_capability = 5;
+}
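For readers of this change, a minimal sketch of how the new messages could be filled in from C++ (all field values below are hypothetical, chosen only for illustration, and the bandwidth unit is an assumption):

  #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

  // Hypothetical Volta-class device: compute capability 7.0, 80 SMs,
  // 16 GB of memory, ~900 GB/s bandwidth, 1.53 GHz SM clock.
  tensorflow::profiler::DeviceCapabilities MakeExampleDeviceCaps() {
    tensorflow::profiler::DeviceCapabilities caps;
    caps.set_clock_rate_in_ghz(1.53);
    caps.set_num_cores(80);
    caps.set_memory_size_in_bytes(16ULL * 1024 * 1024 * 1024);
    caps.set_memory_bandwidth(900ULL * 1000 * 1000 * 1000);  // assumed bytes/sec
    caps.mutable_compute_capability()->set_major(7);
    caps.mutable_compute_capability()->set_minor(0);
    return caps;
  }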
@@ -24,6 +24,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hardware_type_utils",
+    srcs = ["hardware_type_utils.cc"],
+    hdrs = ["hardware_type_utils.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
+    ],
+)
+
 cc_library(
     name = "math_utils",
     hdrs = ["math_utils.h"],
tensorflow/core/profiler/utils/hardware_type_utils.cc (new file, 76 lines)
@@ -0,0 +1,76 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace {
+
+// Get theoretical upperbound of single precision FMA throughput of the GPU per
+// cycle per streaming multiprocessor.
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
+uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
+  uint32 n_fp32_cores = 0;
+  uint32 n_tc_cores = 0;
+  switch (device_cap.compute_capability().major()) {
+    case 2:
+      // Fermi
+      n_fp32_cores = 32;
+      break;
+    case 3:
+      // Kepler
+      n_fp32_cores = 192;
+      break;
+    case 5:
+      // Maxwell
+      n_fp32_cores = 128;
+      break;
+    case 6:
+      // Pascal
+      if (device_cap.compute_capability().minor() > 0) {
+        // Pascal SM61/62
+        n_fp32_cores = 128;
+      } else {
+        // Pascal SM60
+        n_fp32_cores = 64;
+      }
+      break;
+    case 7:
+      // Volta and Turing
+      n_fp32_cores = 64;
+      n_tc_cores = 8;
+      break;
+    default:
+      LOG(ERROR) << "Invalid GPU compute capability.";
+      break;
+  }
+  // GPU TensorCore can execute 64 FMAs per cycle.
+  // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
+  return n_fp32_cores + n_tc_cores * 64;
+}
+
+}  // namespace
+
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
+  // One FMA = 2 floating point operations, one multiply and one add.
+  return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
+         device_cap.clock_rate_in_ghz();
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
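A quick spot check of the arithmetic above, under assumed values (not taken from this change): for a compute capability 7.x SM the switch selects 64 FP32 cores and 8 Tensor Cores, and each Tensor Core counts as 64 FMAs per cycle, so at an assumed 1.5 GHz clock:

  // Hypothetical numbers for a compute capability 7.x SM at an assumed 1.5 GHz.
  int fma_per_sm_per_cycle = 64 + 8 * 64;                      // = 576
  double flops_per_sm_per_cycle = 2.0 * fma_per_sm_per_cycle;  // = 1152 (one FMA = mul + add)
  double gflops_per_sm = flops_per_sm_per_cycle * 1.5;         // = 1728 GFLOPS per SM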
tensorflow/core/profiler/utils/hardware_type_utils.h (new file, 31 lines)
@@ -0,0 +1,31 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+
+#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Get peak single precision throughput of the GPU in GFLOPS per
+// streaming multiprocessor.
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
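One possible way to use the new header (a sketch, not part of this change): since GetFlopMaxThroughputPerSM reports a per-SM peak, a whole-device estimate can be obtained by scaling by the SM count, under the assumption that num_cores in DeviceCapabilities holds that count:

  #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
  #include "tensorflow/core/profiler/utils/hardware_type_utils.h"

  // Hypothetical helper: estimate whole-device peak single precision GFLOPS by
  // multiplying the per-SM peak by the number of SMs (assumed to be num_cores).
  double GetDevicePeakGFlops(
      const tensorflow::profiler::DeviceCapabilities& caps) {
    return tensorflow::profiler::GetFlopMaxThroughputPerSM(caps) *
           caps.num_cores();
  }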