Add GPU device capabilities and compute max FMA throughput.

PiperOrigin-RevId: 290881456
Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad
A. Unique TensorFlower 2020-01-21 20:49:05 -08:00 committed by TensorFlower Gardener
parent b55bd3a8ca
commit 2bfa43b081
4 changed files with 130 additions and 0 deletions

tensorflow/core/profiler/protobuf/hardware_types.proto

@@ -15,3 +15,16 @@ enum HardwareType {
  // TPU.
  TPU = 3;
}

message CudaComputeCapability {
  uint32 major = 1;
  uint32 minor = 2;
}

message DeviceCapabilities {
  double clock_rate_in_ghz = 1;
  uint32 num_cores = 2;
  uint64 memory_size_in_bytes = 3;
  uint64 memory_bandwidth = 4;
  CudaComputeCapability compute_capability = 5;
}
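
For illustration, a minimal C++ sketch of how the generated DeviceCapabilities message might be populated before handing it to the profiler utilities. The numeric values are assumed, Volta-class example figures (not taken from this change), and the tensorflow::profiler C++ namespace assumes the proto package is tensorflow.profiler:

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

namespace {

// Assumed example values for a Volta-class GPU; substitute the figures
// reported by the device actually being profiled.
tensorflow::profiler::DeviceCapabilities MakeExampleDeviceCapabilities() {
  tensorflow::profiler::DeviceCapabilities cap;
  cap.set_clock_rate_in_ghz(1.53);            // SM clock.
  cap.set_num_cores(80);                      // Number of SMs.
  cap.set_memory_size_in_bytes(16ULL << 30);  // 16 GiB of device memory.
  cap.set_memory_bandwidth(900ULL * 1024 * 1024 * 1024);  // ~900 GiB/s,
                                                           // assuming bytes/s.
  cap.mutable_compute_capability()->set_major(7);  // Compute capability 7.0.
  cap.mutable_compute_capability()->set_minor(0);
  return cap;
}

}  // namespace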

tensorflow/core/profiler/utils/BUILD

@@ -24,6 +24,16 @@ cc_library(
    ],
)

cc_library(
    name = "hardware_type_utils",
    srcs = ["hardware_type_utils.cc"],
    hdrs = ["hardware_type_utils.h"],
    deps = [
        "//tensorflow/core:lib",
        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
    ],
)

cc_library(
    name = "math_utils",
    hdrs = ["math_utils.h"],

tensorflow/core/profiler/utils/hardware_type_utils.cc

@@ -0,0 +1,76 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace profiler {
namespace {
// Get theoretical upperbound of single precision FMA throughput of the GPU per
// cycle per streaming multiprocessor.
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
uint32 n_fp32_cores = 0;
uint32 n_tc_cores = 0;
switch (device_cap.compute_capability().major()) {
case 2:
// Fermi
n_fp32_cores = 32;
break;
case 3:
// Kepler
n_fp32_cores = 192;
break;
case 5:
// Maxwell
n_fp32_cores = 128;
break;
case 6:
// Pascal
if (device_cap.compute_capability().minor() > 0) {
// Pascal SM61/62
n_fp32_cores = 128;
} else {
// Pascal SM60
n_fp32_cores = 64;
}
break;
case 7:
// Volta and Turing
n_fp32_cores = 64;
n_tc_cores = 8;
break;
default:
LOG(ERROR) << "Invalid GPU compute capability.";
break;
}
// GPU TensorCore can execute 64 FMAs per cycle.
// https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
return n_fp32_cores + n_tc_cores * 64;
}
} // namespace
double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
// One FMA = 2 floating point operations, one multiply and one add.
return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
device_cap.clock_rate_in_ghz();
}
} // namespace profiler
} // namespace tensorflow
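
As a quick sanity check of the formula above, here is a hedged usage sketch. The 1.53 GHz clock is an assumed example value for a compute-capability-7.0 device (not data from this change), and the tensorflow::profiler namespace again assumes the proto package is tensorflow.profiler; per SM this yields (64 + 8 * 64) FMAs/cycle * 2 ops/FMA * 1.53 GHz, about 1763 GFLOPS.

#include <iostream>

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"

int main() {
  // Assumed example values for a compute-capability-7.0 GPU.
  tensorflow::profiler::DeviceCapabilities cap;
  cap.set_clock_rate_in_ghz(1.53);
  cap.mutable_compute_capability()->set_major(7);
  cap.mutable_compute_capability()->set_minor(0);

  // (64 FP32 cores + 8 TensorCores * 64 FMAs) * 2 ops/FMA * 1.53 GHz
  //   = 576 * 2 * 1.53 ≈ 1762.6 GFLOPS per SM.
  std::cout << tensorflow::profiler::GetFlopMaxThroughputPerSM(cap)
            << " GFLOPS per SM\n";
  return 0;
}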

tensorflow/core/profiler/utils/hardware_type_utils.h

@@ -0,0 +1,31 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

namespace tensorflow {
namespace profiler {

// Get peak single precision throughput of the GPU in GFLOPS per
// streaming multiprocessor.
double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);

}  // namespace profiler
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_