Add GPU device capabilities and compute max FMA throughput.

PiperOrigin-RevId: 290881456
Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad
A. Unique TensorFlower 2020-01-21 20:49:05 -08:00 committed by TensorFlower Gardener
parent b55bd3a8ca
commit 2bfa43b081
4 changed files with 130 additions and 0 deletions

tensorflow/core/profiler/protobuf/hardware_types.proto

@@ -15,3 +15,16 @@ enum HardwareType {
  // TPU.
  TPU = 3;
}

message CudaComputeCapability {
  uint32 major = 1;
  uint32 minor = 2;
}

message DeviceCapabilities {
  double clock_rate_in_ghz = 1;
  uint32 num_cores = 2;
  uint64 memory_size_in_bytes = 3;
  uint64 memory_bandwidth = 4;
  CudaComputeCapability compute_capability = 5;
}
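
For illustration, a minimal C++ sketch of how the generated DeviceCapabilities message might be populated before handing it to the profiler utilities. The numeric values are assumed, Volta-class example figures (not taken from this change), and the tensorflow::profiler C++ namespace assumes the proto package is tensorflow.profiler:

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

namespace {

// Assumed example values for a Volta-class GPU; substitute the figures
// reported by the device actually being profiled.
tensorflow::profiler::DeviceCapabilities MakeExampleDeviceCapabilities() {
  tensorflow::profiler::DeviceCapabilities cap;
  cap.set_clock_rate_in_ghz(1.53);            // SM clock.
  cap.set_num_cores(80);                      // Number of SMs.
  cap.set_memory_size_in_bytes(16ULL << 30);  // 16 GiB of device memory.
  cap.set_memory_bandwidth(900ULL * 1024 * 1024 * 1024);  // ~900 GiB/s,
                                                           // assuming bytes/s.
  cap.mutable_compute_capability()->set_major(7);  // Compute capability 7.0.
  cap.mutable_compute_capability()->set_minor(0);
  return cap;
}

}  // namespace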

tensorflow/core/profiler/utils/BUILD

@@ -24,6 +24,16 @@ cc_library(
    ],
)

cc_library(
    name = "hardware_type_utils",
    srcs = ["hardware_type_utils.cc"],
    hdrs = ["hardware_type_utils.h"],
    deps = [
        "//tensorflow/core:lib",
        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
    ],
)

cc_library(
    name = "math_utils",
    hdrs = ["math_utils.h"],

tensorflow/core/profiler/utils/hardware_type_utils.cc

@@ -0,0 +1,76 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace profiler {
namespace {
// Get theoretical upperbound of single precision FMA throughput of the GPU per
// cycle per streaming multiprocessor.
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
uint32 n_fp32_cores = 0;
uint32 n_tc_cores = 0;
switch (device_cap.compute_capability().major()) {
case 2:
// Fermi
n_fp32_cores = 32;
break;
case 3:
// Kepler
n_fp32_cores = 192;
break;
case 5:
// Maxwell
n_fp32_cores = 128;
break;
case 6:
// Pascal
if (device_cap.compute_capability().minor() > 0) {
// Pascal SM61/62
n_fp32_cores = 128;
} else {
// Pascal SM60
n_fp32_cores = 64;
}
break;
case 7:
// Volta and Turing
n_fp32_cores = 64;
n_tc_cores = 8;
break;
default:
LOG(ERROR) << "Invalid GPU compute capability.";
break;
}
// GPU TensorCore can execute 64 FMAs per cycle.
// https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
return n_fp32_cores + n_tc_cores * 64;
}
} // namespace
double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
// One FMA = 2 floating point operations, one multiply and one add.
return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
device_cap.clock_rate_in_ghz();
}
} // namespace profiler
} // namespace tensorflow
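
As a quick sanity check of the formula above, here is a hedged usage sketch. The 1.53 GHz clock is an assumed example value for a compute-capability-7.0 device (not data from this change), and the tensorflow::profiler namespace again assumes the proto package is tensorflow.profiler; per SM this yields (64 + 8 * 64) FMAs/cycle * 2 ops/FMA * 1.53 GHz, about 1763 GFLOPS.

#include <iostream>

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"

int main() {
  // Assumed example values for a compute-capability-7.0 GPU.
  tensorflow::profiler::DeviceCapabilities cap;
  cap.set_clock_rate_in_ghz(1.53);
  cap.mutable_compute_capability()->set_major(7);
  cap.mutable_compute_capability()->set_minor(0);

  // (64 FP32 cores + 8 TensorCores * 64 FMAs) * 2 ops/FMA * 1.53 GHz
  //   = 576 * 2 * 1.53 ≈ 1762.6 GFLOPS per SM.
  std::cout << tensorflow::profiler::GetFlopMaxThroughputPerSM(cap)
            << " GFLOPS per SM\n";
  return 0;
}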

tensorflow/core/profiler/utils/hardware_type_utils.h

@@ -0,0 +1,31 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

namespace tensorflow {
namespace profiler {

// Get peak single precision throughput of the GPU in GFLOPS per
// streaming multiprocessor.
double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);

}  // namespace profiler
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_