From 2bfa43b081aafc803708a98a6cce83606aedc300 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 21 Jan 2020 20:49:05 -0800
Subject: [PATCH] Add GPU device capabilities and compute max FMA throughput.

PiperOrigin-RevId: 290881456
Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad
---
 .../profiler/protobuf/hardware_types.proto    | 13 ++++
 tensorflow/core/profiler/utils/BUILD          | 10 +++
 .../profiler/utils/hardware_type_utils.cc     | 84 +++++++++++++++++++++
 .../core/profiler/utils/hardware_type_utils.h | 33 ++++++++
 4 files changed, 140 insertions(+)
 create mode 100644 tensorflow/core/profiler/utils/hardware_type_utils.cc
 create mode 100644 tensorflow/core/profiler/utils/hardware_type_utils.h

diff --git a/tensorflow/core/profiler/protobuf/hardware_types.proto b/tensorflow/core/profiler/protobuf/hardware_types.proto
index fe04d583d48..0538ee0b056 100644
--- a/tensorflow/core/profiler/protobuf/hardware_types.proto
+++ b/tensorflow/core/profiler/protobuf/hardware_types.proto
@@ -15,3 +15,16 @@ enum HardwareType {
   // TPU.
   TPU = 3;
 }
+
+message CudaComputeCapability {
+  uint32 major = 1;
+  uint32 minor = 2;
+}
+
+message DeviceCapabilities {
+  double clock_rate_in_ghz = 1;
+  uint32 num_cores = 2;
+  uint64 memory_size_in_bytes = 3;
+  uint64 memory_bandwidth = 4;
+  CudaComputeCapability compute_capability = 5;
+}
diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD
index 41e1fa26159..ff38e825e95 100644
--- a/tensorflow/core/profiler/utils/BUILD
+++ b/tensorflow/core/profiler/utils/BUILD
@@ -24,6 +24,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hardware_type_utils",
+    srcs = ["hardware_type_utils.cc"],
+    hdrs = ["hardware_type_utils.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
+    ],
+)
+
 cc_library(
     name = "math_utils",
     hdrs = ["math_utils.h"],
diff --git a/tensorflow/core/profiler/utils/hardware_type_utils.cc b/tensorflow/core/profiler/utils/hardware_type_utils.cc
new file mode 100644
index 00000000000..db797502c27
--- /dev/null
+++ b/tensorflow/core/profiler/utils/hardware_type_utils.cc
@@ -0,0 +1,84 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace {
+
+// Get theoretical upperbound of single precision FMA throughput of the GPU per
+// cycle per streaming multiprocessor.
+// Returns 0 if the compute capability major version is not recognized.
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
+uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
+  uint32 n_fp32_cores = 0;
+  uint32 n_tc_cores = 0;
+  switch (device_cap.compute_capability().major()) {
+    case 2:
+      // Fermi
+      if (device_cap.compute_capability().minor() > 0) {
+        // Fermi SM21
+        n_fp32_cores = 48;
+      } else {
+        // Fermi SM20
+        n_fp32_cores = 32;
+      }
+      break;
+    case 3:
+      // Kepler
+      n_fp32_cores = 192;
+      break;
+    case 5:
+      // Maxwell
+      n_fp32_cores = 128;
+      break;
+    case 6:
+      // Pascal
+      if (device_cap.compute_capability().minor() > 0) {
+        // Pascal SM61/62
+        n_fp32_cores = 128;
+      } else {
+        // Pascal SM60
+        n_fp32_cores = 64;
+      }
+      break;
+    case 7:
+      // Volta and Turing
+      n_fp32_cores = 64;
+      n_tc_cores = 8;
+      break;
+    default:
+      LOG(ERROR) << "Invalid GPU compute capability.";
+      break;
+  }
+  // GPU TensorCore can execute 64 FMAs per cycle.
+  // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
+  return n_fp32_cores + n_tc_cores * 64;
+}
+
+}  // namespace
+
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
+  // One FMA = 2 floating point operations, one multiply and one add.
+  return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
+         device_cap.clock_rate_in_ghz();
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/utils/hardware_type_utils.h b/tensorflow/core/profiler/utils/hardware_type_utils.h
new file mode 100644
index 00000000000..9d4b8b73eaf
--- /dev/null
+++ b/tensorflow/core/profiler/utils/hardware_type_utils.h
@@ -0,0 +1,33 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+
+#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Get peak single precision throughput of the GPU in GFLOPS per
+// streaming multiprocessor.
+// For compute capability 7.x the result also counts Tensor Core FMAs.
+// Returns 0 if the compute capability major version is not recognized.
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_