Add GPU device capabilities and compute max FMA throughput.

PiperOrigin-RevId: 290881456
Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad

commit 2bfa43b081 (parent b55bd3a8ca)
@@ -15,3 +15,16 @@ enum HardwareType {
   // TPU.
   TPU = 3;
 }
+
+message CudaComputeCapability {
+  uint32 major = 1;
+  uint32 minor = 2;
+}
+
+message DeviceCapabilities {
+  double clock_rate_in_ghz = 1;
+  uint32 num_cores = 2;
+  uint64 memory_size_in_bytes = 3;
+  uint64 memory_bandwidth = 4;
+  CudaComputeCapability compute_capability = 5;
+}
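For readers of this change, a minimal sketch of how the new messages could be filled in from C++ (all field values below are hypothetical, chosen only for illustration, and the bandwidth unit is an assumption):

  #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

  // Hypothetical Volta-class device: compute capability 7.0, 80 SMs,
  // 16 GB of memory, ~900 GB/s bandwidth, 1.53 GHz SM clock.
  tensorflow::profiler::DeviceCapabilities MakeExampleDeviceCaps() {
    tensorflow::profiler::DeviceCapabilities caps;
    caps.set_clock_rate_in_ghz(1.53);
    caps.set_num_cores(80);
    caps.set_memory_size_in_bytes(16ULL * 1024 * 1024 * 1024);
    caps.set_memory_bandwidth(900ULL * 1000 * 1000 * 1000);  // assumed bytes/sec
    caps.mutable_compute_capability()->set_major(7);
    caps.mutable_compute_capability()->set_minor(0);
    return caps;
  }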
@@ -24,6 +24,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hardware_type_utils",
+    srcs = ["hardware_type_utils.cc"],
+    hdrs = ["hardware_type_utils.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
+    ],
+)
+
 cc_library(
     name = "math_utils",
     hdrs = ["math_utils.h"],
tensorflow/core/profiler/utils/hardware_type_utils.cc (new file, 76 lines)
@@ -0,0 +1,76 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace {
+
+// Get theoretical upperbound of single precision FMA throughput of the GPU per
+// cycle per streaming multiprocessor.
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
+uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
+  uint32 n_fp32_cores = 0;
+  uint32 n_tc_cores = 0;
+  switch (device_cap.compute_capability().major()) {
+    case 2:
+      // Fermi
+      n_fp32_cores = 32;
+      break;
+    case 3:
+      // Kepler
+      n_fp32_cores = 192;
+      break;
+    case 5:
+      // Maxwell
+      n_fp32_cores = 128;
+      break;
+    case 6:
+      // Pascal
+      if (device_cap.compute_capability().minor() > 0) {
+        // Pascal SM61/62
+        n_fp32_cores = 128;
+      } else {
+        // Pascal SM60
+        n_fp32_cores = 64;
+      }
+      break;
+    case 7:
+      // Volta and Turing
+      n_fp32_cores = 64;
+      n_tc_cores = 8;
+      break;
+    default:
+      LOG(ERROR) << "Invalid GPU compute capability.";
+      break;
+  }
+  // GPU TensorCore can execute 64 FMAs per cycle.
+  // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
+  return n_fp32_cores + n_tc_cores * 64;
+}
+
+}  // namespace
+
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
+  // One FMA = 2 floating point operations, one multiply and one add.
+  return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
+         device_cap.clock_rate_in_ghz();
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
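A quick spot check of the arithmetic above, under assumed values (not taken from this change): for a compute capability 7.x SM the switch selects 64 FP32 cores and 8 Tensor Cores, and each Tensor Core counts as 64 FMAs per cycle, so at an assumed 1.5 GHz clock:

  // Hypothetical numbers for a compute capability 7.x SM at an assumed 1.5 GHz.
  int fma_per_sm_per_cycle = 64 + 8 * 64;                      // = 576
  double flops_per_sm_per_cycle = 2.0 * fma_per_sm_per_cycle;  // = 1152 (one FMA = mul + add)
  double gflops_per_sm = flops_per_sm_per_cycle * 1.5;         // = 1728 GFLOPS per SM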
tensorflow/core/profiler/utils/hardware_type_utils.h (new file, 31 lines)
@@ -0,0 +1,31 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
+
+#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Get peak single precision throughput of the GPU in GFLOPS per
+// streaming multiprocessor.
+double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
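One possible way to use the new header (a sketch, not part of this change): since GetFlopMaxThroughputPerSM reports a per-SM peak, a whole-device estimate can be obtained by scaling by the SM count, under the assumption that num_cores in DeviceCapabilities holds that count:

  #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
  #include "tensorflow/core/profiler/utils/hardware_type_utils.h"

  // Hypothetical helper: estimate whole-device peak single precision GFLOPS by
  // multiplying the per-SM peak by the number of SMs (assumed to be num_cores).
  double GetDevicePeakGFlops(
      const tensorflow::profiler::DeviceCapabilities& caps) {
    return tensorflow::profiler::GetFlopMaxThroughputPerSM(caps) *
           caps.num_cores();
  }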