Add GPU device capabilities and compute max FMA throughput.
PiperOrigin-RevId: 290881456 Change-Id: Ied22de910ec0fefb641f040b40d39f75631aecad
This commit is contained in:
parent
b55bd3a8ca
commit
2bfa43b081
@ -15,3 +15,16 @@ enum HardwareType {
|
|||||||
// TPU.
|
// TPU.
|
||||||
TPU = 3;
|
TPU = 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute capability of a CUDA GPU, stored as separate major and minor
// version numbers.
message CudaComputeCapability {
|
||||||
|
// Major version number. The profiler's hardware_type_utils.cc switches on
// this value to pick the GPU architecture (2 = Fermi ... 7 = Volta/Turing).
uint32 major = 1;
|
||||||
|
// Minor version number. hardware_type_utils.cc only consults it for major
// version 6, to tell Pascal SM60 apart from SM61/62.
uint32 minor = 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hardware characteristics of a single device, as recorded by the profiler.
message DeviceCapabilities {
|
||||||
|
// Device clock rate in GHz. Used as the cycles-per-nanosecond multiplier when
// converting per-cycle FMA throughput into GFLOPS.
double clock_rate_in_ghz = 1;
|
||||||
|
// Number of cores on the device.
// NOTE(review): for GPUs this presumably counts streaming multiprocessors --
// confirm against the code that populates it.
uint32 num_cores = 2;
|
||||||
|
// Device memory size in bytes.
uint64 memory_size_in_bytes = 3;
|
||||||
|
// Device memory bandwidth.
// NOTE(review): the unit is not stated here (bytes/s?) -- confirm with the
// producer of this field.
uint64 memory_bandwidth = 4;
|
||||||
|
// CUDA compute capability of the device; read by hardware_type_utils.cc to
// select the per-SM FMA throughput.
CudaComputeCapability compute_capability = 5;
|
||||||
|
}
|
||||||
|
@ -24,6 +24,16 @@ cc_library(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Helpers that derive performance figures (e.g. peak FLOP throughput per SM)
# from the DeviceCapabilities proto.
cc_library(
|
||||||
|
name = "hardware_type_utils",
|
||||||
|
srcs = ["hardware_type_utils.cc"],
|
||||||
|
hdrs = ["hardware_type_utils.h"],
|
||||||
|
deps = [
|
||||||
|
# Core TensorFlow library target.
# NOTE(review): presumably the source of the uint32 typedef and LOG macro
# used by the .cc -- confirm.
"//tensorflow/core:lib",
|
||||||
|
# Generated C++ for hardware_types.proto (DeviceCapabilities et al.).
"//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "math_utils",
|
name = "math_utils",
|
||||||
hdrs = ["math_utils.h"],
|
hdrs = ["math_utils.h"],
|
||||||
|
76
tensorflow/core/profiler/utils/hardware_type_utils.cc
Normal file
76
tensorflow/core/profiler/utils/hardware_type_utils.cc
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
|
||||||
|
|
||||||
|
#include "tensorflow/core/platform/types.h"
|
||||||
|
|
||||||
|
namespace tensorflow {
|
||||||
|
namespace profiler {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Get theoretical upperbound of single precision FMA throughput of the GPU per
|
||||||
|
// cycle per streaming multiprocessor.
|
||||||
|
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
|
||||||
|
uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
|
||||||
|
uint32 n_fp32_cores = 0;
|
||||||
|
uint32 n_tc_cores = 0;
|
||||||
|
switch (device_cap.compute_capability().major()) {
|
||||||
|
case 2:
|
||||||
|
// Fermi
|
||||||
|
n_fp32_cores = 32;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
// Kepler
|
||||||
|
n_fp32_cores = 192;
|
||||||
|
break;
|
||||||
|
case 5:
|
||||||
|
// Maxwell
|
||||||
|
n_fp32_cores = 128;
|
||||||
|
break;
|
||||||
|
case 6:
|
||||||
|
// Pascal
|
||||||
|
if (device_cap.compute_capability().minor() > 0) {
|
||||||
|
// Pascal SM61/62
|
||||||
|
n_fp32_cores = 128;
|
||||||
|
} else {
|
||||||
|
// Pascal SM60
|
||||||
|
n_fp32_cores = 64;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
// Volta and Turing
|
||||||
|
n_fp32_cores = 64;
|
||||||
|
n_tc_cores = 8;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG(ERROR) << "Invalid GPU compute capability.";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// GPU TensorCore can execute 64 FMAs per cycle.
|
||||||
|
// https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
|
||||||
|
return n_fp32_cores + n_tc_cores * 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
|
||||||
|
// One FMA = 2 floating point operations, one multiply and one add.
|
||||||
|
return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
|
||||||
|
device_cap.clock_rate_in_ghz();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace profiler
|
||||||
|
} // namespace tensorflow
|
31
tensorflow/core/profiler/utils/hardware_type_utils.h
Normal file
31
tensorflow/core/profiler/utils/hardware_type_utils.h
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
|
||||||
|
#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
|
||||||
|
|
||||||
|
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
|
||||||
|
|
||||||
|
namespace tensorflow {
|
||||||
|
namespace profiler {
|
||||||
|
|
||||||
|
// Get peak single precision throughput of the GPU in GFLOPS per
|
||||||
|
// streaming multiprocessor.
// The value is derived from `device_cap.compute_capability()` (which selects
// the per-cycle FMA count) and `device_cap.clock_rate_in_ghz()`. The
// implementation in hardware_type_utils.cc logs an error and contributes zero
// FMAs when the compute capability major version is not one it recognizes.
|
||||||
|
double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);
|
||||||
|
|
||||||
|
} // namespace profiler
|
||||||
|
} // namespace tensorflow
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_
|
Loading…
Reference in New Issue
Block a user