Open source GPU KernelStats proto and utilities.
PiperOrigin-RevId: 296653029 Change-Id: I21fa5dd8d1e2252ce21a5dc1ad2bb815d1edaa02
This commit is contained in:
parent
179f7513ff
commit
b0d557ac35
tensorflow/core/profiler
@ -67,6 +67,7 @@ tf_proto_library(
|
||||
srcs = ["op_stats.proto"],
|
||||
cc_api_version = 2,
|
||||
protodeps = [
|
||||
":kernel_stats_proto",
|
||||
":op_metrics_proto",
|
||||
":steps_db_proto",
|
||||
],
|
||||
@ -75,6 +76,13 @@ tf_proto_library(
|
||||
],
|
||||
)
|
||||
|
||||
# Proto library for kernel_stats.proto (KernelReport / KernelStatsDb messages).
# Listed in the protodeps of the rule that builds op_stats.proto, which imports
# kernel_stats.proto.
tf_proto_library(
|
||||
name = "kernel_stats_proto",
|
||||
srcs = ["kernel_stats.proto"],
|
||||
cc_api_version = 2,
|
||||
visibility = [":friends"],
|
||||
)
|
||||
|
||||
# This proto is deprecated and not guaranteed to be compatible across versions.
|
||||
# Please don't depend on it in new projects unless you have double-checked that doing so is acceptable.
|
||||
tf_proto_library(
|
||||
|
37
tensorflow/core/profiler/protobuf/kernel_stats.proto
Normal file
37
tensorflow/core/profiler/protobuf/kernel_stats.proto
Normal file
@ -0,0 +1,37 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package tensorflow.profiler;
|
||||
|
||||
// Launch configuration and timing statistics for one kernel.
// KernelStatsDb.reports holds a list of these, aggregated by name.
message KernelReport {
|
||||
// Name of the kernel.
|
||||
string name = 1;
|
||||
// Registers per thread.
|
||||
uint32 registers_per_thread = 2;
|
||||
// Static shared memory in bytes.
|
||||
uint32 static_shmem_bytes = 3;
|
||||
// Dynamic shared memory in bytes.
|
||||
uint32 dynamic_shmem_bytes = 4;
|
||||
// Block dimensions. ParseKernelLaunchParams() in kernel_stats_utils.cc stores
// exactly three entries here, in x, y, z order, each defaulting to 1.
|
||||
repeated uint32 block_dim = 5;
|
||||
// Grid dimensions. ParseKernelLaunchParams() in kernel_stats_utils.cc stores
// exactly three entries here, in x, y, z order, each defaulting to 1.
|
||||
repeated uint32 grid_dim = 6;
|
||||
// Total duration of this kernel in nanoseconds.
// NOTE(review): presumably summed over all `occurrences` -- confirm with the
// code that fills this field.
|
||||
uint64 total_duration_ns = 7;
|
||||
// Min duration of kernel in nanoseconds.
|
||||
uint64 min_duration_ns = 8;
|
||||
// Max duration of kernel in nanoseconds.
|
||||
uint64 max_duration_ns = 9;
|
||||
// Kernel utilizes TensorCore instructions.
|
||||
bool is_kernel_using_tensor_core = 10;
|
||||
// Operation is eligible to use TensorCores.
|
||||
bool is_op_tensor_core_eligible = 11;
|
||||
// TF operation name.
|
||||
string op_name = 12;
|
||||
// Number of occurrences.
|
||||
uint32 occurrences = 13;
|
||||
}
|
||||
|
||||
// Collection of kernel reports. Embedded in OpStats as `kernel_stats_db`,
// which holds the kernel stats results from all GPUs.
message KernelStatsDb {
|
||||
// A list of kernels aggregated by name.
|
||||
repeated KernelReport reports = 1;
|
||||
}
|
@ -2,6 +2,7 @@ syntax = "proto3";
|
||||
|
||||
package tensorflow.profiler;
|
||||
|
||||
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
|
||||
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
|
||||
import "tensorflow/core/profiler/protobuf/steps_db.proto";
|
||||
|
||||
@ -99,4 +100,6 @@ message OpStats {
|
||||
StepDatabaseResult step_db = 4;
|
||||
// The run environment of this profiling session.
|
||||
RunEnvironment run_environment = 5;
|
||||
// Kernel stats results from all GPUs.
|
||||
KernelStatsDb kernel_stats_db = 6;
|
||||
}
|
||||
|
@ -304,3 +304,14 @@ tf_cc_test(
|
||||
"@com_google_absl//absl/strings",
|
||||
],
|
||||
)
|
||||
|
||||
# Helpers for KernelReport protos: launch-parameter parsing, TensorCore
# detection by kernel/op name, and less-than / equal-to comparators.
cc_library(
|
||||
name = "kernel_stats_utils",
|
||||
srcs = ["kernel_stats_utils.cc"],
|
||||
hdrs = ["kernel_stats_utils.h"],
|
||||
deps = [
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
|
||||
"@com_google_absl//absl/strings",
|
||||
],
|
||||
)
|
||||
|
169
tensorflow/core/profiler/utils/kernel_stats_utils.cc
Normal file
169
tensorflow/core/profiler/utils/kernel_stats_utils.cc
Normal file
@ -0,0 +1,169 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
|
||||
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/match.h"
|
||||
#include "absl/strings/numbers.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_split.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
// Populates the launch-configuration fields of `kernel` from a KernelDetails
// XStat string.
//
// `xstat_kernel_details` is split on ':' and '\n'; the resulting tokens are
// read as alternating key/value pairs. Recognised keys are
// "registers_per_thread", "static_shared_memory_usage",
// "dynamic_shared_memory_usage", "block_x|y|z" and "grid_x|y|z". Pairs with an
// unknown key, or whose value does not parse as an unsigned 32-bit integer,
// are ignored.
//
// Three entries (x, y, z), each initialised to 1, are always appended to both
// `block_dim` and `grid_dim` before parsing, so dimensions absent from the
// input keep the value 1.
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
                             KernelReport* kernel) {
  const std::vector<absl::string_view> params =
      absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(":\n"));

  constexpr uint32_t kNumDimensions = 3;
  for (uint32_t dim = 0; dim < kNumDimensions; ++dim) {
    kernel->add_block_dim(1);
    kernel->add_grid_dim(1);
  }

  // Process key/value pairs. The bound is `ii + 1 < size` rather than
  // `ii < size` so that a trailing key without a value (odd token count) is
  // skipped instead of reading one element past the end of `params`.
  for (size_t ii = 0; ii + 1 < params.size(); ii += 2) {
    const absl::string_view key = params[ii];
    uint32_t value = 0;
    if (!absl::SimpleAtoi(params[ii + 1], &value)) {
      continue;  // Not a number: leave the corresponding field untouched.
    }
    if (key == "registers_per_thread") {
      kernel->set_registers_per_thread(value);
    } else if (key == "static_shared_memory_usage") {
      kernel->set_static_shmem_bytes(value);
    } else if (key == "dynamic_shared_memory_usage") {
      kernel->set_dynamic_shmem_bytes(value);
    } else if (key == "block_x") {
      kernel->mutable_block_dim()->Set(0, value);
    } else if (key == "block_y") {
      kernel->mutable_block_dim()->Set(1, value);
    } else if (key == "block_z") {
      kernel->mutable_block_dim()->Set(2, value);
    } else if (key == "grid_x") {
      kernel->mutable_grid_dim()->Set(0, value);
    } else if (key == "grid_y") {
      kernel->mutable_grid_dim()->Set(1, value);
    } else if (key == "grid_z") {
      kernel->mutable_grid_dim()->Set(2, value);
    }
  }
}
|
||||
|
||||
// Returns true when `kernel_name` begins with one of the known TensorCore
// kernel-name prefixes listed below.
bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
  // Some examples: volta_h884gemm, volta_fp16_s884gemm,
  // turing_fp16_s1688cudnn_fp16
  const bool looks_like_tensor_kernel =
      absl::StrContains(kernel_name, "884") ||
      absl::StrContains(kernel_name, "1688");
#if defined(VLOG_IF)
  VLOG_IF(1, looks_like_tensor_kernel)
      << "Possible tensor kernel: " << kernel_name << "\n";
#endif  // defined(VLOG_IF)

  // Only names starting with one of these prefixes are reported as using
  // TensorCores; the substring check above is used for logging only.
  static constexpr const char* kTensorCorePrefixes[] = {
      "volta_i884",        "volta_h884",        "volta_s884",
      "volta_fp16_i884",   "volta_fp16_h884",   "volta_fp16_s884",
      "turing_i1688",      "turing_h1688",      "turing_s1688",
      "turing_fp16_i1688", "turing_fp16_h1688", "turing_fp16_s1688",
  };
  for (const char* prefix : kTensorCorePrefixes) {
    if (absl::StartsWith(kernel_name, prefix)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// This list is not exhaustive.
|
||||
bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
|
||||
return (absl::StrContains(tf_op_name, "Conv") ||
|
||||
absl::StrContains(tf_op_name, "Einsum"));
|
||||
}
|
||||
|
||||
bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
|
||||
const KernelReport& rhs) {
|
||||
// Disable formatting to keep vertical alignment for better readability,
|
||||
// and make it easier to reorder columns.
|
||||
// clang-format off
|
||||
auto lhs_tuple = std::make_tuple(
|
||||
lhs.name(),
|
||||
lhs.grid_dim(0),
|
||||
lhs.grid_dim(1),
|
||||
lhs.grid_dim(2),
|
||||
lhs.block_dim(0),
|
||||
lhs.block_dim(1),
|
||||
lhs.block_dim(2),
|
||||
lhs.registers_per_thread(),
|
||||
lhs.static_shmem_bytes(),
|
||||
lhs.dynamic_shmem_bytes(),
|
||||
lhs.is_kernel_using_tensor_core(),
|
||||
lhs.is_op_tensor_core_eligible(),
|
||||
lhs.op_name());
|
||||
|
||||
auto rhs_tuple = std::make_tuple(
|
||||
rhs.name(),
|
||||
rhs.grid_dim(0),
|
||||
rhs.grid_dim(1),
|
||||
rhs.grid_dim(2),
|
||||
rhs.block_dim(0),
|
||||
rhs.block_dim(1),
|
||||
rhs.block_dim(2),
|
||||
rhs.registers_per_thread(),
|
||||
rhs.static_shmem_bytes(),
|
||||
rhs.dynamic_shmem_bytes(),
|
||||
rhs.is_kernel_using_tensor_core(),
|
||||
rhs.is_op_tensor_core_eligible(),
|
||||
rhs.op_name());
|
||||
// clang-format on
|
||||
return lhs_tuple < rhs_tuple;
|
||||
}
|
||||
|
||||
bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
|
||||
const KernelReport& rhs) {
|
||||
// Disable formatting to keep vertical alignment for better readability,
|
||||
// and make it easier to reorder columns.
|
||||
// clang-format off
|
||||
// Put the most expensive string comparisons last.
|
||||
return (
|
||||
lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
|
||||
lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
|
||||
lhs.block_dim(0) == rhs.block_dim(0) &&
|
||||
lhs.block_dim(1) == rhs.block_dim(1) &&
|
||||
lhs.block_dim(2) == rhs.block_dim(2) &&
|
||||
lhs.grid_dim(0) == rhs.grid_dim(0) &&
|
||||
lhs.grid_dim(1) == rhs.grid_dim(1) &&
|
||||
lhs.grid_dim(2) == rhs.grid_dim(2) &&
|
||||
lhs.registers_per_thread() == rhs.registers_per_thread() &&
|
||||
lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
|
||||
lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
|
||||
lhs.name() == rhs.name() &&
|
||||
lhs.op_name() == rhs.op_name());
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
48
tensorflow/core/profiler/utils/kernel_stats_utils.h
Normal file
48
tensorflow/core/profiler/utils/kernel_stats_utils.h
Normal file
@ -0,0 +1,48 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
|
||||
#define TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
// Populates kernel launch information from a KernelDetails XStat.
//
// `xstat_kernel_details` is split on ':' and '\n' and read as alternating
// key/value tokens. Recognised keys: registers_per_thread,
// static_shared_memory_usage, dynamic_shared_memory_usage, block_x, block_y,
// block_z, grid_x, grid_y, grid_z. Values that do not parse as unsigned 32-bit
// integers are ignored.
//
// Always appends three entries (initialised to 1) to both `kernel->block_dim`
// and `kernel->grid_dim` before applying the parsed values.
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
|
||||
KernelReport* kernel);
|
||||
|
||||
// Returns true if kernel uses TensorCores.
// The decision is based on the name only: `kernel_name` must start with one of
// a fixed set of prefixes such as "volta_h884" or "turing_fp16_s1688".
|
||||
bool IsKernelUsingTensorCore(absl::string_view kernel_name);
|
||||
|
||||
// Returns true if operation is eligible to use TensorCores.
// The decision is based on the name only: `tf_op_name` must contain "Conv" or
// "Einsum". This list is not exhaustive.
|
||||
bool IsOpTensorCoreEligible(absl::string_view tf_op_name);
|
||||
|
||||
// Less than comparator for Kernel Reports.
// Orders lexicographically by name, grid_dim(0..2), block_dim(0..2),
// registers_per_thread, static_shmem_bytes, dynamic_shmem_bytes,
// is_kernel_using_tensor_core, is_op_tensor_core_eligible and op_name.
// Duration and occurrence fields are not part of the ordering.
// Reads indices 0..2 of grid_dim and block_dim on both reports.
|
||||
struct KernelReportLessThanComparator {
|
||||
bool operator()(const KernelReport& lhs, const KernelReport& rhs);
|
||||
};
|
||||
|
||||
// Equal to comparator for Kernel Reports.
// Two reports are equal when they match on the TensorCore flags,
// block_dim(0..2), grid_dim(0..2), registers_per_thread, static_shmem_bytes,
// dynamic_shmem_bytes, name and op_name. Duration and occurrence fields are
// not compared.
// Reads indices 0..2 of grid_dim and block_dim on both reports.
|
||||
struct KernelReportEqualToComparator {
|
||||
bool operator()(const KernelReport& lhs, const KernelReport& rhs);
|
||||
};
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
|
Loading…
Reference in New Issue
Block a user