Open source GPU KernelStats proto and utilities.

PiperOrigin-RevId: 296653029
Change-Id: I21fa5dd8d1e2252ce21a5dc1ad2bb815d1edaa02
Yi Situ 2020-02-22 15:10:32 -08:00 committed by TensorFlower Gardener
parent 179f7513ff
commit b0d557ac35
6 changed files with 276 additions and 0 deletions

tensorflow/core/profiler/protobuf/BUILD View File

@@ -67,6 +67,7 @@ tf_proto_library(
srcs = ["op_stats.proto"],
cc_api_version = 2,
protodeps = [
":kernel_stats_proto",
":op_metrics_proto",
":steps_db_proto",
],
@@ -75,6 +76,13 @@ tf_proto_library(
],
)
tf_proto_library(
name = "kernel_stats_proto",
srcs = ["kernel_stats.proto"],
cc_api_version = 2,
visibility = [":friends"],
)
# This proto is deprecated and not guaranteed to be compatible across versions.
# Please don't use it in new projects unless you have confirmed it is safe to depend on.
tf_proto_library(

tensorflow/core/profiler/protobuf/kernel_stats.proto View File

@@ -0,0 +1,37 @@
syntax = "proto3";
package tensorflow.profiler;
message KernelReport {
// Name of the kernel.
string name = 1;
// Registers per thread.
uint32 registers_per_thread = 2;
// Static shared memory in bytes.
uint32 static_shmem_bytes = 3;
// Dynamic shared memory in bytes.
uint32 dynamic_shmem_bytes = 4;
// Block dimensions.
repeated uint32 block_dim = 5;
// Grid dimensions.
repeated uint32 grid_dim = 6;
// Total duration of this kernel.
uint64 total_duration_ns = 7;
// Min duration of kernel in nanoseconds.
uint64 min_duration_ns = 8;
// Max duration of kernel in nanoseconds.
uint64 max_duration_ns = 9;
// Kernel utilizes TensorCore instructions.
bool is_kernel_using_tensor_core = 10;
// Operation is eligible to use TensorCores.
bool is_op_tensor_core_eligible = 11;
// TF operation name.
string op_name = 12;
// Number of occurrences.
uint32 occurrences = 13;
}
message KernelStatsDb {
// A list of kernels aggregated by name.
repeated KernelReport reports = 1;
}
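For reference, a minimal sketch of how the generated C++ API for these two messages fits together. The kernel name, op name, and all numeric values below are purely illustrative, not taken from this change.

#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"

// Builds a one-entry KernelStatsDb; every field value is made up.
tensorflow::profiler::KernelStatsDb MakeExampleDb() {
  tensorflow::profiler::KernelStatsDb db;
  tensorflow::profiler::KernelReport* report = db.add_reports();
  report->set_name("volta_h884gemm_64x64_nt");  // hypothetical kernel name
  report->set_registers_per_thread(96);
  report->set_static_shmem_bytes(0);
  report->set_dynamic_shmem_bytes(16384);
  for (int i = 0; i < 3; ++i) {  // x, y, z dimensions
    report->add_block_dim(i == 0 ? 128 : 1);
    report->add_grid_dim(i == 0 ? 1024 : 1);
  }
  report->set_total_duration_ns(250000);
  report->set_min_duration_ns(1000);
  report->set_max_duration_ns(2000);
  report->set_is_kernel_using_tensor_core(true);
  report->set_is_op_tensor_core_eligible(true);
  report->set_op_name("model/dense/MatMul");  // hypothetical TF op name
  report->set_occurrences(200);
  return db;
}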

tensorflow/core/profiler/protobuf/op_stats.proto View File

@@ -2,6 +2,7 @@ syntax = "proto3";
package tensorflow.profiler;
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
@@ -99,4 +100,6 @@ message OpStats {
StepDatabaseResult step_db = 4;
// The run environment of this profiling session.
RunEnvironment run_environment = 5;
// Kernel stats results from all GPUs.
KernelStatsDb kernel_stats_db = 6;
}

tensorflow/core/profiler/utils/BUILD View File

@@ -304,3 +304,14 @@ tf_cc_test(
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "kernel_stats_utils",
srcs = ["kernel_stats_utils.cc"],
hdrs = ["kernel_stats_utils.h"],
deps = [
"//tensorflow/core:lib",
"//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
"@com_google_absl//absl/strings",
],
)

tensorflow/core/profiler/utils/kernel_stats_utils.cc View File

@@ -0,0 +1,169 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include <tuple>
#include <vector>
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
namespace tensorflow {
namespace profiler {
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
KernelReport* kernel) {
const std::vector<absl::string_view> params =
absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(":\n"));
constexpr uint32_t kNumDimensions = 3;
for (uint32_t dim = 0; dim < kNumDimensions; ++dim) {
kernel->add_block_dim(1);
kernel->add_grid_dim(1);
}
// Process value pairs.
for (uint32_t ii = 0; ii + 1 < params.size(); ii += 2) {
uint32_t value = 0;
if (params[ii] == "registers_per_thread" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->set_registers_per_thread(value);
} else if (params[ii] == "static_shared_memory_usage" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->set_static_shmem_bytes(value);
} else if (params[ii] == "dynamic_shared_memory_usage" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->set_dynamic_shmem_bytes(value);
} else if (params[ii] == "block_x" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_block_dim()->Set(0, value);
} else if (params[ii] == "block_y" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_block_dim()->Set(1, value);
} else if (params[ii] == "block_z" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_block_dim()->Set(2, value);
} else if (params[ii] == "grid_x" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_grid_dim()->Set(0, value);
} else if (params[ii] == "grid_y" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_grid_dim()->Set(1, value);
} else if (params[ii] == "grid_z" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_grid_dim()->Set(2, value);
}
}
}
bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
// Some examples: volta_h884gemm, volta_fp16_s884gemm,
// turing_fp16_s1688cudnn_fp16
bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") ||
absl::StrContains(kernel_name, "1688");
#if defined(VLOG_IF)
VLOG_IF(1, possible_tensor_kernel)
<< "Possible tensor kernel: " << kernel_name << "\n";
#endif // defined(VLOG_IF)
return (absl::StartsWith(kernel_name, "volta_i884") ||
absl::StartsWith(kernel_name, "volta_h884") ||
absl::StartsWith(kernel_name, "volta_s884") ||
absl::StartsWith(kernel_name, "volta_fp16_i884") ||
absl::StartsWith(kernel_name, "volta_fp16_h884") ||
absl::StartsWith(kernel_name, "volta_fp16_s884") ||
absl::StartsWith(kernel_name, "turing_i1688") ||
absl::StartsWith(kernel_name, "turing_h1688") ||
absl::StartsWith(kernel_name, "turing_s1688") ||
absl::StartsWith(kernel_name, "turing_fp16_i1688") ||
absl::StartsWith(kernel_name, "turing_fp16_h1688") ||
absl::StartsWith(kernel_name, "turing_fp16_s1688"));
}
// This list is not exhaustive.
bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
return (absl::StrContains(tf_op_name, "Conv") ||
absl::StrContains(tf_op_name, "Einsum"));
}
bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
const KernelReport& rhs) {
// Disable formatting to keep vertical alignment for better readability,
// and make it easier to reorder columns.
// clang-format off
auto lhs_tuple = std::make_tuple(
lhs.name(),
lhs.grid_dim(0),
lhs.grid_dim(1),
lhs.grid_dim(2),
lhs.block_dim(0),
lhs.block_dim(1),
lhs.block_dim(2),
lhs.registers_per_thread(),
lhs.static_shmem_bytes(),
lhs.dynamic_shmem_bytes(),
lhs.is_kernel_using_tensor_core(),
lhs.is_op_tensor_core_eligible(),
lhs.op_name());
auto rhs_tuple = std::make_tuple(
rhs.name(),
rhs.grid_dim(0),
rhs.grid_dim(1),
rhs.grid_dim(2),
rhs.block_dim(0),
rhs.block_dim(1),
rhs.block_dim(2),
rhs.registers_per_thread(),
rhs.static_shmem_bytes(),
rhs.dynamic_shmem_bytes(),
rhs.is_kernel_using_tensor_core(),
rhs.is_op_tensor_core_eligible(),
rhs.op_name());
// clang-format on
return lhs_tuple < rhs_tuple;
}
bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
const KernelReport& rhs) {
// Disable formatting to keep vertical alignment for better readability,
// and make it easier to reorder columns.
// clang-format off
// Put the most expensive string comparisons last.
return (
lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
lhs.block_dim(0) == rhs.block_dim(0) &&
lhs.block_dim(1) == rhs.block_dim(1) &&
lhs.block_dim(2) == rhs.block_dim(2) &&
lhs.grid_dim(0) == rhs.grid_dim(0) &&
lhs.grid_dim(1) == rhs.grid_dim(1) &&
lhs.grid_dim(2) == rhs.grid_dim(2) &&
lhs.registers_per_thread() == rhs.registers_per_thread() &&
lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
lhs.name() == rhs.name() &&
lhs.op_name() == rhs.op_name());
// clang-format on
}
} // namespace profiler
} // namespace tensorflow
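The exact contents of the KernelDetails XStat are not shown in this change; the sketch below uses a hypothetical details string whose key:value-per-line layout matches the ':' and '\n' separators that ParseKernelLaunchParams splits on, and then applies the two classifier helpers. All names and values are invented for illustration.

#include <string>

#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

namespace tensorflow {
namespace profiler {

// Fills one KernelReport from a hypothetical KernelDetails string.
void FillExampleReport(KernelReport* report) {
  const std::string details =
      "registers_per_thread:32\n"
      "static_shared_memory_usage:0\n"
      "dynamic_shared_memory_usage:16384\n"
      "block_x:128\nblock_y:1\nblock_z:1\n"
      "grid_x:1024\ngrid_y:1\ngrid_z:1";
  ParseKernelLaunchParams(details, report);
  // Kernel and op names normally come from the profiler trace events;
  // these are placeholders.
  report->set_name("volta_s884gemm_fp16_128x128_nn");
  report->set_op_name("model/conv2d/Conv2D");
  report->set_is_kernel_using_tensor_core(
      IsKernelUsingTensorCore(report->name()));
  report->set_is_op_tensor_core_eligible(
      IsOpTensorCoreEligible(report->op_name()));
}

}  // namespace profiler
}  // namespace tensorflow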

tensorflow/core/profiler/utils/kernel_stats_utils.h View File

@@ -0,0 +1,48 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
#define TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
#include "absl/strings/string_view.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
namespace tensorflow {
namespace profiler {
// Populates kernel launch information from a KernelDetails XStat.
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
KernelReport* kernel);
// Returns true if kernel uses TensorCores.
bool IsKernelUsingTensorCore(absl::string_view kernel_name);
// Returns true if operation is eligible to use TensorCores.
bool IsOpTensorCoreEligible(absl::string_view tf_op_name);
// Less than comparator for Kernel Reports.
struct KernelReportLessThanComparator {
bool operator()(const KernelReport& lhs, const KernelReport& rhs);
};
// Equal to comparator for Kernel Reports.
struct KernelReportEqualToComparator {
bool operator()(const KernelReport& lhs, const KernelReport& rhs);
};
} // namespace profiler
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
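The two comparators are not exercised elsewhere in this commit; one plausible use is to sort a batch of reports and drop exact duplicates (all compared fields equal), as sketched below. Note that the comparators index grid_dim/block_dim 0 through 2, so each report must have all three dimensions populated, which ParseKernelLaunchParams guarantees.

#include <algorithm>
#include <vector>

#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

// Sorts reports and erases consecutive duplicates. Every report is
// expected to carry three block_dim and three grid_dim entries.
void SortAndDedupeReports(
    std::vector<tensorflow::profiler::KernelReport>* reports) {
  std::sort(reports->begin(), reports->end(),
            tensorflow::profiler::KernelReportLessThanComparator());
  reports->erase(
      std::unique(reports->begin(), reports->end(),
                  tensorflow::profiler::KernelReportEqualToComparator()),
      reports->end());
}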