Open source GPU KernelStats proto and utilities.

PiperOrigin-RevId: 296653029
Change-Id: I21fa5dd8d1e2252ce21a5dc1ad2bb815d1edaa02
Yi Situ 2020-02-22 15:10:32 -08:00 committed by TensorFlower Gardener
parent 179f7513ff
commit b0d557ac35
6 changed files with 276 additions and 0 deletions

tensorflow/core/profiler/protobuf/BUILD View File

@@ -67,6 +67,7 @@ tf_proto_library(
srcs = ["op_stats.proto"],
cc_api_version = 2,
protodeps = [
":kernel_stats_proto",
":op_metrics_proto",
":steps_db_proto",
],
@@ -75,6 +76,13 @@ tf_proto_library(
],
)
tf_proto_library(
name = "kernel_stats_proto",
srcs = ["kernel_stats.proto"],
cc_api_version = 2,
visibility = [":friends"],
)
# This proto is deprecated and not guaranteed to be compatible across versions.
# Please don't use it in new projects unless you have confirmed it is safe to depend on.
tf_proto_library(

tensorflow/core/profiler/protobuf/kernel_stats.proto View File

@@ -0,0 +1,37 @@
syntax = "proto3";
package tensorflow.profiler;
message KernelReport {
// Name of the kernel.
string name = 1;
// Registers per thread.
uint32 registers_per_thread = 2;
// Static shared memory in bytes.
uint32 static_shmem_bytes = 3;
// Dynamic shared memory in bytes.
uint32 dynamic_shmem_bytes = 4;
// Block dimensions.
repeated uint32 block_dim = 5;
// Grid dimensions.
repeated uint32 grid_dim = 6;
// Total duration of this kernel.
uint64 total_duration_ns = 7;
// Min duration of kernel in nanoseconds.
uint64 min_duration_ns = 8;
// Max duration of kernel in nanoseconds.
uint64 max_duration_ns = 9;
// Kernel utilizes TensorCore instructions.
bool is_kernel_using_tensor_core = 10;
// Operation is eligible to use TensorCores.
bool is_op_tensor_core_eligible = 11;
// TF operation name.
string op_name = 12;
// Number of occurrences.
uint32 occurrences = 13;
}
message KernelStatsDb {
// A list of kernels aggregated by name.
repeated KernelReport reports = 1;
}
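For reference, a minimal sketch of how the generated C++ API for these two messages fits together. The kernel name, op name, and all numeric values below are purely illustrative, not taken from this change.

#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"

// Builds a one-entry KernelStatsDb; every field value is made up.
tensorflow::profiler::KernelStatsDb MakeExampleDb() {
  tensorflow::profiler::KernelStatsDb db;
  tensorflow::profiler::KernelReport* report = db.add_reports();
  report->set_name("volta_h884gemm_64x64_nt");  // hypothetical kernel name
  report->set_registers_per_thread(96);
  report->set_static_shmem_bytes(0);
  report->set_dynamic_shmem_bytes(16384);
  for (int i = 0; i < 3; ++i) {  // x, y, z dimensions
    report->add_block_dim(i == 0 ? 128 : 1);
    report->add_grid_dim(i == 0 ? 1024 : 1);
  }
  report->set_total_duration_ns(250000);
  report->set_min_duration_ns(1000);
  report->set_max_duration_ns(2000);
  report->set_is_kernel_using_tensor_core(true);
  report->set_is_op_tensor_core_eligible(true);
  report->set_op_name("model/dense/MatMul");  // hypothetical TF op name
  report->set_occurrences(200);
  return db;
}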

tensorflow/core/profiler/protobuf/op_stats.proto View File

@@ -2,6 +2,7 @@ syntax = "proto3";
package tensorflow.profiler;
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
@@ -99,4 +100,6 @@ message OpStats {
StepDatabaseResult step_db = 4;
// The run environment of this profiling session.
RunEnvironment run_environment = 5;
// Kernel stats results from all GPUs.
KernelStatsDb kernel_stats_db = 6;
}

tensorflow/core/profiler/utils/BUILD View File

@@ -304,3 +304,14 @@ tf_cc_test(
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "kernel_stats_utils",
srcs = ["kernel_stats_utils.cc"],
hdrs = ["kernel_stats_utils.h"],
deps = [
"//tensorflow/core:lib",
"//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
"@com_google_absl//absl/strings",
],
)

tensorflow/core/profiler/utils/kernel_stats_utils.cc View File

@@ -0,0 +1,169 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include <tuple>
#include <vector>
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
namespace tensorflow {
namespace profiler {
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
KernelReport* kernel) {
const std::vector<absl::string_view> params =
absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(":\n"));
constexpr uint32_t kNumDimensions = 3;
for (uint32_t dim = 0; dim < kNumDimensions; ++dim) {
kernel->add_block_dim(1);
kernel->add_grid_dim(1);
}
// Process value pairs.
for (uint32_t ii = 0; ii + 1 < params.size(); ii += 2) {
uint32_t value = 0;
if (params[ii] == "registers_per_thread" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->set_registers_per_thread(value);
} else if (params[ii] == "static_shared_memory_usage" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->set_static_shmem_bytes(value);
} else if (params[ii] == "dynamic_shared_memory_usage" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->set_dynamic_shmem_bytes(value);
} else if (params[ii] == "block_x" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_block_dim()->Set(0, value);
} else if (params[ii] == "block_y" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_block_dim()->Set(1, value);
} else if (params[ii] == "block_z" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_block_dim()->Set(2, value);
} else if (params[ii] == "grid_x" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_grid_dim()->Set(0, value);
} else if (params[ii] == "grid_y" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_grid_dim()->Set(1, value);
} else if (params[ii] == "grid_z" &&
absl::SimpleAtoi(params[ii + 1], &value)) {
kernel->mutable_grid_dim()->Set(2, value);
}
}
}
bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
// Some examples: volta_h884gemm, volta_fp16_s884gemm,
// turing_fp16_s1688cudnn_fp16
bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") ||
absl::StrContains(kernel_name, "1688");
#if defined(VLOG_IF)
VLOG_IF(1, possible_tensor_kernel)
<< "Possible tensor kernel: " << kernel_name << "\n";
#endif // defined(VLOG_IF)
return (absl::StartsWith(kernel_name, "volta_i884") ||
absl::StartsWith(kernel_name, "volta_h884") ||
absl::StartsWith(kernel_name, "volta_s884") ||
absl::StartsWith(kernel_name, "volta_fp16_i884") ||
absl::StartsWith(kernel_name, "volta_fp16_h884") ||
absl::StartsWith(kernel_name, "volta_fp16_s884") ||
absl::StartsWith(kernel_name, "turing_i1688") ||
absl::StartsWith(kernel_name, "turing_h1688") ||
absl::StartsWith(kernel_name, "turing_s1688") ||
absl::StartsWith(kernel_name, "turing_fp16_i1688") ||
absl::StartsWith(kernel_name, "turing_fp16_h1688") ||
absl::StartsWith(kernel_name, "turing_fp16_s1688"));
}
// This list is not exhaustive.
bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
return (absl::StrContains(tf_op_name, "Conv") ||
absl::StrContains(tf_op_name, "Einsum"));
}
bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
const KernelReport& rhs) {
// Disable formatting to keep vertical alignment for better readability,
// and make it easier to reorder columns.
// clang-format off
auto lhs_tuple = std::make_tuple(
lhs.name(),
lhs.grid_dim(0),
lhs.grid_dim(1),
lhs.grid_dim(2),
lhs.block_dim(0),
lhs.block_dim(1),
lhs.block_dim(2),
lhs.registers_per_thread(),
lhs.static_shmem_bytes(),
lhs.dynamic_shmem_bytes(),
lhs.is_kernel_using_tensor_core(),
lhs.is_op_tensor_core_eligible(),
lhs.op_name());
auto rhs_tuple = std::make_tuple(
rhs.name(),
rhs.grid_dim(0),
rhs.grid_dim(1),
rhs.grid_dim(2),
rhs.block_dim(0),
rhs.block_dim(1),
rhs.block_dim(2),
rhs.registers_per_thread(),
rhs.static_shmem_bytes(),
rhs.dynamic_shmem_bytes(),
rhs.is_kernel_using_tensor_core(),
rhs.is_op_tensor_core_eligible(),
rhs.op_name());
// clang-format on
return lhs_tuple < rhs_tuple;
}
bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
const KernelReport& rhs) {
// Disable formatting to keep vertical alignment for better readability,
// and make it easier to reorder columns.
// clang-format off
// Put the most expensive string comparisons last.
return (
lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
lhs.block_dim(0) == rhs.block_dim(0) &&
lhs.block_dim(1) == rhs.block_dim(1) &&
lhs.block_dim(2) == rhs.block_dim(2) &&
lhs.grid_dim(0) == rhs.grid_dim(0) &&
lhs.grid_dim(1) == rhs.grid_dim(1) &&
lhs.grid_dim(2) == rhs.grid_dim(2) &&
lhs.registers_per_thread() == rhs.registers_per_thread() &&
lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
lhs.name() == rhs.name() &&
lhs.op_name() == rhs.op_name());
// clang-format on
}
} // namespace profiler
} // namespace tensorflow
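The exact contents of the KernelDetails XStat are not shown in this change; the sketch below uses a hypothetical details string whose key:value-per-line layout matches the ':' and '\n' separators that ParseKernelLaunchParams splits on, and then applies the two classifier helpers. All names and values are invented for illustration.

#include <string>

#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

namespace tensorflow {
namespace profiler {

// Fills one KernelReport from a hypothetical KernelDetails string.
void FillExampleReport(KernelReport* report) {
  const std::string details =
      "registers_per_thread:32\n"
      "static_shared_memory_usage:0\n"
      "dynamic_shared_memory_usage:16384\n"
      "block_x:128\nblock_y:1\nblock_z:1\n"
      "grid_x:1024\ngrid_y:1\ngrid_z:1";
  ParseKernelLaunchParams(details, report);
  // Kernel and op names normally come from the profiler trace events;
  // these are placeholders.
  report->set_name("volta_s884gemm_fp16_128x128_nn");
  report->set_op_name("model/conv2d/Conv2D");
  report->set_is_kernel_using_tensor_core(
      IsKernelUsingTensorCore(report->name()));
  report->set_is_op_tensor_core_eligible(
      IsOpTensorCoreEligible(report->op_name()));
}

}  // namespace profiler
}  // namespace tensorflow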

tensorflow/core/profiler/utils/kernel_stats_utils.h View File

@@ -0,0 +1,48 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
#define TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
#include "absl/strings/string_view.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
namespace tensorflow {
namespace profiler {
// Populates kernel launch information from a KernelDetails XStat.
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
KernelReport* kernel);
// Returns true if kernel uses TensorCores.
bool IsKernelUsingTensorCore(absl::string_view kernel_name);
// Returns true if operation is eligible to use TensorCores.
bool IsOpTensorCoreEligible(absl::string_view tf_op_name);
// Less than comparator for Kernel Reports.
struct KernelReportLessThanComparator {
bool operator()(const KernelReport& lhs, const KernelReport& rhs);
};
// Equal to comparator for Kernel Reports.
struct KernelReportEqualToComparator {
bool operator()(const KernelReport& lhs, const KernelReport& rhs);
};
} // namespace profiler
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_
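The two comparators are not exercised elsewhere in this commit; one plausible use is to sort a batch of reports and drop exact duplicates (all compared fields equal), as sketched below. Note that the comparators index grid_dim/block_dim 0 through 2, so each report must have all three dimensions populated, which ParseKernelLaunchParams guarantees.

#include <algorithm>
#include <vector>

#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

// Sorts reports and erases consecutive duplicates. Every report is
// expected to carry three block_dim and three grid_dim entries.
void SortAndDedupeReports(
    std::vector<tensorflow::profiler::KernelReport>* reports) {
  std::sort(reports->begin(), reports->end(),
            tensorflow::profiler::KernelReportLessThanComparator());
  reports->erase(
      std::unique(reports->begin(), reports->end(),
                  tensorflow::profiler::KernelReportEqualToComparator()),
      reports->end());
}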