From 3af5c069b6b30c744d4f629ed18f2927e20d5790 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 16 Dec 2020 23:48:33 -0800
Subject: [PATCH] add nvtx range push/pop tracking. use that information as a
 stats in kernel events.

PiperOrigin-RevId: 347969357
Change-Id: I8fa5a7bc10e27319011ffe4901a53b28845e2aea
---
 tensorflow/core/profiler/internal/gpu/BUILD   | 12 ++++
 .../profiler/internal/gpu/cupti_collector.cc  | 26 +++++---
 .../profiler/internal/gpu/cupti_collector.h   | 14 +++-
 .../profiler/internal/gpu/cupti_tracer.cc     | 66 +++++++++++++++----
 .../core/profiler/internal/gpu/cupti_tracer.h |  5 ++
 .../core/profiler/internal/gpu/nvtx_utils.cc  | 30 +++++++++
 .../core/profiler/internal/gpu/nvtx_utils.h   | 58 ++++++++++++++++
 .../core/profiler/utils/xplane_schema.cc      |  1 +
 .../core/profiler/utils/xplane_schema.h       |  1 +
 9 files changed, 188 insertions(+), 25 deletions(-)
 create mode 100644 tensorflow/core/profiler/internal/gpu/nvtx_utils.cc
 create mode 100644 tensorflow/core/profiler/internal/gpu/nvtx_utils.h

diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD
index 40773c6cb98..681670a2415 100644
--- a/tensorflow/core/profiler/internal/gpu/BUILD
+++ b/tensorflow/core/profiler/internal/gpu/BUILD
@@ -127,8 +127,10 @@ tf_cuda_library(
         ":cupti_collector",
         ":cupti_interface",
         ":cupti_utils",
+        ":nvtx_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core/profiler/internal/cpu:annotation_stack",
+        "//tensorflow/core/profiler/lib:scoped_annotation",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/container:node_hash_set",
@@ -136,6 +138,16 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "nvtx_utils",
+    srcs = if_cuda_is_configured_compat(["nvtx_utils.cc"]),
+    hdrs = if_cuda_is_configured_compat(["nvtx_utils.h"]),
+    copts = tf_profiler_copts() + tf_copts(),
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_cuda_library(
     name = "cupti_collector",
     srcs = if_cuda_is_configured_compat(["cupti_collector.cc"]),
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_collector.cc b/tensorflow/core/profiler/internal/gpu/cupti_collector.cc
index ab4061a3749..42d97d17b03 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_collector.cc
+++ b/tensorflow/core/profiler/internal/gpu/cupti_collector.cc
@@ -160,6 +160,11 @@ struct PerDeviceCollector {
                               GetStatTypeStr(StatType::kKernelAnnotation)),
                           *plane->GetOrCreateStatMetadata(event.annotation));
     }
+    if (!event.nvtx_range.empty()) {
+      xevent.AddStatValue(
+          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
+          *plane->GetOrCreateStatMetadata(event.nvtx_range));
+    }
     if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
       xevent.AddStatValue(
           *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
@@ -547,8 +552,9 @@ struct PerDeviceCollector {
 }  // namespace
 
 void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
-                        const std::string& annotation) {
-  if (annotation.empty()) return;
+                        const absl::string_view annotation,
+                        const absl::string_view nvtx_range) {
+  if (annotation.empty() && nvtx_range.empty()) return;
   VLOG(3) << "Add annotation: device_id: " << device_id
           << " correlation_id: " << correlation_id
           << " annotation: " << annotation;
@@ -556,20 +562,22 @@ void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
   auto& per_device_map = per_device_map_[device_id];
   absl::MutexLock lock(&per_device_map.mutex);
   if (per_device_map.annotations.size() < max_size_) {
-    absl::string_view annotation_str =
-        *per_device_map.annotations.insert(annotation).first;
-    per_device_map.correlation_map.emplace(correlation_id, annotation_str);
+    AnnotationInfo info;
+    info.annotation = *per_device_map.annotations.emplace(annotation).first;
+    if (!nvtx_range.empty())
+      info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
+    per_device_map.correlation_map.emplace(correlation_id, info);
   }
 }
 
-absl::string_view AnnotationMap::LookUp(uint32 device_id,
-                                        uint32 correlation_id) {
-  if (device_id >= per_device_map_.size()) return absl::string_view();
+AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
+                                                    uint32 correlation_id) {
+  if (device_id >= per_device_map_.size()) return AnnotationInfo();
   auto& per_device_map = per_device_map_[device_id];
   absl::MutexLock lock(&per_device_map.mutex);
   auto it = per_device_map.correlation_map.find(correlation_id);
   return it != per_device_map.correlation_map.end() ? it->second
-                                                    : absl::string_view();
+                                                    : AnnotationInfo();
 }
 
 // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_collector.h b/tensorflow/core/profiler/internal/gpu/cupti_collector.h
index ada6cec1d2d..d303c587be0 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_collector.h
+++ b/tensorflow/core/profiler/internal/gpu/cupti_collector.h
@@ -127,6 +127,7 @@ struct CuptiTracerEvent {
   // This points to strings in AnnotationMap, which should outlive the point
   // where serialization happens.
   absl::string_view annotation;
+  absl::string_view nvtx_range;
   uint64 start_time_ns = 0;
   uint64 end_time_ns = 0;
   uint32 device_id = 0;
@@ -156,11 +157,17 @@ struct CuptiTracerCollectorOptions {
 
 class AnnotationMap {
  public:
+  struct AnnotationInfo {
+    absl::string_view annotation;
+    absl::string_view nvtx_range;
+  };
+
   explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
       : max_size_(max_size), per_device_map_(num_gpus) {}
   void Add(uint32 device_id, uint32 correlation_id,
-           const std::string& annotation);
-  absl::string_view LookUp(uint32 device_id, uint32 correlation_id);
+           const absl::string_view annotation,
+           const absl::string_view nvtx_range);
+  AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
 
  private:
   struct PerDeviceAnnotationMap {
@@ -170,7 +177,8 @@ class AnnotationMap {
     // Annotation tends to be repetitive, use a hash_set to store the strings,
     // an use the reference to the string in the map.
     absl::node_hash_set<std::string> annotations;
-    absl::flat_hash_map<uint32, absl::string_view> correlation_map;
+    absl::node_hash_set<std::string> nvtx_ranges;
+    absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
   };
   const uint64 max_size_;
   absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
index 51a04af5442..6d04aebec90 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/node_hash_map.h"
 #include "absl/container/node_hash_set.h"
+#include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/host_info.h"
@@ -26,6 +27,8 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
+#include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
+#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
 
 namespace tensorflow {
 namespace profiler {
@@ -418,8 +421,10 @@ void AddKernelActivityEvent(CuptiTraceCollector *collector,
   event.context_id = kernel->contextId;
   event.stream_id = kernel->streamId;
   event.correlation_id = kernel->correlationId;
-  event.annotation = collector->annotation_map()->LookUp(event.device_id,
-                                                         event.correlation_id);
+  AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
+      event.device_id, event.correlation_id);
+  event.annotation = info.annotation;
+  event.nvtx_range = info.nvtx_range;
   event.kernel_info.registers_per_thread = kernel->registersPerThread;
   event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
   event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
@@ -464,8 +469,9 @@ void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
   event.context_id = memcpy->contextId;
   event.stream_id = memcpy->streamId;
   event.correlation_id = memcpy->correlationId;
-  event.annotation = collector->annotation_map()->LookUp(event.device_id,
-                                                         event.correlation_id);
+  AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
+      event.device_id, event.correlation_id);
+  event.annotation = info.annotation;
   event.memcpy_info.kind = memcpy->copyKind;
   event.memcpy_info.num_bytes = memcpy->bytes;
   event.memcpy_info.destination = memcpy->deviceId;
@@ -488,8 +494,9 @@ void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
   event.context_id = memcpy2->contextId;
   event.stream_id = memcpy2->streamId;
   event.correlation_id = memcpy2->correlationId;
-  event.annotation = collector->annotation_map()->LookUp(event.device_id,
-                                                         event.correlation_id);
+  AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
+      event.device_id, event.correlation_id);
+  event.annotation = info.annotation;
   event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
   event.memcpy_info.num_bytes = memcpy2->bytes;
   event.memcpy_info.destination = memcpy2->dstDeviceId;
@@ -946,8 +953,9 @@ class CudaEventRecorder {
     event.context_id = stream_info.ctx_info->context_id;
     event.stream_id = stream_info.stream_id;
     event.correlation_id = record.correlation_id;
-    event.annotation =
-        annotation_map->LookUp(event.device_id, event.correlation_id);
+    AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
+        event.device_id, event.correlation_id);
+    event.annotation = info.annotation;
     event.kernel_info = record.details;
     collector_->AddEvent(std::move(event));
     return Status::OK();
@@ -974,8 +982,9 @@ class CudaEventRecorder {
     event.context_id = stream_info.ctx_info->context_id;
     event.stream_id = stream_info.stream_id;
     event.correlation_id = record.correlation_id;
-    event.annotation =
-        annotation_map->LookUp(event.device_id, event.correlation_id);
+    AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
+        event.device_id, event.correlation_id);
+    event.annotation = info.annotation;
     event.memcpy_info.num_bytes = record.size_bytes;
     // TODO: support MemcpyD2D where destination != source;
     event.memcpy_info.destination = ordinal_;
@@ -1063,7 +1072,7 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
           // Because annotation are per device, therefore we need to populate
           // annotation for each device involved.
           collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
-                                            annotation);
+                                            annotation, "");
           record_indices.push_back(
               cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
                   "CooperativeKernelMultiDevice", *context,
@@ -1425,6 +1434,11 @@ Status CuptiTracer::EnableApiTracing() {
     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
   }
+
+  if (option_->enable_nvtx_tracking) {
+    RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
+        1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
+  }
   return Status::OK();
 }
 
@@ -1443,6 +1457,11 @@ Status CuptiTracer::DisableApiTracing() {
         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
   }
 
+  if (option_->enable_nvtx_tracking) {
+    RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
+        0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
+  }
+
   VLOG(1) << "Disable subscriber";
   RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
   return Status::OK();
@@ -1510,11 +1529,31 @@ Status CuptiTracer::Finalize() {
   return 0;
 }
 
+Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
+                                       const CUpti_CallbackData *cbdata) {
+  const CUpti_NvtxData *pdata =
+      reinterpret_cast<const CUpti_NvtxData *>(cbdata);
+  if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
+    const nvtxDomainRangePushEx_params *params =
+        reinterpret_cast<const nvtxDomainRangePushEx_params *>(
+            pdata->functionParams);
+    // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
+    // (which is 3), However it seems to me that we can not get the registered
+    // string from nvtxDomainRegisterStringA_params. If we reinterpret the
+    // payload as ascii, it happen to work.
+    NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
+  } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
+    NVTXRangeTracker::ExitRange();
+  }
+  return Status::OK();
+}
+
 Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid,
                                    const CUpti_CallbackData *cbdata) {
   if (!api_tracing_enabled_) return Status::OK();  // already unsubscribed.
   if (!cupti_driver_api_hook_) return Status::OK();  // already unsubscribed.
+  if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
   if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
   if (internalCuCall) return Status::OK();
 
@@ -1546,11 +1585,12 @@ Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
         // we need to populate per device annotation map respectively.
         for (int i = 0; i < num_gpus_; ++i) {
           collector_->annotation_map()->Add(i, cbdata->correlationId,
-                                            annotation);
+                                            annotation, "");
         }
       } else {
+        absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
         collector_->annotation_map()->Add(device_id, cbdata->correlationId,
-                                          annotation);
+                                          annotation, nvtx_range);
       }
     }
 
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
index 3f7a2d4d7e1..970c4f9d252 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
+#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/status.h"
@@ -50,6 +51,8 @@ struct CuptiTracerOptions {
   bool cupti_finalize = false;
   // Whether to call cuCtxSynchronize for each device before Stop().
   bool sync_devices_before_stop = false;
+  // Whether to enable NVTX tracking, we need this for TensorRT tracking.
+  bool enable_nvtx_tracking = false;
 };
 
 class CuptiDriverApiHook {
@@ -111,6 +114,8 @@ class CuptiTracer {
   Status DisableActivityTracing();
   Status Finalize();
   void ConfigureActivityUnifiedMemoryCounter(bool enable);
+  Status HandleNVTXCallback(CUpti_CallbackId cbid,
+                            const CUpti_CallbackData* cbdata);
 
   int num_gpus_;
   absl::optional<CuptiTracerOptions> option_;
diff --git a/tensorflow/core/profiler/internal/gpu/nvtx_utils.cc b/tensorflow/core/profiler/internal/gpu/nvtx_utils.cc
new file mode 100644
index 00000000000..ace1533c9b4
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/nvtx_utils.cc
@@ -0,0 +1,30 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
+
+#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h"
+#include "tensorflow/core/platform/platform.h"
+
+namespace tensorflow {
+namespace profiler {
+
+/*static*/ std::stack<std::string> &NVTXRangeTracker::GetRangeStack() {
+  static thread_local std::stack<std::string> range_stack;
+  return range_stack;
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/gpu/nvtx_utils.h b/tensorflow/core/profiler/internal/gpu/nvtx_utils.h
new file mode 100644
index 00000000000..b9085fa03e9
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/nvtx_utils.h
@@ -0,0 +1,58 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
+
+#include <stack>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace profiler {
+
+/***
+ * We have no intention to use NVTX in tensorflow right now, we use this class
+ * to track NVTX instrumentation inside NVIDIA libraries (such as TensorRT).
+ * This bears a lot of resemblance to ScopedAnnotation for now.  In the future,
+ * we will use TraceMe to keep track trace context within a thread.
+ */
+class NVTXRangeTracker {
+ public:
+  static void EnterRange(const std::string& range) {
+    auto& range_stack = GetRangeStack();
+    range_stack.push(range);
+  }
+  static void ExitRange() {
+    auto& range_stack = GetRangeStack();
+    if (!range_stack.empty()) range_stack.pop();
+  }
+  static const absl::string_view CurrentRange() {
+    auto& range_stack = GetRangeStack();
+    if (!range_stack.empty()) return range_stack.top();
+    return "";
+  }
+
+ private:
+  static std::stack<std::string>& GetRangeStack();
+
+  TF_DISALLOW_COPY_AND_ASSIGN(NVTXRangeTracker);
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc
index 7dd00c4376f..f0ea78dbe24 100644
--- a/tensorflow/core/profiler/utils/xplane_schema.cc
+++ b/tensorflow/core/profiler/utils/xplane_schema.cc
@@ -176,6 +176,7 @@ const StatTypeMap& GetStatTypeMap() {
       {"memalloc_details", kMemallocDetails},
       {"kernel_details", kKernelDetails},
       {"annotation", kKernelAnnotation},
+      {"nvtx_range", kNVTXRange},
       {"stream", kStream},
       // Stats added when processing traces.
       {"group_id", kGroupId},
diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h
index ad4c100c96c..ca51874e637 100644
--- a/tensorflow/core/profiler/utils/xplane_schema.h
+++ b/tensorflow/core/profiler/utils/xplane_schema.h
@@ -164,6 +164,7 @@ enum StatType {
   kMemcpyDetails,
   kMemallocDetails,
   kKernelAnnotation,
+  kNVTXRange,
   kKernelDetails,
   kStream,
   // Stats added when processing traces.