From 3af5c069b6b30c744d4f629ed18f2927e20d5790 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 16 Dec 2020 23:48:33 -0800 Subject: [PATCH] add nvtx range push/pop tracking. use that information as a stats in kernel events. PiperOrigin-RevId: 347969357 Change-Id: I8fa5a7bc10e27319011ffe4901a53b28845e2aea --- tensorflow/core/profiler/internal/gpu/BUILD | 12 ++++ .../profiler/internal/gpu/cupti_collector.cc | 26 +++++--- .../profiler/internal/gpu/cupti_collector.h | 14 +++- .../profiler/internal/gpu/cupti_tracer.cc | 66 +++++++++++++++---- .../core/profiler/internal/gpu/cupti_tracer.h | 5 ++ .../core/profiler/internal/gpu/nvtx_utils.cc | 30 +++++++++ .../core/profiler/internal/gpu/nvtx_utils.h | 58 ++++++++++++++++ .../core/profiler/utils/xplane_schema.cc | 1 + .../core/profiler/utils/xplane_schema.h | 1 + 9 files changed, 188 insertions(+), 25 deletions(-) create mode 100644 tensorflow/core/profiler/internal/gpu/nvtx_utils.cc create mode 100644 tensorflow/core/profiler/internal/gpu/nvtx_utils.h diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index 40773c6cb98..681670a2415 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -127,8 +127,10 @@ tf_cuda_library( ":cupti_collector", ":cupti_interface", ":cupti_utils", + ":nvtx_utils", "//tensorflow/core:lib", "//tensorflow/core/profiler/internal/cpu:annotation_stack", + "//tensorflow/core/profiler/lib:scoped_annotation", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/container:node_hash_set", @@ -136,6 +138,16 @@ tf_cuda_library( ], ) +tf_cuda_library( + name = "nvtx_utils", + srcs = if_cuda_is_configured_compat(["nvtx_utils.cc"]), + hdrs = if_cuda_is_configured_compat(["nvtx_utils.h"]), + copts = tf_profiler_copts() + tf_copts(), + deps = [ + "//tensorflow/core:lib", + ], +) + tf_cuda_library( name = 
"cupti_collector", srcs = if_cuda_is_configured_compat(["cupti_collector.cc"]), diff --git a/tensorflow/core/profiler/internal/gpu/cupti_collector.cc b/tensorflow/core/profiler/internal/gpu/cupti_collector.cc index ab4061a3749..42d97d17b03 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_collector.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_collector.cc @@ -160,6 +160,11 @@ struct PerDeviceCollector { GetStatTypeStr(StatType::kKernelAnnotation)), *plane->GetOrCreateStatMetadata(event.annotation)); } + if (!event.nvtx_range.empty()) { + xevent.AddStatValue( + *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)), + *plane->GetOrCreateStatMetadata(event.nvtx_range)); + } if (event.context_id != CuptiTracerEvent::kInvalidContextId) { xevent.AddStatValue( *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)), @@ -547,8 +552,9 @@ struct PerDeviceCollector { } // namespace void AnnotationMap::Add(uint32 device_id, uint32 correlation_id, - const std::string& annotation) { - if (annotation.empty()) return; + const absl::string_view annotation, + const absl::string_view nvtx_range) { + if (annotation.empty() && nvtx_range.empty()) return; VLOG(3) << "Add annotation: device_id: " << device_id << " correlation_id: " << correlation_id << " annotation: " << annotation; @@ -556,20 +562,22 @@ void AnnotationMap::Add(uint32 device_id, uint32 correlation_id, auto& per_device_map = per_device_map_[device_id]; absl::MutexLock lock(&per_device_map.mutex); if (per_device_map.annotations.size() < max_size_) { - absl::string_view annotation_str = - *per_device_map.annotations.insert(annotation).first; - per_device_map.correlation_map.emplace(correlation_id, annotation_str); + AnnotationInfo info; + info.annotation = *per_device_map.annotations.emplace(annotation).first; + if (!nvtx_range.empty()) + info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first; + per_device_map.correlation_map.emplace(correlation_id, info); } } 
-absl::string_view AnnotationMap::LookUp(uint32 device_id, - uint32 correlation_id) { - if (device_id >= per_device_map_.size()) return absl::string_view(); +AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id, + uint32 correlation_id) { + if (device_id >= per_device_map_.size()) return AnnotationInfo(); auto& per_device_map = per_device_map_[device_id]; absl::MutexLock lock(&per_device_map.mutex); auto it = per_device_map.correlation_map.find(correlation_id); return it != per_device_map.correlation_map.end() ? it->second - : absl::string_view(); + : AnnotationInfo(); } // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and diff --git a/tensorflow/core/profiler/internal/gpu/cupti_collector.h b/tensorflow/core/profiler/internal/gpu/cupti_collector.h index ada6cec1d2d..d303c587be0 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_collector.h +++ b/tensorflow/core/profiler/internal/gpu/cupti_collector.h @@ -127,6 +127,7 @@ struct CuptiTracerEvent { // This points to strings in AnnotationMap, which should outlive the point // where serialization happens. 
absl::string_view annotation; + absl::string_view nvtx_range; uint64 start_time_ns = 0; uint64 end_time_ns = 0; uint32 device_id = 0; @@ -156,11 +157,17 @@ struct CuptiTracerCollectorOptions { class AnnotationMap { public: + struct AnnotationInfo { + absl::string_view annotation; + absl::string_view nvtx_range; + }; + explicit AnnotationMap(uint64 max_size, uint32 num_gpus) : max_size_(max_size), per_device_map_(num_gpus) {} void Add(uint32 device_id, uint32 correlation_id, - const std::string& annotation); - absl::string_view LookUp(uint32 device_id, uint32 correlation_id); + const absl::string_view annotation, + const absl::string_view nvtx_range); + AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id); private: struct PerDeviceAnnotationMap { @@ -170,7 +177,8 @@ class AnnotationMap { // Annotation tends to be repetitive, use a hash_set to store the strings, // an use the reference to the string in the map. absl::node_hash_set annotations; - absl::flat_hash_map correlation_map; + absl::node_hash_set nvtx_ranges; + absl::flat_hash_map correlation_map; }; const uint64 max_size_; absl::FixedArray per_device_map_; diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 51a04af5442..6d04aebec90 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/container/node_hash_map.h" #include "absl/container/node_hash_set.h" +#include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/host_info.h" @@ -26,6 +27,8 @@ limitations under the License. 
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/profiler/internal/cpu/annotation_stack.h" +#include "tensorflow/core/profiler/internal/gpu/cupti_collector.h" +#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h" namespace tensorflow { namespace profiler { @@ -418,8 +421,10 @@ void AddKernelActivityEvent(CuptiTraceCollector *collector, event.context_id = kernel->contextId; event.stream_id = kernel->streamId; event.correlation_id = kernel->correlationId; - event.annotation = collector->annotation_map()->LookUp(event.device_id, - event.correlation_id); + AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp( + event.device_id, event.correlation_id); + event.annotation = info.annotation; + event.nvtx_range = info.nvtx_range; event.kernel_info.registers_per_thread = kernel->registersPerThread; event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory; event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory; @@ -464,8 +469,9 @@ void AddMemcpyActivityEvent(CuptiTraceCollector *collector, event.context_id = memcpy->contextId; event.stream_id = memcpy->streamId; event.correlation_id = memcpy->correlationId; - event.annotation = collector->annotation_map()->LookUp(event.device_id, - event.correlation_id); + AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp( + event.device_id, event.correlation_id); + event.annotation = info.annotation; event.memcpy_info.kind = memcpy->copyKind; event.memcpy_info.num_bytes = memcpy->bytes; event.memcpy_info.destination = memcpy->deviceId; @@ -488,8 +494,9 @@ void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector, event.context_id = memcpy2->contextId; event.stream_id = memcpy2->streamId; event.correlation_id = memcpy2->correlationId; - event.annotation = collector->annotation_map()->LookUp(event.device_id, - event.correlation_id); + AnnotationMap::AnnotationInfo info = 
collector->annotation_map()->LookUp( + event.device_id, event.correlation_id); + event.annotation = info.annotation; event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP; event.memcpy_info.num_bytes = memcpy2->bytes; event.memcpy_info.destination = memcpy2->dstDeviceId; @@ -946,8 +953,9 @@ class CudaEventRecorder { event.context_id = stream_info.ctx_info->context_id; event.stream_id = stream_info.stream_id; event.correlation_id = record.correlation_id; - event.annotation = - annotation_map->LookUp(event.device_id, event.correlation_id); + AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp( + event.device_id, event.correlation_id); + event.annotation = info.annotation; event.kernel_info = record.details; collector_->AddEvent(std::move(event)); return Status::OK(); @@ -974,8 +982,9 @@ class CudaEventRecorder { event.context_id = stream_info.ctx_info->context_id; event.stream_id = stream_info.stream_id; event.correlation_id = record.correlation_id; - event.annotation = - annotation_map->LookUp(event.device_id, event.correlation_id); + AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp( + event.device_id, event.correlation_id); + event.annotation = info.annotation; event.memcpy_info.num_bytes = record.size_bytes; // TODO: support MemcpyD2D where destination != source; event.memcpy_info.destination = ordinal_; @@ -1063,7 +1072,7 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook { // Because annotation are per device, therefore we need to populate // annotation for each device involved. 
collector_->annotation_map()->Add(*dev_id, cbdata->correlationId, - annotation); + annotation, ""); record_indices.push_back( cuda_event_recorders_[*dev_id]->StartKernel( "CooperativeKernelMultiDevice", *context, @@ -1425,6 +1434,11 @@ Status CuptiTracer::EnableApiTracing() { RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain( 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API)); } + + if (option_->enable_nvtx_tracking) { + RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain( + 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX)); + } return Status::OK(); } @@ -1443,6 +1457,11 @@ Status CuptiTracer::DisableApiTracing() { 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API)); } + if (option_->enable_nvtx_tracking) { + RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain( + 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX)); + } + VLOG(1) << "Disable subscriber"; RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_)); return Status::OK(); @@ -1510,11 +1529,31 @@ Status CuptiTracer::Finalize() { return 0; } +Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid, + const CUpti_CallbackData *cbdata) { + const CUpti_NvtxData *pdata = + reinterpret_cast(cbdata); + if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) { + const nvtxDomainRangePushEx_params *params = + reinterpret_cast( + pdata->functionParams); + // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED + // (which is 3). However, it seems that we cannot get the registered + // string from nvtxDomainRegisterStringA_params. If we reinterpret the + // payload as ASCII, it happens to work. 
+ NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii); + } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) { + NVTXRangeTracker::ExitRange(); + } + return Status::OK(); +} + Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata) { if (!api_tracing_enabled_) return Status::OK(); // already unsubscribed. if (!cupti_driver_api_hook_) return Status::OK(); // already unsubscribed. + if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata); if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK(); if (internalCuCall) return Status::OK(); @@ -1546,11 +1585,12 @@ Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain, // we need to populate per device annotation map respectively. for (int i = 0; i < num_gpus_; ++i) { collector_->annotation_map()->Add(i, cbdata->correlationId, - annotation); + annotation, ""); } } else { + absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange(); collector_->annotation_map()->Add(device_id, cbdata->correlationId, - annotation); + annotation, nvtx_range); } } diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h index 3f7a2d4d7e1..970c4f9d252 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h @@ -18,6 +18,7 @@ limitations under the License. #include "absl/types/optional.h" #include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" +#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/status.h" @@ -50,6 +51,8 @@ struct CuptiTracerOptions { bool cupti_finalize = false; // Whether to call cuCtxSynchronize for each device before Stop(). bool sync_devices_before_stop = false; + // Whether to enable NVTX tracking, we need this for TensorRT tracking. 
+ bool enable_nvtx_tracking = false; }; class CuptiDriverApiHook { @@ -111,6 +114,8 @@ class CuptiTracer { Status DisableActivityTracing(); Status Finalize(); void ConfigureActivityUnifiedMemoryCounter(bool enable); + Status HandleNVTXCallback(CUpti_CallbackId cbid, + const CUpti_CallbackData* cbdata); int num_gpus_; absl::optional option_; diff --git a/tensorflow/core/profiler/internal/gpu/nvtx_utils.cc b/tensorflow/core/profiler/internal/gpu/nvtx_utils.cc new file mode 100644 index 00000000000..ace1533c9b4 --- /dev/null +++ b/tensorflow/core/profiler/internal/gpu/nvtx_utils.cc @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h" + +#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h" +#include "tensorflow/core/platform/platform.h" + +namespace tensorflow { +namespace profiler { + +/*static*/ std::stack &NVTXRangeTracker::GetRangeStack() { + static thread_local std::stack range_stack; + return range_stack; +} + +} // namespace profiler +} // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/gpu/nvtx_utils.h b/tensorflow/core/profiler/internal/gpu/nvtx_utils.h new file mode 100644 index 00000000000..b9085fa03e9 --- /dev/null +++ b/tensorflow/core/profiler/internal/gpu/nvtx_utils.h @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace profiler { + +/*** + * We have no intention to use NVTX in TensorFlow right now; we use this class + * to track NVTX instrumentation inside NVIDIA libraries (such as TensorRT). + * This bears a lot of resemblance to ScopedAnnotation for now. In the future, + * we will use TraceMe to keep track of the trace context within a thread. 
+ */ +class NVTXRangeTracker { + public: + static void EnterRange(const std::string& range) { + auto& range_stack = GetRangeStack(); + range_stack.push(range); + } + static void ExitRange() { + auto& range_stack = GetRangeStack(); + if (!range_stack.empty()) range_stack.pop(); + } + static const absl::string_view CurrentRange() { + auto& range_stack = GetRangeStack(); + if (!range_stack.empty()) return range_stack.top(); + return ""; + } + + private: + static std::stack& GetRangeStack(); + + TF_DISALLOW_COPY_AND_ASSIGN(NVTXRangeTracker); +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_ diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 7dd00c4376f..f0ea78dbe24 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -176,6 +176,7 @@ const StatTypeMap& GetStatTypeMap() { {"memalloc_details", kMemallocDetails}, {"kernel_details", kKernelDetails}, {"annotation", kKernelAnnotation}, + {"nvtx_range", kNVTXRange}, {"stream", kStream}, // Stats added when processing traces. {"group_id", kGroupId}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index ad4c100c96c..ca51874e637 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -164,6 +164,7 @@ enum StatType { kMemcpyDetails, kMemallocDetails, kKernelAnnotation, + kNVTXRange, kKernelDetails, kStream, // Stats added when processing traces.