Add NVTX range push/pop tracking. Use that information as a stat in kernel events.

PiperOrigin-RevId: 347969357
Change-Id: I8fa5a7bc10e27319011ffe4901a53b28845e2aea
This commit is contained in:
A. Unique TensorFlower 2020-12-16 23:48:33 -08:00 committed by TensorFlower Gardener
parent d8f4e9a18e
commit 3af5c069b6
9 changed files with 188 additions and 25 deletions

View File

@ -127,8 +127,10 @@ tf_cuda_library(
":cupti_collector",
":cupti_interface",
":cupti_utils",
":nvtx_utils",
"//tensorflow/core:lib",
"//tensorflow/core/profiler/internal/cpu:annotation_stack",
"//tensorflow/core/profiler/lib:scoped_annotation",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/container:node_hash_set",
@ -136,6 +138,16 @@ tf_cuda_library(
],
)
# Library target for nvtx_utils.cc / nvtx_utils.h. Both file lists are wrapped
# in if_cuda_is_configured_compat, presumably so they are only part of the
# build when CUDA is configured -- TODO confirm against that macro.
tf_cuda_library(
name = "nvtx_utils",
srcs = if_cuda_is_configured_compat(["nvtx_utils.cc"]),
hdrs = if_cuda_is_configured_compat(["nvtx_utils.h"]),
copts = tf_profiler_copts() + tf_copts(),
deps = [
"//tensorflow/core:lib",
],
)
tf_cuda_library(
name = "cupti_collector",
srcs = if_cuda_is_configured_compat(["cupti_collector.cc"]),

View File

@ -160,6 +160,11 @@ struct PerDeviceCollector {
GetStatTypeStr(StatType::kKernelAnnotation)),
*plane->GetOrCreateStatMetadata(event.annotation));
}
if (!event.nvtx_range.empty()) {
xevent.AddStatValue(
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
*plane->GetOrCreateStatMetadata(event.nvtx_range));
}
if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
xevent.AddStatValue(
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
@ -547,8 +552,9 @@ struct PerDeviceCollector {
} // namespace
void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
const std::string& annotation) {
if (annotation.empty()) return;
const absl::string_view annotation,
const absl::string_view nvtx_range) {
if (annotation.empty() && nvtx_range.empty()) return;
VLOG(3) << "Add annotation: device_id: " << device_id
<< " correlation_id: " << correlation_id
<< " annotation: " << annotation;
@ -556,20 +562,22 @@ void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
auto& per_device_map = per_device_map_[device_id];
absl::MutexLock lock(&per_device_map.mutex);
if (per_device_map.annotations.size() < max_size_) {
absl::string_view annotation_str =
*per_device_map.annotations.insert(annotation).first;
per_device_map.correlation_map.emplace(correlation_id, annotation_str);
AnnotationInfo info;
info.annotation = *per_device_map.annotations.emplace(annotation).first;
if (!nvtx_range.empty())
info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
per_device_map.correlation_map.emplace(correlation_id, info);
}
}
absl::string_view AnnotationMap::LookUp(uint32 device_id,
uint32 correlation_id) {
if (device_id >= per_device_map_.size()) return absl::string_view();
AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
uint32 correlation_id) {
if (device_id >= per_device_map_.size()) return AnnotationInfo();
auto& per_device_map = per_device_map_[device_id];
absl::MutexLock lock(&per_device_map.mutex);
auto it = per_device_map.correlation_map.find(correlation_id);
return it != per_device_map.correlation_map.end() ? it->second
: absl::string_view();
: AnnotationInfo();
}
// CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and

View File

@ -127,6 +127,7 @@ struct CuptiTracerEvent {
// This points to strings in AnnotationMap, which should outlive the point
// where serialization happens.
absl::string_view annotation;
absl::string_view nvtx_range;
uint64 start_time_ns = 0;
uint64 end_time_ns = 0;
uint32 device_id = 0;
@ -156,11 +157,17 @@ struct CuptiTracerCollectorOptions {
class AnnotationMap {
public:
// Data recorded by Add() for one (device, correlation id) pair and handed
// back by LookUp(). Both members are views into strings stored inside the
// AnnotationMap, so they should not be used after the map is destroyed.
// A default-constructed AnnotationInfo (both views empty) is what LookUp()
// returns when nothing was recorded.
struct AnnotationInfo {
// Annotation string passed to Add().
absl::string_view annotation;
// NVTX range passed to Add(); stays empty when Add() was given an empty range.
absl::string_view nvtx_range;
};
explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
: max_size_(max_size), per_device_map_(num_gpus) {}
void Add(uint32 device_id, uint32 correlation_id,
const std::string& annotation);
absl::string_view LookUp(uint32 device_id, uint32 correlation_id);
const absl::string_view annotation,
const absl::string_view nvtx_range);
AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
private:
struct PerDeviceAnnotationMap {
@ -170,7 +177,8 @@ class AnnotationMap {
// Annotations tend to be repetitive, so use a hash_set to store the strings
// and use references to those strings in the map.
absl::node_hash_set<std::string> annotations;
absl::flat_hash_map<uint32, absl::string_view> correlation_map;
absl::node_hash_set<std::string> nvtx_ranges;
absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
};
const uint64 max_size_;
absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;

View File

@ -19,6 +19,7 @@ limitations under the License.
#include "absl/container/flat_hash_set.h"
#include "absl/container/node_hash_map.h"
#include "absl/container/node_hash_set.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/host_info.h"
@ -26,6 +27,8 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
#include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
namespace tensorflow {
namespace profiler {
@ -418,8 +421,10 @@ void AddKernelActivityEvent(CuptiTraceCollector *collector,
event.context_id = kernel->contextId;
event.stream_id = kernel->streamId;
event.correlation_id = kernel->correlationId;
event.annotation = collector->annotation_map()->LookUp(event.device_id,
event.correlation_id);
AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.nvtx_range = info.nvtx_range;
event.kernel_info.registers_per_thread = kernel->registersPerThread;
event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
@ -464,8 +469,9 @@ void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
event.context_id = memcpy->contextId;
event.stream_id = memcpy->streamId;
event.correlation_id = memcpy->correlationId;
event.annotation = collector->annotation_map()->LookUp(event.device_id,
event.correlation_id);
AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.memcpy_info.kind = memcpy->copyKind;
event.memcpy_info.num_bytes = memcpy->bytes;
event.memcpy_info.destination = memcpy->deviceId;
@ -488,8 +494,9 @@ void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
event.context_id = memcpy2->contextId;
event.stream_id = memcpy2->streamId;
event.correlation_id = memcpy2->correlationId;
event.annotation = collector->annotation_map()->LookUp(event.device_id,
event.correlation_id);
AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
event.memcpy_info.num_bytes = memcpy2->bytes;
event.memcpy_info.destination = memcpy2->dstDeviceId;
@ -946,8 +953,9 @@ class CudaEventRecorder {
event.context_id = stream_info.ctx_info->context_id;
event.stream_id = stream_info.stream_id;
event.correlation_id = record.correlation_id;
event.annotation =
annotation_map->LookUp(event.device_id, event.correlation_id);
AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.kernel_info = record.details;
collector_->AddEvent(std::move(event));
return Status::OK();
@ -974,8 +982,9 @@ class CudaEventRecorder {
event.context_id = stream_info.ctx_info->context_id;
event.stream_id = stream_info.stream_id;
event.correlation_id = record.correlation_id;
event.annotation =
annotation_map->LookUp(event.device_id, event.correlation_id);
AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.memcpy_info.num_bytes = record.size_bytes;
// TODO: support MemcpyD2D where destination != source;
event.memcpy_info.destination = ordinal_;
@ -1063,7 +1072,7 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
// Because annotations are per device, we need to populate the
// annotation for each device involved.
collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
annotation);
annotation, "");
record_indices.push_back(
cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
"CooperativeKernelMultiDevice", *context,
@ -1425,6 +1434,11 @@ Status CuptiTracer::EnableApiTracing() {
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
}
if (option_->enable_nvtx_tracking) {
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
}
return Status::OK();
}
@ -1443,6 +1457,11 @@ Status CuptiTracer::DisableApiTracing() {
0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
}
if (option_->enable_nvtx_tracking) {
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
}
VLOG(1) << "Disable subscriber";
RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
return Status::OK();
@ -1510,11 +1529,31 @@ Status CuptiTracer::Finalize() {
return 0;
}
// Handles a CUPTI callback from the NVTX domain by mirroring NVTX domain range
// push/pop calls into NVTXRangeTracker.
//
// `cbid` selects the action: nvtxDomainRangePushEx enters a range,
// nvtxDomainRangePop exits one, and every other id is ignored. `cbdata` is the
// raw callback payload, which is reinterpreted as CUpti_NvtxData.
//
// Always returns Status::OK().
Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
                                       const CUpti_CallbackData *cbdata) {
  if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
    const CUpti_NvtxData *pdata =
        reinterpret_cast<const CUpti_NvtxData *>(cbdata);
    const nvtxDomainRangePushEx_params *params =
        pdata == nullptr
            ? nullptr
            : reinterpret_cast<const nvtxDomainRangePushEx_params *>(
                  pdata->functionParams);
    // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
    // (which is 3). However, it seems that we cannot get the registered string
    // from nvtxDomainRegisterStringA_params. If we reinterpret the payload as
    // ascii, it happens to work.
    const char *message = nullptr;
    if (params != nullptr && params->core.eventAttrib != nullptr) {
      message = params->core.eventAttrib->message.ascii;
    }
    // Constructing a std::string from a null pointer is undefined behavior, so
    // fall back to an empty name. A range is still entered in that case so
    // that the matching pop callback removes this entry rather than an
    // enclosing range.
    NVTXRangeTracker::EnterRange(message != nullptr ? message : "");
  } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
    NVTXRangeTracker::ExitRange();
  }
  return Status::OK();
}
Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) {
if (!api_tracing_enabled_) return Status::OK(); // already unsubscribed.
if (!cupti_driver_api_hook_) return Status::OK(); // already unsubscribed.
if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
if (internalCuCall) return Status::OK();
@ -1546,11 +1585,12 @@ Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
// we need to populate per device annotation map respectively.
for (int i = 0; i < num_gpus_; ++i) {
collector_->annotation_map()->Add(i, cbdata->correlationId,
annotation);
annotation, "");
}
} else {
absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
collector_->annotation_map()->Add(device_id, cbdata->correlationId,
annotation);
annotation, nvtx_range);
}
}

View File

@ -18,6 +18,7 @@ limitations under the License.
#include "absl/types/optional.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/status.h"
@ -50,6 +51,8 @@ struct CuptiTracerOptions {
bool cupti_finalize = false;
// Whether to call cuCtxSynchronize for each device before Stop().
bool sync_devices_before_stop = false;
// Whether to enable NVTX tracking, we need this for TensorRT tracking.
bool enable_nvtx_tracking = false;
};
class CuptiDriverApiHook {
@ -111,6 +114,8 @@ class CuptiTracer {
Status DisableActivityTracing();
Status Finalize();
void ConfigureActivityUnifiedMemoryCounter(bool enable);
Status HandleNVTXCallback(CUpti_CallbackId cbid,
const CUpti_CallbackData* cbdata);
int num_gpus_;
absl::optional<CuptiTracerOptions> option_;

View File

@ -0,0 +1,30 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h"
#include "tensorflow/core/platform/platform.h"
namespace tensorflow {
namespace profiler {
// Returns the calling thread's stack of NVTX range names. The stack is a
// function-local thread_local object, so every thread gets its own instance.
/*static*/ std::stack<std::string> &NVTXRangeTracker::GetRangeStack() {
  // At block scope `thread_local` already implies static storage duration.
  thread_local std::stack<std::string> per_thread_ranges;
  return per_thread_ranges;
}
} // namespace profiler
} // namespace tensorflow

View File

@ -0,0 +1,58 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
#include <stack>
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/macros.h"
namespace tensorflow {
namespace profiler {
/***
 * Bookkeeping for the NVTX ranges that are currently open.
 *
 * There is no intention to use NVTX inside TensorFlow right now; this class is
 * used to track NVTX instrumentation inside NVIDIA libraries (such as
 * TensorRT). It bears a lot of resemblance to ScopedAnnotation for now. In the
 * future, TraceMe will be used to keep track of the trace context within a
 * thread.
 */
class NVTXRangeTracker {
 public:
  // Pushes a copy of `range` on top of the range stack.
  static void EnterRange(const std::string& range) {
    GetRangeStack().push(range);
  }

  // Pops the top of the range stack; does nothing when the stack is empty.
  static void ExitRange() {
    std::stack<std::string>& open_ranges = GetRangeStack();
    if (open_ranges.empty()) return;
    open_ranges.pop();
  }

  // Returns the range on top of the stack, or an empty view when the stack is
  // empty. A non-empty result refers to the string stored in the stack, so it
  // must not be used after that entry has been popped.
  static const absl::string_view CurrentRange() {
    std::stack<std::string>& open_ranges = GetRangeStack();
    if (open_ranges.empty()) return "";
    return open_ranges.top();
  }

 private:
  // Accessor for the range stack; defined out of line.
  static std::stack<std::string>& GetRangeStack();

  TF_DISALLOW_COPY_AND_ASSIGN(NVTXRangeTracker);
};
} // namespace profiler
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_

View File

@ -176,6 +176,7 @@ const StatTypeMap& GetStatTypeMap() {
{"memalloc_details", kMemallocDetails},
{"kernel_details", kKernelDetails},
{"annotation", kKernelAnnotation},
{"nvtx_range", kNVTXRange},
{"stream", kStream},
// Stats added when processing traces.
{"group_id", kGroupId},

View File

@ -164,6 +164,7 @@ enum StatType {
kMemcpyDetails,
kMemallocDetails,
kKernelAnnotation,
kNVTXRange,
kKernelDetails,
kStream,
// Stats added when processing traces.