Add NVTX range push/pop tracking, and use that information as a stat in kernel events.
PiperOrigin-RevId: 347969357 Change-Id: I8fa5a7bc10e27319011ffe4901a53b28845e2aea
This commit is contained in:
parent
d8f4e9a18e
commit
3af5c069b6
@ -127,8 +127,10 @@ tf_cuda_library(
|
||||
":cupti_collector",
|
||||
":cupti_interface",
|
||||
":cupti_utils",
|
||||
":nvtx_utils",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core/profiler/internal/cpu:annotation_stack",
|
||||
"//tensorflow/core/profiler/lib:scoped_annotation",
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
"@com_google_absl//absl/container:node_hash_map",
|
||||
"@com_google_absl//absl/container:node_hash_set",
|
||||
@ -136,6 +138,16 @@ tf_cuda_library(
|
||||
],
|
||||
)
|
||||
|
||||
tf_cuda_library(
|
||||
name = "nvtx_utils",
|
||||
srcs = if_cuda_is_configured_compat(["nvtx_utils.cc"]),
|
||||
hdrs = if_cuda_is_configured_compat(["nvtx_utils.h"]),
|
||||
copts = tf_profiler_copts() + tf_copts(),
|
||||
deps = [
|
||||
"//tensorflow/core:lib",
|
||||
],
|
||||
)
|
||||
|
||||
tf_cuda_library(
|
||||
name = "cupti_collector",
|
||||
srcs = if_cuda_is_configured_compat(["cupti_collector.cc"]),
|
||||
|
@ -160,6 +160,11 @@ struct PerDeviceCollector {
|
||||
GetStatTypeStr(StatType::kKernelAnnotation)),
|
||||
*plane->GetOrCreateStatMetadata(event.annotation));
|
||||
}
|
||||
if (!event.nvtx_range.empty()) {
|
||||
xevent.AddStatValue(
|
||||
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
|
||||
*plane->GetOrCreateStatMetadata(event.nvtx_range));
|
||||
}
|
||||
if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
|
||||
xevent.AddStatValue(
|
||||
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
|
||||
@ -547,8 +552,9 @@ struct PerDeviceCollector {
|
||||
} // namespace
|
||||
|
||||
void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
|
||||
const std::string& annotation) {
|
||||
if (annotation.empty()) return;
|
||||
const absl::string_view annotation,
|
||||
const absl::string_view nvtx_range) {
|
||||
if (annotation.empty() && nvtx_range.empty()) return;
|
||||
VLOG(3) << "Add annotation: device_id: " << device_id
|
||||
<< " correlation_id: " << correlation_id
|
||||
<< " annotation: " << annotation;
|
||||
@ -556,20 +562,22 @@ void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
|
||||
auto& per_device_map = per_device_map_[device_id];
|
||||
absl::MutexLock lock(&per_device_map.mutex);
|
||||
if (per_device_map.annotations.size() < max_size_) {
|
||||
absl::string_view annotation_str =
|
||||
*per_device_map.annotations.insert(annotation).first;
|
||||
per_device_map.correlation_map.emplace(correlation_id, annotation_str);
|
||||
AnnotationInfo info;
|
||||
info.annotation = *per_device_map.annotations.emplace(annotation).first;
|
||||
if (!nvtx_range.empty())
|
||||
info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
|
||||
per_device_map.correlation_map.emplace(correlation_id, info);
|
||||
}
|
||||
}
|
||||
|
||||
absl::string_view AnnotationMap::LookUp(uint32 device_id,
|
||||
uint32 correlation_id) {
|
||||
if (device_id >= per_device_map_.size()) return absl::string_view();
|
||||
AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
|
||||
uint32 correlation_id) {
|
||||
if (device_id >= per_device_map_.size()) return AnnotationInfo();
|
||||
auto& per_device_map = per_device_map_[device_id];
|
||||
absl::MutexLock lock(&per_device_map.mutex);
|
||||
auto it = per_device_map.correlation_map.find(correlation_id);
|
||||
return it != per_device_map.correlation_map.end() ? it->second
|
||||
: absl::string_view();
|
||||
: AnnotationInfo();
|
||||
}
|
||||
|
||||
// CuptiTraceCollectorImpl stores the CuptiTracerEvents from CuptiTracer and
|
||||
|
@ -127,6 +127,7 @@ struct CuptiTracerEvent {
|
||||
// This points to strings in AnnotationMap, which should outlive the point
|
||||
// where serialization happens.
|
||||
absl::string_view annotation;
|
||||
absl::string_view nvtx_range;
|
||||
uint64 start_time_ns = 0;
|
||||
uint64 end_time_ns = 0;
|
||||
uint32 device_id = 0;
|
||||
@ -156,11 +157,17 @@ struct CuptiTracerCollectorOptions {
|
||||
|
||||
class AnnotationMap {
|
||||
public:
|
||||
struct AnnotationInfo {
|
||||
absl::string_view annotation;
|
||||
absl::string_view nvtx_range;
|
||||
};
|
||||
|
||||
explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
|
||||
: max_size_(max_size), per_device_map_(num_gpus) {}
|
||||
void Add(uint32 device_id, uint32 correlation_id,
|
||||
const std::string& annotation);
|
||||
absl::string_view LookUp(uint32 device_id, uint32 correlation_id);
|
||||
const absl::string_view annotation,
|
||||
const absl::string_view nvtx_range);
|
||||
AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
|
||||
|
||||
private:
|
||||
struct PerDeviceAnnotationMap {
|
||||
@ -170,7 +177,8 @@ class AnnotationMap {
|
||||
// Annotation tends to be repetitive, use a hash_set to store the strings,
|
||||
// and use the reference to the string in the map.
|
||||
absl::node_hash_set<std::string> annotations;
|
||||
absl::flat_hash_map<uint32, absl::string_view> correlation_map;
|
||||
absl::node_hash_set<std::string> nvtx_ranges;
|
||||
absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
|
||||
};
|
||||
const uint64 max_size_;
|
||||
absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
|
||||
|
@ -19,6 +19,7 @@ limitations under the License.
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/container/node_hash_map.h"
|
||||
#include "absl/container/node_hash_set.h"
|
||||
#include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#include "tensorflow/core/platform/errors.h"
|
||||
#include "tensorflow/core/platform/host_info.h"
|
||||
@ -26,6 +27,8 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mem.h"
|
||||
#include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
|
||||
#include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
|
||||
#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
@ -418,8 +421,10 @@ void AddKernelActivityEvent(CuptiTraceCollector *collector,
|
||||
event.context_id = kernel->contextId;
|
||||
event.stream_id = kernel->streamId;
|
||||
event.correlation_id = kernel->correlationId;
|
||||
event.annotation = collector->annotation_map()->LookUp(event.device_id,
|
||||
event.correlation_id);
|
||||
AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
|
||||
event.device_id, event.correlation_id);
|
||||
event.annotation = info.annotation;
|
||||
event.nvtx_range = info.nvtx_range;
|
||||
event.kernel_info.registers_per_thread = kernel->registersPerThread;
|
||||
event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
|
||||
event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
|
||||
@ -464,8 +469,9 @@ void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
|
||||
event.context_id = memcpy->contextId;
|
||||
event.stream_id = memcpy->streamId;
|
||||
event.correlation_id = memcpy->correlationId;
|
||||
event.annotation = collector->annotation_map()->LookUp(event.device_id,
|
||||
event.correlation_id);
|
||||
AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
|
||||
event.device_id, event.correlation_id);
|
||||
event.annotation = info.annotation;
|
||||
event.memcpy_info.kind = memcpy->copyKind;
|
||||
event.memcpy_info.num_bytes = memcpy->bytes;
|
||||
event.memcpy_info.destination = memcpy->deviceId;
|
||||
@ -488,8 +494,9 @@ void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
|
||||
event.context_id = memcpy2->contextId;
|
||||
event.stream_id = memcpy2->streamId;
|
||||
event.correlation_id = memcpy2->correlationId;
|
||||
event.annotation = collector->annotation_map()->LookUp(event.device_id,
|
||||
event.correlation_id);
|
||||
AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
|
||||
event.device_id, event.correlation_id);
|
||||
event.annotation = info.annotation;
|
||||
event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
|
||||
event.memcpy_info.num_bytes = memcpy2->bytes;
|
||||
event.memcpy_info.destination = memcpy2->dstDeviceId;
|
||||
@ -946,8 +953,9 @@ class CudaEventRecorder {
|
||||
event.context_id = stream_info.ctx_info->context_id;
|
||||
event.stream_id = stream_info.stream_id;
|
||||
event.correlation_id = record.correlation_id;
|
||||
event.annotation =
|
||||
annotation_map->LookUp(event.device_id, event.correlation_id);
|
||||
AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
|
||||
event.device_id, event.correlation_id);
|
||||
event.annotation = info.annotation;
|
||||
event.kernel_info = record.details;
|
||||
collector_->AddEvent(std::move(event));
|
||||
return Status::OK();
|
||||
@ -974,8 +982,9 @@ class CudaEventRecorder {
|
||||
event.context_id = stream_info.ctx_info->context_id;
|
||||
event.stream_id = stream_info.stream_id;
|
||||
event.correlation_id = record.correlation_id;
|
||||
event.annotation =
|
||||
annotation_map->LookUp(event.device_id, event.correlation_id);
|
||||
AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
|
||||
event.device_id, event.correlation_id);
|
||||
event.annotation = info.annotation;
|
||||
event.memcpy_info.num_bytes = record.size_bytes;
|
||||
// TODO: support MemcpyD2D where destination != source;
|
||||
event.memcpy_info.destination = ordinal_;
|
||||
@ -1063,7 +1072,7 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
|
||||
// Because annotations are per device, we need to populate the
|
||||
// annotation for each device involved.
|
||||
collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
|
||||
annotation);
|
||||
annotation, "");
|
||||
record_indices.push_back(
|
||||
cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
|
||||
"CooperativeKernelMultiDevice", *context,
|
||||
@ -1425,6 +1434,11 @@ Status CuptiTracer::EnableApiTracing() {
|
||||
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
|
||||
1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
|
||||
}
|
||||
|
||||
if (option_->enable_nvtx_tracking) {
|
||||
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
|
||||
1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -1443,6 +1457,11 @@ Status CuptiTracer::DisableApiTracing() {
|
||||
0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
|
||||
}
|
||||
|
||||
if (option_->enable_nvtx_tracking) {
|
||||
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
|
||||
0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
|
||||
}
|
||||
|
||||
VLOG(1) << "Disable subscriber";
|
||||
RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
|
||||
return Status::OK();
|
||||
@ -1510,11 +1529,31 @@ Status CuptiTracer::Finalize() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Handles a CUPTI callback from the NVTX domain by mirroring NVTX range
// push/pop calls into NVTXRangeTracker.
//
// Only nvtxDomainRangePushEx and nvtxDomainRangePop are tracked; every other
// NVTX callback id is ignored. Always returns Status::OK().
//
// cbid:   NVTX callback id reported by CUPTI.
// cbdata: callback payload; for the NVTX domain it is reinterpreted as
//         CUpti_NvtxData.
Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
                                       const CUpti_CallbackData *cbdata) {
  const CUpti_NvtxData *pdata =
      reinterpret_cast<const CUpti_NvtxData *>(cbdata);
  if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
    const nvtxDomainRangePushEx_params *params =
        pdata == nullptr
            ? nullptr
            : reinterpret_cast<const nvtxDomainRangePushEx_params *>(
                  pdata->functionParams);
    // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
    // (which is 3). However, it seems that we cannot get the registered
    // string from nvtxDomainRegisterStringA_params. If we reinterpret the
    // payload as ascii, it happens to work.
    const char *range_name = nullptr;
    if (params != nullptr && params->core.eventAttrib != nullptr) {
      range_name = params->core.eventAttrib->message.ascii;
    }
    // Constructing a std::string from a null pointer is undefined behavior,
    // so fall back to an empty name. The push is still recorded so that the
    // matching nvtxDomainRangePop removes this entry rather than the
    // enclosing range.
    NVTXRangeTracker::EnterRange(range_name != nullptr ? range_name : "");
  } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
    NVTXRangeTracker::ExitRange();
  }
  return Status::OK();
}
|
||||
|
||||
Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
|
||||
CUpti_CallbackId cbid,
|
||||
const CUpti_CallbackData *cbdata) {
|
||||
if (!api_tracing_enabled_) return Status::OK(); // already unsubscribed.
|
||||
if (!cupti_driver_api_hook_) return Status::OK(); // already unsubscribed.
|
||||
if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
|
||||
if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
|
||||
if (internalCuCall) return Status::OK();
|
||||
|
||||
@ -1546,11 +1585,12 @@ Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
|
||||
// we need to populate per device annotation map respectively.
|
||||
for (int i = 0; i < num_gpus_; ++i) {
|
||||
collector_->annotation_map()->Add(i, cbdata->correlationId,
|
||||
annotation);
|
||||
annotation, "");
|
||||
}
|
||||
} else {
|
||||
absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
|
||||
collector_->annotation_map()->Add(device_id, cbdata->correlationId,
|
||||
annotation);
|
||||
annotation, nvtx_range);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
|
||||
#include "absl/types/optional.h"
|
||||
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
|
||||
#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h"
|
||||
#include "tensorflow/core/platform/errors.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/status.h"
|
||||
@ -50,6 +51,8 @@ struct CuptiTracerOptions {
|
||||
bool cupti_finalize = false;
|
||||
// Whether to call cuCtxSynchronize for each device before Stop().
|
||||
bool sync_devices_before_stop = false;
|
||||
// Whether to enable NVTX tracking; we need this for TensorRT tracking.
|
||||
bool enable_nvtx_tracking = false;
|
||||
};
|
||||
|
||||
class CuptiDriverApiHook {
|
||||
@ -111,6 +114,8 @@ class CuptiTracer {
|
||||
Status DisableActivityTracing();
|
||||
Status Finalize();
|
||||
void ConfigureActivityUnifiedMemoryCounter(bool enable);
|
||||
Status HandleNVTXCallback(CUpti_CallbackId cbid,
|
||||
const CUpti_CallbackData* cbdata);
|
||||
|
||||
int num_gpus_;
|
||||
absl::optional<CuptiTracerOptions> option_;
|
||||
|
30
tensorflow/core/profiler/internal/gpu/nvtx_utils.cc
Normal file
30
tensorflow/core/profiler/internal/gpu/nvtx_utils.cc
Normal file
@ -0,0 +1,30 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
|
||||
|
||||
#include "third_party/gpus/cuda/include/nvtx3/nvToolsExt.h"
|
||||
#include "tensorflow/core/platform/platform.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
// Returns the calling thread's stack of open NVTX ranges. The stack is a
// function-local thread_local object, so each thread gets its own instance,
// initialized the first time that thread reaches the declaration.
/*static*/ std::stack<std::string> &NVTXRangeTracker::GetRangeStack() {
  // A block-scope thread_local variable has static storage semantics per
  // thread; the explicit `static` keyword is implied.
  thread_local std::stack<std::string> per_thread_ranges;
  return per_thread_ranges;
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
58
tensorflow/core/profiler/internal/gpu/nvtx_utils.h
Normal file
58
tensorflow/core/profiler/internal/gpu/nvtx_utils.h
Normal file
@ -0,0 +1,58 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
|
||||
#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
|
||||
|
||||
#include <stack>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
/***
|
||||
* We have no intention to use NVTX in tensorflow right now, we use this class
|
||||
* to track NVTX instrumentation inside NVIDIA libraries (such as TensorRT).
|
||||
* This bears a lot of resemblance to ScopedAnnotation for now. In the future,
|
||||
* we will use TraceMe to keep track of the trace context within a thread.
|
||||
*/
|
||||
class NVTXRangeTracker {
|
||||
public:
|
||||
static void EnterRange(const std::string& range) {
|
||||
auto& range_stack = GetRangeStack();
|
||||
range_stack.push(range);
|
||||
}
|
||||
static void ExitRange() {
|
||||
auto& range_stack = GetRangeStack();
|
||||
if (!range_stack.empty()) range_stack.pop();
|
||||
}
|
||||
static const absl::string_view CurrentRange() {
|
||||
auto& range_stack = GetRangeStack();
|
||||
if (!range_stack.empty()) return range_stack.top();
|
||||
return "";
|
||||
}
|
||||
|
||||
private:
|
||||
static std::stack<std::string>& GetRangeStack();
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(NVTXRangeTracker);
|
||||
};
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_NVTX_UTILS_H_
|
@ -176,6 +176,7 @@ const StatTypeMap& GetStatTypeMap() {
|
||||
{"memalloc_details", kMemallocDetails},
|
||||
{"kernel_details", kKernelDetails},
|
||||
{"annotation", kKernelAnnotation},
|
||||
{"nvtx_range", kNVTXRange},
|
||||
{"stream", kStream},
|
||||
// Stats added when processing traces.
|
||||
{"group_id", kGroupId},
|
||||
|
@ -164,6 +164,7 @@ enum StatType {
|
||||
kMemcpyDetails,
|
||||
kMemallocDetails,
|
||||
kKernelAnnotation,
|
||||
kNVTXRange,
|
||||
kKernelDetails,
|
||||
kStream,
|
||||
// Stats added when processing traces.
|
||||
|
Loading…
Reference in New Issue
Block a user