Merge open source device tracer with Google one and move it to core/profiler/internal/gpu.
PiperOrigin-RevId: 268153734
parent c9c46b589e
commit 191f36cb17
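This commit deletes the legacy CUPTI-based device tracer (built as //tensorflow/core:device_tracer from tensorflow/core/platform sources) and rebuilds it under //tensorflow/core/profiler/internal/gpu, where it adapts CuptiTracer output into StepStats through a StepStatsCollector. As a rough illustration of depending on the relocated target after this change, a minimal sketch follows; the test name and source file are hypothetical and not part of the commit:

load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")

tf_cc_test_gpu(
    name = "my_gpu_profiler_test",       # hypothetical target name
    srcs = ["my_gpu_profiler_test.cc"],  # hypothetical source file
    deps = [
        # New location of the device tracer introduced by this commit:
        "//tensorflow/core/profiler/internal/gpu:device_tracer",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
    ],
)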
@@ -9,7 +9,6 @@ load(
 )
 load(
     "//tensorflow/core/platform:default/build_config.bzl",
-    "tf_additional_device_tracer_test_flags",
     "tf_kernel_tests_linkstatic",
 )
 load(
@@ -237,8 +236,7 @@ tf_cuda_cc_test(
     srcs = [
         "c_api_experimental_test.cc",
     ],
-    args =
-        ["--heap_check=local"] + tf_additional_device_tracer_test_flags(),
+    args = ["--heap_check=local"],
     extra_copts = tfe_xla_copts(),
     linkstatic = tf_kernel_tests_linkstatic(),
     tags = tf_cuda_tests_tags() + ["nomac"],
@@ -107,8 +107,6 @@ load(
     "//tensorflow/core/platform:default/build_config.bzl",
     "tf_additional_all_protos",
     "tf_additional_core_deps",
-    "tf_additional_device_tracer_cuda_deps",
-    "tf_additional_device_tracer_test_flags",
     "tf_additional_human_readable_json_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
@@ -3398,28 +3396,6 @@ cc_library(
     alwayslink = 1,
 )
 
-tf_cuda_library(
-    name = "device_tracer",
-    srcs = [
-        "//tensorflow/core/platform:legacy_device_tracer_srcs",
-    ],
-    copts = tf_copts(),
-    cuda_deps = tf_additional_device_tracer_cuda_deps(),
-    visibility = [
-        "//tensorflow:internal",
-    ],
-    deps = [
-        ":core_cpu_internal",
-        ":lib",
-        ":protos_all_cc",
-        "//tensorflow/core/profiler/internal:parse_annotation",
-        "//tensorflow/core/profiler/internal:profiler_interface",
-        "//tensorflow/core/profiler/lib:traceme",
-        "@com_google_absl//absl/flags:flag",
-    ],
-    alwayslink = True,
-)
-
 tf_proto_library_cc(
     name = "replay_log_proto",
     srcs = ["protobuf/replay_log.proto"],
@@ -5235,36 +5211,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test_gpu(
-    name = "device_tracer_test",
-    size = "small",
-    srcs = ["//tensorflow/core/platform:device_tracer_test.cc"],
-    args =
-        ["--heap_check=local"] + tf_additional_device_tracer_test_flags(),
-    linkstatic = tf_kernel_tests_linkstatic(),
-    tags = tf_cuda_tests_tags() + ["nomac"],
-    deps = [
-        ":all_kernels",
-        ":core_cpu",
-        ":core_cpu_internal",
-        ":device_tracer",
-        ":direct_session",
-        ":direct_session_internal",
-        ":framework",
-        ":framework_internal",
-        ":gpu_runtime",
-        ":lib",
-        ":lib_internal",
-        ":protos_all_cc",
-        ":test",
-        ":test_main",
-        ":testlib",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/profiler/internal:profiler_interface",
-    ],
-)
-
 tf_cc_tests(
     name = "common_runtime_input_colocation_exemption_registry_test",
     size = "small",
@@ -10,7 +10,6 @@
 
 load(
     "//tensorflow/core/platform:default/build_config.bzl",
-    "tf_additional_device_tracer_srcs",
     "tf_additional_lib_hdrs",
     "tf_additional_lib_srcs",
     "tf_additional_libdevice_srcs",
@@ -474,8 +473,6 @@ filegroup(
             "**/monitoring.cc",
             "**/stream_executor.h",
             "**/env_time.cc",
-            "**/device_tracer.cc",
-            "**/tpu_tracer.cc",
             "**/logger.cc",
             "**/logging.cc",
             "**/human_readable_json.cc",
@@ -573,8 +570,6 @@ filegroup(
             "**/env_time.cc",
             "**/monitoring.cc",
             "**/cuda_libdevice_path.cc",
-            "**/device_tracer.cc",
-            "**/tpu_tracer.cc",
             "**/logger.cc",
             "**/logging.cc",
             "**/human_readable_json.cc",
@@ -600,12 +595,6 @@ filegroup(
     visibility = ["//tensorflow/core:__pkg__"],
 )
 
-filegroup(
-    name = "legacy_device_tracer_srcs",
-    srcs = tf_additional_device_tracer_srcs(),
-    visibility = ["//tensorflow/core:__pkg__"],
-)
-
 filegroup(
     name = "legacy_minimal_lib_srcs",
     srcs = tf_additional_minimal_lib_srcs(),
@@ -573,24 +573,11 @@ def tf_protos_grappler():
     )
 
 def tf_additional_device_tracer_srcs():
-    return ["default/device_tracer.cc"]
+    return ["device_tracer.cc"]
 
 def tf_additional_cupti_utils_cuda_deps():
     return []
 
-def tf_additional_device_tracer_cuda_deps():
-    return [
-        "//tensorflow/stream_executor/cuda:cupti_stub",
-        "@com_google_absl//absl/base",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/container:node_hash_map",
-        "@com_google_absl//absl/container:flat_hash_map",
-    ]
-
-def tf_additional_device_tracer_test_flags():
-    return []
-
 def tf_additional_cupti_test_flags():
     return []
 
@@ -1,681 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#include <stdlib.h>
-
-#include <memory>
-
-#include "absl/base/casts.h"
-#include "absl/container/flat_hash_map.h"
-#include "absl/container/node_hash_map.h"
-#include "absl/strings/ascii.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
-#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
-#include "tensorflow/core/common_runtime/step_stats_collector.h"
-#include "tensorflow/core/framework/step_stats.pb.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/abi.h"
-#include "tensorflow/core/platform/annotation.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/profiler/internal/profiler_interface.h"
-#include "tensorflow/core/util/env_var.h"
-
-namespace tensorflow {
-namespace {
-Status ToStatus(CUptiResult result) {
-  if (result == CUPTI_SUCCESS) {
-    return Status::OK();
-  }
-  const char* str = nullptr;
-  cuptiGetResultString(result, &str);
-  return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
-}
-
-Status ToStatus(CUresult result) {
-  if (result == CUDA_SUCCESS) {
-    return Status::OK();
-  }
-  const char* str = nullptr;
-  cuGetErrorName(result, &str);
-  return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
-}
-
-void LogIfError(const Status& status) {
-  if (status.ok()) {
-    return;
-  }
-  LOG(ERROR) << status.error_message();
-}
-
-bool IsAscii(string& str) {
-  for (auto& ch : str) {
-    if (!absl::ascii_isascii(ch)) {
-      return false;
-    }
-  }
-  return true;
-}
-
-struct KernelRecord {
-  const char* kernel_name;
-  // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
-  // record the stream and infer the context during collection.
-  CUcontext context;
-  CUstream stream;
-  CUevent start_event;
-  CUevent stop_event;
-  const std::string* annotation;
-};
-
-struct MemcpyRecord {
-  CUmemorytype src_type;
-  CUmemorytype dst_type;
-  size_t size_bytes;
-  CUcontext context;
-  CUstream stream;
-  CUevent start_event;
-  CUevent stop_event;
-  const std::string* annotation;
-};
-
-Status CreateAndRecordEvent(CUevent* event, CUstream stream) {
-  TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
-  return ToStatus(cuEventRecord(*event, stream));
-}
-
-// Stores a series of kernel and memcpy records.
-class CudaEventRecorder {
- public:
-  // Registers the start of a kernel launch. The returned index should be passed
-  // to StopKernel() after the kernel launch has completed.
-  size_t StartKernel(const char* kernel_name, CUcontext context,
-                     CUstream stream) {
-    KernelRecord record = {kernel_name, context, stream};
-    LogIfError(CreateAndRecordEvent(&record.start_event, stream));
-    mutex_lock lock(mutex_);
-    if (tracing::ScopedAnnotation::IsEnabled()) {
-      record.annotation =
-          &*annotations_.emplace(Annotation::CurrentAnnotation()).first;
-    }
-    kernel_records_.push_back(record);
-    return kernel_records_.size() - 1;
-  }
-  void StopKernel(size_t index) {
-    mutex_lock lock(mutex_);
-    auto& record = kernel_records_[index];
-    LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
-  }
-
-  // Registers the start of a copy operation. The returned index should be
-  // passed to StopMemcpy() after the kernel launch has completed.
-  size_t StartMemcpy(CUmemorytype src_type, CUmemorytype dst_type,
-                     size_t size_bytes, CUcontext context, CUstream stream) {
-    MemcpyRecord record = {src_type, dst_type, size_bytes, context, stream};
-    LogIfError(CreateAndRecordEvent(&record.start_event, stream));
-    mutex_lock lock(mutex_);
-    if (tracing::ScopedAnnotation::IsEnabled()) {
-      record.annotation =
-          &*annotations_.emplace(Annotation::CurrentAnnotation()).first;
-    }
-    memcpy_records_.push_back(record);
-    return memcpy_records_.size() - 1;
-  }
-  void StopMemcpy(size_t index) {
-    mutex_lock lock(mutex_);
-    auto& record = memcpy_records_[index];
-    LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
-  }
-
-  std::vector<KernelRecord> ConsumeKernelRecords() {
-    mutex_lock lock(mutex_);
-    return std::move(kernel_records_);
-  }
-  std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
-    mutex_lock lock(mutex_);
-    return std::move(memcpy_records_);
-  }
-
- private:
-  mutex mutex_;
-  std::unordered_set<std::string> annotations_ GUARDED_BY(mutex_);
-  std::vector<KernelRecord> kernel_records_ GUARDED_BY(mutex_);
-  std::vector<MemcpyRecord> memcpy_records_ GUARDED_BY(mutex_);
-};
-
-// Instances register callbacks with CUPTI to notify the event recorder before
-// and after kernel launches and memory copies.
-class CuptiCallbackHook {
- public:
-  CuptiCallbackHook() : subscriber_(nullptr) {}
-
-  Status Enable(CudaEventRecorder* recorder) {
-    TF_RETURN_IF_ERROR(
-        ToStatus(cuptiSubscribe(&subscriber_, &CuptiCallback, recorder)));
-    for (auto cbid : {CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
-                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2}) {
-      TF_RETURN_IF_ERROR(ToStatus(cuptiEnableCallback(
-          /*enable=*/1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid)));
-    }
-    return Status::OK();
-  }
-
-  ~CuptiCallbackHook() { LogIfError(ToStatus(cuptiUnsubscribe(subscriber_))); }
-
- private:
-  static void CUPTIAPI CuptiCallback(void* userdata,
-                                     CUpti_CallbackDomain domain,
-                                     CUpti_CallbackId cbid,
-                                     const void* cbdata) {
-    auto recorder = static_cast<CudaEventRecorder*>(userdata);
-    auto data = static_cast<const CUpti_CallbackData*>(cbdata);
-    DCHECK_EQ(domain, CUPTI_CB_DOMAIN_DRIVER_API);
-
-    if (data->callbackSite == CUPTI_API_ENTER) {
-      DriverApiEnterCallback(cbid, *data, recorder);
-    } else {
-      DriverApiExitCallback(cbid, *data, recorder);
-    }
-  }
-
-  static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
-    CUmemorytype mem_type;
-    auto status =
-        cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
-    if (status == CUDA_ERROR_INVALID_VALUE) {
-      // Pointer not registered with CUDA, must be host memory.
-      return CU_MEMORYTYPE_HOST;
-    }
-    LogIfError(ToStatus(status));
-    return mem_type;
-  }
-
-  template <typename T>
-  static void StartMemcpy(CUmemorytype src_type, CUmemorytype dst_type,
-                          const CUpti_CallbackData& cbdata,
-                          CudaEventRecorder* recorder) {
-    auto params = static_cast<const T*>(cbdata.functionParams);
-    *cbdata.correlationData = recorder->StartMemcpy(
-        src_type, dst_type, params->ByteCount, cbdata.context, nullptr);
-  }
-  template <typename T>
-  static void StartMemcpyAsync(CUmemorytype src_type, CUmemorytype dst_type,
-                               const CUpti_CallbackData& cbdata,
-                               CudaEventRecorder* recorder) {
-    auto params = static_cast<const T*>(cbdata.functionParams);
-    *cbdata.correlationData = recorder->StartMemcpy(
-        src_type, dst_type, params->ByteCount, cbdata.context, params->hStream);
-  }
-
-  static void DriverApiEnterCallback(CUpti_CallbackId cbid,
-                                     const CUpti_CallbackData& cbdata,
-                                     CudaEventRecorder* recorder) {
-    switch (cbid) {
-      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
-        DCHECK_NE(cbdata.symbolName, nullptr);
-        auto params =
-            static_cast<const cuLaunchKernel_params*>(cbdata.functionParams);
-        *cbdata.correlationData = recorder->StartKernel(
-            cbdata.symbolName, cbdata.context, params->hStream);
-        return;
-      }
-
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
-        auto params =
-            static_cast<const cuMemcpy_params*>(cbdata.functionParams);
-        return StartMemcpy<cuMemcpy_params>(GetMemoryType(params->src),
-                                            GetMemoryType(params->dst), cbdata,
-                                            recorder);
-      }
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
-        auto params =
-            static_cast<const cuMemcpyAsync_params*>(cbdata.functionParams);
-        return StartMemcpyAsync<cuMemcpyAsync_params>(
-            GetMemoryType(params->src), GetMemoryType(params->dst), cbdata,
-            recorder);
-      }
-
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
-        return StartMemcpy<cuMemcpyHtoD_v2_params>(
-            CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
-
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
-        return StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
-            CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
-
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
-        return StartMemcpy<cuMemcpyDtoH_v2_params>(
-            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST, cbdata, recorder);
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
-        return StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
-            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST, cbdata, recorder);
-
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
-        return StartMemcpy<cuMemcpyDtoD_v2_params>(
-            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
-        return StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
-            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
-
-      default:
-        LOG(ERROR) << "Unexpected callback id: " << cbid;
-    }
-  }
-
-  static void DriverApiExitCallback(CUpti_CallbackId cbid,
-                                    const CUpti_CallbackData& cbdata,
-                                    CudaEventRecorder* recorder) {
-    switch (cbid) {
-      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
-        recorder->StopKernel(*cbdata.correlationData);
-        break;
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
-      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
-        recorder->StopMemcpy(*cbdata.correlationData);
-        break;
-      default:
-        LOG(ERROR) << "Unexpected callback id: " << cbid;
-    }
-  }
-
-  CUpti_SubscriberHandle subscriber_;
-};
-
-// 'GpuTracer' is an interface for collecting low-level execution timings
-// of hardware accelerator (e.g. GPU) computation and DMA transfers.
-class GpuTracer : public profiler::ProfilerInterface {
- public:
-  GpuTracer();
-  ~GpuTracer() override;
-
-  // ProfilerInterface interface:
-  Status Start() override;
-  Status Stop() override;
-  // Collect trace results. Results are added to the specified
-  // StepStatsCollector. Does not clear any existing stats.
-  // It is an error to call 'Collect' while a trace is running.
-  Status CollectData(RunMetadata* run_metadata) override;
-  profiler::DeviceType GetDeviceType() override {
-    return profiler::DeviceType::kGpu;
-  }
-
- private:
-  std::unique_ptr<CudaEventRecorder> recorder_;
-  std::unique_ptr<CuptiCallbackHook> cupti_hook_;
-
-  mutex mu_;
-  bool enabled_ GUARDED_BY(mu_);
-};
-
-GpuTracer::GpuTracer() : recorder_(new CudaEventRecorder()), enabled_(false) {
-  VLOG(1) << "GpuTracer created.";
-}
-
-GpuTracer::~GpuTracer() {
-  // Unregister the CUPTI callbacks if needed to prevent them from accessing
-  // freed memory.
-  Stop().IgnoreError();
-}
-
-Status GpuTracer::Start() {
-  VLOG(1) << "GpuTracer::Start";
-  mutex_lock l(mu_);
-  if (enabled_) {
-    return errors::FailedPrecondition("GpuTracer is already enabled.");
-  }
-  cupti_hook_.reset(new CuptiCallbackHook());
-  TF_RETURN_IF_ERROR(cupti_hook_->Enable(recorder_.get()));
-
-  tracing::ScopedAnnotation::Enable(true);
-
-  enabled_ = true;
-  return Status::OK();
-}
-
-Status GpuTracer::Stop() {
-  VLOG(1) << "GpuTracer::Stop";
-  mutex_lock l(mu_);
-  if (!enabled_) {
-    return Status::OK();
-  }
-  cupti_hook_.reset();
-  tracing::ScopedAnnotation::Enable(false);
-
-  enabled_ = false;
-  return Status::OK();
-}
-
-class CudaEventCollector {
-  struct DeviceInfo {
-    int ordinal;
-    std::string name;
-    int num_contexts;
-  };
-
-  struct ContextInfo {
-    int index;
-    const DeviceInfo* dev_info;
-    int num_streams;
-    CUevent end_event;
-  };
-
-  struct StreamInfo {
-    std::string name;
-    int index;  // 0 is reserved for null stream.
-    const ContextInfo* ctx_info;
-  };
-
-  // Include context in key to distinguish null streams.
-  using StreamKey = std::pair<CUcontext, CUstream>;
-
-  CudaEventCollector(CudaEventRecorder* recorder, StepStatsCollector* collector)
-      : recorder_(recorder), collector_(collector) {
-    DCHECK(recorder != nullptr);
-    DCHECK(collector != nullptr);
-  }
-
-  // Populates device_infos_ from all devices.
-  Status InitializeDeviceInfos() {
-    int count;
-    TF_RETURN_IF_ERROR(ToStatus(cuDeviceGetCount(&count)));
-    for (int ordinal = 0; ordinal < count; ++ordinal) {
-      CUdevice device;
-      TF_RETURN_IF_ERROR(ToStatus(cuDeviceGet(&device, ordinal)));
-      char name[100];
-      TF_RETURN_IF_ERROR(ToStatus(cuDeviceGetName(name, sizeof(name), device)));
-      device_infos_[device] = {ordinal, name};
-    }
-    return Status::OK();
-  }
-
-  // Returns element from context_infos_, adding it if not yet present.
-  Status GetContextInfo(CUcontext context, ContextInfo** ctx_info_ptr) {
-    auto it = context_infos_.find(context);
-
-    if (it == context_infos_.end()) {
-      TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(context)));
-      CUdevice device;
-      TF_RETURN_IF_ERROR(ToStatus(cuCtxGetDevice(&device)));
-
-      auto& dev_info = device_infos_[device];
-      ContextInfo ctx_info = {dev_info.num_contexts++, &dev_info};
-      it = context_infos_.emplace(context, ctx_info).first;
-    }
-
-    *ctx_info_ptr = &it->second;
-    return Status::OK();
-  }
-
-  // Adds element to stream_infos_ if not yet present. If present, clear name
-  // if it doesn't match parameter.
-  Status AddStreamInfo(CUcontext context, CUstream stream,
-                       absl::string_view name) {
-    StreamKey key(context, stream);
-    auto it = stream_infos_.find(key);
-    if (it != stream_infos_.end()) {
-      if (it->second.name != name) {
-        it->second.name.clear();  // Stream with inconsistent names, clear it.
-      }
-      return Status::OK();
-    }
-
-    ContextInfo* ctx_info;
-    TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
-    int index = stream ? ++ctx_info->num_streams : 0;
-    StreamInfo stream_info = {static_cast<std::string>(name), index, ctx_info};
-    stream_infos_.emplace(key, stream_info);
-    return Status::OK();
-  }
-
-  // Returns string describing source and destination memory types.
-  static std::string GetMemcpyName(const MemcpyRecord& record) {
-    auto get_memory_type = [](CUmemorytype mem_type) {
-      switch (mem_type) {
-        case CU_MEMORYTYPE_HOST:
-          return 'H';
-        case CU_MEMORYTYPE_DEVICE:
-          return 'D';
-        case CU_MEMORYTYPE_ARRAY:
-          return 'A';
-        case CU_MEMORYTYPE_UNIFIED:
-          return 'U';
-        default:
-          LOG(ERROR) << "Unknown memory type: " << mem_type;
-          return '?';
-      }
-    };
-    return absl::StrFormat("Memcpy%cto%c", get_memory_type(record.src_type),
-                           get_memory_type(record.dst_type));
-  }
-
-  // Returns time in microseconds between events recorded on the GPU.
-  static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
-    float elapsed_ms = 0.0f;
-    LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
-    return static_cast<uint64>(
-        std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
-  }
-
-  // Synchronizes all contexts.
-  Status Synchronize() const {
-    for (const auto& pair : context_infos_) {
-      TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
-      TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
-    }
-    return Status::OK();
-  }
-
-  // Save stats to collector;
-  Status SaveStats(std::unique_ptr<NodeExecStats> stats,
-                   const StreamInfo& stream_info) const {
-    auto ctx_info = stream_info.ctx_info;
-    auto dev_info = ctx_info->dev_info;
-    // TODO(csigg): tfprof_node.cc, run_metadata_test.py, and timeline_test.py
-    // currently require this particular formatting.
-    collector_->Save(
-        absl::StrFormat("/device:GPU:%d/stream:all", dev_info->ordinal),
-        new NodeExecStats(*stats));
-    auto name = absl::StrFormat("/gpu:%d (%s)/context#%d/", dev_info->ordinal,
-                                dev_info->name, ctx_info->index);
-    if (stream_info.index) {
-      absl::StrAppend(&name, "stream#", std::to_string(stream_info.index));
-    } else {
-      absl::StrAppend(&name, "null stream");
-    }
-    if (!stream_info.name.empty()) {
-      absl::StrAppend(&name, ":", stream_info.name);
-    }
-    collector_->Save(name, stats.release());
-    return Status::OK();
-  }
-
-  Status SaveRecord(const KernelRecord& record) const {
-    if (!record.start_event || !record.stop_event) {
-      return Status::OK();
-    }
-    const auto& stream_info =
-        stream_infos_.at(StreamKey(record.context, record.stream));
-    auto start_us =
-        GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
-    auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
-
-    auto stats = absl::make_unique<NodeExecStats>();
-    std::string node_name = port::MaybeAbiDemangle(record.kernel_name);
-    // Sometimes CUPTI returns invalid characters. See b/129892466.
-    if (!IsAscii(node_name)) {
-      node_name = "<invalid_name>";
-    }
-    if (record.annotation) {
-      node_name = absl::StrCat(*record.annotation, "@@", node_name);
-    }
-    stats->set_node_name(node_name);
-    // TODO(csigg): Report grid size?
-    std::string node_label;
-    stats->set_timeline_label(node_label);
-    stats->set_all_start_micros(end_walltime_us_ - start_us);
-    stats->set_op_end_rel_micros(elapsed_us);
-    stats->set_all_end_rel_micros(elapsed_us);
-    return SaveStats(std::move(stats), stream_info);
-  }
-
-  Status SaveRecord(const MemcpyRecord& record) const {
-    if (!record.start_event || !record.stop_event) {
-      return Status::OK();
-    }
-    const auto& stream_info =
-        stream_infos_.at(StreamKey(record.context, record.stream));
-    auto start_us =
-        GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
-    auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
-
-    auto stats = absl::make_unique<NodeExecStats>();
-    std::string node_name = GetMemcpyName(record);
-    // Sometimes CUPTI returns invalid characters. See b/129892466.
-    if (!IsAscii(node_name)) {
-      node_name = "<invalid_name>";
-    }
-    if (record.annotation) {
-      node_name = absl::StrCat(*record.annotation, "@@", node_name);
-    }
-    stats->set_node_name(node_name);
-    // TODO(csigg): Show label in Chrome trace viewer.
-    std::string node_label = absl::StrFormat("%d bytes", record.size_bytes);
-    stats->set_timeline_label(node_label);
-    stats->set_all_start_micros(end_walltime_us_ - start_us);
-    stats->set_op_end_rel_micros(elapsed_us);
-    stats->set_all_end_rel_micros(elapsed_us);
-    return SaveStats(std::move(stats), stream_info);
-  }
-
-  Status Collect() {
-    TF_RETURN_IF_ERROR(InitializeDeviceInfos());
-
-    auto kernel_records = recorder_->ConsumeKernelRecords();
-    auto memcpy_records = recorder_->ConsumeMemcpyRecords();
-    LOG(INFO) << "Collecting " << kernel_records.size() << " kernel records, "
-              << memcpy_records.size() << " memcpy records.";
-
-    // Gather all profiled streams and contexts.
-    for (const auto& record : kernel_records) {
-      TF_RETURN_IF_ERROR(
-          AddStreamInfo(record.context, record.stream, "Kernel"));
-    }
-    for (const auto& record : memcpy_records) {
-      TF_RETURN_IF_ERROR(
-          AddStreamInfo(record.context, record.stream, GetMemcpyName(record)));
-    }
-
-    // Synchronize all contexts, record end events, synchronize again.
-    TF_RETURN_IF_ERROR(Synchronize());
-    for (auto& pair : context_infos_) {
-      TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
-      TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
-    }
-    TF_RETURN_IF_ERROR(Synchronize());
-    end_walltime_us_ = Env::Default()->NowMicros();
-
-    for (const auto& record : kernel_records) {
-      TF_RETURN_IF_ERROR(SaveRecord(record));
-    }
-    for (const auto& record : memcpy_records) {
-      TF_RETURN_IF_ERROR(SaveRecord(record));
-    }
-
-    return Status::OK();
-  }
-
- public:
-  // Consumes the records in recorder and saves them to the collector.
-  static Status Collect(CudaEventRecorder* recorder,
-                        StepStatsCollector* collector) {
-    CUcontext context;
-    TF_RETURN_IF_ERROR(ToStatus(cuCtxGetCurrent(&context)));
-    auto status = CudaEventCollector(recorder, collector).Collect();
-    TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(context)));
-    return status;
-  }
-
- private:
-  CudaEventRecorder* recorder_;
-  StepStatsCollector* collector_;
-
-  absl::node_hash_map<CUdevice, DeviceInfo> device_infos_;
-  absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
-  absl::flat_hash_map<StreamKey, StreamInfo, hash<StreamKey>> stream_infos_;
-  int64 end_walltime_us_;
-};
-
-Status GpuTracer::CollectData(RunMetadata* run_metadata) {
-  mutex_lock l(mu_);
-  if (enabled_) {
-    return errors::FailedPrecondition("GpuTracer is still enabled.");
-  }
-
-  StepStatsCollector step_stats_collector(run_metadata->mutable_step_stats());
-  TF_RETURN_IF_ERROR(
-      CudaEventCollector::Collect(recorder_.get(), &step_stats_collector));
-  step_stats_collector.Finalize();
-  return Status::OK();
-}
-}  // namespace
-
-// Not in anonymous namespace for testing purposes.
-std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
-    const profiler::ProfilerOptions& options) {
-  auto status = cuInit(0);
-  if (status != CUDA_SUCCESS) {
-    LogIfError(ToStatus(status));
-    return nullptr;
-  }
-  if (options.device_type != profiler::DeviceType::kGpu &&
-      options.device_type != profiler::DeviceType::kUnspecified)
-    return nullptr;
-  return absl::make_unique<GpuTracer>();
-}
-
-auto register_device_tracer_factory = [] {
-  bool enable;
-  TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_OSS_GPU_PROFILER", true, &enable));
-  if (enable) {
-    RegisterProfilerFactory(&CreateGpuTracer);
-  }
-  return 0;
-}();
-
-}  // namespace tensorflow
-#endif  // GOOGLE_CUDA
@@ -426,7 +426,6 @@ tf_cc_test(
     deps = [
        "//tensorflow/core:core_cpu",
        "//tensorflow/core:core_cpu_internal",
-       "//tensorflow/core:device_tracer",
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
@@ -1,6 +1,19 @@
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_copts",
+    "tf_cuda_library",
+    "if_cuda_is_configured_compat",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
 load(
     "//tensorflow/core/platform:default/build_config.bzl",
     "tf_additional_cupti_utils_cuda_deps",
+    "tf_additional_device_tracer_srcs",
+    "tf_kernel_tests_linkstatic",
+)
+load(
+    "//tensorflow/core/platform:default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
 )
 
 package(
@@ -8,16 +21,54 @@ package(
     licenses = ["notice"],  # Apache 2.0
 )
 
-alias(
+tf_cuda_library(
     name = "device_tracer",
-    actual = "//tensorflow/core:device_tracer",
+    srcs = tf_additional_device_tracer_srcs(),
+    copts = tf_copts(),
+    cuda_deps = [
+        "//tensorflow/core/profiler/internal/gpu:cupti_tracer",
+        "//tensorflow/core/profiler/internal/gpu:cupti_wrapper",
+    ],
+    deps = [
+        ":cupti_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/profiler/internal:parse_annotation",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler/lib:traceme",
+        "@com_google_absl//absl/flags:flag",
+    ],
+    alwayslink = 1,
 )
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_copts",
-    "tf_cuda_library",
-    "if_cuda_is_configured_compat",
+tf_cc_test_gpu(
+    name = "device_tracer_test",
+    size = "small",
+    srcs = ["device_tracer_test.cc"],
+    args = ["--heap_check=local"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags() + ["nomac"],
+    deps = [
+        ":device_tracer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:direct_session_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+    ],
 )
 
 tf_cuda_library(
@@ -52,10 +103,12 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cupti_interface",
+        ":cupti_utils",
        "//tensorflow/core:lib",
        "//tensorflow/core/platform:annotation",
        "@com_google_absl//absl/container:fixed_array",
        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:node_hash_map",
        "@com_google_absl//absl/container:node_hash_set",
        "@com_google_absl//absl/types:optional",
     ],
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/container/node_hash_map.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/abi.h"
 #include "tensorflow/core/platform/annotation.h"
 #include "tensorflow/core/platform/env.h"
@@ -904,7 +905,7 @@ class CudaEventRecorder {
   CuptiInterface *cupti_interface_;
   CuptiTraceCollector *collector_;
   const int ordinal_;
-  string device_name_;
+  std::string device_name_;
   uint64 end_walltime_us_;
   // Include context in key to distinguish null streams.
   using StreamKey = std::pair<CUcontext, CUstream>;
@@ -1172,7 +1173,7 @@ const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
 }
 
 void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
-                        const string &annotation) {
+                        const std::string &annotation) {
   if (annotation.empty()) return;
   VLOG(3) << "Add annotation: device_id: " << device_id
           << " correlation_id: " << correlation_id
@@ -1386,7 +1387,7 @@ Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
         device_id, domain, cbid, cbdata));
   } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
     // Set up the map from correlation id to annotation string.
-    const string &annotation = tensorflow::Annotation::CurrentAnnotation();
+    const std::string &annotation = tensorflow::Annotation::CurrentAnnotation();
     if (!annotation.empty()) {
       annotation_map_->Add(device_id, cbdata->correlationId, annotation);
     }
@@ -159,7 +159,8 @@ class CuptiTraceCollector {
   virtual ~CuptiTraceCollector() {}
 
   virtual void AddEvent(CuptiTracerEvent&& event) = 0;
-  virtual void OnEventsDropped(const string& reason, uint32 num_events) = 0;
+  virtual void OnEventsDropped(const std::string& reason,
+                               uint32 num_events) = 0;
   virtual void Flush() = 0;
 
  protected:
@@ -173,7 +174,8 @@ class AnnotationMap {
  public:
   explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
       : max_size_(max_size), per_device_map_(num_gpus) {}
-  void Add(uint32 device_id, uint32 correlation_id, const string& annotation);
+  void Add(uint32 device_id, uint32 correlation_id,
+           const std::string& annotation);
   absl::string_view LookUp(uint32 device_id, uint32 correlation_id);
 
  private:
@@ -183,7 +185,7 @@ class AnnotationMap {
     absl::Mutex mutex;
     // Annotation tends to be repetitive, use a hash_set to store the strings,
     // an use the reference to the string in the map.
-    absl::node_hash_set<string> annotations;
+    absl::node_hash_set<std::string> annotations;
     absl::flat_hash_map<uint32, absl::string_view> correlation_map;
   };
   const uint64 max_size_;
tensorflow/core/profiler/internal/gpu/device_tracer.cc (new file, 395 lines)
@@ -0,0 +1,395 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include <stdlib.h>
+
+#include <memory>
+
+#include "absl/container/fixed_array.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/annotation.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stringprintf.h"
+#include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
+#include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h"
+#include "tensorflow/core/profiler/internal/parse_annotation.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+#include "tensorflow/core/util/env_var.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Adapter from CuptiTraceCollector to StepStatsCollector: This class convert
+// and filter from CuptiTracerEvent to tensorflow::NodeExecStats.
+// We can not just forward event on the fly because StepStatsCollector have
+// a single mutex for all devices, Therefore we will cache events and forward
+// only when Flush().
+class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector {
+ public:
+  StepStatsCuptiTracerAdaptor(const CuptiTracerCollectorOptions& option,
+                              const std::string prefix, int num_gpus,
+                              uint64 start_walltime_ns, uint64 start_gpu_ns,
+                              StepStatsCollector* trace_collector)
+      : CuptiTraceCollector(option),
+        trace_collector_(trace_collector),
+        num_callback_events_(0),
+        num_activity_events_(0),
+        start_walltime_ns_(start_walltime_ns),
+        start_gpu_ns_(start_gpu_ns),
+        num_gpus_(num_gpus),
+        per_device_adaptor_(num_gpus) {
+    for (int i = 0; i < num_gpus; ++i) {  // for each device id.
+      per_device_adaptor_[i].stream_device =
+          strings::StrCat(prefix, "/device:GPU:", i, "/stream:");
+      per_device_adaptor_[i].memcpy_device =
+          strings::StrCat(prefix, "/device:GPU:", i, "/memcpy");
+      per_device_adaptor_[i].sync_device =
+          strings::StrCat(prefix, "/device:GPU:", i, "/sync");
+    }
+  }
+
+  void AddEvent(CuptiTracerEvent&& event) override {
+    if (event.device_id >= num_gpus_) return;
+    if (event.source == CuptiTracerEventSource::DriverCallback) {
+      if (num_callback_events_ > options_.max_callback_api_events) {
+        OnEventsDropped("trace collector", 1);
+        return;
+      }
+      num_callback_events_++;
+    } else {
+      if (num_activity_events_ > options_.max_activity_api_events) {
+        OnEventsDropped("trace collector", 1);
+        return;
+      }
+      num_activity_events_++;
+    }
+    per_device_adaptor_[event.device_id].AddEvent(std::move(event));
+  }
+  void OnEventsDropped(const string& reason, uint32 num_events) override {}
+  void Flush() override {
+    LOG(INFO) << " GpuTracer has collected " << num_callback_events_
+              << " callback api events and " << num_activity_events_
+              << " activity events.";
+    for (int i = 0; i < num_gpus_; ++i) {
+      per_device_adaptor_[i].Flush(trace_collector_, start_walltime_ns_,
+                                   start_gpu_ns_);
+    }
+  }
+
+ private:
+  StepStatsCollector* trace_collector_;
+  std::atomic<int> num_callback_events_;
+  std::atomic<int> num_activity_events_;
+  uint64 start_walltime_ns_;
+  uint64 start_gpu_ns_;
+  int num_gpus_;
+
+  struct CorrelationInfo {
+    CorrelationInfo(uint32 t, uint32 e) : thread_id(t), enqueue_time_ns(e) {}
+    uint32 thread_id;
+    uint64 enqueue_time_ns;
+  };
+  struct PerDeviceAdaptor {
+    void AddEvent(CuptiTracerEvent&& event) {
+      absl::MutexLock lock(&mutex);
+      if (event.source == CuptiTracerEventSource::DriverCallback) {
+        // Cupti api callcack events were used to populate launch times etc.
+        if (event.name == "cuStreamSynchronize") {
+          events.emplace_back(std::move(event));
+        }
+        if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
+          correlation_info.insert(
+              {event.correlation_id,
+               CorrelationInfo(event.thread_id, event.start_time_ns)});
+        }
+      } else {
+        // Cupti activity events measure device times etc.
+        events.emplace_back(std::move(event));
+      }
+    }
+    void Flush(StepStatsCollector* collector, uint64 start_walltime_ns,
+               uint64 start_gpu_ns) {
+      absl::MutexLock lock(&mutex);
+      for (auto& event : events) {
+        NodeExecStats* ns = new NodeExecStats;
+        ns->set_all_start_micros(
+            (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
+        ns->set_op_start_rel_micros(0);
+        auto elapsed_ns = event.end_time_ns - event.start_time_ns;
+        ns->set_op_end_rel_micros(elapsed_ns / 1000);
+        ns->set_all_end_rel_micros(elapsed_ns / 1000);
+
+        if (event.source == CuptiTracerEventSource::DriverCallback) {
+          DCHECK_EQ(event.name, "cuStreamSynchronize");
+          ns->set_node_name(string(event.name));
+          ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
+          ns->set_thread_id(event.thread_id);
+          collector->Save(sync_device, ns);
+        } else {  // CuptiTracerEventSource::Activity
+          // Get launch information if available.
+          if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
+            auto it = correlation_info.find(event.correlation_id);
+            if (it != correlation_info.end()) {
+              ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000);
+              ns->set_thread_id(it->second.thread_id);
+            }
+          }
+
+          auto annotation_stack = ParseAnnotationStack(event.annotation);
+          absl::string_view activity_name = !annotation_stack.empty()
+                                                ? annotation_stack.back().name
+                                                : event.name;
+          ns->set_node_name(string(activity_name));
+          switch (event.type) {
+            case CuptiTracerEventType::Kernel: {
+              const string details = strings::Printf(
+                  "regs:%llu shm:%llu grid:%llu,%llu,%llu block:%llu,%llu,%llu",
+                  event.kernel_info.registers_per_thread,
+                  event.kernel_info.static_shared_memory_usage,
+                  event.kernel_info.grid_x, event.kernel_info.grid_y,
+                  event.kernel_info.grid_z, event.kernel_info.block_x,
+                  event.kernel_info.block_y, event.kernel_info.block_z);
+              ns->set_timeline_label(absl::StrCat(activity_name, " ", details));
+              auto nscopy = new NodeExecStats(*ns);
+              collector->Save(absl::StrCat(stream_device, "all"), ns);
+              collector->Save(absl::StrCat(stream_device, event.stream_id),
+                              nscopy);
+              break;
+            }
+            case CuptiTracerEventType::MemcpyH2D:
+            case CuptiTracerEventType::MemcpyD2H:
+            case CuptiTracerEventType::MemcpyD2D:
+            case CuptiTracerEventType::MemcpyP2P: {
+              string details = absl::StrCat(
+                  activity_name, " bytes:", event.memcpy_info.num_bytes);
+              if (event.memcpy_info.async) {
+                absl::StrAppend(&details, " aync");
+              }
+              if (event.memcpy_info.destination != event.device_id) {
+                absl::StrAppend(&details,
+                                " to device:", event.memcpy_info.destination);
+              }
+              ns->set_timeline_label(std::move(details));
+              auto nscopy = new NodeExecStats(*ns);
+              collector->Save(memcpy_device, ns);
+              collector->Save(
+                  absl::StrCat(stream_device, event.stream_id, "<",
+                               GetTraceEventTypeName(event.type), ">"),
+                  nscopy);
+              break;
+            }
+            default:
+              ns->set_timeline_label(string(activity_name));
+              collector->Save(stream_device, ns);
+          }
+        }
+      }
+    }
+
+    absl::Mutex mutex;
+    std::string stream_device GUARDED_BY(mutex);
+    std::string memcpy_device GUARDED_BY(mutex);
+    std::string sync_device GUARDED_BY(mutex);
+    std::vector<CuptiTracerEvent> events GUARDED_BY(mutex);
+    absl::flat_hash_map<uint32, CorrelationInfo> correlation_info
+        GUARDED_BY(mutex);
+  };
+  absl::FixedArray<PerDeviceAdaptor> per_device_adaptor_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StepStatsCuptiTracerAdaptor);
+};
+
+// GpuTracer for GPU.
+class GpuTracer : public profiler::ProfilerInterface {
+ public:
+  GpuTracer(CuptiTracer* cupti_tracer, CuptiInterface* cupti_interface)
+      : cupti_tracer_(cupti_tracer),
+        cupti_interface_(cupti_interface),
+        trace_collector_(&step_stats_) {
+    VLOG(1) << "GpuTracer created.";
+  }
+  ~GpuTracer() override {}
+
+  // GpuTracer interface:
+  Status Start() override;
+  Status Stop() override;
+  Status CollectData(RunMetadata* run_metadata) override;
+  profiler::DeviceType GetDeviceType() override {
+    return profiler::DeviceType::kGpu;
+  }
+
+ private:
+  Status DoStart();
+  Status DoStop();
+
+  enum State {
+    kNotStarted,
+    kStartedOk,
+    kStartedError,
+    kStoppedOk,
+    kStoppedError
+  };
+  State profiling_state_ = State::kNotStarted;
+
+  CuptiTracer* cupti_tracer_;
+  CuptiTracerOptions options_;
+  CuptiInterface* cupti_interface_;
+  StepStats step_stats_;
+  StepStatsCollector trace_collector_;
+  std::unique_ptr<StepStatsCuptiTracerAdaptor> step_stats_cupti_adaptor_;
+};
+
+Status GpuTracer::DoStart() {
+  if (!cupti_tracer_->IsAvailable()) {
+    return errors::Unavailable("Another profile session running.");
+  }
+
+  options_.cbids_selected = {
+      // KERNEL
+      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
+      // MEMCPY
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2,
+      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2,
+      // GENERIC
+      CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize,
+  };
+
+  bool trace_concurrent_kernels = false;
+  ReadBoolFromEnvVar("TF_GPU_CUPTI_FORCE_CONCURRENT_KERNEL", false,
+                     &trace_concurrent_kernels)
+      .IgnoreError();
+  options_.activities_selected.push_back(
+      trace_concurrent_kernels ? CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
+                               : CUPTI_ACTIVITY_KIND_KERNEL);
+  options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY);
+  options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
+  options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_OVERHEAD);
+
+#if CUDA_VERSION < 10000
+  if (!trace_concurrent_kernels) options_.cupti_finalize = true;
+#endif
+
+  CuptiTracerCollectorOptions collector_options;
+  uint64 start_gputime_ns = CuptiTracer::GetTimestamp();
+  uint64 start_walltime_ns = tensorflow::EnvTime::Default()->NowNanos();
+  int num_gpus = cupti_tracer_->NumGpus();
+  step_stats_cupti_adaptor_ = absl::make_unique<StepStatsCuptiTracerAdaptor>(
+      collector_options, "", num_gpus, start_walltime_ns, start_gputime_ns,
+      &trace_collector_);
+
+  tensorflow::tracing::ScopedAnnotation::Enable(true);
+  cupti_tracer_->Enable(options_, cupti_interface_,
+                        step_stats_cupti_adaptor_.get());
+  return Status::OK();
+}
+
+Status GpuTracer::Start() {
+  Status status = DoStart();
+  if (status.ok()) {
+    profiling_state_ = State::kStartedOk;
+    return Status::OK();
+  } else {
+    profiling_state_ = State::kStartedError;
+    return status;
+  }
+}
+
+Status GpuTracer::DoStop() {
+  cupti_tracer_->Disable();
+  tensorflow::tracing::ScopedAnnotation::Enable(false);
+  return Status::OK();
+}
+
+Status GpuTracer::Stop() {
+  if (profiling_state_ == State::kStartedOk) {
+    Status status = DoStop();
+    profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError;
+  }
+  return Status::OK();
+}
+
+Status GpuTracer::CollectData(RunMetadata* run_metadata) {
+  switch (profiling_state_) {
+    case State::kNotStarted:
+      VLOG(1) << "No trace data collected, session wasn't started";
+      return Status::OK();
+    case State::kStartedOk:
+      return errors::FailedPrecondition("Cannot collect trace before stopping");
+    case State::kStartedError:
+      LOG(ERROR) << "Cannot collect, xprof failed to start";
+      return Status::OK();
+    case State::kStoppedError:
+      VLOG(1) << "No trace data collected";
+      return Status::OK();
+    case State::kStoppedOk: {
+      // Input run_metadata is shared by profiler interfaces, we need append.
+      trace_collector_.Finalize();
+      for (auto& dev_stats : *step_stats_.mutable_dev_stats()) {
+        run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
+      }
+      return Status::OK();
+    }
+  }
+  return errors::Internal("Invalid profiling state: ", profiling_state_);
+}
+
+}  // namespace profiler
+
+// Not in anonymous namespace for testing purposes.
+std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
+    const profiler::ProfilerOptions& options) {
+  if (options.device_type != profiler::DeviceType::kGpu &&
+      options.device_type != profiler::DeviceType::kUnspecified)
+    return nullptr;
+  profiler::CuptiTracer* cupti_tracer =
+      profiler::CuptiTracer::GetCuptiTracerSingleton();
+  if (!cupti_tracer->IsAvailable()) {
+    return nullptr;
+  }
+  profiler::CuptiInterface* cupti_interface = profiler::GetCuptiInterface();
+  return absl::make_unique<profiler::GpuTracer>(cupti_tracer, cupti_interface);
+}
+
+auto register_gpu_tracer_factory = [] {
+  RegisterProfilerFactory(&CreateGpuTracer);
+  return 0;
+}();
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA