diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 6e6d1785a65..bd35e64ef47 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -577,7 +577,14 @@ def tf_protos_grappler(): ) def tf_additional_cupti_wrapper_deps(): - return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"] + return [ + "//tensorflow/core/platform/default/gpu:cupti_wrapper", + "@com_google_absl//absl/base", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/container:flat_hash_map", + ] def tf_additional_device_tracer_srcs(): return ["platform/default/device_tracer.cc"] diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc index edb51f4d49b..fdd934cdcaf 100644 --- a/tensorflow/core/platform/default/device_tracer.cc +++ b/tensorflow/core/platform/default/device_tracer.cc @@ -20,9 +20,15 @@ limitations under the License. #include #include +#include "absl/base/casts.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/cupti_wrapper.h" @@ -34,276 +40,288 @@ limitations under the License. #include "tensorflow/core/profiler/internal/cpu/host_tracer.h" #include "tensorflow/core/profiler/lib/traceme.h" -namespace { - -// Maps a MemcpyKind enum to a const string. -const char *getMemcpyKindString(CUpti_ActivityMemcpyKind kind) { - switch (kind) { - case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: - return "HtoD"; - case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: - return "DtoH"; - case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: - return "HtoA"; - case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: - return "AtoH"; - case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: - return "AtoA"; - case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: - return "AtoD"; - case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: - return "DtoA"; - case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: - return "DtoD"; - case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: - return "HtoH"; - case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: - return "PtoP"; - default: - break; - } - return ""; -} - -// Maps a MemoryKind enum to a const string. -const char *getMemoryKindString(CUpti_ActivityMemoryKind kind) { - switch (kind) { - case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: - return "Unknown"; - case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: - return "Pageable"; - case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: - return "Pinned"; - case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: - return "Device"; - case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY: - return "Array"; - default: - break; - } - return ""; -} - -// Maps an OverheadKind enum to a const string. -const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) { - switch (kind) { - case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER: - return "COMPILER"; - case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH: - return "BUFFER_FLUSH"; - case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION: - return "INSTRUMENTATION"; - case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE: - return "RESOURCE"; - default: - break; - } - return ""; -} - -} // namespace - namespace tensorflow { -namespace devicetracer { +namespace { +Status ToStatus(CUptiResult result) { + if (result == CUPTI_SUCCESS) { + return Status::OK(); + } + const char* str = nullptr; + if (auto wrapper = + absl::make_unique()) { + wrapper->GetResultString(result, &str); + } + return errors::Unavailable("CUPTI error: ", str ? str : ""); +} -// Forward declaration. -class CUPTIManager; +Status ToStatus(CUresult result) { + if (result == CUDA_SUCCESS) { + return Status::OK(); + } + const char* str = nullptr; + cuGetErrorName(result, &str); + return errors::Unavailable("CUDA error: ", str ? str : ""); +} -// Returns a pointer to the CUPTIManager singleton. -CUPTIManager *GetCUPTIManager(); +void LogIfError(const Status& status) { + if (status.ok()) { + return; + } + LOG(ERROR) << status.error_message(); +} -// Callback interface for consumers of CUPTI tracing. -class CUPTIClient { - public: - virtual ~CUPTIClient() {} - - // Invoked for each CUPTI activity reported. - virtual void ActivityCallback(const CUpti_Activity &activity) = 0; +struct KernelRecord { + const char* kernel_name; + // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only + // record the stream and infer the context during collection. + CUcontext context; + CUstream stream; + CUevent start_event; + CUevent stop_event; + const std::string* annotation; }; -#define CUPTI_CALL(call) \ - do { \ - CUptiResult _status = cupti_wrapper_->call; \ - if (_status != CUPTI_SUCCESS) { \ - LOG(ERROR) << "cuda call " << #call << " failed " << _status; \ - } \ - } while (0) - -// Singleton class to manage registration of CUPTI callbacks. -class CUPTIManager { - public: - CUPTIManager() { - cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper()); - } - - static CUPTIManager *Create() { - auto manager = absl::make_unique(); - CUptiResult status = manager->cupti_wrapper_->ActivityRegisterCallbacks( - BufferRequested, BufferCompleted); - if (status != CUPTI_SUCCESS) { - LOG(ERROR) << "Failed to initialize CUPTI: " << status; - return nullptr; - } - return manager.release(); - } - - // Enables tracing and delivers event callbacks to 'client'. - // Does not take ownership of client. Client's lifetime must persist - // until tracing is disabled. - Status EnableTrace(CUPTIClient *client); - - // Disable tracing. No further events will be delivered to 'client'. - Status DisableTrace(); - - private: - // Static functions which we can use as CUPTI callbacks. - static void BufferRequested(uint8_t **buffer, size_t *size, - size_t *maxNumRecords) { - GetCUPTIManager()->InternalBufferRequested(buffer, size, maxNumRecords); - } - static void BufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, - size_t size, size_t validSize) { - GetCUPTIManager()->InternalBufferCompleted(ctx, streamId, buffer, size, - validSize); - } - // These methods are called by the static stubs above. - void InternalBufferRequested(uint8_t **buffer, size_t *size, - size_t *maxNumRecords); - void InternalBufferCompleted(CUcontext ctx, uint32_t streamId, - uint8_t *buffer, size_t size, size_t validSize); - - // Size of buffers used for CUPTI tracing. - static constexpr size_t kBufferSize = 32 * 1024; - // Required alignment of CUPTI buffers. - static constexpr size_t kBufferAlignment = 8; - - mutex mu_; - CUPTIClient *client_ GUARDED_BY(mu_); - std::unique_ptr cupti_wrapper_; - - TF_DISALLOW_COPY_AND_ASSIGN(CUPTIManager); +struct MemcpyRecord { + CUmemorytype src_type; + CUmemorytype dst_type; + size_t size_bytes; + CUcontext context; + CUstream stream; + CUevent start_event; + CUevent stop_event; + const std::string* annotation; }; -Status CUPTIManager::EnableTrace(CUPTIClient *client) { - mutex_lock l(mu_); - // TODO(pbar) Work out the minimal set to trace. - // We can currently manage without driver/runtime tracing. - // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); - // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); - // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); - // These might be useful for annotations but require NVTX API. - // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); - // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); - - CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); - CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); - CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); - CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY2)); - CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); - CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); - client_ = client; - return Status::OK(); +Status CreateAndRecordEvent(CUevent* event, CUstream stream) { + TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT))); + return ToStatus(cuEventRecord(*event, stream)); } -Status CUPTIManager::DisableTrace() { - // We turn off all tracing regardless. - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2)); - CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); - CUPTI_CALL(ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); - { - // Don't acquire this lock until Flush returns, since Flush - // will potentially cause callbacks into BufferCompleted. - mutex_lock l(mu_); - client_ = nullptr; - } - return Status::OK(); -} - -void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size, - size_t *maxNumRecords) { - VLOG(2) << "BufferRequested"; - void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment); - *size = kBufferSize; - *buffer = reinterpret_cast(p); - *maxNumRecords = 0; -} - -void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId, - uint8_t *buffer, size_t size, - size_t validSize) { - VLOG(2) << "BufferCompleted"; - CUptiResult status; - CUpti_Activity *record = nullptr; - mutex_lock l(mu_); // Hold mu_ while using client_. - if (client_ && validSize > 0) { - do { - status = - cupti_wrapper_->ActivityGetNextRecord(buffer, validSize, &record); - if (status == CUPTI_SUCCESS) { - client_->ActivityCallback(*record); - } else { - break; - } - } while (1); - - // report any records dropped from the queue - size_t dropped; - CUPTI_CALL(ActivityGetNumDroppedRecords(ctx, streamId, &dropped)); - if (dropped != 0) { - LOG(WARNING) << "Dropped " << dropped << " activity records"; - } - } - port::AlignedFree(buffer); -} - -CUPTIManager *GetCUPTIManager() { - static CUPTIManager *manager = CUPTIManager::Create(); - return manager; -} - -#ifdef _MSC_VER -#define __thread __declspec(thread) -#endif - -// TODO(pbar) Move this to platform specific header file? -// Static thread local variable for POD types. -#define TF_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \ - static __thread _Type_ s_obj_##_var_; \ - namespace { \ - class ThreadLocal_##_var_ { \ - public: \ - ThreadLocal_##_var_() {} \ - void Init() {} \ - inline _Type_ *pointer() const { return &s_obj_##_var_; } \ - inline _Type_ *safe_pointer() const { return &s_obj_##_var_; } \ - _Type_ &get() const { return s_obj_##_var_; } \ - bool is_native_tls() const { return true; } \ - \ - private: \ - TF_DISALLOW_COPY_AND_ASSIGN(ThreadLocal_##_var_); \ - } _var_; \ - } // namespace - // Thread-local state recording the most recent annotation (if any). // When non-null, this points to a string in the active annotation // of the current thread. The annotation is guaranteed to remain live // for the duration of the CUPTI API callback. -TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation); +static thread_local const char* tls_current_annotation; + +// Stores a series of kernel and memcpy records. +class CudaEventRecorder { + public: + // Registers the start of a kernel launch. The returned index should be passed + // to StopKernel() after the kernel launch has completed. + size_t StartKernel(const char* kernel_name, CUcontext context, + CUstream stream) { + KernelRecord record = {kernel_name, context, stream}; + LogIfError(CreateAndRecordEvent(&record.start_event, stream)); + mutex_lock lock(mutex_); + if (tls_current_annotation) { + record.annotation = &*annotations_.emplace(tls_current_annotation).first; + } + kernel_records_.push_back(record); + return kernel_records_.size() - 1; + } + void StopKernel(size_t index) { + mutex_lock lock(mutex_); + auto& record = kernel_records_[index]; + LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream)); + } + + // Registers the start of a copy operation. The returned index should be + // passed to StopMemcpy() after the kernel launch has completed. + size_t StartMemcpy(CUmemorytype src_type, CUmemorytype dst_type, + size_t size_bytes, CUcontext context, CUstream stream) { + MemcpyRecord record = {src_type, dst_type, size_bytes, context, stream}; + LogIfError(CreateAndRecordEvent(&record.start_event, stream)); + mutex_lock lock(mutex_); + if (tls_current_annotation) { + record.annotation = &*annotations_.emplace(tls_current_annotation).first; + } + memcpy_records_.push_back(record); + return memcpy_records_.size() - 1; + } + void StopMemcpy(size_t index) { + mutex_lock lock(mutex_); + auto& record = memcpy_records_[index]; + LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream)); + } + + std::vector ConsumeKernelRecords() { + mutex_lock lock(mutex_); + return std::move(kernel_records_); + } + std::vector ConsumeMemcpyRecords() { + mutex_lock lock(mutex_); + return std::move(memcpy_records_); + } + + private: + mutex mutex_; + std::unordered_set annotations_ GUARDED_BY(mutex_); + std::vector kernel_records_ GUARDED_BY(mutex_); + std::vector memcpy_records_ GUARDED_BY(mutex_); +}; + +// Instances register callbacks with CUPTI to notify the event recorder before +// and after kernel launches and memory copies. +class CuptiCallbackHook { + public: + CuptiCallbackHook() + : cupti_wrapper_(new perftools::gputools::profiler::CuptiWrapper()), + subscriber_(nullptr) {} + + Status Enable(CudaEventRecorder* recorder) { + TF_RETURN_IF_ERROR(ToStatus( + cupti_wrapper_->Subscribe(&subscriber_, &CuptiCallback, recorder))); + for (auto cbid : {CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, + CUPTI_DRIVER_TRACE_CBID_cuMemcpy, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2, + CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2}) { + TF_RETURN_IF_ERROR(ToStatus(cupti_wrapper_->EnableCallback( + /*enable=*/1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid))); + } + return Status::OK(); + } + + ~CuptiCallbackHook() { + LogIfError(ToStatus(cupti_wrapper_->Unsubscribe(subscriber_))); + } + + private: + static void CUPTIAPI CuptiCallback(void* userdata, + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const void* cbdata) { + auto recorder = static_cast(userdata); + auto data = static_cast(cbdata); + DCHECK_EQ(domain, CUPTI_CB_DOMAIN_DRIVER_API); + + if (data->callbackSite == CUPTI_API_ENTER) { + DriverApiEnterCallback(cbid, *data, recorder); + } else { + DriverApiExitCallback(cbid, *data, recorder); + } + } + + static CUmemorytype GetMemoryType(CUdeviceptr ptr) { + CUmemorytype mem_type; + auto status = + cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr); + if (status == CUDA_ERROR_INVALID_VALUE) { + // Pointer not registered with CUDA, must be host memory. + return CU_MEMORYTYPE_HOST; + } + LogIfError(ToStatus(status)); + return mem_type; + } + + template + static void StartMemcpy(CUmemorytype src_type, CUmemorytype dst_type, + const CUpti_CallbackData& cbdata, + CudaEventRecorder* recorder) { + auto params = static_cast(cbdata.functionParams); + *cbdata.correlationData = recorder->StartMemcpy( + src_type, dst_type, params->ByteCount, cbdata.context, nullptr); + } + template + static void StartMemcpyAsync(CUmemorytype dst_type, CUmemorytype src_type, + const CUpti_CallbackData& cbdata, + CudaEventRecorder* recorder) { + auto params = static_cast(cbdata.functionParams); + *cbdata.correlationData = recorder->StartMemcpy( + src_type, dst_type, params->ByteCount, cbdata.context, params->hStream); + } + + static void DriverApiEnterCallback(CUpti_CallbackId cbid, + const CUpti_CallbackData& cbdata, + CudaEventRecorder* recorder) { + switch (cbid) { + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { + DCHECK_NE(cbdata.symbolName, nullptr); + auto params = + static_cast(cbdata.functionParams); + *cbdata.correlationData = recorder->StartKernel( + cbdata.symbolName, cbdata.context, params->hStream); + return; + } + + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: { + auto params = + static_cast(cbdata.functionParams); + return StartMemcpy(GetMemoryType(params->src), + GetMemoryType(params->dst), cbdata, + recorder); + } + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: { + auto params = + static_cast(cbdata.functionParams); + return StartMemcpyAsync( + GetMemoryType(params->src), GetMemoryType(params->dst), cbdata, + recorder); + } + + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: + return StartMemcpy( + CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE, cbdata, recorder); + + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: + return StartMemcpyAsync( + CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE, cbdata, recorder); + + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: + return StartMemcpy( + CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST, cbdata, recorder); + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: + return StartMemcpyAsync( + CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST, cbdata, recorder); + + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: + return StartMemcpy( + CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE, cbdata, recorder); + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: + return StartMemcpyAsync( + CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE, cbdata, recorder); + + default: + LOG(ERROR) << "Unexpected callback id: " << cbid; + } + } + + static void DriverApiExitCallback(CUpti_CallbackId cbid, + const CUpti_CallbackData& cbdata, + CudaEventRecorder* recorder) { + switch (cbid) { + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: + recorder->StopKernel(*cbdata.correlationData); + break; + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: + recorder->StopMemcpy(*cbdata.correlationData); + break; + default: + LOG(ERROR) << "Unexpected callback id: " << cbid; + } + } + + std::unique_ptr cupti_wrapper_; + CUpti_SubscriberHandle subscriber_; +}; +} // namespace class TraceCollectorImpl : public tracing::TraceCollector { public: class ActivityHandle : public Handle { public: - ActivityHandle(string &&name, int level) + ActivityHandle(std::string&& name, int level) : trace_me_(std::move(name), level) {} private: @@ -320,13 +338,13 @@ class TraceCollectorImpl : public tracing::TraceCollector { virtual std::unique_ptr CreateAnnotationHandle( StringPiece name_part1, StringPiece name_part2) const { struct Impl : public tracing::TraceCollector::Handle { - string annotation; - explicit Impl(string &&name_scope) : annotation(name_scope) { + std::string annotation; + explicit Impl(std::string&& name_scope) : annotation(name_scope) { VLOG(2) << "CreateAnnotationHandle " << annotation; // Remember the most recent ScopedAnnotation for each thread. - tls_current_annotation.get() = annotation.c_str(); + tls_current_annotation = annotation.c_str(); } - ~Impl() override { tls_current_annotation.get() = nullptr; } + ~Impl() override { tls_current_annotation = nullptr; } }; return absl::make_unique(ConcatenateNames(name_part1, name_part2)); } @@ -367,84 +385,32 @@ class TraceCollectorImpl : public tracing::TraceCollector { std::atomic active_trace_session_; }; -TraceCollectorImpl *GlobalDefaultTraceCollector() { - static auto *instance = new TraceCollectorImpl(); +TraceCollectorImpl* GlobalDefaultTraceCollector() { + static auto* instance = new TraceCollectorImpl(); return instance; } -class DeviceTracerImpl : public DeviceTracer, public CUPTIClient { +class DeviceTracerImpl : public DeviceTracer { public: - DeviceTracerImpl(CUPTIManager *cupti_manager); + DeviceTracerImpl(); ~DeviceTracerImpl() override; // DeviceTracer interface: Status Start() override; Status Stop() override; - Status Collect(StepStatsCollector *collector) override; - - protected: - // This callback is used exclusively by CUPTIManager. - friend class CUPTIManager; - void ActivityCallback(const CUpti_Activity &activity) override; + Status Collect(StepStatsCollector* collector) override; private: - // Internal struct to record kernel launches. - struct KernelRecord { - uint64_t start_timestamp; - uint64_t end_timestamp; - uint32 device_id; - uint32 stream_id; - uint32 correlation_id; - }; - // Internal struct to record memcpy operations. - struct MemcpyRecord { - uint64_t start_timestamp; - uint64_t end_timestamp; - uint32 device_id; - uint32 stream_id; - uint32 correlation_id; - uint8 copyKind; - uint8 srcKind; - uint8 dstKind; - uint64 bytes; - }; - - // This is the subscriber callback which is invoked directly by CUPTI. - // The 'userdata' argument will be a pointer to the active 'DeviceTracerImpl'. - static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, const void *cbdata); - - // Records the mapping between correlation ID and kernel name. - void AddCorrelationId(uint32 correlation_id, const string &name); - - // Returns the current system time in microseconds. - inline int64 NowInUsec() { return Env::Default()->NowMicros(); } - - CUPTIManager *cupti_manager_; - std::unique_ptr cupti_wrapper_; - CUpti_SubscriberHandle subscriber_; - - mutex trace_mu_; - static constexpr size_t kMaxRecords = 1024 * 1024; - std::map correlations_ GUARDED_BY(trace_mu_); - std::vector kernel_records_ GUARDED_BY(trace_mu_); - std::vector memcpy_records_ GUARDED_BY(trace_mu_); + std::unique_ptr recorder_; + std::unique_ptr cupti_hook_; mutex mu_; bool enabled_ GUARDED_BY(mu_); - int64 start_walltime_us_ GUARDED_BY(mu_); - int64 end_walltime_us_ GUARDED_BY(mu_); - uint64_t start_timestamp_ GUARDED_BY(mu_); - uint64_t end_timestamp_ GUARDED_BY(mu_); std::unique_ptr host_tracer_ GUARDED_BY(mu_); - - TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl); }; -DeviceTracerImpl::DeviceTracerImpl(CUPTIManager *cupti_manager) - : cupti_manager_(cupti_manager) { +DeviceTracerImpl::DeviceTracerImpl() : recorder_(new CudaEventRecorder()) { VLOG(1) << "DeviceTracer created."; - cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper()); host_tracer_ = profiler::cpu::HostTracer::Create(2); enabled_ = false; } @@ -461,55 +427,12 @@ Status DeviceTracerImpl::Start() { if (enabled_) { return errors::FailedPrecondition("DeviceTracer is already enabled."); } - // There can only be one CUPTI subscriber. If we can't create one then - // there is another trace in progress (possibly by external code). - CUptiResult ret; - ret = cupti_wrapper_->Subscribe( - &subscriber_, static_cast(ApiCallback), this); - if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) { - return errors::Unavailable("CUPTI subcriber limit reached."); - } else if (ret != CUPTI_SUCCESS) { - return errors::Internal("Failed to create CUPTI subcriber."); - } + cupti_hook_.reset(new CuptiCallbackHook()); + TF_RETURN_IF_ERROR(cupti_hook_->Enable(recorder_.get())); // Register as a TraceEngine to receive ScopedAnnotations. GlobalDefaultTraceCollector()->Start(); - // Intercept launch and memcpy calls to capture the Op name annotation. - // TODO(pbar) Add callbacks for memcpy variants. - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)); - CUPTI_CALL(EnableCallback( - /*enable=*/1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020)); - - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2)); - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2)); - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2)); - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2)); - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2)); - CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_, - CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2)); - - TF_RETURN_IF_ERROR(cupti_manager_->EnableTrace(this)); - - CUPTI_CALL(GetTimestamp(&start_timestamp_)); - start_walltime_us_ = NowInUsec(); host_tracer_->Start().IgnoreError(); enabled_ = true; return Status::OK(); @@ -521,199 +444,288 @@ Status DeviceTracerImpl::Stop() { if (!enabled_) { return Status::OK(); } - CUPTI_CALL(Unsubscribe(subscriber_)); + cupti_hook_.reset(); GlobalDefaultTraceCollector()->Stop(); - TF_RETURN_IF_ERROR(cupti_manager_->DisableTrace()); - end_walltime_us_ = NowInUsec(); - CUPTI_CALL(GetTimestamp(&end_timestamp_)); enabled_ = false; host_tracer_->Stop().IgnoreError(); return Status::OK(); } -void DeviceTracerImpl::AddCorrelationId(uint32 correlation_id, - const string &name) { - VLOG(2) << correlation_id << " : " << name; - mutex_lock l(trace_mu_); - if (correlations_.size() >= kMaxRecords) return; - correlations_.emplace(correlation_id, name); -} +namespace { +class CudaEventCollector { + struct DeviceInfo { + int ordinal; + std::string name; + int num_contexts; + }; -/*static*/ void DeviceTracerImpl::ApiCallback(void *userdata, - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const void *cbdata) { - auto *cbInfo = reinterpret_cast(cbdata); - DeviceTracerImpl *tracer = reinterpret_cast(userdata); - VLOG(2) << "ApiCallback " << domain << ":" << cbid - << " func: " << cbInfo->functionName; + struct ContextInfo { + int index; + const DeviceInfo* dev_info; + int num_streams; + CUevent end_event; + }; - // API callbacks are invoked synchronously on the thread making the - // CUDA API call. If this pointer is non-null then the ScopedAnnotation - // must be valid. - const char *tls_annotation = tls_current_annotation.get(); + struct StreamInfo { + std::string name; + int index; // 0 is reserved for null stream. + const ContextInfo* ctx_info; + }; - if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && - (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { - if (cbInfo->callbackSite == CUPTI_API_ENTER) { - auto *params = reinterpret_cast( - cbInfo->functionParams); - if (VLOG_IS_ON(2)) { - VLOG(2) << "LAUNCH stream " << params->hStream << " correlation " - << cbInfo->correlationId << " kernel " << cbInfo->symbolName; - } - const string annotation = - tls_annotation ? tls_annotation : cbInfo->symbolName; - tracer->AddCorrelationId(cbInfo->correlationId, annotation); - } - } else if ((domain == CUPTI_CB_DOMAIN_RUNTIME_API) && - (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020)) { - if (cbInfo->callbackSite == CUPTI_API_ENTER) { - if (VLOG_IS_ON(2)) { - auto *funcParams = reinterpret_cast( - cbInfo->functionParams); - size_t count = funcParams->count; - enum cudaMemcpyKind kind = funcParams->kind; - VLOG(2) << "MEMCPY count " << count << " kind " << kind; - } - if (tls_annotation) { - const string annotation = tls_annotation; - tracer->AddCorrelationId(cbInfo->correlationId, annotation); - } - } - } else if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && - (cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2 || - cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2 || - cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2 || - cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2 || - cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2 || - cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2)) { - if (cbInfo->callbackSite == CUPTI_API_EXIT && tls_annotation) { - const string annotation = tls_annotation; - tracer->AddCorrelationId(cbInfo->correlationId, annotation); - } - } else { - VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; + // Include context in key to distinguish null streams. + using StreamKey = std::pair; + + CudaEventCollector(CudaEventRecorder* recorder, StepStatsCollector* collector) + : recorder_(recorder), collector_(collector) { + DCHECK(recorder != nullptr); + DCHECK(collector != nullptr); } -} -void DeviceTracerImpl::ActivityCallback(const CUpti_Activity &record) { - VLOG(2) << "ActivityCallback " << record.kind; - mutex_lock l(trace_mu_); - switch (record.kind) { - case CUPTI_ACTIVITY_KIND_MEMCPY: { - if (memcpy_records_.size() >= kMaxRecords) return; - auto *memcpy = reinterpret_cast(&record); - memcpy_records_.push_back(MemcpyRecord{ - memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId, - memcpy->correlationId, memcpy->copyKind, memcpy->srcKind, - memcpy->dstKind, memcpy->bytes}); - break; + // Populates device_infos_ from all devices. + Status InitializeDeviceInfos() { + int count; + TF_RETURN_IF_ERROR(ToStatus(cuDeviceGetCount(&count))); + for (int ordinal = 0; ordinal < count; ++ordinal) { + CUdevice device; + TF_RETURN_IF_ERROR(ToStatus(cuDeviceGet(&device, ordinal))); + char name[100]; + TF_RETURN_IF_ERROR(ToStatus(cuDeviceGetName(name, sizeof(name), device))); + device_infos_[device] = {ordinal, name}; } - case CUPTI_ACTIVITY_KIND_MEMCPY2: { - if (memcpy_records_.size() >= kMaxRecords) return; - auto *memcpy = reinterpret_cast(&record); - memcpy_records_.push_back(MemcpyRecord{ - memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId, - memcpy->correlationId, memcpy->copyKind, memcpy->srcKind, - memcpy->dstKind, memcpy->bytes}); - break; - } - case CUPTI_ACTIVITY_KIND_KERNEL: - case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { - if (kernel_records_.size() >= kMaxRecords) return; - auto *kernel = reinterpret_cast(&record); - kernel_records_.push_back(KernelRecord{kernel->start, kernel->end, - kernel->deviceId, kernel->streamId, - kernel->correlationId}); - break; - } - default: - VLOG(1) << "ActivityCallback unhandled kind"; - break; + return Status::OK(); } -} -Status DeviceTracerImpl::Collect(StepStatsCollector *collector) { + // Returns element from context_infos_, adding it if not yet present. + Status GetContextInfo(CUcontext context, ContextInfo** ctx_info_ptr) { + auto it = context_infos_.find(context); + + if (it == context_infos_.end()) { + TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(context))); + CUdevice device; + TF_RETURN_IF_ERROR(ToStatus(cuCtxGetDevice(&device))); + + auto& dev_info = device_infos_[device]; + ContextInfo ctx_info = {dev_info.num_contexts++, &dev_info}; + it = context_infos_.emplace(context, ctx_info).first; + } + + *ctx_info_ptr = &it->second; + return Status::OK(); + } + + // Adds element to stream_infos_ if not yet present. If present, clear name + // if it doesn't match parameter. + Status AddStreamInfo(CUcontext context, CUstream stream, + absl::string_view name) { + StreamKey key(context, stream); + auto it = stream_infos_.find(key); + if (it != stream_infos_.end()) { + if (it->second.name != name) { + it->second.name.clear(); // Stream with inconsistent names, clear it. + } + return Status::OK(); + } + + ContextInfo* ctx_info; + TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info)); + int index = stream ? ++ctx_info->num_streams : 0; + StreamInfo stream_info = {static_cast(name), index, ctx_info}; + stream_infos_.emplace(key, stream_info); + return Status::OK(); + } + + // Returns string describing source and destination memory types. + static std::string GetMemcpyName(const MemcpyRecord& record) { + auto get_memory_type = [](CUmemorytype mem_type) { + switch (mem_type) { + case CU_MEMORYTYPE_HOST: + return 'H'; + case CU_MEMORYTYPE_DEVICE: + return 'D'; + case CU_MEMORYTYPE_ARRAY: + return 'A'; + case CU_MEMORYTYPE_UNIFIED: + return 'U'; + default: + LOG(ERROR) << "Unknown memory type: " << mem_type; + return '?'; + } + }; + return absl::StrFormat("Memcpy%cto%c", get_memory_type(record.src_type), + get_memory_type(record.dst_type)); + } + + // Returns time in microseconds between events recorded on the GPU. + static uint64_t GetElasedTimeUs(CUevent start, CUevent stop) { + float elapsed_ms = 0.0f; + LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop))); + return static_cast( + std::llroundf(1000 * std::max(elapsed_ms, 0.0f))); + } + + // Synchronizes all contexts. + Status Synchronize() const { + for (const auto& pair : context_infos_) { + TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first))); + TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize())); + } + return Status::OK(); + } + + // Save stats to collector; + Status SaveStats(std::unique_ptr stats, + const StreamInfo& stream_info) const { + auto ctx_info = stream_info.ctx_info; + auto dev_info = ctx_info->dev_info; + // TODO(csigg): tfprof_node.cc, run_metadata_test.py, and timeline_test.py + // currently require this particular formatting. + collector_->Save( + absl::StrFormat("/device:GPU:%d/stream:all", dev_info->ordinal), + new NodeExecStats(*stats)); + auto name = absl::StrFormat("/gpu:%d (%s)/context#%d/", dev_info->ordinal, + dev_info->name, ctx_info->index); + if (stream_info.index) { + absl::StrAppend(&name, "stream#", std::to_string(stream_info.index)); + } else { + absl::StrAppend(&name, "null stream"); + } + if (!stream_info.name.empty()) { + absl::StrAppend(&name, ":", stream_info.name); + } + collector_->Save(name, stats.release()); + return Status::OK(); + } + + Status SaveRecord(const KernelRecord& record) const { + if (!record.start_event || !record.stop_event) { + return Status::OK(); + } + const auto& stream_info = + stream_infos_.at(StreamKey(record.context, record.stream)); + auto start_us = + GetElasedTimeUs(record.start_event, stream_info.ctx_info->end_event); + auto elapsed_us = GetElasedTimeUs(record.start_event, record.stop_event); + + auto stats = absl::make_unique(); + std::string node_name = record.kernel_name; + if (record.annotation) { + node_name = absl::StrCat(*record.annotation, "::", node_name); + } + stats->set_node_name(node_name); + // TODO(csigg): Report grid size? + std::string node_label; + stats->set_timeline_label(node_label); + stats->set_all_start_micros(end_walltime_us_ - start_us); + stats->set_op_end_rel_micros(elapsed_us); + stats->set_all_end_rel_micros(elapsed_us); + return SaveStats(std::move(stats), stream_info); + } + + Status SaveRecord(const MemcpyRecord& record) const { + if (!record.start_event || !record.stop_event) { + return Status::OK(); + } + const auto& stream_info = + stream_infos_.at(StreamKey(record.context, record.stream)); + auto start_us = + GetElasedTimeUs(record.start_event, stream_info.ctx_info->end_event); + auto elapsed_us = GetElasedTimeUs(record.start_event, record.stop_event); + + auto stats = absl::make_unique(); + std::string node_name = GetMemcpyName(record); + if (record.annotation) { + node_name = absl::StrCat(*record.annotation, "::", node_name); + } + stats->set_node_name(node_name); + // TODO(csigg): Show label in Chrome trace viewer. + std::string node_label = absl::StrFormat("%d bytes", record.size_bytes); + stats->set_timeline_label(node_label); + stats->set_all_start_micros(end_walltime_us_ - start_us); + stats->set_op_end_rel_micros(elapsed_us); + stats->set_all_end_rel_micros(elapsed_us); + return SaveStats(std::move(stats), stream_info); + } + + Status Collect() { + TF_RETURN_IF_ERROR(InitializeDeviceInfos()); + + auto kernel_records = recorder_->ConsumeKernelRecords(); + auto memcpy_records = recorder_->ConsumeMemcpyRecords(); + LOG(INFO) << "Collecting " << kernel_records.size() << " kernel records, " + << memcpy_records.size() << " memcpy records."; + + // Gather all profiled streams and contexts. + for (const auto& record : kernel_records) { + TF_RETURN_IF_ERROR( + AddStreamInfo(record.context, record.stream, "Kernel")); + } + for (const auto& record : memcpy_records) { + TF_RETURN_IF_ERROR( + AddStreamInfo(record.context, record.stream, GetMemcpyName(record))); + } + + // Synchronize all contexts, record end events, synchronize again. + TF_RETURN_IF_ERROR(Synchronize()); + for (auto& pair : context_infos_) { + TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first))); + TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr)); + } + TF_RETURN_IF_ERROR(Synchronize()); + end_walltime_us_ = Env::Default()->NowMicros(); + + for (const auto& record : kernel_records) { + TF_RETURN_IF_ERROR(SaveRecord(record)); + } + for (const auto& record : memcpy_records) { + TF_RETURN_IF_ERROR(SaveRecord(record)); + } + + return Status::OK(); + } + + public: + // Consumes the records in recorder and saves them to the collector. + static Status Collect(CudaEventRecorder* recorder, + StepStatsCollector* collector) { + CUcontext context; + TF_RETURN_IF_ERROR(ToStatus(cuCtxGetCurrent(&context))); + auto status = CudaEventCollector(recorder, collector).Collect(); + TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(context))); + return status; + } + + private: + CudaEventRecorder* recorder_; + StepStatsCollector* collector_; + + absl::node_hash_map device_infos_; + absl::node_hash_map context_infos_; + absl::flat_hash_map> stream_infos_; + int64 end_walltime_us_; +}; +} // namespace + +Status DeviceTracerImpl::Collect(StepStatsCollector* collector) { mutex_lock l(mu_); if (enabled_) { return errors::FailedPrecondition("DeviceTracer is still enabled."); } - // TODO(pbar) Handle device IDs and prefix properly. - const string prefix = ""; - const int id = 0; - const string stream_device = - strings::StrCat(prefix, "/device:GPU:", id, "/stream:"); - const string memcpy_device = - strings::StrCat(prefix, "/device:GPU:", id, "/memcpy"); - - mutex_lock l2(trace_mu_); - for (const auto &rec : kernel_records_) { - auto it = correlations_.find(rec.correlation_id); - const string name = (it != correlations_.cend()) ? it->second : "unknown"; - NodeExecStats *ns = new NodeExecStats; - ns->set_all_start_micros(start_walltime_us_ + - ((rec.start_timestamp - start_timestamp_) / 1000)); - ns->set_op_start_rel_micros(0); - auto elapsed_us = - std::max((rec.end_timestamp - rec.start_timestamp) / 1000, 1); - ns->set_op_end_rel_micros(elapsed_us); - ns->set_all_end_rel_micros(elapsed_us); - ns->set_node_name(name); - // TODO(pbar) Generate details based on the kernel activity record. - // ns->set_timeline_label(details); - auto nscopy = new NodeExecStats; - *nscopy = *ns; - collector->Save(strings::StrCat(stream_device, "all"), ns); - collector->Save(strings::StrCat(stream_device, rec.stream_id), nscopy); - } - for (const auto &rec : memcpy_records_) { - auto it = correlations_.find(rec.correlation_id); - const string name = (it != correlations_.cend()) ? it->second : "unknown"; - NodeExecStats *ns = new NodeExecStats; - ns->set_all_start_micros(start_walltime_us_ + - ((rec.start_timestamp - start_timestamp_) / 1000)); - ns->set_op_start_rel_micros(0); - auto elapsed_us = - std::max((rec.end_timestamp - rec.start_timestamp) / 1000, 1); - ns->set_op_end_rel_micros(elapsed_us); - ns->set_all_end_rel_micros(elapsed_us); - auto copyKind = static_cast(rec.copyKind); - auto srcKind = static_cast(rec.srcKind); - auto dstKind = static_cast(rec.dstKind); - const string details = strings::Printf( - "MEMCPY%s %llu bytes (%s to %s)", getMemcpyKindString(copyKind), - rec.bytes, getMemoryKindString(srcKind), getMemoryKindString(dstKind)); - ns->set_node_name( - strings::StrCat(name, ":MEMCPY", getMemcpyKindString(copyKind))); - ns->set_timeline_label(details); - auto nscopy = new NodeExecStats; - *nscopy = *ns; - collector->Save(memcpy_device, ns); - collector->Save(strings::StrCat(stream_device, rec.stream_id), nscopy); - } - + TF_RETURN_IF_ERROR(CudaEventCollector::Collect(recorder_.get(), collector)); host_tracer_->CollectDataToCollector(collector).IgnoreError(); return Status::OK(); } -} // namespace devicetracer - std::unique_ptr CreateDeviceTracer() { - devicetracer::CUPTIManager *cupti_manager = devicetracer::GetCUPTIManager(); - if (cupti_manager == nullptr) { + auto status = cuInit(0); + if (status != CUDA_SUCCESS) { + LogIfError(ToStatus(status)); return nullptr; } - std::unique_ptr tracer( - new devicetracer::DeviceTracerImpl(cupti_manager)); - return tracer; + return absl::make_unique(); } - } // namespace tensorflow - #else // GOOGLE_CUDA namespace tensorflow { diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc index 481bbf9bae1..671d8cf8bf9 100644 --- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc +++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc @@ -61,6 +61,7 @@ LIBCUPTI_WRAP(cuptiEnableCallback); LIBCUPTI_WRAP(cuptiEnableDomain); LIBCUPTI_WRAP(cuptiSubscribe); LIBCUPTI_WRAP(cuptiUnsubscribe); +LIBCUPTI_WRAP(cuptiGetResultString); } // namespace dynload @@ -124,6 +125,11 @@ CUptiResult CuptiWrapper::Unsubscribe(CUpti_SubscriberHandle subscriber) { return dynload::cuptiUnsubscribe(subscriber); } +CUptiResult CuptiWrapper::GetResultString(CUptiResult result, + const char** str) { + return dynload::cuptiGetResultString(result, str); +} + } // namespace profiler } // namespace gputools } // namespace perftools diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h index e3ebe6ca1d0..b35a5ab4c31 100644 --- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h +++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h @@ -68,6 +68,8 @@ class CuptiWrapper { CUpti_CallbackFunc callback, void* userdata); CUptiResult Unsubscribe(CUpti_SubscriberHandle subscriber); + + CUptiResult GetResultString(CUptiResult result, const char** str); }; } // namespace profiler