Enable tracing for nested thunks.
- Moves the annotation cache to the thunk itself.
- Iterates through the whole executable, not just top-level thunks, to construct annotations.
- Constructs a ScopedAnnotation anywhere an atomic thunk might be executed (currently in GpuExecutable and SequentialThunk).

PiperOrigin-RevId: 305075290
Change-Id: Ice38b1a5abd88899bbf7ca1f5e3d94f09a5d1573
This commit is contained in:
parent
da2d11092e
commit
20bacc0d35
@ -43,6 +43,13 @@ ConditionalThunk::ConditionalThunk(
|
||||
}
|
||||
}
|
||||
|
||||
void ConditionalThunk::ComputeAnnotations() {
|
||||
Thunk::ComputeAnnotations();
|
||||
for (auto& branch_thunk : branch_thunks_) {
|
||||
branch_thunk->ComputeAnnotations();
|
||||
}
|
||||
}
|
||||
|
||||
Status ConditionalThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
if (branch_index_is_bool_) {
|
||||
|
@ -51,6 +51,7 @@ class ConditionalThunk : public Thunk {
|
||||
ConditionalThunk(const ConditionalThunk&) = delete;
|
||||
ConditionalThunk& operator=(const ConditionalThunk&) = delete;
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
@ -34,6 +34,11 @@ ForThunk::ForThunk(const int64 loop_limit,
|
||||
// this ForThunk, and shouldn't be profiled separately from it.
|
||||
std::move(*body_thunk_sequence), nullptr)) {}
|
||||
|
||||
// Computes and caches the profile annotation for this ForThunk and,
// recursively, for the loop-body thunk sequence it owns.
void ForThunk::ComputeAnnotations() {
  Thunk::ComputeAnnotations();                 // the loop op itself
  body_thunk_sequence_->ComputeAnnotations();  // nested body thunks
}
|
||||
|
||||
Status ForThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor));
|
||||
|
@ -37,6 +37,7 @@ class ForThunk : public Thunk {
|
||||
ForThunk(const ForThunk&) = delete;
|
||||
ForThunk& operator=(const ForThunk&) = delete;
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
@ -45,7 +45,7 @@ namespace xla {
|
||||
namespace gpu {
|
||||
namespace {
|
||||
|
||||
using tensorflow::profiler::ScopedAnnotation;
|
||||
using ::tensorflow::profiler::ScopedAnnotation;
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -91,13 +91,8 @@ GpuExecutable::~GpuExecutable() {
|
||||
}
|
||||
|
||||
// Precomputes a human-readable annotation string for every thunk in the
// schedule; these strings label ScopedAnnotation ranges during tracing.
// NOTE(review): this span looks like a diff mixture — it both fills the
// executable-level thunk_annotations_ map (old scheme) and calls
// thunk->ComputeAnnotations() (new per-thunk cache). Confirm which lines
// survive in the final revision.
void GpuExecutable::ComputeThunkAnnotations() {
  // NOTE(review): canonical_name_map is never referenced in this body —
  // possibly leftover from an earlier annotation scheme.
  CanonicalNameMap canonical_name_map;
  // Walk the full execution order so every scheduled thunk is covered.
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    const HloInstruction* hlo = thunk->hlo_instruction();
    CHECK(hlo);
    // NOTE(review): format here is "Thunk#..." while Thunk::ComputeAnnotations
    // emits "Thunk:#..." (with a colon) — confirm which form downstream trace
    // consumers expect; the two should agree.
    thunk_annotations_[thunk] =
        absl::StrFormat("Thunk#hlo_op=%s,hlo_module=%s#", hlo->name(),
                        hlo->GetModule()->name());
    // Recursively compute the cached annotation for the thunk and any children.
    thunk->ComputeAnnotations();
  }
}
|
||||
|
||||
@ -173,17 +168,13 @@ Status GpuExecutable::ExecuteThunks(
|
||||
tensorflow::profiler::TraceMeLevel::kInfo);
|
||||
|
||||
std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
|
||||
bool scoped_annotation_enabled = ScopedAnnotation::IsEnabled();
|
||||
std::vector<std::function<void()>> deferred_host_callbacks;
|
||||
for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
|
||||
CHECK(thunk->hlo_instruction());
|
||||
// Annotate execution of this op if tracing was enabled when we started
|
||||
// running this module. If tracing is enabled *while* we're running the
|
||||
// module, we won't get any data, but that's probably an OK trade-off.
|
||||
absl::optional<ScopedAnnotation> op_annotation;
|
||||
CHECK(thunk->hlo_instruction());
|
||||
if (scoped_annotation_enabled) {
|
||||
op_annotation.emplace(FindOrDie(thunk_annotations_, thunk));
|
||||
}
|
||||
ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });
|
||||
|
||||
TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
|
||||
int32 stream_no =
|
||||
|
@ -151,10 +151,6 @@ class GpuExecutable : public Executable {
|
||||
// memory for every output/temp buffers.
|
||||
const std::shared_ptr<const BufferAssignment> assignment_;
|
||||
|
||||
// Maps a thunk to a string describing the thunk. This is useful when
|
||||
// constructing ScopeAnnotation objects.
|
||||
absl::flat_hash_map<Thunk*, string> thunk_annotations_;
|
||||
|
||||
// Cache of module handles and constant buffer allocation maps used by
|
||||
// `ResolveConstantGlobals`.
|
||||
tensorflow::mutex module_handle_mutex_;
|
||||
|
@ -17,14 +17,23 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
#include "tensorflow/core/profiler/lib/scoped_annotation.h"
|
||||
|
||||
namespace xla {
|
||||
namespace gpu {
|
||||
|
||||
using ::tensorflow::profiler::ScopedAnnotation;
|
||||
|
||||
// Builds a sequence that takes ownership of `thunks` and runs them in order.
// `hlo` may be nullptr for synthetic sequences (ForThunk/WhileThunk bodies
// pass nullptr), in which case no profile annotation is derived for the
// sequence itself (see Thunk::ComputeAnnotations).
SequentialThunk::SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks,
                                 const HloInstruction* hlo)
    : Thunk(Kind::kSequential, hlo), thunks_(std::move(thunks)) {}
|
||||
|
||||
void SequentialThunk::ComputeAnnotations() {
|
||||
for (const auto& thunk : thunks_) {
|
||||
thunk->ComputeAnnotations();
|
||||
}
|
||||
}
|
||||
|
||||
Status SequentialThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
for (auto& thunk : thunks_) {
|
||||
@ -37,6 +46,7 @@ Status SequentialThunk::ExecuteOnStream(const ExecuteParams& params) {
|
||||
auto op_profiler =
|
||||
params.profiler->MakeScopedInstructionProfiler(hlo_instruction());
|
||||
for (const auto& thunk : thunks_) {
|
||||
ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });
|
||||
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(params));
|
||||
}
|
||||
return Status::OK();
|
||||
|
@ -39,6 +39,7 @@ class SequentialThunk : public Thunk {
|
||||
|
||||
const std::vector<std::unique_ptr<Thunk>>& thunks() const { return thunks_; }
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
@ -79,6 +79,18 @@ class Thunk {
|
||||
|
||||
Kind kind() const { return kind_; }
|
||||
const HloInstruction* hlo_instruction() const { return hlo_instruction_; }
|
||||
string profile_annotation() const { return profile_annotation_; }
|
||||
|
||||
// Constructs and caches the profile annotation string for this thunk and
|
||||
// any child thunks.
|
||||
virtual void ComputeAnnotations() {
|
||||
const HloInstruction* hlo = hlo_instruction();
|
||||
if (hlo) {
|
||||
profile_annotation_ =
|
||||
absl::StrFormat("Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(),
|
||||
hlo->GetModule()->name());
|
||||
}
|
||||
}
|
||||
|
||||
// Prepares the thunk for execution on the given StreamExecutor.
|
||||
//
|
||||
@ -130,6 +142,7 @@ class Thunk {
|
||||
private:
|
||||
Kind kind_;
|
||||
const HloInstruction* hlo_instruction_;
|
||||
string profile_annotation_;
|
||||
};
|
||||
|
||||
// A sequence of thunks.
|
||||
|
@ -39,6 +39,12 @@ WhileThunk::WhileThunk(
|
||||
body_thunk_sequence_(absl::make_unique<SequentialThunk>(
|
||||
std::move(*body_thunk_sequence), nullptr)) {}
|
||||
|
||||
// Caches the profile annotation for the while op itself, then recurses into
// the condition and body thunk sequences so their nested thunks are covered.
void WhileThunk::ComputeAnnotations() {
  Thunk::ComputeAnnotations();
  // The condition and body are independent SequentialThunks owned by this
  // thunk; each computes annotations for its own children.
  condition_thunk_sequence_->ComputeAnnotations();
  body_thunk_sequence_->ComputeAnnotations();
}
|
||||
|
||||
Status WhileThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
TF_RETURN_IF_ERROR(
|
||||
|
@ -46,6 +46,7 @@ class WhileThunk : public Thunk {
|
||||
WhileThunk(const WhileThunk&) = delete;
|
||||
WhileThunk& operator=(const WhileThunk&) = delete;
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
Loading…
x
Reference in New Issue
Block a user