From 58c1aaf77721268a4ef87ebd2ab520a6d5a62f79 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 13 Nov 2017 17:20:26 -0800 Subject: [PATCH] Split up HloExecutionProfile into a set of re-usable components The end goal is to have Hlo profiling support in XlaJitCompiledCpuFunction and eventually AOT compiled XlaCompiledCpuFunction. This change leaves the HloExecutionProfile interface mostly intact -- internally it uses the new split out components to do what it did before. However, in future CLs: - I'll extract out a HloExecutionProfilePrototype that contains the HloProfilePrinter, the OwningHloProfilePrinterStaticData and the HloToProfileIndex. This will then live in the Executable (if profiling is enabled). - The HloExecutionProfile for a specific execution will have a pointer to the parent HloExecutionProfilePrototype, which it will use to paginate profile_counters_. - The CPU backend will use the HloToProfileIndex in the HloExecutionProfilePrototype to map hlo instructions to profile counter offsets. This will make the indices in the generated code "line up" with the indices that the HloProfilePrinter expects. These changes will allow the XlaJitCompiledCpuFunction (and later AOT) clients to pass in an appropriately sized zeroed buffer to the generated function and then pass that same buffer to the appropriate HloProfilePrinter to get a textual Hlo profile. 
PiperOrigin-RevId: 175613737 --- tensorflow/compiler/xla/service/BUILD | 23 ++++ tensorflow/compiler/xla/service/executable.h | 29 ++-- .../xla/service/hlo_execution_profile.cc | 130 ++++++++++++++---- .../xla/service/hlo_execution_profile.h | 87 +++++++++--- .../xla/service/hlo_execution_profile_test.cc | 99 +++++++++++++ .../xla/service/hlo_profile_printer.cc | 67 +++++++++ .../xla/service/hlo_profile_printer.h | 97 +++++++++++++ tensorflow/compiler/xla/service/hlo_runner.cc | 3 +- tensorflow/compiler/xla/service/service.cc | 27 +--- 9 files changed, 467 insertions(+), 95 deletions(-) create mode 100644 tensorflow/compiler/xla/service/hlo_execution_profile_test.cc create mode 100644 tensorflow/compiler/xla/service/hlo_profile_printer.cc create mode 100644 tensorflow/compiler/xla/service/hlo_profile_printer.h diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index c163a5f8376..c9828d8641a 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1360,6 +1360,7 @@ cc_library( deps = [ ":hlo", ":hlo_cost_analysis", + ":hlo_profile_printer", ":human_readable_profile_builder", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", @@ -1368,6 +1369,18 @@ cc_library( ], ) +tf_cc_test( + name = "hlo_execution_profile_test", + srcs = ["hlo_execution_profile_test.cc"], + deps = [ + ":cpu_plugin", + ":hlo_cost_analysis", + ":hlo_execution_profile", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + tf_cc_test( name = "hlo_computation_test", srcs = ["hlo_computation_test.cc"], @@ -2159,6 +2172,16 @@ cc_library( ], ) +cc_library( + name = "hlo_profile_printer", + srcs = ["hlo_profile_printer.cc"], + hdrs = ["hlo_profile_printer.h"], + deps = [ + ":human_readable_profile_builder", + "//tensorflow/compiler/xla:types", + ], +) + # ----------------------------------------------------------------------------- 
filegroup( diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 7e0d182b365..21357073718 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -197,14 +197,14 @@ StatusOr Executable::ExecuteOnStreamWrapper( VLOG(1) << "enqueueing executable on stream..."; // If the profiling flag isn't enabled, we pass nullptr as the profile to // indicate profiling is not requested. - HloExecutionProfile hlo_execution_profile; - HloExecutionProfile* profile_ptr = + std::unique_ptr profile_ptr = module_config().debug_options().xla_hlo_profile() && hlo_profiling_enabled() - ? &hlo_execution_profile + ? MakeUnique(module(), *CreateCostAnalysis()) : nullptr; - auto return_value = ExecuteOnStream(run_options, arguments, profile_ptr); + auto return_value = + ExecuteOnStream(run_options, arguments, profile_ptr.get()); if (profile != nullptr) { VLOG(1) << "enqueueing 'stop timer' and blocking host until done..."; @@ -232,24 +232,11 @@ StatusOr Executable::ExecuteOnStreamWrapper( } if (profile_ptr != nullptr) { - std::unordered_set profiled_computations = - profile_ptr->profiled_computations(); - // To ensure we have print the profiles in a stable order, iterate over the - // computations in post order. 
- std::list all_computations = - module().MakeComputationPostOrder(); - for (xla::HloComputation* computation : all_computations) { - if (profiled_computations.count(computation) > 0) { - string profile_string = profile_ptr->ToString( - *computation, stream->parent()->GetDeviceDescription(), - CreateCostAnalysis().get()); - if (!profile_string.empty()) { - XLA_LOG_LINES(tensorflow::INFO, profile_string); - } - } - } + XLA_LOG_LINES( + tensorflow::INFO, + profile_ptr->ToString(stream->parent()->GetDeviceDescription())); hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute", - profile_ptr); + profile_ptr.get()); } return return_value; diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index bf19bc9309b..ecce2bd4e51 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -26,45 +26,115 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" namespace xla { +HloToProfileIndex::HloToProfileIndex(const HloModule& module) { + size_t current_profile_index = 0; + for (xla::HloComputation* computation : module.MakeComputationPostOrder()) { + InsertOrDie(&computation_to_profile_idx_, computation, + current_profile_index++); + for (const HloInstruction* instruction : computation->instructions()) { + // For simplicity we track all instructions here, but we could skip + // non-executing instructions like constants and parameters. 
+ InsertOrDie(&instruction_to_profile_idx_, instruction, + current_profile_index++); + } + } +} + +static HloProfilePrinter CreateOwnedHloProfilePrinter( + const HloToProfileIndex& hlo_to_profile_index, + const HloCostAnalysis& cost_analysis) { + using HloComputationInfo = HloProfilePrinter::HloComputationInfo; + using HloInstructionInfo = HloProfilePrinter::HloInstructionInfo; + + HloComputationInfo* computation_infos = + new HloComputationInfo[hlo_to_profile_index.computation_count()]; + + // There are two "indices" in play here. The first one is the index of the + // HloComputationInfo or HloInstructionInfo in the array that contains said + // HloComputationInfo or HloInstructionInfo. The second index is the index of + // the HloComputationInfo or HloInstructionInfo in the profile counters array, + // as decided by hlo_to_profile_index. The latter index is always referred to + // as "profile_index". + + size_t computation_index_in_static_data = 0; + size_t max_profile_index = hlo_to_profile_index.total_count(); + for (const auto& pair : hlo_to_profile_index.computation_to_profile_idx()) { + CHECK_LT(pair.second, max_profile_index); + const HloComputation* computation = pair.first; + size_t current_computation_index = computation_index_in_static_data++; + HloComputationInfo* computation_info = + &computation_infos[current_computation_index]; + + computation_info->name = strdup(computation->name().c_str()); + computation_info->profile_index = pair.second; + computation_info->instructions = + new HloInstructionInfo[computation->instruction_count()]; + computation_info->instructions_size = computation->instruction_count(); + + size_t instruction_index_in_static_data = 0; + for (const HloInstruction* hlo : computation->instructions()) { + HloProfilePrinter::HloInstructionInfo* instruction_info = + &computation_info->instructions[instruction_index_in_static_data++]; + instruction_info->long_name = strdup(hlo->ToString().c_str()); + instruction_info->short_name = + 
strdup(hlo->ToString(/*compact_operands=*/true).c_str()); + instruction_info->category = strdup(hlo->ToCategory().c_str()); + instruction_info->flop_count = cost_analysis.flop_count(*hlo); + instruction_info->transcendental_count = + cost_analysis.transcendental_count(*hlo); + instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo); + instruction_info->seconds = cost_analysis.seconds(*hlo); + instruction_info->profile_index = + hlo_to_profile_index.GetProfileIndexFor(*hlo); + CHECK_LT(instruction_info->profile_index, max_profile_index); + } + } + + auto deleter = [](HloProfilePrinter::HloComputationInfo* computation_infos, + int64 computation_infos_size) { + for (int64 i = 0; i < computation_infos_size; i++) { + HloInstructionInfo* instruction_infos = computation_infos[i].instructions; + for (int64 j = 0; j < computation_infos[i].instructions_size; j++) { + // We can't make instruction_infos[j].long_name etc. non-const pointers + // since they may point into static storage, so we have a const_cast + // here. 
+ free(const_cast(instruction_infos[j].long_name)); + free(const_cast(instruction_infos[j].short_name)); + free(const_cast(instruction_infos[j].category)); + } + delete[] instruction_infos; + free(const_cast(computation_infos[i].name)); + } + delete[] computation_infos; + }; + + return HloProfilePrinter(computation_infos, + hlo_to_profile_index.computation_count(), deleter); +} + +HloExecutionProfile::HloExecutionProfile(const HloModule& module, + const HloCostAnalysis& cost_analysis) + : hlo_to_profile_index_(module), + hlo_profile_printer_( + CreateOwnedHloProfilePrinter(hlo_to_profile_index_, cost_analysis)), + profile_counters_( + /*count*/ hlo_to_profile_index_.total_count(), + /*value*/ 0) {} void HloExecutionProfile::SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken) { - hlo_to_cycles_taken_[hlo] = cycles_taken; - profiled_computations_.insert(hlo->parent()); + profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(*hlo)] = + cycles_taken; } uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const { - auto iter = hlo_to_cycles_taken_.find(&hlo); - if (iter == hlo_to_cycles_taken_.end()) { - return 0; - } - return iter->second; + return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(hlo)]; } string HloExecutionProfile::ToString( - const HloComputation& computation, - const DeviceDescription& device_description, - HloCostAnalysis* cost_analysis) const { - tensorflow::Status analysis_status = computation.Accept(cost_analysis); - if (!analysis_status.ok()) { - return ""; - } - - HumanReadableProfileBuilder builder(computation.name(), - total_cycles_executed(computation), - device_description.clock_rate_ghz()); - for (const auto& item : hlo_to_cycles_taken_) { - const HloInstruction* hlo = item.first; - int64 cycles = item.second; - - builder.AddOp(/*op_name=*/hlo->ToString(), - /*short_name=*/hlo->ToString(/*compact_operands=*/true), - hlo->ToCategory(), cycles, cost_analysis->flop_count(*hlo), - 
cost_analysis->transcendental_count(*hlo), - cost_analysis->bytes_accessed(*hlo), - cost_analysis->seconds(*hlo)); - } - return builder.ToString(); + const DeviceDescription& device_description) const { + return hlo_profile_printer_.ToString(profile_counters_.data(), + device_description.clock_rate_ghz()); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index cdce77cff42..f945b9d84c6 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -18,7 +18,9 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_profile_printer.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -27,6 +29,54 @@ namespace xla { class HloInstruction; +// Maps all HloInstructions and HloComputations in an HloModule to integers. +// These integers form the contiguous range [0, GetTotalCount()). +class HloToProfileIndex { + public: + // Scans `module` to populate this instance of HloToProfileIndex. 
+ explicit HloToProfileIndex(const HloModule& module); + + HloToProfileIndex(const HloToProfileIndex&) = default; + HloToProfileIndex(HloToProfileIndex&&) = default; + + HloToProfileIndex& operator=(const HloToProfileIndex&) = default; + HloToProfileIndex& operator=(HloToProfileIndex&&) = default; + + size_t GetProfileIndexFor(const HloInstruction& instruction) const { + return FindOrDie(instruction_to_profile_idx(), &instruction); + } + + size_t GetProfileIndexFor(const HloComputation& computation) const { + return FindOrDie(computation_to_profile_idx(), &computation); + } + + size_t instruction_count() const { + return instruction_to_profile_idx().size(); + } + + size_t computation_count() const { + return computation_to_profile_idx().size(); + } + + size_t total_count() const { + return instruction_count() + computation_count(); + } + + const std::unordered_map& + instruction_to_profile_idx() const { + return instruction_to_profile_idx_; + } + + const std::unordered_map& + computation_to_profile_idx() const { + return computation_to_profile_idx_; + } + + private: + std::unordered_map instruction_to_profile_idx_; + std::unordered_map computation_to_profile_idx_; +}; + // Describes how much time each HLO operation took. // // Each HloComputation takes a certain number of cycles. This class helps break @@ -35,6 +85,9 @@ class HloExecutionProfile { public: using DeviceDescription = perftools::gputools::DeviceDescription; + HloExecutionProfile(const HloModule& module, + const HloCostAnalysis& cost_analysis); + // Record how many cycles this HLO took to execute. void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken); @@ -44,17 +97,15 @@ class HloExecutionProfile { // Return the number of cycles this computation took to execute. 
uint64 total_cycles_executed(const HloComputation& computation) const { - auto it = total_cycles_executed_.find(&computation); - if (it != total_cycles_executed_.end()) { - return it->second; - } - return 0; + return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor( + computation)]; } // Record how many cycles a computation took to execute. void set_total_cycles_executed(const HloComputation& computation, uint64 total_cycles_executed) { - total_cycles_executed_[&computation] = total_cycles_executed; + profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(computation)] = + total_cycles_executed; } // Returns a version of the execution profile suitable for performance @@ -63,25 +114,19 @@ class HloExecutionProfile { // for the operations in a given computation. Returns an empty string if it // wasn't possible to generate a printable version. cost_analysis should be a // clean analysis that can be used to visit the computation. - string ToString(const HloComputation& computation, - const DeviceDescription& device_description, - HloCostAnalysis* cost_analysis) const; - - // Returns the computations we have profiled. - std::unordered_set profiled_computations() const { - return profiled_computations_; - } + string ToString(const DeviceDescription& device_description) const; private: - // Contains a mapping from HLO to the number of cycles it took to execute it. - std::unordered_map hlo_to_cycles_taken_; + // hlo_to_profile_index_ maps an Hlo entity (computation or instruction) to an + // index in profile_counters_. + HloToProfileIndex hlo_to_profile_index_; - // If non-empty, contains the total number of cycles a computation took to - // execute. - std::unordered_map total_cycles_executed_; + // Used to print profile_counters_ in a human readable form. + HloProfilePrinter hlo_profile_printer_; - // The computations we have profiled. - std::unordered_set profiled_computations_; + // Stores per-Hlo profile counters. 
This is the only thing that changes when + // we execute an XLA computation. + std::vector profile_counters_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc new file mode 100644 index 00000000000..0628444b34b --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc @@ -0,0 +1,99 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace { + +class HloExecutionProfileTest : public HloTestBase { + protected: + static constexpr int64 kInstructionCyclesIndex = 0; + static constexpr int64 kInstructionNameIndex = 19; +}; + +// Splits `lines` into a sequence of lines delimited by newlines and then split +// each of those lines into a sequence of words delimited by spaces. Filter out +// empty words. 
+std::vector> SplitIntoLinesAndWords( + tensorflow::StringPiece lines) { + std::vector> result; + for (const string& line : tensorflow::str_util::Split(lines, '\n')) { + std::vector words; + for (const string& word : tensorflow::str_util::Split(line, ' ')) { + if (!word.empty()) { + words.push_back(word); + } + } + result.push_back(std::move(words)); + } + + return result; +} + +TEST_F(HloExecutionProfileTest, Basic) { + std::unique_ptr hlo_module = CreateNewModule(); + + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {30, 30}); + HloInstruction* param_lhs = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs")); + HloInstruction* param_rhs = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs")); + HloInstruction* add_instruction = + builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, param_lhs, param_rhs)); + HloInstruction* dot_instruction = + builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kDot, param_lhs, add_instruction)); + + hlo_module->AddEntryComputation(builder.Build()); + + auto shape_size_function = [&](const Shape& shape) { + const int64 pointer_size = 8; + if (ShapeUtil::IsOpaque(shape)) { + return pointer_size; + } + return ShapeUtil::ByteSizeOf(shape, pointer_size); + }; + + HloCostAnalysis cost_analysis(shape_size_function); + HloExecutionProfile execution_profile(*hlo_module, cost_analysis); + + const int64 add_cycles = 1000; + const int64 dot_cycles = 4000; + + execution_profile.SetCyclesTakenBy(add_instruction, add_cycles); + execution_profile.SetCyclesTakenBy(dot_instruction, dot_cycles); + + string rendered_profile = execution_profile.ToString( + backend().default_stream_executor()->GetDeviceDescription()); + std::vector> lines_and_words = + SplitIntoLinesAndWords(rendered_profile); + ASSERT_EQ(lines_and_words.size(), 8); + + const std::vector& line_2 = lines_and_words[2]; + const std::vector& line_3 = 
lines_and_words[3]; + + EXPECT_EQ(line_2[kInstructionCyclesIndex], std::to_string(dot_cycles)); + EXPECT_EQ(line_2[kInstructionNameIndex], dot_instruction->name()); + + EXPECT_EQ(line_3[kInstructionCyclesIndex], std::to_string(add_cycles)); + EXPECT_EQ(line_3[kInstructionNameIndex], add_instruction->name()); +} +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc new file mode 100644 index 00000000000..071c5a6629a --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc @@ -0,0 +1,67 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_profile_printer.h" + +#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" + +namespace xla { +string HloProfilePrinter::ToString(const int64* counters, + double clock_rate_ghz) const { + string result; + + for (int computation_idx = 0; computation_idx < computation_infos_size_; + computation_idx++) { + const HloComputationInfo& computation = computation_infos_[computation_idx]; + const HloInstructionInfo* instructions_begin = computation.instructions; + const HloInstructionInfo* instructions_end = + computation.instructions + computation.instructions_size; + bool any_instruction_profiled = + std::any_of(instructions_begin, instructions_end, + [&](const HloInstructionInfo& instruction_info) { + return counters[instruction_info.profile_index] != 0; + }); + + if (!any_instruction_profiled) { + continue; + } + + // Once we start using this in AOT for real, we will probably need a more + // minimal version of HumanReadableProfileBuilder. 
+ HumanReadableProfileBuilder builder( + computation.name, counters[computation.profile_index], clock_rate_ghz); + + for (const auto* instruction = instructions_begin; + instruction != instructions_end; instruction++) { + builder.AddOp( + /*op_name=*/instruction->long_name, + /*short_name=*/instruction->short_name, instruction->category, + counters[instruction->profile_index], instruction->flop_count, + instruction->transcendental_count, instruction->bytes_accessed, + instruction->seconds); + } + + result += builder.ToString(); + } + + return result; +} + +HloProfilePrinter::~HloProfilePrinter() { + if (deleter_) { + deleter_(computation_infos_, computation_infos_size_); + } +} +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h new file mode 100644 index 00000000000..45921c66f68 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h @@ -0,0 +1,97 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_ +#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_ + +#include +#include +#include + +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +// Instances of this class can pretty-print profile counters gathered from +// running an XLA computation without having access to the backing module. +class HloProfilePrinter { + public: + // Holds meta information about an HloInstruction. + // + // The pointer-typed fields can be owning or non-owning -- this decision is + // manifested as the deleter_ function in the containing HloProfilePrinter. + struct HloInstructionInfo { + // Textual information for pretty printing. + const char* long_name; + const char* short_name; + const char* category; + + // Metrics computed by HloCostAnalysis. + float flop_count; + float transcendental_count; + float bytes_accessed; + float seconds; + + // The index into the profile counters array for the HloInstruction + // corresponding to this HloInstructionInfo. + int64 profile_index; + }; + + // Holds meta information about an HloComputation. + // + // The pointer-typed fields can be owning or non-owning -- this decision is + // manifested as the deleter_ function in the containing HloProfilePrinter. + struct HloComputationInfo { + const char* name; + + // The index into the profile counters array for the HloInstruction + // corresponding to this HloComputationInfo. 
+ int64 profile_index; + + HloInstructionInfo* instructions; + int64 instructions_size; + }; + + HloProfilePrinter( + HloComputationInfo* computation_infos, int64 computation_infos_size, + std::function deleter = nullptr) + : computation_infos_(computation_infos), + computation_infos_size_(computation_infos_size), + deleter_(std::move(deleter)) {} + + HloProfilePrinter(HloProfilePrinter&& other) { + std::swap(other.computation_infos_, computation_infos_); + std::swap(other.computation_infos_size_, computation_infos_size_); + std::swap(other.deleter_, deleter_); + } + + HloProfilePrinter(const HloProfilePrinter&) = delete; + HloProfilePrinter& operator=(const HloProfilePrinter&) = delete; + + // Convert the profile counter sequence `counters` to a human readable string + // representation. + string ToString(const int64* counters, double clock_rate_ghz) const; + + ~HloProfilePrinter(); + + private: + // The `computation_infos_` field can be owning or non-owning -- this decision + // is manifested as the deleter_ function. 
+ HloComputationInfo* computation_infos_ = nullptr; + int64 computation_infos_size_ = 0; + std::function deleter_; +}; +} // namespace xla + +#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 158fb9a546c..63f2b1296ed 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -130,14 +130,13 @@ StatusOr HloRunner::Execute( run_options.set_intra_op_thread_pool( backend().eigen_intra_op_thread_pool_device()); - HloExecutionProfile hlo_execution_profile; ServiceExecutableRunOptions service_run_options( run_options, backend().StreamBorrower(), backend().inter_op_thread_pool()); TF_ASSIGN_OR_RETURN( se::DeviceMemoryBase result, executable->ExecuteOnStream(&service_run_options, arguments, - &hlo_execution_profile)); + /*hlo_execution_profile=*/nullptr)); TF_RET_CHECK(stream.BlockHostUntilDone()); allocations_.push_back(result); diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 71afbee456b..ee9501dd483 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -572,30 +572,15 @@ Service::ExecuteParallelAndRegisterResult( // profile. for (auto& index_to_profiled_stream : index_to_profiled_streams) { int64 device = index_to_profiled_stream.first; + auto& module = executables[device]->module(); se::Stream* stream = index_to_profiled_stream.second; - HloExecutionProfile hlo_profile; + HloExecutionProfile hlo_profile(module, + *executables[device]->CreateCostAnalysis()); TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile( &hlo_profile, stream->parent())); - - std::unordered_set profiled_computations = - hlo_profile.profiled_computations(); - // To ensure we have print the profiles in a stable order, iterate over the - // computations in post order. 
- auto& module = executables[device]->module(); - std::list all_computations = - module.MakeComputationPostOrder(); - for (xla::HloComputation* computation : all_computations) { - if (profiled_computations.count(computation) > 0) { - string profile_string = hlo_profile.ToString( - *computation, streams[0]->parent()->GetDeviceDescription(), - executables[device]->CreateCostAnalysis().get()); - if (!profile_string.empty()) { - LOG(INFO) << "HLO profile for execution on device " << device - << ":\n"; - XLA_LOG_LINES(tensorflow::INFO, profile_string); - } - } - } + XLA_LOG_LINES( + tensorflow::INFO, + hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription())); hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute", &hlo_profile); }