Allow cost estimates to differ per backend and include the estimates in the HLO profile. Add a summary table showing which categories have the most remaining opportunity for optimization.

PiperOrigin-RevId: 163780413
This commit is contained in:
Bjarke Hammersholt Roune 2017-07-31 18:48:42 -07:00 committed by TensorFlower Gardener
parent 14b7367613
commit b882d686ff
15 changed files with 113 additions and 51 deletions

View File

@ -26,7 +26,7 @@ namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin; namespace sep = ::perftools::gputools::executorplugin;
ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module) ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
: Executable(std::move(hlo_module), ShapeSizeBytes) {} : Executable(std::move(hlo_module)) {}
ExecutorExecutable::~ExecutorExecutable() {} ExecutorExecutable::~ExecutorExecutable() {}
@ -140,5 +140,10 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
} }
// Returns a newly allocated HloCostAnalysis that uses
// ExecutorExecutable::ShapeSizeBytes as its shape-size function.
std::unique_ptr<HloCostAnalysis> ExecutorExecutable::CreateCostAnalysis()
    const {
  std::unique_ptr<HloCostAnalysis> analysis =
      MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
  return analysis;
}
} // namespace executorplugin } // namespace executorplugin
} // namespace xla } // namespace xla

View File

@ -55,6 +55,8 @@ class ExecutorExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape); static int64 ShapeSizeBytes(const Shape& shape);
// Overrides Executable::CreateCostAnalysis to supply the cost analysis object
// for this executor-plugin executable.
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private: private:
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable); TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
}; };

View File

@ -54,7 +54,7 @@ CpuExecutable::CpuExecutable(
std::unique_ptr<BufferAssignment> assignment, std::unique_ptr<BufferAssignment> assignment,
std::unique_ptr<HloModule> hlo_module, const string& entry_function_name, std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx) std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
: Executable(std::move(hlo_module), CpuExecutable::ShapeSizeBytes), : Executable(std::move(hlo_module)),
jit_(std::move(jit)), jit_(std::move(jit)),
assignment_(std::move(assignment)), assignment_(std::move(assignment)),
hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) { hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
@ -380,5 +380,9 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction()); module().entry_computation()->root_instruction());
} }
std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
}
} // namespace cpu } // namespace cpu
} // namespace xla } // namespace xla

View File

@ -85,6 +85,8 @@ class CpuExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape); static int64 ShapeSizeBytes(const Shape& shape);
// Overrides Executable::CreateCostAnalysis; returns the cost analysis object
// for this CPU executable.
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private: private:
// Allocate buffers required for execution and assign them to the elements of // Allocate buffers required for execution and assign them to the elements of
// "buffers". "buffers" should be sized to the number of buffers in buffer // "buffers". "buffers" should be sized to the number of buffers in buffer

View File

@ -62,7 +62,7 @@ ParallelCpuExecutable::ParallelCpuExecutable(
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx, std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>> std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
aligned_constants) aligned_constants)
: Executable(std::move(hlo_module), ParallelCpuExecutable::ShapeSizeBytes), : Executable(std::move(hlo_module)),
jit_(std::move(jit)), jit_(std::move(jit)),
assignment_(std::move(assignment)), assignment_(std::move(assignment)),
functions_names_(std::move(function_names)), functions_names_(std::move(function_names)),
@ -622,5 +622,10 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction()); module().entry_computation()->root_instruction());
} }
// Produces a fresh HloCostAnalysis for this parallel CPU executable, passing
// ShapeSizeBytes as the function that computes the size of a Shape.
std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
    const {
  std::unique_ptr<HloCostAnalysis> result =
      MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
  return result;
}
} // namespace cpu } // namespace cpu
} // namespace xla } // namespace xla

View File

@ -95,6 +95,8 @@ class ParallelCpuExecutable : public Executable {
"Equality test on CPU parallel executable is not implemented."); "Equality test on CPU parallel executable is not implemented.");
} }
// Overrides Executable::CreateCostAnalysis; returns the cost analysis object
// for this parallel CPU executable.
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private: private:
// Allocate buffers required for execution and assign them to the elements of // Allocate buffers required for execution and assign them to the elements of
// "buffers". "buffers" should be sized to the number of buffers in buffer // "buffers". "buffers" should be sized to the number of buffers in buffer

View File

@ -44,10 +44,8 @@ namespace xla {
// interface that is used for launching compiled programs across platforms. // interface that is used for launching compiled programs across platforms.
class Executable { class Executable {
public: public:
explicit Executable(std::unique_ptr<HloModule> hlo_module, explicit Executable(std::unique_ptr<HloModule> hlo_module)
HloCostAnalysis::ShapeSizeFunction shape_size_function) : hlo_module_(std::move(hlo_module)) {}
: hlo_module_(std::move(hlo_module)),
shape_size_function_(std::move(shape_size_function)) {}
virtual ~Executable() {} virtual ~Executable() {}
// Enqueues the compilation result on the provided stream, passing the given // Enqueues the compilation result on the provided stream, passing the given
@ -152,10 +150,9 @@ class Executable {
static Status DumpToDirectory(const string& directory_path, string filename, static Status DumpToDirectory(const string& directory_path, string filename,
const SessionModule& session_module); const SessionModule& session_module);
// Return a reference to a function that computes the size of a given Shape. // Returns a cost analysis object appropriate for the platform on which this
const HloCostAnalysis::ShapeSizeFunction& shape_size_function() const { // executable can run.
return shape_size_function_; virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
}
protected: protected:
mutable tensorflow::mutex mutex_; mutable tensorflow::mutex mutex_;
@ -168,11 +165,6 @@ class Executable {
// around. // around.
std::unique_ptr<HloModule> hlo_module_; std::unique_ptr<HloModule> hlo_module_;
// Function to compute the size of a given Shape, in bytes. This is
// provided to the Executable when it is constructed, and used to produce
// data for profiling the execution.
HloCostAnalysis::ShapeSizeFunction shape_size_function_;
// SessionModule this was compiled from. Null if not dumping executions. // SessionModule this was compiled from. Null if not dumping executions.
std::unique_ptr<SessionModule> session_module_; std::unique_ptr<SessionModule> session_module_;
@ -240,7 +232,7 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
if (profiled_computations.count(computation) > 0) { if (profiled_computations.count(computation) > 0) {
string profile_string = profile_ptr->ToString( string profile_string = profile_ptr->ToString(
*computation, stream->parent()->GetDeviceDescription(), *computation, stream->parent()->GetDeviceDescription(),
shape_size_function_); CreateCostAnalysis().get());
if (!profile_string.empty()) { if (!profile_string.empty()) {
XLA_LOG_LINES(tensorflow::INFO, profile_string); XLA_LOG_LINES(tensorflow::INFO, profile_string);
} }

View File

@ -112,10 +112,11 @@ GpuExecutable::GpuExecutable(
std::unique_ptr<HloModule> hlo_module, std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<BufferAssignment> assignment, std::unique_ptr<BufferAssignment> assignment,
HloCostAnalysis::ShapeSizeFunction shape_size_function) HloCostAnalysis::ShapeSizeFunction shape_size_function)
: Executable(std::move(hlo_module), std::move(shape_size_function)), : Executable(std::move(hlo_module)),
ptx_(ptx), ptx_(ptx),
thunk_schedule_(std::move(thunk_schedule)), thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {} assignment_(std::move(assignment)),
shape_size_function_(std::move(shape_size_function)) {}
Status GpuExecutable::ExecuteThunks( Status GpuExecutable::ExecuteThunks(
const ServiceExecutableRunOptions* run_options, const ServiceExecutableRunOptions* run_options,
@ -356,5 +357,9 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction()); module().entry_computation()->root_instruction());
} }
std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
return MakeUnique<HloCostAnalysis>(shape_size_function_);
}
} // namespace gpu } // namespace gpu
} // namespace xla } // namespace xla

View File

@ -85,6 +85,8 @@ class GpuExecutable : public Executable {
return Unimplemented("Equality test on GPU executable is not implemented."); return Unimplemented("Equality test on GPU executable is not implemented.");
} }
// Overrides Executable::CreateCostAnalysis; returns the cost analysis object
// for this GPU executable.
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private: private:
// If `block_host_until_done` is false, execution will not block the host // If `block_host_until_done` is false, execution will not block the host
// until the kernels have completed. This is used as an optimization for // until the kernels have completed. This is used as an optimization for
@ -119,6 +121,9 @@ class GpuExecutable : public Executable {
// memory for every output/temp buffers. // memory for every output/temp buffers.
const std::unique_ptr<BufferAssignment> assignment_; const std::unique_ptr<BufferAssignment> assignment_;
// Function to compute the size of a given Shape, in bytes.
HloCostAnalysis::ShapeSizeFunction shape_size_function_;
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
}; };

View File

@ -65,7 +65,8 @@ Status HloCostAnalysis::Postprocess(HloInstruction* hlo) {
if (property.first != kSecondsKey) { if (property.first != kSecondsKey) {
max_seconds = std::max( max_seconds = std::max(
max_seconds, max_seconds,
property.second / GetProperty(property.first, per_second_rates_)); property.second /
GetProperty(property.first, per_second_rates_, INFINITY));
} }
} }
current_properties_[kSecondsKey] = max_seconds; current_properties_[kSecondsKey] = max_seconds;
@ -97,9 +98,10 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) {
} }
/*static*/ float HloCostAnalysis::GetProperty(const string& key, /*static*/ float HloCostAnalysis::GetProperty(const string& key,
const Properties& properties) { const Properties& properties,
const float default_value) {
auto key_value = properties.find(key); auto key_value = properties.find(key);
return key_value == properties.end() ? 0.0f : key_value->second; return key_value == properties.end() ? default_value : key_value->second;
} }
/*static*/ float HloCostAnalysis::GetPropertyForHlo( /*static*/ float HloCostAnalysis::GetPropertyForHlo(
@ -523,6 +525,10 @@ int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const {
return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_); return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_);
} }
// Returns the value recorded under kSecondsKey for `hlo` in hlo_properties_,
// as looked up by GetPropertyForHlo.
float HloCostAnalysis::seconds(const HloInstruction& hlo) const {
  const float seconds_for_hlo =
      GetPropertyForHlo(hlo, kSecondsKey, hlo_properties_);
  return seconds_for_hlo;
}
StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation( StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
HloComputation* computation, const ShapeSizeFunction* shape_size) { HloComputation* computation, const ShapeSizeFunction* shape_size) {
if (shape_size == nullptr) { if (shape_size == nullptr) {

View File

@ -175,9 +175,11 @@ class HloCostAnalysis : public DfsHloVisitor {
// Utility function to handle all element-wise operations. // Utility function to handle all element-wise operations.
Status HandleElementwiseOp(HloInstruction* hlo_instruction); Status HandleElementwiseOp(HloInstruction* hlo_instruction);
// Returns 0.0f if the key is not present in the properties. Otherwise, // Returns the default value if the key is not present in the
// returns the value that the key maps to from the properties parameter. // properties. Otherwise, returns the value that the key maps to from the
static float GetProperty(const string& key, const Properties& properties); // properties parameter.
static float GetProperty(const string& key, const Properties& properties,
float default_value = 0.0f);
// Returns 0.0f if the hlo is not present in hlo_to_properties or if the key // Returns 0.0f if the hlo is not present in hlo_to_properties or if the key
// is not present in hlo_to_properties[hlo]. Otherwise, returns the value that // is not present in hlo_to_properties[hlo]. Otherwise, returns the value that

View File

@ -44,10 +44,9 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
string HloExecutionProfile::ToString( string HloExecutionProfile::ToString(
const HloComputation& computation, const HloComputation& computation,
const DeviceDescription& device_description, const DeviceDescription& device_description,
const HloCostAnalysis::ShapeSizeFunction& shape_size) const { HloCostAnalysis* cost_analysis) const {
HloCostAnalysis cost_analysis(shape_size);
tensorflow::Status analysis_status = tensorflow::Status analysis_status =
computation.root_instruction()->Accept(&cost_analysis); computation.root_instruction()->Accept(cost_analysis);
if (!analysis_status.ok()) { if (!analysis_status.ok()) {
return ""; return "";
} }
@ -61,8 +60,9 @@ string HloExecutionProfile::ToString(
builder.AddOp(/*op_name=*/hlo->ToString(), builder.AddOp(/*op_name=*/hlo->ToString(),
/*short_name=*/hlo->ToString(/*compact_operands=*/true), /*short_name=*/hlo->ToString(/*compact_operands=*/true),
hlo->ToCategory(), cycles, cost_analysis.flop_count(*hlo), hlo->ToCategory(), cycles, cost_analysis->flop_count(*hlo),
cost_analysis.bytes_accessed(*hlo)); cost_analysis->bytes_accessed(*hlo),
cost_analysis->seconds(*hlo));
} }
return builder.ToString(); return builder.ToString();
} }

View File

@ -60,12 +60,12 @@ class HloExecutionProfile {
// Returns a version of the execution profile suitable for performance // Returns a version of the execution profile suitable for performance
// debugging; e.g. emits cycle counts, execution time at the nominal device // debugging; e.g. emits cycle counts, execution time at the nominal device
// frequency, and the effective throughput given the provided cost_analysis // frequency, and the effective throughput given the provided cost_analysis
// for the operations in a given computation. // for the operations in a given computation. Returns an empty string if it
// Returns an empty string if it wasn't possible to generate a printable // wasn't possible to generate a printable version. cost_analysis should be a
// version. // clean analysis that can be used to visit the computation.
string ToString(const HloComputation& computation, string ToString(const HloComputation& computation,
const DeviceDescription& device_description, const DeviceDescription& device_description,
const HloCostAnalysis::ShapeSizeFunction& shape_size) const; HloCostAnalysis* cost_analysis) const;
// Returns the computations we have profiled. // Returns the computations we have profiled.
std::unordered_set<const HloComputation*> profiled_computations() const { std::unordered_set<const HloComputation*> profiled_computations() const {

View File

@ -53,16 +53,23 @@ string HumanReadableProfileBuilder::ToString() const {
double nsecs = op.cycles / clock_rate_ghz_; double nsecs = op.cycles / clock_rate_ghz_;
Appendf(&s, Appendf(&s,
"\t%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s " "%15lld cycles (%6.2f%%) :: %12.1f usec (%12.1f optimal) "
":: %12s/s :: %12s/cycle :: %s\n", ":: %18s :: %12s/s :: %12s/cycle :: %s\n",
op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles),
op.optimal_seconds * 1e6,
op.flop_count <= 0 op.flop_count <= 0
? "<none>" ? "<none>"
: HumanReadableNumFlops(op.flop_count, nsecs).c_str(), : HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str());
}; };
append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1}); float optimal_seconds_sum = 0.0;
for (const auto& op : op_infos_) {
optimal_seconds_sum += op.optimal_seconds;
}
append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1,
optimal_seconds_sum});
// Sort ops in decreasing order of cycles. // Sort ops in decreasing order of cycles.
std::vector<OpInfo> sorted_ops(op_infos_); std::vector<OpInfo> sorted_ops(op_infos_);
@ -76,19 +83,43 @@ string HumanReadableProfileBuilder::ToString() const {
if (total_cycles_ <= 0) { if (total_cycles_ <= 0) {
StrAppend(&s, "****** 0 total cycles ******\n"); StrAppend(&s, "****** 0 total cycles ******\n");
} else { } else {
MetricTableReport table; // Only show an optimal discrepancy table if at least one value was
table.SetMetricName("microseconds"); // specified. Estimates are non-negative, so if the sum is greater than
table.SetEntryName("ops"); // zero, then at least one summand was greater than zero.
table.SetShowCategoryTable(); if (optimal_seconds_sum > 0) {
for (const auto& op : sorted_ops) { MetricTableReport table;
MetricTableReport::Entry entry; table.SetMetricName("microseconds above estimated optimum");
entry.text = op.name; table.SetEntryName("ops");
entry.short_text = op.short_name; table.SetShowCategoryTable();
entry.category_text = op.category; float total_discrepancy_in_microseconds = 0.0f;
entry.metric = CyclesToMicroseconds(op.cycles); for (const auto& op : sorted_ops) {
table.AddEntry(std::move(entry)); MetricTableReport::Entry entry;
entry.text = op.name;
entry.short_text = op.short_name;
entry.category_text = op.category;
entry.metric =
CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6;
total_discrepancy_in_microseconds += entry.metric;
table.AddEntry(std::move(entry));
}
StrAppend(&s, table.MakeReport(total_discrepancy_in_microseconds));
}
{
MetricTableReport table;
table.SetMetricName("microseconds");
table.SetEntryName("ops");
table.SetShowCategoryTable();
for (const auto& op : sorted_ops) {
MetricTableReport::Entry entry;
entry.text = op.name;
entry.short_text = op.short_name;
entry.category_text = op.category;
entry.metric = CyclesToMicroseconds(op.cycles);
table.AddEntry(std::move(entry));
}
StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
} }
StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
} }
return s; return s;
} }

View File

@ -45,10 +45,10 @@ class HumanReadableProfileBuilder {
void AddOp(tensorflow::StringPiece op_name, void AddOp(tensorflow::StringPiece op_name,
tensorflow::StringPiece short_name, tensorflow::StringPiece short_name,
tensorflow::StringPiece category, int64 cycles, int64 flop_count, tensorflow::StringPiece category, int64 cycles, int64 flop_count,
int64 bytes_accessed) { int64 bytes_accessed, float optimal_seconds) {
op_infos_.push_back({op_name.ToString(), short_name.ToString(), op_infos_.push_back({op_name.ToString(), short_name.ToString(),
category.ToString(), cycles, flop_count, category.ToString(), cycles, flop_count,
bytes_accessed}); bytes_accessed, optimal_seconds});
} }
// Gets the human-readable profile. // Gets the human-readable profile.
@ -62,6 +62,7 @@ class HumanReadableProfileBuilder {
int64 cycles; int64 cycles;
int64 flop_count; int64 flop_count;
int64 bytes_accessed; int64 bytes_accessed;
float optimal_seconds;
}; };
double CyclesToSeconds(int64 cycles) const { double CyclesToSeconds(int64 cycles) const {