Allow cost estimates to differ per backend and include the estimates in the HLO profile. Add a summary table showing which categories have the most opportunity for optimization left in them.

PiperOrigin-RevId: 163780413
This commit is contained in:
Bjarke Hammersholt Roune 2017-07-31 18:48:42 -07:00 committed by TensorFlower Gardener
parent 14b7367613
commit b882d686ff
15 changed files with 113 additions and 51 deletions

View File

@ -26,7 +26,7 @@ namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;
ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
: Executable(std::move(hlo_module), ShapeSizeBytes) {}
: Executable(std::move(hlo_module)) {}
ExecutorExecutable::~ExecutorExecutable() {}
@ -140,5 +140,10 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}
// Override of Executable::CreateCostAnalysis for the executor plugin backend.
// Returns a newly allocated HloCostAnalysis constructed with
// ExecutorExecutable::ShapeSizeBytes as its shape-size function. Ownership of
// the analysis passes to the caller.
std::unique_ptr<HloCostAnalysis> ExecutorExecutable::CreateCostAnalysis()
const {
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
}
} // namespace executorplugin
} // namespace xla

View File

@ -55,6 +55,8 @@ class ExecutorExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape);
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private:
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
};

View File

@ -54,7 +54,7 @@ CpuExecutable::CpuExecutable(
std::unique_ptr<BufferAssignment> assignment,
std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
: Executable(std::move(hlo_module), CpuExecutable::ShapeSizeBytes),
: Executable(std::move(hlo_module)),
jit_(std::move(jit)),
assignment_(std::move(assignment)),
hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
@ -380,5 +380,9 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
// Override of Executable::CreateCostAnalysis for the CPU backend. Returns a
// newly allocated HloCostAnalysis constructed with
// CpuExecutable::ShapeSizeBytes as its shape-size function. Ownership of the
// analysis passes to the caller.
std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
}
} // namespace cpu
} // namespace xla

View File

@ -85,6 +85,8 @@ class CpuExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape);
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private:
// Allocate buffers required for execution and assign them to the elements of
// "buffers". "buffers" should be sized to the number of buffers in buffer

View File

@ -62,7 +62,7 @@ ParallelCpuExecutable::ParallelCpuExecutable(
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
aligned_constants)
: Executable(std::move(hlo_module), ParallelCpuExecutable::ShapeSizeBytes),
: Executable(std::move(hlo_module)),
jit_(std::move(jit)),
assignment_(std::move(assignment)),
functions_names_(std::move(function_names)),
@ -622,5 +622,10 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
// Override of Executable::CreateCostAnalysis for the parallel CPU backend.
// Returns a newly allocated HloCostAnalysis constructed with the class's
// ShapeSizeBytes function (the one the constructor previously forwarded to
// Executable). Ownership of the analysis passes to the caller.
std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
const {
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
}
} // namespace cpu
} // namespace xla

View File

@ -95,6 +95,8 @@ class ParallelCpuExecutable : public Executable {
"Equality test on CPU parallel executable is not implemented.");
}
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private:
// Allocate buffers required for execution and assign them to the elements of
// "buffers". "buffers" should be sized to the number of buffers in buffer

View File

@ -44,10 +44,8 @@ namespace xla {
// interface that is used for launching compiled programs across platforms.
class Executable {
public:
explicit Executable(std::unique_ptr<HloModule> hlo_module,
HloCostAnalysis::ShapeSizeFunction shape_size_function)
: hlo_module_(std::move(hlo_module)),
shape_size_function_(std::move(shape_size_function)) {}
explicit Executable(std::unique_ptr<HloModule> hlo_module)
: hlo_module_(std::move(hlo_module)) {}
virtual ~Executable() {}
// Enqueues the compilation result on the provided stream, passing the given
@ -152,10 +150,9 @@ class Executable {
static Status DumpToDirectory(const string& directory_path, string filename,
const SessionModule& session_module);
// Return a reference to a function that computes the size of a given Shape.
const HloCostAnalysis::ShapeSizeFunction& shape_size_function() const {
return shape_size_function_;
}
// Returns a cost analysis object appropriate for the platform on which this
// executable can run.
virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
protected:
mutable tensorflow::mutex mutex_;
@ -168,11 +165,6 @@ class Executable {
// around.
std::unique_ptr<HloModule> hlo_module_;
// Function to compute the size of a given Shape, in bytes. This is
// provided to the Executable when it is constructed, and used to produce
// data for profiling the execution.
HloCostAnalysis::ShapeSizeFunction shape_size_function_;
// SessionModule this was compiled from. Null if not dumping executions.
std::unique_ptr<SessionModule> session_module_;
@ -240,7 +232,7 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
if (profiled_computations.count(computation) > 0) {
string profile_string = profile_ptr->ToString(
*computation, stream->parent()->GetDeviceDescription(),
shape_size_function_);
CreateCostAnalysis().get());
if (!profile_string.empty()) {
XLA_LOG_LINES(tensorflow::INFO, profile_string);
}

View File

@ -112,10 +112,11 @@ GpuExecutable::GpuExecutable(
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<BufferAssignment> assignment,
HloCostAnalysis::ShapeSizeFunction shape_size_function)
: Executable(std::move(hlo_module), std::move(shape_size_function)),
: Executable(std::move(hlo_module)),
ptx_(ptx),
thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {}
assignment_(std::move(assignment)),
shape_size_function_(std::move(shape_size_function)) {}
Status GpuExecutable::ExecuteThunks(
const ServiceExecutableRunOptions* run_options,
@ -356,5 +357,9 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
// Override of Executable::CreateCostAnalysis for the GPU backend. Unlike the
// CPU backends, the shape-size function is not a static member: the analysis
// is constructed with shape_size_function_, which the constructor stores from
// its shape_size_function argument. Ownership of the analysis passes to the
// caller.
std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
return MakeUnique<HloCostAnalysis>(shape_size_function_);
}
} // namespace gpu
} // namespace xla

View File

@ -85,6 +85,8 @@ class GpuExecutable : public Executable {
return Unimplemented("Equality test on GPU executable is not implemented.");
}
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private:
// If `block_host_until_done` is false, execution will not block the host
// until the kernels have completed. This is used as an optimization for
@ -119,6 +121,9 @@ class GpuExecutable : public Executable {
// memory for every output/temp buffers.
const std::unique_ptr<BufferAssignment> assignment_;
// Function to compute the size of a given Shape, in bytes.
HloCostAnalysis::ShapeSizeFunction shape_size_function_;
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

View File

@ -65,7 +65,8 @@ Status HloCostAnalysis::Postprocess(HloInstruction* hlo) {
if (property.first != kSecondsKey) {
max_seconds = std::max(
max_seconds,
property.second / GetProperty(property.first, per_second_rates_));
property.second /
GetProperty(property.first, per_second_rates_, INFINITY));
}
}
current_properties_[kSecondsKey] = max_seconds;
@ -97,9 +98,10 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) {
}
/*static*/ float HloCostAnalysis::GetProperty(const string& key,
const Properties& properties) {
const Properties& properties,
const float default_value) {
auto key_value = properties.find(key);
return key_value == properties.end() ? 0.0f : key_value->second;
return key_value == properties.end() ? default_value : key_value->second;
}
/*static*/ float HloCostAnalysis::GetPropertyForHlo(
@ -523,6 +525,10 @@ int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const {
return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_);
}
// Returns the value stored under kSecondsKey for `hlo` in hlo_properties_,
// looked up via GetPropertyForHlo. HloExecutionProfile::ToString passes this
// value to the profile builder as the op's optimal_seconds.
// NOTE(review): presumably this is the per-instruction estimate that
// Postprocess computes as the maximum of (property / per-second rate) --
// confirm that current_properties_ is copied into hlo_properties_.
float HloCostAnalysis::seconds(const HloInstruction& hlo) const {
return GetPropertyForHlo(hlo, kSecondsKey, hlo_properties_);
}
StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
HloComputation* computation, const ShapeSizeFunction* shape_size) {
if (shape_size == nullptr) {

View File

@ -175,9 +175,11 @@ class HloCostAnalysis : public DfsHloVisitor {
// Utility function to handle all element-wise operations.
Status HandleElementwiseOp(HloInstruction* hlo_instruction);
// Returns 0.0f if the key is not present in the properties. Otherwise,
// returns the value that the key maps to from the properties parameter.
static float GetProperty(const string& key, const Properties& properties);
// Returns the default value if the key is not present in the
// properties. Otherwise, returns the value that the key maps to from the
// properties parameter.
static float GetProperty(const string& key, const Properties& properties,
float default_value = 0.0f);
// Returns 0.0f if the hlo is not present in hlo_to_properties or if the key
// is not present in hlo_to_properties[hlo]. Otherwise, returns the value that

View File

@ -44,10 +44,9 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
string HloExecutionProfile::ToString(
const HloComputation& computation,
const DeviceDescription& device_description,
const HloCostAnalysis::ShapeSizeFunction& shape_size) const {
HloCostAnalysis cost_analysis(shape_size);
HloCostAnalysis* cost_analysis) const {
tensorflow::Status analysis_status =
computation.root_instruction()->Accept(&cost_analysis);
computation.root_instruction()->Accept(cost_analysis);
if (!analysis_status.ok()) {
return "";
}
@ -61,8 +60,9 @@ string HloExecutionProfile::ToString(
builder.AddOp(/*op_name=*/hlo->ToString(),
/*short_name=*/hlo->ToString(/*compact_operands=*/true),
hlo->ToCategory(), cycles, cost_analysis.flop_count(*hlo),
cost_analysis.bytes_accessed(*hlo));
hlo->ToCategory(), cycles, cost_analysis->flop_count(*hlo),
cost_analysis->bytes_accessed(*hlo),
cost_analysis->seconds(*hlo));
}
return builder.ToString();
}

View File

@ -60,12 +60,12 @@ class HloExecutionProfile {
// Returns a version of the execution profile suitable for performance
// debugging; e.g. emits cycle counts, execution time at the nominal device
// frequency, and the effective throughput given the provided cost_analysis
// for the operations in a given computation.
// Returns an empty string if it wasn't possible to generate a printable
// version.
// for the operations in a given computation. Returns an empty string if it
// wasn't possible to generate a printable version. cost_analysis should be a
// clean analysis that can be used to visit the computation.
string ToString(const HloComputation& computation,
const DeviceDescription& device_description,
const HloCostAnalysis::ShapeSizeFunction& shape_size) const;
HloCostAnalysis* cost_analysis) const;
// Returns the computations we have profiled.
std::unordered_set<const HloComputation*> profiled_computations() const {

View File

@ -53,16 +53,23 @@ string HumanReadableProfileBuilder::ToString() const {
double nsecs = op.cycles / clock_rate_ghz_;
Appendf(&s,
"\t%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s "
":: %12s/s :: %12s/cycle :: %s\n",
"%15lld cycles (%6.2f%%) :: %12.1f usec (%12.1f optimal) "
":: %18s :: %12s/s :: %12s/cycle :: %s\n",
op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles),
op.optimal_seconds * 1e6,
op.flop_count <= 0
? "<none>"
: HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str());
};
append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1});
float optimal_seconds_sum = 0.0;
for (const auto& op : op_infos_) {
optimal_seconds_sum += op.optimal_seconds;
}
append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1,
optimal_seconds_sum});
// Sort ops in decreasing order of cycles.
std::vector<OpInfo> sorted_ops(op_infos_);
@ -76,19 +83,43 @@ string HumanReadableProfileBuilder::ToString() const {
if (total_cycles_ <= 0) {
StrAppend(&s, "****** 0 total cycles ******\n");
} else {
MetricTableReport table;
table.SetMetricName("microseconds");
table.SetEntryName("ops");
table.SetShowCategoryTable();
for (const auto& op : sorted_ops) {
MetricTableReport::Entry entry;
entry.text = op.name;
entry.short_text = op.short_name;
entry.category_text = op.category;
entry.metric = CyclesToMicroseconds(op.cycles);
table.AddEntry(std::move(entry));
// Only show an optimal discrepancy table if at least one value was
// specified. Estimates are non-negative, so if the sum is greater than
// zero, then at least one summand was greater than zero.
if (optimal_seconds_sum > 0) {
MetricTableReport table;
table.SetMetricName("microseconds above estimated optimum");
table.SetEntryName("ops");
table.SetShowCategoryTable();
float total_discrepancy_in_microseconds = 0.0f;
for (const auto& op : sorted_ops) {
MetricTableReport::Entry entry;
entry.text = op.name;
entry.short_text = op.short_name;
entry.category_text = op.category;
entry.metric =
CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6;
total_discrepancy_in_microseconds += entry.metric;
table.AddEntry(std::move(entry));
}
StrAppend(&s, table.MakeReport(total_discrepancy_in_microseconds));
}
{
MetricTableReport table;
table.SetMetricName("microseconds");
table.SetEntryName("ops");
table.SetShowCategoryTable();
for (const auto& op : sorted_ops) {
MetricTableReport::Entry entry;
entry.text = op.name;
entry.short_text = op.short_name;
entry.category_text = op.category;
entry.metric = CyclesToMicroseconds(op.cycles);
table.AddEntry(std::move(entry));
}
StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
}
StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
}
return s;
}

View File

@ -45,10 +45,10 @@ class HumanReadableProfileBuilder {
void AddOp(tensorflow::StringPiece op_name,
tensorflow::StringPiece short_name,
tensorflow::StringPiece category, int64 cycles, int64 flop_count,
int64 bytes_accessed) {
int64 bytes_accessed, float optimal_seconds) {
op_infos_.push_back({op_name.ToString(), short_name.ToString(),
category.ToString(), cycles, flop_count,
bytes_accessed});
bytes_accessed, optimal_seconds});
}
// Gets the human-readable profile.
@ -62,6 +62,7 @@ class HumanReadableProfileBuilder {
int64 cycles;
int64 flop_count;
int64 bytes_accessed;
float optimal_seconds;
};
double CyclesToSeconds(int64 cycles) const {