Allow cost estimates to differ per backend and include the estimates in the HLO profile. Add a summary table showing which categories have the most room for optimization left.
PiperOrigin-RevId: 163780413
This commit is contained in:
parent
14b7367613
commit
b882d686ff
tensorflow/compiler
plugin/executor
xla/service
@ -26,7 +26,7 @@ namespace se = ::perftools::gputools;
|
||||
namespace sep = ::perftools::gputools::executorplugin;
|
||||
|
||||
ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
|
||||
: Executable(std::move(hlo_module), ShapeSizeBytes) {}
|
||||
: Executable(std::move(hlo_module)) {}
|
||||
|
||||
ExecutorExecutable::~ExecutorExecutable() {}
|
||||
|
||||
@ -140,5 +140,10 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
|
||||
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
|
||||
}
|
||||
|
||||
// Creates the cost analysis for this executable: a new HloCostAnalysis
// constructed with ExecutorExecutable::ShapeSizeBytes as its shape-size
// function. Ownership of the returned object passes to the caller.
std::unique_ptr<HloCostAnalysis> ExecutorExecutable::CreateCostAnalysis()
|
||||
const {
|
||||
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
|
||||
}
|
||||
|
||||
} // namespace executorplugin
|
||||
} // namespace xla
|
||||
|
@ -55,6 +55,8 @@ class ExecutorExecutable : public Executable {
|
||||
|
||||
static int64 ShapeSizeBytes(const Shape& shape);
|
||||
|
||||
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
|
||||
|
||||
private:
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
|
||||
};
|
||||
|
@ -54,7 +54,7 @@ CpuExecutable::CpuExecutable(
|
||||
std::unique_ptr<BufferAssignment> assignment,
|
||||
std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
|
||||
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
|
||||
: Executable(std::move(hlo_module), CpuExecutable::ShapeSizeBytes),
|
||||
: Executable(std::move(hlo_module)),
|
||||
jit_(std::move(jit)),
|
||||
assignment_(std::move(assignment)),
|
||||
hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
|
||||
@ -380,5 +380,9 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
|
||||
module().entry_computation()->root_instruction());
|
||||
}
|
||||
|
||||
// Creates the cost analysis for this executable: a new HloCostAnalysis
// constructed with CpuExecutable::ShapeSizeBytes as its shape-size function.
// Ownership of the returned object passes to the caller.
std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
|
||||
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
|
||||
}
|
||||
|
||||
} // namespace cpu
|
||||
} // namespace xla
|
||||
|
@ -85,6 +85,8 @@ class CpuExecutable : public Executable {
|
||||
|
||||
static int64 ShapeSizeBytes(const Shape& shape);
|
||||
|
||||
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
|
||||
|
||||
private:
|
||||
// Allocate buffers required for execution and assign them to the elements of
|
||||
// "buffers". "buffers" should be sized to the number of buffers in buffer
|
||||
|
@ -62,7 +62,7 @@ ParallelCpuExecutable::ParallelCpuExecutable(
|
||||
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
|
||||
std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
|
||||
aligned_constants)
|
||||
: Executable(std::move(hlo_module), ParallelCpuExecutable::ShapeSizeBytes),
|
||||
: Executable(std::move(hlo_module)),
|
||||
jit_(std::move(jit)),
|
||||
assignment_(std::move(assignment)),
|
||||
functions_names_(std::move(function_names)),
|
||||
@ -622,5 +622,10 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
|
||||
module().entry_computation()->root_instruction());
|
||||
}
|
||||
|
||||
// Creates the cost analysis for this executable: a new HloCostAnalysis
// constructed with ShapeSizeBytes as its shape-size function. Ownership of the
// returned object passes to the caller.
std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
|
||||
const {
|
||||
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
|
||||
}
|
||||
|
||||
} // namespace cpu
|
||||
} // namespace xla
|
||||
|
@ -95,6 +95,8 @@ class ParallelCpuExecutable : public Executable {
|
||||
"Equality test on CPU parallel executable is not implemented.");
|
||||
}
|
||||
|
||||
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
|
||||
|
||||
private:
|
||||
// Allocate buffers required for execution and assign them to the elements of
|
||||
// "buffers". "buffers" should be sized to the number of buffers in buffer
|
||||
|
@ -44,10 +44,8 @@ namespace xla {
|
||||
// interface that is used for launching compiled programs across platforms.
|
||||
class Executable {
|
||||
public:
|
||||
explicit Executable(std::unique_ptr<HloModule> hlo_module,
|
||||
HloCostAnalysis::ShapeSizeFunction shape_size_function)
|
||||
: hlo_module_(std::move(hlo_module)),
|
||||
shape_size_function_(std::move(shape_size_function)) {}
|
||||
explicit Executable(std::unique_ptr<HloModule> hlo_module)
|
||||
: hlo_module_(std::move(hlo_module)) {}
|
||||
virtual ~Executable() {}
|
||||
|
||||
// Enqueues the compilation result on the provided stream, passing the given
|
||||
@ -152,10 +150,9 @@ class Executable {
|
||||
static Status DumpToDirectory(const string& directory_path, string filename,
|
||||
const SessionModule& session_module);
|
||||
|
||||
// Return a reference to a function that computes the size of a given Shape.
|
||||
const HloCostAnalysis::ShapeSizeFunction& shape_size_function() const {
|
||||
return shape_size_function_;
|
||||
}
|
||||
// Returns a cost analysis object appropriate for the platform on which this
|
||||
// executable can run.
|
||||
virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
|
||||
|
||||
protected:
|
||||
mutable tensorflow::mutex mutex_;
|
||||
@ -168,11 +165,6 @@ class Executable {
|
||||
// around.
|
||||
std::unique_ptr<HloModule> hlo_module_;
|
||||
|
||||
// Function to compute the size of a given Shape, in bytes. This is
|
||||
// provided to the Executable when it is constructed, and used to produce
|
||||
// data for profiling the execution.
|
||||
HloCostAnalysis::ShapeSizeFunction shape_size_function_;
|
||||
|
||||
// SessionModule this was compiled from. Null if not dumping executions.
|
||||
std::unique_ptr<SessionModule> session_module_;
|
||||
|
||||
@ -240,7 +232,7 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
|
||||
if (profiled_computations.count(computation) > 0) {
|
||||
string profile_string = profile_ptr->ToString(
|
||||
*computation, stream->parent()->GetDeviceDescription(),
|
||||
shape_size_function_);
|
||||
CreateCostAnalysis().get());
|
||||
if (!profile_string.empty()) {
|
||||
XLA_LOG_LINES(tensorflow::INFO, profile_string);
|
||||
}
|
||||
|
@ -112,10 +112,11 @@ GpuExecutable::GpuExecutable(
|
||||
std::unique_ptr<HloModule> hlo_module,
|
||||
std::unique_ptr<BufferAssignment> assignment,
|
||||
HloCostAnalysis::ShapeSizeFunction shape_size_function)
|
||||
: Executable(std::move(hlo_module), std::move(shape_size_function)),
|
||||
: Executable(std::move(hlo_module)),
|
||||
ptx_(ptx),
|
||||
thunk_schedule_(std::move(thunk_schedule)),
|
||||
assignment_(std::move(assignment)) {}
|
||||
assignment_(std::move(assignment)),
|
||||
shape_size_function_(std::move(shape_size_function)) {}
|
||||
|
||||
Status GpuExecutable::ExecuteThunks(
|
||||
const ServiceExecutableRunOptions* run_options,
|
||||
@ -356,5 +357,9 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
|
||||
module().entry_computation()->root_instruction());
|
||||
}
|
||||
|
||||
// Creates the cost analysis for this executable: a new HloCostAnalysis
// constructed with the shape_size_function_ member, i.e. the shape-size
// function that was handed to the GpuExecutable constructor. Ownership of the
// returned object passes to the caller.
std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
|
||||
return MakeUnique<HloCostAnalysis>(shape_size_function_);
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace xla
|
||||
|
@ -85,6 +85,8 @@ class GpuExecutable : public Executable {
|
||||
return Unimplemented("Equality test on GPU executable is not implemented.");
|
||||
}
|
||||
|
||||
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
|
||||
|
||||
private:
|
||||
// If `block_host_until_done` is false, execution will not block the host
|
||||
// until the kernels have completed. This is used as an optimization for
|
||||
@ -119,6 +121,9 @@ class GpuExecutable : public Executable {
|
||||
// memory for every output/temp buffers.
|
||||
const std::unique_ptr<BufferAssignment> assignment_;
|
||||
|
||||
// Function to compute the size of a given Shape, in bytes.
|
||||
HloCostAnalysis::ShapeSizeFunction shape_size_function_;
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
|
||||
};
|
||||
|
||||
|
@ -65,7 +65,8 @@ Status HloCostAnalysis::Postprocess(HloInstruction* hlo) {
|
||||
if (property.first != kSecondsKey) {
|
||||
max_seconds = std::max(
|
||||
max_seconds,
|
||||
property.second / GetProperty(property.first, per_second_rates_));
|
||||
property.second /
|
||||
GetProperty(property.first, per_second_rates_, INFINITY));
|
||||
}
|
||||
}
|
||||
current_properties_[kSecondsKey] = max_seconds;
|
||||
@ -97,9 +98,10 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) {
|
||||
}
|
||||
|
||||
/*static*/ float HloCostAnalysis::GetProperty(const string& key,
|
||||
const Properties& properties) {
|
||||
const Properties& properties,
|
||||
const float default_value) {
|
||||
auto key_value = properties.find(key);
|
||||
return key_value == properties.end() ? 0.0f : key_value->second;
|
||||
return key_value == properties.end() ? default_value : key_value->second;
|
||||
}
|
||||
|
||||
/*static*/ float HloCostAnalysis::GetPropertyForHlo(
|
||||
@ -523,6 +525,10 @@ int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const {
|
||||
return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_);
|
||||
}
|
||||
|
||||
// Returns the property stored under kSecondsKey for `hlo`, looked up in
// hlo_properties_ through GetPropertyForHlo.
// NOTE(review): presumably this is the per-instruction time estimate that
// Postprocess records under kSecondsKey -- confirm against the code that
// populates hlo_properties_.
float HloCostAnalysis::seconds(const HloInstruction& hlo) const {
|
||||
return GetPropertyForHlo(hlo, kSecondsKey, hlo_properties_);
|
||||
}
|
||||
|
||||
StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
|
||||
HloComputation* computation, const ShapeSizeFunction* shape_size) {
|
||||
if (shape_size == nullptr) {
|
||||
|
@ -175,9 +175,11 @@ class HloCostAnalysis : public DfsHloVisitor {
|
||||
// Utility function to handle all element-wise operations.
|
||||
Status HandleElementwiseOp(HloInstruction* hlo_instruction);
|
||||
|
||||
// Returns 0.0f if the key is not present in the properties. Otherwise,
|
||||
// returns the value that the key maps to from the properties parameter.
|
||||
static float GetProperty(const string& key, const Properties& properties);
|
||||
// Returns the default value if the key is not present in the
|
||||
// properties. Otherwise, returns the value that the key maps to from the
|
||||
// properties parameter.
|
||||
static float GetProperty(const string& key, const Properties& properties,
|
||||
float default_value = 0.0f);
|
||||
|
||||
// Returns 0.0f if the hlo is not present in hlo_to_properties or if the key
|
||||
// is not present in hlo_to_properties[hlo]. Otherwise, returns the value that
|
||||
|
@ -44,10 +44,9 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
|
||||
string HloExecutionProfile::ToString(
|
||||
const HloComputation& computation,
|
||||
const DeviceDescription& device_description,
|
||||
const HloCostAnalysis::ShapeSizeFunction& shape_size) const {
|
||||
HloCostAnalysis cost_analysis(shape_size);
|
||||
HloCostAnalysis* cost_analysis) const {
|
||||
tensorflow::Status analysis_status =
|
||||
computation.root_instruction()->Accept(&cost_analysis);
|
||||
computation.root_instruction()->Accept(cost_analysis);
|
||||
if (!analysis_status.ok()) {
|
||||
return "";
|
||||
}
|
||||
@ -61,8 +60,9 @@ string HloExecutionProfile::ToString(
|
||||
|
||||
builder.AddOp(/*op_name=*/hlo->ToString(),
|
||||
/*short_name=*/hlo->ToString(/*compact_operands=*/true),
|
||||
hlo->ToCategory(), cycles, cost_analysis.flop_count(*hlo),
|
||||
cost_analysis.bytes_accessed(*hlo));
|
||||
hlo->ToCategory(), cycles, cost_analysis->flop_count(*hlo),
|
||||
cost_analysis->bytes_accessed(*hlo),
|
||||
cost_analysis->seconds(*hlo));
|
||||
}
|
||||
return builder.ToString();
|
||||
}
|
||||
|
@ -60,12 +60,12 @@ class HloExecutionProfile {
|
||||
// Returns a version of the execution profile suitable for performance
|
||||
// debugging; e.g. emits cycle counts, execution time at the nominal device
|
||||
// frequency, and the effective throughput given the provided cost_analysis
|
||||
// for the operations in a given computation.
|
||||
// Returns an empty string if it wasn't possible to generate a printable
|
||||
// version.
|
||||
// for the operations in a given computation. Returns an empty string if it
|
||||
// wasn't possible to generate a printable version. cost_analysis should be a
|
||||
// clean analysis that can be used to visit the computation.
|
||||
string ToString(const HloComputation& computation,
|
||||
const DeviceDescription& device_description,
|
||||
const HloCostAnalysis::ShapeSizeFunction& shape_size) const;
|
||||
HloCostAnalysis* cost_analysis) const;
|
||||
|
||||
// Returns the computations we have profiled.
|
||||
std::unordered_set<const HloComputation*> profiled_computations() const {
|
||||
|
@ -53,16 +53,23 @@ string HumanReadableProfileBuilder::ToString() const {
|
||||
|
||||
double nsecs = op.cycles / clock_rate_ghz_;
|
||||
Appendf(&s,
|
||||
"\t%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s "
|
||||
":: %12s/s :: %12s/cycle :: %s\n",
|
||||
"%15lld cycles (%6.2f%%) :: %12.1f usec (%12.1f optimal) "
|
||||
":: %18s :: %12s/s :: %12s/cycle :: %s\n",
|
||||
op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles),
|
||||
op.optimal_seconds * 1e6,
|
||||
op.flop_count <= 0
|
||||
? "<none>"
|
||||
: HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
|
||||
bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str());
|
||||
};
|
||||
|
||||
append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1});
|
||||
float optimal_seconds_sum = 0.0;
|
||||
for (const auto& op : op_infos_) {
|
||||
optimal_seconds_sum += op.optimal_seconds;
|
||||
}
|
||||
|
||||
append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1,
|
||||
optimal_seconds_sum});
|
||||
|
||||
// Sort ops in decreasing order of cycles.
|
||||
std::vector<OpInfo> sorted_ops(op_infos_);
|
||||
@ -76,19 +83,43 @@ string HumanReadableProfileBuilder::ToString() const {
|
||||
if (total_cycles_ <= 0) {
|
||||
StrAppend(&s, "****** 0 total cycles ******\n");
|
||||
} else {
|
||||
MetricTableReport table;
|
||||
table.SetMetricName("microseconds");
|
||||
table.SetEntryName("ops");
|
||||
table.SetShowCategoryTable();
|
||||
for (const auto& op : sorted_ops) {
|
||||
MetricTableReport::Entry entry;
|
||||
entry.text = op.name;
|
||||
entry.short_text = op.short_name;
|
||||
entry.category_text = op.category;
|
||||
entry.metric = CyclesToMicroseconds(op.cycles);
|
||||
table.AddEntry(std::move(entry));
|
||||
// Only show an optimal discrepancy table if at least one value was
|
||||
// specified. Estimates are non-negative, so if the sum is greater than
|
||||
// zero, then at least one summand was greater than zero.
|
||||
if (optimal_seconds_sum > 0) {
|
||||
MetricTableReport table;
|
||||
table.SetMetricName("microseconds above estimated optimum");
|
||||
table.SetEntryName("ops");
|
||||
table.SetShowCategoryTable();
|
||||
float total_discrepancy_in_microseconds = 0.0f;
|
||||
for (const auto& op : sorted_ops) {
|
||||
MetricTableReport::Entry entry;
|
||||
entry.text = op.name;
|
||||
entry.short_text = op.short_name;
|
||||
entry.category_text = op.category;
|
||||
entry.metric =
|
||||
CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6;
|
||||
total_discrepancy_in_microseconds += entry.metric;
|
||||
table.AddEntry(std::move(entry));
|
||||
}
|
||||
StrAppend(&s, table.MakeReport(total_discrepancy_in_microseconds));
|
||||
}
|
||||
|
||||
{
|
||||
MetricTableReport table;
|
||||
table.SetMetricName("microseconds");
|
||||
table.SetEntryName("ops");
|
||||
table.SetShowCategoryTable();
|
||||
for (const auto& op : sorted_ops) {
|
||||
MetricTableReport::Entry entry;
|
||||
entry.text = op.name;
|
||||
entry.short_text = op.short_name;
|
||||
entry.category_text = op.category;
|
||||
entry.metric = CyclesToMicroseconds(op.cycles);
|
||||
table.AddEntry(std::move(entry));
|
||||
}
|
||||
StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
|
||||
}
|
||||
StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
@ -45,10 +45,10 @@ class HumanReadableProfileBuilder {
|
||||
void AddOp(tensorflow::StringPiece op_name,
|
||||
tensorflow::StringPiece short_name,
|
||||
tensorflow::StringPiece category, int64 cycles, int64 flop_count,
|
||||
int64 bytes_accessed) {
|
||||
int64 bytes_accessed, float optimal_seconds) {
|
||||
op_infos_.push_back({op_name.ToString(), short_name.ToString(),
|
||||
category.ToString(), cycles, flop_count,
|
||||
bytes_accessed});
|
||||
bytes_accessed, optimal_seconds});
|
||||
}
|
||||
|
||||
// Gets the human-readable profile.
|
||||
@ -62,6 +62,7 @@ class HumanReadableProfileBuilder {
|
||||
int64 cycles;
|
||||
int64 flop_count;
|
||||
int64 bytes_accessed;
|
||||
float optimal_seconds;
|
||||
};
|
||||
|
||||
double CyclesToSeconds(int64 cycles) const {
|
||||
|
Loading…
Reference in New Issue
Block a user