/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/util.h"

#include <stdarg.h>

#include <cmath>
#include <limits>
#include <numeric>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/inlined_vector.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/bfloat16.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/numbers.h"
#include "tensorflow/core/platform/stacktrace.h"

namespace xla {

Status WithLogBacktrace(const Status& status) {
  CHECK(!status.ok());
  VLOG(1) << status.ToString();
  VLOG(2) << tensorflow::CurrentStackTrace();
  return status;
}

ScopedLoggingTimer::ScopedLoggingTimer(const std::string& label, bool enabled,
                                       const char* file, int line,
                                       TimerStats* timer_stats)
    : enabled_(enabled),
      file_(file),
      line_(line),
      label_(label),
      timer_stats_(timer_stats) {
  if (enabled_) {
    start_micros_ = tensorflow::Env::Default()->NowMicros();
  }
}

void ScopedLoggingTimer::StopAndLog() {
  if (enabled_) {
    uint64 end_micros = tensorflow::Env::Default()->NowMicros();
    double secs = (end_micros - start_micros_) / 1000000.0;

    TimerStats& stats = *timer_stats_;
    tensorflow::mutex_lock lock(stats.stats_mutex);
    stats.cumulative_secs += secs;
    if (secs > stats.max_secs) {
      stats.max_secs = secs;
    }
    stats.times_called++;

    LOG(INFO).AtLocation(file_, line_)
        << label_
        << " time: " << tensorflow::strings::HumanReadableElapsedTime(secs)
        << " (cumulative: "
        << tensorflow::strings::HumanReadableElapsedTime(stats.cumulative_secs)
        << ", max: "
        << tensorflow::strings::HumanReadableElapsedTime(stats.max_secs)
        << ", #called: " << stats.times_called << ")";
    enabled_ = false;
  }
}

ScopedLoggingTimer::~ScopedLoggingTimer() { StopAndLog(); }
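
// Usage note: a ScopedLoggingTimer is normally constructed at the top of a
// scope (for example via the XLA_SCOPED_LOGGING_TIMER convenience macro in
// util.h) so that the destructor above logs the elapsed time on scope exit;
// StopAndLog() may also be called early, after which the timer is disabled.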

Status AddStatus(Status prior, absl::string_view context) {
  CHECK(!prior.ok());
  return Status{prior.code(),
                absl::StrCat(context, ": ", prior.error_message())};
}

Status AppendStatus(Status prior, absl::string_view context) {
  CHECK(!prior.ok());
  return Status{prior.code(),
                absl::StrCat(prior.error_message(), ": ", context)};
}

string Reindent(absl::string_view original,
                const absl::string_view indentation) {
  std::vector<string> pieces =
      absl::StrSplit(absl::string_view(original.data(), original.size()), '\n');
  return absl::StrJoin(pieces, "\n", [indentation](string* out, string s) {
    absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(s));
  });
}

bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
  if (rank != permutation.size()) {
    return false;
  }
  absl::InlinedVector<int64, 8> trivial_permutation(rank);
  absl::c_iota(trivial_permutation, 0);
  return absl::c_is_permutation(permutation, trivial_permutation);
}

std::vector<int64> InversePermutation(
    absl::Span<const int64> input_permutation) {
  DCHECK(IsPermutation(input_permutation, input_permutation.size()));
  std::vector<int64> output_permutation(input_permutation.size(), -1);
  for (size_t i = 0; i < input_permutation.size(); ++i) {
    output_permutation.at(input_permutation.at(i)) = i;
  }
  return output_permutation;
}

std::vector<int64> ComposePermutations(absl::Span<const int64> p1,
                                       absl::Span<const int64> p2) {
  CHECK_EQ(p1.size(), p2.size());
  std::vector<int64> output;
  for (size_t i = 0; i < p1.size(); ++i) {
    output.push_back(p1.at(p2.at(i)));
  }
  return output;
}
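
// Example: for p1 = {1, 2, 0}, InversePermutation(p1) returns {2, 0, 1}, and
// ComposePermutations(p1, InversePermutation(p1)) returns the identity
// permutation {0, 1, 2}; composition here means output[i] = p1[p2[i]].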

bool IsIdentityPermutation(absl::Span<const int64> permutation) {
  for (int64 i = 0; i < permutation.size(); ++i) {
    if (permutation[i] != i) {
      return false;
    }
  }
  return true;
}

string RoundTripFpToString(tensorflow::bfloat16 value) {
  return absl::StrFormat("%.4g", static_cast<float>(value));
}

string RoundTripFpToString(Eigen::half value) {
  return absl::StrFormat("%.5g", static_cast<float>(value));
}

string RoundTripFpToString(float value) {
  char buffer[tensorflow::strings::kFastToBufferSize];
  tensorflow::strings::FloatToBuffer(value, buffer);
  return buffer;
}

string RoundTripFpToString(double value) {
  char buffer[tensorflow::strings::kFastToBufferSize];
  tensorflow::strings::DoubleToBuffer(value, buffer);
  return buffer;
}
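
// Note on the formats above: a binary floating-point type with p significand
// bits round-trips through ceil(1 + p * log10(2)) significant decimal digits,
// giving 4 digits for bfloat16 (p = 8) and 5 for half (p = 11); hence "%.4g"
// and "%.5g".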

PaddingConfig MakeNoPaddingConfig(int64 rank) {
  PaddingConfig padding_config;
  for (int64 dnum = 0; dnum < rank; ++dnum) {
    auto dimension = padding_config.add_dimensions();
    dimension->set_edge_padding_low(0);
    dimension->set_edge_padding_high(0);
    dimension->set_interior_padding(0);
  }
  return padding_config;
}

PaddingConfig MakeEdgePaddingConfig(
    absl::Span<const std::pair<int64, int64>> padding) {
  PaddingConfig padding_config;
  for (const std::pair<int64, int64>& dim : padding) {
    auto dimension = padding_config.add_dimensions();
    dimension->set_edge_padding_low(dim.first);
    dimension->set_edge_padding_high(dim.second);
    dimension->set_interior_padding(0);
  }
  return padding_config;
}

bool HasInteriorPadding(const PaddingConfig& config) {
  for (const auto& dim : config.dimensions()) {
    if (dim.interior_padding() != 0) {
      return true;
    }
  }
  return false;
}
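
// Example: MakeEdgePaddingConfig({{1, 2}}) yields a single-dimension config
// with edge_padding_low = 1, edge_padding_high = 2 and interior_padding = 0,
// so HasInteriorPadding() returns false for it.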

namespace {
string HumanReadableNumOps(double flops, double nanoseconds,
                           absl::string_view op_prefix) {
  if (nanoseconds == 0) {
    return absl::StrCat("NaN ", op_prefix, "OP/s");
  }
  double nano_flops = flops / nanoseconds;
  string throughput = tensorflow::strings::HumanReadableNum(
      static_cast<int64>(nano_flops * 1e9));
  absl::string_view sp(throughput);
  // Use the more common "G(FLOPS)", rather than "B(FLOPS)".
  if (absl::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case.
      absl::EndsWith(sp, "b")) {
    *throughput.rbegin() = 'G';
  }
  throughput += absl::StrCat(op_prefix, "OP/s");
  return throughput;
}
}  // namespace

string HumanReadableNumFlops(double flops, double nanoseconds) {
  return HumanReadableNumOps(flops, nanoseconds, "FL");
}

string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) {
  return HumanReadableNumOps(trops, nanoseconds, "TR");
}
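
// Example (assuming tensorflow::strings::HumanReadableNum renders 1e9 as
// "1.00B"): HumanReadableNumFlops(1e9, 1e9), i.e. one FLOP per nanosecond,
// yields "1.00GFLOP/s" after the 'B' -> 'G' rewrite above.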

void LogLines(int sev, absl::string_view text, const char* fname, int lineno) {
  const int orig_sev = sev;
  if (sev == tensorflow::FATAL) {
    sev = tensorflow::ERROR;
  }

  // Protect calls with a mutex so we don't interleave calls to LogLines from
  // multiple threads.
  static tensorflow::mutex log_lines_mu(tensorflow::LINKER_INITIALIZED);
  tensorflow::mutex_lock lock(log_lines_mu);

  size_t cur = 0;
  while (cur < text.size()) {
    size_t eol = text.find('\n', cur);
    if (eol == absl::string_view::npos) {
      eol = text.size();
    }
    auto msg = text.substr(cur, eol - cur);
    tensorflow::internal::LogString(fname, lineno, sev,
                                    string(msg.data(), msg.size()));
    cur = eol + 1;
  }

  if (orig_sev == tensorflow::FATAL) {
    tensorflow::internal::LogString(fname, lineno, orig_sev,
                                    "Aborting due to errors.");
  }
}

int64 Product(absl::Span<const int64> xs) {
  return std::accumulate(xs.begin(), xs.end(), static_cast<int64>(1),
                         std::multiplies<int64>());
}
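
// Note: Product({}) returns 1 (the empty product), consistent with a rank-0
// (scalar) shape holding exactly one element.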

absl::InlinedVector<std::pair<int64, int64>, 8> CommonFactors(
    absl::Span<const int64> a, absl::Span<const int64> b) {
  CHECK_EQ(Product(a), Product(b));
  if (0 == Product(a)) {
    return {std::make_pair(0, 0), std::make_pair(a.size(), b.size())};
  }

  absl::InlinedVector<std::pair<int64, int64>, 8> bounds;
  for (int64 i = 0, j = 0, prior_i = -1, prior_j = -1, partial_size_a = 1,
             partial_size_b = 1;
       ;) {
    if (partial_size_a == partial_size_b && (i > prior_i || j > prior_j)) {
      std::tie(prior_i, prior_j) = std::make_pair(i, j);
      bounds.emplace_back(i, j);
      continue;
    }
    bool in_bounds_i = i < a.size();
    bool in_bounds_j = j < b.size();
    if (!(in_bounds_i || in_bounds_j)) {
      break;
    }
    bool next_a =
        partial_size_a < partial_size_b ||
        (in_bounds_i &&
         (!in_bounds_j || (partial_size_a == partial_size_b && a[i] <= b[j])));
    bool next_b =
        partial_size_b < partial_size_a ||
        (in_bounds_j &&
         (!in_bounds_i || (partial_size_b == partial_size_a && b[j] <= a[i])));
    if (next_a) {
      partial_size_a *= a[i];
      ++i;
    }
    if (next_b) {
      partial_size_b *= b[j];
      ++j;
    }
  }
  return bounds;
}
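
// Example: CommonFactors({2, 3}, {6}) returns {{0, 0}, {2, 1}}: the whole of
// shape {2, 3} matches the whole of shape {6} (2 * 3 == 6) and there is no
// interior split point, so only the start and end bounds are emitted.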

ConvertedDimensionNumbers ConvertDimensionNumbers(
    absl::Span<const int64> from_dimensions, absl::Span<const int64> from_sizes,
    absl::Span<const int64> to_sizes) {
  ConvertedDimensionNumbers dimensions;
  auto common_factors = CommonFactors(from_sizes, to_sizes);
  for (int64 i = 0; i < common_factors.size() - 1; ++i) {
    bool any_present = false;
    bool all_present = true;
    for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
         ++d) {
      const bool present = absl::c_linear_search(from_dimensions, d);
      any_present |= present;
      all_present &= present;
    }
    if (all_present) {
      for (int64 d = common_factors[i].second; d < common_factors[i + 1].second;
           ++d) {
        dimensions.to_dimensions.push_back(d);
      }
      for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
           ++d) {
        dimensions.transformed_from_dimensions.push_back(d);
      }
    } else if (any_present) {
      for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
           ++d) {
        if (absl::c_linear_search(from_dimensions, d)) {
          dimensions.untransformed_from_dimensions.push_back(d);
        }
      }
    }
  }
  return dimensions;
}

string SanitizeFileName(string file_name) {
  for (char& c : file_name) {
    if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') {
      c = '_';
    }
  }
  return file_name;
}
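
// Example: SanitizeFileName("module[0]/entry computation") returns
// "module_0__entry_computation".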

// Utility function to split a double-precision float (F64) into a pair of
// F32s. For a p-bit number, and a splitting point (p/2) <= s <= (p - 1), the
// algorithm produces a (p - s)-bit value 'hi' and a non-overlapping (s - 1)-bit
// value 'lo'. See Theorem 4 in [1] (attributed to Dekker) or [2] for the
// original theorem by Dekker.
//
// For double-precision F64s, which contain a 53-bit mantissa (52 of them
// explicit), we can represent the most significant 49 bits as the unevaluated
// sum of two single-precision floats 'hi' and 'lo'. The 'hi' float stores the
// most significant 24 bits, and the sign bit of 'lo' together with its
// mantissa stores the remaining 25 bits. The exponent of the resulting
// representation is still restricted to the 8 bits of an F32.
//
// References:
// [1] A. Thall, Extended-Precision Floating-Point Numbers for GPU Computation,
//     SIGGRAPH Research Posters, 2006.
//     (http://andrewthall.org/papers/df64_qf128.pdf)
// [2] T. J. Dekker, A floating point technique for extending the available
//     precision, Numerische Mathematik, vol. 18, pp. 224–242, 1971.
std::pair<float, float> SplitF64ToF32(double x) {
  const float x_f32 = static_cast<float>(x);
  // Early return if x is an infinity or NaN.
  if (!std::isfinite(x)) {
    return std::make_pair(x_f32, 0.0f);
  }

  // Only values within the range of F32 are supported, unless they are
  // infinities. Small values with large negative exponents would be rounded
  // to zero.
  if (!std::isfinite(x_f32)) {
    LOG(WARNING) << "Out of range F64 constant detected: " << x;
  }

  // The high float is simply the double rounded to the nearest float. Because
  // we are rounding to nearest with ties to even, the error introduced in
  // rounding is less than half an ULP of the high value.
  const float hi = x_f32;
  // We can compute the low term using Sterbenz' lemma: if a and b are two
  // positive floating-point numbers and a/2 <= b <= 2a, then their difference
  // can be computed exactly.
  // Note: the difference is computed exactly here but is then rounded to the
  // nearest float, which introduces additional error.
  const float lo = static_cast<float>(x - static_cast<double>(hi));
  return std::make_pair(hi, lo);
}
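
// Worked example: for x = 1.0 + 2^-30 (exactly representable as an F64 but
// not as an F32), SplitF64ToF32(x) returns hi = 1.0f, the nearest F32 to x,
// and lo = 2^-30f, the rounding error, which is itself exactly representable;
// static_cast<double>(hi) + static_cast<double>(lo) reconstructs x exactly.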

}  // namespace xla