/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/util.h"

#include <stdarg.h>

#include <cmath>
#include <limits>
#include <numeric>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/inlined_vector.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/bfloat16.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/numbers.h"
#include "tensorflow/core/platform/stacktrace.h"

namespace xla {

Status WithLogBacktrace(const Status& status) {
  CHECK(!status.ok());
  VLOG(1) << status.ToString();
  VLOG(2) << tensorflow::CurrentStackTrace();
  return status;
}

ScopedLoggingTimer::ScopedLoggingTimer(const std::string& label, bool enabled,
                                       const char* file, int line,
                                       TimerStats* timer_stats)
    : enabled_(enabled),
      file_(file),
      line_(line),
      label_(label),
      timer_stats_(timer_stats) {
  if (enabled_) {
    start_micros_ = tensorflow::Env::Default()->NowMicros();
  }
}

void ScopedLoggingTimer::StopAndLog() {
  if (enabled_) {
    uint64 end_micros = tensorflow::Env::Default()->NowMicros();
    double secs = (end_micros - start_micros_) / 1000000.0;

    TimerStats& stats = *timer_stats_;
    tensorflow::mutex_lock lock(stats.stats_mutex);
    stats.cumulative_secs += secs;
    if (secs > stats.max_secs) {
      stats.max_secs = secs;
    }
    stats.times_called++;

    LOG(INFO).AtLocation(file_, line_)
        << label_
        << " time: " << tensorflow::strings::HumanReadableElapsedTime(secs)
        << " (cumulative: "
        << tensorflow::strings::HumanReadableElapsedTime(stats.cumulative_secs)
        << ", max: "
        << tensorflow::strings::HumanReadableElapsedTime(stats.max_secs)
        << ", #called: " << stats.times_called << ")";
    enabled_ = false;
  }
}

ScopedLoggingTimer::~ScopedLoggingTimer() { StopAndLog(); }
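
// Usage note: a ScopedLoggingTimer is normally constructed at the top of a
// scope (for example via the XLA_SCOPED_LOGGING_TIMER convenience macro in
// util.h) so that the destructor above logs the elapsed time on scope exit;
// StopAndLog() may also be called early, after which the timer is disabled.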

Status AddStatus(Status prior, absl::string_view context) {
  CHECK(!prior.ok());
  return Status{prior.code(),
                absl::StrCat(context, ": ", prior.error_message())};
}

Status AppendStatus(Status prior, absl::string_view context) {
  CHECK(!prior.ok());
  return Status{prior.code(),
                absl::StrCat(prior.error_message(), ": ", context)};
}

string Reindent(absl::string_view original,
                const absl::string_view indentation) {
  std::vector<string> pieces =
      absl::StrSplit(absl::string_view(original.data(), original.size()), '\n');
  return absl::StrJoin(pieces, "\n", [indentation](string* out, string s) {
    absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(s));
  });
}

bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
  if (rank != permutation.size()) {
    return false;
  }
  absl::InlinedVector<int64, 8> trivial_permutation(rank);
  absl::c_iota(trivial_permutation, 0);
  return absl::c_is_permutation(permutation, trivial_permutation);
}

std::vector<int64> InversePermutation(
    absl::Span<const int64> input_permutation) {
  DCHECK(IsPermutation(input_permutation, input_permutation.size()));
  std::vector<int64> output_permutation(input_permutation.size(), -1);
  for (size_t i = 0; i < input_permutation.size(); ++i) {
    output_permutation.at(input_permutation.at(i)) = i;
  }
  return output_permutation;
}

std::vector<int64> ComposePermutations(absl::Span<const int64> p1,
                                       absl::Span<const int64> p2) {
  CHECK_EQ(p1.size(), p2.size());
  std::vector<int64> output;
  for (size_t i = 0; i < p1.size(); ++i) {
    output.push_back(p1.at(p2.at(i)));
  }
  return output;
}
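
// Example: for p1 = {1, 2, 0}, InversePermutation(p1) returns {2, 0, 1}, and
// ComposePermutations(p1, InversePermutation(p1)) returns the identity
// permutation {0, 1, 2}; composition here means output[i] = p1[p2[i]].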

bool IsIdentityPermutation(absl::Span<const int64> permutation) {
  for (int64 i = 0; i < permutation.size(); ++i) {
    if (permutation[i] != i) {
      return false;
    }
  }
  return true;
}

string RoundTripFpToString(tensorflow::bfloat16 value) {
  return absl::StrFormat("%.4g", static_cast<float>(value));
}

string RoundTripFpToString(Eigen::half value) {
  return absl::StrFormat("%.5g", static_cast<float>(value));
}

string RoundTripFpToString(float value) {
  char buffer[tensorflow::strings::kFastToBufferSize];
  tensorflow::strings::FloatToBuffer(value, buffer);
  return buffer;
}

string RoundTripFpToString(double value) {
  char buffer[tensorflow::strings::kFastToBufferSize];
  tensorflow::strings::DoubleToBuffer(value, buffer);
  return buffer;
}
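
// Note on the formats above: a binary floating-point type with p significand
// bits round-trips through ceil(1 + p * log10(2)) significant decimal digits,
// giving 4 digits for bfloat16 (p = 8) and 5 for half (p = 11); hence "%.4g"
// and "%.5g".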

PaddingConfig MakeNoPaddingConfig(int64 rank) {
  PaddingConfig padding_config;
  for (int64 dnum = 0; dnum < rank; ++dnum) {
    auto dimension = padding_config.add_dimensions();
    dimension->set_edge_padding_low(0);
    dimension->set_edge_padding_high(0);
    dimension->set_interior_padding(0);
  }
  return padding_config;
}

PaddingConfig MakeEdgePaddingConfig(
    absl::Span<const std::pair<int64, int64>> padding) {
  PaddingConfig padding_config;
  for (const std::pair<int64, int64>& dim : padding) {
    auto dimension = padding_config.add_dimensions();
    dimension->set_edge_padding_low(dim.first);
    dimension->set_edge_padding_high(dim.second);
    dimension->set_interior_padding(0);
  }
  return padding_config;
}

bool HasInteriorPadding(const PaddingConfig& config) {
  for (const auto& dim : config.dimensions()) {
    if (dim.interior_padding() != 0) {
      return true;
    }
  }
  return false;
}
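
// Example: MakeEdgePaddingConfig({{1, 2}}) yields a single-dimension config
// with edge_padding_low = 1, edge_padding_high = 2 and interior_padding = 0,
// so HasInteriorPadding() returns false for it.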

namespace {
string HumanReadableNumOps(double flops, double nanoseconds,
                           absl::string_view op_prefix) {
  if (nanoseconds == 0) {
    return absl::StrCat("NaN ", op_prefix, "OP/s");
  }
  double nano_flops = flops / nanoseconds;
  string throughput = tensorflow::strings::HumanReadableNum(
      static_cast<int64>(nano_flops * 1e9));
  absl::string_view sp(throughput);
  // Use the more common "G(FLOPS)", rather than "B(FLOPS)".
  if (absl::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case.
      absl::EndsWith(sp, "b")) {
    *throughput.rbegin() = 'G';
  }
  throughput += absl::StrCat(op_prefix, "OP/s");
  return throughput;
}
}  // namespace

string HumanReadableNumFlops(double flops, double nanoseconds) {
  return HumanReadableNumOps(flops, nanoseconds, "FL");
}

string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) {
  return HumanReadableNumOps(trops, nanoseconds, "TR");
}
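
// Example (assuming tensorflow::strings::HumanReadableNum renders 1e9 as
// "1.00B"): HumanReadableNumFlops(1e9, 1e9), i.e. one FLOP per nanosecond,
// yields "1.00GFLOP/s" after the 'B' -> 'G' rewrite above.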

void LogLines(int sev, absl::string_view text, const char* fname, int lineno) {
  const int orig_sev = sev;
  if (sev == tensorflow::FATAL) {
    sev = tensorflow::ERROR;
  }

  // Protect calls with a mutex so we don't interleave calls to LogLines from
  // multiple threads.
  static tensorflow::mutex log_lines_mu(tensorflow::LINKER_INITIALIZED);
  tensorflow::mutex_lock lock(log_lines_mu);

  size_t cur = 0;
  while (cur < text.size()) {
    size_t eol = text.find('\n', cur);
    if (eol == absl::string_view::npos) {
      eol = text.size();
    }
    auto msg = text.substr(cur, eol - cur);
    tensorflow::internal::LogString(fname, lineno, sev,
                                    string(msg.data(), msg.size()));
    cur = eol + 1;
  }

  if (orig_sev == tensorflow::FATAL) {
    tensorflow::internal::LogString(fname, lineno, orig_sev,
                                    "Aborting due to errors.");
  }
}

int64 Product(absl::Span<const int64> xs) {
  return std::accumulate(xs.begin(), xs.end(), static_cast<int64>(1),
                         std::multiplies<int64>());
}
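
// Note: Product({}) returns 1 (the empty product), consistent with a rank-0
// (scalar) shape holding exactly one element.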

absl::InlinedVector<std::pair<int64, int64>, 8> CommonFactors(
    absl::Span<const int64> a, absl::Span<const int64> b) {
  CHECK_EQ(Product(a), Product(b));
  if (0 == Product(a)) {
    return {std::make_pair(0, 0), std::make_pair(a.size(), b.size())};
  }

  absl::InlinedVector<std::pair<int64, int64>, 8> bounds;
  for (int64 i = 0, j = 0, prior_i = -1, prior_j = -1, partial_size_a = 1,
             partial_size_b = 1;
       ;) {
    if (partial_size_a == partial_size_b && (i > prior_i || j > prior_j)) {
      std::tie(prior_i, prior_j) = std::make_pair(i, j);
      bounds.emplace_back(i, j);
      continue;
    }
    bool in_bounds_i = i < a.size();
    bool in_bounds_j = j < b.size();
    if (!(in_bounds_i || in_bounds_j)) {
      break;
    }
    bool next_a =
        partial_size_a < partial_size_b ||
        (in_bounds_i &&
         (!in_bounds_j || (partial_size_a == partial_size_b && a[i] <= b[j])));
    bool next_b =
        partial_size_b < partial_size_a ||
        (in_bounds_j &&
         (!in_bounds_i || (partial_size_b == partial_size_a && b[j] <= a[i])));
    if (next_a) {
      partial_size_a *= a[i];
      ++i;
    }
    if (next_b) {
      partial_size_b *= b[j];
      ++j;
    }
  }
  return bounds;
}
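
// Example: CommonFactors({2, 3}, {6}) returns {{0, 0}, {2, 1}}: the whole of
// shape {2, 3} matches the whole of shape {6} (2 * 3 == 6) and there is no
// interior split point, so only the start and end bounds are emitted.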

ConvertedDimensionNumbers ConvertDimensionNumbers(
    absl::Span<const int64> from_dimensions, absl::Span<const int64> from_sizes,
    absl::Span<const int64> to_sizes) {
  ConvertedDimensionNumbers dimensions;
  auto common_factors = CommonFactors(from_sizes, to_sizes);
  for (int64 i = 0; i < common_factors.size() - 1; ++i) {
    bool any_present = false;
    bool all_present = true;
    for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
         ++d) {
      const bool present = absl::c_linear_search(from_dimensions, d);
      any_present |= present;
      all_present &= present;
    }
    if (all_present) {
      for (int64 d = common_factors[i].second; d < common_factors[i + 1].second;
           ++d) {
        dimensions.to_dimensions.push_back(d);
      }
      for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
           ++d) {
        dimensions.transformed_from_dimensions.push_back(d);
      }
    } else if (any_present) {
      for (int64 d = common_factors[i].first; d < common_factors[i + 1].first;
           ++d) {
        if (absl::c_linear_search(from_dimensions, d)) {
          dimensions.untransformed_from_dimensions.push_back(d);
        }
      }
    }
  }
  return dimensions;
}

string SanitizeFileName(string file_name) {
  for (char& c : file_name) {
    if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') {
      c = '_';
    }
  }
  return file_name;
}
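
// Example: SanitizeFileName("module[0]/entry computation") returns
// "module_0__entry_computation".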

// Utility function to split a double-precision float (F64) into a pair of
// F32s. For a p-bit number, and a splitting point (p/2) <= s <= (p - 1), the
// algorithm produces a (p - s)-bit value 'hi' and a non-overlapping (s - 1)-bit
// value 'lo'. See Theorem 4 in [1] (attributed to Dekker) or [2] for the
// original theorem by Dekker.
//
// For double-precision F64s, which contain a 53-bit mantissa (52 of them
// explicit), we can represent the most significant 49 bits as the unevaluated
// sum of two single-precision floats 'hi' and 'lo'. The 'hi' float stores the
// most significant 24 bits, and the sign bit of 'lo' together with its
// mantissa stores the remaining 25 bits. The exponent of the resulting
// representation is still restricted to the 8 bits of an F32.
//
// References:
// [1] A. Thall, Extended-Precision Floating-Point Numbers for GPU Computation,
//     SIGGRAPH Research Posters, 2006.
//     (http://andrewthall.org/papers/df64_qf128.pdf)
// [2] T. J. Dekker, A floating point technique for extending the available
//     precision, Numerische Mathematik, vol. 18, pp. 224–242, 1971.
std::pair<float, float> SplitF64ToF32(double x) {
  const float x_f32 = static_cast<float>(x);
  // Early return if x is an infinity or NaN.
  if (!std::isfinite(x)) {
    return std::make_pair(x_f32, 0.0f);
  }

  // Only values within the range of F32 are supported, unless they are
  // infinities. Small values with large negative exponents would be rounded
  // to zero.
  if (!std::isfinite(x_f32)) {
    LOG(WARNING) << "Out of range F64 constant detected: " << x;
  }

  // The high float is simply the double rounded to the nearest float. Because
  // we are rounding to nearest with ties to even, the error introduced in
  // rounding is less than half an ULP of the high value.
  const float hi = x_f32;
  // We can compute the low term using Sterbenz' lemma: if a and b are two
  // positive floating-point numbers and a/2 <= b <= 2a, then their difference
  // can be computed exactly.
  // Note: the difference is computed exactly here but is then rounded to the
  // nearest float, which introduces additional error.
  const float lo = static_cast<float>(x - static_cast<double>(hi));
  return std::make_pair(hi, lo);
}
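
// Worked example: for x = 1.0 + 2^-30 (exactly representable as an F64 but
// not as an F32), SplitF64ToF32(x) returns hi = 1.0f, the nearest F32 to x,
// and lo = 2^-30f, the rounding error, which is itself exactly representable;
// static_cast<double>(hi) + static_cast<double>(lo) reconstructs x exactly.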

}  // namespace xla