[SparseTensor] Optimize the tf.sparse.to_dense() implementation.

This change includes several optimizations:

1. Introduce `SparseTensor::IndicesValidVectorFastPath()` for validating the
   indices of a 1-D SparseTensor. Like `IndicesValid32BitFastPath()` (renamed
   to `IndicesValidMatrix32BitFastPath()` in this change), the optimized code
   optimistically assumes that the tensor is valid and falls back to slower
   code in the failure case, but it does not have the 32-bit limitation. The
   compiler is able to vectorize the loop over the indices for increased
   throughput (a standalone sketch of this pattern appears after this list).

2. Implement fast paths for 1-D and 2-D inputs in `SparseTensor::ToDense()`.
   The main win comes from avoiding the data-dependent loop over dimensions
   when computing the flat index of each output value. We also avoid an
   unnecessary integer multiplication (by a stride of 1) in each case. A
   simplified comparison of the two indexing schemes appears after this list.

3. Minor optimizations to the 3-D-and-higher case in `SparseTensor::ToDense()`:
   avoid unnecessary calls to `TensorShape::dim_size()` and use pointer
   arithmetic rather than Eigen indexing to dereference index elements.

4. Minor optimizations to the `SparseTensor::Create()` method, which now
   assigns directly to the relevant fields of the result instead of invoking
   the `SparseTensor` constructor and the move assignment operator. In this
   case the existing move logic wasn't saving us much, because the `Tensor` and
   `gtl::InlinedVector` move constructors still have to copy quite a lot of
   data.

5. Minor optimizations to the `SparseToDense::Compute()` method. In
   particular, we avoid allocating a temporary tensor for the indices when the
   input is already DT_INT64, which is the common case since all
   `tf.SparseTensor` objects have 64-bit indices (the underlying
   borrow-or-convert pattern is sketched after this list).
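
For point 1, the following standalone sketch (illustrative code and names,
not the TensorFlow implementation; the production version is in the
`IndicesValidVectorFastPath()` hunk below) shows why the loop vectorizes:
both checks are accumulated into plain bools with `&`, so the loop body has
no data-dependent branches, and the caller only falls back to the slow,
error-message-producing path when the function returns false.

    #include <cstdint>

    // Validates that 1-D indices are strictly increasing and within
    // [0, size). No early exit: both predicates are accumulated with `&`, so
    // the loop body is branch-free and can be auto-vectorized.
    bool VectorIndicesValid(const int64_t* indices, int64_t num_indices,
                            int64_t size) {
      bool in_range = true;
      bool ordered = true;
      int64_t prev = -1;
      for (int64_t n = 0; n < num_indices; ++n) {
        const int64_t index = indices[n];
        in_range = in_range & (index < size);
        // `prev` starts at -1, so this check also rejects negative indices.
        ordered = ordered & (index > prev);
        prev = index;
      }
      return in_range & ordered;
    }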
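
For point 2, here is a simplified, standalone comparison of the two indexing
schemes (illustrative code with bounds checks omitted, not the TensorFlow
implementation; the real fast paths are in the `SparseTensor::ToDense()` hunk
below). The general path recomputes each flat output index with a loop over
dimensions and a stride table whose last entry is 1; the 2-D fast path
reduces this to `row * num_cols + col`.

    #include <cstdint>
    #include <vector>

    // General N-D scatter: a data-dependent loop over `dims` per nonzero,
    // including a multiplication by strides[dims - 1] == 1 for the innermost
    // dimension.
    void ScatterND(const int64_t* ix, const float* vals, int64_t nnz,
                   int dims, const std::vector<int64_t>& shape, float* out) {
      std::vector<int64_t> strides(dims, 1);
      for (int d = dims - 2; d >= 0; --d) {
        strides[d] = strides[d + 1] * shape[d + 1];
      }
      for (int64_t n = 0; n < nnz; ++n) {
        int64_t flat = 0;
        for (int d = 0; d < dims; ++d) {
          flat += strides[d] * ix[n * dims + d];
        }
        out[flat] = vals[n];
      }
    }

    // 2-D fast path: no stride table, no inner loop, no multiply-by-one.
    void ScatterMatrix(const int64_t* ix, const float* vals, int64_t nnz,
                       int64_t num_cols, float* out) {
      for (int64_t n = 0; n < nnz; ++n) {
        out[ix[n * 2] * num_cols + ix[n * 2 + 1]] = vals[n];
      }
    }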
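
For point 5, the allocation avoidance follows a borrow-or-convert pattern,
sketched here with illustrative types and names (the real code works with
`Tensor` objects held via `std::unique_ptr<Tensor>`, as shown in the
`SparseToDense::Compute()` hunk below): return a pointer to the caller's data
when it is already in the required form, and materialize a converted copy,
kept alive by a holder, only otherwise.

    #include <cstdint>
    #include <memory>
    #include <vector>

    // Returns 64-bit indices, reusing `as_int64` when available (the common
    // case: no allocation, no copy) and widening `as_int32` otherwise.
    const std::vector<int64_t>* GetInt64Indices(
        const std::vector<int64_t>* as_int64,
        const std::vector<int32_t>* as_int32,
        std::unique_ptr<std::vector<int64_t>>* holder) {
      if (as_int64 != nullptr) {
        return as_int64;
      }
      *holder = std::make_unique<std::vector<int64_t>>(as_int32->begin(),
                                                       as_int32->end());
      return holder->get();
    }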

PiperOrigin-RevId: 296075159
Change-Id: I0b051621920aec9b2a8dc6c7ecbf55e5b2d59098
Authored by Derek Murray on 2020-02-19 15:45:36 -08:00; committed by TensorFlower Gardener
parent 88c4b69a57
commit 867d3c9708
3 changed files with 133 additions and 46 deletions


@@ -20,14 +20,13 @@ limitations under the License.
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
@@ -35,6 +34,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/util/ptr_util.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h"
namespace tensorflow {
@@ -93,36 +93,44 @@ class SparseToDense : public OpKernel {
Tensor* output = nullptr;
OP_REQUIRES_OK(c, c->allocate_output(0, output_tensor_shape, &output));
TensorShape ix_shape({num_elems, num_dims});
Tensor indices_shaped(DT_INT64, ix_shape);
if (indices.dtype() == DT_INT64) {
CHECK(indices_shaped.CopyFrom(indices, ix_shape));
const Tensor* indices_shaped;
std::unique_ptr<Tensor> indices_shaped_holder;
if (indices.dtype() == DT_INT64 && indices.dims() == 2) {
indices_shaped = &indices;
} else {
indices_shaped.matrix<int64>() =
indices.shaped<Index, 2>(ix_shape.dim_sizes()).template cast<int64>();
TensorShape ix_shape({num_elems, num_dims});
indices_shaped_holder = MakeUnique<Tensor>(DT_INT64, ix_shape);
indices_shaped = indices_shaped_holder.get();
if (indices.dtype() == DT_INT64) {
CHECK(indices_shaped_holder->CopyFrom(indices, ix_shape));
} else {
indices_shaped_holder->matrix<int64>() =
indices.shaped<Index, 2>(ix_shape.dim_sizes())
.template cast<int64>();
}
}
// If we received a scalar, we'll need to create a new
// tensor with copies of the values as a vec.
// TODO(ebrevdo): find a way to avoid this temp allocation.
Tensor sparse_values_b;
const Tensor* sparse_values_b;
std::unique_ptr<Tensor> sparse_values_b_holder;
if (TensorShapeUtils::IsScalar(sparse_values.shape())) {
OP_REQUIRES_OK(
c, c->allocate_temp(DataTypeToEnum<T>::value,
TensorShape({num_elems}), &sparse_values_b));
sparse_values_b.vec<T>().setConstant(sparse_values.scalar<T>()());
sparse_values_b_holder = MakeUnique<Tensor>(DataTypeToEnum<T>::value,
TensorShape({num_elems}));
sparse_values_b = sparse_values_b_holder.get();
sparse_values_b_holder->vec<T>().setConstant(sparse_values.scalar<T>()());
} else {
sparse_values_b = sparse_values;
sparse_values_b = &sparse_values;
}
// Assume SparseTensor is lexicographically sorted.
gtl::InlinedVector<int64, 8> order(output->shape().dims());
std::iota(order.begin(), order.end(), 0);
sparse::SparseTensor st;
OP_REQUIRES_OK(c,
sparse::SparseTensor::Create(indices_shaped, sparse_values_b,
output->shape(), order, &st));
OP_REQUIRES_OK(
c, sparse::SparseTensor::Create(*indices_shaped, *sparse_values_b,
output->shape(), order, &st));
if (validate_indices_) {
OP_REQUIRES_OK(c, st.IndicesValid());


@@ -65,7 +65,11 @@ Status GetDimsFromIx(const Tensor& ix, int* result) {
return errors::InvalidArgument("Shape rank must be SparseTensor rank.");
}
*result = SparseTensor(std::move(ix), std::move(vals), shape, order);
result->ix_ = std::move(ix);
result->vals_ = std::move(vals);
result->shape_.assign(shape.begin(), shape.end());
result->order_.assign(order.begin(), order.end());
result->dims_ = dims;
return Status::OK();
}
@@ -108,6 +112,37 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape,
DCHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank.";
}
// Optimized version of `IndicesValid()` with the following requirements:
// * The sparse tensor is one-dimensional.
//
// Returns true if the indices are valid, otherwise false.
// NOTE(mrry): If this method returns false, call IndicesValidHelper<true>()
// to obtain a meaningful error message.
bool SparseTensor::IndicesValidVectorFastPath() const {
DCHECK_EQ(shape_.size(), 1);
DCHECK_EQ(order_[0], 0);
const int64 max_index = shape_[0];
// We maintain separate bools for each validation predicate to enable
// vectorization across loop iterations.
bool index_in_range_valid = true;
bool order_valid = true;
int64 prev_index = -1;
const auto ix_t = ix_.matrix<int64>();
const int64* const index_base_ptr = ix_t.data();
for (std::size_t n = 0; n < ix_t.dimension(0); ++n) {
const int64 index = index_base_ptr[n];
index_in_range_valid = index_in_range_valid & (index < max_index);
order_valid = order_valid & (index > prev_index);
prev_index = index;
}
return index_in_range_valid & order_valid;
}
// Optimized version of `IndicesValid()` with the following requirements:
// * The sparse tensor is two-dimensional.
// * The tensor's indices are in the "standard" (lexicographic) order.
@@ -116,7 +151,7 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape,
// Returns true if the indices are valid, otherwise false.
// NOTE(mrry): If this method returns false, call IndicesValidHelper<true>()
// to obtain a meaningful error message.
bool SparseTensor::IndicesValid32BitFastPath() const {
bool SparseTensor::IndicesValidMatrix32BitFastPath() const {
const auto ix_t = ix_.matrix<int64>();
const int64* const shape_ptr = shape_.data();
@@ -241,6 +276,10 @@ Status SparseTensor::IndicesValidHelper() const {
}
Status SparseTensor::IndicesValid() const {
if (shape_.size() == 1 && IndicesValidVectorFastPath()) {
return Status::OK();
}
bool standard_order = true;
for (size_t i = 0; i < order_.size(); ++i) {
if (order_[i] < 0) {
@@ -252,9 +291,14 @@ Status SparseTensor::IndicesValid() const {
}
if (standard_order) {
if (shape_.size() == 2 && shape_[0] <= std::numeric_limits<int32>::max() &&
shape_[1] <= std::numeric_limits<int32>::max()) {
if (IndicesValid32BitFastPath()) {
if (shape_.size() == 1) {
if (IndicesValidVectorFastPath()) {
return Status::OK();
}
} else if (shape_.size() == 2 &&
shape_[0] <= std::numeric_limits<int32>::max() &&
shape_[1] <= std::numeric_limits<int32>::max()) {
if (IndicesValidMatrix32BitFastPath()) {
return Status::OK();
}
}


@@ -201,7 +201,14 @@ class SparseTensor {
return vec;
}
bool IndicesValid32BitFastPath() const;
// Optimized implementation of `IndicesValid` for 1-D sparse tensors.
// REQUIRES: `shape_.size() == 1`.
bool IndicesValidVectorFastPath() const;
// Optimized implementation of `IndicesValid` for 2-D sparse tensors whose
// indices fit within the range of an `int32`.
// REQUIRES: `shape_.size() == 2`.
bool IndicesValidMatrix32BitFastPath() const;
template <bool standard_order>
Status IndicesValidHelper() const;
@@ -354,32 +361,60 @@ inline bool SparseTensor::ToDense(Tensor* out, bool initialize) {
if (!ValidateAndInitializeToDense<T>(out, initialize)) return false;
auto out_t = out->flat<T>();
auto ix_t = ix_.matrix<int64>();
auto vals_t = vals_.vec<T>();
auto ix_t = ix_.matrix<int64>();
const int64* const ix_ptr = ix_t.data();
std::vector<int64> strides(dims_);
const auto& out_shape = out->shape();
if (dims_ > 0) {
strides[dims_ - 1] = 1;
}
for (int d = dims_ - 2; d >= 0; --d) {
strides[d] = strides[d + 1] * out_shape.dim_size(d + 1);
}
for (int n = 0; n < vals_t.dimension(0); ++n) {
bool invalid_dims = false;
int64 ix = 0;
for (int d = 0; d < dims_; ++d) {
const int64 ix_n_d = internal::SubtleMustCopy(ix_t(n, d));
if (!FastBoundsCheck(ix_n_d, out_shape.dim_size(d))) {
invalid_dims = true;
}
ix += strides[d] * ix_n_d;
if (dims_ == 1) {
// Fast path for sparse vectors.
const int64 out_length = out->shape().dim_size(0);
for (int n = 0; n < vals_t.dimension(0); ++n) {
const int64 index = internal::SubtleMustCopy(ix_ptr[n]);
if (!FastBoundsCheck(index, out_length)) return false;
out_t(index) = vals_t(n);
}
if (invalid_dims) return false;
out_t(ix) = vals_t(n);
return true;
} else if (dims_ == 2) {
// Fast path for sparse matrices.
const auto& out_shape = out->shape();
const int64 out_rows = out_shape.dim_size(0);
const int64 out_cols = out_shape.dim_size(1);
for (int n = 0; n < vals_t.dimension(0); ++n) {
const int64 row_index = internal::SubtleMustCopy(ix_ptr[n * 2]);
const int64 col_index = internal::SubtleMustCopy(ix_ptr[n * 2 + 1]);
if (!(FastBoundsCheck(row_index, out_rows) &&
FastBoundsCheck(col_index, out_cols))) {
return false;
}
out_t(row_index * out_cols + col_index) = vals_t(n);
}
return true;
} else {
// General path for N-dimensional sparse tensors.
gtl::InlinedVector<int64, 4> strides(dims_);
const auto& out_shape = out->shape().dim_sizes();
if (dims_ > 0) {
strides[dims_ - 1] = 1;
}
for (int d = dims_ - 2; d >= 0; --d) {
strides[d] = strides[d + 1] * out_shape[d + 1];
}
for (int n = 0; n < vals_t.dimension(0); ++n) {
bool invalid_dims = false;
int64 ix = 0;
for (int d = 0; d < dims_; ++d) {
const int64 ix_n_d = internal::SubtleMustCopy(ix_ptr[n * dims_ + d]);
if (!FastBoundsCheck(ix_n_d, out_shape[d])) {
invalid_dims = true;
}
ix += strides[d] * ix_n_d;
}
if (invalid_dims) return false;
out_t(ix) = vals_t(n);
}
return true;
}
return true;
}
template <typename T>