[SparseTensor] Optimize the tf.sparse.to_dense() implementation.

This change includes several optimizations:

1. Introduce `SparseTensor::IndicesValidVectorFastPath()`, for validating the
   indices of a 1-D SparseTensor. The optimized code is similar to
   `IndicesValid32BitFastPath()`, which optimistically assumes that the tensor
   is valid and falls back to slower code in the failure case, except it does
   not have the 32-bit limitation. The compiler is able to vectorize the loop
   over the indices, for increased throughput. (A standalone sketch of this
   pattern appears after this list.)

2. Implement fast paths for 1-D and 2-D inputs in `SparseTensor::ToDense()`.
   The main win here comes from avoiding the data-dependent loop over
   dimensions when computing the index of the output value. We also avoid
   an unnecessary integer multiplication (by 1) in each case. (A sketch of the
   2-D fast path also appears after this list.)

3. Minor optimizations to the 3+-D case in `SparseTensor::ToDense()`, avoiding
   unnecessary calls to `TensorShape::dim_size()` and using pointer arithmetic
   rather than Eigen logic to dereference index elements.

4. Minor optimizations to the `SparseTensor::Create()` method, which now
   assigns directly to the relevant fields of the result instead of invoking
   the `SparseTensor` constructor and the move assignment operator. In this
   case the existing move logic wasn't saving us much, because the `Tensor` and
   `gtl::InlinedVector` move constructors still have to copy quite a lot of
   data.

5. Minor optimizations to the `SparseToDense::Compute()` method. In particular,
   we avoid allocating a temporary tensor for the indices when the input is
   DT_INT64 (which is the common case, since all `tf.SparseTensor` objects have
   64-bit indices).
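
A minimal standalone sketch of the branchless validation pattern described in
item 1 (the helper name `IndicesSortedAndInRange` and its free-function form
are illustrative assumptions, not part of the TensorFlow code base):

    #include <cstddef>
    #include <cstdint>

    // Checks that indices[0..n) are strictly increasing and all within
    // [0, max_index). Both predicates are accumulated in plain bools with no
    // early exit, so the compiler can vectorize the loop.
    bool IndicesSortedAndInRange(const int64_t* indices, std::size_t n,
                                 int64_t max_index) {
      bool in_range = true;
      bool ordered = true;
      int64_t prev = -1;
      for (std::size_t i = 0; i < n; ++i) {
        const int64_t index = indices[i];
        in_range = in_range & (index < max_index);
        ordered = ordered & (index > prev);  // also enforces index >= 0
        prev = index;
      }
      return in_range & ordered;
    }

Keeping the loop body branch-free is what allows auto-vectorization; a failed
check is only diagnosed afterwards, by rerunning the slower validation path to
produce a meaningful error message.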
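
Similarly, a sketch of the 2-D fast path described in item 2: for a rank-2,
row-major output the flat offset is simply `row * num_cols + col`, so the
per-element loop over dimensions (and the multiplication by the innermost
stride of 1) disappears. `ScatterToDense2D` and the `std::vector<float>` output
buffer are illustrative assumptions, not TensorFlow APIs:

    #include <cstdint>
    #include <vector>

    // Scatters (row, col) -> value triples into a dense row-major buffer of
    // shape [num_rows, num_cols]. `indices` holds 2 * num_values entries laid
    // out as row0, col0, row1, col1, ...
    bool ScatterToDense2D(const int64_t* indices, const float* values,
                          int64_t num_values, int64_t num_rows,
                          int64_t num_cols, std::vector<float>* dense) {
      for (int64_t n = 0; n < num_values; ++n) {
        const int64_t row = indices[2 * n];
        const int64_t col = indices[2 * n + 1];
        // Bounds-check before writing; an invalid index aborts the conversion.
        if (row < 0 || row >= num_rows || col < 0 || col >= num_cols) {
          return false;
        }
        (*dense)[row * num_cols + col] = values[n];
      }
      return true;
    }

The 1-D case is the same idea with the flat offset reduced to the index itself.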

PiperOrigin-RevId: 296075159
Change-Id: I0b051621920aec9b2a8dc6c7ecbf55e5b2d59098
Author: Derek Murray (committed by TensorFlower Gardener)
Date: 2020-02-19 15:45:36 -08:00
Commit: 867d3c9708
Parent: 88c4b69a57

3 changed files with 133 additions and 46 deletions

File: tensorflow/core/kernels/sparse_to_dense_op.cc

@@ -20,14 +20,13 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <unordered_map>
 #include <utility>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -35,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -93,36 +93,44 @@ class SparseToDense : public OpKernel {
     Tensor* output = nullptr;
     OP_REQUIRES_OK(c, c->allocate_output(0, output_tensor_shape, &output));
 
-    TensorShape ix_shape({num_elems, num_dims});
-    Tensor indices_shaped(DT_INT64, ix_shape);
-    if (indices.dtype() == DT_INT64) {
-      CHECK(indices_shaped.CopyFrom(indices, ix_shape));
+    const Tensor* indices_shaped;
+    std::unique_ptr<Tensor> indices_shaped_holder;
+    if (indices.dtype() == DT_INT64 && indices.dims() == 2) {
+      indices_shaped = &indices;
     } else {
-      indices_shaped.matrix<int64>() =
-          indices.shaped<Index, 2>(ix_shape.dim_sizes()).template cast<int64>();
+      TensorShape ix_shape({num_elems, num_dims});
+      indices_shaped_holder = MakeUnique<Tensor>(DT_INT64, ix_shape);
+      indices_shaped = indices_shaped_holder.get();
+      if (indices.dtype() == DT_INT64) {
+        CHECK(indices_shaped_holder->CopyFrom(indices, ix_shape));
+      } else {
+        indices_shaped_holder->matrix<int64>() =
+            indices.shaped<Index, 2>(ix_shape.dim_sizes())
+                .template cast<int64>();
+      }
     }
 
     // If we received a scalar, we'll need to create a new
     // tensor with copies of the values as a vec.
-    // TODO(ebrevdo): find a way to avoid this temp allocation.
-    Tensor sparse_values_b;
-
+    const Tensor* sparse_values_b;
+    std::unique_ptr<Tensor> sparse_values_b_holder;
     if (TensorShapeUtils::IsScalar(sparse_values.shape())) {
-      OP_REQUIRES_OK(
-          c, c->allocate_temp(DataTypeToEnum<T>::value,
-                              TensorShape({num_elems}), &sparse_values_b));
-      sparse_values_b.vec<T>().setConstant(sparse_values.scalar<T>()());
+      sparse_values_b_holder = MakeUnique<Tensor>(DataTypeToEnum<T>::value,
+                                                  TensorShape({num_elems}));
+      sparse_values_b = sparse_values_b_holder.get();
+      sparse_values_b_holder->vec<T>().setConstant(sparse_values.scalar<T>()());
     } else {
-      sparse_values_b = sparse_values;
+      sparse_values_b = &sparse_values;
     }
 
     // Assume SparseTensor is lexicographically sorted.
     gtl::InlinedVector<int64, 8> order(output->shape().dims());
     std::iota(order.begin(), order.end(), 0);
     sparse::SparseTensor st;
-    OP_REQUIRES_OK(c,
-                   sparse::SparseTensor::Create(indices_shaped, sparse_values_b,
-                                                output->shape(), order, &st));
+    OP_REQUIRES_OK(
+        c, sparse::SparseTensor::Create(*indices_shaped, *sparse_values_b,
+                                        output->shape(), order, &st));
 
     if (validate_indices_) {
       OP_REQUIRES_OK(c, st.IndicesValid());

File: tensorflow/core/util/sparse/sparse_tensor.cc

@@ -65,7 +65,11 @@ Status GetDimsFromIx(const Tensor& ix, int* result) {
     return errors::InvalidArgument("Shape rank must be SparseTensor rank.");
   }
 
-  *result = SparseTensor(std::move(ix), std::move(vals), shape, order);
+  result->ix_ = std::move(ix);
+  result->vals_ = std::move(vals);
+  result->shape_.assign(shape.begin(), shape.end());
+  result->order_.assign(order.begin(), order.end());
+  result->dims_ = dims;
   return Status::OK();
 }
@@ -108,6 +112,37 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape,
   DCHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank.";
 }
 
+// Optimized version of `IndicesValid()` with the following requirements:
+// * The sparse tensor is one-dimensional.
+//
+// Returns true if the indices are valid, otherwise false.
+// NOTE(mrry): If this method returns false, call IndicesValidHelper<true>()
+// to obtain a meaningful error message.
+bool SparseTensor::IndicesValidVectorFastPath() const {
+  DCHECK_EQ(shape_.size(), 1);
+  DCHECK_EQ(order_[0], 0);
+
+  const int64 max_index = shape_[0];
+
+  // We maintain separate bools for each validation predicate to enable
+  // vectorization across loop iterations.
+  bool index_in_range_valid = true;
+  bool order_valid = true;
+
+  int64 prev_index = -1;
+  const auto ix_t = ix_.matrix<int64>();
+  const int64* const index_base_ptr = ix_t.data();
+
+  for (std::size_t n = 0; n < ix_t.dimension(0); ++n) {
+    const int64 index = index_base_ptr[n];
+    index_in_range_valid = index_in_range_valid & (index < max_index);
+    order_valid = order_valid & (index > prev_index);
+    prev_index = index;
+  }
+
+  return index_in_range_valid & order_valid;
+}
+
 // Optimized version of `IndicesValid()` with the following requirements:
 // * The sparse tensor is two-dimensional.
 // * The tensor's indices are in the "standard" (lexicographic) order.
@@ -116,7 +151,7 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape,
 // Returns true if the indices are valid, otherwise false.
 // NOTE(mrry): If this method returns false, call IndicesValidHelper<true>()
 // to obtain a meaningful error message.
-bool SparseTensor::IndicesValid32BitFastPath() const {
+bool SparseTensor::IndicesValidMatrix32BitFastPath() const {
   const auto ix_t = ix_.matrix<int64>();
   const int64* const shape_ptr = shape_.data();
@@ -241,6 +276,10 @@ Status SparseTensor::IndicesValidHelper() const {
 }
 
 Status SparseTensor::IndicesValid() const {
+  if (shape_.size() == 1 && IndicesValidVectorFastPath()) {
+    return Status::OK();
+  }
+
   bool standard_order = true;
   for (size_t i = 0; i < order_.size(); ++i) {
     if (order_[i] < 0) {
@@ -252,9 +291,14 @@ Status SparseTensor::IndicesValid() const {
   }
 
   if (standard_order) {
-    if (shape_.size() == 2 && shape_[0] <= std::numeric_limits<int32>::max() &&
-        shape_[1] <= std::numeric_limits<int32>::max()) {
-      if (IndicesValid32BitFastPath()) {
+    if (shape_.size() == 1) {
+      if (IndicesValidVectorFastPath()) {
+        return Status::OK();
+      }
+    } else if (shape_.size() == 2 &&
+               shape_[0] <= std::numeric_limits<int32>::max() &&
+               shape_[1] <= std::numeric_limits<int32>::max()) {
+      if (IndicesValidMatrix32BitFastPath()) {
         return Status::OK();
       }
     }

File: tensorflow/core/util/sparse/sparse_tensor.h

@@ -201,7 +201,14 @@ class SparseTensor {
     return vec;
   }
 
-  bool IndicesValid32BitFastPath() const;
+  // Optimized implementation of `IndicesValid` for 1-D sparse tensors.
+  // REQUIRES: `shape_.size() == 1`.
+  bool IndicesValidVectorFastPath() const;
+
+  // Optimized implementation of `IndicesValid` for 2-D sparse tensors whose
+  // indices fit within the range of an `int32`.
+  // REQUIRES: `shape_.size() == 2`.
+  bool IndicesValidMatrix32BitFastPath() const;
 
   template <bool standard_order>
   Status IndicesValidHelper() const;
@@ -354,32 +361,60 @@ inline bool SparseTensor::ToDense(Tensor* out, bool initialize) {
   if (!ValidateAndInitializeToDense<T>(out, initialize)) return false;
 
   auto out_t = out->flat<T>();
-  auto ix_t = ix_.matrix<int64>();
   auto vals_t = vals_.vec<T>();
+  auto ix_t = ix_.matrix<int64>();
+  const int64* const ix_ptr = ix_t.data();
 
-  std::vector<int64> strides(dims_);
-  const auto& out_shape = out->shape();
-  if (dims_ > 0) {
-    strides[dims_ - 1] = 1;
-  }
-  for (int d = dims_ - 2; d >= 0; --d) {
-    strides[d] = strides[d + 1] * out_shape.dim_size(d + 1);
-  }
-
-  for (int n = 0; n < vals_t.dimension(0); ++n) {
-    bool invalid_dims = false;
-    int64 ix = 0;
-    for (int d = 0; d < dims_; ++d) {
-      const int64 ix_n_d = internal::SubtleMustCopy(ix_t(n, d));
-      if (!FastBoundsCheck(ix_n_d, out_shape.dim_size(d))) {
-        invalid_dims = true;
-      }
-      ix += strides[d] * ix_n_d;
+  if (dims_ == 1) {
+    // Fast path for sparse vectors.
+    const int64 out_length = out->shape().dim_size(0);
+    for (int n = 0; n < vals_t.dimension(0); ++n) {
+      const int64 index = internal::SubtleMustCopy(ix_ptr[n]);
+      if (!FastBoundsCheck(index, out_length)) return false;
+      out_t(index) = vals_t(n);
     }
-    if (invalid_dims) return false;
-    out_t(ix) = vals_t(n);
+    return true;
+  } else if (dims_ == 2) {
+    // Fast path for sparse matrices.
+    const auto& out_shape = out->shape();
+    const int64 out_rows = out_shape.dim_size(0);
+    const int64 out_cols = out_shape.dim_size(1);
+    for (int n = 0; n < vals_t.dimension(0); ++n) {
+      const int64 row_index = internal::SubtleMustCopy(ix_ptr[n * 2]);
+      const int64 col_index = internal::SubtleMustCopy(ix_ptr[n * 2 + 1]);
+      if (!(FastBoundsCheck(row_index, out_rows) &&
+            FastBoundsCheck(col_index, out_cols))) {
+        return false;
+      }
+      out_t(row_index * out_cols + col_index) = vals_t(n);
+    }
+    return true;
+  } else {
+    // General path for N-dimensional sparse tensors.
+    gtl::InlinedVector<int64, 4> strides(dims_);
+    const auto& out_shape = out->shape().dim_sizes();
+    if (dims_ > 0) {
+      strides[dims_ - 1] = 1;
+    }
+    for (int d = dims_ - 2; d >= 0; --d) {
+      strides[d] = strides[d + 1] * out_shape[d + 1];
+    }
+
+    for (int n = 0; n < vals_t.dimension(0); ++n) {
+      bool invalid_dims = false;
+      int64 ix = 0;
+      for (int d = 0; d < dims_; ++d) {
+        const int64 ix_n_d = internal::SubtleMustCopy(ix_ptr[n * dims_ + d]);
+        if (!FastBoundsCheck(ix_n_d, out_shape[d])) {
+          invalid_dims = true;
+        }
+        ix += strides[d] * ix_n_d;
+      }
+      if (invalid_dims) return false;
+      out_t(ix) = vals_t(n);
+    }
+    return true;
   }
-  return true;
 }
 
 template <typename T>