[SparseTensor] Optimize the tf.sparse.to_dense() implementation.
This change includes several optimizations:

1. Introduce `SparseTensor::IndicesValidVectorFastPath()`, for validating the
   indices of a 1-D SparseTensor. The optimized code is similar to
   `IndicesValid32BitFastPath()`, which optimistically assumes that the tensor
   is valid and falls back to slower code in the failure case, except it does
   not have the 32-bit limitation. The compiler is able to vectorize the loop
   over the indices, for increased throughput.

2. Implement fast paths for 1-D and 2-D inputs in `SparseTensor::ToDense()`.
   The main win here comes from avoiding the data-dependent loop over
   dimensions when computing the index of the output value (see the
   illustrative sketch below). We also avoid an unnecessary integer
   multiplication (by 1) in each case.

3. Minor optimizations to the 3+-D case in `SparseTensor::ToDense()`, avoiding
   unnecessary calls to `TensorShape::dim_size()` and using pointer arithmetic
   rather than Eigen logic to dereference index elements.

4. Minor optimizations to the `SparseTensor::Create()` method, which now
   assigns directly to the relevant fields of the result instead of invoking
   the `SparseTensor` constructor and the move assignment operator. In this
   case the existing move logic wasn't saving us much, because the `Tensor`
   and `gtl::InlinedVector` move constructors still have to copy quite a lot
   of data.

5. Minor optimizations to the `SparseToDense::Compute()` method. In
   particular, we avoid allocating a temporary tensor for the indices when
   the input is DT_INT64 (which is the common case, since all
   `tf.SparseTensor` objects have 64-bit indices).

PiperOrigin-RevId: 296075159
Change-Id: I0b051621920aec9b2a8dc6c7ecbf55e5b2d59098
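As a point of reference, the following is an illustrative sketch (not code from this change) of the row-major offset arithmetic that points 2 and 3 refer to. The dense offset of one sparse element is the dot product of its index with the output's row-major strides; the 1-D and 2-D fast paths specialize this to `index[0]` and `index[0] * cols + index[1]`, eliminating the data-dependent loop over dimensions. The helper name `FlatIndex` is hypothetical.

// Hypothetical helper, for illustration only: the row-major flat offset
// that a dense write for one sparse element needs to compute.
#include <cstdint>
#include <vector>

int64_t FlatIndex(const std::vector<int64_t>& index,
                  const std::vector<int64_t>& shape) {
  const int dims = static_cast<int>(shape.size());
  // Row-major strides: strides[d] is the product of shape[d + 1 .. dims - 1].
  std::vector<int64_t> strides(dims);
  if (dims > 0) strides[dims - 1] = 1;
  for (int d = dims - 2; d >= 0; --d) {
    strides[d] = strides[d + 1] * shape[d + 1];
  }
  // General N-D case: a loop over dimensions per element. For dims == 1 this
  // collapses to index[0]; for dims == 2 it is index[0] * shape[1] + index[1].
  int64_t flat = 0;
  for (int d = 0; d < dims; ++d) {
    flat += strides[d] * index[d];
  }
  return flat;
}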
parent 88c4b69a57
commit 867d3c9708

tensorflow/core
@@ -20,14 +20,13 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <unordered_map>
 #include <utility>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -35,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -93,36 +93,44 @@ class SparseToDense : public OpKernel {
     Tensor* output = nullptr;
     OP_REQUIRES_OK(c, c->allocate_output(0, output_tensor_shape, &output));
 
-    TensorShape ix_shape({num_elems, num_dims});
-    Tensor indices_shaped(DT_INT64, ix_shape);
-    if (indices.dtype() == DT_INT64) {
-      CHECK(indices_shaped.CopyFrom(indices, ix_shape));
+    const Tensor* indices_shaped;
+    std::unique_ptr<Tensor> indices_shaped_holder;
+    if (indices.dtype() == DT_INT64 && indices.dims() == 2) {
+      indices_shaped = &indices;
     } else {
-      indices_shaped.matrix<int64>() =
-          indices.shaped<Index, 2>(ix_shape.dim_sizes()).template cast<int64>();
+      TensorShape ix_shape({num_elems, num_dims});
+      indices_shaped_holder = MakeUnique<Tensor>(DT_INT64, ix_shape);
+      indices_shaped = indices_shaped_holder.get();
+      if (indices.dtype() == DT_INT64) {
+        CHECK(indices_shaped_holder->CopyFrom(indices, ix_shape));
+      } else {
+        indices_shaped_holder->matrix<int64>() =
+            indices.shaped<Index, 2>(ix_shape.dim_sizes())
+                .template cast<int64>();
+      }
     }
 
     // If we received a scalar, we'll need to create a new
     // tensor with copies of the values as a vec.
     // TODO(ebrevdo): find a way to avoid this temp allocation.
-    Tensor sparse_values_b;
+    const Tensor* sparse_values_b;
+    std::unique_ptr<Tensor> sparse_values_b_holder;
 
     if (TensorShapeUtils::IsScalar(sparse_values.shape())) {
-      OP_REQUIRES_OK(
-          c, c->allocate_temp(DataTypeToEnum<T>::value,
-                              TensorShape({num_elems}), &sparse_values_b));
-      sparse_values_b.vec<T>().setConstant(sparse_values.scalar<T>()());
+      sparse_values_b_holder = MakeUnique<Tensor>(DataTypeToEnum<T>::value,
+                                                  TensorShape({num_elems}));
+      sparse_values_b = sparse_values_b_holder.get();
+      sparse_values_b_holder->vec<T>().setConstant(sparse_values.scalar<T>()());
     } else {
-      sparse_values_b = sparse_values;
+      sparse_values_b = &sparse_values;
     }
 
     // Assume SparseTensor is lexicographically sorted.
     gtl::InlinedVector<int64, 8> order(output->shape().dims());
     std::iota(order.begin(), order.end(), 0);
     sparse::SparseTensor st;
-    OP_REQUIRES_OK(c,
-                   sparse::SparseTensor::Create(indices_shaped, sparse_values_b,
-                                                output->shape(), order, &st));
+    OP_REQUIRES_OK(
+        c, sparse::SparseTensor::Create(*indices_shaped, *sparse_values_b,
+                                        output->shape(), order, &st));
 
     if (validate_indices_) {
       OP_REQUIRES_OK(c, st.IndicesValid());
@@ -65,7 +65,11 @@ Status GetDimsFromIx(const Tensor& ix, int* result) {
     return errors::InvalidArgument("Shape rank must be SparseTensor rank.");
   }
 
-  *result = SparseTensor(std::move(ix), std::move(vals), shape, order);
+  result->ix_ = std::move(ix);
+  result->vals_ = std::move(vals);
+  result->shape_.assign(shape.begin(), shape.end());
+  result->order_.assign(order.begin(), order.end());
+  result->dims_ = dims;
   return Status::OK();
 }
 
@@ -108,6 +112,37 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape,
   DCHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank.";
 }
 
+// Optimized version of `IndicesValid()` with the following requirements:
+// * The sparse tensor is one-dimensional.
+//
+// Returns true if the indices are valid, otherwise false.
+// NOTE(mrry): If this method returns false, call IndicesValidHelper<true>()
+// to obtain a meaningful error message.
+bool SparseTensor::IndicesValidVectorFastPath() const {
+  DCHECK_EQ(shape_.size(), 1);
+  DCHECK_EQ(order_[0], 0);
+
+  const int64 max_index = shape_[0];
+
+  // We maintain separate bools for each validation predicate to enable
+  // vectorization across loop iterations.
+  bool index_in_range_valid = true;
+  bool order_valid = true;
+
+  int64 prev_index = -1;
+  const auto ix_t = ix_.matrix<int64>();
+  const int64* const index_base_ptr = ix_t.data();
+
+  for (std::size_t n = 0; n < ix_t.dimension(0); ++n) {
+    const int64 index = index_base_ptr[n];
+    index_in_range_valid = index_in_range_valid & (index < max_index);
+    order_valid = order_valid & (index > prev_index);
+    prev_index = index;
+  }
+
+  return index_in_range_valid & order_valid;
+}
+
 // Optimized version of `IndicesValid()` with the following requirements:
 // * The sparse tensor is two-dimensional.
 // * The tensor's indices are in the "standard" (lexicographic) order.
@@ -116,7 +151,7 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape,
 // Returns true if the indices are valid, otherwise false.
 // NOTE(mrry): If this method returns false, call IndicesValidHelper<true>()
 // to obtain a meaningful error message.
-bool SparseTensor::IndicesValid32BitFastPath() const {
+bool SparseTensor::IndicesValidMatrix32BitFastPath() const {
   const auto ix_t = ix_.matrix<int64>();
   const int64* const shape_ptr = shape_.data();
 
@@ -241,6 +276,10 @@ Status SparseTensor::IndicesValidHelper() const {
 }
 
 Status SparseTensor::IndicesValid() const {
+  if (shape_.size() == 1 && IndicesValidVectorFastPath()) {
+    return Status::OK();
+  }
+
   bool standard_order = true;
   for (size_t i = 0; i < order_.size(); ++i) {
     if (order_[i] < 0) {
@@ -252,9 +291,14 @@ Status SparseTensor::IndicesValid() const {
   }
 
   if (standard_order) {
-    if (shape_.size() == 2 && shape_[0] <= std::numeric_limits<int32>::max() &&
-        shape_[1] <= std::numeric_limits<int32>::max()) {
-      if (IndicesValid32BitFastPath()) {
+    if (shape_.size() == 1) {
+      if (IndicesValidVectorFastPath()) {
+        return Status::OK();
+      }
+    } else if (shape_.size() == 2 &&
+               shape_[0] <= std::numeric_limits<int32>::max() &&
+               shape_[1] <= std::numeric_limits<int32>::max()) {
+      if (IndicesValidMatrix32BitFastPath()) {
         return Status::OK();
       }
     }
@@ -201,7 +201,14 @@ class SparseTensor {
     return vec;
   }
 
-  bool IndicesValid32BitFastPath() const;
+  // Optimized implementation of `IndicesValid` for 1-D sparse tensors.
+  // REQUIRES: `shape_.size() == 1`.
+  bool IndicesValidVectorFastPath() const;
+
+  // Optimized implementation of `IndicesValid` for 2-D sparse tensors whose
+  // indices fit within the range of an `int32`.
+  // REQUIRES: `shape_.size() == 2`.
+  bool IndicesValidMatrix32BitFastPath() const;
 
   template <bool standard_order>
   Status IndicesValidHelper() const;
@@ -354,32 +361,60 @@ inline bool SparseTensor::ToDense(Tensor* out, bool initialize) {
   if (!ValidateAndInitializeToDense<T>(out, initialize)) return false;
 
   auto out_t = out->flat<T>();
-  auto ix_t = ix_.matrix<int64>();
   auto vals_t = vals_.vec<T>();
+  auto ix_t = ix_.matrix<int64>();
+  const int64* const ix_ptr = ix_t.data();
 
-  std::vector<int64> strides(dims_);
-  const auto& out_shape = out->shape();
-  if (dims_ > 0) {
-    strides[dims_ - 1] = 1;
-  }
-  for (int d = dims_ - 2; d >= 0; --d) {
-    strides[d] = strides[d + 1] * out_shape.dim_size(d + 1);
-  }
-
-  for (int n = 0; n < vals_t.dimension(0); ++n) {
-    bool invalid_dims = false;
-    int64 ix = 0;
-    for (int d = 0; d < dims_; ++d) {
-      const int64 ix_n_d = internal::SubtleMustCopy(ix_t(n, d));
-      if (!FastBoundsCheck(ix_n_d, out_shape.dim_size(d))) {
-        invalid_dims = true;
-      }
-      ix += strides[d] * ix_n_d;
-    }
-    if (invalid_dims) return false;
-    out_t(ix) = vals_t(n);
-  }
-  return true;
+  if (dims_ == 1) {
+    // Fast path for sparse vectors.
+    const int64 out_length = out->shape().dim_size(0);
+    for (int n = 0; n < vals_t.dimension(0); ++n) {
+      const int64 index = internal::SubtleMustCopy(ix_ptr[n]);
+      if (!FastBoundsCheck(index, out_length)) return false;
+      out_t(index) = vals_t(n);
+    }
+    return true;
+  } else if (dims_ == 2) {
+    // Fast path for sparse matrices.
+    const auto& out_shape = out->shape();
+    const int64 out_rows = out_shape.dim_size(0);
+    const int64 out_cols = out_shape.dim_size(1);
+    for (int n = 0; n < vals_t.dimension(0); ++n) {
+      const int64 row_index = internal::SubtleMustCopy(ix_ptr[n * 2]);
+      const int64 col_index = internal::SubtleMustCopy(ix_ptr[n * 2 + 1]);
+      if (!(FastBoundsCheck(row_index, out_rows) &&
+            FastBoundsCheck(col_index, out_cols))) {
+        return false;
+      }
+      out_t(row_index * out_cols + col_index) = vals_t(n);
+    }
+    return true;
+  } else {
+    // General path for N-dimensional sparse tensors.
+    gtl::InlinedVector<int64, 4> strides(dims_);
+    const auto& out_shape = out->shape().dim_sizes();
+    if (dims_ > 0) {
+      strides[dims_ - 1] = 1;
+    }
+    for (int d = dims_ - 2; d >= 0; --d) {
+      strides[d] = strides[d + 1] * out_shape[d + 1];
+    }
+
+    for (int n = 0; n < vals_t.dimension(0); ++n) {
+      bool invalid_dims = false;
+      int64 ix = 0;
+      for (int d = 0; d < dims_; ++d) {
+        const int64 ix_n_d = internal::SubtleMustCopy(ix_ptr[n * dims_ + d]);
+        if (!FastBoundsCheck(ix_n_d, out_shape[d])) {
+          invalid_dims = true;
+        }
+        ix += strides[d] * ix_n_d;
+      }
+      if (invalid_dims) return false;
+      out_t(ix) = vals_t(n);
+    }
+    return true;
+  }
 }
 
 template <typename T>
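For orientation, here is a minimal usage sketch that is not part of this commit: it builds a small 1-D SparseTensor and densifies it, the kind of input that would hit `IndicesValidVectorFastPath()` and the `dims_ == 1` branch of `ToDense()` above. The function name `DensifyExample` and the concrete tensor contents are illustrative assumptions; the `Create()`, `IndicesValid()`, and `ToDense()` calls follow the signatures visible in the diff.

// Hypothetical usage sketch (not part of this change): build a 1-D
// SparseTensor and densify it.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h"

namespace tensorflow {

Status DensifyExample() {
  // Indices are an int64 matrix of shape [nnz, ndims]; here nnz = 2, ndims = 1.
  Tensor ix(DT_INT64, TensorShape({2, 1}));
  ix.matrix<int64>()(0, 0) = 1;
  ix.matrix<int64>()(1, 0) = 3;
  Tensor vals(DT_FLOAT, TensorShape({2}));
  vals.vec<float>()(0) = 10.0f;
  vals.vec<float>()(1) = 20.0f;

  sparse::SparseTensor st;
  TF_RETURN_IF_ERROR(sparse::SparseTensor::Create(
      ix, vals, TensorShape({4}), /*order=*/{0}, &st));
  TF_RETURN_IF_ERROR(st.IndicesValid());  // 1-D, standard order: fast path.

  Tensor dense(DT_FLOAT, TensorShape({4}));
  // ToDense() zero-initializes the output and returns false if any index is
  // out of range; for a 1-D tensor it takes the new vector fast path.
  if (!st.ToDense<float>(&dense)) {
    return errors::InvalidArgument("Sparse indices are out of bounds.");
  }
  return Status::OK();
}

}  // namespace tensorflow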