Performance improvements for RaggedTensorToTensor

PiperOrigin-RevId: 270283647
Authored by Edward Loper on 2019-09-20 09:19:57 -07:00, committed by TensorFlower Gardener
parent 09f6d42882
commit 97cfacbca3


@@ -44,7 +44,6 @@ namespace tensorflow {
namespace {
typedef Eigen::ThreadPoolDevice CPUDevice;
using ::std::vector;
using ::tensorflow::errors::Internal;
const int kShapeInputIndex = 0;
const int kValueInputIndex = 1;
@@ -188,23 +187,22 @@ class RaggedTensorToTensorBaseOp : public OpKernel {
* If first_dimension_output = 11 instead, then:
* result = [0 100 200 300 400 500 600 700 800 900]
*/
vector<INDEX_TYPE> CalculateFirstParentOutputIndex(
INDEX_TYPE first_dimension, INDEX_TYPE output_index_multiplier,
INDEX_TYPE first_dimension_output) {
void CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension,
INDEX_TYPE output_index_multiplier,
INDEX_TYPE first_dimension_output,
vector<INDEX_TYPE>* result) {
const INDEX_TYPE min_dimension =
std::min(first_dimension, first_dimension_output);
vector<INDEX_TYPE> result;
result.reserve(first_dimension);
result->reserve(first_dimension);
int current_output_index = 0;
for (INDEX_TYPE i = 0; i < min_dimension;
++i, current_output_index += output_index_multiplier) {
result.push_back(current_output_index);
result->push_back(current_output_index);
}
for (INDEX_TYPE i = min_dimension; i < first_dimension; ++i) {
result.push_back(-1);
result->push_back(-1);
}
DCHECK_EQ(result.size(), first_dimension);
return result;
DCHECK_EQ(result->size(), first_dimension);
}
void CalculateOutputIndexRowSplit(
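The hunk above changes CalculateFirstParentOutputIndex from returning a freshly allocated vector to filling a caller-owned one, so Compute() can reserve the buffer once and reuse its storage. A minimal sketch of the out-parameter pattern, with hypothetical names (not the kernel's actual helper):

#include <algorithm>
#include <cstdint>
#include <vector>

// Row i of the input maps to output index i * multiplier while it fits in
// the output; rows that no longer fit map to -1 ("dropped").
void FirstParentOutputIndex(int64_t input_rows, int64_t multiplier,
                            int64_t output_rows,
                            std::vector<int64_t>* result) {
  const int64_t n = std::min(input_rows, output_rows);
  result->clear();
  for (int64_t i = 0; i < n; ++i) result->push_back(i * multiplier);
  for (int64_t i = n; i < input_rows; ++i) result->push_back(-1);
}

Returning by value is already cheap on its own (move/RVO avoids a copy), but the out-parameter lets the caller keep one pre-reserved buffer alive across the whole Compute() call.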
@@ -350,10 +348,10 @@ class RaggedTensorToTensorBaseOp : public OpKernel {
OP_REQUIRES_OK(context,
CalculateOutputSize(first_dimension, context, &output_size));
vector<INDEX_TYPE> multiplier;
multiplier.resize(output_size.size());
multiplier.resize(ragged_rank_ + 1);
multiplier[multiplier.size() - 1] = 1;
for (int i = output_size.size() - 2; i >= 0; --i) {
for (int i = multiplier.size() - 2; i >= 0; --i) {
multiplier[i] = multiplier[i + 1] * output_size[i + 1];
}
// Full size of the tensor.
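The stride ("multiplier") vector is now sized by ragged_rank_ + 1 instead of by the full output rank, so the indices built from it appear to count value elements (the trailing dense dimensions of the values tensor) rather than individual scalars; the updated SetOutput below scales by value_element_size accordingly. A small sketch of the stride computation, assuming a plain int64 shape vector rather than the kernel's types:

#include <cstdint>
#include <vector>

// Row-major strides over the first ragged_rank + 1 output dimensions.
std::vector<int64_t> RaggedStrides(const std::vector<int64_t>& output_size,
                                   int ragged_rank) {
  std::vector<int64_t> multiplier(ragged_rank + 1);
  multiplier[ragged_rank] = 1;
  for (int i = ragged_rank - 1; i >= 0; --i) {
    multiplier[i] = multiplier[i + 1] * output_size[i + 1];
  }
  return multiplier;
}

// Example: output_size = {4, 3, 2} with ragged_rank = 1 gives {3, 1}, so an
// index built with these strides addresses one [2]-shaped value element.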
@@ -366,21 +364,25 @@ class RaggedTensorToTensorBaseOp : public OpKernel {
context->allocate_output(0, output_shape, &output_tensor));
const INDEX_TYPE full_size = multiplier[0] * output_size[0];
if (full_size > 0) {
vector<INDEX_TYPE> output_index = CalculateFirstParentOutputIndex(
first_dimension, multiplier[0], output_size[0]);
vector<INDEX_TYPE> output_index, new_output_index;
int nvals = context->input(kValueInputIndex).shape().dim_size(0);
output_index.reserve(nvals);
new_output_index.reserve(nvals);
CalculateFirstParentOutputIndex(first_dimension, multiplier[0],
output_size[0], &output_index);
for (int i = 1; i <= ragged_rank_; ++i) {
vector<INDEX_TYPE> new_output_index;
OP_REQUIRES_OK(context, CalculateOutputIndex(
context, i - 1, output_index, multiplier[i],
output_size[i], &new_output_index));
output_index = new_output_index;
output_index.swap(new_output_index);
new_output_index.clear();
}
SetOutput(context, output_index, output_tensor);
SetOutput(context, ragged_rank_, output_index, output_tensor);
}
}
virtual void SetOutput(OpKernelContext* context,
virtual void SetOutput(OpKernelContext* context, int ragged_rank,
const vector<INDEX_TYPE>& output_index,
Tensor* output_tensor) = 0;
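Two allocations disappear in this hunk: both index buffers are reserved up front to the number of values, and the per-rank loop reuses them with swap()/clear() instead of constructing and copy-assigning a new vector on every iteration. A sketch of the reuse pattern, with a hypothetical Step() standing in for CalculateOutputIndex:

#include <cstdint>
#include <vector>

void ComputeIndices(int ragged_rank, size_t nvals) {
  std::vector<int64_t> output_index, new_output_index;
  output_index.reserve(nvals);
  new_output_index.reserve(nvals);
  // ... fill output_index with the first-level indices ...
  for (int i = 1; i <= ragged_rank; ++i) {
    // Step(i - 1, output_index, &new_output_index);
    output_index.swap(new_output_index);  // O(1): swaps the two buffers
    new_output_index.clear();             // keeps capacity for the next pass
  }
}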
@@ -397,20 +399,17 @@ void slow_copy_array(VALUE_TYPE* dst, const VALUE_TYPE* src, INDEX_TYPE size) {
}
template <typename VALUE_TYPE, typename INDEX_TYPE>
void copy_array(VALUE_TYPE* dst, const VALUE_TYPE* src, INDEX_TYPE size,
size_t bytes) {
memcpy(dst, src, bytes);
void copy_array(VALUE_TYPE* dst, const VALUE_TYPE* src, INDEX_TYPE size) {
memcpy(dst, src, size * sizeof(VALUE_TYPE));
}
template <>
void copy_array<string, int64>(string* dst, const string* src, int64 size,
size_t bytes) {
void copy_array<string, int64>(string* dst, const string* src, int64 size) {
slow_copy_array(dst, src, size);
}
template <>
void copy_array<string, int32>(string* dst, const string* src, int32 size,
size_t bytes) {
void copy_array<string, int32>(string* dst, const string* src, int32 size) {
slow_copy_array(dst, src, size);
}
@@ -419,13 +418,13 @@ void copy_array<string, int32>(string* dst, const string* src, int32 size,
// is not TriviallyCopyable
template <>
void copy_array<Eigen::half, int64>(Eigen::half* dst, const Eigen::half* src,
int64 size, size_t bytes) {
int64 size) {
slow_copy_array(dst, src, size);
}
template <>
void copy_array<Eigen::half, int32>(Eigen::half* dst, const Eigen::half* src,
int32 size, size_t bytes) {
int32 size) {
slow_copy_array(dst, src, size);
}
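With the explicit bytes argument gone, the generic copy_array derives the byte count from size * sizeof(VALUE_TYPE) and memcpy's, while the specializations for string and Eigen::half (which, per the comment above, is not trivially copyable) keep the element-wise slow_copy_array path. The same dispatch could be expressed with a type trait; a sketch, not the kernel's actual code:

#include <cstring>
#include <type_traits>

template <typename T, typename Index>
void CopyRange(T* dst, const T* src, Index size) {
  if constexpr (std::is_trivially_copyable<T>::value) {
    std::memcpy(dst, src, size * sizeof(T));            // bulk copy
  } else {
    for (Index i = 0; i < size; ++i) dst[i] = src[i];   // element-wise copy
  }
}

The kernel spells this out with explicit specializations instead, which also works on toolchains without if constexpr.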
@@ -435,80 +434,111 @@ class RaggedTensorToTensorOp : public RaggedTensorToTensorBaseOp<INDEX_TYPE> {
explicit RaggedTensorToTensorOp(OpKernelConstruction* context)
: RaggedTensorToTensorBaseOp<INDEX_TYPE>(context) {}
void SetOutput(OpKernelContext* context,
void SetOutput(OpKernelContext* context, int ragged_rank,
const vector<INDEX_TYPE>& output_index,
Tensor* output_tensor) override {
typename tensorflow::TTypes<VALUE_TYPE>::Flat output_flat =
output_tensor->flat<VALUE_TYPE>();
const auto& value_tensor = context->input(kValueInputIndex);
// Note: it's ok to use OP_REQUIRES_OK (rather than TF_RETURN_IF_ERROR)
// in this function, but only because it's the last thing we do before
// returning from Compute().
if (output_tensor->NumElements() == 0) return;
const auto& values_tensor = context->input(kValueInputIndex);
const VALUE_TYPE* values_base = values_tensor.flat<VALUE_TYPE>().data();
const auto& default_value_tensor = context->input(kDefaultValueInputIndex);
if (value_tensor.shape().dims() == 1) {
// Initialize tensor to default_value.
VALUE_TYPE* base_output = output_flat.data();
VALUE_TYPE default_value = default_value_tensor.scalar<VALUE_TYPE>()();
VALUE_TYPE* output_base = output_tensor->flat<VALUE_TYPE>().data();
std::fill(base_output, base_output + output_flat.size(), default_value);
auto values = context->input(kValueInputIndex).flat<VALUE_TYPE>();
int values_size = values.size();
OP_REQUIRES(context, values_size == output_index.size(),
Internal("Values and indices must be equal"));
for (int i = 0; i < values_size; ++i) {
if (output_index[i] >= 0) {
output_flat(output_index[i]) = values(i);
}
}
} else {
const auto& output_shape = output_tensor->shape();
const auto& default_value_shape = default_value_tensor.shape();
TensorShape element_shape = output_tensor->shape();
element_shape.RemoveDimRange(0, ragged_rank + 1);
int value_element_size = element_shape.num_elements();
size_t output_index_size = output_index.size();
// Initialize tensor to default_value.
BCast bcast(BCast::FromShape(default_value_shape),
BCast::FromShape(output_shape),
// Broadcast the default value to value_element_size. (We can skip this
// if default_value_tensor.NumElements() == 1, since we use std::fill
// when that's true.)
const VALUE_TYPE* default_value =
default_value_tensor.flat<VALUE_TYPE>().data();
Tensor bcast_default; // Temporary tensor for result of broadcast
if (default_value_tensor.NumElements() != value_element_size &&
default_value_tensor.NumElements() != 1) {
const auto& src_shape = default_value_tensor.shape();
BCast bcast(BCast::FromShape(src_shape), BCast::FromShape(element_shape),
/*fewer_dims_optimization=*/true);
OP_REQUIRES(
context, bcast.IsValid(),
errors::InvalidArgument(
"Incompatible shapes: ", default_value_shape.DebugString(),
" vs. ", default_value_shape.DebugString()));
OP_REQUIRES(
context, BCast::ToShape(bcast.output_shape()) == output_shape,
errors::InvalidArgument("Unable to broadcast default_value of shape ",
default_value_shape, " to tensor of shape ",
output_shape));
// Note: bcast should always be valid, since we rejected any incompatible
// shapes when we called ValidateDefaultValueShape().
OP_REQUIRES(context, bcast.IsValid(),
errors::InvalidArgument("Error broadcasting default_value"));
OP_REQUIRES_OK(context,
context->allocate_temp(default_value_tensor.dtype(),
element_shape, &bcast_default));
const CPUDevice& device = context->eigen_device<CPUDevice>();
functor::BroadcastTo<CPUDevice, VALUE_TYPE>()(
device, context, *output_tensor, output_shape, default_value_tensor,
default_value_shape, bcast);
device, context, bcast_default, element_shape, default_value_tensor,
src_shape, bcast);
default_value = bcast_default.flat<VALUE_TYPE>().data();
}
VALUE_TYPE* base_output = output_flat.data();
auto values = context->input(kValueInputIndex).flat<VALUE_TYPE>();
size_t values_size = values.size();
size_t output_index_size = output_index.size();
// A value "element" is a group of values that are arranged together.
// For example, if the value shape is [3,4,5], then 20 values are in a
// value element.
int value_element_size = values_size / output_index_size;
int value_element_bytesize = value_element_size * sizeof(VALUE_TYPE);
const VALUE_TYPE* values_base = values.data();
// Loop through the output_index vector, finding contiguous regions that
// should be copied. Once we find the end of a contiguous region, copy it
// and add any necessary padding (with default_value).
INDEX_TYPE src_start = 0; // Start of contiguous region (in values)
INDEX_TYPE dst_start = 0; // Destination for contiguous region (in output)
INDEX_TYPE dst_end = 0; // Destination for contiguous region (in output)
for (int src_i = 0; src_i <= output_index_size; ++src_i) {
// dst_i is the destination where the value at src_i should be copied.
INDEX_TYPE dst_i = src_i < output_index_size ? output_index[src_i] : -1;
OP_REQUIRES(context,
value_tensor.shape().dim_size(0) == output_index_size,
Internal("Values and indices must be equal"));
// If we're still in a contiguous region, then update dst_end and go to
// the next src_i.
if (dst_i == dst_end) {
++dst_end;
continue;
}
OP_REQUIRES(context,
values_size == output_index_size * value_element_size,
Internal("Values and indices must be equal"));
INDEX_TYPE value_index = 0;
for (int i = 0; i < output_index_size;
++i, value_index += value_element_size) {
if (output_index[i] >= 0) {
VALUE_TYPE* dst = base_output + output_index[i];
const VALUE_TYPE* src = values_base + value_index;
copy_array<VALUE_TYPE, INDEX_TYPE>(dst, src, value_element_size,
value_element_bytesize);
// We found the end of a contiguous region. This can be because we found
// a gap (dst_i > dst_end), a source value that shouldn't be copied
// because it's out-of-bounds (dst_i == -1), or the end of the values
// (in which case dst_i is also -1).
if (dst_start < dst_end) {
// Copy the contiguous region.
const VALUE_TYPE* src = values_base + src_start * value_element_size;
VALUE_TYPE* dst = output_base + dst_start * value_element_size;
INDEX_TYPE nvals = (dst_end - dst_start) * value_element_size;
copy_array<VALUE_TYPE, INDEX_TYPE>(dst, src, nvals);
}
// Add any necessary padding (w/ default_value).
if (src_i >= output_index_size) {
// We reached the end of values: pad to the end of output.
size_t output_size = output_tensor->NumElements();
dst_i = output_size / value_element_size;
}
if (dst_i > dst_end) {
if (default_value_tensor.NumElements() == 1) {
std::fill(output_base + dst_end * value_element_size,
output_base + dst_i * value_element_size, *default_value);
dst_end = dst_i;
} else {
while (dst_i > dst_end) {
VALUE_TYPE* dst = output_base + dst_end * value_element_size;
copy_array<VALUE_TYPE, INDEX_TYPE>(dst, default_value,
value_element_size);
++dst_end;
}
}
}
// Update indices.
if (dst_i < 0) {
// src_i should be skipped -- leave it out of the contiguous region.
src_start = src_i + 1;
dst_start = dst_end;
} else {
// src_i should be copied -- include it in the contiguous region.
src_start = src_i;
dst_start = dst_end;
dst_end = dst_start + 1;
}
}
}
};
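The rewritten SetOutput walks output_index looking for runs of consecutive destination indices, flushes each run with a single copy_array call, and pads any gap (and the tail of the tensor) with the default value, which is now broadcast only to the shape of one value element instead of to the whole output. A standalone sketch of the contiguous-region idea for scalar values, with hypothetical names; it simplifies the padding by pre-filling the output, whereas the kernel pads gaps on the fly:

#include <algorithm>
#include <cstdint>
#include <vector>

// out_index[i] is the destination of values[i]; -1 means "drop this value".
// Runs of consecutive destinations are flushed with one bulk copy.
template <typename T>
void Scatter(const std::vector<int64_t>& out_index,
             const std::vector<T>& values, T fill, std::vector<T>* output) {
  std::fill(output->begin(), output->end(), fill);  // default-fill up front
  int64_t src_start = 0, dst_start = 0, dst_end = 0;
  const int64_t n = out_index.size();
  for (int64_t src_i = 0; src_i <= n; ++src_i) {
    const int64_t dst_i = (src_i < n) ? out_index[src_i] : -1;
    if (dst_i == dst_end) {  // still contiguous: extend the current run
      ++dst_end;
      continue;
    }
    if (dst_start < dst_end) {  // flush the finished run in one copy
      std::copy(values.begin() + src_start,
                values.begin() + src_start + (dst_end - dst_start),
                output->begin() + dst_start);
    }
    if (dst_i < 0) {  // value is dropped (or we ran out of values)
      src_start = src_i + 1;
      dst_start = dst_end;
    } else {  // start a new run at dst_i
      src_start = src_i;
      dst_start = dst_i;
      dst_end = dst_i + 1;
    }
  }
}

// E.g. out_index = {0, 1, 3, -1, 4} with values {a, b, c, d, e} and a
// 6-element output yields {a, b, fill, c, e, fill}: one copy for {a, b},
// one for {c}, one for {e}, and d is dropped.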