[WhereCPUOp] Optimize the tensor-wrapping of the num_true scalar.

Previously, we allocated a temporary, aligned, scalar Tensor of DT_INT64 to receive the `num_true` result, which involves multiple virtual calls, atomic operations, etc. Instead, we now (on CPU) allocate an `int64` on the stack and wrap it in a `TTypes<int64>::UnalignedScalar`, which is cheap to create.

PiperOrigin-RevId: 296928208
Change-Id: I61a55096d1bc1f673f9cac21984b27a754960a0b
This commit is contained in:
Derek Murray 2020-02-24 10:55:23 -08:00 committed by TensorFlower Gardener
parent 2d67dbde27
commit db8443e2fd
3 changed files with 9 additions and 13 deletions

View File

@@ -75,7 +75,7 @@ template <typename T>
struct NumTrue<CPUDevice, T, int64> {
static Status Compute(OpKernelContext* ctx, const CPUDevice& d,
typename TTypes<T>::ConstFlat input,
TTypes<int64>::Scalar num_true) {
TTypes<int64>::UnalignedScalar num_true) {
num_true() = CountAccumulator<T>(input.data(), input.data() + input.size());
return Status::OK();
}
@@ -140,18 +140,14 @@ class WhereCPUOp : public OpKernel {
const int input_dims = input.dims();
Tensor num_true;
AllocatorAttributes attr;
attr.set_on_host(true);
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}),
&num_true, attr));
auto num_true_t = num_true.scalar<int64>();
int64 num_true;
TTypes<int64>::UnalignedScalar num_true_t(&num_true);
Status s = functor::NumTrue<CPUDevice, T, int64>::Compute(
context, context->eigen_device<CPUDevice>(), input.flat<T>(),
num_true_t);
OP_REQUIRES_OK(context, s);
TensorShape output_shape({num_true_t(), input_dims});
TensorShape output_shape({num_true, input_dims});
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
@@ -216,7 +212,7 @@ namespace functor {
template <> \
Status NumTrue<GPUDevice, T, Tindex>::Compute( \
OpKernelContext* ctx, const GPUDevice& d, TTypes<T>::ConstFlat input, \
TTypes<Tindex>::Scalar num_true); \
TTypes<Tindex>::UnalignedScalar num_true); \
extern template struct NumTrue<GPUDevice, T, Tindex>
#define DECLARE_GPU_NUMTRUE_TYPE(T) \
@@ -287,8 +283,8 @@ class WhereGPUOp : public AsyncOpKernel {
context->allocate_temp(DataTypeToEnum<Tindex>::v(),
TensorShape({}), &num_true),
done);
auto num_true_t = num_true.scalar<Tindex>();
typename TTypes<Tindex>::UnalignedScalar num_true_t(
num_true.scalar<Tindex>().data());
se::DeviceMemoryBase num_true_ptr(static_cast<void*>(num_true_t.data()));
// Push kernel to stream to get number of true elements.

View File

@@ -41,7 +41,7 @@ struct NumTrue {
EIGEN_ALWAYS_INLINE static Status Compute(
OpKernelContext* ctx, const Device& d,
typename TTypes<T>::ConstFlat input,
typename TTypes<TIndex>::Scalar num_true);
typename TTypes<TIndex>::UnalignedScalar num_true);
};
template <typename Device, int NDIM, typename T, typename TIndex>

View File

@@ -149,7 +149,7 @@ struct NumTrue<GPUDevice, T, TIndex> {
EIGEN_ALWAYS_INLINE static Status Compute(
OpKernelContext* ctx, const GPUDevice& d,
typename TTypes<T>::ConstFlat input,
typename TTypes<TIndex>::Scalar num_true) {
typename TTypes<TIndex>::UnalignedScalar num_true) {
const auto& cu_stream = GetGpuStream(ctx);
std::size_t temp_storage_bytes = 0;