[WhereCPUOp] Optimize the tensor-wrapping of the num_true scalar.
Previously, we allocated a temporary, aligned, scalar Tensor of DT_INT64 to receive the `num_true` result, which involves multiple virtual calls, atomic operations, etc. Instead, we now (on CPU) allocate an `int64` on the stack and wrap it in a `TTypes<int64>::UnalignedScalar`, which is cheap to construct.

PiperOrigin-RevId: 296928208
Change-Id: I61a55096d1bc1f673f9cac21984b27a754960a0b
commit db8443e2fd
parent 2d67dbde27
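In short, `TTypes<int64>::UnalignedScalar` is a rank-0 Eigen::TensorMap without the Eigen::Aligned access policy, so it can view any `int64`, including one on the stack. A minimal, self-contained sketch of the pattern (plain Eigen, outside TensorFlow; all names are illustrative):

    #include <cstdint>
    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      int64_t num_true = 0;  // lives on the stack; no allocator involved

      // Rank-0 TensorMap over &num_true. This is roughly what
      // TTypes<int64>::UnalignedScalar expands to; the default (unaligned)
      // access policy matters because a stack int64 need not satisfy
      // Eigen's alignment requirement.
      Eigen::TensorMap<Eigen::TensorFixedSize<int64_t, Eigen::Sizes<>>>
          num_true_t(&num_true);

      num_true_t() = 42;              // writes straight through the map
      std::cout << num_true << "\n";  // prints 42
      return 0;
    }

Constructing the map copies a single pointer, versus the old path's full Tensor allocation with its virtual dispatch and atomic refcounting.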
tensorflow/core/kernels
@@ -75,7 +75,7 @@ template <typename T>
 struct NumTrue<CPUDevice, T, int64> {
   static Status Compute(OpKernelContext* ctx, const CPUDevice& d,
                         typename TTypes<T>::ConstFlat input,
-                        TTypes<int64>::Scalar num_true) {
+                        TTypes<int64>::UnalignedScalar num_true) {
     num_true() = CountAccumulator<T>(input.data(), input.data() + input.size());
     return Status::OK();
   }
@@ -140,18 +140,14 @@ class WhereCPUOp : public OpKernel {
     const int input_dims = input.dims();

-    Tensor num_true;
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}),
-                                                   &num_true, attr));
-    auto num_true_t = num_true.scalar<int64>();
+    int64 num_true;
+    TTypes<int64>::UnalignedScalar num_true_t(&num_true);

     Status s = functor::NumTrue<CPUDevice, T, int64>::Compute(
         context, context->eigen_device<CPUDevice>(), input.flat<T>(),
         num_true_t);
     OP_REQUIRES_OK(context, s);
-    TensorShape output_shape({num_true_t(), input_dims});
+    TensorShape output_shape({num_true, input_dims});
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
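For reference, the type distinction the new CPU code relies on lives in tensorflow/core/framework/tensor_types.h: `Scalar` and `UnalignedScalar` are the same rank-0 map except for the access policy. Roughly what the typedefs expand to (a simplified sketch, not the verbatim header):

    template <typename T, typename IndexType = Eigen::DenseIndex>
    struct TTypes {
      // Requires allocator-aligned memory (Eigen::Aligned).
      typedef Eigen::TensorMap<
          Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
          Eigen::Aligned>
          Scalar;
      // Default access policy: no alignment assumption, so a stack
      // variable's address is a valid backing store.
      typedef Eigen::TensorMap<
          Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
          UnalignedScalar;
    };

This is also why `TensorShape({num_true, input_dims})` can now read the plain `int64` directly instead of dereferencing a scalar Tensor.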
@@ -216,7 +212,7 @@ namespace functor {
   template <>                                                               \
   Status NumTrue<GPUDevice, T, Tindex>::Compute(                            \
       OpKernelContext* ctx, const GPUDevice& d, TTypes<T>::ConstFlat input, \
-      TTypes<Tindex>::Scalar num_true);                                     \
+      TTypes<Tindex>::UnalignedScalar num_true);                            \
   extern template struct NumTrue<GPUDevice, T, Tindex>

 #define DECLARE_GPU_NUMTRUE_TYPE(T) \
@@ -287,8 +283,8 @@ class WhereGPUOp : public AsyncOpKernel {
         context->allocate_temp(DataTypeToEnum<Tindex>::v(),
                                TensorShape({}), &num_true),
         done);
-    auto num_true_t = num_true.scalar<Tindex>();
+    typename TTypes<Tindex>::UnalignedScalar num_true_t(
+        num_true.scalar<Tindex>().data());

     se::DeviceMemoryBase num_true_ptr(static_cast<void*>(num_true_t.data()));
     // Push kernel to stream to get number of true elements.
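Note the asymmetry with the CPU path: on GPU the count is produced in device memory, so the temporary Tensor cannot be replaced by a stack variable. The change above only re-wraps the tensor's existing buffer to satisfy the new UnalignedScalar signature; a sketch of why that is zero-copy (assuming the `num_true` Tensor allocated just above):

    // Both maps alias the same device buffer; constructing the
    // UnalignedScalar copies only a pointer, never the payload.
    Tindex* buf = num_true.scalar<Tindex>().data();
    typename TTypes<Tindex>::UnalignedScalar num_true_t(buf);
    // num_true_t.data() == buf, so the se::DeviceMemoryBase built from it
    // refers to the same device address as before this change.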
@@ -41,7 +41,7 @@ struct NumTrue {
   EIGEN_ALWAYS_INLINE static Status Compute(
       OpKernelContext* ctx, const Device& d,
       typename TTypes<T>::ConstFlat input,
-      typename TTypes<TIndex>::Scalar num_true);
+      typename TTypes<TIndex>::UnalignedScalar num_true);
 };

 template <typename Device, int NDIM, typename T, typename TIndex>
@@ -149,7 +149,7 @@ struct NumTrue<GPUDevice, T, TIndex> {
   EIGEN_ALWAYS_INLINE static Status Compute(
       OpKernelContext* ctx, const GPUDevice& d,
       typename TTypes<T>::ConstFlat input,
-      typename TTypes<TIndex>::Scalar num_true) {
+      typename TTypes<TIndex>::UnalignedScalar num_true) {
     const auto& cu_stream = GetGpuStream(ctx);

     std::size_t temp_storage_bytes = 0;