[WhereCPUOp] Optimize the tensor-wrapping of the num_true scalar.
Previously, we allocated a temporary, aligned, scalar Tensor of DT_INT64 to receive the `num_true` result, which involves multiple virtual calls, atomic operations, etc. Instead, we now (on CPU) allocate an `int64` on the stack and wrap it in a `TTypes<int64>::UnalignedScalar`, which is cheap to construct.

PiperOrigin-RevId: 296928208
Change-Id: I61a55096d1bc1f673f9cac21984b27a754960a0b
commit db8443e2fd
parent 2d67dbde27
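In short, `TTypes<int64>::UnalignedScalar` is a rank-0 Eigen::TensorMap without the Eigen::Aligned access policy, so it can view any `int64`, including one on the stack. A minimal, self-contained sketch of the pattern (plain Eigen, outside TensorFlow; all names are illustrative):

    #include <cstdint>
    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      int64_t num_true = 0;  // lives on the stack; no allocator involved

      // Rank-0 TensorMap over &num_true. This is roughly what
      // TTypes<int64>::UnalignedScalar expands to; the default (unaligned)
      // access policy matters because a stack int64 need not satisfy
      // Eigen's alignment requirement.
      Eigen::TensorMap<Eigen::TensorFixedSize<int64_t, Eigen::Sizes<>>>
          num_true_t(&num_true);

      num_true_t() = 42;              // writes straight through the map
      std::cout << num_true << "\n";  // prints 42
      return 0;
    }

Constructing the map copies a single pointer, versus the old path's full Tensor allocation with its virtual dispatch and atomic refcounting.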
tensorflow/core/kernels
@@ -75,7 +75,7 @@ template <typename T>
 struct NumTrue<CPUDevice, T, int64> {
   static Status Compute(OpKernelContext* ctx, const CPUDevice& d,
                         typename TTypes<T>::ConstFlat input,
-                        TTypes<int64>::Scalar num_true) {
+                        TTypes<int64>::UnalignedScalar num_true) {
     num_true() = CountAccumulator<T>(input.data(), input.data() + input.size());
     return Status::OK();
   }
@@ -140,18 +140,14 @@ class WhereCPUOp : public OpKernel {
     const int input_dims = input.dims();

-    Tensor num_true;
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}),
-                                                   &num_true, attr));
-    auto num_true_t = num_true.scalar<int64>();
+    int64 num_true;
+    TTypes<int64>::UnalignedScalar num_true_t(&num_true);

     Status s = functor::NumTrue<CPUDevice, T, int64>::Compute(
         context, context->eigen_device<CPUDevice>(), input.flat<T>(),
         num_true_t);
     OP_REQUIRES_OK(context, s);
-    TensorShape output_shape({num_true_t(), input_dims});
+    TensorShape output_shape({num_true, input_dims});
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
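For reference, the type distinction the new CPU code relies on lives in tensorflow/core/framework/tensor_types.h: `Scalar` and `UnalignedScalar` are the same rank-0 map except for the access policy. Roughly what the typedefs expand to (a simplified sketch, not the verbatim header):

    template <typename T, typename IndexType = Eigen::DenseIndex>
    struct TTypes {
      // Requires allocator-aligned memory (Eigen::Aligned).
      typedef Eigen::TensorMap<
          Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
          Eigen::Aligned>
          Scalar;
      // Default access policy: no alignment assumption, so a stack
      // variable's address is a valid backing store.
      typedef Eigen::TensorMap<
          Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
          UnalignedScalar;
    };

This is also why `TensorShape({num_true, input_dims})` can now read the plain `int64` directly instead of dereferencing a scalar Tensor.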
@@ -216,7 +212,7 @@ namespace functor {
   template <>                                                               \
   Status NumTrue<GPUDevice, T, Tindex>::Compute(                            \
       OpKernelContext* ctx, const GPUDevice& d, TTypes<T>::ConstFlat input, \
-      TTypes<Tindex>::Scalar num_true);                                     \
+      TTypes<Tindex>::UnalignedScalar num_true);                            \
   extern template struct NumTrue<GPUDevice, T, Tindex>

 #define DECLARE_GPU_NUMTRUE_TYPE(T) \
@@ -287,8 +283,8 @@ class WhereGPUOp : public AsyncOpKernel {
         context->allocate_temp(DataTypeToEnum<Tindex>::v(),
                                TensorShape({}), &num_true),
         done);
-    auto num_true_t = num_true.scalar<Tindex>();
+    typename TTypes<Tindex>::UnalignedScalar num_true_t(
+        num_true.scalar<Tindex>().data());

     se::DeviceMemoryBase num_true_ptr(static_cast<void*>(num_true_t.data()));
     // Push kernel to stream to get number of true elements.
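Note the asymmetry with the CPU path: on GPU the count is produced in device memory, so the temporary Tensor cannot be replaced by a stack variable. The change above only re-wraps the tensor's existing buffer to satisfy the new UnalignedScalar signature; a sketch of why that is zero-copy (assuming the `num_true` Tensor allocated just above):

    // Both maps alias the same device buffer; constructing the
    // UnalignedScalar copies only a pointer, never the payload.
    Tindex* buf = num_true.scalar<Tindex>().data();
    typename TTypes<Tindex>::UnalignedScalar num_true_t(buf);
    // num_true_t.data() == buf, so the se::DeviceMemoryBase built from it
    // refers to the same device address as before this change.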
@@ -41,7 +41,7 @@ struct NumTrue {
   EIGEN_ALWAYS_INLINE static Status Compute(
       OpKernelContext* ctx, const Device& d,
       typename TTypes<T>::ConstFlat input,
-      typename TTypes<TIndex>::Scalar num_true);
+      typename TTypes<TIndex>::UnalignedScalar num_true);
 };

 template <typename Device, int NDIM, typename T, typename TIndex>
@@ -149,7 +149,7 @@ struct NumTrue<GPUDevice, T, TIndex> {
   EIGEN_ALWAYS_INLINE static Status Compute(
       OpKernelContext* ctx, const GPUDevice& d,
       typename TTypes<T>::ConstFlat input,
-      typename TTypes<TIndex>::Scalar num_true) {
+      typename TTypes<TIndex>::UnalignedScalar num_true) {
     const auto& cu_stream = GetGpuStream(ctx);

     std::size_t temp_storage_bytes = 0;