From fc44600e5c3ccf1de1e3d4792a00d3578311d3f6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 12 Nov 2018 08:41:34 -0800
Subject: [PATCH] Avoid overflow in reduction GPU kernel for large tensors, see
 issue #22123.

Improve launch code.

PiperOrigin-RevId: 221097986
---
 .../core/kernels/reduction_gpu_kernels.cu.h   | 87 ++++++++++---------
 1 file changed, 46 insertions(+), 41 deletions(-)

diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index f5644d0da4c..e9cf36c62b9 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -218,7 +218,11 @@ __global__ void RowReduceKernel(
     T in, outT out, int num_rows, int num_cols, Op op,
     typename std::iterator_traits<T>::value_type initVal) {
   typedef typename std::iterator_traits<T>::value_type value_type;
-  const int row = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+  // Defensive index computation to avoid integer overflow.
+  assert(blockDim.x % 32 == 0);
+  int warps_per_block = blockDim.x / 32;
+  int warp_index = threadIdx.x / 32;
+  const int row = blockIdx.x * warps_per_block + warp_index;
   const int lane = threadIdx.x % 32;
 
   if (num_cols == 1) {
@@ -526,27 +530,27 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
         init);
     return;
   }
 
-  std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-  // written as a loop because it reduces clutter
-  // first pass allocates memory, second launches kernel(s)
-  for (int i = 0; i < 2; ++i) {
-    auto success = cub::DeviceReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, in, out, in_size, op, init, cu_stream);
+  size_t temp_storage_bytes = 0;
+  auto reduce = [&](void* temp_storage_ptr) {
+    auto success =
+        cub::DeviceReduce::Reduce(temp_storage_ptr, temp_storage_bytes, in, out,
+                                  in_size, op, init, cu_stream);
 
     OP_REQUIRES(
         ctx, success == 0,
         errors::Internal("CUB reduce error", cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -569,25 +573,26 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows,
   cub::TransformInputIterator<int, RowOffset, cub::CountingInputIterator<int>>
       transform_iter(counting_iter, row_offset_op);
 
-  std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-  for (int i = 0; i < 2; ++i) {
+  size_t temp_storage_bytes = 0;
+  auto reduce = [&](void* temp_storage_ptr) {
     auto success = cub::DeviceSegmentedReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, in, out, num_rows, transform_iter,
+        temp_storage_ptr, temp_storage_bytes, in, out, num_rows, transform_iter,
         transform_iter + 1, op, init, cu_stream);
 
     OP_REQUIRES(ctx, success == 0,
                 errors::Internal("CUB segmented reduce error",
                                  cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -720,25 +725,25 @@ void Launch3DXZReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x,
                                                                  gather_iter);
 
   std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-
-  for (int i = 0; i < 2; ++i) {
+  auto reduce = [&](void* temp_storage_ptr) {
     auto success = cub::DeviceSegmentedReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, permute_iter, out, extent_y, transform_iter,
-        transform_iter + 1, op, init, cu_stream);
+        temp_storage_ptr, temp_storage_bytes, permute_iter, out, extent_y,
+        transform_iter, transform_iter + 1, op, init, cu_stream);
 
     OP_REQUIRES(ctx, success == 0,
                 errors::Internal("CUB segmented reduce error",
                                  cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 namespace reduction_op_helper {
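
For illustration, the standalone CUDA program below (not part of the patch; the kernel name RowIndexDemo, the witness output, and the 2^28 row count are hypothetical) sketches the overflow the first hunk fixes: the old index computation evaluates blockIdx.x * blockDim.x + threadIdx.x in 32 bits, so with one 32-thread warp per row the global thread index wraps once a tensor has more than about 2^27 rows, and the higher rows are silently never reduced. Dividing by the warp size before combining the indices, as the patch does, keeps every intermediate on the order of the row count.

// Standalone sketch of the overflow fixed in the first hunk. Everything here
// (RowIndexDemo, witness, the 1 << 28 row count) is illustrative and not
// taken from the TensorFlow sources.
#include <cassert>
#include <cstdio>

#include <cuda_runtime.h>

__global__ void RowIndexDemo(long long* witness, long long num_rows) {
  // Old form: blockIdx.x * blockDim.x + threadIdx.x is evaluated in 32 bits,
  // so with one 32-thread warp per row it wraps for rows >= 2^27 and those
  // rows are never visited:
  //   const int row = (blockIdx.x * blockDim.x + threadIdx.x) / 32;

  // New form, as in the patch: divide by the warp size before combining the
  // indices, keeping every intermediate on the order of the row count.
  assert(blockDim.x % 32 == 0);
  int warps_per_block = blockDim.x / 32;
  int warp_index = threadIdx.x / 32;
  const int row = blockIdx.x * warps_per_block + warp_index;

  // The warp leader of the last row reports in. With the old formula no
  // thread ever computes this row, so the witness stays at -1.
  if (row == num_rows - 1 && threadIdx.x % 32 == 0) *witness = row;
}

int main() {
  const long long num_rows = 1LL << 28;  // Large enough to trip the old form.
  const int threads_per_block = 1024;
  const int warps_per_block = threads_per_block / 32;
  const int num_blocks =
      static_cast<int>((num_rows + warps_per_block - 1) / warps_per_block);

  long long* witness = nullptr;
  cudaMallocManaged(&witness, sizeof(*witness));
  *witness = -1;
  RowIndexDemo<<<num_blocks, threads_per_block>>>(witness, num_rows);
  cudaDeviceSynchronize();
  std::printf("last row visited: %lld (expected %lld)\n", *witness,
              num_rows - 1);
  cudaFree(witness);
  return 0;
}

The launch-code change in the remaining hunks is structural rather than behavioral: CUB's DeviceReduce and DeviceSegmentedReduce must be invoked twice, once with a null temp-storage pointer to learn temp_storage_bytes and once to run, and the lambda makes those two passes explicit where the old code hid them inside a two-iteration loop.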