From b9579f96bd07d3016285128e1e2466540b47bf01 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Fri, 27 Mar 2020 14:20:09 -0700 Subject: [PATCH 1/2] Vectorize transpose --- tensorflow/core/kernels/conv_2d_gpu.h | 91 +++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 31abe9dfead..90d85e6f04e 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -210,6 +210,57 @@ __global__ void ShuffleInTensor3Simple(int nthreads, } } +constexpr int kUnroll = 4; + +template +__global__ void ShuffleInTensor3SimpleVector(int nthreads, + const T* __restrict__ input, + Dimension<3> input_dims, + T* __restrict__ output) { + Dimension<3> output_dims; + output_dims[sp0] = input_dims[0]; + output_dims[sp1] = input_dims[1]; + output_dims[sp2] = input_dims[2]; + + const int stride = blockDim.x * gridDim.x * kUnroll; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + T buf[kUnroll]; + + int output_index; + for (output_index = tid * kUnroll; output_index + kUnroll - 1 < nthreads; + output_index += stride) { +#pragma unroll + for (int i = 0; i < kUnroll; i++) { + int output_index_i = output_index + i; + Index<3> output_tensor_index = FlatToTensorIndex(output_index_i, + output_dims); + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index_i = TensorIndexToFlat(input_tensor_index, input_dims); + buf[i] = maybe_conj::run(ldg(input + input_index_i)); + } + float2 *out = reinterpret_cast(output + output_index); + *out = *reinterpret_cast(buf); + } + + for(; output_index < nthreads; output_index++) { + Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); + + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + + output[output_index] = + maybe_conj::run(ldg(input + input_index)); + } +} + // Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor, // where dimensions are zero-based: output[i][j][k] = input[i][k][j]. // @@ -1008,10 +1059,42 @@ struct SwapDimension0And2InTensor3 { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; GpuLaunchConfig config = GetGpuLaunchConfig(total_size, d); - TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, - config.block_count, config.thread_per_block, 0, - d.stream(), config.virtual_thread_count, in, - input_dims, out)); + + auto out_ptr = reinterpret_cast(out); + bool aligned = out_ptr % 16 == 0; + + bool use_vector = false; + bool use_custom_config = false; + if (input_dims[0] <= 128 && input_dims[2] <= 128 || + input_dims[0] * input_dims[1] <= 128 || + input_dims[1] * input_dims[2] <= 8) { + use_vector = true; + use_custom_config = true; + } else if (input_dims[1] * input_dims[2] <= 16384) { + use_vector = true; + } + + if (sizeof(T) == 2 && aligned && use_vector) { + int block_count; + if (use_custom_config) { + block_count = (total_size + config.thread_per_block - 1) / + config.thread_per_block; + } else { + block_count = config.block_count; + } + + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3SimpleVector, + block_count, + config.thread_per_block / kUnroll, + 0, d.stream(), total_size, + in, input_dims, out)); + } else { + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in, input_dims, out)); + } } }; From 0975574df38cecd6f5643d0c188342cef96b463e Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 11 May 2020 10:46:01 -0700 Subject: [PATCH 2/2] Minor changes --- tensorflow/core/kernels/conv_2d_gpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 90d85e6f04e..297016160ad 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -210,7 +210,7 @@ __global__ void ShuffleInTensor3Simple(int nthreads, } } -constexpr int kUnroll = 4; +static constexpr int kUnroll = 4; template __global__ void ShuffleInTensor3SimpleVector(int nthreads, @@ -246,7 +246,7 @@ __global__ void ShuffleInTensor3SimpleVector(int nthreads, *out = *reinterpret_cast(buf); } - for(; output_index < nthreads; output_index++) { + for (; output_index < nthreads; ++output_index) { Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); Index<3> input_tensor_index;