diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index 3d4670c9bae..9083626fbf0 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -272,6 +272,88 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
   }
 }
 
+// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor
+// when only one of the dimension sizes is smaller than 16,
+// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
+//
+// small_dim = the smaller of dimension-1 and dimension-2
+// large_dim = the larger of dimension-1 and dimension-2
+// tile_num_per_block = gridDim.x = (large_dim + kTileLength - 1) / kTileLength
+// kTileLength = blockDim.x (currently 64)
+//
+// For each batch it handles, a thread block operates on one rectangular tile
+// whose width along the large dimension is kTileLength and whose height is
+// small_dim. We set the thread block's X dimension to kTileLength, and its Y
+// and Z dimensions to one.
+template <typename T, int ShmemSize, bool SmallDim2>
+__global__ void SwapDimension1And2InTensor3SmallDim(const T* input,
+                                                    int batch_per_block,
+                                                    Dimension<3> input_dims,
+                                                    T* output) {
+  // TODO(yangzihao): avoid shared memory bank conflicts.
+  __shared__ T shared_memory_tile[ShmemSize];
+
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.z == 1);
+
+  int block_offset = blockIdx.x * blockDim.x;
+
+  int x = threadIdx.x;
+  int tile_height = blockDim.x;
+
+  // Identify the small/large dimension sizes and the thread/block origins.
+  int small_dim = SmallDim2 ? input_dims[2] : input_dims[1];
+  int large_dim = SmallDim2 ? input_dims[1] : input_dims[2];
+
+  int global_offset = small_dim * large_dim * (blockIdx.y * batch_per_block) +
+                      (SmallDim2 ? block_offset * small_dim : block_offset);
+  if (global_offset >= (input_dims[0] * input_dims[1] * input_dims[2])) return;
+
+  for (int batch = 0; batch < batch_per_block; ++batch) {
+    // Skip out-of-range batches when input_dims[0] is not a multiple of
+    // batch_per_block.
+    if (blockIdx.y * batch_per_block + batch >= input_dims[0]) break;
+    int block_origin_idx =
+        small_dim * large_dim * (blockIdx.y * batch_per_block + batch);
+    int thread_origin_idx =
+        block_origin_idx +
+        (SmallDim2 ? block_offset * small_dim : block_offset) + x;
+
+    if (block_offset + blockDim.x > large_dim) {
+      tile_height = large_dim - block_offset;
+    }
+
+    __syncthreads();
+
+    // Load a contiguous memory region into the shared memory tile.
+    if (x < tile_height) {
+      for (int y = 0; y < small_dim; y++) {
+        int shmem_index =
+            SmallDim2 ? (x + y * tile_height) : (x * small_dim + y);
+        shared_memory_tile[shmem_index] =
+            ldg(input + thread_origin_idx +
+                y * (SmallDim2 ? tile_height : large_dim));
+      }
+    }
+
+    __syncthreads();
+
+    // Get the block origin index for the output array.
+    int output_block_offset = block_origin_idx;
+    int output_block_idx = SmallDim2 ? block_offset : block_offset * small_dim;
+    int output_block_origin_idx = output_block_offset + output_block_idx;
+
+    // Store the transposed tile from shared memory back to device memory.
+    if (x < tile_height) {
+      for (int y = 0; y < small_dim; y++) {
+        int output_idx = output_block_origin_idx + x +
+                         y * (SmallDim2 ? large_dim : tile_height);
+        int shmem_index =
+            SmallDim2 ? (x * small_dim + y) : (x + y * tile_height);
+        output[output_idx] = shared_memory_tile[shmem_index];
+      }
+    }
+  }
+}
+
 // A Cuda custom kernel that convert input to output, given proper padding on
 // the left and the top. The padded value is zero.
 template <typename T, int NDIMS>
@@ -420,25 +502,62 @@ template <typename T>
 void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
                                     const Dimension<3>& input_dims, T* output) {
   // If both dimensions are not trivial, use tiles for the actual swapping.
+  // If only one dimension is trivial, use the SmallDim kernel for swapping.
   // Otherwise, the trivial swapping relying on the ldg cache is more efficient.
   static const int kMinDimensionToUseTiles = 16;
   bool use_tiles = (input_dims[1] >= kMinDimensionToUseTiles &&
                     input_dims[2] >= kMinDimensionToUseTiles);
+  bool use_small_dim = ((input_dims[1] >= kMinDimensionToUseTiles &&
+                         input_dims[2] < kMinDimensionToUseTiles)) ||
+                       ((input_dims[1] < kMinDimensionToUseTiles &&
+                         input_dims[2] >= kMinDimensionToUseTiles));
+  static const int NumSubTiles = 8;
+
   if (use_tiles) {
-    // We get best performance when TileSize is the number of threads in a warp
-    // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
-    // threads.
     static const int TileSize = 32;
-    static const int NumSubTiles = 8;
     Dimension<3> input_dims_in_tiles = {
         input_dims[0], (input_dims[1] + TileSize - 1) / TileSize,
         (input_dims[2] + TileSize - 1) / TileSize,
     };
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
+    // We get best performance when TileSize is the number of threads in a warp
+    // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
+    // threads.
     SwapDimension1And2InTensor3UsingTiles<<<
         total_tiles_count, dim3(TileSize, NumSubTiles), 0, d.stream()>>>(
         input, input_dims, output);
+  } else if (use_small_dim) {
+    // When only one of the dimensions is smaller than kMinDimensionToUseTiles,
+    // we use one block to process a rectangular region of size
+    // kTileLength * small_dim. We found that setting kTileLength to 64 on a
+    // Titan X (Maxwell) GPU achieves the best performance.
+    //                        large_dim
+    //            +---------------...--------+
+    //            |            |        |    |
+    // small_dim  |            |  ...   |    |
+    //            |            |        |    |
+    //            +--------------...---------+
+    //            \----- ------/         \- -/
+    //                  V                  V
+    //     kTileLength(tile_height)    tile_height
+    static const int kTileLength = 64;
+    static const int kGridDimY = 65535;
+    int large_dim = std::max(input_dims[2], input_dims[1]);
+    int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength;
+    int grid_dim_y = std::min(input_dims[0], kGridDimY);
+    int batch_per_block = (input_dims[0] + grid_dim_y - 1) / grid_dim_y;
+    if (input_dims[2] < input_dims[1]) {
+      SwapDimension1And2InTensor3SmallDim<
+          T, kTileLength * kMinDimensionToUseTiles, true>
+          <<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
+             d.stream()>>>(input, batch_per_block, input_dims, output);
+    } else {
+      SwapDimension1And2InTensor3SmallDim<
+          T, kTileLength * kMinDimensionToUseTiles, false>
+          <<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
+             d.stream()>>>(input, batch_per_block, input_dims, output);
+    }
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
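For reference, the dispatch and launch arithmetic above can be sanity-checked outside of CUDA. The following NumPy sketch is not part of the patch; the constants mirror kTileLength, kMinDimensionToUseTiles, and kGridDimY, and the reference function only states the intended semantics output[i][j][k] = input[i][k][j]:

import numpy as np

K_TILE_LENGTH = 64           # mirrors kTileLength
K_MIN_DIM_TO_USE_TILES = 16  # mirrors kMinDimensionToUseTiles
K_GRID_DIM_Y = 65535         # mirrors kGridDimY (CUDA gridDim.y limit)


def small_dim_launch_params(input_dims):
  # Mirrors the host-side arithmetic in RunSwapDimension1And2InTensor3.
  d0, d1, d2 = input_dims
  large_dim = max(d1, d2)
  tile_num_per_block = (large_dim + K_TILE_LENGTH - 1) // K_TILE_LENGTH  # gridDim.x
  grid_dim_y = min(d0, K_GRID_DIM_Y)                                     # gridDim.y
  batch_per_block = (d0 + grid_dim_y - 1) // grid_dim_y
  return tile_num_per_block, grid_dim_y, batch_per_block


def swap_dim1_and_dim2(x):
  # Reference semantics of the kernel: output[i][j][k] = input[i][k][j].
  return np.swapaxes(x, 1, 2)


# Example: dimension 2 is the small one (3 < 16), so the SmallDim2=true path is taken.
x = np.arange(5 * 1000 * 3, dtype=np.float32).reshape(5, 1000, 3)
print(small_dim_launch_params(x.shape))  # (16, 5, 1)
print(swap_dim1_and_dim2(x).shape)       # (5, 3, 1000)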
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index ab3b851ef85..bdbad146602 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4060,6 +4060,26 @@ cuda_py_test(
     main = "ops/concat_benchmark.py",
 )
 
+cuda_py_test(
+    name = "conv2d_benchmark",
+    size = "large",
+    srcs = ["ops/conv2d_benchmark.py"],
+    additional_deps = [
+        ":client",
+        ":client_testlib",
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":nn_ops",
+        ":platform",
+        ":platform_benchmark",
+        ":random_ops",
+        ":variables",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    main = "ops/conv2d_benchmark.py",
+)
+
 cuda_py_test(
     name = "split_benchmark",
     srcs = ["ops/split_benchmark.py"],
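The new cuda_py_test target simply wraps ops/conv2d_benchmark.py, so the same sweep can also be exercised directly from a source checkout. A minimal sketch, assuming a GPU-enabled build and that the module is importable under its source path:

# Not part of the patch: direct invocation of the new benchmark class.
from tensorflow.python.ops import conv2d_benchmark

bench = conv2d_benchmark.Conv2DBenchmark()
bench.benchmark_conv2d()  # prints one timing line per (batch, channels, padding) combination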
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index 570fa79944c..9e1f83395ba 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -229,6 +229,80 @@ class TransposeTest(test.TestCase):
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
+  def testLargeSizeGPU(self):
+    # If no GPU is available, skip the test.
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    large_shapes = [[1000000, 31, 3], [3, 1000000, 31], [3, 31, 1000000],
+                    [10000, 310, 3], [3, 10000, 310], [3, 310, 10000],
+                    [2, 1000, 1000], [1000, 2, 1000], [1000, 1000, 2]]
+    perms = [[0, 2, 1]] * 9
+
+    for input_shape, perm in zip(large_shapes, perms):
+      total_size = np.prod(input_shape)
+      inp = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_shape)
+      np_ans = self._np_transpose(inp, perm)
+      with self.test_session(use_gpu=True):
+        inx = ops.convert_to_tensor(inp)
+        y = array_ops.transpose(inx, perm)
+        tf_ans = y.eval()
+      self.assertAllEqual(np_ans, tf_ans)
+      self.assertShapeEqual(np_ans, y)
+
+  def testRandomizedSmallDimLargeSizeGPU(self):
+    # If no GPU is available, skip the test.
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    # Draw 10 random shapes with large dimension sizes.
+    # 40% prob to generate dim[0] size within [1, 2048]
+    # 40% prob to generate dim[0] size within [2048, 4095]
+    # 20% prob to generate dim[0] size within [4096, 100000]
+    # 50% prob to use dim[1] as the small dim (<16)
+    num_samples = 10
+    total_size = 500000
+    small_size_limit = 2048
+    large_size_limit = 95905
+    small_size_percentage = 0.4
+    medium_size_percentage = 0.4
+    large_size_percentage = 0.2
+    perms = [[0, 2, 1]] * num_samples
+    dim_zero_sizes = []
+    dim_zero_sizes += list(
+        np.random.randint(
+            small_size_limit, size=int(small_size_percentage * num_samples)) +
+        1)
+    dim_zero_sizes += list(
+        np.random.randint(
+            small_size_limit, size=int(medium_size_percentage * num_samples)) +
+        small_size_limit)
+    dim_zero_sizes += list(
+        np.random.randint(
+            large_size_limit, size=int(large_size_percentage * num_samples)) +
+        small_size_limit * 2)
+    input_shapes = []
+    small_dim_limit = 16
+    for dim_zero_size in dim_zero_sizes:
+      small_dim_size = np.random.randint(small_dim_limit - 1) + 1
+      large_dim_size = int(
+          total_size / dim_zero_size / small_dim_size) + small_dim_limit
+      input_shapes += ([[dim_zero_size, small_dim_size, large_dim_size]]
+                       if np.random.randint(2) else
+                       [[dim_zero_size, large_dim_size, small_dim_size]])
+
+    for input_shape, perm in zip(input_shapes, perms):
+      # Generate input data with random ints from 0 to 9.
+      inp = np.random.randint(10, size=input_shape)
+      np_ans = self._np_transpose(inp, perm)
+      with self.test_session(use_gpu=True):
+        inx = ops.convert_to_tensor(inp)
+        y = array_ops.transpose(inx, perm)
+        tf_ans = y.eval()
+      self.assertAllEqual(np_ans, tf_ans)
+      self.assertShapeEqual(np_ans, y)
+      self._ClearCachedSession()
+
   def testNop(self):
     self._compareCpu(np.arange(0, 6).reshape([3, 2]).astype(np.float32),
                      [0, 1])
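The randomized test above fixes a total element budget and derives the large dimension from the sampled outer and small dimensions. A standalone sketch of that shape generator (not part of the test, restricted to the [1, 2048] bucket for dim[0]) makes the kinds of shapes it produces easy to inspect:

import numpy as np


def sample_small_dim_shape(total_size=500000, small_dim_limit=16):
  # Same derivation as testRandomizedSmallDimLargeSizeGPU: the small dim stays
  # strictly below 16 and the large dim absorbs the remaining element budget.
  dim_zero_size = np.random.randint(2048) + 1
  small_dim_size = np.random.randint(small_dim_limit - 1) + 1
  large_dim_size = int(total_size / dim_zero_size / small_dim_size) + small_dim_limit
  if np.random.randint(2):
    return [dim_zero_size, small_dim_size, large_dim_size]
  return [dim_zero_size, large_dim_size, small_dim_size]


print([sample_small_dim_shape() for _ in range(3)])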
diff --git a/tensorflow/python/ops/conv2d_benchmark.py b/tensorflow/python/ops/conv2d_benchmark.py
new file mode 100644
index 00000000000..6992fa57ead
--- /dev/null
+++ b/tensorflow/python/ops/conv2d_benchmark.py
@@ -0,0 +1,141 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for Conv2D op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+import time
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def build_graph(device, input_shape, filter_shape, strides, padding, num_iters):
+  """Builds a graph containing a sequence of conv2d operations.
+
+  Args:
+    device: String, the device to run on.
+    input_shape: Shape of the input tensor.
+    filter_shape: Shape of the filter tensor.
+    strides: A list of ints. 1-D of length 4. The stride of the sliding
+        window for each dimension of input.
+    padding: A string from: "SAME", "VALID". The type of padding
+        algorithm to use.
+    num_iters: number of iterations to run conv2d.
+
+  Returns:
+    An op that groups the sequence of conv2d ops for run().
+  """
+  with ops.device("/%s:0" % device):
+    inp = variables.Variable(random_ops.truncated_normal(input_shape))
+    filt = variables.Variable(random_ops.truncated_normal(filter_shape))
+
+    outputs = []
+    conv2d_op = nn_ops.conv2d(inp, filt, strides, padding, data_format="NHWC")
+    outputs.append(conv2d_op)
+    for _ in range(1, num_iters):
+      with ops.control_dependencies([conv2d_op]):
+        conv2d_op = nn_ops.conv2d(
+            inp, filt, strides, padding, data_format="NHWC")
+        outputs.append(conv2d_op)
+    return control_flow_ops.group(*outputs)
+
+
+class Conv2DBenchmark(test.Benchmark):
+  """Benchmark conv2d!"""
+
+  def _run_graph(self, device, input_shape, filter_shape, strides, padding,
+                 num_iters):
+    """Runs the graph and prints its execution time.
+
+    Args:
+      device: String, the device to run on.
+      input_shape: Shape of the input tensor.
+      filter_shape: Shape of the filter tensor.
+      strides: A list of ints. 1-D of length 4. The stride of the sliding
+          window for each dimension of input.
+      padding: A string from: "SAME", "VALID". The type of padding
+          algorithm to use.
+      num_iters: Number of iterations to run the benchmark (the number of
+          conv2d ops chained in the graph).
+
+    Returns:
+      The duration of the run in seconds.
+    """
+    graph = ops.Graph()
+    with graph.as_default():
+      outputs = build_graph(device, input_shape, filter_shape, strides, padding,
+                            num_iters)
+      with session_lib.Session(graph=graph) as session:
+        variables.global_variables_initializer().run()
+        # warmup runs
+        session.run(outputs)
+
+        start_time = time.time()
+        session.run(outputs)
+        duration = (time.time() - start_time) / num_iters
+
+        print("%s inputshape:%s filtershape:%s strides:%s padding:%s "
+              "%d iters: %.8f sec" %
+              (device, str(input_shape).replace(" ", ""),
+               str(filter_shape).replace(" ", ""),
+               str(strides).replace(" ", ""), padding, num_iters, duration))
+
+        name_template = (
+            "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_"
+            "strides_{strides}_padding_{padding}")
+
+        self.report_benchmark(
+            name=name_template.format(
+                device=device,
+                inputshape=str(input_shape).replace(" ", ""),
+                filtershape=str(filter_shape).replace(" ", ""),
+                strides=str(strides).replace(" ", ""),
+                padding=padding).replace(" ", ""),
+            iters=num_iters,
+            wall_time=duration)
+
+    return duration
+
+  def benchmark_conv2d(self):
+    print("conv2d benchmark:")
+
+    h = 500
+    w = 500
+    fh = 3
+    fw = 3
+    input_shapes = []
+    filter_shapes = []
+    for b, c in itertools.product([4, 16, 32], [i for i in range(3, 16)]):
+      input_shapes += [[b, h, w, c]]
+      filter_shapes += [[fh, fw, c, b]]
+    strides = [[1, 2, 2, 1]]
+    paddings = ["VALID", "SAME"]
+    for ishape, fshape in zip(input_shapes, filter_shapes):
+      for stride in strides:
+        for padding in paddings:
+          self._run_graph("gpu", ishape, fshape, stride, padding, 80)
+
+
+if __name__ == "__main__":
+  test.main()
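A single configuration can also be timed without running the full sweep in benchmark_conv2d. The snippet below is not part of the patch and reuses the private _run_graph helper purely for illustration; it assumes a GPU-enabled build:

from tensorflow.python.ops import conv2d_benchmark

bench = conv2d_benchmark.Conv2DBenchmark()
# 4 NHWC images of 500x500x3, a 3x3x3x4 filter, stride 2, VALID padding,
# and 80 chained conv2d ops; returns the measured seconds per iteration.
bench._run_graph("gpu", [4, 500, 500, 3], [3, 3, 3, 4], [1, 2, 2, 1], "VALID", 80)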
diff --git a/tensorflow/python/ops/transpose_benchmark.py b/tensorflow/python/ops/transpose_benchmark.py
index 63a314295e3..6b5f0f20d8c 100644
--- a/tensorflow/python/ops/transpose_benchmark.py
+++ b/tensorflow/python/ops/transpose_benchmark.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ from tensorflow.python.platform import test
 
 
 def build_graph(device, input_shape, perm, datatype, num_iters):
-  """Build a graph containing a sequence of conv2d operations.
+  """Builds a graph containing a sequence of transpose operations.
 
   Args:
     device: String, the device to run on.
@@ -50,10 +50,12 @@ def build_graph(device, input_shape, perm, datatype, num_iters):
     t = constant_op.constant(inp, shape=input_shape)
 
     outputs = []
-    outputs.append(array_ops.transpose(t, perm))
-    for i in range(1, num_iters):
-      with ops.control_dependencies([outputs[i - 1]]):
-        outputs.append(array_ops.transpose(t, perm))
+    transpose_op = array_ops.transpose(t, perm)
+    outputs.append(transpose_op)
+    for _ in range(1, num_iters):
+      with ops.control_dependencies([transpose_op]):
+        transpose_op = array_ops.transpose(t, perm)
+        outputs.append(transpose_op)
     return control_flow_ops.group(*outputs)
 
 
@@ -61,7 +63,7 @@ class TransposeBenchmark(test.Benchmark):
   """Benchmark transpose!"""
 
   def _run_graph(self, device, input_shape, perm, num_iters, datatype):
-    """Run the graph and print its execution time.
+    """Runs the graph and prints its execution time.
 
     Args:
       device: String, the device to run on.
@@ -82,9 +84,11 @@ class TransposeBenchmark(test.Benchmark):
       session.run(outputs)
 
       start_time = time.time()
       session.run(outputs)
+      duration = (time.time() - start_time) / num_iters
 
       throughput = np.prod(
           np.array(input_shape)) * datatype().itemsize * 2 / duration / 1e9
+
       print("%s %s inputshape:%s perm:%s %d %.6fsec, %.4fGB/s." %
             (device, str(datatype), str(input_shape).replace(" ", ""),
              str(perm).replace(" ", ""), num_iters, duration, throughput))
@@ -108,12 +112,12 @@ class TransposeBenchmark(test.Benchmark):
 
     datatypes = [np.complex128, np.float64, np.float32, np.float16, np.int8]
 
-    small_shapes = [[2, 20, 20, 20, 16], [2, 16, 20, 20, 20]] * 2 + [[
-        2, 100, 100, 16
-    ], [2, 16, 100, 100]] * 2 + [[2, 5000, 16], [2, 16, 5000]] * 2
-    small_perms = [[0, 4, 1, 2, 3], [0, 2, 3, 4, 1]] + [[4, 1, 2, 3, 0]] * 2 + [
-        [0, 3, 1, 2], [0, 2, 3, 1]
-    ] + [[3, 1, 2, 0]] * 2 + [[0, 2, 1]] * 2 + [[2, 1, 0]] * 2
+    small_shapes = [[2, 20, 20, 20, 16], [2, 16, 20, 20, 20]] * 2
+    small_shapes += [[2, 100, 100, 16], [2, 16, 100, 100]] * 2
+    small_shapes += [[2, 5000, 16], [2, 16, 5000]] * 2
+    small_perms = [[0, 4, 1, 2, 3], [0, 2, 3, 4, 1]] + [[4, 1, 2, 3, 0]] * 2
+    small_perms += [[0, 3, 1, 2], [0, 2, 3, 1]] + [[3, 1, 2, 0]] * 2
+    small_perms += [[0, 2, 1]] * 2 + [[2, 1, 0]] * 2
 
     large_shapes = [[2, 40, 40, 40, 32], [2, 40, 40, 40, 64]] * 2 + [[
         2, 300, 300, 32
@@ -132,5 +136,23 @@ class TransposeBenchmark(test.Benchmark):
       for ishape, perm in zip(large_shapes, large_perms):
         self._run_graph("gpu", ishape, perm, num_iters, datatype)
 
+    small_dim_large_shapes = [[2, 10000, 3], [2, 3, 10000], [2, 10000, 8],
+                              [2, 8, 10000]]
+    small_dim_small_shapes = [[2, 5000, 3], [2, 3, 5000], [2, 5000, 8],
+                              [2, 8, 5000]]
+    small_dim_perms = [[0, 2, 1]] * 4
+
+    num_iters = 320
+    small_dim_large_shape_datatypes = [np.float64, np.float32, np.int8]
+    for datatype in small_dim_large_shape_datatypes:
+      for ishape, perm in zip(small_dim_large_shapes, small_dim_perms):
+        self._run_graph("gpu", ishape, perm, num_iters, datatype)
+
+    small_dim_small_shape_datatypes = [np.complex128, np.float16]
+    for datatype in small_dim_small_shape_datatypes:
+      for ishape, perm in zip(small_dim_small_shapes, small_dim_perms):
+        self._run_graph("gpu", ishape, perm, num_iters, datatype)
+
+
 if __name__ == "__main__":
   test.main()
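The GB/s figure printed by _run_graph counts each element once on the read side and once on the write side. A small helper, not part of the patch, reproduces that calculation for spot-checking reported numbers (the example timing is illustrative only):

import numpy as np


def transpose_throughput_gb_per_s(input_shape, dtype, seconds_per_iter):
  # Matches the formula in TransposeBenchmark._run_graph: bytes read plus bytes written.
  bytes_moved = np.prod(np.array(input_shape)) * np.dtype(dtype).itemsize * 2
  return bytes_moved / seconds_per_iter / 1e9


# A [2, 10000, 8] float32 transpose finishing in 20 microseconds per iteration.
print(transpose_throughput_gb_per_s([2, 10000, 8], np.float32, 20e-6))  # ~64 GB/s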