From 58b1c0f401436f3311c551977c18abc897938a23 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 29 Jan 2020 14:34:30 -0800 Subject: [PATCH] Internal change PiperOrigin-RevId: 292220288 Change-Id: Ib7e23f56f7b79174669d10ae7d938a82d9c19900 --- tensorflow/core/kernels/BUILD | 2 +- tensorflow/core/kernels/argmax_op.cc | 40 +----- tensorflow/core/kernels/argmax_op.h | 8 -- tensorflow/core/kernels/argmax_op_gpu.cu.cc | 136 ------------------ .../python/kernel_tests/argmax_op_test.py | 57 +------- 5 files changed, 4 insertions(+), 239 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 8119c3fae17..328e5545413 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3913,7 +3913,7 @@ tf_kernel_library( tf_kernel_library( name = "argmax_op", prefix = "argmax_op", - deps = MATH_DEPS + if_cuda_or_rocm([":reduction_ops"]), + deps = MATH_DEPS, ) tf_kernel_library( diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc index 34c7744bea7..63bef41a272 100644 --- a/tensorflow/core/kernels/argmax_op.cc +++ b/tensorflow/core/kernels/argmax_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// See docs in ../ops/math_ops.cc. + #define EIGEN_USE_THREADS #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ @@ -39,39 +41,6 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -template -struct CustomArgOp; - -template <> -struct CustomArgOp { - template - // Determines whether the custom kernel in argmax_op_gpu.cu.cc should be - // used, and if so, runs it by calling DoGpuArgOp. If it was run, - // returns true. Otherwise, it returns false and the caller must calculate the - // arg min or max itself. - static bool CustomArgFunc(OpKernelContext* context, const Tensor& input, - int axis, Tensor* output) { - return false; - } -}; - -template <> -struct CustomArgOp { - template - static bool CustomArgFunc(OpKernelContext* context, const Tensor& input, - int axis, Tensor* output) { - if (output->NumElements() <= 1024 || output->dims() > 7) { - // The custom kernel is faster than Eigen when the number of output - // elements is relatively small. We also only handle the Eigen case for up - // to 7 dimensions. - DoGpuArgOp(context, input, axis, output); - return true; - } else { - return false; - } - } -}; - template class ArgOp : public OpKernel { public: @@ -112,11 +81,6 @@ class ArgOp : public OpKernel { return; } - if (CustomArgOp::template CustomArgFunc( - context, input, axis, output)) { - return; - } - #define HANDLE_DIM(NDIM) \ case NDIM: \ ArgFunctor::Reduce##NDIM(context->eigen_device(), \ diff --git a/tensorflow/core/kernels/argmax_op.h b/tensorflow/core/kernels/argmax_op.h index 6f8146de6a7..5306f3ae1ef 100644 --- a/tensorflow/core/kernels/argmax_op.h +++ b/tensorflow/core/kernels/argmax_op.h @@ -18,8 +18,6 @@ limitations under the License. // Generator definition for ArgMaxOp, must be compilable by nvcc. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" @@ -45,7 +43,6 @@ struct ArgMax { DECLARE_COMPUTE_SPEC(7); #undef DECLARE_COMPUTE_SPEC - enum { is_argmax = true }; }; template @@ -66,15 +63,10 @@ struct ArgMin { DECLARE_COMPUTE_SPEC(7); #undef DECLARE_COMPUTE_SPEC - enum { is_argmax = false }; }; } // namespace functor -template -void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis, - Tensor* output); - } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_ diff --git a/tensorflow/core/kernels/argmax_op_gpu.cu.cc b/tensorflow/core/kernels/argmax_op_gpu.cu.cc index 0bb4c35901e..bd7c4b4027c 100644 --- a/tensorflow/core/kernels/argmax_op_gpu.cu.cc +++ b/tensorflow/core/kernels/argmax_op_gpu.cu.cc @@ -20,147 +20,11 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/argmax_op.h" -#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h" -#include "tensorflow/core/kernels/reduction_ops_common.h" namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -typedef tensorflow::TTypes::Tensor::Index Index; - -// To compute the argmax/argmin, we perform a reduction on KeyValuePairs, which -// are (flattened index, value) pairs. -template -using KeyValuePair = cub::KeyValuePair; - -namespace { - -template -struct MaxOrMinFunc; - -// The reduction operator: Returns the KeyValuePair with the highest or lowest -// value. -template -struct MaxOrMinFunc { - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair& lhs, const KeyValuePair& rhs) { - // If one value is NaN, we choose the other value. This behavior is not - // guaranteed by the op and may change in the future. - return (lhs.value > rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs - : rhs; - } -}; - -template -struct MaxOrMinFunc { - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair& lhs, const KeyValuePair& rhs) { - return (lhs.value < rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs - : rhs; - } -}; - -// The output converter: Converts from a KeyValuePair to an index into a a -// specific dimension. dim1 is the size of the dimension being reduced. dim2 is -// the size of the dimension(s) after dim1. -template -struct OutputConverter { - OutputConverter(Index dim1, Index dim2) : dim1_(dim1), dim2_(dim2) {} - - __host__ __device__ __forceinline__ Tout - operator()(const KeyValuePair& key_value_pair) const { - return static_cast((key_value_pair.key / dim2_) % dim1_); - } - - Index dim1_; - Index dim2_; -}; - -} // namespace - -namespace functor { -namespace reduction_op_helper { - -// Template specialization of IdentityValue, to return the identity value for -// the reduction. This is needed for ReduceImpl, a function we call. We return -// (0, -inf) for argmax and (0, inf) for argmin. -template -struct IdentityValue, MaxOrMinFunc> { - KeyValuePair operator()() { - return {0, -std::numeric_limits::infinity()}; - } -}; - -template -struct IdentityValue, MaxOrMinFunc> { - KeyValuePair operator()() { - return {0, std::numeric_limits::infinity()}; - } -}; - -} // namespace reduction_op_helper -} // namespace functor - -template -void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis, - Tensor* output) { - // We collapse adjacent axes of the input tensor in order to view it as a - // 3 dimensional tensor. The reduction axis is not collapsed, so the three new - // axes will be the input axes to the left of the reduction axis, the - // reduction axis, and the input axes to the right of the reduction axis. - Index dim0 = 1; - for (Index i = 0; i < axis; i++) { - dim0 *= input.dim_size(i); - } - Index dim1 = input.dim_size(axis); - Index dim2 = 1; - for (Index i = axis + 1; i < input.dims(); i++) { - dim2 *= input.dim_size(i); - } - DCHECK_EQ(dim0 * dim1 * dim2, input.NumElements()); - - auto inp = input.shaped({dim0, dim1, dim2}); - auto out = output->shaped({dim0, dim2}); - - // We call ReduceImpl to perform the reduction. The input iterator returns - // KeyValuePairs. The reduction functor returns the KeyValuePair with the max - // or min value. The output iterator converts the KeyValuePair into an index - // into dim1. - using InputIterType = cub::ArgIndexInputIterator; - using Functor = MaxOrMinFunc; - using OutputIterType = - TransformOutputIterator, OutputConverter>; - - InputIterType inp_wrapper(inp.data()); - OutputIterType out_wrapper(out.data(), OutputConverter(dim1, dim2)); - - typedef const Eigen::array::Tensor::Index, 1>& ReductionAxes; - Constants constants; - - // TODO(reedwm): We can probably improve performance by writing specialized - // argmax kernels instead of relying on the generic ReduceImpl function - functor::ReduceImpl, Functor, OutputIterType, InputIterType, - ReductionAxes>(context, out_wrapper, inp_wrapper, 3, dim0, - dim1, dim2, 2, constants.kOne, Functor()); -} - -#define DEFINE_GPU_ARG_OPS(T) \ - template void DoGpuArgOp(OpKernelContext * context, \ - const Tensor& input, int axis, \ - Tensor* output); \ - template void DoGpuArgOp(OpKernelContext * context, \ - const Tensor& input, int axis, \ - Tensor* output); \ - template void DoGpuArgOp(OpKernelContext * context, \ - const Tensor& input, int axis, \ - Tensor* output); \ - template void DoGpuArgOp(OpKernelContext * context, \ - const Tensor& input, int axis, \ - Tensor* output); - -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_ARG_OPS); - #define DEFINE_GPU_SPEC(T) \ template struct functor::ArgMax; \ template struct functor::ArgMin; \ diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py index b75ca61ccf4..86d2941b8d3 100644 --- a/tensorflow/python/kernel_tests/argmax_op_test.py +++ b/tensorflow/python/kernel_tests/argmax_op_test.py @@ -21,15 +21,10 @@ import functools import numpy as np -from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variables -from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -74,7 +69,7 @@ class ArgMaxTest(test.TestCase): self._testBothArg(math_ops.argmin, x, 0, x.argmin()) def _testDim(self, dtype): - shape = (3, 2, 4, 1, 5, 3, 2) + shape = (3, 2, 4, 5, 6, 3, 7) x = np.arange(functools.reduce(lambda x, y: x * y, shape), dtype=dtype) np.random.shuffle(x) x = x.reshape(shape) @@ -84,17 +79,9 @@ class ArgMaxTest(test.TestCase): self._testBothArg(math_ops.argmax, x, axis, x.argmax(axis)) self._testBothArg(math_ops.argmin, x, axis, x.argmin(axis)) - def _testLargeOutput(self, dtype): - # Test case where output size is greater than 1024, which uses a different - # codepath on the GPU. - x = np.asarray(100 * np.random.randn(11, 10, 5, 11), dtype=dtype) - self._testBothArg(math_ops.argmax, x, 2, x.argmax(2)) - self._testBothArg(math_ops.argmin, x, 2, x.argmin(2)) - def testFloat(self): self._testBasic(np.float32) self._testDim(np.float32) - self._testLargeOutput(np.float32) def testFloatInt32Output(self): x = np.asarray(100 * np.random.randn(200), dtype=np.float32) @@ -116,12 +103,6 @@ class ArgMaxTest(test.TestCase): def testDouble(self): self._testBasic(np.float64) self._testDim(np.float64) - self._testLargeOutput(np.float64) - - def testHalf(self): - self._testBasic(np.float16) - self._testDim(np.float16) - self._testLargeOutput(np.float16) def testInt32(self): self._testBasic(np.int32) @@ -153,41 +134,5 @@ class ArgMaxTest(test.TestCase): self.assertEqual(ret.shape, (1, 0)) -class ArgMaxBenchmark(test.Benchmark): - - def _RunSingleBenchmark(self, shape, dtype, bench_name): - with session.Session(config=benchmark.benchmark_config()) as sess: - num_dims = len(shape) - var = variables.Variable(random_ops.random_uniform(shape, dtype=dtype)) - variables.variables_initializer([var]).run() - for dim in range(num_dims): - num_ops_in_group = 15 - op = control_flow_ops.group(*(math_ops.argmax(var, dimension=dim) - for _ in range(num_ops_in_group))) - op_name = "%s_%s_dim%d" % (bench_name, dtype.name, dim) - num_bytes = num_ops_in_group * np.prod(shape) * dtype.size - self.run_op_benchmark(sess, op, burn_iters=5, min_iters=20, - name=op_name, mbs=num_bytes / 1e6) - - def _runBenchmarksWithDtype(self, dtype): - self._RunSingleBenchmark((2**17,), dtype, "1d") - self._RunSingleBenchmark((2**13, 2**13), dtype, "square_2d") - self._RunSingleBenchmark((2**5, 2**16), dtype, "rectangle1_2d") - self._RunSingleBenchmark((2**16, 2**5), dtype, "rectangle2_2d") - self._RunSingleBenchmark((2**8, 2**8, 2**8), dtype, "cube_3d") - self._RunSingleBenchmark((2**16, 2**5, 2**5), dtype, "rectangle1_3d") - self._RunSingleBenchmark((2**5, 2**16, 2**5), dtype, "rectangle2_3d") - self._RunSingleBenchmark((2**5, 2**5, 2**16), dtype, "rectangle3_3d") - - def benchmarkFloat(self): - self._runBenchmarksWithDtype(dtypes.float32) - - def benchmarkDouble(self): - self._runBenchmarksWithDtype(dtypes.float64) - - def benchmarkHalf(self): - self._runBenchmarksWithDtype(dtypes.float16) - - if __name__ == "__main__": test.main()