Internal change

PiperOrigin-RevId: 292220288
Change-Id: Ib7e23f56f7b79174669d10ae7d938a82d9c19900

parent c094952fca
commit 58b1c0f401
tensorflow/core/kernels/BUILD
@@ -3913,7 +3913,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "argmax_op",
     prefix = "argmax_op",
-    deps = MATH_DEPS + if_cuda_or_rocm([":reduction_ops"]),
+    deps = MATH_DEPS,
 )
 
 tf_kernel_library(
tensorflow/core/kernels/argmax_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// See docs in ../ops/math_ops.cc.
+
 #define EIGEN_USE_THREADS
 
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
@@ -39,39 +41,6 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device>
-struct CustomArgOp;
-
-template <>
-struct CustomArgOp<CPUDevice> {
-  template <typename T, typename Tout, typename ArgFunctor>
-  // Determines whether the custom kernel in argmax_op_gpu.cu.cc should be
-  // used, and if so, runs it by calling DoGpuArgOp. If it was run,
-  // returns true. Otherwise, it returns false and the caller must calculate the
-  // arg min or max itself.
-  static bool CustomArgFunc(OpKernelContext* context, const Tensor& input,
-                            int axis, Tensor* output) {
-    return false;
-  }
-};
-
-template <>
-struct CustomArgOp<GPUDevice> {
-  template <typename T, typename Tout, typename ArgFunctor>
-  static bool CustomArgFunc(OpKernelContext* context, const Tensor& input,
-                            int axis, Tensor* output) {
-    if (output->NumElements() <= 1024 || output->dims() > 7) {
-      // The custom kernel is faster than Eigen when the number of output
-      // elements is relatively small. We also only handle the Eigen case for up
-      // to 7 dimensions.
-      DoGpuArgOp<T, Tout, ArgFunctor::is_argmax>(context, input, axis, output);
-      return true;
-    } else {
-      return false;
-    }
-  }
-};
-
 template <typename Device, typename T, typename Tout, typename ArgFunctor>
 class ArgOp : public OpKernel {
  public:
@@ -112,11 +81,6 @@ class ArgOp : public OpKernel {
       return;
     }
 
-    if (CustomArgOp<Device>::template CustomArgFunc<T, Tout, ArgFunctor>(
-            context, input, axis, output)) {
-      return;
-    }
-
 #define HANDLE_DIM(NDIM)                                       \
   case NDIM:                                                   \
     ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),  \
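The code removed above dispatches by template specialization: ArgOp is generic over Device, the CPUDevice specialization of CustomArgOp always declines, and the GPUDevice specialization decides at runtime whether the custom kernel applies. Below is a minimal standalone sketch of that pattern, assuming nothing from TensorFlow (FakeCpuDevice, FakeGpuDevice, and the vector-based signature are stand-ins of mine, not real types):

#include <iostream>
#include <vector>

// Stand-in device tags; the real code keys on Eigen's ThreadPoolDevice and
// GpuDevice.
struct FakeCpuDevice {};
struct FakeGpuDevice {};

// The primary template is declared but not defined, so instantiating it with
// an unsupported device type fails at compile time instead of silently.
template <typename Device>
struct CustomArgOp;

// CPU specialization: never takes the custom path.
template <>
struct CustomArgOp<FakeCpuDevice> {
  static bool CustomArgFunc(const std::vector<float>&) { return false; }
};

// "GPU" specialization: claims the work only for small outputs, mirroring
// the NumElements() <= 1024 test in the removed code.
template <>
struct CustomArgOp<FakeGpuDevice> {
  static bool CustomArgFunc(const std::vector<float>& out) {
    return out.size() <= 1024;
  }
};

// Device-generic caller, analogous to ArgOp::Compute: try the custom path
// first; if it declines, fall back to the generic implementation.
template <typename Device>
void Compute(const std::vector<float>& out) {
  if (CustomArgOp<Device>::CustomArgFunc(out)) {
    std::cout << "custom kernel handled " << out.size() << " elements\n";
  } else {
    std::cout << "generic path for " << out.size() << " elements\n";
  }
}

int main() {
  std::vector<float> small(100), large(5000);
  Compute<FakeCpuDevice>(small);  // generic: CPU always declines
  Compute<FakeGpuDevice>(small);  // custom: small output
  Compute<FakeGpuDevice>(large);  // generic: above the cutoff
}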
tensorflow/core/kernels/argmax_op.h
@@ -18,8 +18,6 @@ limitations under the License.
 // Generator definition for ArgMaxOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -45,7 +43,6 @@ struct ArgMax {
   DECLARE_COMPUTE_SPEC(7);
 
 #undef DECLARE_COMPUTE_SPEC
-  enum { is_argmax = true };
 };
 
 template <typename Device, typename T, typename Tout>
@@ -66,15 +63,10 @@ struct ArgMin {
   DECLARE_COMPUTE_SPEC(7);
 
 #undef DECLARE_COMPUTE_SPEC
-  enum { is_argmax = false };
 };
 
 }  // namespace functor
 
-template <typename T, typename Tout, bool is_argmax>
-void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis,
-                Tensor* output);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_
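The enum { is_argmax } members removed above let a functor type carry a compile-time flag that callers forward as a non-type template argument, as in DoGpuArgOp<T, Tout, ArgFunctor::is_argmax> in argmax_op.cc. A short sketch of the idiom under stand-in names (not the real functors, which also declare the ReduceN methods):

#include <iostream>

struct ArgMaxFunctor {
  enum { is_argmax = true };
};

struct ArgMinFunctor {
  enum { is_argmax = false };
};

// Stand-in for DoGpuArgOp: max-vs-min is a template parameter, so the choice
// is made at compile time and costs nothing per element at runtime.
template <bool is_argmax>
void DoArgOp() {
  std::cout << (is_argmax ? "argmax" : "argmin") << " kernel\n";
}

// Generic caller: the functor's enum member becomes a bool template argument.
template <typename ArgFunctor>
void Dispatch() {
  DoArgOp<ArgFunctor::is_argmax>();
}

int main() {
  Dispatch<ArgMaxFunctor>();  // prints "argmax kernel"
  Dispatch<ArgMinFunctor>();  // prints "argmin kernel"
}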
tensorflow/core/kernels/argmax_op_gpu.cu.cc
@@ -20,147 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/argmax_op.h"
-#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
-#include "tensorflow/core/kernels/reduction_ops_common.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-typedef tensorflow::TTypes<float>::Tensor::Index Index;
-
-// To compute the argmax/argmin, we perform a reduction on KeyValuePairs, which
-// are (flattened index, value) pairs.
-template <typename T>
-using KeyValuePair = cub::KeyValuePair<Index, T>;
-
-namespace {
-
-template <typename T, bool is_argmax>
-struct MaxOrMinFunc;
-
-// The reduction operator: Returns the KeyValuePair with the highest or lowest
-// value.
-template <typename T>
-struct MaxOrMinFunc<T, true> {
-  __host__ __device__ __forceinline__ KeyValuePair<T> operator()(
-      const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) {
-    // If one value is NaN, we choose the other value. This behavior is not
-    // guaranteed by the op and may change in the future.
-    return (lhs.value > rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs
-                                                                      : rhs;
-  }
-};
-
-template <typename T>
-struct MaxOrMinFunc<T, false> {
-  __host__ __device__ __forceinline__ KeyValuePair<T> operator()(
-      const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) {
-    return (lhs.value < rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs
-                                                                      : rhs;
-  }
-};
-
-// The output converter: Converts from a KeyValuePair to an index into a
-// specific dimension. dim1 is the size of the dimension being reduced. dim2 is
-// the size of the dimension(s) after dim1.
-template <typename T, typename Tout>
-struct OutputConverter {
-  OutputConverter(Index dim1, Index dim2) : dim1_(dim1), dim2_(dim2) {}
-
-  __host__ __device__ __forceinline__ Tout
-  operator()(const KeyValuePair<T>& key_value_pair) const {
-    return static_cast<Tout>((key_value_pair.key / dim2_) % dim1_);
-  }
-
-  Index dim1_;
-  Index dim2_;
-};
-
-}  // namespace
-
-namespace functor {
-namespace reduction_op_helper {
-
-// Template specialization of IdentityValue, to return the identity value for
-// the reduction. This is needed for ReduceImpl, a function we call. We return
-// (0, -inf) for argmax and (0, inf) for argmin.
-template <typename T>
-struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, true>> {
-  KeyValuePair<T> operator()() {
-    return {0, -std::numeric_limits<T>::infinity()};
-  }
-};
-
-template <typename T>
-struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, false>> {
-  KeyValuePair<T> operator()() {
-    return {0, std::numeric_limits<T>::infinity()};
-  }
-};
-
-}  // namespace reduction_op_helper
-}  // namespace functor
-
-template <typename T, typename Tout, bool is_argmax>
-void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis,
-                Tensor* output) {
-  // We collapse adjacent axes of the input tensor in order to view it as a
-  // 3 dimensional tensor. The reduction axis is not collapsed, so the three new
-  // axes will be the input axes to the left of the reduction axis, the
-  // reduction axis, and the input axes to the right of the reduction axis.
-  Index dim0 = 1;
-  for (Index i = 0; i < axis; i++) {
-    dim0 *= input.dim_size(i);
-  }
-  Index dim1 = input.dim_size(axis);
-  Index dim2 = 1;
-  for (Index i = axis + 1; i < input.dims(); i++) {
-    dim2 *= input.dim_size(i);
-  }
-  DCHECK_EQ(dim0 * dim1 * dim2, input.NumElements());
-
-  auto inp = input.shaped<T, 3>({dim0, dim1, dim2});
-  auto out = output->shaped<Tout, 2>({dim0, dim2});
-
-  // We call ReduceImpl to perform the reduction. The input iterator returns
-  // KeyValuePairs. The reduction functor returns the KeyValuePair with the max
-  // or min value. The output iterator converts the KeyValuePair into an index
-  // into dim1.
-  using InputIterType = cub::ArgIndexInputIterator<const T*>;
-  using Functor = MaxOrMinFunc<T, is_argmax>;
-  using OutputIterType =
-      TransformOutputIterator<Tout, KeyValuePair<T>, OutputConverter<T, Tout>>;
-
-  InputIterType inp_wrapper(inp.data());
-  OutputIterType out_wrapper(out.data(), OutputConverter<T, Tout>(dim1, dim2));
-
-  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
-  Constants<GPUDevice> constants;
-
-  // TODO(reedwm): We can probably improve performance by writing specialized
-  // argmax kernels instead of relying on the generic ReduceImpl function.
-  functor::ReduceImpl<KeyValuePair<T>, Functor, OutputIterType, InputIterType,
-                      ReductionAxes>(context, out_wrapper, inp_wrapper, 3, dim0,
-                                     dim1, dim2, 2, constants.kOne, Functor());
-}
-
-#define DEFINE_GPU_ARG_OPS(T)                                              \
-  template void DoGpuArgOp<T, int64, true>(OpKernelContext * context,      \
-                                           const Tensor& input, int axis,  \
-                                           Tensor* output);                \
-  template void DoGpuArgOp<T, int64, false>(OpKernelContext * context,     \
-                                            const Tensor& input, int axis, \
-                                            Tensor* output);               \
-  template void DoGpuArgOp<T, int32, true>(OpKernelContext * context,      \
-                                           const Tensor& input, int axis,  \
-                                           Tensor* output);                \
-  template void DoGpuArgOp<T, int32, false>(OpKernelContext * context,     \
-                                            const Tensor& input, int axis, \
-                                            Tensor* output);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_ARG_OPS);
-
 #define DEFINE_GPU_SPEC(T)                               \
   template struct functor::ArgMax<GPUDevice, T, int64>;  \
   template struct functor::ArgMin<GPUDevice, T, int64>;  \
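Two pieces of index arithmetic in the deleted DoGpuArgOp are easy to verify on the CPU: the input is viewed as a 3-D tensor {dim0, dim1, dim2} built around the reduction axis, and the axis coordinate is recovered from a flattened key as (key / dim2) % dim1, which is what OutputConverter did. The sketch below is a sequential stand-in of my own (plain C++, no CUDA or cub) that reproduces both steps:

#include <cstdio>
#include <vector>

using Index = long long;

int main() {
  // A 4-D tensor of shape {2, 3, 4, 5} reduced over axis = 1 collapses to
  // {dim0, dim1, dim2} = {2, 3, 20}: the dims left of the axis, the reduced
  // axis itself, and the dims right of the axis.
  const Index dims[] = {2, 3, 4, 5};
  const int axis = 1;
  Index dim0 = 1, dim2 = 1;
  for (int i = 0; i < axis; ++i) dim0 *= dims[i];
  const Index dim1 = dims[axis];
  for (int i = axis + 1; i < 4; ++i) dim2 *= dims[i];

  std::vector<float> input(dim0 * dim1 * dim2);
  for (Index i = 0; i < Index(input.size()); ++i)
    input[i] = float((i * 2654435761u) % 1000);  // arbitrary test values

  // Sequential stand-in for the KeyValuePair reduction: for each (d0, d2)
  // pair, track the flattened index of the running max over d1, then map the
  // winning key back to a coordinate along the reduced axis.
  for (Index d0 = 0; d0 < dim0; ++d0) {
    for (Index d2 = 0; d2 < dim2; ++d2) {
      Index best_key = d0 * dim1 * dim2 + d2;  // flattened (d0, 0, d2)
      for (Index d1 = 1; d1 < dim1; ++d1) {
        const Index key = (d0 * dim1 + d1) * dim2 + d2;
        if (input[key] > input[best_key]) best_key = key;
      }
      const Index arg = (best_key / dim2) % dim1;  // OutputConverter's formula
      if (d0 == 0 && d2 < 3)  // print a few entries
        std::printf("argmax[%lld][%lld] = %lld\n", d0, d2, arg);
    }
  }
  return 0;
}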
tensorflow/python/kernel_tests/argmax_op_test.py
@@ -21,15 +21,10 @@ import functools
 
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -74,7 +69,7 @@ class ArgMaxTest(test.TestCase):
     self._testBothArg(math_ops.argmin, x, 0, x.argmin())
 
   def _testDim(self, dtype):
-    shape = (3, 2, 4, 1, 5, 3, 2)
+    shape = (3, 2, 4, 5, 6, 3, 7)
     x = np.arange(functools.reduce(lambda x, y: x * y, shape), dtype=dtype)
     np.random.shuffle(x)
     x = x.reshape(shape)
@@ -84,17 +79,9 @@ class ArgMaxTest(test.TestCase):
       self._testBothArg(math_ops.argmax, x, axis, x.argmax(axis))
       self._testBothArg(math_ops.argmin, x, axis, x.argmin(axis))
 
-  def _testLargeOutput(self, dtype):
-    # Test case where output size is greater than 1024, which uses a different
-    # codepath on the GPU.
-    x = np.asarray(100 * np.random.randn(11, 10, 5, 11), dtype=dtype)
-    self._testBothArg(math_ops.argmax, x, 2, x.argmax(2))
-    self._testBothArg(math_ops.argmin, x, 2, x.argmin(2))
-
   def testFloat(self):
     self._testBasic(np.float32)
     self._testDim(np.float32)
-    self._testLargeOutput(np.float32)
 
   def testFloatInt32Output(self):
     x = np.asarray(100 * np.random.randn(200), dtype=np.float32)
@@ -116,12 +103,6 @@ class ArgMaxTest(test.TestCase):
   def testDouble(self):
     self._testBasic(np.float64)
     self._testDim(np.float64)
-    self._testLargeOutput(np.float64)
-
-  def testHalf(self):
-    self._testBasic(np.float16)
-    self._testDim(np.float16)
-    self._testLargeOutput(np.float16)
 
   def testInt32(self):
     self._testBasic(np.int32)
@@ -153,41 +134,5 @@ class ArgMaxTest(test.TestCase):
     self.assertEqual(ret.shape, (1, 0))
 
 
-class ArgMaxBenchmark(test.Benchmark):
-
-  def _RunSingleBenchmark(self, shape, dtype, bench_name):
-    with session.Session(config=benchmark.benchmark_config()) as sess:
-      num_dims = len(shape)
-      var = variables.Variable(random_ops.random_uniform(shape, dtype=dtype))
-      variables.variables_initializer([var]).run()
-      for dim in range(num_dims):
-        num_ops_in_group = 15
-        op = control_flow_ops.group(*(math_ops.argmax(var, dimension=dim)
-                                      for _ in range(num_ops_in_group)))
-        op_name = "%s_%s_dim%d" % (bench_name, dtype.name, dim)
-        num_bytes = num_ops_in_group * np.prod(shape) * dtype.size
-        self.run_op_benchmark(sess, op, burn_iters=5, min_iters=20,
-                              name=op_name, mbs=num_bytes / 1e6)
-
-  def _runBenchmarksWithDtype(self, dtype):
-    self._RunSingleBenchmark((2**17,), dtype, "1d")
-    self._RunSingleBenchmark((2**13, 2**13), dtype, "square_2d")
-    self._RunSingleBenchmark((2**5, 2**16), dtype, "rectangle1_2d")
-    self._RunSingleBenchmark((2**16, 2**5), dtype, "rectangle2_2d")
-    self._RunSingleBenchmark((2**8, 2**8, 2**8), dtype, "cube_3d")
-    self._RunSingleBenchmark((2**16, 2**5, 2**5), dtype, "rectangle1_3d")
-    self._RunSingleBenchmark((2**5, 2**16, 2**5), dtype, "rectangle2_3d")
-    self._RunSingleBenchmark((2**5, 2**5, 2**16), dtype, "rectangle3_3d")
-
-  def benchmarkFloat(self):
-    self._runBenchmarksWithDtype(dtypes.float32)
-
-  def benchmarkDouble(self):
-    self._runBenchmarksWithDtype(dtypes.float64)
-
-  def benchmarkHalf(self):
-    self._runBenchmarksWithDtype(dtypes.float16)
-
-
 if __name__ == "__main__":
   test.main()
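For reference, the removed _testLargeOutput was sized to cross the custom kernel's cutoff: reducing a (11, 10, 5, 11) input over axis 2 leaves 11 * 10 * 11 = 1210 output elements, just above the 1024 threshold in CustomArgOp<GPUDevice>, so the test exercised the fallback (Eigen) codepath rather than the custom one. A throwaway check of that arithmetic:

#include <cstdio>

int main() {
  const int shape[] = {11, 10, 5, 11};
  const int axis = 2;
  long long out_elems = 1;
  for (int i = 0; i < 4; ++i)
    if (i != axis) out_elems *= shape[i];
  // 11 * 10 * 11 = 1210 > 1024: the custom kernel declines this shape.
  std::printf("output elements: %lld\n", out_elems);
}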