Internal change
PiperOrigin-RevId: 292220288 Change-Id: Ib7e23f56f7b79174669d10ae7d938a82d9c19900
This commit is contained in:
		
							parent
							
								
									c094952fca
								
							
						
					
					
						commit
						58b1c0f401
					
				| @ -3913,7 +3913,7 @@ tf_kernel_library( | ||||
| tf_kernel_library( | ||||
|     name = "argmax_op", | ||||
|     prefix = "argmax_op", | ||||
|     deps = MATH_DEPS + if_cuda_or_rocm([":reduction_ops"]), | ||||
|     deps = MATH_DEPS, | ||||
| ) | ||||
| 
 | ||||
| tf_kernel_library( | ||||
|  | ||||
| @ -13,6 +13,8 @@ See the License for the specific language governing permissions and | ||||
| limitations under the License. | ||||
| ==============================================================================*/ | ||||
| 
 | ||||
| // See docs in ../ops/math_ops.cc.
 | ||||
| 
 | ||||
| #define EIGEN_USE_THREADS | ||||
| 
 | ||||
| #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ | ||||
| @ -39,39 +41,6 @@ namespace tensorflow { | ||||
| typedef Eigen::ThreadPoolDevice CPUDevice; | ||||
| typedef Eigen::GpuDevice GPUDevice; | ||||
| 
 | ||||
| template <typename Device> | ||||
| struct CustomArgOp; | ||||
| 
 | ||||
| template <> | ||||
| struct CustomArgOp<CPUDevice> { | ||||
|   template <typename T, typename Tout, typename ArgFunctor> | ||||
|   // Determines whether the custom kernel in argmax_op_gpu.cu.cc should be
 | ||||
|   // used, and if so, runs it by calling DoGpuArgOp. If it was run,
 | ||||
|   // returns true. Otherwise, it returns false and the caller must calculate the
 | ||||
|   // arg min or max itself.
 | ||||
|   static bool CustomArgFunc(OpKernelContext* context, const Tensor& input, | ||||
|                             int axis, Tensor* output) { | ||||
|     return false; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <> | ||||
| struct CustomArgOp<GPUDevice> { | ||||
|   template <typename T, typename Tout, typename ArgFunctor> | ||||
|   static bool CustomArgFunc(OpKernelContext* context, const Tensor& input, | ||||
|                             int axis, Tensor* output) { | ||||
|     if (output->NumElements() <= 1024 || output->dims() > 7) { | ||||
|       // The custom kernel is faster than Eigen when the number of output
 | ||||
|       // elements is relatively small. We also only handle the Eigen case for up
 | ||||
|       // to 7 dimensions.
 | ||||
|       DoGpuArgOp<T, Tout, ArgFunctor::is_argmax>(context, input, axis, output); | ||||
|       return true; | ||||
|     } else { | ||||
|       return false; | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <typename Device, typename T, typename Tout, typename ArgFunctor> | ||||
| class ArgOp : public OpKernel { | ||||
|  public: | ||||
| @ -112,11 +81,6 @@ class ArgOp : public OpKernel { | ||||
|       return; | ||||
|     } | ||||
| 
 | ||||
|     if (CustomArgOp<Device>::template CustomArgFunc<T, Tout, ArgFunctor>( | ||||
|             context, input, axis, output)) { | ||||
|       return; | ||||
|     } | ||||
| 
 | ||||
| #define HANDLE_DIM(NDIM)                                        \ | ||||
|   case NDIM:                                                    \ | ||||
|     ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),   \ | ||||
|  | ||||
| @ -18,8 +18,6 @@ limitations under the License. | ||||
| // Generator definition for ArgMaxOp, must be compilable by nvcc.
 | ||||
| 
 | ||||
| #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" | ||||
| #include "tensorflow/core/framework/op_kernel.h" | ||||
| #include "tensorflow/core/framework/tensor.h" | ||||
| #include "tensorflow/core/framework/tensor_types.h" | ||||
| #include "tensorflow/core/platform/types.h" | ||||
| 
 | ||||
| @ -45,7 +43,6 @@ struct ArgMax { | ||||
|   DECLARE_COMPUTE_SPEC(7); | ||||
| 
 | ||||
| #undef DECLARE_COMPUTE_SPEC | ||||
|   enum { is_argmax = true }; | ||||
| }; | ||||
| 
 | ||||
| template <typename Device, typename T, typename Tout> | ||||
| @ -66,15 +63,10 @@ struct ArgMin { | ||||
|   DECLARE_COMPUTE_SPEC(7); | ||||
| 
 | ||||
| #undef DECLARE_COMPUTE_SPEC | ||||
|   enum { is_argmax = false }; | ||||
| }; | ||||
| 
 | ||||
| }  // namespace functor
 | ||||
| 
 | ||||
| template <typename T, typename Tout, bool is_argmax> | ||||
| void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis, | ||||
|                 Tensor* output); | ||||
| 
 | ||||
| }  // namespace tensorflow
 | ||||
| 
 | ||||
| #endif  // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_
 | ||||
|  | ||||
| @ -20,147 +20,11 @@ limitations under the License. | ||||
| 
 | ||||
| #include "tensorflow/core/framework/register_types.h" | ||||
| #include "tensorflow/core/kernels/argmax_op.h" | ||||
| #include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h" | ||||
| #include "tensorflow/core/kernels/reduction_ops_common.h" | ||||
| 
 | ||||
| namespace tensorflow { | ||||
| 
 | ||||
| typedef Eigen::GpuDevice GPUDevice; | ||||
| 
 | ||||
| typedef tensorflow::TTypes<float>::Tensor::Index Index; | ||||
| 
 | ||||
| // To compute the argmax/argmin, we perform a reduction on KeyValuePairs, which
 | ||||
| // are (flattened index, value) pairs.
 | ||||
| template <typename T> | ||||
| using KeyValuePair = cub::KeyValuePair<Index, T>; | ||||
| 
 | ||||
| namespace { | ||||
| 
 | ||||
| template <typename T, bool is_argmax> | ||||
| struct MaxOrMinFunc; | ||||
| 
 | ||||
| // The reduction operator: Returns the KeyValuePair with the highest or lowest
 | ||||
| // value.
 | ||||
| template <typename T> | ||||
| struct MaxOrMinFunc<T, true> { | ||||
|   __host__ __device__ __forceinline__ KeyValuePair<T> operator()( | ||||
|       const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) { | ||||
|     // If one value is NaN, we choose the other value. This behavior is not
 | ||||
|     // guaranteed by the op and may change in the future.
 | ||||
|     return (lhs.value > rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs | ||||
|                                                                       : rhs; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <typename T> | ||||
| struct MaxOrMinFunc<T, false> { | ||||
|   __host__ __device__ __forceinline__ KeyValuePair<T> operator()( | ||||
|       const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) { | ||||
|     return (lhs.value < rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs | ||||
|                                                                       : rhs; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // The output converter: Converts from a KeyValuePair to an index into a a
 | ||||
| // specific dimension. dim1 is the size of the dimension being reduced. dim2 is
 | ||||
| // the size of the dimension(s) after dim1.
 | ||||
| template <typename T, typename Tout> | ||||
| struct OutputConverter { | ||||
|   OutputConverter(Index dim1, Index dim2) : dim1_(dim1), dim2_(dim2) {} | ||||
| 
 | ||||
|   __host__ __device__ __forceinline__ Tout | ||||
|   operator()(const KeyValuePair<T>& key_value_pair) const { | ||||
|     return static_cast<Tout>((key_value_pair.key / dim2_) % dim1_); | ||||
|   } | ||||
| 
 | ||||
|   Index dim1_; | ||||
|   Index dim2_; | ||||
| }; | ||||
| 
 | ||||
| }  // namespace
 | ||||
| 
 | ||||
| namespace functor { | ||||
| namespace reduction_op_helper { | ||||
| 
 | ||||
| // Template specialization of IdentityValue, to return the identity value for
 | ||||
| // the reduction. This is needed for ReduceImpl, a function we call. We return
 | ||||
| // (0, -inf) for argmax and (0, inf) for argmin.
 | ||||
| template <typename T> | ||||
| struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, true>> { | ||||
|   KeyValuePair<T> operator()() { | ||||
|     return {0, -std::numeric_limits<T>::infinity()}; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <typename T> | ||||
| struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, false>> { | ||||
|   KeyValuePair<T> operator()() { | ||||
|     return {0, std::numeric_limits<T>::infinity()}; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| }  // namespace reduction_op_helper
 | ||||
| }  // namespace functor
 | ||||
| 
 | ||||
| template <typename T, typename Tout, bool is_argmax> | ||||
| void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis, | ||||
|                 Tensor* output) { | ||||
|   // We collapse adjacent axes of the input tensor in order to view it as a
 | ||||
|   // 3 dimensional tensor. The reduction axis is not collapsed, so the three new
 | ||||
|   // axes will be the input axes to the left of the reduction axis, the
 | ||||
|   // reduction axis, and the input axes to the right of the reduction axis.
 | ||||
|   Index dim0 = 1; | ||||
|   for (Index i = 0; i < axis; i++) { | ||||
|     dim0 *= input.dim_size(i); | ||||
|   } | ||||
|   Index dim1 = input.dim_size(axis); | ||||
|   Index dim2 = 1; | ||||
|   for (Index i = axis + 1; i < input.dims(); i++) { | ||||
|     dim2 *= input.dim_size(i); | ||||
|   } | ||||
|   DCHECK_EQ(dim0 * dim1 * dim2, input.NumElements()); | ||||
| 
 | ||||
|   auto inp = input.shaped<T, 3>({dim0, dim1, dim2}); | ||||
|   auto out = output->shaped<Tout, 2>({dim0, dim2}); | ||||
| 
 | ||||
|   // We call ReduceImpl to perform the reduction. The input iterator returns
 | ||||
|   // KeyValuePairs. The reduction functor returns the KeyValuePair with the max
 | ||||
|   // or min value. The output iterator converts the KeyValuePair into an index
 | ||||
|   // into dim1.
 | ||||
|   using InputIterType = cub::ArgIndexInputIterator<const T*>; | ||||
|   using Functor = MaxOrMinFunc<T, is_argmax>; | ||||
|   using OutputIterType = | ||||
|       TransformOutputIterator<Tout, KeyValuePair<T>, OutputConverter<T, Tout>>; | ||||
| 
 | ||||
|   InputIterType inp_wrapper(inp.data()); | ||||
|   OutputIterType out_wrapper(out.data(), OutputConverter<T, Tout>(dim1, dim2)); | ||||
| 
 | ||||
|   typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes; | ||||
|   Constants<GPUDevice> constants; | ||||
| 
 | ||||
|   // TODO(reedwm): We can probably improve performance by writing specialized
 | ||||
|   // argmax kernels instead of relying on the generic ReduceImpl function
 | ||||
|   functor::ReduceImpl<KeyValuePair<T>, Functor, OutputIterType, InputIterType, | ||||
|                       ReductionAxes>(context, out_wrapper, inp_wrapper, 3, dim0, | ||||
|                                      dim1, dim2, 2, constants.kOne, Functor()); | ||||
| } | ||||
| 
 | ||||
| #define DEFINE_GPU_ARG_OPS(T)                                              \ | ||||
|   template void DoGpuArgOp<T, int64, true>(OpKernelContext * context,      \ | ||||
|                                            const Tensor& input, int axis,  \ | ||||
|                                            Tensor* output);                \ | ||||
|   template void DoGpuArgOp<T, int64, false>(OpKernelContext * context,     \ | ||||
|                                             const Tensor& input, int axis, \ | ||||
|                                             Tensor* output);               \ | ||||
|   template void DoGpuArgOp<T, int32, true>(OpKernelContext * context,      \ | ||||
|                                            const Tensor& input, int axis,  \ | ||||
|                                            Tensor* output);                \ | ||||
|   template void DoGpuArgOp<T, int32, false>(OpKernelContext * context,     \ | ||||
|                                             const Tensor& input, int axis, \ | ||||
|                                             Tensor* output); | ||||
| 
 | ||||
| TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_ARG_OPS); | ||||
| 
 | ||||
| #define DEFINE_GPU_SPEC(T)                              \ | ||||
|   template struct functor::ArgMax<GPUDevice, T, int64>; \ | ||||
|   template struct functor::ArgMin<GPUDevice, T, int64>; \ | ||||
|  | ||||
| @ -21,15 +21,10 @@ import functools | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| from tensorflow.python.client import session | ||||
| from tensorflow.python.framework import dtypes | ||||
| from tensorflow.python.framework import test_util | ||||
| from tensorflow.python.ops import array_ops | ||||
| from tensorflow.python.ops import control_flow_ops | ||||
| from tensorflow.python.ops import math_ops | ||||
| from tensorflow.python.ops import random_ops | ||||
| from tensorflow.python.ops import variables | ||||
| from tensorflow.python.platform import benchmark | ||||
| from tensorflow.python.platform import test | ||||
| 
 | ||||
| 
 | ||||
| @ -74,7 +69,7 @@ class ArgMaxTest(test.TestCase): | ||||
|     self._testBothArg(math_ops.argmin, x, 0, x.argmin()) | ||||
| 
 | ||||
|   def _testDim(self, dtype): | ||||
|     shape = (3, 2, 4, 1, 5, 3, 2) | ||||
|     shape = (3, 2, 4, 5, 6, 3, 7) | ||||
|     x = np.arange(functools.reduce(lambda x, y: x * y, shape), dtype=dtype) | ||||
|     np.random.shuffle(x) | ||||
|     x = x.reshape(shape) | ||||
| @ -84,17 +79,9 @@ class ArgMaxTest(test.TestCase): | ||||
|       self._testBothArg(math_ops.argmax, x, axis, x.argmax(axis)) | ||||
|       self._testBothArg(math_ops.argmin, x, axis, x.argmin(axis)) | ||||
| 
 | ||||
|   def _testLargeOutput(self, dtype): | ||||
|     # Test case where output size is greater than 1024, which uses a different | ||||
|     # codepath on the GPU. | ||||
|     x = np.asarray(100 * np.random.randn(11, 10, 5, 11), dtype=dtype) | ||||
|     self._testBothArg(math_ops.argmax, x, 2, x.argmax(2)) | ||||
|     self._testBothArg(math_ops.argmin, x, 2, x.argmin(2)) | ||||
| 
 | ||||
|   def testFloat(self): | ||||
|     self._testBasic(np.float32) | ||||
|     self._testDim(np.float32) | ||||
|     self._testLargeOutput(np.float32) | ||||
| 
 | ||||
|   def testFloatInt32Output(self): | ||||
|     x = np.asarray(100 * np.random.randn(200), dtype=np.float32) | ||||
| @ -116,12 +103,6 @@ class ArgMaxTest(test.TestCase): | ||||
|   def testDouble(self): | ||||
|     self._testBasic(np.float64) | ||||
|     self._testDim(np.float64) | ||||
|     self._testLargeOutput(np.float64) | ||||
| 
 | ||||
|   def testHalf(self): | ||||
|     self._testBasic(np.float16) | ||||
|     self._testDim(np.float16) | ||||
|     self._testLargeOutput(np.float16) | ||||
| 
 | ||||
|   def testInt32(self): | ||||
|     self._testBasic(np.int32) | ||||
| @ -153,41 +134,5 @@ class ArgMaxTest(test.TestCase): | ||||
|         self.assertEqual(ret.shape, (1, 0)) | ||||
| 
 | ||||
| 
 | ||||
| class ArgMaxBenchmark(test.Benchmark): | ||||
| 
 | ||||
|   def _RunSingleBenchmark(self, shape, dtype, bench_name): | ||||
|     with session.Session(config=benchmark.benchmark_config()) as sess: | ||||
|       num_dims = len(shape) | ||||
|       var = variables.Variable(random_ops.random_uniform(shape, dtype=dtype)) | ||||
|       variables.variables_initializer([var]).run() | ||||
|       for dim in range(num_dims): | ||||
|         num_ops_in_group = 15 | ||||
|         op = control_flow_ops.group(*(math_ops.argmax(var, dimension=dim) | ||||
|                                       for _ in range(num_ops_in_group))) | ||||
|         op_name = "%s_%s_dim%d" % (bench_name, dtype.name, dim) | ||||
|         num_bytes = num_ops_in_group * np.prod(shape) * dtype.size | ||||
|         self.run_op_benchmark(sess, op, burn_iters=5, min_iters=20, | ||||
|                               name=op_name, mbs=num_bytes / 1e6) | ||||
| 
 | ||||
|   def _runBenchmarksWithDtype(self, dtype): | ||||
|     self._RunSingleBenchmark((2**17,), dtype, "1d") | ||||
|     self._RunSingleBenchmark((2**13, 2**13), dtype, "square_2d") | ||||
|     self._RunSingleBenchmark((2**5, 2**16), dtype, "rectangle1_2d") | ||||
|     self._RunSingleBenchmark((2**16, 2**5), dtype, "rectangle2_2d") | ||||
|     self._RunSingleBenchmark((2**8, 2**8, 2**8), dtype, "cube_3d") | ||||
|     self._RunSingleBenchmark((2**16, 2**5, 2**5), dtype, "rectangle1_3d") | ||||
|     self._RunSingleBenchmark((2**5, 2**16, 2**5), dtype, "rectangle2_3d") | ||||
|     self._RunSingleBenchmark((2**5, 2**5, 2**16), dtype, "rectangle3_3d") | ||||
| 
 | ||||
|   def benchmarkFloat(self): | ||||
|     self._runBenchmarksWithDtype(dtypes.float32) | ||||
| 
 | ||||
|   def benchmarkDouble(self): | ||||
|     self._runBenchmarksWithDtype(dtypes.float64) | ||||
| 
 | ||||
|   def benchmarkHalf(self): | ||||
|     self._runBenchmarksWithDtype(dtypes.float16) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|   test.main() | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user