Internal change

PiperOrigin-RevId: 292220288
Change-Id: Ib7e23f56f7b79174669d10ae7d938a82d9c19900
A. Unique TensorFlower 2020-01-29 14:34:30 -08:00 committed by TensorFlower Gardener
parent c094952fca
commit 58b1c0f401
5 changed files with 4 additions and 239 deletions

tensorflow/core/kernels/BUILD

@@ -3913,7 +3913,7 @@ tf_kernel_library(
tf_kernel_library(
name = "argmax_op",
prefix = "argmax_op",
deps = MATH_DEPS + if_cuda_or_rocm([":reduction_ops"]),
deps = MATH_DEPS,
)
tf_kernel_library(

tensorflow/core/kernels/argmax_op.cc

@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/math_ops.cc.
#define EIGEN_USE_THREADS
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
@@ -39,39 +41,6 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
template <typename Device>
struct CustomArgOp;
template <>
struct CustomArgOp<CPUDevice> {
// Determines whether the custom kernel in argmax_op_gpu.cu.cc should be
// used and, if so, runs it by calling DoGpuArgOp. Returns true if the custom
// kernel was run; otherwise returns false and the caller must compute the
// arg min or max itself.
template <typename T, typename Tout, typename ArgFunctor>
static bool CustomArgFunc(OpKernelContext* context, const Tensor& input,
int axis, Tensor* output) {
return false;
}
};
template <>
struct CustomArgOp<GPUDevice> {
template <typename T, typename Tout, typename ArgFunctor>
static bool CustomArgFunc(OpKernelContext* context, const Tensor& input,
int axis, Tensor* output) {
if (output->NumElements() <= 1024 || output->dims() > 7) {
// The custom kernel is faster than Eigen when the number of output
// elements is relatively small. The Eigen path also only handles up to
// 7 dimensions, so higher-rank outputs must use the custom kernel.
DoGpuArgOp<T, Tout, ArgFunctor::is_argmax>(context, input, axis, output);
return true;
} else {
return false;
}
}
};
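The dispatch rule above is easy to state on its own. A minimal Python sketch of the predicate (illustration only; uses_custom_gpu_kernel is a hypothetical name, not a TensorFlow API):

def uses_custom_gpu_kernel(output_num_elements, output_rank):
    # The cub-based kernel wins for small outputs; the Eigen path only
    # supports up to 7 dimensions, so higher ranks must also use it.
    return output_num_elements <= 1024 or output_rank > 7

assert uses_custom_gpu_kernel(32 * 32, 2)      # 1024 elements: custom kernel
assert not uses_custom_gpu_kernel(33 * 32, 2)  # 1056 elements: Eigen path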
template <typename Device, typename T, typename Tout, typename ArgFunctor>
class ArgOp : public OpKernel {
public:
@@ -112,11 +81,6 @@ class ArgOp : public OpKernel {
return;
}
if (CustomArgOp<Device>::template CustomArgFunc<T, Tout, ArgFunctor>(
context, input, axis, output)) {
return;
}
#define HANDLE_DIM(NDIM) \
case NDIM: \
ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(), \

tensorflow/core/kernels/argmax_op.h

@@ -18,8 +18,6 @@ limitations under the License.
// Generator definition for ArgMaxOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
@@ -45,7 +43,6 @@ struct ArgMax {
DECLARE_COMPUTE_SPEC(7);
#undef DECLARE_COMPUTE_SPEC
enum { is_argmax = true };
};
template <typename Device, typename T, typename Tout>
@@ -66,15 +63,10 @@ struct ArgMin {
DECLARE_COMPUTE_SPEC(7);
#undef DECLARE_COMPUTE_SPEC
enum { is_argmax = false };
};
} // namespace functor
template <typename T, typename Tout, bool is_argmax>
void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis,
Tensor* output);
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_

tensorflow/core/kernels/argmax_op_gpu.cu.cc

@@ -20,147 +20,11 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/argmax_op.h"
#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops_common.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
typedef tensorflow::TTypes<float>::Tensor::Index Index;
// To compute the argmax/argmin, we perform a reduction on KeyValuePairs, which
// are (flattened index, value) pairs.
template <typename T>
using KeyValuePair = cub::KeyValuePair<Index, T>;
namespace {
template <typename T, bool is_argmax>
struct MaxOrMinFunc;
// The reduction operator: Returns the KeyValuePair with the highest or lowest
// value.
template <typename T>
struct MaxOrMinFunc<T, true> {
__host__ __device__ __forceinline__ KeyValuePair<T> operator()(
const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) {
// If one value is NaN, we choose the other value. This behavior is not
// guaranteed by the op and may change in the future.
return (lhs.value > rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs
: rhs;
}
};
template <typename T>
struct MaxOrMinFunc<T, false> {
__host__ __device__ __forceinline__ KeyValuePair<T> operator()(
const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) {
return (lhs.value < rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs
: rhs;
}
};
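The pairwise rule can be checked outside CUDA. A minimal Python sketch of the argmax variant (illustration only; argmax_pair is a hypothetical name):

import math
from functools import reduce

def argmax_pair(lhs, rhs):
    # Mirrors MaxOrMinFunc<T, true>: keep lhs on a strictly greater value,
    # and also keep it when rhs is NaN.
    return lhs if (lhs[1] > rhs[1] or math.isnan(rhs[1])) else rhs

values = [3.0, 7.5, float("nan"), 2.0]
print(reduce(argmax_pair, enumerate(values)))  # (1, 7.5): the NaN is skipped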
// The output converter: converts a KeyValuePair into an index into a
// specific dimension. dim1 is the size of the dimension being reduced. dim2
// is the size of the dimension(s) after dim1.
template <typename T, typename Tout>
struct OutputConverter {
OutputConverter(Index dim1, Index dim2) : dim1_(dim1), dim2_(dim2) {}
__host__ __device__ __forceinline__ Tout
operator()(const KeyValuePair<T>& key_value_pair) const {
return static_cast<Tout>((key_value_pair.key / dim2_) % dim1_);
}
Index dim1_;
Index dim2_;
};
} // namespace
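The index arithmetic in OutputConverter can be sanity-checked with numpy: the key is a flat offset into the (dim0, dim1, dim2) view, and integer division by dim2 followed by modulo dim1 recovers the coordinate along the reduced middle axis. A short sketch (illustration only):

import numpy as np

dim0, dim1, dim2 = 3, 5, 4
flat = np.arange(dim0 * dim1 * dim2)  # every flat offset into the 3-D view
i0, i1, i2 = np.unravel_index(flat, (dim0, dim1, dim2))
assert np.array_equal((flat // dim2) % dim1, i1)  # (key / dim2) % dim1 == i1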
namespace functor {
namespace reduction_op_helper {
// Template specializations of IdentityValue, returning the identity value
// for the reduction. This is needed by ReduceImpl, which we call below. We
// return (0, -inf) for argmax and (0, inf) for argmin.
template <typename T>
struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, true>> {
KeyValuePair<T> operator()() {
return {0, -std::numeric_limits<T>::infinity()};
}
};
template <typename T>
struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, false>> {
KeyValuePair<T> operator()() {
return {0, std::numeric_limits<T>::infinity()};
}
};
} // namespace reduction_op_helper
} // namespace functor
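These identities are safe because -inf loses every max comparison and +inf loses every min comparison, so a padded identity pair can never be selected over a real element. A quick check (illustration only):

for v in (-1e30, 0.0, 3.5):
    assert max(float("-inf"), v) == v  # argmax identity value never wins
    assert min(float("inf"), v) == v   # argmin identity value never wins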
template <typename T, typename Tout, bool is_argmax>
void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis,
Tensor* output) {
// We collapse adjacent axes of the input tensor in order to view it as a
// 3-dimensional tensor. The reduction axis is not collapsed, so the three new
// axes will be the input axes to the left of the reduction axis, the
// reduction axis, and the input axes to the right of the reduction axis.
Index dim0 = 1;
for (Index i = 0; i < axis; i++) {
dim0 *= input.dim_size(i);
}
Index dim1 = input.dim_size(axis);
Index dim2 = 1;
for (Index i = axis + 1; i < input.dims(); i++) {
dim2 *= input.dim_size(i);
}
DCHECK_EQ(dim0 * dim1 * dim2, input.NumElements());
auto inp = input.shaped<T, 3>({dim0, dim1, dim2});
auto out = output->shaped<Tout, 2>({dim0, dim2});
// We call ReduceImpl to perform the reduction. The input iterator returns
// KeyValuePairs. The reduction functor returns the KeyValuePair with the max
// or min value. The output iterator converts the KeyValuePair into an index
// into dim1.
using InputIterType = cub::ArgIndexInputIterator<const T*>;
using Functor = MaxOrMinFunc<T, is_argmax>;
using OutputIterType =
TransformOutputIterator<Tout, KeyValuePair<T>, OutputConverter<T, Tout>>;
InputIterType inp_wrapper(inp.data());
OutputIterType out_wrapper(out.data(), OutputConverter<T, Tout>(dim1, dim2));
typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
Constants<GPUDevice> constants;
// TODO(reedwm): We can probably improve performance by writing specialized
// argmax kernels instead of relying on the generic ReduceImpl function.
functor::ReduceImpl<KeyValuePair<T>, Functor, OutputIterType, InputIterType,
ReductionAxes>(context, out_wrapper, inp_wrapper, 3, dim0,
dim1, dim2, 2, constants.kOne, Functor());
}
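The collapse has a direct numpy analogue: argmax over axis of an arbitrary-rank array equals argmax over axis 1 of the (dim0, dim1, dim2) view, reshaped back. A sketch of that equivalence (illustration only, not the TF implementation):

import numpy as np

x = np.random.randn(2, 3, 4, 5)
axis = 2
dim0 = int(np.prod(x.shape[:axis]))      # axes left of the reduction axis
dim1 = x.shape[axis]                     # the reduction axis itself
dim2 = int(np.prod(x.shape[axis + 1:]))  # axes right of the reduction axis
collapsed = x.reshape(dim0, dim1, dim2).argmax(axis=1)
assert np.array_equal(
    collapsed.reshape(x.shape[:axis] + x.shape[axis + 1:]),
    x.argmax(axis=axis))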
#define DEFINE_GPU_ARG_OPS(T) \
template void DoGpuArgOp<T, int64, true>(OpKernelContext * context, \
const Tensor& input, int axis, \
Tensor* output); \
template void DoGpuArgOp<T, int64, false>(OpKernelContext * context, \
const Tensor& input, int axis, \
Tensor* output); \
template void DoGpuArgOp<T, int32, true>(OpKernelContext * context, \
const Tensor& input, int axis, \
Tensor* output); \
template void DoGpuArgOp<T, int32, false>(OpKernelContext * context, \
const Tensor& input, int axis, \
Tensor* output);
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_ARG_OPS);
#define DEFINE_GPU_SPEC(T) \
template struct functor::ArgMax<GPUDevice, T, int64>; \
template struct functor::ArgMin<GPUDevice, T, int64>; \

tensorflow/python/kernel_tests/argmax_op_test.py

@@ -21,15 +21,10 @@ import functools
import numpy as np
from tensorflow.python.client import session
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import benchmark
from tensorflow.python.platform import test
@@ -74,7 +69,7 @@ class ArgMaxTest(test.TestCase):
self._testBothArg(math_ops.argmin, x, 0, x.argmin())
def _testDim(self, dtype):
shape = (3, 2, 4, 1, 5, 3, 2)
shape = (3, 2, 4, 5, 6, 3, 7)
x = np.arange(functools.reduce(lambda x, y: x * y, shape), dtype=dtype)
np.random.shuffle(x)
x = x.reshape(shape)
@@ -84,17 +79,9 @@ class ArgMaxTest(test.TestCase):
self._testBothArg(math_ops.argmax, x, axis, x.argmax(axis))
self._testBothArg(math_ops.argmin, x, axis, x.argmin(axis))
def _testLargeOutput(self, dtype):
# Test case where the output size is greater than 1024, which uses a
# different (non-custom-kernel) codepath on the GPU.
x = np.asarray(100 * np.random.randn(11, 10, 5, 11), dtype=dtype)
self._testBothArg(math_ops.argmax, x, 2, x.argmax(2))
self._testBothArg(math_ops.argmin, x, 2, x.argmin(2))
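For concreteness, reducing shape (11, 10, 5, 11) over axis 2 leaves 11 * 10 * 11 = 1210 output elements, just over the 1024 threshold. A quick check (illustration only):

import numpy as np

x = np.zeros((11, 10, 5, 11))
assert x.argmax(2).size == 11 * 10 * 11 == 1210  # > 1024: Eigen codepath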
def testFloat(self):
self._testBasic(np.float32)
self._testDim(np.float32)
self._testLargeOutput(np.float32)
def testFloatInt32Output(self):
x = np.asarray(100 * np.random.randn(200), dtype=np.float32)
@@ -116,12 +103,6 @@ class ArgMaxTest(test.TestCase):
def testDouble(self):
self._testBasic(np.float64)
self._testDim(np.float64)
self._testLargeOutput(np.float64)
def testHalf(self):
self._testBasic(np.float16)
self._testDim(np.float16)
self._testLargeOutput(np.float16)
def testInt32(self):
self._testBasic(np.int32)
@@ -153,41 +134,5 @@ class ArgMaxTest(test.TestCase):
self.assertEqual(ret.shape, (1, 0))
class ArgMaxBenchmark(test.Benchmark):
def _RunSingleBenchmark(self, shape, dtype, bench_name):
with session.Session(config=benchmark.benchmark_config()) as sess:
num_dims = len(shape)
var = variables.Variable(random_ops.random_uniform(shape, dtype=dtype))
variables.variables_initializer([var]).run()
for dim in range(num_dims):
num_ops_in_group = 15
op = control_flow_ops.group(*(math_ops.argmax(var, dimension=dim)
for _ in range(num_ops_in_group)))
op_name = "%s_%s_dim%d" % (bench_name, dtype.name, dim)
num_bytes = num_ops_in_group * np.prod(shape) * dtype.size
self.run_op_benchmark(sess, op, burn_iters=5, min_iters=20,
name=op_name, mbs=num_bytes / 1e6)
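The mbs figure reported here is num_ops_in_group * prod(shape) * dtype.size bytes per iteration. For example (illustration only), the square_2d float32 case below works out to roughly 4 GB per iteration:

num_bytes = 15 * (2**13 * 2**13) * 4  # 15 grouped argmax ops, 4 B/element
print(num_bytes / 1e6)                # ~4026.5 MB per benchmark iteration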
def _runBenchmarksWithDtype(self, dtype):
self._RunSingleBenchmark((2**17,), dtype, "1d")
self._RunSingleBenchmark((2**13, 2**13), dtype, "square_2d")
self._RunSingleBenchmark((2**5, 2**16), dtype, "rectangle1_2d")
self._RunSingleBenchmark((2**16, 2**5), dtype, "rectangle2_2d")
self._RunSingleBenchmark((2**8, 2**8, 2**8), dtype, "cube_3d")
self._RunSingleBenchmark((2**16, 2**5, 2**5), dtype, "rectangle1_3d")
self._RunSingleBenchmark((2**5, 2**16, 2**5), dtype, "rectangle2_3d")
self._RunSingleBenchmark((2**5, 2**5, 2**16), dtype, "rectangle3_3d")
def benchmarkFloat(self):
self._runBenchmarksWithDtype(dtypes.float32)
def benchmarkDouble(self):
self._runBenchmarksWithDtype(dtypes.float64)
def benchmarkHalf(self):
self._runBenchmarksWithDtype(dtypes.float16)
if __name__ == "__main__":
test.main()