Internal change

PiperOrigin-RevId: 292220288
Change-Id: Ib7e23f56f7b79174669d10ae7d938a82d9c19900

parent c094952fca
commit 58b1c0f401
tensorflow/core/kernels/BUILD
@@ -3913,7 +3913,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "argmax_op",
     prefix = "argmax_op",
-    deps = MATH_DEPS + if_cuda_or_rocm([":reduction_ops"]),
+    deps = MATH_DEPS,
 )
 
 tf_kernel_library(
tensorflow/core/kernels/argmax_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// See docs in ../ops/math_ops.cc.
+
 #define EIGEN_USE_THREADS
 
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
@@ -39,39 +41,6 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device>
-struct CustomArgOp;
-
-template <>
-struct CustomArgOp<CPUDevice> {
-  template <typename T, typename Tout, typename ArgFunctor>
-  // Determines whether the custom kernel in argmax_op_gpu.cu.cc should be
-  // used, and if so, runs it by calling DoGpuArgOp. If it was run,
-  // returns true. Otherwise, it returns false and the caller must calculate the
-  // arg min or max itself.
-  static bool CustomArgFunc(OpKernelContext* context, const Tensor& input,
-                            int axis, Tensor* output) {
-    return false;
-  }
-};
-
-template <>
-struct CustomArgOp<GPUDevice> {
-  template <typename T, typename Tout, typename ArgFunctor>
-  static bool CustomArgFunc(OpKernelContext* context, const Tensor& input,
-                            int axis, Tensor* output) {
-    if (output->NumElements() <= 1024 || output->dims() > 7) {
-      // The custom kernel is faster than Eigen when the number of output
-      // elements is relatively small. We also only handle the Eigen case for up
-      // to 7 dimensions.
-      DoGpuArgOp<T, Tout, ArgFunctor::is_argmax>(context, input, axis, output);
-      return true;
-    } else {
-      return false;
-    }
-  }
-};
-
 template <typename Device, typename T, typename Tout, typename ArgFunctor>
 class ArgOp : public OpKernel {
  public:
@@ -112,11 +81,6 @@ class ArgOp : public OpKernel {
       return;
     }
 
-    if (CustomArgOp<Device>::template CustomArgFunc<T, Tout, ArgFunctor>(
-            context, input, axis, output)) {
-      return;
-    }
-
 #define HANDLE_DIM(NDIM)                                       \
   case NDIM:                                                   \
     ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),  \
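The code removed above dispatches by template specialization: ArgOp is generic over Device, the CPUDevice specialization of CustomArgOp always declines, and the GPUDevice specialization decides at runtime whether the custom kernel applies. Below is a minimal standalone sketch of that pattern, assuming nothing from TensorFlow (FakeCpuDevice, FakeGpuDevice, and the vector-based signature are stand-ins of mine, not real types):

#include <iostream>
#include <vector>

// Stand-in device tags; the real code keys on Eigen's ThreadPoolDevice and
// GpuDevice.
struct FakeCpuDevice {};
struct FakeGpuDevice {};

// The primary template is declared but not defined, so instantiating it with
// an unsupported device type fails at compile time instead of silently.
template <typename Device>
struct CustomArgOp;

// CPU specialization: never takes the custom path.
template <>
struct CustomArgOp<FakeCpuDevice> {
  static bool CustomArgFunc(const std::vector<float>&) { return false; }
};

// "GPU" specialization: claims the work only for small outputs, mirroring
// the NumElements() <= 1024 test in the removed code.
template <>
struct CustomArgOp<FakeGpuDevice> {
  static bool CustomArgFunc(const std::vector<float>& out) {
    return out.size() <= 1024;
  }
};

// Device-generic caller, analogous to ArgOp::Compute: try the custom path
// first; if it declines, fall back to the generic implementation.
template <typename Device>
void Compute(const std::vector<float>& out) {
  if (CustomArgOp<Device>::CustomArgFunc(out)) {
    std::cout << "custom kernel handled " << out.size() << " elements\n";
  } else {
    std::cout << "generic path for " << out.size() << " elements\n";
  }
}

int main() {
  std::vector<float> small(100), large(5000);
  Compute<FakeCpuDevice>(small);  // generic: CPU always declines
  Compute<FakeGpuDevice>(small);  // custom: small output
  Compute<FakeGpuDevice>(large);  // generic: above the cutoff
}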
tensorflow/core/kernels/argmax_op.h
@@ -18,8 +18,6 @@ limitations under the License.
 // Generator definition for ArgMaxOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -45,7 +43,6 @@ struct ArgMax {
   DECLARE_COMPUTE_SPEC(7);
 
 #undef DECLARE_COMPUTE_SPEC
-  enum { is_argmax = true };
 };
 
 template <typename Device, typename T, typename Tout>
@@ -66,15 +63,10 @@ struct ArgMin {
   DECLARE_COMPUTE_SPEC(7);
 
 #undef DECLARE_COMPUTE_SPEC
-  enum { is_argmax = false };
 };
 
 }  // namespace functor
 
-template <typename T, typename Tout, bool is_argmax>
-void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis,
-                Tensor* output);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_
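The enum { is_argmax } members removed above let a functor type carry a compile-time flag that callers forward as a non-type template argument, as in DoGpuArgOp<T, Tout, ArgFunctor::is_argmax> in argmax_op.cc. A short sketch of the idiom under stand-in names (not the real functors, which also declare the ReduceN methods):

#include <iostream>

struct ArgMaxFunctor {
  enum { is_argmax = true };
};

struct ArgMinFunctor {
  enum { is_argmax = false };
};

// Stand-in for DoGpuArgOp: max-vs-min is a template parameter, so the choice
// is made at compile time and costs nothing per element at runtime.
template <bool is_argmax>
void DoArgOp() {
  std::cout << (is_argmax ? "argmax" : "argmin") << " kernel\n";
}

// Generic caller: the functor's enum member becomes a bool template argument.
template <typename ArgFunctor>
void Dispatch() {
  DoArgOp<ArgFunctor::is_argmax>();
}

int main() {
  Dispatch<ArgMaxFunctor>();  // prints "argmax kernel"
  Dispatch<ArgMinFunctor>();  // prints "argmin kernel"
}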
tensorflow/core/kernels/argmax_op_gpu.cu.cc
@@ -20,147 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/argmax_op.h"
-#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
-#include "tensorflow/core/kernels/reduction_ops_common.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-typedef tensorflow::TTypes<float>::Tensor::Index Index;
-
-// To compute the argmax/argmin, we perform a reduction on KeyValuePairs, which
-// are (flattened index, value) pairs.
-template <typename T>
-using KeyValuePair = cub::KeyValuePair<Index, T>;
-
-namespace {
-
-template <typename T, bool is_argmax>
-struct MaxOrMinFunc;
-
-// The reduction operator: Returns the KeyValuePair with the highest or lowest
-// value.
-template <typename T>
-struct MaxOrMinFunc<T, true> {
-  __host__ __device__ __forceinline__ KeyValuePair<T> operator()(
-      const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) {
-    // If one value is NaN, we choose the other value. This behavior is not
-    // guaranteed by the op and may change in the future.
-    return (lhs.value > rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs
-                                                                      : rhs;
-  }
-};
-
-template <typename T>
-struct MaxOrMinFunc<T, false> {
-  __host__ __device__ __forceinline__ KeyValuePair<T> operator()(
-      const KeyValuePair<T>& lhs, const KeyValuePair<T>& rhs) {
-    return (lhs.value < rhs.value || Eigen::numext::isnan(rhs.value)) ? lhs
-                                                                      : rhs;
-  }
-};
-
-// The output converter: Converts from a KeyValuePair to an index into a
-// specific dimension. dim1 is the size of the dimension being reduced. dim2 is
-// the size of the dimension(s) after dim1.
-template <typename T, typename Tout>
-struct OutputConverter {
-  OutputConverter(Index dim1, Index dim2) : dim1_(dim1), dim2_(dim2) {}
-
-  __host__ __device__ __forceinline__ Tout
-  operator()(const KeyValuePair<T>& key_value_pair) const {
-    return static_cast<Tout>((key_value_pair.key / dim2_) % dim1_);
-  }
-
-  Index dim1_;
-  Index dim2_;
-};
-
-}  // namespace
-
-namespace functor {
-namespace reduction_op_helper {
-
-// Template specialization of IdentityValue, to return the identity value for
-// the reduction. This is needed for ReduceImpl, a function we call. We return
-// (0, -inf) for argmax and (0, inf) for argmin.
-template <typename T>
-struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, true>> {
-  KeyValuePair<T> operator()() {
-    return {0, -std::numeric_limits<T>::infinity()};
-  }
-};
-
-template <typename T>
-struct IdentityValue<KeyValuePair<T>, MaxOrMinFunc<T, false>> {
-  KeyValuePair<T> operator()() {
-    return {0, std::numeric_limits<T>::infinity()};
-  }
-};
-
-}  // namespace reduction_op_helper
-}  // namespace functor
-
-template <typename T, typename Tout, bool is_argmax>
-void DoGpuArgOp(OpKernelContext* context, const Tensor& input, int axis,
-                Tensor* output) {
-  // We collapse adjacent axes of the input tensor in order to view it as a
-  // 3 dimensional tensor. The reduction axis is not collapsed, so the three new
-  // axes will be the input axes to the left of the reduction axis, the
-  // reduction axis, and the input axes to the right of the reduction axis.
-  Index dim0 = 1;
-  for (Index i = 0; i < axis; i++) {
-    dim0 *= input.dim_size(i);
-  }
-  Index dim1 = input.dim_size(axis);
-  Index dim2 = 1;
-  for (Index i = axis + 1; i < input.dims(); i++) {
-    dim2 *= input.dim_size(i);
-  }
-  DCHECK_EQ(dim0 * dim1 * dim2, input.NumElements());
-
-  auto inp = input.shaped<T, 3>({dim0, dim1, dim2});
-  auto out = output->shaped<Tout, 2>({dim0, dim2});
-
-  // We call ReduceImpl to perform the reduction. The input iterator returns
-  // KeyValuePairs. The reduction functor returns the KeyValuePair with the max
-  // or min value. The output iterator converts the KeyValuePair into an index
-  // into dim1.
-  using InputIterType = cub::ArgIndexInputIterator<const T*>;
-  using Functor = MaxOrMinFunc<T, is_argmax>;
-  using OutputIterType =
-      TransformOutputIterator<Tout, KeyValuePair<T>, OutputConverter<T, Tout>>;
-
-  InputIterType inp_wrapper(inp.data());
-  OutputIterType out_wrapper(out.data(), OutputConverter<T, Tout>(dim1, dim2));
-
-  typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
-  Constants<GPUDevice> constants;
-
-  // TODO(reedwm): We can probably improve performance by writing specialized
-  // argmax kernels instead of relying on the generic ReduceImpl function.
-  functor::ReduceImpl<KeyValuePair<T>, Functor, OutputIterType, InputIterType,
-                      ReductionAxes>(context, out_wrapper, inp_wrapper, 3, dim0,
-                                     dim1, dim2, 2, constants.kOne, Functor());
-}
-
-#define DEFINE_GPU_ARG_OPS(T)                                              \
-  template void DoGpuArgOp<T, int64, true>(OpKernelContext * context,      \
-                                           const Tensor& input, int axis,  \
-                                           Tensor* output);                \
-  template void DoGpuArgOp<T, int64, false>(OpKernelContext * context,     \
-                                            const Tensor& input, int axis, \
-                                            Tensor* output);               \
-  template void DoGpuArgOp<T, int32, true>(OpKernelContext * context,      \
-                                           const Tensor& input, int axis,  \
-                                           Tensor* output);                \
-  template void DoGpuArgOp<T, int32, false>(OpKernelContext * context,     \
-                                            const Tensor& input, int axis, \
-                                            Tensor* output);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_ARG_OPS);
-
 #define DEFINE_GPU_SPEC(T)                               \
   template struct functor::ArgMax<GPUDevice, T, int64>;  \
   template struct functor::ArgMin<GPUDevice, T, int64>;  \
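Two pieces of index arithmetic in the deleted DoGpuArgOp are easy to verify on the CPU: the input is viewed as a 3-D tensor {dim0, dim1, dim2} built around the reduction axis, and the axis coordinate is recovered from a flattened key as (key / dim2) % dim1, which is what OutputConverter did. The sketch below is a sequential stand-in of my own (plain C++, no CUDA or cub) that reproduces both steps:

#include <cstdio>
#include <vector>

using Index = long long;

int main() {
  // A 4-D tensor of shape {2, 3, 4, 5} reduced over axis = 1 collapses to
  // {dim0, dim1, dim2} = {2, 3, 20}: the dims left of the axis, the reduced
  // axis itself, and the dims right of the axis.
  const Index dims[] = {2, 3, 4, 5};
  const int axis = 1;
  Index dim0 = 1, dim2 = 1;
  for (int i = 0; i < axis; ++i) dim0 *= dims[i];
  const Index dim1 = dims[axis];
  for (int i = axis + 1; i < 4; ++i) dim2 *= dims[i];

  std::vector<float> input(dim0 * dim1 * dim2);
  for (Index i = 0; i < Index(input.size()); ++i)
    input[i] = float((i * 2654435761u) % 1000);  // arbitrary test values

  // Sequential stand-in for the KeyValuePair reduction: for each (d0, d2)
  // pair, track the flattened index of the running max over d1, then map the
  // winning key back to a coordinate along the reduced axis.
  for (Index d0 = 0; d0 < dim0; ++d0) {
    for (Index d2 = 0; d2 < dim2; ++d2) {
      Index best_key = d0 * dim1 * dim2 + d2;  // flattened (d0, 0, d2)
      for (Index d1 = 1; d1 < dim1; ++d1) {
        const Index key = (d0 * dim1 + d1) * dim2 + d2;
        if (input[key] > input[best_key]) best_key = key;
      }
      const Index arg = (best_key / dim2) % dim1;  // OutputConverter's formula
      if (d0 == 0 && d2 < 3)  // print a few entries
        std::printf("argmax[%lld][%lld] = %lld\n", d0, d2, arg);
    }
  }
  return 0;
}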
tensorflow/python/kernel_tests/argmax_op_test.py
@@ -21,15 +21,10 @@ import functools
 
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -74,7 +69,7 @@ class ArgMaxTest(test.TestCase):
     self._testBothArg(math_ops.argmin, x, 0, x.argmin())
 
   def _testDim(self, dtype):
-    shape = (3, 2, 4, 1, 5, 3, 2)
+    shape = (3, 2, 4, 5, 6, 3, 7)
     x = np.arange(functools.reduce(lambda x, y: x * y, shape), dtype=dtype)
     np.random.shuffle(x)
     x = x.reshape(shape)
@@ -84,17 +79,9 @@ class ArgMaxTest(test.TestCase):
       self._testBothArg(math_ops.argmax, x, axis, x.argmax(axis))
       self._testBothArg(math_ops.argmin, x, axis, x.argmin(axis))
 
-  def _testLargeOutput(self, dtype):
-    # Test case where output size is greater than 1024, which uses a different
-    # codepath on the GPU.
-    x = np.asarray(100 * np.random.randn(11, 10, 5, 11), dtype=dtype)
-    self._testBothArg(math_ops.argmax, x, 2, x.argmax(2))
-    self._testBothArg(math_ops.argmin, x, 2, x.argmin(2))
-
   def testFloat(self):
     self._testBasic(np.float32)
     self._testDim(np.float32)
-    self._testLargeOutput(np.float32)
 
   def testFloatInt32Output(self):
     x = np.asarray(100 * np.random.randn(200), dtype=np.float32)
@@ -116,12 +103,6 @@ class ArgMaxTest(test.TestCase):
   def testDouble(self):
     self._testBasic(np.float64)
     self._testDim(np.float64)
-    self._testLargeOutput(np.float64)
-
-  def testHalf(self):
-    self._testBasic(np.float16)
-    self._testDim(np.float16)
-    self._testLargeOutput(np.float16)
 
   def testInt32(self):
     self._testBasic(np.int32)
@@ -153,41 +134,5 @@ class ArgMaxTest(test.TestCase):
     self.assertEqual(ret.shape, (1, 0))
 
 
-class ArgMaxBenchmark(test.Benchmark):
-
-  def _RunSingleBenchmark(self, shape, dtype, bench_name):
-    with session.Session(config=benchmark.benchmark_config()) as sess:
-      num_dims = len(shape)
-      var = variables.Variable(random_ops.random_uniform(shape, dtype=dtype))
-      variables.variables_initializer([var]).run()
-      for dim in range(num_dims):
-        num_ops_in_group = 15
-        op = control_flow_ops.group(*(math_ops.argmax(var, dimension=dim)
-                                      for _ in range(num_ops_in_group)))
-        op_name = "%s_%s_dim%d" % (bench_name, dtype.name, dim)
-        num_bytes = num_ops_in_group * np.prod(shape) * dtype.size
-        self.run_op_benchmark(sess, op, burn_iters=5, min_iters=20,
-                              name=op_name, mbs=num_bytes / 1e6)
-
-  def _runBenchmarksWithDtype(self, dtype):
-    self._RunSingleBenchmark((2**17,), dtype, "1d")
-    self._RunSingleBenchmark((2**13, 2**13), dtype, "square_2d")
-    self._RunSingleBenchmark((2**5, 2**16), dtype, "rectangle1_2d")
-    self._RunSingleBenchmark((2**16, 2**5), dtype, "rectangle2_2d")
-    self._RunSingleBenchmark((2**8, 2**8, 2**8), dtype, "cube_3d")
-    self._RunSingleBenchmark((2**16, 2**5, 2**5), dtype, "rectangle1_3d")
-    self._RunSingleBenchmark((2**5, 2**16, 2**5), dtype, "rectangle2_3d")
-    self._RunSingleBenchmark((2**5, 2**5, 2**16), dtype, "rectangle3_3d")
-
-  def benchmarkFloat(self):
-    self._runBenchmarksWithDtype(dtypes.float32)
-
-  def benchmarkDouble(self):
-    self._runBenchmarksWithDtype(dtypes.float64)
-
-  def benchmarkHalf(self):
-    self._runBenchmarksWithDtype(dtypes.float16)
-
-
 if __name__ == "__main__":
   test.main()
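For reference, the removed _testLargeOutput was sized to cross the custom kernel's cutoff: reducing a (11, 10, 5, 11) input over axis 2 leaves 11 * 10 * 11 = 1210 output elements, just above the 1024 threshold in CustomArgOp<GPUDevice>, so the test exercised the fallback (Eigen) codepath rather than the custom one. A throwaway check of that arithmetic:

#include <cstdio>

int main() {
  const int shape[] = {11, 10, 5, 11};
  const int axis = 2;
  long long out_elems = 1;
  for (int i = 0; i < 4; ++i)
    if (i != axis) out_elems *= shape[i];
  // 11 * 10 * 11 = 1210 > 1024: the custom kernel declines this shape.
  std::printf("output elements: %lld\n", out_elems);
}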