Rename `Segment*ReductionF`->`*ReductionF`

Rename `*OpGPU` to `NonAtomic*OpGpu`

Rename `*OpGPU` to `NonAtomic*OpGpu`

Add REGISTER_KERNEL_BUILDER for int32

Rename `*OpGPU` to `NonAtomic*OpGpu`

Rename `*OpGPU` to `NonAtomic*OpGpu`

Rename `*OpGPU` to `NonAtomic*OpGpu`

Rename `Segment*ReductionF`->`*ReductionF`

Fix compilation problem
qqq.jq 2020-11-10 13:12:44 +08:00 committed by firejqwei
parent f85360f53b
commit 587385e0fd
6 changed files with 77 additions and 51 deletions

View File

@@ -47,7 +47,7 @@ typedef Eigen::GpuDevice GPUDevice;
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename T, typename Index, typename InitialValueF,
-          typename SegmentReductionF, typename SegmentAtomicReductionF>
+          typename ReductionF, typename AtomicReductionF>
 struct SegmentReductionFunctor {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
@@ -71,7 +71,7 @@ struct UnsortedSegmentFunctor {

 // Atomic reduction functors for the gpu.
 template <typename T>
-struct SumAtomicOpGpu {
+struct AtomicSumOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     GpuAtomicAdd(dest, value);
@@ -79,7 +79,7 @@ struct SumAtomicOpGpu {
 };

 template <typename T>
-struct ProdAtomicOpGpu {
+struct AtomicProdOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     GpuAtomicMul(dest, value);
@@ -87,7 +87,7 @@ struct ProdAtomicOpGpu {
 };

 template <typename T>
-struct MaxAtomicOpGpu {
+struct AtomicMaxOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     GpuAtomicMax(dest, value);
@@ -95,16 +95,16 @@ struct MaxAtomicOpGpu {
 };

 template <typename T>
-struct MinAtomicOpGpu {
+struct AtomicMinOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     GpuAtomicMin(dest, value);
   }
 };

-// Reduction functors for the gpu.
+// Non-atomic reduction functors for the gpu.
 template <typename T>
-struct SumOpGpu {
+struct NonAtomicSumOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     *dest += value;
@@ -112,7 +112,7 @@ struct SumOpGpu {
 };

 template <typename T>
-struct ProdOpGpu {
+struct NonAtomicProdOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     *dest *= value;
@@ -120,7 +120,7 @@ struct ProdOpGpu {
 };

 template <typename T>
-struct MaxOpGpu {
+struct NonAtomicMaxOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     *dest = max(*dest, value);
@@ -128,7 +128,7 @@ struct MaxOpGpu {
 };

 template <typename T>
-struct MinOpGpu {
+struct NonAtomicMinOpGpu {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
                                                         const T& value) {
     *dest = min(*dest, value);
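The rename makes the split between the two functor families explicit: a `NonAtomic*OpGpu` functor writes through a plain pointer and is only safe while a single thread owns the destination, while an `Atomic*OpGpu` functor goes through `GpuAtomicAdd`/`GpuAtomicMul`/`GpuAtomicMax`/`GpuAtomicMin` so that several threads may update the same output element. Below is a minimal host-side C++ sketch of that pattern; it is illustrative only (not TensorFlow code), with `std::atomic` standing in for the `GpuAtomic*` device helpers.

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Plain (non-atomic) reduction: only safe while one thread owns `dest`.
template <typename T>
struct NonAtomicSumOp {
  void operator()(T* dest, const T& value) const { *dest += value; }
};

// Atomic reduction: safe when several threads update the same `dest`.
// A CAS loop stands in for GpuAtomicAdd (std::atomic<T>::fetch_add on
// floating-point types is only guaranteed from C++20 on).
template <typename T>
struct AtomicSumOp {
  void operator()(std::atomic<T>* dest, const T& value) const {
    T old = dest->load();
    while (!dest->compare_exchange_weak(old, old + value)) {
    }
  }
};

int main() {
  constexpr int kThreads = 4;
  constexpr int kPerThread = 1000;

  std::vector<int> partial(kThreads, 0);  // one exclusive slot per thread
  std::atomic<int> shared{0};             // one contended slot for everyone

  std::vector<std::thread> workers;
  for (int t = 0; t < kThreads; ++t) {
    workers.emplace_back([&partial, &shared, t] {
      for (int i = 0; i < kPerThread; ++i) {
        NonAtomicSumOp<int>()(&partial[t], 1);  // exclusive destination
        AtomicSumOp<int>()(&shared, 1);         // shared destination
      }
    });
  }
  for (auto& w : workers) w.join();

  int combined = 0;
  for (int p : partial) combined += p;
  // Both print 4000; dropping the atomic would make `shared` unreliable.
  std::printf("non-atomic partials combined: %d, atomic total: %d\n",
              combined, shared.load());
  return 0;
}
```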

View File

@@ -52,12 +52,12 @@ using GPUDevice = Eigen::GpuDevice;
 // rows of input data and all reduction elements share one inner
 // dimension index.
 template <typename T, typename Index, int OuterDimTileSize,
-          typename SegmentReductionF, typename SegmentAtomicReductionF>
+          typename ReductionF, typename AtomicReductionF>
 __global__ void SortedSegmentReductionCustomKernel(
     const Index input_outer_dim_size, const Index inner_dim_size,
     const Index output_outer_dim_size, const Index* __restrict__ segment_ids,
     const T* __restrict__ input, T* __restrict__ output,
-    const Index total_stripe_countconst T initial_value) {
+    const Index total_stripe_count, const T initial_value) {
   for (int stripe_index : GpuGridRangeX(total_stripe_count)) {
     const Index segment_offset = stripe_index % inner_dim_size;
     const Index input_outer_dim_index_base =
@@ -82,13 +82,13 @@ __global__ void SortedSegmentReductionCustomKernel(
         // Decide whether to write result to global memory using atomic
         // operations.
         if (last_output_segment_id == first_segment_id) {
-          SegmentAtomicReductionF()(output + output_index, reduce_res);
+          AtomicReductionF()(output + output_index, reduce_res);
         } else {
-          SegmentReductionF()(output + output_index, reduce_res);
+          ReductionF()(output + output_index, reduce_res);
         }
         reduce_res = initial_value;
       }
-      SegmentReductionF()(&reduce_res,
+      ReductionF()(&reduce_res,
           ldg(input + (input_outer_dim_index_base + j)
               * inner_dim_size + segment_offset));
       last_output_segment_id = current_output_segment_id;
@@ -98,7 +98,7 @@ __global__ void SortedSegmentReductionCustomKernel(
     // the following strip.
     const Index output_index =
         last_output_segment_id * inner_dim_size + segment_offset;
-    SegmentAtomicReductionF()(output + output_index, reduce_res);
+    AtomicReductionF()(output + output_index, reduce_res);
   }
 }

@@ -128,9 +128,9 @@ __global__ void UnsortedSegmentCustomKernel(
 namespace functor {

 template <typename T, typename Index, typename InitialValueF,
-          typename SegmentReductionF, typename SegmentAtomicReductionF>
-void SegmentReductionFunctor<T, Index, InitialValueF, SegmentReductionF,
-                             SegmentAtomicReductionF>::operator()(
+          typename ReductionF, typename AtomicReductionF>
+void SegmentReductionFunctor<T, Index, InitialValueF, ReductionF,
+                             AtomicReductionF>::operator()(
     OpKernelContext* ctx, const GPUDevice& d, const Index output_rows,
     const TensorShape& segment_ids_shape,
     typename TTypes<Index>::ConstFlat segment_ids, const Index data_size,
@@ -168,8 +168,7 @@ void SegmentReductionFunctor<T, Index, InitialValueF, SegmentReductionF,
   config = GetGpuLaunchConfig(total_stripe_count, d);
   TF_CHECK_OK(GpuLaunchKernel(
       SortedSegmentReductionCustomKernel<T, Index, OuterDimTileSize,
-                                         SegmentReductionF,
-                                         SegmentAtomicReductionF>,
+                                         ReductionF, AtomicReductionF>,
       config.block_count, config.thread_per_block, 0, d.stream(),
       input_outer_dim_size, input_inner_dim_size, output_rows,
       segment_ids.data(), data, output.data(), total_stripe_count,
@@ -216,13 +215,13 @@ struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {

 #define DEFINE_SORTED_GPU_SPECS_INDEX(T, Index) \
   template struct SegmentReductionFunctor<T, Index, functor::Zero<T>, \
-      functor::SumOpGpu<T>, functor::SumAtomicOpGpu<T>>; \
+      functor::NonAtomicSumOpGpu<T>, functor::AtomicSumOpGpu<T>>; \
   template struct SegmentReductionFunctor<T, Index, functor::One<T>, \
-      functor::ProdOpGpu<T>, functor::ProdAtomicOpGpu<T>>; \
+      functor::NonAtomicProdOpGpu<T>, functor::AtomicProdOpGpu<T>>; \
   template struct SegmentReductionFunctor<T, Index, functor::Highest<T>, \
-      functor::MinOpGpu<T>, functor::MinAtomicOpGpu<T>>; \
+      functor::NonAtomicMinOpGpu<T>, functor::AtomicMinOpGpu<T>>; \
   template struct SegmentReductionFunctor<T, Index, functor::Lowest<T>, \
-      functor::MaxOpGpu<T>, functor::MaxAtomicOpGpu<T>>;
+      functor::NonAtomicMaxOpGpu<T>, functor::AtomicMaxOpGpu<T>>;

 #define DEFINE_SORTED_GPU_SPECS(T) \
   DEFINE_SORTED_GPU_SPECS_INDEX(T, int32); \
@@ -232,16 +231,16 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS);

 #define DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, Index) \
   template struct UnsortedSegmentFunctor< \
-      GPUDevice, T, Index, functor::Lowest<T>, functor::MaxAtomicOpGpu<T>>; \
+      GPUDevice, T, Index, functor::Lowest<T>, functor::AtomicMaxOpGpu<T>>; \
   template struct UnsortedSegmentFunctor< \
-      GPUDevice, T, Index, functor::Highest<T>, functor::MinAtomicOpGpu<T>>; \
+      GPUDevice, T, Index, functor::Highest<T>, functor::AtomicMinOpGpu<T>>; \
   template struct UnsortedSegmentFunctor<GPUDevice, T, Index, functor::One<T>, \
-                                         functor::ProdAtomicOpGpu<T>>;
+                                         functor::AtomicProdOpGpu<T>>;

 // Sum is the only op that supports all input types currently.
 #define DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, Index) \
   template struct UnsortedSegmentFunctor< \
-      GPUDevice, T, Index, functor::Zero<T>, functor::SumAtomicOpGpu<T>>;
+      GPUDevice, T, Index, functor::Zero<T>, functor::AtomicSumOpGpu<T>>;

 #define DEFINE_REAL_GPU_SPECS(T) \
   DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int32); \
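For orientation, the kernel computes an ordinary sorted segment reduction; the `ReductionF`/`AtomicReductionF` split only affects how partial results are committed to global memory. Each stripe of `OuterDimTileSize` input rows is handled by one thread, so a segment fully contained in a stripe is flushed with the non-atomic functor, while the stripe's first segment and its trailing partial segment, which a neighbouring stripe may also touch, are flushed with the atomic functor. A single-threaded C++ reference of the reduction itself, offered as an illustrative sketch rather than TensorFlow code:

```cpp
#include <cstdio>
#include <vector>

// Single-threaded reference for a sorted segment reduction over rows of width
// `inner_dim`. `segment_ids` must be sorted, non-negative and < num_segments.
template <typename T, typename ReduceOp>
std::vector<T> SortedSegmentReduce(const std::vector<T>& input,
                                   const std::vector<int>& segment_ids,
                                   int inner_dim, int num_segments,
                                   T initial_value, ReduceOp reduce) {
  std::vector<T> output(static_cast<size_t>(num_segments) * inner_dim,
                        initial_value);
  for (size_t row = 0; row < segment_ids.size(); ++row) {
    const int seg = segment_ids[row];
    for (int j = 0; j < inner_dim; ++j) {
      // The GPU kernel performs this update with NonAtomic*OpGpu while one
      // thread still owns the segment, and with Atomic*OpGpu when the segment
      // may also be touched by a neighbouring stripe.
      reduce(&output[static_cast<size_t>(seg) * inner_dim + j],
             input[row * inner_dim + j]);
    }
  }
  return output;
}

int main() {
  // Three rows of width 2 with sorted segment ids {0, 0, 1} (SegmentSum-style).
  const std::vector<float> data = {1, 2, 3, 4, 5, 6};
  const std::vector<int> ids = {0, 0, 1};
  const auto sum = SortedSegmentReduce<float>(
      data, ids, /*inner_dim=*/2, /*num_segments=*/2, /*initial_value=*/0.f,
      [](float* dest, float value) { *dest += value; });
  std::printf("segment 0: [%g, %g]  segment 1: [%g, %g]\n",
              sum[0], sum[1], sum[2], sum[3]);  // [4, 6]  [5, 6]
  return 0;
}
```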

View File

@@ -113,17 +113,44 @@ REGISTER_COMPLEX_CPU_KERNELS_ALL(complex128);
 #undef REGISTER_COMPLEX_CPU_KERNELS_ALL

 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-#define REGISTER_GPU_SORTED_KERNELS(type, index_type) \
-  REGISTER_KERNEL_BUILDER(Name("SegmentSum") \
-                              .Device(DEVICE_GPU) \
-                              .TypeConstraint<type>("T") \
-                              .TypeConstraint<index_type>("Tindices"), \
-                          SegmentSumGPUOp<type, index_type>)
+#define REGISTER_GPU_KERNEL_SORTEDSEGMENT( \
+    name, type, index_type, initial_value_functor, \
+    reduction_kernel_functor, atomic_reduction_kernel_functor) \
+  REGISTER_KERNEL_BUILDER( \
+      Name(name) \
+          .Device(DEVICE_GPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<index_type>("Tindices"), \
+      SegmentReductionGPUOp< \
+          type, index_type, \
+          functor::SegmentReductionFunctor<type, index_type, \
+                                           initial_value_functor, \
+                                           reduction_kernel_functor, \
+                                           atomic_reduction_kernel_functor> >)
+
+#define REGISTER_GPU_SORTED_KERNELS(type, index_type) \
+  REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentSum", type, index_type, \
+                                    functor::Zero<type>, \
+                                    functor::NonAtomicSumOpGpu<type>, \
+                                    functor::AtomicSumOpGpu<type>); \
+  REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentProd", type, index_type, \
+                                    functor::One<type>, \
+                                    functor::NonAtomicProdOpGpu<type>, \
+                                    functor::AtomicProdOpGpu<type>); \
+  REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentMin", type, index_type, \
+                                    functor::Highest<type>, \
+                                    functor::NonAtomicMinOpGpu<type>, \
+                                    functor::AtomicMinOpGpu<type>); \
+  REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentMax", type, index_type, \
+                                    functor::Lowest<type>, \
+                                    functor::NonAtomicMaxOpGpu<type>, \
+                                    functor::AtomicMaxOpGpu<type>);

 #define REGISTER_GPU_SORTED_KERNELS_ALL(type) \
   REGISTER_GPU_SORTED_KERNELS(type, int32)

 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
+#undef REGISTER_GPU_KERNEL_SORTEDSEGMENT
 #undef REGISTER_GPU_SORTED_KERNELS
 #undef REGISTER_GPU_SORTED_KERNELS_ALL
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
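The new `REGISTER_GPU_KERNEL_SORTEDSEGMENT` macro forwards an op name, an initial-value functor, and the non-atomic/atomic reduction functor pair into a single `REGISTER_KERNEL_BUILDER` call, so SegmentSum, SegmentProd, SegmentMin and SegmentMax (now registered for int32 indices here) all share one `SegmentReductionGPUOp` instantiation. The sketch below is a toy, self-contained C++ analogue of that registration pattern; every name in it is hypothetical and a plain map stands in for TensorFlow's kernel registry.

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <vector>

// Toy stand-ins (hypothetical names) for the functors used by the macros above.
template <typename T>
struct ZeroInit {
  T operator()() const { return T(0); }
};
template <typename T>
struct LowestInit {
  T operator()() const { return T(-1e30f); }  // the real code uses a proper lowest()
};
template <typename T>
struct SumOp {
  void operator()(T* dest, T value) const { *dest += value; }
};
template <typename T>
struct MaxOp {
  void operator()(T* dest, T value) const { if (value > *dest) *dest = value; }
};

// The "kernel": parameterised by an initial-value functor and a reduction
// functor, mirroring how SegmentReductionFunctor receives its functors.
template <typename T, typename InitialValueF, typename ReductionF>
T Reduce(const std::vector<T>& xs) {
  T acc = InitialValueF()();
  for (T x : xs) ReductionF()(&acc, x);
  return acc;
}

using ReduceFn = std::function<float(const std::vector<float>&)>;

std::map<std::string, ReduceFn>& Registry() {
  static auto* registry = new std::map<std::string, ReduceFn>();
  return *registry;
}

// Toy analogue of REGISTER_GPU_KERNEL_SORTEDSEGMENT: the op name and the
// functor types are forwarded into one registered entry.
#define REGISTER_TOY_REDUCTION(name, type, initial_value_functor, reduction_functor) \
  static const bool registered_##reduction_functor = [] {                            \
    Registry()[name] = &Reduce<type, initial_value_functor<type>,                    \
                               reduction_functor<type>>;                             \
    return true;                                                                     \
  }()

REGISTER_TOY_REDUCTION("SegmentSum", float, ZeroInit, SumOp);
REGISTER_TOY_REDUCTION("SegmentMax", float, LowestInit, MaxOp);

int main() {
  const std::vector<float> xs = {3.f, 1.f, 2.f};
  std::printf("SegmentSum -> %g\n", Registry()["SegmentSum"](xs));  // 6
  std::printf("SegmentMax -> %g\n", Registry()["SegmentMax"](xs));  // 3
  return 0;
}
```

The real macro registers an OpKernel class rather than a callable, but the way the name and functor types are threaded through to one shared implementation is analogous.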

View File

@@ -81,20 +81,20 @@ REGISTER_COMPLEX_CPU_KERNELS_ALL(complex128);
 #define REGISTER_GPU_SORTED_KERNELS(type, index_type) \
   REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentSum", type, index_type, \
                                     functor::Zero<type>, \
-                                    functor::SumOpGpu<type>, \
-                                    functor::SumAtomicOpGpu<type>); \
+                                    functor::NonAtomicSumOpGpu<type>, \
+                                    functor::AtomicSumOpGpu<type>); \
   REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentProd", type, index_type, \
                                     functor::One<type>, \
-                                    functor::ProdOpGpu<type>, \
-                                    functor::ProdAtomicOpGpu<type>); \
+                                    functor::NonAtomicProdOpGpu<type>, \
+                                    functor::AtomicProdOpGpu<type>); \
   REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentMin", type, index_type, \
                                     functor::Highest<type>, \
-                                    functor::MinOpGpu<type>, \
-                                    functor::MinAtomicOpGpu<type>); \
+                                    functor::NonAtomicMinOpGpu<type>, \
+                                    functor::AtomicMinOpGpu<type>); \
   REGISTER_GPU_KERNEL_SORTEDSEGMENT("SegmentMax", type, index_type, \
                                     functor::Lowest<type>, \
-                                    functor::MaxOpGpu<type>, \
-                                    functor::MaxAtomicOpGpu<type>);
+                                    functor::NonAtomicMaxOpGpu<type>, \
+                                    functor::AtomicMaxOpGpu<type>);

 #define REGISTER_GPU_SORTED_KERNELS_ALL(type) \
   REGISTER_GPU_SORTED_KERNELS(type, int64);

View File

@@ -88,18 +88,18 @@ REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
 #define REGISTER_REAL_GPU_UNSORTED_KERNELS(type, index_type) \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type, \
                                       functor::Lowest<type>, \
-                                      functor::MaxAtomicOpGpu<type>); \
+                                      functor::AtomicMaxOpGpu<type>); \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type, \
                                       functor::Highest<type>, \
-                                      functor::MinAtomicOpGpu<type>); \
+                                      functor::AtomicMinOpGpu<type>); \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
                                       functor::One<type>, \
-                                      functor::ProdAtomicOpGpu<type>);
+                                      functor::AtomicProdOpGpu<type>);

 #define REGISTER_SUM_GPU_UNSORTED_KERNELS(type, index_type) \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
                                       functor::Zero<type>, \
-                                      functor::SumAtomicOpGpu<type>);
+                                      functor::AtomicSumOpGpu<type>);

 #define REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL(type) \
   REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int32)

View File

@@ -88,18 +88,18 @@ REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
 #define REGISTER_REAL_GPU_UNSORTED_KERNELS(type, index_type) \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type, \
                                       functor::Lowest<type>, \
-                                      functor::MaxAtomicOpGpu<type>); \
+                                      functor::AtomicMaxOpGpu<type>); \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type, \
                                       functor::Highest<type>, \
-                                      functor::MinAtomicOpGpu<type>); \
+                                      functor::AtomicMinOpGpu<type>); \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
                                       functor::One<type>, \
-                                      functor::ProdAtomicOpGpu<type>);
+                                      functor::AtomicProdOpGpu<type>);

 #define REGISTER_SUM_GPU_UNSORTED_KERNELS(type, index_type) \
   REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
                                       functor::Zero<type>, \
-                                      functor::SumAtomicOpGpu<type>);
+                                      functor::AtomicSumOpGpu<type>);

 #define REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL(type) \
   REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int64)
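Unlike the sorted registrations, the unsorted registrations pass only an `Atomic*OpGpu` functor: with unsorted segment ids any worker may update any output row at any time, so every write must be an atomic read-modify-write. A small host-side C++ sketch of that situation, illustrative only, with a CAS loop standing in for `GpuAtomicAdd`:

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Host-side sketch of why the unsorted path only needs the Atomic* functor:
// segment ids arrive in arbitrary order, so concurrent workers may collide on
// any output slot, and each update must be an atomic read-modify-write.
int main() {
  const std::vector<float> data = {1, 2, 3, 4, 5, 6, 7, 8};
  const std::vector<int> segment_ids = {2, 0, 1, 0, 2, 1, 0, 2};  // unsorted
  const int num_segments = 3;

  std::vector<std::atomic<float>> output(num_segments);
  for (auto& o : output) o.store(0.0f);  // functor::Zero<T>-style initial value

  auto worker = [&](size_t begin, size_t end) {
    for (size_t i = begin; i < end; ++i) {
      // Atomic add via compare-exchange; std::atomic<float>::fetch_add is
      // only guaranteed from C++20 on.
      float old = output[segment_ids[i]].load();
      while (!output[segment_ids[i]].compare_exchange_weak(old, old + data[i])) {
      }
    }
  };

  std::thread t0(worker, 0, data.size() / 2);
  std::thread t1(worker, data.size() / 2, data.size());
  t0.join();
  t1.join();

  for (int s = 0; s < num_segments; ++s) {
    std::printf("segment %d: %g\n", s, output[s].load());
  }
  // Expected: segment 0 = 2+4+7 = 13, segment 1 = 3+6 = 9, segment 2 = 1+5+8 = 14.
  return 0;
}
```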