Part two of renaming Cuda?DLaunchConfig to Gpu?DLaunchConfig: fix call sites.
PiperOrigin-RevId: 248708765
This commit is contained in:
parent
cc03fdce67
commit
f9a4227ae5
@ -140,8 +140,8 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
|
|||||||
const GpuDeviceArrayStruct<IntType>& output_scan,
|
const GpuDeviceArrayStruct<IntType>& output_scan,
|
||||||
bool fixed_size, int split_size,
|
bool fixed_size, int split_size,
|
||||||
typename TTypes<T, 2>::Matrix* output) {
|
typename TTypes<T, 2>::Matrix* output) {
|
||||||
auto config = GetCuda2DLaunchConfig(output->dimension(1),
|
auto config = GetGpu2DLaunchConfig(output->dimension(1), output->dimension(0),
|
||||||
output->dimension(0), gpu_device);
|
gpu_device);
|
||||||
|
|
||||||
if (fixed_size) {
|
if (fixed_size) {
|
||||||
TF_CHECK_OK(CudaLaunchKernel(
|
TF_CHECK_OK(CudaLaunchKernel(
|
||||||
|
@ -222,7 +222,7 @@ void SplitVOpGPULaunch<T, IntType>::Run(
|
|||||||
gpu_device.stream(), input_ptr, total_rows,
|
gpu_device.stream(), input_ptr, total_rows,
|
||||||
total_cols, output_ptr_data));
|
total_cols, output_ptr_data));
|
||||||
} else {
|
} else {
|
||||||
auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
|
auto config = GetGpu2DLaunchConfig(total_cols, total_rows, gpu_device);
|
||||||
IntType smem_max = gpu_device.sharedMemPerBlock();
|
IntType smem_max = gpu_device.sharedMemPerBlock();
|
||||||
IntType smem_usage = output_scan.size * sizeof(IntType);
|
IntType smem_usage = output_scan.size * sizeof(IntType);
|
||||||
// performance crossover is less than using maximum available shared
|
// performance crossover is less than using maximum available shared
|
||||||
|
@ -59,7 +59,7 @@ namespace {
|
|||||||
// The result is stored in V[batch] and has the same sign as the
|
// The result is stored in V[batch] and has the same sign as the
|
||||||
// real value of V (which should be computed)
|
// real value of V (which should be computed)
|
||||||
template <class Scalar>
|
template <class Scalar>
|
||||||
__global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
|
__global__ void ComputeValueOfVKernel(Gpu2DLaunchConfig config, int64 m,
|
||||||
int64 ldu, const Scalar* M,
|
int64 ldu, const Scalar* M,
|
||||||
const Scalar* U, const Scalar* S,
|
const Scalar* U, const Scalar* S,
|
||||||
Scalar* V) {
|
Scalar* V) {
|
||||||
@ -195,7 +195,7 @@ class SvdOpGpu : public AsyncOpKernel {
|
|||||||
// 1. compute the (batched) sum
|
// 1. compute the (batched) sum
|
||||||
const GPUDevice& d = context->eigen_device<GPUDevice>();
|
const GPUDevice& d = context->eigen_device<GPUDevice>();
|
||||||
d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
|
d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
|
||||||
Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
|
Gpu2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
|
||||||
TF_CHECK_OK(CudaLaunchKernel(ComputeValueOfVKernel<Scalar>,
|
TF_CHECK_OK(CudaLaunchKernel(ComputeValueOfVKernel<Scalar>,
|
||||||
cfg2D.block_count, cfg2D.thread_per_block, 0,
|
cfg2D.block_count, cfg2D.thread_per_block, 0,
|
||||||
d.stream(), cfg2D, m, full_matrices_ ? m : p,
|
d.stream(), cfg2D, m, full_matrices_ ? m : p,
|
||||||
|
@ -54,7 +54,7 @@ __global__ void Count1D(GpuLaunchConfig config, int bufsize, int* outbuf) {
|
|||||||
atomicAdd(&outbuf[x % bufsize], 1);
|
atomicAdd(&outbuf[x % bufsize], 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
|
__global__ void Count2D(Gpu2DLaunchConfig config, int bufsize, int* outbuf) {
|
||||||
CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
|
CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
|
||||||
if (x < 0) { // x might overflow when testing extreme case
|
if (x < 0) { // x might overflow when testing extreme case
|
||||||
break;
|
break;
|
||||||
@ -68,7 +68,7 @@ __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
|
__global__ void Count3D(Gpu3DLaunchConfig config, int bufsize, int* outbuf) {
|
||||||
CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
|
CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
|
||||||
if (x < 0) { // x might overflow when testing extreme case
|
if (x < 0) { // x might overflow when testing extreme case
|
||||||
break;
|
break;
|
||||||
@ -189,7 +189,7 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
|
|||||||
#undef TEST_LAUNCH_PARAMETER
|
#undef TEST_LAUNCH_PARAMETER
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
|
bool operator==(const Gpu2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
|
||||||
return a.thread_per_block.x == b.thread_per_block.x &&
|
return a.thread_per_block.x == b.thread_per_block.x &&
|
||||||
a.thread_per_block.y == b.thread_per_block.y &&
|
a.thread_per_block.y == b.thread_per_block.y &&
|
||||||
a.thread_per_block.z == b.thread_per_block.z &&
|
a.thread_per_block.z == b.thread_per_block.z &&
|
||||||
@ -201,8 +201,8 @@ bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
|
|||||||
a.thread_per_block.z == b.thread_per_block.z;
|
a.thread_per_block.z == b.thread_per_block.z;
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
|
TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
|
||||||
Cuda2DLaunchConfig cfg;
|
Gpu2DLaunchConfig cfg;
|
||||||
GpuLaunchConfig cfg1d;
|
GpuLaunchConfig cfg1d;
|
||||||
|
|
||||||
// test valid inputs
|
// test valid inputs
|
||||||
@ -212,7 +212,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
|
|||||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetCuda2DLaunchConfig(dimx, dimy, d); \
|
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||||
bufsize, outbuf)); \
|
bufsize, outbuf)); \
|
||||||
@ -224,7 +224,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
|
|||||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
|
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||||
bufsize, outbuf)); \
|
bufsize, outbuf)); \
|
||||||
@ -245,8 +245,8 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
|
|||||||
#undef TEST_LAUNCH_PARAMETER
|
#undef TEST_LAUNCH_PARAMETER
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(GpuLaunchConfigTest, GetCuda3DLaunchConfig) {
|
TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
|
||||||
Cuda3DLaunchConfig cfg;
|
Gpu3DLaunchConfig cfg;
|
||||||
GpuLaunchConfig cfg1d;
|
GpuLaunchConfig cfg1d;
|
||||||
|
|
||||||
// test valid inputs
|
// test valid inputs
|
||||||
@ -256,7 +256,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda3DLaunchConfig) {
|
|||||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
|
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count, \
|
TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||||
bufsize, outbuf)); \
|
bufsize, outbuf)); \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user