diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index d8eb0c5f79d..764d087f45d 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -81,17 +81,17 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   if (total_count == 0) {
     return;
   }
-  GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d);
+  GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    TF_CHECK_OK(CudaLaunchKernel(BiasNHWCKernel<T>, config.block_count,
-                                 config.thread_per_block, 0, d.stream(),
-                                 config.virtual_thread_count, input, bias,
-                                 output, bias_size));
+    TF_CHECK_OK(GpuLaunchKernel(BiasNHWCKernel<T>, config.block_count,
+                                config.thread_per_block, 0, d.stream(),
+                                config.virtual_thread_count, input, bias,
+                                output, bias_size));
   } else {
-    TF_CHECK_OK(CudaLaunchKernel(BiasNCHWKernel<T>, config.block_count,
-                                 config.thread_per_block, 0, d.stream(),
-                                 config.virtual_thread_count, input, bias,
-                                 output, bias_size, image_size));
+    TF_CHECK_OK(GpuLaunchKernel(BiasNCHWKernel<T>, config.block_count,
+                                config.thread_per_block, 0, d.stream(),
+                                config.virtual_thread_count, input, bias,
+                                output, bias_size, image_size));
   }
 }
 
@@ -204,7 +204,7 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
     return;
   }
   static constexpr int32 kWarpSize = 32;
-  GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d);
+  GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);
 
   const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
   int32 shared_memory_size = 0;
@@ -214,10 +214,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      TF_CHECK_OK(CudaLaunchKernel(BiasGradNHWC_SharedAtomics<T>,
-                                   config.block_count, config.thread_per_block,
-                                   shared_memory_size, d.stream(), total_count,
-                                   output_backprop, bias_backprop, bias_size));
+      TF_CHECK_OK(GpuLaunchKernel(BiasGradNHWC_SharedAtomics<T>,
+                                  config.block_count, config.thread_per_block,
+                                  shared_memory_size, d.stream(), total_count,
+                                  output_backprop, bias_backprop, bias_size));
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -225,24 +225,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      TF_CHECK_OK(CudaLaunchKernel(
-          BiasGradNCHW_SharedAtomics<T>, config.block_count,
-          config.thread_per_block, 0, d.stream(), output_backprop,
-          bias_backprop, batch, bias_size, image_size, group_size));
+      TF_CHECK_OK(GpuLaunchKernel(BiasGradNCHW_SharedAtomics<T>,
+                                  config.block_count, config.thread_per_block,
+                                  0, d.stream(), output_backprop, bias_backprop,
+                                  batch, bias_size, image_size, group_size));
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      TF_CHECK_OK(CudaLaunchKernel(
+      TF_CHECK_OK(GpuLaunchKernel(
           BiasGradNHWC_Naive<T>, config.block_count, config.thread_per_block, 0,
           d.stream(), total_count, output_backprop, bias_backprop, bias_size));
     } else {
-      TF_CHECK_OK(CudaLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
-                                   config.thread_per_block, 0, d.stream(),
-                                   total_count, output_backprop, bias_backprop,
-                                   bias_size, image_size));
+      TF_CHECK_OK(GpuLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
+                                  config.thread_per_block, 0, d.stream(),
+                                  total_count, output_backprop, bias_backprop,
+                                  bias_size, image_size));
     }
   }
 }
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
index 6425bce133e..721088f80ba 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.h
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
@@ -656,12 +656,12 @@ Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx,
       kBlockDepth * (tile_pixels + filter_pixels) * sizeof(S);
   const int num_outputs = args.out_rows * args.out_cols * block_count;
   auto device = ctx->eigen_gpu_device();
-  GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
+  GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
       num_outputs, device, kernel, shared_memory_size,
       block_dim.x * block_dim.y * block_dim.z);
-  TF_CHECK_OK(CudaLaunchKernel(kernel, config.block_count, block_dim,
-                               shared_memory_size, device.stream(), args, input,
-                               filter, output));
+  TF_CHECK_OK(GpuLaunchKernel(kernel, config.block_count, block_dim,
+                              shared_memory_size, device.stream(), args, input,
+                              filter, output));
   return Status::OK();
 }
 
@@ -751,10 +751,10 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args,
                                       kKnownDepthMultiplier < 0
                                   ? std::numeric_limits<int>::max()
                                   : device.getNumGpuMultiProcessors();
-  TF_CHECK_OK(CudaLaunchKernel(kernel,
-                               std::min(max_block_count, config.block_count),
-                               config.thread_per_block, 0, device.stream(),
-                               args, input, filter, output, num_outputs));
+  TF_CHECK_OK(GpuLaunchKernel(kernel,
+                              std::min(max_block_count, config.block_count),
+                              config.thread_per_block, 0, device.stream(), args,
+                              input, filter, output, num_outputs));
   return Status::OK();
 }
 
@@ -969,7 +969,7 @@ Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx,
   auto device = ctx->eigen_gpu_device();
   GpuLaunchConfig config =
       GetGpuLaunchConfig(num_in_backprop, device, kernel, 0, 0);
-  TF_CHECK_OK(CudaLaunchKernel(
+  TF_CHECK_OK(GpuLaunchKernel(
       kernel, config.block_count, config.thread_per_block, 0, device.stream(),
       args, out_backprop, filter, in_backprop, num_in_backprop));
   return Status::OK();
@@ -1611,12 +1611,12 @@ Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
                                    " is not supported");
   }
   const int num_out_backprop = args.out_rows * args.out_cols * block_count;
-  GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
+  GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
       num_out_backprop, device, kernel, shared_memory_size,
       block_dim.x * block_dim.y * block_dim.z);
-  TF_CHECK_OK(CudaLaunchKernel(kernel, config.block_count, block_dim,
-                               shared_memory_size, device.stream(), args,
-                               out_backprop, input, filter_backprop));
+  TF_CHECK_OK(GpuLaunchKernel(kernel, config.block_count, block_dim,
+                              shared_memory_size, device.stream(), args,
+                              out_backprop, input, filter_backprop));
   return Status::OK();
 }
 
@@ -1717,7 +1717,7 @@ Status LaunchDepthwiseConv2dBackpropFilterGPU(
   auto device = ctx->eigen_gpu_device();
   GpuLaunchConfig config = GetGpuLaunchConfig(num_out_backprop,
                                               device, kernel, 0, 0);
-  TF_CHECK_OK(CudaLaunchKernel(
+  TF_CHECK_OK(GpuLaunchKernel(
       kernel, config.block_count, config.thread_per_block, 0, device.stream(),
       args, out_backprop, input, filter_backprop, num_out_backprop));
   return Status::OK();
diff --git a/tensorflow/core/kernels/determinant_op_gpu.cu.cc b/tensorflow/core/kernels/determinant_op_gpu.cu.cc
index 387ea3b6607..5678aeef20e 100644
--- a/tensorflow/core/kernels/determinant_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/determinant_op_gpu.cu.cc
@@ -128,9 +128,9 @@ struct DeterminantFromPivotedLUFunctor {
                   int* info) {
     const int64 num_matrices = output.size();
     const int64 n = lu_factor.dimension(2);
-    GpuLaunchConfig config = GetCudaLaunchConfig(num_matrices, device);
+    GpuLaunchConfig config = GetGpuLaunchConfig(num_matrices, device);
 
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/false>,
         config.block_count, config.thread_per_block, 0, device.stream(),
         config.virtual_thread_count, n, lu_factor.data(), pivots, nullptr,
@@ -151,8 +151,8 @@ struct LogDeterminantFromPivotedLUFunctor {
                   typename TTypes::Tensor log_abs_det) {
     const int64 num_matrices = sign.size();
     const int64 n = lu_factor.dimension(2);
-    GpuLaunchConfig config = GetCudaLaunchConfig(num_matrices, device);
-    TF_CHECK_OK(CudaLaunchKernel(
+    GpuLaunchConfig config = GetGpuLaunchConfig(num_matrices, device);
+    TF_CHECK_OK(GpuLaunchKernel(
         DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/true>,
         config.block_count, config.thread_per_block, 0, device.stream(),
         config.virtual_thread_count, n, lu_factor.data(), pivots, sign.data(),
diff --git a/tensorflow/core/kernels/in_topk_op_gpu.cu.cc b/tensorflow/core/kernels/in_topk_op_gpu.cu.cc
index 3c14838a53b..1894dedddf1 100644
--- a/tensorflow/core/kernels/in_topk_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/in_topk_op_gpu.cu.cc
@@ -118,13 +118,13 @@ struct InTopKFunctor {
     const auto& d = context->eigen_device();
 
     // Compute a mask for all predictions.
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_targets * num_classes, d);
-    OP_REQUIRES_OK(context, CudaLaunchKernel(
-                                ComputePredictionMaskKernel,
-                                config.block_count, config.thread_per_block, 0,
-                                d.stream(), predictions.data(), targets.data(),
-                                predictions_mask.flat().data(),
-                                num_targets, num_classes));
+    CudaLaunchConfig config = GetGpuLaunchConfig(num_targets * num_classes, d);
+    OP_REQUIRES_OK(
+        context, GpuLaunchKernel(ComputePredictionMaskKernel,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), predictions.data(), targets.data(),
+                                 predictions_mask.flat().data(),
+                                 num_targets, num_classes));
 
     // Reduce prediction masks to number of predictions larger than the target
     // prediction, or to the negative value if we can't compute an answer.
diff --git a/tensorflow/core/kernels/lu_op_gpu.cu.cc b/tensorflow/core/kernels/lu_op_gpu.cu.cc
index 6d94d7f3f64..5ac290e1766 100644
--- a/tensorflow/core/kernels/lu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/lu_op_gpu.cu.cc
@@ -222,8 +222,8 @@ class LuOpGpu : public AsyncOpKernel {
     int* pivots_ptr = pivots.flat<int32>().data();
     Tidx* permutation_indices_ptr =
         permutation_indices->template flat<Tidx>().data();
-    GpuLaunchConfig cfgPivots = GetCudaLaunchConfig(batch_size, device);
-    TF_CHECK_OK(CudaLaunchKernel(
+    GpuLaunchConfig cfgPivots = GetGpuLaunchConfig(batch_size, device);
+    TF_CHECK_OK(GpuLaunchKernel(
         ComputePermutationFromTranspositionsKernel, cfgPivots.block_count,
         cfgPivots.thread_per_block, 0, device.stream(), cfgPivots, num_rows,
         pivots_ptr, permutation_indices_ptr));
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cu.cc b/tensorflow/core/kernels/non_max_suppression_op.cu.cc
index 4e578487d6b..9b8526a75c3 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cu.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cu.cc
@@ -241,7 +241,7 @@ Status NmsGpu(const float* d_sorted_boxes_float_ptr, const int num_boxes,
       DataType::DT_INT32, TensorShape({max_nms_mask_size}), &d_nms_mask));
   // reset data sensitive tensors
   auto device = context->eigen_gpu_device();
-  auto config = GetCudaLaunchConfig(d_nms_mask.NumElements(), device);
+  auto config = GetGpuLaunchConfig(d_nms_mask.NumElements(), device);
   TF_CHECK_OK(GpuLaunchKernel(SetZero<int>, config.block_count,
                               config.thread_per_block, 0, device.stream(),
                               config.virtual_thread_count,
@@ -425,7 +425,7 @@ class NonMaxSuppressionV2GPUOp : public OpKernel {
                                           &d_sorted_boxes));
 
     // this will return sorted scores and their indices
-    auto config = GetCudaLaunchConfig(num_boxes, device);
+    auto config = GetGpuLaunchConfig(num_boxes, device);
     // initialize box and score indices
     TF_CHECK_OK(GpuLaunchKernel(Iota<int>, config.block_count,
                                 config.thread_per_block, 0, device.stream(),
@@ -472,7 +472,7 @@ class NonMaxSuppressionV2GPUOp : public OpKernel {
                    context->allocate_output(0, TensorShape({num_outputs}),
                                             &output_indices));
     if (num_outputs == 0) return;
-    config = GetCudaLaunchConfig(num_outputs, device);
+    config = GetGpuLaunchConfig(num_outputs, device);
     TF_CHECK_OK(GpuLaunchKernel(
         IndexMultiSelect, config.block_count, config.thread_per_block, 0,
         device.stream(), config.virtual_thread_count,
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index 2ade89b7ff5..385565b28d4 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -104,9 +104,9 @@ struct ReluGrad {
     if (count == 0) return;
     int32 half2_count = Eigen::divup(count, 2);
     constexpr int32 kThreadInBlock = 512;
-    GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
+    GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
         half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock);
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         ReluGradHalfKernel, config.block_count, config.thread_per_block, 0,
         d.stream(), gradient.data(), feature.data(), backprop.data(), count));
   }
@@ -133,9 +133,9 @@ struct Relu {
     int32 vect_count = Eigen::divup(count, 4);
     constexpr int32 kThreadInBlock = 512;
-    GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
+    GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
         vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock);
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         Relu_int8x4_kernel, config.block_count,
         config.thread_per_block, 0, d.stream(), vect_count,
         reinterpret_cast<const int32*>(input.data()),
         reinterpret_cast<int32*>(output.data())));
diff --git a/tensorflow/core/kernels/roll_op_gpu.cu.cc b/tensorflow/core/kernels/roll_op_gpu.cu.cc
index baae4c94e00..c5ef02d84a6 100644
--- a/tensorflow/core/kernels/roll_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/roll_op_gpu.cu.cc
@@ -71,7 +71,7 @@ struct Roll {
     d.memcpyHostToDevice(thres_buf, threshold.data(), thres_bytes);
     d.memcpyHostToDevice(range_buf, dim_range.data(), range_bytes);
 
-    CudaLaunchConfig cfg = GetCudaLaunchConfig(num_elements, d);
+    CudaLaunchConfig cfg = GetGpuLaunchConfig(num_elements, d);
 
     TF_CHECK_OK(GpuLaunchKernel(RollKernel<T>, cfg.block_count,
                                 cfg.thread_per_block, 0, d.stream(),
diff --git a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
index 8d6e826d625..ca05e0d10cc 100644
--- a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
@@ -70,12 +70,12 @@ void UpdateVariableAndFill_Philox::operator()(
   // maximize occupancy
   const int kGroupSize = Distribution::kResultElementCount;
   int work_element_count = (output_size + kGroupSize - 1) / kGroupSize;
-  GpuLaunchConfig cfg = GetCudaLaunchConfig(work_element_count, d,
-                                            FillKernel<Distribution>, 0, 0);
+  GpuLaunchConfig cfg =
+      GetGpuLaunchConfig(work_element_count, d, FillKernel<Distribution>, 0, 0);
 
   int zero = 0;
   cudaMemcpyToSymbol(thread_counter, &zero, sizeof(int));
-  TF_CHECK_OK(CudaLaunchKernel(
+  TF_CHECK_OK(GpuLaunchKernel(
       FillKernel<Distribution>, cfg.block_count, cfg.thread_per_block, 0,
       d.stream(), dist, state_size, output_size, state_data, output_data));
 }
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index 2f7ba18dd4a..433af71c566 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -196,16 +196,16 @@ class SvdOpGpu : public AsyncOpKernel {
       const GPUDevice& d = context->eigen_device<GPUDevice>();
       d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
       Gpu2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
-      TF_CHECK_OK(CudaLaunchKernel(ComputeValueOfVKernel<Scalar>,
-                                   cfg2D.block_count, cfg2D.thread_per_block, 0,
-                                   d.stream(), cfg2D, m, full_matrices_ ? m : p,
-                                   input_copy.flat<Scalar>().data(),
-                                   outputU_ptr, outputS_ptr, outputV_ptr));
+      TF_CHECK_OK(GpuLaunchKernel(ComputeValueOfVKernel<Scalar>,
+                                  cfg2D.block_count, cfg2D.thread_per_block, 0,
+                                  d.stream(), cfg2D, m, full_matrices_ ? m : p,
+                                  input_copy.flat<Scalar>().data(), outputU_ptr,
+                                  outputS_ptr, outputV_ptr));
       // 2. clamp V to -1 or +1
-      GpuLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d);
-      TF_CHECK_OK(CudaLaunchKernel(ExtractSignOfVKernel<Scalar>,
-                                   cfg1D.block_count, cfg1D.thread_per_block, 0,
-                                   d.stream(), cfg1D, outputV_ptr));
+      GpuLaunchConfig cfg1D = GetGpuLaunchConfig(batch_size, d);
+      TF_CHECK_OK(GpuLaunchKernel(ExtractSignOfVKernel<Scalar>,
+                                  cfg1D.block_count, cfg1D.thread_per_block, 0,
+                                  d.stream(), cfg1D, outputV_ptr));
     }
 
     if (compute_uv_) {
diff --git a/tensorflow/core/kernels/topk_op_gpu.h b/tensorflow/core/kernels/topk_op_gpu.h
index e078a087e6c..12717ca11fe 100644
--- a/tensorflow/core/kernels/topk_op_gpu.h
+++ b/tensorflow/core/kernels/topk_op_gpu.h
@@ -402,9 +402,9 @@ cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
 
   // We are limited by the amount of shared memory we have per block.
   auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry<T>);
-  TF_CHECK_OK(CudaLaunchKernel(TopKKernel<T>, batch_size, num_shards,
-                               shared_memory_size, stream, input, length, k,
-                               sorted, output, indices));
+  TF_CHECK_OK(GpuLaunchKernel(TopKKernel<T>, batch_size, num_shards,
+                              shared_memory_size, stream, input, length, k,
+                              sorted, output, indices));
   return cudaGetLastError();
 }
 
diff --git a/tensorflow/core/kernels/tridiagonal_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/tridiagonal_matmul_op_gpu.cu.cc
index 7b0d4ed8227..f5cff69e63a 100644
--- a/tensorflow/core/kernels/tridiagonal_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/tridiagonal_matmul_op_gpu.cu.cc
@@ -77,7 +77,7 @@ class TridiagonalMatMulOpGpu : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_output(0, rhs.shape(), &output));
 
     const Eigen::GpuDevice& device = context->eigen_device<Eigen::GpuDevice>();
-    CudaLaunchConfig cfg = GetCudaLaunchConfig(1, device);
+    CudaLaunchConfig cfg = GetGpuLaunchConfig(1, device);
     TF_CHECK_OK(GpuLaunchKernel(
         TridiagonalMatMulKernel<Scalar>, cfg.block_count, cfg.thread_per_block,
         0, device.stream(), batch_size, m, n, superdiag.flat<Scalar>().data(),
diff --git a/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc b/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc
index fb1d5b2ae77..b940357ff3b 100644
--- a/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc
@@ -228,13 +228,13 @@ class TridiagonalSolveOpGpuLinalg : public LinearAlgebraOp<Scalar> {
   void SolveForSizeOneOrTwo(OpKernelContext* context, const Scalar* diagonals,
                             const Scalar* rhs, Scalar* output, int m, int k) {
     const Eigen::GpuDevice& device = context->eigen_device<Eigen::GpuDevice>();
-    GpuLaunchConfig cfg = GetCudaLaunchConfig(1, device);
+    GpuLaunchConfig cfg = GetGpuLaunchConfig(1, device);
    bool* not_invertible_dev;
     cudaMalloc(&not_invertible_dev, sizeof(bool));
-    TF_CHECK_OK(CudaLaunchKernel(SolveForSizeOneOrTwoKernel<Scalar>,
-                                 cfg.block_count, cfg.thread_per_block, 0,
-                                 device.stream(), m, diagonals, rhs, k, output,
-                                 not_invertible_dev));
+    TF_CHECK_OK(GpuLaunchKernel(SolveForSizeOneOrTwoKernel<Scalar>,
+                                cfg.block_count, cfg.thread_per_block, 0,
+                                device.stream(), m, diagonals, rhs, k, output,
+                                not_invertible_dev));
     bool not_invertible_host;
     cudaMemcpy(&not_invertible_host, not_invertible_dev, sizeof(bool),
                cudaMemcpyDeviceToHost);
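
For context, every call site touched by this patch follows the same launch-wrapper pattern: build a launch configuration for the element count, then pass the kernel, grid/block dimensions, dynamic shared-memory size, stream, and kernel arguments to a wrapper that surfaces the launch status. The snippet below is a minimal standalone sketch of that pattern. The names LaunchConfig, GetLaunchConfig, LaunchKernel, and BiasAddKernel are hand-rolled stand-ins for illustration only, not TensorFlow's GetGpuLaunchConfig/GpuLaunchKernel helpers from gpu_kernel_helper.h.

// Minimal sketch (not TensorFlow code): a GpuLaunchKernel-style wrapper that
// forwards grid/block/shared-memory/stream arguments to a CUDA kernel and
// returns the launch error, plus a grid-stride kernel launched through it.
#include <cstdio>
#include <cuda_runtime.h>

// Toy stand-in for GetGpuLaunchConfig: fixed block size, enough blocks.
struct LaunchConfig {
  int block_count;
  int thread_per_block;
};

LaunchConfig GetLaunchConfig(int work_element_count) {
  LaunchConfig config;
  config.thread_per_block = 256;
  config.block_count = (work_element_count + config.thread_per_block - 1) /
                       config.thread_per_block;
  return config;
}

// Stand-in for GpuLaunchKernel: launch through a function pointer and report
// whatever error the launch produced.
template <typename... Ts, typename... Args>
cudaError_t LaunchKernel(void (*kernel)(Ts...), int block_count,
                         int thread_per_block, size_t shared_memory_size,
                         cudaStream_t stream, Args... args) {
  kernel<<<block_count, thread_per_block, shared_memory_size, stream>>>(
      args...);
  return cudaGetLastError();
}

// Grid-stride bias-add kernel, similar in shape to BiasNHWCKernel<T> above.
__global__ void BiasAddKernel(int n, const float* in, const float* bias,
                              float* out, int bias_size) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = in[i] + bias[i % bias_size];
  }
}

int main() {
  const int n = 1 << 20, bias_size = 64;
  float *in, *bias, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&bias, bias_size * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  LaunchConfig config = GetLaunchConfig(n);
  cudaError_t err = LaunchKernel(BiasAddKernel, config.block_count,
                                 config.thread_per_block, 0, /*stream=*/0, n,
                                 in, bias, out, bias_size);
  printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  return 0;
}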