diff --git a/tensorflow/core/kernels/non_max_suppression_op.cu.cc b/tensorflow/core/kernels/non_max_suppression_op.cu.cc
index c2cae2ab212..d19f6eb676c 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cu.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cu.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
 #include <limits>
 
@@ -28,7 +28,12 @@ limitations under the License.
 #include "tensorflow/core/util/gpu_launch_config.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-struct __align__(16) Box {
+
+struct
+#if GOOGLE_CUDA
+    __align__(16)
+#endif
+    Box {
   float x1, y1, x2, y2;
 };
 
@@ -114,7 +119,7 @@ __global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
                           char* result_mask) {
   extern __shared__ int local[];
   // set global mask to accept all boxes
-  for (int box : CudaGridRangeX(bit_mask_len)) {
+  for (int box : GpuGridRangeX(bit_mask_len)) {
     local[box] = 0xFFFFFFFF;
   }
   __syncthreads();
@@ -127,7 +132,7 @@ __global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
       accepted_boxes += 1;
       int offset = box * bit_mask_len;
       // update global mask with current box's mask
-      for (int b : CudaGridRangeX(bit_mask_len)) {
+      for (int b : GpuGridRangeX(bit_mask_len)) {
         local[b] &= ~bitmask[offset + b];
       }
       __syncthreads();
@@ -135,7 +140,7 @@ __global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
   }
   // copy global mask to result_max char array. char array is needed for
   // cub::DeviceSelect later.
-  for (int box : CudaGridRangeX(num_boxes)) {
+  for (int box : GpuGridRangeX(num_boxes)) {
     result_mask[box] = CheckBit(local, box);
   }
 }
@@ -232,14 +237,14 @@ __device__ EIGEN_STRONG_INLINE void SelectHelper(const Index i_selected,
 template <typename Index, typename T, typename... Args>
 __global__ void IndexMultiSelect(const int num_elements, const Index* indices,
                                  const T* original, T* selected, Args... args) {
-  for (const int idx : CudaGridRangeX(num_elements)) {
+  for (const int idx : GpuGridRangeX(num_elements)) {
     SelectHelper(idx, indices[idx], original, selected, args...);
   }
 }
 
 template <typename T>
 __global__ void Iota(const int num_elements, const T offset, T* to_fill) {
-  for (int idx : CudaGridRangeX(num_elements)) {
+  for (int idx : GpuGridRangeX(num_elements)) {
     to_fill[idx] = static_cast<T>(idx) + offset;
   }
 }
@@ -322,13 +327,13 @@ Status NmsGpu(const float* d_sorted_boxes_float_ptr, const int num_boxes,
   TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
   // do Cub::deviceSelect::flagged
   size_t flagged_buffer_size = 0;
-  cub::DeviceSelect::Flagged(static_cast<void*>(nullptr),  // temp_storage
-                             flagged_buffer_size,
-                             static_cast<int*>(nullptr),   // input
-                             static_cast<char*>(nullptr),  // selection flag
-                             static_cast<int*>(nullptr),   // selected items
-                             static_cast<int*>(nullptr),   // num_selected
-                             num_boxes, device.stream());
+  gpuprim::DeviceSelect::Flagged(static_cast<void*>(nullptr),  // temp_storage
+                                 flagged_buffer_size,
+                                 static_cast<int*>(nullptr),   // input
+                                 static_cast<char*>(nullptr),  // selection flag
+                                 static_cast<int*>(nullptr),   // selected items
+                                 static_cast<int*>(nullptr),   // num_selected
+                                 num_boxes, device.stream());
   Tensor cub_scratch;
   TF_RETURN_IF_ERROR(context->allocate_temp(
       DataType::DT_INT8, TensorShape({(int64)flagged_buffer_size}),
@@ -337,22 +342,22 @@ Status NmsGpu(const float* d_sorted_boxes_float_ptr, const int num_boxes,
   TF_RETURN_IF_ERROR(context->allocate_temp(DataType::DT_INT32,
                                             TensorShape({1}), &d_num_selected));
-  cub::DeviceSelect::Flagged(
+  gpuprim::DeviceSelect::Flagged(
       (void*)cub_scratch.flat<int8>().data(),  // temp_storage
       flagged_buffer_size,
       d_indices.flat<int>().data(),  // input
       selected,                      // selection flag
       d_selected_indices,            // selected items
       d_num_selected.flat<int>().data(), num_boxes, device.stream());
-  cudaEvent_t copy_done;
+  gpuEvent_t copy_done;
   TF_RETURN_IF_CUDA_ERROR(
-      cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming));
+      gpuEventCreateWithFlags(&copy_done, gpuEventDisableTiming));
   device.memcpyDeviceToHost(h_selected_count, d_num_selected.flat<int>().data(),
                             sizeof(int));
-  TF_RETURN_IF_CUDA_ERROR(cudaEventRecord(copy_done, device.stream()));
-  TF_RETURN_IF_CUDA_ERROR(cudaEventSynchronize(copy_done));
+  TF_RETURN_IF_CUDA_ERROR(gpuEventRecord(copy_done, device.stream()));
+  TF_RETURN_IF_CUDA_ERROR(gpuEventSynchronize(copy_done));
   *h_nkeep = *h_selected_count;
-  cudaEventDestroy(copy_done);
+  gpuEventDestroy(copy_done);
   return Status::OK();
 }
@@ -375,9 +380,10 @@ Status CountIf(OpKernelContext* context, const float* dev_array, const Op& op,
   size_t workspace_size = 0;
   auto cuda_stream = tensorflow::GetGpuStream(context);
   auto device = context->eigen_gpu_device();
-  cub::DeviceSelect::If(nullptr, workspace_size, static_cast<float*>(nullptr),
-                        static_cast<float*>(nullptr),
-                        static_cast<int*>(nullptr), num_elements, op);
+  gpuprim::DeviceSelect::If(nullptr, workspace_size,
+                            static_cast<float*>(nullptr),
+                            static_cast<float*>(nullptr),
+                            static_cast<int*>(nullptr), num_elements, op);
   TF_RETURN_IF_ERROR(context->allocate_temp(
       DataType::DT_FLOAT, TensorShape({num_elements}), &scratch_output));
 
@@ -385,17 +391,17 @@ Status CountIf(OpKernelContext* context, const float* dev_array, const Op& op,
       DataType::DT_INT8, TensorShape({(int64)workspace_size}), &workspace));
   TF_RETURN_IF_ERROR(context->allocate_temp(DataType::DT_INT32,
                                             TensorShape({1}), &element_count));
-  cudaEvent_t copy_done;
+  gpuEvent_t copy_done;
   TF_RETURN_IF_CUDA_ERROR(
-      cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming));
-  TF_RETURN_IF_CUDA_ERROR(cub::DeviceSelect::If(
+      gpuEventCreateWithFlags(&copy_done, gpuEventDisableTiming));
+  TF_RETURN_IF_CUDA_ERROR(gpuprim::DeviceSelect::If(
       workspace.flat<int8>().data(), workspace_size, dev_array,
       scratch_output.flat<float>().data(), element_count.flat<int32>().data(),
       num_elements, op, cuda_stream));
   device.memcpyDeviceToHost(result, element_count.flat<int32>().data(),
                             sizeof(int));
-  TF_RETURN_IF_CUDA_ERROR(cudaEventRecord(copy_done, device.stream()));
-  TF_RETURN_IF_CUDA_ERROR(cudaEventSynchronize(copy_done));
+  TF_RETURN_IF_CUDA_ERROR(gpuEventRecord(copy_done, device.stream()));
+  TF_RETURN_IF_CUDA_ERROR(gpuEventSynchronize(copy_done));
   return Status::OK();
 }
 
@@ -418,7 +424,7 @@ Status DoNMS(OpKernelContext* context, const Tensor& boxes,
     return Status::OK();
   }
 
-  cudaError_t cuda_ret = cub::DeviceRadixSort::SortPairsDescending(
+  cudaError_t cuda_ret = gpuprim::DeviceRadixSort::SortPairsDescending(
       nullptr, cub_sort_temp_storage_bytes,
       static_cast<float*>(nullptr),  // scores
      static_cast<float*>(nullptr),  // sorted scores
@@ -458,7 +464,7 @@ Status DoNMS(OpKernelContext* context, const Tensor& boxes,
                               config.virtual_thread_count, 0,
                               d_indices.flat<int>().data()));
   TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
-  cuda_ret = cub::DeviceRadixSort::SortPairsDescending(
+  cuda_ret = gpuprim::DeviceRadixSort::SortPairsDescending(
       d_cub_sort_buffer.flat<int8>().data(), cub_sort_temp_storage_bytes,
       scores.flat<float>().data(), d_sorted_scores.flat<float>().data(),
       d_indices.flat<int>().data(), d_sorted_indices.flat<int>().data(),
diff --git a/tensorflow/core/kernels/non_max_suppression_op.h b/tensorflow/core/kernels/non_max_suppression_op.h
index eaa1b28ad4b..24957c2bbed 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.h
+++ b/tensorflow/core/kernels/non_max_suppression_op.h
@@ -35,7 +35,7 @@ struct NonMaxSuppression {
 
 }  // namespace functor
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 extern const int kNmsBoxesPerTread;
 
 // Given descending sorted box list, apply non-maximal-suppression with given
diff --git a/tensorflow/core/kernels/non_max_suppression_op_gpu_test.cc b/tensorflow/core/kernels/non_max_suppression_op_gpu_test.cc
index 8dcb9c77a41..57f812d410e 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_gpu_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_gpu_test.cc
@@ -35,7 +35,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // These tests are copied from non_max_suppression_op_test.cc file and modified
 // to use GPU ops. See other file for test details.
diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc
index c6f751d196c..c535fe66601 100644
--- a/tensorflow/core/kernels/ops_testutil.cc
+++ b/tensorflow/core/kernels/ops_testutil.cc
@@ -122,7 +122,8 @@ void OpsTestBase::SetDevice(const DeviceType& device_type,
   }
 #else
   CHECK_NE(device_type, DEVICE_GPU)
-      << "Requesting GPU on binary compiled without GOOGLE_CUDA or TENSORFLOW_USE_ROCM.";
+      << "Requesting GPU on binary compiled without GOOGLE_CUDA or "
+         "TENSORFLOW_USE_ROCM.";
   allocator_ = device_->GetAllocator(AllocatorAttributes());
 #endif
 }
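
Note: the portability layer that defines the gpuprim:: namespace and the gpuEvent_* names used by this patch is not part of the diff itself. The sketch below illustrates how such a shim typically maps those names onto the CUDA (cub) or ROCm (hipCUB/HIP) equivalents; the header paths and exact form of TensorFlow's own definitions (e.g. in tensorflow/core/kernels/gpu_prim.h and tensorflow/core/util/gpu_device_functions.h) may differ.

// Illustrative sketch only, assuming a single shim header selected by the
// same GOOGLE_CUDA / TENSORFLOW_USE_ROCM macros used in the patch above.
#if GOOGLE_CUDA
#include "cub/cub.cuh"
// gpuprim::DeviceSelect, gpuprim::DeviceRadixSort, ... resolve to CUB.
namespace gpuprim = ::cub;
using gpuEvent_t = cudaEvent_t;
#define gpuEventCreateWithFlags cudaEventCreateWithFlags
#define gpuEventDisableTiming cudaEventDisableTiming
#define gpuEventRecord cudaEventRecord
#define gpuEventSynchronize cudaEventSynchronize
#define gpuEventDestroy cudaEventDestroy
#elif TENSORFLOW_USE_ROCM
#include "hipcub/hipcub.hpp"
// hipCUB exposes the same API surface as CUB, backed by rocPRIM.
namespace gpuprim = ::hipcub;
using gpuEvent_t = hipEvent_t;
#define gpuEventCreateWithFlags hipEventCreateWithFlags
#define gpuEventDisableTiming hipEventDisableTiming
#define gpuEventRecord hipEventRecord
#define gpuEventSynchronize hipEventSynchronize
#define gpuEventDestroy hipEventDestroy
#endif

With a shim of this shape, kernel and host code such as NmsGpu and CountIf above can call gpuprim::DeviceSelect::Flagged or gpuEventRecord once and compile unchanged for both the CUDA and ROCm builds.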