Replace remaining references to GetCudaStream with GetGpuStream.

PiperOrigin-RevId: 255935906
2019-07-01 05:29:47 -07:00 · 2019-07-01 05:29:47 -07:00 · 108af9109e
commit 108af9109e
parent 3840910209
5 changed files with 4 additions and 16 deletions
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -336,7 +336,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                 Tensor* indices_out, DoneCallback done) {
    int32 N = partitions->NumElements();
    const GPUDevice& device = c->eigen_device<GPUDevice>();
-    const auto& cu_stream = GetCudaStream(c);
+    const auto& cu_stream = GetGpuStream(c);

    // Initialize the indices_in tensor using the Range GPU kernel.
    RangeInit(device, 0, 1, N, indices_in->flat<int32>());
@@ -369,7 +369,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                         Tensor* partition_count, Tensor* indices_out,
                         DoneCallback done) {
    const GPUDevice& device = c->eigen_device<GPUDevice>();
-    const auto& cu_stream = GetCudaStream(c);
+    const auto& cu_stream = GetGpuStream(c);
    int32 N = partitions->NumElements();
    Tensor indices_in;
    Tensor partitions_out;
--- a/tensorflow/core/kernels/non_max_suppression_op.cu.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cu.cc
@@ -381,7 +381,7 @@ class NonMaxSuppressionV2GPUOp : public OpKernel {
    }
    const int output_size = max_output_size.scalar<int>()();
    size_t cub_sort_temp_storage_bytes = 0;
-    auto cuda_stream = GetCudaStream(context);
+    auto cuda_stream = GetGpuStream(context);
    auto device = context->eigen_gpu_device();
    // Calling cub with nullptrs as inputs will make it return
    // workspace size needed for the operation instead of doing the operation.
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -148,7 +148,7 @@ class SoftmaxOpGPU : public OpKernel {
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, logits_in_.shape(), &softmax_out));

-    const auto& cu_stream = GetCudaStream(context);
+    const auto& cu_stream = GetGpuStream(context);
    if (logits_in_.NumElements() > 0) {
      Tensor max_logits;
      Tensor sum_probs;
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -80,7 +80,6 @@ inline const char* GpuGetErrorString(hipError_t error) {
 }
 #endif

-// Exact copy from GetCudaStream() in gpu_launch_config.h
 // Returns a raw reference to the current cuda stream. Required by a
 // number of kernel calls (for which StreamInterface* does not work),
 // i.e. CUB and certain cublas primitives.
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -374,17 +374,6 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
 CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);

 #if GOOGLE_CUDA
-// Returns a raw reference to the current cuda stream.  Required by a
-// number of kernel calls (for which StreamInterface* does not work), i.e.
-// CUB and certain cublas primitives.
-inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
-  const cudaStream_t* ptr = CHECK_NOTNULL(
-      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
-                                                ->stream()
-                                                ->implementation()
-                                                ->GpuStreamMemberHack()));
-  return *ptr;
-}
 template <typename DeviceFunc>
 Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
                                         const Eigen::GpuDevice& d,