Merge pull request #28758 from ThisIsIsaac:master

PiperOrigin-RevId: 256383659
commit 81acfa851e
TensorFlower Gardener, 2019-07-03 10:58:23 -07:00

@@ -31,6 +31,87 @@ typedef Eigen::GpuDevice GPUDevice;
namespace {

template <typename T>
__global__ void ResizeBilinearKernel_faster(
    const int num_channel_threads, const T* __restrict__ images,
    float height_scale, float width_scale, int batch, int in_height,
    int in_width, int channels, int out_height, int out_width,
    float* __restrict__ output) {
  constexpr int kChannelsPerThread = 16 / sizeof(T);
  for (int out_idx = blockIdx.x * blockDim.x + threadIdx.x;
       out_idx < out_width * out_height * num_channel_threads;
       out_idx += blockDim.x * gridDim.x) {
    // Decompose the flat index into a channel-group offset and an output
    // pixel (x, y); each group of num_channel_threads threads shares a pixel.
    int idx = out_idx;
    const int c_start = idx % num_channel_threads;
    idx /= num_channel_threads;
    const int x = idx % out_width;
    idx /= out_width;
    const int y = idx % out_height;

    // Source coordinates and interpolation weights (half-pixel centers).
    const float in_y = (static_cast<float>(y) + 0.5f) * height_scale - 0.5f;
    const int top_y_index = in_y > 0.0 ? floorf(in_y) : 0;
    const int bottom_y_index =
        (in_y < in_height - 1) ? ceilf(in_y) : in_height - 1;
    const float y_lerp = in_y - floorf(in_y);

    const float in_x = (static_cast<float>(x) + 0.5f) * width_scale - 0.5f;
    const int left_x_index = in_x > 0.0 ? floorf(in_x) : 0;
    const int right_x_index =
        (in_x < in_width - 1) ? ceilf(in_x) : in_width - 1;
    const float x_lerp = in_x - left_x_index;

    float top_left_reg[kChannelsPerThread];
    float top_right_reg[kChannelsPerThread];
    float bottom_left_reg[kChannelsPerThread];
    float bottom_right_reg[kChannelsPerThread];
    float out_reg[kChannelsPerThread];
    for (int b = 0; b < batch; b++) {
      for (int c = c_start * kChannelsPerThread; c < channels;
           c += kChannelsPerThread * num_channel_threads) {
        // 16-byte reads from global memory, cached in registers.
        ((float4*)top_left_reg)[0] =
            ((float4*)images)[(((b * in_height + top_y_index) * in_width +
                                left_x_index) *
                                   channels +
                               c) /
                              4];
        ((float4*)top_right_reg)[0] =
            ((float4*)images)[(((b * in_height + top_y_index) * in_width +
                                right_x_index) *
                                   channels +
                               c) /
                              4];
        ((float4*)bottom_left_reg)[0] =
            ((float4*)images)[(((b * in_height + bottom_y_index) * in_width +
                                left_x_index) *
                                   channels +
                               c) /
                              4];
        ((float4*)bottom_right_reg)[0] =
            ((float4*)images)[(((b * in_height + bottom_y_index) * in_width +
                                right_x_index) *
                                   channels +
                               c) /
                              4];
#pragma unroll
        for (int unroll = 0; unroll < kChannelsPerThread; ++unroll) {
          const float top =
              top_left_reg[unroll] +
              (top_right_reg[unroll] - top_left_reg[unroll]) * x_lerp;
          const float bottom =
              bottom_left_reg[unroll] +
              (bottom_right_reg[unroll] - bottom_left_reg[unroll]) * x_lerp;
          out_reg[unroll] = top + (bottom - top) * y_lerp;
        }
        // Write the interpolated channel group back with one 16-byte store.
        ((float4*)
             output)[(((b * out_height + y) * out_width + x) * channels + c) /
                     4] = ((float4*)out_reg)[0];
      }
    }
  }
}
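For intuition: the flat virtual-thread index above decomposes into a channel-group offset plus one (x, y) output pixel, and each thread then strides through the channel dimension in float4-sized chunks. The standalone sketch below is hypothetical (not part of this commit) and only replays that arithmetic on the host, assuming T = float so kChannelsPerThread = 16 / sizeof(float) = 4.

// Hypothetical host-side replay of the indexing in ResizeBilinearKernel_faster
// (illustration only, not part of the commit). Assumes T = float.
#include <cstdio>

int main() {
  const int out_width = 8, out_height = 4;
  const int num_channel_threads = 8;  // cooperating threads per output pixel
  const int kChannelsPerThread = static_cast<int>(16 / sizeof(float));  // 4

  // Decompose one flat work-item index exactly as the kernel does.
  int idx = 131;
  const int c_start = idx % num_channel_threads;  // channel-group offset
  idx /= num_channel_threads;
  const int x = idx % out_width;   // output column
  idx /= out_width;
  const int y = idx % out_height;  // output row

  // This thread reads channels c_start*4, c_start*4 + 32, c_start*4 + 64, ...
  std::printf("c_start=%d x=%d y=%d first_c=%d stride=%d\n", c_start, x, y,
              c_start * kChannelsPerThread,
              kChannelsPerThread * num_channel_threads);
  return 0;
}

Because consecutive thread indices differ only in c_start while sharing (x, y), eight neighbouring threads load eight consecutive 16-byte float4 chunks of the same pixel, which is the 128-byte coalesced access the launch code below aims for.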
template <typename T>
__global__ void ResizeBilinearKernel(const int32 nthreads, const T* images,
                                     float height_scale, float width_scale,
@@ -279,21 +360,36 @@ struct ResizeBilinear<GPUDevice, T> {
    if (total_count == 0) return;

    GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);

    // The legacy kernel is used when centers are not at half-pixel.
    void (*kernel)(const int num_threads, const T* __restrict__ images,
                   float height_scale, float width_scale, int batch,
                   int in_height, int in_width, int channels, int out_height,
                   int out_width, float* __restrict__ output) =
        LegacyResizeBilinearKernel<T>;

    if (half_pixel_centers) {
      kernel = ResizeBilinearKernel<T>;
      // 16 bytes per thread and 8 threads per pixel give coalesced 128-byte
      // global memory accesses.
      constexpr int max_num_threads_per_pixel = 8;
      constexpr int channels_per_thread = 16 / sizeof(T);
      if (channels % channels_per_thread == 0 &&
          std::is_same<float, T>::value) {
        int num_threads_per_pixel =
            std::min(max_num_threads_per_pixel, channels / channels_per_thread);
        config = GetGpuLaunchConfig(
            out_height * out_width * num_threads_per_pixel, d);
        // Pass the per-pixel thread count to the kernel as its
        // num_channel_threads argument.
        config.virtual_thread_count = num_threads_per_pixel;
        kernel = ResizeBilinearKernel_faster<T>;
      }
    }

    TF_CHECK_OK(
        GpuLaunchKernel(kernel, config.block_count, config.thread_per_block, 0,
                        d.stream(), config.virtual_thread_count, images.data(),
                        height_scale, width_scale, batch, in_height, in_width,
                        channels, out_height, out_width, output.data()));
  }
};
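As a quick check of the fast-path dispatch above: the faster kernel is only selected when half_pixel_centers is set, T is float, and channels is a multiple of 16 / sizeof(float) = 4; the grid is then sized for out_height * out_width * num_threads_per_pixel work items, while virtual_thread_count is repurposed to carry num_threads_per_pixel into the kernel. The sketch below is a hypothetical host-side walk-through of that arithmetic, not TensorFlow code.

// Hypothetical walk-through of the fast-path launch arithmetic above
// (illustration only, not part of the commit). Assumes T = float, NHWC input.
#include <algorithm>
#include <cstdio>

int main() {
  const int out_height = 224, out_width = 224, channels = 64;
  constexpr int max_num_threads_per_pixel = 8;  // 8 threads x 16 B = 128 B
  constexpr int channels_per_thread = 16 / sizeof(float);  // 4 floats per float4

  // Fast path requires the channel count to split evenly into float4 groups.
  if (channels % channels_per_thread != 0) {
    std::printf("channels not a multiple of 4: fall back to the plain kernel\n");
    return 0;
  }
  const int num_threads_per_pixel =
      std::min(max_num_threads_per_pixel, channels / channels_per_thread);

  // One work item per (output pixel, channel group); the kernel receives
  // num_threads_per_pixel as its num_channel_threads argument.
  const int work_items = out_height * out_width * num_threads_per_pixel;
  std::printf("num_threads_per_pixel=%d work_items=%d\n", num_threads_per_pixel,
              work_items);
  return 0;
}

With channels = 64 this yields num_threads_per_pixel = 8 and 224 * 224 * 8 = 401408 work items; for any channel count not divisible by 4, or for non-float types, the dispatch keeps the original ResizeBilinearKernel.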