Combine legacy and non-legacy deterministic resize_bilinear CUDA back-prop kernels

Duncan Riach 2020-05-11 20:11:12 -07:00
parent 116db3235a
commit 294065e9f6
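Before this change, the deterministic resize_bilinear gradient had two nearly identical CUDA kernels: ResizeBilinearDeterministicGradKernel for half-pixel centers, which maps an index i through (i + 0.5) * scale - 0.5, and LegacyResizeBilinearDeterministicGradKernel, which maps i through i * scale. The two transforms differ only by a constant shift, so the shared kernel now takes a float offset argument (0.5 in half-pixel-centers mode, 0 in legacy mode) and the legacy copy is deleted. With offset = 0 the unified arithmetic reduces exactly to the legacy expressions, so neither mode changes numerically, and each dtype needs one template instantiation instead of two. The deterministic path is gated by RequireDeterminism() (visible in the launch-site hunk below), which in TensorFlow of this vintage is presumably driven by the TF_DETERMINISTIC_OPS environment variable.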


@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2016-2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -235,7 +235,7 @@ __global__ void ResizeBilinearDeterministicGradKernel(
     float height_scale, float inverse_height_scale, float width_scale,
     float inverse_width_scale, int batch, int original_height,
     int original_width, int channels, int resized_height, int resized_width,
-    T* __restrict__ output_grad) {
+    float offset, T* __restrict__ output_grad) {
   GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
     // out_idx = c + channels * (x + original_width * (y + original_height * b))
     int idx = out_idx;
@@ -247,11 +247,11 @@ __global__ void ResizeBilinearDeterministicGradKernel(
     const int b = idx / original_height;
     int in_y_start = max(0, __float2int_ru(
-        (out_y_center - 1 + 0.5) * inverse_height_scale - 0.5));
-    const float out_y_start = (in_y_start + 0.5) * height_scale - 0.5;
+        (out_y_center - 1 + offset) * inverse_height_scale - offset));
+    const float out_y_start = (in_y_start + offset) * height_scale - offset;
     int in_x_start = max(0, __float2int_ru(
-        (out_x_center - 1 + 0.5) * inverse_width_scale - 0.5));
-    const float out_x_start = (in_x_start + 0.5) * width_scale - 0.5;
+        (out_x_center - 1 + offset) * inverse_width_scale - offset));
+    const float out_x_start = (in_x_start + offset) * width_scale - offset;
     T acc = 0;
     // For clarity, prior to C++17, while loops are preferable to for loops here
     float out_y = out_y_start; int in_y = in_y_start;
@@ -389,55 +389,6 @@ __global__ void LegacyResizeBilinearGradKernel(
   }
 }
 
-template <typename T>
-__global__ void LegacyResizeBilinearDeterministicGradKernel(
-    const int32 nthreads, const float* __restrict__ input_grad,
-    float height_scale, float inverse_height_scale, float width_scale,
-    float inverse_width_scale, int batch, int original_height,
-    int original_width, int channels, int resized_height, int resized_width,
-    T* __restrict__ output_grad) {
-  GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
-    // out_idx = c + channels * (x + original_width * (y + original_height * b))
-    int idx = out_idx;
-    const int c = idx % channels;
-    idx /= channels;
-    const int out_x_center = idx % original_width;
-    idx /= original_width;
-    const int out_y_center = idx % original_height;
-    const int b = idx / original_height;
-    int in_y_start = max(0, __float2int_ru(
-        (out_y_center - 1) * inverse_height_scale));
-    const float out_y_start = in_y_start * height_scale;
-    int in_x_start = max(0, __float2int_ru(
-        (out_x_center - 1) * inverse_width_scale));
-    const float out_x_start = in_x_start * width_scale;
-    T acc = 0;
-    // For clarity, prior to C++17, while loops are preferable to for loops here
-    float out_y = out_y_start; int in_y = in_y_start;
-    while (out_y < out_y_center + 1 && in_y < resized_height) {
-      float out_x = out_x_start; int in_x = in_x_start;
-      while (out_x < out_x_center + 1 && in_x < resized_width) {
-        int in_idx = ((b * resized_height + in_y) * resized_width + in_x) *
-            channels + c;
-        // Clamping to zero is unnecessary because out_x and out_y will never
-        // be less than zero in legacy mode.
-        // Clamping to height/width is necessary when upscaling.
-        float out_y_clamped = fminf(out_y, original_height - 1);
-        float out_x_clamped = fminf(out_x, original_width - 1);
-        float y_lerp = (1 - fabsf(out_y_clamped - out_y_center));
-        float x_lerp = (1 - fabsf(out_x_clamped - out_x_center));
-        acc += static_cast<T>(input_grad[in_idx] * y_lerp * x_lerp);
-        out_x += width_scale;
-        in_x++;
-      }
-      out_y += height_scale;
-      in_y++;
-    }
-    output_grad[out_idx] = acc;
-  }
-}
-
 } // namespace
 
 namespace functor {
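What both the surviving kernel and the deleted legacy copy compute is a gather-style reduction: rather than scattering each element of input_grad into output_grad with atomic adds, whose execution order (and therefore float rounding) varies between runs, every output element walks the resized pixels whose bilinear footprint covers it and accumulates their weighted gradients in a fixed order. Here is a serial CPU sketch of that reduction for a single-channel image; it is illustrative only, GatherGrad is a hypothetical name, and the batch/channel indexing from the kernel is omitted.

#include <algorithm>
#include <cmath>

float GatherGrad(const float* input_grad,  // gradient w.r.t. resized image
                 int resized_height, int resized_width, int original_height,
                 int original_width, int out_y_center, int out_x_center,
                 float height_scale, float width_scale, float offset) {
  // First resized row/column that can touch this output pixel (mirrors the
  // in_y_start / in_x_start computation in the kernel above).
  int in_y = std::max(0, static_cast<int>(std::ceil(
                 (out_y_center - 1 + offset) / height_scale - offset)));
  int in_x_start = std::max(0, static_cast<int>(std::ceil(
                       (out_x_center - 1 + offset) / width_scale - offset)));
  float out_y = (in_y + offset) * height_scale - offset;
  const float out_x_start = (in_x_start + offset) * width_scale - offset;
  float acc = 0;
  while (out_y < out_y_center + 1 && in_y < resized_height) {
    float out_x = out_x_start;
    int in_x = in_x_start;
    while (out_x < out_x_center + 1 && in_x < resized_width) {
      // Clamp into the image (edge pixels absorb out-of-range samples),
      // then apply the tent-shaped bilinear weight.
      float yc = std::min(std::max(out_y, 0.0f), original_height - 1.0f);
      float xc = std::min(std::max(out_x, 0.0f), original_width - 1.0f);
      float y_lerp = 1 - std::fabs(yc - out_y_center);
      float x_lerp = 1 - std::fabs(xc - out_x_center);
      acc += input_grad[in_y * resized_width + in_x] * y_lerp * x_lerp;
      out_x += width_scale;
      in_x++;
    }
    out_y += height_scale;
    in_y++;
  }
  return acc;
}

Because each output element is written exactly once by one thread, the kernel needs no atomics and its result is bit-reproducible run to run.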
@@ -529,25 +480,17 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     config = GetGpuLaunchConfig(total_count, d);
     if (RequireDeterminism()) {
-      // The following scale values below should never be zero, enforced by
+      // The scale values below should never be zero, enforced by
       // ImageResizerGradientState
       float inverse_height_scale = 1 / height_scale;
       float inverse_width_scale = 1 / width_scale;
-      if (half_pixel_centers) {
-        TF_CHECK_OK(GpuLaunchKernel(
-            ResizeBilinearDeterministicGradKernel<T>, config.block_count,
-            config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
-            input_grad.data(), height_scale, inverse_height_scale, width_scale,
-            inverse_width_scale, batch, original_height, original_width,
-            channels, resized_height, resized_width, output_grad.data()));
-      } else {
-        TF_CHECK_OK(GpuLaunchKernel(
-            LegacyResizeBilinearDeterministicGradKernel<T>, config.block_count,
-            config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
-            input_grad.data(), height_scale, inverse_height_scale, width_scale,
-            inverse_width_scale, batch, original_height, original_width,
-            channels, resized_height, resized_width, output_grad.data()));
-      }
+      float offset = half_pixel_centers ? 0.5 : 0;
+      TF_CHECK_OK(GpuLaunchKernel(
+          ResizeBilinearDeterministicGradKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          input_grad.data(), height_scale, inverse_height_scale, width_scale,
+          inverse_width_scale, batch, original_height, original_width,
+          channels, resized_height, resized_width, offset, output_grad.data()));
     } else {
       // Initialize output_grad with all zeros.
       TF_CHECK_OK(GpuLaunchKernel(