Combine legacy and non-legacy deterministic resize_bilinear CUDA back-prop kernels

Duncan Riach 2020-05-11 20:11:12 -07:00
parent 116db3235a
commit 294065e9f6
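Before this change, the deterministic resize_bilinear gradient had two nearly identical CUDA kernels: ResizeBilinearDeterministicGradKernel for half-pixel centers, which maps an index i through (i + 0.5) * scale - 0.5, and LegacyResizeBilinearDeterministicGradKernel, which maps i through i * scale. The two transforms differ only by a constant shift, so the shared kernel now takes a float offset argument (0.5 in half-pixel-centers mode, 0 in legacy mode) and the legacy copy is deleted. With offset = 0 the unified arithmetic reduces exactly to the legacy expressions, so neither mode changes numerically, and each dtype needs one template instantiation instead of two. The deterministic path is gated by RequireDeterminism() (visible in the launch-site hunk below), which in TensorFlow of this vintage is presumably driven by the TF_DETERMINISTIC_OPS environment variable.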


@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2016-2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -235,7 +235,7 @@ __global__ void ResizeBilinearDeterministicGradKernel(
     float height_scale, float inverse_height_scale, float width_scale,
     float inverse_width_scale, int batch, int original_height,
     int original_width, int channels, int resized_height, int resized_width,
-    T* __restrict__ output_grad) {
+    float offset, T* __restrict__ output_grad) {
   GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
     // out_idx = c + channels * (x + original_width * (y + original_height * b))
     int idx = out_idx;
@@ -247,11 +247,11 @@ __global__ void ResizeBilinearDeterministicGradKernel(
     const int b = idx / original_height;
     int in_y_start = max(0, __float2int_ru(
-        (out_y_center - 1 + 0.5) * inverse_height_scale - 0.5));
-    const float out_y_start = (in_y_start + 0.5) * height_scale - 0.5;
+        (out_y_center - 1 + offset) * inverse_height_scale - offset));
+    const float out_y_start = (in_y_start + offset) * height_scale - offset;
     int in_x_start = max(0, __float2int_ru(
-        (out_x_center - 1 + 0.5) * inverse_width_scale - 0.5));
-    const float out_x_start = (in_x_start + 0.5) * width_scale - 0.5;
+        (out_x_center - 1 + offset) * inverse_width_scale - offset));
+    const float out_x_start = (in_x_start + offset) * width_scale - offset;
     T acc = 0;
     // For clarity, prior to C++17, while loops are preferable to for loops here
     float out_y = out_y_start; int in_y = in_y_start;
@@ -389,55 +389,6 @@ __global__ void LegacyResizeBilinearGradKernel(
   }
 }
 
-template <typename T>
-__global__ void LegacyResizeBilinearDeterministicGradKernel(
-    const int32 nthreads, const float* __restrict__ input_grad,
-    float height_scale, float inverse_height_scale, float width_scale,
-    float inverse_width_scale, int batch, int original_height,
-    int original_width, int channels, int resized_height, int resized_width,
-    T* __restrict__ output_grad) {
-  GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
-    // out_idx = c + channels * (x + original_width * (y + original_height * b))
-    int idx = out_idx;
-    const int c = idx % channels;
-    idx /= channels;
-    const int out_x_center = idx % original_width;
-    idx /= original_width;
-    const int out_y_center = idx % original_height;
-    const int b = idx / original_height;
-    int in_y_start = max(0, __float2int_ru(
-        (out_y_center - 1) * inverse_height_scale));
-    const float out_y_start = in_y_start * height_scale;
-    int in_x_start = max(0, __float2int_ru(
-        (out_x_center - 1) * inverse_width_scale));
-    const float out_x_start = in_x_start * width_scale;
-    T acc = 0;
-    // For clarity, prior to C++17, while loops are preferable to for loops here
-    float out_y = out_y_start; int in_y = in_y_start;
-    while (out_y < out_y_center + 1 && in_y < resized_height) {
-      float out_x = out_x_start; int in_x = in_x_start;
-      while (out_x < out_x_center + 1 && in_x < resized_width) {
-        int in_idx = ((b * resized_height + in_y) * resized_width + in_x) *
-            channels + c;
-        // Clamping to zero is unnecessary because out_x and out_y will never
-        // be less than zero in legacy mode.
-        // Clamping to height/width is necessary when upscaling.
-        float out_y_clamped = fminf(out_y, original_height - 1);
-        float out_x_clamped = fminf(out_x, original_width - 1);
-        float y_lerp = (1 - fabsf(out_y_clamped - out_y_center));
-        float x_lerp = (1 - fabsf(out_x_clamped - out_x_center));
-        acc += static_cast<T>(input_grad[in_idx] * y_lerp * x_lerp);
-        out_x += width_scale;
-        in_x++;
-      }
-      out_y += height_scale;
-      in_y++;
-    }
-    output_grad[out_idx] = acc;
-  }
-}
-
 } // namespace
 
 namespace functor {
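What both the surviving kernel and the deleted legacy copy compute is a gather-style reduction: rather than scattering each element of input_grad into output_grad with atomic adds, whose execution order (and therefore float rounding) varies between runs, every output element walks the resized pixels whose bilinear footprint covers it and accumulates their weighted gradients in a fixed order. Here is a serial CPU sketch of that reduction for a single-channel image; it is illustrative only, GatherGrad is a hypothetical name, and the batch/channel indexing from the kernel is omitted.

#include <algorithm>
#include <cmath>

float GatherGrad(const float* input_grad,  // gradient w.r.t. resized image
                 int resized_height, int resized_width, int original_height,
                 int original_width, int out_y_center, int out_x_center,
                 float height_scale, float width_scale, float offset) {
  // First resized row/column that can touch this output pixel (mirrors the
  // in_y_start / in_x_start computation in the kernel above).
  int in_y = std::max(0, static_cast<int>(std::ceil(
                 (out_y_center - 1 + offset) / height_scale - offset)));
  int in_x_start = std::max(0, static_cast<int>(std::ceil(
                       (out_x_center - 1 + offset) / width_scale - offset)));
  float out_y = (in_y + offset) * height_scale - offset;
  const float out_x_start = (in_x_start + offset) * width_scale - offset;
  float acc = 0;
  while (out_y < out_y_center + 1 && in_y < resized_height) {
    float out_x = out_x_start;
    int in_x = in_x_start;
    while (out_x < out_x_center + 1 && in_x < resized_width) {
      // Clamp into the image (edge pixels absorb out-of-range samples),
      // then apply the tent-shaped bilinear weight.
      float yc = std::min(std::max(out_y, 0.0f), original_height - 1.0f);
      float xc = std::min(std::max(out_x, 0.0f), original_width - 1.0f);
      float y_lerp = 1 - std::fabs(yc - out_y_center);
      float x_lerp = 1 - std::fabs(xc - out_x_center);
      acc += input_grad[in_y * resized_width + in_x] * y_lerp * x_lerp;
      out_x += width_scale;
      in_x++;
    }
    out_y += height_scale;
    in_y++;
  }
  return acc;
}

Because each output element is written exactly once by one thread, the kernel needs no atomics and its result is bit-reproducible run to run.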
@@ -529,25 +480,17 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     config = GetGpuLaunchConfig(total_count, d);
     if (RequireDeterminism()) {
-      // The following scale values below should never be zero, enforced by
+      // The scale values below should never be zero, enforced by
       // ImageResizerGradientState
       float inverse_height_scale = 1 / height_scale;
       float inverse_width_scale = 1 / width_scale;
-      if (half_pixel_centers) {
-        TF_CHECK_OK(GpuLaunchKernel(
-            ResizeBilinearDeterministicGradKernel<T>, config.block_count,
-            config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
-            input_grad.data(), height_scale, inverse_height_scale, width_scale,
-            inverse_width_scale, batch, original_height, original_width,
-            channels, resized_height, resized_width, output_grad.data()));
-      } else {
-        TF_CHECK_OK(GpuLaunchKernel(
-            LegacyResizeBilinearDeterministicGradKernel<T>, config.block_count,
-            config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
-            input_grad.data(), height_scale, inverse_height_scale, width_scale,
-            inverse_width_scale, batch, original_height, original_width,
-            channels, resized_height, resized_width, output_grad.data()));
-      }
+      float offset = half_pixel_centers ? 0.5 : 0;
+      TF_CHECK_OK(GpuLaunchKernel(
+          ResizeBilinearDeterministicGradKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          input_grad.data(), height_scale, inverse_height_scale, width_scale,
+          inverse_width_scale, batch, original_height, original_width,
+          channels, resized_height, resized_width, offset, output_grad.data()));
     } else {
       // Initialize output_grad with all zeros.
       TF_CHECK_OK(GpuLaunchKernel(