From f65efd739cd93c0deeb1e0ae8c5a64d5250cac02 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 7 Jun 2020 23:38:32 -0700 Subject: [PATCH] Optimize bilinear resizing using vectorization. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` name old time/op new time/op delta BM_Resize 4.04ms ± 2% 2.84ms ± 2% -29.71% (p=0.000 n=10+10) ``` PiperOrigin-RevId: 315219756 Change-Id: Idcdc00c31199060c67665aeb52b24f495664dbdf --- tensorflow/core/kernels/resize_bilinear_op.cc | 148 +++++++++++++----- .../core/kernels/resize_bilinear_op_test.cc | 36 +++++ 2 files changed, 149 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index 57c39d1ce53..a0673fea73d 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -18,6 +18,10 @@ limitations under the License. #include "tensorflow/core/kernels/resize_bilinear_op.h" +#ifdef __SSE4_1__ +#include <xmmintrin.h> +#endif + #include <memory> #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -107,6 +111,107 @@ inline float compute_lerp(const float top_left, const float top_right, return top + (bottom - top) * y_lerp; } +#ifdef __SSE4_1__ +/* Vector version of the above */ +inline __m128 compute_lerp_v(const __m128 top_left, const __m128 top_right, + const __m128 bottom_left, + const __m128 bottom_right, const __m128 x_lerp, + const __m128 y_lerp) { + const __m128 top = + _mm_add_ps(top_left, _mm_mul_ps(_mm_sub_ps(top_right, top_left), x_lerp)); + const __m128 bottom = _mm_add_ps( + bottom_left, _mm_mul_ps(_mm_sub_ps(bottom_right, bottom_left), x_lerp)); + return _mm_add_ps(top, _mm_mul_ps(_mm_sub_ps(bottom, top), y_lerp)); +} +#endif + +template <typename T> +void ResizeLine3Channels(const T* const ys_input_lower_ptr, + const T* const ys_input_upper_ptr, + const CachedInterpolation* const xs, + const float
ys_lerp, const int64 out_width, + float* out_y) { + for (int64 x = 0; x < out_width; ++x) { + const int64 xs_lower = xs[x].lower; + const int64 xs_upper = xs[x].upper; + const float xs_lerp = xs[x].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. + const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Read channel 2. + const float top_left2(ys_input_lower_ptr[xs_lower + 2]); + const float top_right2(ys_input_lower_ptr[xs_upper + 2]); + const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); + const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); + + // Compute output. + out_y[x * 3 + 0] = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + out_y[x * 3 + 1] = compute_lerp(top_left1, top_right1, bottom_left1, + bottom_right1, xs_lerp, ys_lerp); + out_y[x * 3 + 2] = compute_lerp(top_left2, top_right2, bottom_left2, + bottom_right2, xs_lerp, ys_lerp); + } +} + +#ifdef __SSE4_1__ + +// Load 3 floats from the given buffer, which must be of size at least 4. +template <typename T> +inline __m128 load_3xfloat_v(T* values) { + return _mm_set_ps(0.0f, static_cast<float>(values[2]), + static_cast<float>(values[1]), + static_cast<float>(values[0])); +} + +// Specialize cases that can be done more efficiently.
+template <> +inline __m128 load_3xfloat_v(float* values) { + return _mm_loadu_ps(values); +} + +template <typename T> +void ResizeLine3ChannelsVector(const T* const ys_input_lower_ptr, + const T* const ys_input_upper_ptr, + const CachedInterpolation* const xs, + const float ys_lerp, const int64 out_width, + float* out_y) { + const __m128 ys_lerp_v = _mm_set1_ps(ys_lerp); + // All pixels but the last one can overflow, vectorize the inside of the + // row. + int64 x = 0; + for (x = 0; x < out_width - 1; ++x) { + const int64 xs_lower = xs[x].lower; + const int64 xs_upper = xs[x].upper; + const __m128 xs_lerp_v = _mm_set1_ps(xs[x].lerp); + + const __m128 top_left_v = load_3xfloat_v(ys_input_lower_ptr + xs_lower); + const __m128 top_right_v = load_3xfloat_v(ys_input_lower_ptr + xs_upper); + const __m128 bottom_left_v = load_3xfloat_v(ys_input_upper_ptr + xs_lower); + const __m128 bottom_right_v = load_3xfloat_v(ys_input_upper_ptr + xs_upper); + + _mm_storeu_ps(out_y + x * 3, + compute_lerp_v(top_left_v, top_right_v, bottom_left_v, + bottom_right_v, xs_lerp_v, ys_lerp_v)); + } + // The last pixel of each row must be done in a non-vectorized way + // because we cannot overflow. + ResizeLine3Channels(ys_input_lower_ptr, ys_input_upper_ptr, + xs + out_width - 1, ys_lerp, 1, + out_y + (out_width - 1) * 3); +} +#endif + template <typename T> void resize_image( typename TTypes<T, 4>::ConstTensor images, const int batch_size, @@ -136,41 +241,13 @@ void resize_image(typename TTypes<T, 4>::ConstTensor images, for (int64 y = 0; y < out_height; ++y) { const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; - const float ys_lerp = ys[y].lerp; - for (int64 x = 0; x < out_width; ++x) { - const int64 xs_lower = xs[x].lower; - const int64 xs_upper = xs[x].upper; - const float xs_lerp = xs[x].lerp; - - // Read channel 0.
- const float top_left0(ys_input_lower_ptr[xs_lower + 0]); - const float top_right0(ys_input_lower_ptr[xs_upper + 0]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); - - // Read channel 1. - const float top_left1(ys_input_lower_ptr[xs_lower + 1]); - const float top_right1(ys_input_lower_ptr[xs_upper + 1]); - const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); - const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); - - // Read channel 2. - const float top_left2(ys_input_lower_ptr[xs_lower + 2]); - const float top_right2(ys_input_lower_ptr[xs_upper + 2]); - const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); - const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); - - // Compute output. - output_y_ptr[x * channels + 0] = - compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0, - xs_lerp, ys_lerp); - output_y_ptr[x * channels + 1] = - compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1, - xs_lerp, ys_lerp); - output_y_ptr[x * channels + 2] = - compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2, - xs_lerp, ys_lerp); - } +#ifdef __SSE4_1__ + ResizeLine3ChannelsVector(ys_input_lower_ptr, ys_input_upper_ptr, xs, + ys[y].lerp, out_width, output_y_ptr); +#else + ResizeLine3Channels(ys_input_lower_ptr, ys_input_upper_ptr, xs, + ys[y].lerp, out_width, output_y_ptr); +#endif output_y_ptr += out_row_size; } input_b_ptr += in_batch_num_values; @@ -338,6 +415,7 @@ struct ResizeBilinearGrad<CPUDevice, T> { static_cast<Eigen::Index>(ceilf(in_x)), original_width - 1); const float x_lerp = in_x - floorf(in_x); const float inverse_x_lerp = (1.0f - x_lerp); + // TODO(b/158287314): Look into vectorizing this.
for (Eigen::Index c = 0; c < channels; ++c) { output_grad(b, top_y_index, left_x_index, c) += T(input_grad(b, y, x, c) * inverse_y_lerp * inverse_x_lerp); diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc index 4873b49612d..df00ca281e7 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_test.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { @@ -543,4 +544,39 @@ INSTANTIATE_TEST_SUITE_P(ResizeBilinearOpAlignCornersTestGpu, ResizeBilinearOpAlignCornersTest, ::testing::Values(TestDevice::GPU)); #endif // GOOGLE_CUDA + +class ResizeBM : public ResizeBilinearOpTest { + public: + void TestBody() override {} + void SetUpBenchmark(int input_width, int input_height, int num_channels, + int output_width, int output_height) { + TF_EXPECT_OK(NodeDefBuilder("resize_bilinear_op", "ResizeBilinear") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT32)) + .Attr("align_corners", align_corners_) + .Attr("half_pixel_centers", half_pixel_centers_) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + const TensorShape shape( + {/*batch_size*/ 1, input_width, input_height, num_channels}); + SetRandomImageInput(shape); + AddInputFromArray<int32>(TensorShape({2}), {output_width, output_height}); + } + + using ResizeBilinearOpTest::RunOpKernel; +}; + +#ifdef PLATFORM_GOOGLE + +void BM_Resize(benchmark::State& state) { + ResizeBM bench; + bench.SetUpBenchmark(640, 480, 3, 1024, 768); + for (const auto _ : state) { + CHECK(bench.RunOpKernel().ok()); + } +} +BENCHMARK(BM_Resize); + +#endif + } // namespace tensorflow