Improve resize_bicubic performance by reorganizing loops (#13840)
* Improve resize_bicubic performance by reorganizing loops

  This fix tries to address the issue raised in #13693, where the performance of `resize_bicubic` is not on par with OpenCV. It rearranges the loops so that the same code path is used for num_channel=40 and num_channel=3.

  Pre-fix:
  ```
  CHANNEL=40
  opencv: 145.08ms
  tf: 314.26ms
  CHANNEL=3
  opencv: 11.95ms
  tf: 8.95ms
  ```

  Post-fix:
  ```
  CHANNEL=40
  opencv: 144.25ms
  tf: 214.55ms
  CHANNEL=3
  opencv: 11.78ms
  tf: 14.07ms
  ```

  This fix fixes #13693.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Keep special handling of `num_channels=3` for `resize_bicubic`

  This commit keeps the special handling of `num_channels=3` for `resize_bicubic`.

  Without special handling:
  ```
  opencv: 11.78ms
  tf: 14.07ms
  ```

  With special handling:
  ```
  opencv: 11.74ms
  tf: 9.46ms
  ```

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Expand Benchmark test for resize_bicubic

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update from review feedback.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
This commit is contained in:
parent
b927df57f0
commit
17096081ee
@ -20,7 +20,6 @@ limitations under the License.
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
|
|
||||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
|
||||||
#include "tensorflow/core/framework/op_kernel.h"
|
#include "tensorflow/core/framework/op_kernel.h"
|
||||||
#include "tensorflow/core/framework/register_types.h"
|
#include "tensorflow/core/framework/register_types.h"
|
||||||
#include "tensorflow/core/framework/tensor.h"
|
#include "tensorflow/core/framework/tensor.h"
|
||||||
@ -29,6 +28,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/kernels/image_resizer_state.h"
|
#include "tensorflow/core/kernels/image_resizer_state.h"
|
||||||
#include "tensorflow/core/lib/core/status.h"
|
#include "tensorflow/core/lib/core/status.h"
|
||||||
#include "tensorflow/core/platform/logging.h"
|
#include "tensorflow/core/platform/logging.h"
|
||||||
|
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
namespace {
|
namespace {
|
||||||
@ -235,6 +235,7 @@ inline void interpolate_with_caching(
|
|||||||
|
|
||||||
const T* input_b_ptr = input_data.data();
|
const T* input_b_ptr = input_data.data();
|
||||||
float* output_y_ptr = output_data.data();
|
float* output_y_ptr = output_data.data();
|
||||||
|
std::vector<float> cached_value(num_channels == 3 ? 0 : 4 * num_channels, 0);
|
||||||
|
|
||||||
for (int64 b = 0; b < resizer_state.batch_size;
|
for (int64 b = 0; b < resizer_state.batch_size;
|
||||||
++b, input_b_ptr += in_batch_width) {
|
++b, input_b_ptr += in_batch_width) {
|
||||||
@ -248,6 +249,7 @@ inline void interpolate_with_caching(
|
|||||||
const T* y_ptr_1 = input_b_ptr + y_wai.index_1 * in_row_width;
|
const T* y_ptr_1 = input_b_ptr + y_wai.index_1 * in_row_width;
|
||||||
const T* y_ptr_2 = input_b_ptr + y_wai.index_2 * in_row_width;
|
const T* y_ptr_2 = input_b_ptr + y_wai.index_2 * in_row_width;
|
||||||
const T* y_ptr_3 = input_b_ptr + y_wai.index_3 * in_row_width;
|
const T* y_ptr_3 = input_b_ptr + y_wai.index_3 * in_row_width;
|
||||||
|
|
||||||
if (num_channels == 3) {
|
if (num_channels == 3) {
|
||||||
// Manually unroll case of 3 channels.
|
// Manually unroll case of 3 channels.
|
||||||
float cached_value_0[4] = {0};
|
float cached_value_0[4] = {0};
|
||||||
@ -330,48 +332,61 @@ inline void interpolate_with_caching(
|
|||||||
x_wai.weight_2, x_wai.weight_3);
|
x_wai.weight_2, x_wai.weight_3);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64 c = 0; c < num_channels; ++c) {
|
for (int64 x = 0; x < resizer_state.out_width; ++x) {
|
||||||
float cached_value[4] = {0};
|
const WeightsAndIndices& x_wai = x_wais[x];
|
||||||
for (int64 x = 0; x < resizer_state.out_width; ++x) {
|
// Shift values in cached_value to fill first 'advance' values.
|
||||||
const WeightsAndIndices& x_wai = x_wais[x];
|
switch (x_wai.advance) {
|
||||||
// Shift values in cached_value to fill first 'advance' values.
|
case 3:
|
||||||
switch (x_wai.advance) {
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
case 3:
|
cached_value[4 * c + 0] = cached_value[4 * c + 1];
|
||||||
cached_value[0] = cached_value[1];
|
cached_value[4 * c + 1] = cached_value[4 * c + 2];
|
||||||
cached_value[1] = cached_value[2];
|
cached_value[4 * c + 2] = cached_value[4 * c + 3];
|
||||||
cached_value[2] = cached_value[3];
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
cached_value[0] = cached_value[2];
|
|
||||||
cached_value[1] = cached_value[3];
|
|
||||||
break;
|
|
||||||
case 1: {
|
|
||||||
cached_value[0] = cached_value[3];
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
|
cached_value[4 * c + 0] = cached_value[4 * c + 2];
|
||||||
|
cached_value[4 * c + 1] = cached_value[4 * c + 3];
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 1: {
|
||||||
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
|
cached_value[4 * c + 0] = cached_value[4 * c + 3];
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Set the remaining '4-advance' values by computing.
|
// Set the remaining '4-advance' values by computing.
|
||||||
switch (x_wai.advance) {
|
switch (x_wai.advance) {
|
||||||
case 0:
|
case 0:
|
||||||
cached_value[0] = ComputeYInterpolation(
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
|
cached_value[4 * c + 0] = ComputeYInterpolation(
|
||||||
0, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
0, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
||||||
TF_FALLTHROUGH_INTENDED;
|
}
|
||||||
case 1:
|
TF_FALLTHROUGH_INTENDED;
|
||||||
cached_value[1] = ComputeYInterpolation(
|
case 1:
|
||||||
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
|
cached_value[4 * c + 1] = ComputeYInterpolation(
|
||||||
1, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
1, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
||||||
TF_FALLTHROUGH_INTENDED;
|
}
|
||||||
case 2:
|
TF_FALLTHROUGH_INTENDED;
|
||||||
cached_value[2] = ComputeYInterpolation(
|
case 2:
|
||||||
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
|
cached_value[4 * c + 2] = ComputeYInterpolation(
|
||||||
2, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
2, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
||||||
TF_FALLTHROUGH_INTENDED;
|
}
|
||||||
case 3:
|
TF_FALLTHROUGH_INTENDED;
|
||||||
cached_value[3] = ComputeYInterpolation(
|
case 3:
|
||||||
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
|
cached_value[4 * c + 3] = ComputeYInterpolation(
|
||||||
3, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
3, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
|
||||||
break;
|
}
|
||||||
}
|
break;
|
||||||
|
}
|
||||||
|
for (int64 c = 0; c < num_channels; ++c) {
|
||||||
output_y_ptr[x * num_channels + c] =
|
output_y_ptr[x * num_channels + c] =
|
||||||
Compute(cached_value, x_wai.weight_0, x_wai.weight_1,
|
Compute(&cached_value[4 * c], x_wai.weight_0, x_wai.weight_1,
|
||||||
x_wai.weight_2, x_wai.weight_3);
|
x_wai.weight_2, x_wai.weight_3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -251,14 +251,15 @@ TEST_F(ResizeBicubicOpTest, TestAreaRandomDataSeveralInputsSizes4Channels) {
|
|||||||
RunManyRandomTests(4);
|
RunManyRandomTests(4);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Graph* ResizeBicubic(int batch_size, int size, int channels) {
|
static Graph* ResizeBicubic(int batch_size, int size, int channels,
|
||||||
|
float scale_y = 0.3, float scale_x = 0.7) {
|
||||||
Graph* g = new Graph(OpRegistry::Global());
|
Graph* g = new Graph(OpRegistry::Global());
|
||||||
Tensor input(DT_FLOAT, TensorShape({batch_size, size, size, channels}));
|
Tensor input(DT_FLOAT, TensorShape({batch_size, size, size, channels}));
|
||||||
input.flat<float>().setRandom();
|
input.flat<float>().setRandom();
|
||||||
Tensor shape(DT_INT32, TensorShape({2}));
|
Tensor shape(DT_INT32, TensorShape({2}));
|
||||||
auto shape_t = shape.flat<int32>();
|
auto shape_t = shape.flat<int32>();
|
||||||
shape_t(0) = 0.3 * size;
|
shape_t(0) = scale_y * size;
|
||||||
shape_t(1) = 0.7 * size;
|
shape_t(1) = scale_x * size;
|
||||||
test::graph::Binary(g, "ResizeBicubic", test::graph::Constant(g, input),
|
test::graph::Binary(g, "ResizeBicubic", test::graph::Constant(g, input),
|
||||||
test::graph::Constant(g, shape));
|
test::graph::Constant(g, shape));
|
||||||
return g;
|
return g;
|
||||||
@ -285,4 +286,17 @@ BM_ResizeBicubicDev(32, 128, 3);
|
|||||||
BM_ResizeBicubicDev(32, 512, 3);
|
BM_ResizeBicubicDev(32, 512, 3);
|
||||||
BM_ResizeBicubicDev(32, 1024, 3);
|
BM_ResizeBicubicDev(32, 1024, 3);
|
||||||
|
|
||||||
|
#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \
|
||||||
|
static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \
|
||||||
|
testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE * \
|
||||||
|
CHANNELS * 8 * 8); \
|
||||||
|
test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \
|
||||||
|
.Run(iters); \
|
||||||
|
} \
|
||||||
|
BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS);
|
||||||
|
|
||||||
|
BM_ResizeBicubicExpand(12, 48, 1);
|
||||||
|
BM_ResizeBicubicExpand(12, 48, 3);
|
||||||
|
BM_ResizeBicubicExpand(12, 48, 40);
|
||||||
|
|
||||||
} // end namespace tensorflow
|
} // end namespace tensorflow
|
||||||
|
Loading…
Reference in New Issue
Block a user