Improve resize_bicubic performance by reorganizing loops (#13840)

* Improve resize_bicubic performance by reorganizing loops

This fix tries to address the issue raised in #13693, where
performance of `resize_bicubic` is not on par with opencv.

This fix rearranges the loops so that the same code path is taken
regardless of the number of channels, e.g., for both num_channels=40
and num_channels=3:

Pre-fix:
```
CHANNEL=40
opencv: 145.08ms
tf: 314.26ms

CHANNEL=3
opencv: 11.95ms
tf: 8.95ms
```

Post-fix:
```
CHANNEL=40
opencv: 144.25ms
tf: 214.55ms

CHANNEL=3
opencv: 11.78ms
tf: 14.07ms
```

This fix resolves #13693.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Keep special handling of `num_channels=3` for `resize_bicubic`

This commit keeps special handling of `num_channels=3` for
`resize_bicubic`:
Without special handling:
```
opencv: 11.78ms
tf: 14.07ms
```
With special handling:
```
opencv: 11.74ms
tf: 9.46ms
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Expand Benchmark test for resize_bicubic

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update from review feedback.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
This commit is contained in:
Yong Tang 2017-10-21 22:14:40 -07:00 committed by Vijay Vasudevan
parent b927df57f0
commit 17096081ee
2 changed files with 67 additions and 38 deletions

View File

@ -20,7 +20,6 @@ limitations under the License.
#include <algorithm>
#include <array>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
@ -29,6 +28,7 @@ limitations under the License.
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
namespace {
@ -235,6 +235,7 @@ inline void interpolate_with_caching(
const T* input_b_ptr = input_data.data();
float* output_y_ptr = output_data.data();
std::vector<float> cached_value(num_channels == 3 ? 0 : 4 * num_channels, 0);
for (int64 b = 0; b < resizer_state.batch_size;
++b, input_b_ptr += in_batch_width) {
@ -248,6 +249,7 @@ inline void interpolate_with_caching(
const T* y_ptr_1 = input_b_ptr + y_wai.index_1 * in_row_width;
const T* y_ptr_2 = input_b_ptr + y_wai.index_2 * in_row_width;
const T* y_ptr_3 = input_b_ptr + y_wai.index_3 * in_row_width;
if (num_channels == 3) {
// Manually unroll case of 3 channels.
float cached_value_0[4] = {0};
@ -330,48 +332,61 @@ inline void interpolate_with_caching(
x_wai.weight_2, x_wai.weight_3);
}
} else {
for (int64 c = 0; c < num_channels; ++c) {
float cached_value[4] = {0};
for (int64 x = 0; x < resizer_state.out_width; ++x) {
const WeightsAndIndices& x_wai = x_wais[x];
// Shift values in cached_value to fill first 'advance' values.
switch (x_wai.advance) {
case 3:
cached_value[0] = cached_value[1];
cached_value[1] = cached_value[2];
cached_value[2] = cached_value[3];
break;
case 2:
cached_value[0] = cached_value[2];
cached_value[1] = cached_value[3];
break;
case 1: {
cached_value[0] = cached_value[3];
break;
for (int64 x = 0; x < resizer_state.out_width; ++x) {
const WeightsAndIndices& x_wai = x_wais[x];
// Shift values in cached_value to fill first 'advance' values.
switch (x_wai.advance) {
case 3:
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 0] = cached_value[4 * c + 1];
cached_value[4 * c + 1] = cached_value[4 * c + 2];
cached_value[4 * c + 2] = cached_value[4 * c + 3];
}
break;
case 2:
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 0] = cached_value[4 * c + 2];
cached_value[4 * c + 1] = cached_value[4 * c + 3];
}
break;
case 1: {
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 0] = cached_value[4 * c + 3];
}
break;
}
}
// Set the remaining '4-advance' values by computing.
switch (x_wai.advance) {
case 0:
cached_value[0] = ComputeYInterpolation(
// Set the remaining '4-advance' values by computing.
switch (x_wai.advance) {
case 0:
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 0] = ComputeYInterpolation(
0, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
TF_FALLTHROUGH_INTENDED;
case 1:
cached_value[1] = ComputeYInterpolation(
}
TF_FALLTHROUGH_INTENDED;
case 1:
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 1] = ComputeYInterpolation(
1, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
TF_FALLTHROUGH_INTENDED;
case 2:
cached_value[2] = ComputeYInterpolation(
}
TF_FALLTHROUGH_INTENDED;
case 2:
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 2] = ComputeYInterpolation(
2, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
TF_FALLTHROUGH_INTENDED;
case 3:
cached_value[3] = ComputeYInterpolation(
}
TF_FALLTHROUGH_INTENDED;
case 3:
for (int64 c = 0; c < num_channels; ++c) {
cached_value[4 * c + 3] = ComputeYInterpolation(
3, c, y_wai, y_ptr_0, y_ptr_1, y_ptr_2, y_ptr_3, x_wai);
break;
}
}
break;
}
for (int64 c = 0; c < num_channels; ++c) {
output_y_ptr[x * num_channels + c] =
Compute(cached_value, x_wai.weight_0, x_wai.weight_1,
Compute(&cached_value[4 * c], x_wai.weight_0, x_wai.weight_1,
x_wai.weight_2, x_wai.weight_3);
}
}

View File

@ -251,14 +251,15 @@ TEST_F(ResizeBicubicOpTest, TestAreaRandomDataSeveralInputsSizes4Channels) {
RunManyRandomTests(4);
}
static Graph* ResizeBicubic(int batch_size, int size, int channels) {
static Graph* ResizeBicubic(int batch_size, int size, int channels,
float scale_y = 0.3, float scale_x = 0.7) {
Graph* g = new Graph(OpRegistry::Global());
Tensor input(DT_FLOAT, TensorShape({batch_size, size, size, channels}));
input.flat<float>().setRandom();
Tensor shape(DT_INT32, TensorShape({2}));
auto shape_t = shape.flat<int32>();
shape_t(0) = 0.3 * size;
shape_t(1) = 0.7 * size;
shape_t(0) = scale_y * size;
shape_t(1) = scale_x * size;
test::graph::Binary(g, "ResizeBicubic", test::graph::Constant(g, input),
test::graph::Constant(g, shape));
return g;
@ -285,4 +286,17 @@ BM_ResizeBicubicDev(32, 128, 3);
BM_ResizeBicubicDev(32, 512, 3);
BM_ResizeBicubicDev(32, 1024, 3);
// Benchmark for the expanding (upscale) case: the target size is 8x the
// input size in each spatial dimension (scale_y = scale_x = 8), so the
// ItemsProcessed count includes the 8 * 8 output-expansion factor.
// NOTE: comments are kept outside the macro body — a // comment inside a
// multi-line macro would swallow the trailing '\' continuation.
#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \
static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE * \
CHANNELS * 8 * 8); \
test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \
.Run(iters); \
} \
BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS);
// Instantiations cover 1, 3, and many (40) channels to exercise both the
// specialized num_channels==3 path and the generic per-channel path.
BM_ResizeBicubicExpand(12, 48, 1);
BM_ResizeBicubicExpand(12, 48, 3);
BM_ResizeBicubicExpand(12, 48, 40);
} // end namespace tensorflow
} // end namespace tensorflow