Add gemmlowp-threadpool multithreading to the depthwiseconv implementation for the quantized path.
PiperOrigin-RevId: 239959051
This commit is contained in:
parent
36f817a9f3
commit
152095e319
@ -21,6 +21,7 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
||||
#include "tensorflow/lite/c/c_api_internal.h"
|
||||
#include "tensorflow/lite/kernels/gemm_support.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
@ -66,6 +67,7 @@ struct OpData {
|
||||
};
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
gemm_support::IncrementUsageCounter(context);
|
||||
// This is a builtin op, so we don't use the contents in 'buffer', if any.
|
||||
// Instead, we allocate a new object to carry information from Prepare() to
|
||||
// Eval().
|
||||
@ -73,6 +75,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
}
|
||||
|
||||
// Tears down the per-node state for this op: releases this node's hold on the
// gemm support object, then destroys the OpData passed in as `buffer`.
void Free(TfLiteContext* context, void* buffer) {
  gemm_support::DecrementUsageCounter(context);
  OpData* op_data = reinterpret_cast<OpData*>(buffer);
  delete op_data;
}
|
||||
|
||||
@ -230,17 +233,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
auto filter_offset = -filter->params.zero_point;
|
||||
auto output_offset = output->params.zero_point;
|
||||
|
||||
void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
|
||||
const uint8*, const RuntimeShape&, const uint8*,
|
||||
const RuntimeShape&, const int32*, const RuntimeShape&,
|
||||
uint8*);
|
||||
|
||||
if (kernel_type == kReference) {
|
||||
depthwise_conv = &reference_ops::DepthwiseConv;
|
||||
} else {
|
||||
depthwise_conv = &optimized_ops::DepthwiseConv;
|
||||
}
|
||||
|
||||
DepthwiseParams op_params;
|
||||
op_params.padding_type = PaddingType::kSame;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
@ -257,11 +249,20 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.output_shift = -data->output_shift;
|
||||
op_params.quantized_activation_min = data->output_activation_min;
|
||||
op_params.quantized_activation_max = data->output_activation_max;
|
||||
depthwise_conv(op_params, GetTensorShape(input),
|
||||
GetTensorData<uint8_t>(input), GetTensorShape(filter),
|
||||
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
|
||||
GetTensorData<int32_t>(bias), GetTensorShape(output),
|
||||
GetTensorData<uint8_t>(output));
|
||||
if (kernel_type == kReference) {
|
||||
reference_ops::DepthwiseConv(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
|
||||
GetTensorShape(bias), GetTensorData<int32_t>(bias),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
|
||||
optimized_ops::DepthwiseConv(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
|
||||
GetTensorShape(bias), GetTensorData<int32_t>(bias),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output), gemm_context);
|
||||
}
|
||||
}
|
||||
|
||||
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
|
||||
|
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include <cstdarg>
|
||||
#include <initializer_list>
|
||||
#include <gtest/gtest.h>
|
||||
#include "absl/memory/memory.h"
|
||||
#include "tensorflow/lite/interpreter.h"
|
||||
@ -501,6 +502,172 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
|
||||
ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
|
||||
}
|
||||
|
||||
// Runs a uint8 3x3 depthwise convolution (VALID padding, depth 1) over three
// tall, narrow images (28 rows x 4 columns) with four interpreter threads
// requested. Each batch is a band of 12 constant rows (value 1, 2 and 3
// respectively) with 8 zero rows above and below, so the 26x2 output per
// batch ramps up through the band edge, plateaus, and ramps back down.
TEST_P(DepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
  constexpr int kDepth = 1;
  constexpr int kImageWidth = 4;
  constexpr int kImageHeight = 28;
  constexpr int kBatchCount = 3;
  constexpr int kFilterSize = 3;
  constexpr int kFilterCount = 1;

  QuantizedDepthwiseConvolutionOpModel m(
      GetRegistration(),
      {TensorType_UINT8,
       {kBatchCount, kImageHeight, kImageWidth, kDepth},
       0,
       255},
      {TensorType_UINT8,
       {kDepth, kFilterSize, kFilterSize, kFilterCount},
       0,
       255},
      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);

  // Each line below holds four image rows (4 columns each).
  // clang-format off
  m.SetInput({
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  });
  // clang-format on

  // 3x3 filter, row-major:
  //   | 1 | 2 | 3 |
  //   | 4 | 5 | 6 |
  //   | 7 | 8 | 9 |
  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
  // Zero bias, so the output is the raw convolution result.
  m.SetBias({0});
  m.SetNumThreads(4);
  m.Invoke();

  // Row sums of the filter are 6, 15 and 24; entering the band of value v
  // yields 24v, then 39v, then the full 45v, and leaving it yields 21v, 6v.
  // clang-format off
  EXPECT_THAT(
      m.GetOutput(),
      ElementsAreArray({
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 24, 24, 39, 39,
          45, 45, 45, 45, 45, 45, 45, 45,
          45, 45, 45, 45, 45, 45, 45, 45,
          45, 45, 45, 45, 21, 21, 6, 6,
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0,

          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 48, 48, 78, 78,
          90, 90, 90, 90, 90, 90, 90, 90,
          90, 90, 90, 90, 90, 90, 90, 90,
          90, 90, 90, 90, 42, 42, 12, 12,
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0,

          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 72, 72, 117, 117,
          135, 135, 135, 135, 135, 135, 135, 135,
          135, 135, 135, 135, 135, 135, 135, 135,
          135, 135, 135, 135, 63, 63, 18, 18,
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0,
      }));
  // clang-format on
}
|
||||
|
||||
// Runs a uint8 3x3 depthwise convolution (VALID padding, depth 1) over six
// identical short, wide images (4 rows x 8 columns) with four interpreter
// threads requested. Every image is two rows of ones between a zero row above
// and below, giving a 2x6 output per batch.
TEST_P(DepthwiseConvolutionOpTest, MultithreadOnBatchUint8GeneralTest) {
  constexpr int kDepth = 1;
  constexpr int kImageWidth = 8;
  constexpr int kImageHeight = 4;
  constexpr int kBatchCount = 6;
  constexpr int kFilterSize = 3;
  constexpr int kFilterCount = 1;

  QuantizedDepthwiseConvolutionOpModel m(
      GetRegistration(),
      {TensorType_UINT8,
       {kBatchCount, kImageHeight, kImageWidth, kDepth},
       0,
       255},
      {TensorType_UINT8,
       {kDepth, kFilterSize, kFilterSize, kFilterCount},
       0,
       255},
      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);

  // Each line below holds two image rows (8 columns each); two lines per batch.
  // clang-format off
  m.SetInput({
      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
  });
  // clang-format on

  // 3x3 filter, row-major:
  //   | 1 | 2 | 3 |
  //   | 4 | 5 | 6 |
  //   | 7 | 8 | 9 |
  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
  // Zero bias, so the output is the raw convolution result.
  m.SetBias({0});
  m.SetNumThreads(4);
  m.Invoke();

  // Output row 0 sees the two lower filter rows (15 + 24 = 39); output row 1
  // sees the two upper filter rows (6 + 15 = 21).
  // clang-format off
  EXPECT_THAT(
      m.GetOutput(),
      ElementsAreArray({
          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21
      }));
  // clang-format on
}
|
||||
|
||||
class PerChannelQuantizedDepthwiseConvolutionOpModel
|
||||
: public BaseDepthwiseConvolutionOpModel {
|
||||
public:
|
||||
|
@ -139,7 +139,8 @@ inline void DispatchDepthwiseConv(
|
||||
// Call kernel optimized for depthwise convolutions using 3x3 filters.
|
||||
optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
|
||||
params, input_shape, input_data, filter_shape, filter_data,
|
||||
bias_shape, bias_data, output_shape, output_data);
|
||||
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
|
||||
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
|
||||
return;
|
||||
#else
|
||||
break;
|
||||
@ -242,7 +243,8 @@ inline void DispatchDepthwiseConv(
|
||||
case DepthwiseConvImplementation::kUseGenericKernel: {
|
||||
optimized_ops::depthwise_conv::DepthwiseConvGeneral(
|
||||
params, input_shape, input_data, filter_shape, filter_data,
|
||||
bias_shape, bias_data, output_shape, output_data);
|
||||
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
|
||||
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
|
||||
return;
|
||||
}
|
||||
case DepthwiseConvImplementation::kNone:
|
||||
@ -271,13 +273,15 @@ inline void DispatchDepthwiseConv(
|
||||
optimized_ops::DepthwiseConvWithRounding<
|
||||
DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
params, input_shape, input_data, filter_shape, filter_data,
|
||||
bias_shape, bias_data, output_shape, output_data);
|
||||
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
|
||||
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
|
||||
return;
|
||||
case DepthwiseConvOutputRounding::kUpward:
|
||||
optimized_ops::DepthwiseConvWithRounding<
|
||||
DepthwiseConvOutputRounding::kUpward>(
|
||||
params, input_shape, input_data, filter_shape, filter_data,
|
||||
bias_shape, bias_data, output_shape, output_data);
|
||||
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
|
||||
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
|
||||
return;
|
||||
default:
|
||||
break;
|
||||
|
@ -1662,7 +1662,7 @@ inline void DepthwiseConvGeneral(
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
@ -1684,7 +1684,7 @@ inline void DepthwiseConvGeneral(
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_rows = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
#ifdef USE_NEON
|
||||
const bool shift_left = (output_shift > 0);
|
||||
@ -1700,6 +1700,7 @@ inline void DepthwiseConvGeneral(
|
||||
kAccBufferActualSize);
|
||||
TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
|
||||
TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
|
||||
TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
|
||||
|
||||
// row_accum_func will point to the core accumulation function to be used
|
||||
// for this DepthwiseConv op.
|
||||
@ -1766,9 +1767,34 @@ inline void DepthwiseConvGeneral(
|
||||
const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
|
||||
|
||||
// Now that we have determined row_accum_func, we can start work.
|
||||
uint8* output_ptr = output_data;
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
int batch_start = 0;
|
||||
int batch_end = batches;
|
||||
int row_start = 0;
|
||||
int row_end = output_rows;
|
||||
int output_ptr_offset = 0;
|
||||
|
||||
switch (thread_dim) {
|
||||
case 0:
|
||||
TFLITE_DCHECK_GE(thread_start, 0);
|
||||
TFLITE_DCHECK_LE(thread_end, batches);
|
||||
batch_start = thread_start;
|
||||
batch_end = thread_end;
|
||||
output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
|
||||
break;
|
||||
case 1:
|
||||
TFLITE_DCHECK_GE(thread_start, 0);
|
||||
TFLITE_DCHECK_LE(thread_end, output_rows);
|
||||
row_start = thread_start;
|
||||
row_end = thread_end;
|
||||
output_ptr_offset = row_start * output_width * output_depth;
|
||||
break;
|
||||
}
|
||||
|
||||
uint8* output_ptr = output_data + output_ptr_offset;
|
||||
int batch_step =
|
||||
(output_rows + row_start - row_end) * output_width * output_depth;
|
||||
for (int b = batch_start; b < batch_end; ++b) {
|
||||
for (int out_y = row_start; out_y < row_end; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
const int filter_y_start =
|
||||
std::max(0, (-in_y_origin + dilation_height_factor - 1) /
|
||||
@ -1944,6 +1970,7 @@ inline void DepthwiseConvGeneral(
|
||||
}
|
||||
}
|
||||
}
|
||||
output_ptr += batch_step;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1955,7 +1982,7 @@ inline void DepthwiseConvWithRounding(
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
|
||||
gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
@ -1991,7 +2018,8 @@ inline void DepthwiseConvWithRounding(
|
||||
gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
|
||||
depthwise_conv::DepthwiseConv3x3Filter(
|
||||
params, input_shape, input_data, filter_shape, filter_data, bias_shape,
|
||||
bias_data, output_shape, output_data);
|
||||
bias_data, output_shape, output_data, thread_start, thread_end,
|
||||
thread_dim);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
@ -2000,7 +2028,77 @@ inline void DepthwiseConvWithRounding(
|
||||
"DepthwiseConv/8bit/General");
|
||||
depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
|
||||
filter_shape, filter_data, bias_shape,
|
||||
bias_data, output_shape, output_data);
|
||||
bias_data, output_shape, output_data,
|
||||
thread_start, thread_end, thread_dim);
|
||||
}
|
||||
|
||||
inline void DepthwiseConvImpl(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
|
||||
return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
params, input_shape, input_data, filter_shape, filter_data, bias_shape,
|
||||
bias_data, output_shape, output_data, thread_start, thread_end,
|
||||
thread_dim);
|
||||
}
|
||||
|
||||
template <typename T, typename TS>
|
||||
struct DepthwiseConvWorkerTask : public gemmlowp::Task {
|
||||
DepthwiseConvWorkerTask(const DepthwiseParams& params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& filter_shape,
|
||||
const T* filter_data, const RuntimeShape& bias_shape,
|
||||
const TS* bias_data, const RuntimeShape& output_shape,
|
||||
T* output_data, int thread_start, int thread_end,
|
||||
int thread_dim)
|
||||
: params_(params),
|
||||
input_shape_(input_shape),
|
||||
input_data_(input_data),
|
||||
filter_shape_(filter_shape),
|
||||
filter_data_(filter_data),
|
||||
bias_shape_(bias_shape),
|
||||
bias_data_(bias_data),
|
||||
output_shape_(output_shape),
|
||||
output_data_(output_data),
|
||||
thread_start_(thread_start),
|
||||
thread_end_(thread_end),
|
||||
thread_dim_(thread_dim) {}
|
||||
|
||||
void Run() override {
|
||||
DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
|
||||
filter_data_, bias_shape_, bias_data_, output_shape_,
|
||||
output_data_, thread_start_, thread_end_, thread_dim_);
|
||||
}
|
||||
|
||||
private:
|
||||
const DepthwiseParams& params_;
|
||||
const RuntimeShape& input_shape_;
|
||||
const T* input_data_;
|
||||
const RuntimeShape& filter_shape_;
|
||||
const T* filter_data_;
|
||||
const RuntimeShape& bias_shape_;
|
||||
const TS* bias_data_;
|
||||
const RuntimeShape& output_shape_;
|
||||
T* output_data_;
|
||||
int thread_start_;
|
||||
int thread_end_;
|
||||
int thread_dim_;
|
||||
};
|
||||
|
||||
inline int HowManyConvThreads(const RuntimeShape& output_shape,
|
||||
const RuntimeShape& filter_shape,
|
||||
int thread_dim) {
|
||||
constexpr int kMinMulPerThread = 8;
|
||||
const int output_units = output_shape.Dims(thread_dim);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int num_mul_per_unit =
|
||||
FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
|
||||
const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
|
||||
int thread_count = output_units / min_units_per_thread;
|
||||
return thread_count;
|
||||
}
|
||||
|
||||
inline void DepthwiseConv(
|
||||
@ -2008,10 +2106,50 @@ inline void DepthwiseConv(
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
params, input_shape, input_data, filter_shape, filter_data, bias_shape,
|
||||
bias_data, output_shape, output_data);
|
||||
uint8* output_data, gemmlowp::GemmContext* gemm_context = nullptr) {
|
||||
gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
|
||||
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
const int output_batches = output_shape.Dims(0);
|
||||
const int output_rows = output_shape.Dims(1);
|
||||
int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
|
||||
int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
|
||||
int thread_dim, thread_count, thread_dim_size;
|
||||
if (thread_count_batch > thread_count_row) {
|
||||
thread_dim = 0;
|
||||
thread_dim_size = output_batches;
|
||||
thread_count = thread_count_batch;
|
||||
} else {
|
||||
thread_dim = 1;
|
||||
thread_dim_size = output_rows;
|
||||
thread_count = thread_count_row;
|
||||
}
|
||||
|
||||
const int max_threads = gemm_context ? gemm_context->max_num_threads() : 1;
|
||||
thread_count = std::max(1, std::min(thread_count, max_threads));
|
||||
|
||||
if (thread_count == 1) {
|
||||
DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
|
||||
filter_data, bias_shape, bias_data, output_shape,
|
||||
output_data, /*thread_start=*/0,
|
||||
/*thread_end=*/output_rows, /*thread_dim=*/1);
|
||||
} else {
|
||||
std::vector<gemmlowp::Task*> tasks(thread_count);
|
||||
int thread_start = 0;
|
||||
for (int i = 0; i < thread_count; ++i) {
|
||||
int thread_end =
|
||||
thread_start + (thread_dim_size - thread_start) / (thread_count - i);
|
||||
tasks[i] = new DepthwiseConvWorkerTask<uint8, int32>(
|
||||
params, input_shape, input_data, filter_shape, filter_data,
|
||||
bias_shape, bias_data, output_shape, output_data, thread_start,
|
||||
thread_end, thread_dim);
|
||||
thread_start = thread_end;
|
||||
}
|
||||
gemm_context->workers_pool()->Execute(tasks);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace optimized_ops
|
||||
|
@ -3535,7 +3535,7 @@ inline void DepthwiseConv3x3Filter(
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
|
||||
gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
|
||||
DepthwiseConvParams params;
|
||||
|
||||
@ -3586,6 +3586,7 @@ inline void DepthwiseConv3x3Filter(
|
||||
TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
|
||||
TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
|
||||
TFLITE_DCHECK(pad_width == pad_height);
|
||||
TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
|
||||
|
||||
const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int64_t input_batch_size = params.input_row_size * params.input_height;
|
||||
@ -3619,14 +3620,35 @@ inline void DepthwiseConv3x3Filter(
|
||||
// used in gemmlowp.
|
||||
uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
|
||||
|
||||
for (int32 b = 0; b < batches; ++b) {
|
||||
int batch_start = 0;
|
||||
int batch_end = batches;
|
||||
int row_start = 0;
|
||||
int row_end = params.output_height;
|
||||
|
||||
switch (thread_dim) {
|
||||
case 0:
|
||||
TFLITE_DCHECK_GE(thread_start, 0);
|
||||
TFLITE_DCHECK_LE(thread_end, batches);
|
||||
batch_start = thread_start;
|
||||
batch_end = thread_end;
|
||||
break;
|
||||
case 1:
|
||||
TFLITE_DCHECK_GE(thread_start, 0);
|
||||
TFLITE_DCHECK_LE(thread_end, params.output_height);
|
||||
row_start = thread_start;
|
||||
row_end = thread_end;
|
||||
break;
|
||||
}
|
||||
|
||||
for (int32 b = batch_start; b < batch_end; ++b) {
|
||||
const uint8* input_ptr = input_data + b * input_batch_size;
|
||||
uint8* output_ptr = output_data + b * output_batch_size;
|
||||
uint8* output_ptr = output_data + b * output_batch_size +
|
||||
row_start * params.output_width * params.output_depth;
|
||||
|
||||
int32 out_x = 0;
|
||||
int32 out_y = 0;
|
||||
int32 out_y = row_start;
|
||||
int32 end_x = params.output_width;
|
||||
int32 end_y = params.output_height;
|
||||
int32 end_y = row_end;
|
||||
|
||||
if (pad_width == 1 && pad_height == 1) {
|
||||
DepthwiseConvHandlePadding(input_ptr, filter_data, bias_data, output_ptr,
|
||||
@ -3635,8 +3657,8 @@ inline void DepthwiseConv3x3Filter(
|
||||
// Update extents now that the edges have been handled.
|
||||
out_x = 1;
|
||||
end_x = params.output_width - 1;
|
||||
out_y = 1;
|
||||
end_y = params.output_height - 1;
|
||||
out_y = std::max(1, out_y);
|
||||
end_y = std::min(params.output_height - 1, end_y);
|
||||
const int in_x = (out_x * stride_width) - pad_width;
|
||||
const int in_y = (out_y * stride_height) - pad_height;
|
||||
input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
|
||||
|
@ -234,9 +234,14 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
|
||||
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
|
||||
op_params.output_shift = kDepthwiseReverseShift * output_shift;
|
||||
|
||||
DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
|
||||
DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
|
||||
bias_data, DimsToShape(output_dims), output_data);
|
||||
const RuntimeShape output_shape = DimsToShape(output_dims);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
|
||||
DepthwiseConvImpl(op_params, DimsToShape(input_dims), input_data,
|
||||
DimsToShape(filter_dims), filter_data,
|
||||
DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
|
||||
output_data, /*thread_start=*/0,
|
||||
/*thread_end=*/output_height, /*thread_dim=*/1);
|
||||
}
|
||||
|
||||
inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
|
||||
|
@ -327,6 +327,7 @@ class SingleOpModel {
|
||||
}
|
||||
|
||||
// Forwards the requested thread count to the interpreter. The interpreter
// must already exist; this is enforced with a CHECK.
void SetNumThreads(int num_threads) {
  CHECK(interpreter_ != nullptr);
  interpreter_->SetNumThreads(num_threads);
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user