Add gemmlowp-threadpool multithreading to the depthwiseconv implementation for the quantized path.
PiperOrigin-RevId: 239959051
parent 36f817a9f3
commit 152095e319
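In outline, the change gives the optimized quantized kernels (DepthwiseConvGeneral and DepthwiseConv3x3Filter) an explicit slice of work described by thread_start, thread_end and thread_dim, picks whether to split along the batch dimension (dim 0) or the output-row dimension (dim 1), and runs the slices as gemmlowp::Task objects on the worker pool of the shared GemmContext. The sketch below is an illustration of that sharding pattern only, not code from this commit: ProcessSlice stands in for the per-slice kernel body, and it assumes gemmlowp's public header exposes Task, GemmContext and WorkersPool the way the diff below uses them.

// Illustration only: sharding a range of work units across gemmlowp's
// thread pool, mirroring the pattern this commit adds to DepthwiseConv.
#include <algorithm>
#include <vector>

#include "public/gemmlowp.h"

void ProcessSlice(int start, int end);  // hypothetical per-slice worker body

struct SliceTask : public gemmlowp::Task {
  SliceTask(int start, int end) : start_(start), end_(end) {}
  // Run() is invoked on one of the pool's worker threads.
  void Run() override { ProcessSlice(start_, end_); }
  int start_, end_;
};

void RunSharded(gemmlowp::GemmContext* gemm_context, int total_units,
                int desired_threads) {
  const int thread_count =
      std::max(1, std::min(desired_threads, gemm_context->max_num_threads()));
  if (thread_count == 1) {
    ProcessSlice(0, total_units);  // not worth the dispatch overhead
    return;
  }
  std::vector<gemmlowp::Task*> tasks(thread_count);
  int start = 0;
  for (int i = 0; i < thread_count; ++i) {
    // Spread the remaining units evenly over the remaining threads.
    const int end = start + (total_units - start) / (thread_count - i);
    tasks[i] = new SliceTask(start, end);
    start = end;
  }
  // Execute() blocks until every task has run; as used in TFLite, the pool
  // also reclaims the task objects.
  gemm_context->workers_pool()->Execute(tasks);
}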
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -66,6 +67,7 @@ struct OpData {
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  gemm_support::IncrementUsageCounter(context);
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
   // Eval().
@@ -73,6 +75,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -230,17 +233,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
-                         const uint8*, const RuntimeShape&, const uint8*,
-                         const RuntimeShape&, const int32*, const RuntimeShape&,
-                         uint8*);
-
-  if (kernel_type == kReference) {
-    depthwise_conv = &reference_ops::DepthwiseConv;
-  } else {
-    depthwise_conv = &optimized_ops::DepthwiseConv;
-  }
-
   DepthwiseParams op_params;
   op_params.padding_type = PaddingType::kSame;
   op_params.padding_values.width = data->padding.width;
@@ -257,11 +249,20 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   op_params.output_shift = -data->output_shift;
   op_params.quantized_activation_min = data->output_activation_min;
   op_params.quantized_activation_max = data->output_activation_max;
-  depthwise_conv(op_params, GetTensorShape(input),
-                 GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                 GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                 GetTensorData<int32_t>(bias), GetTensorShape(output),
-                 GetTensorData<uint8_t>(output));
+  if (kernel_type == kReference) {
+    reference_ops::DepthwiseConv(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<uint8_t>(output));
+  } else {
+    gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+    optimized_ops::DepthwiseConv(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<uint8_t>(output), gemm_context);
+  }
 }
 
 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
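Note on the dispatch above: the reference path stays single-threaded so it keeps serving as the bit-exact baseline, while the optimized path now fetches the gemmlowp context that gemm_support maintains per TfLiteContext, which is why Init() and Free() gained the IncrementUsageCounter/DecrementUsageCounter calls. The thread count configured on the interpreter through SetNumThreads() is expected to reach the kernel via that shared context; the new multithreaded tests below exercise exactly this with m.SetNumThreads(4).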
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
+#include <initializer_list>
 #include <gtest/gtest.h>
 #include "absl/memory/memory.h"
 #include "tensorflow/lite/interpreter.h"
@@ -501,6 +502,172 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
               ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
 }
 
+TEST_P(DepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 28;
+  const int image_batch_count = 3;
+  const int filter_size = 3;
+  const int filter_count = 1;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  });
+  // clang-format on
+
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 24, 24, 39, 39,
+          45, 45, 45, 45, 45, 45, 45, 45,
+          45, 45, 45, 45, 45, 45, 45, 45,
+          45, 45, 45, 45, 21, 21, 6, 6,
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0,
+
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 48, 48, 78, 78,
+          90, 90, 90, 90, 90, 90, 90, 90,
+          90, 90, 90, 90, 90, 90, 90, 90,
+          90, 90, 90, 90, 42, 42, 12, 12,
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0,
+
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 72, 72, 117, 117,
+          135, 135, 135, 135, 135, 135, 135, 135,
+          135, 135, 135, 135, 135, 135, 135, 135,
+          135, 135, 135, 135, 63, 63, 18, 18,
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0,
+      }));
+  // clang-format on
+}
+
+TEST_P(DepthwiseConvolutionOpTest, MultithreadOnBatchUint8GeneralTest) {
+  const int depth = 1;
+  const int image_width = 8;
+  const int image_height = 4;
+  const int image_batch_count = 6;
+  const int filter_size = 3;
+  const int filter_count = 1;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+  });
+  // clang-format on
+
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21
+      }));
+  // clang-format on
+}
+
 class PerChannelQuantizedDepthwiseConvolutionOpModel
     : public BaseDepthwiseConvolutionOpModel {
  public:
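As a sanity check on the expected values above: with the 3x3 filter {1, ..., 9}, zero bias and VALID padding, each output element is simply the sum of the filter weights that land on non-zero input rows. In the batch-split test every image has rows (0, 1, 1, 0), so output row 0 sees the ones under filter rows (4, 5, 6) and (7, 8, 9), giving 4+5+6+7+8+9 = 39, and output row 1 sees them under (1, 2, 3) and (4, 5, 6), giving 21. In the row-split test the windows fully inside the band of ones give 1+2+...+9 = 45, the partially covered windows give 24, 39, 21 and 6, and the second and third batches scale these by 2 and 3.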
@@ -139,7 +139,8 @@ inline void DispatchDepthwiseConv(
       // Call kernel optimized for depthwise convolutions using 3x3 filters.
       optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
 #else
       break;
@@ -242,7 +243,8 @@ inline void DispatchDepthwiseConv(
     case DepthwiseConvImplementation::kUseGenericKernel: {
       optimized_ops::depthwise_conv::DepthwiseConvGeneral(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
     }
     case DepthwiseConvImplementation::kNone:
@@ -271,13 +273,15 @@ inline void DispatchDepthwiseConv(
       optimized_ops::DepthwiseConvWithRounding<
           DepthwiseConvOutputRounding::kAwayFromZero>(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
     case DepthwiseConvOutputRounding::kUpward:
       optimized_ops::DepthwiseConvWithRounding<
           DepthwiseConvOutputRounding::kUpward>(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
     default:
       break;
@@ -1662,7 +1662,7 @@ inline void DepthwiseConvGeneral(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int pad_width = params.padding_values.width;
@@ -1684,7 +1684,7 @@ inline void DepthwiseConvGeneral(
   const int input_depth = input_shape.Dims(3);
   const int filter_height = filter_shape.Dims(1);
   const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
+  const int output_rows = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
 #ifdef USE_NEON
   const bool shift_left = (output_shift > 0);
@@ -1700,6 +1700,7 @@ inline void DepthwiseConvGeneral(
                    kAccBufferActualSize);
   TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
   TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
 
   // row_accum_func will point to the core accumulation function to be used
   // for this DepthwiseConv op.
@@ -1766,9 +1767,34 @@ inline void DepthwiseConvGeneral(
   const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
 
   // Now that we have determined row_accum_func, we can start work.
-  uint8* output_ptr = output_data;
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
+  int batch_start = 0;
+  int batch_end = batches;
+  int row_start = 0;
+  int row_end = output_rows;
+  int output_ptr_offset = 0;
+
+  switch (thread_dim) {
+    case 0:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, batches);
+      batch_start = thread_start;
+      batch_end = thread_end;
+      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
+      break;
+    case 1:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, output_rows);
+      row_start = thread_start;
+      row_end = thread_end;
+      output_ptr_offset = row_start * output_width * output_depth;
+      break;
+  }
+
+  uint8* output_ptr = output_data + output_ptr_offset;
+  int batch_step =
+      (output_rows + row_start - row_end) * output_width * output_depth;
+  for (int b = batch_start; b < batch_end; ++b) {
+    for (int out_y = row_start; out_y < row_end; ++out_y) {
       const int in_y_origin = (out_y * stride_height) - pad_height;
       const int filter_y_start =
           std::max(0, (-in_y_origin + dilation_height_factor - 1) /
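The batch_step introduced above compensates for row splitting: when thread_dim == 1 each batch only writes rows [row_start, row_end), so after finishing a batch the output pointer must skip the output_rows - row_end trailing rows it did not produce plus the row_start leading rows of the next batch, i.e. (output_rows + row_start - row_end) * output_width * output_depth elements. With output_rows = 26, output_width = 2, output_depth = 1 and a slice covering rows [6, 12), that is (26 + 6 - 12) * 2 * 1 = 40 elements; when splitting along batches the term is zero and the pointer advances contiguously.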
@@ -1944,6 +1970,7 @@ inline void DepthwiseConvGeneral(
         }
       }
     }
+    output_ptr += batch_step;
   }
 }
 
@@ -1955,7 +1982,7 @@ inline void DepthwiseConvWithRounding(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
   gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
   const int depth_multiplier = params.depth_multiplier;
   const int32 output_activation_min = params.quantized_activation_min;
@@ -1991,7 +2018,8 @@ inline void DepthwiseConvWithRounding(
     gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
     depthwise_conv::DepthwiseConv3x3Filter(
         params, input_shape, input_data, filter_shape, filter_data, bias_shape,
-        bias_data, output_shape, output_data);
+        bias_data, output_shape, output_data, thread_start, thread_end,
+        thread_dim);
     return;
   }
 #endif
@@ -2000,7 +2028,77 @@ inline void DepthwiseConvWithRounding(
       "DepthwiseConv/8bit/General");
   depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
                                        filter_shape, filter_data, bias_shape,
-                                       bias_data, output_shape, output_data);
+                                       bias_data, output_shape, output_data,
+                                       thread_start, thread_end, thread_dim);
+}
+
+inline void DepthwiseConvImpl(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
+  return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
+      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+      bias_data, output_shape, output_data, thread_start, thread_end,
+      thread_dim);
+}
+
+template <typename T, typename TS>
+struct DepthwiseConvWorkerTask : public gemmlowp::Task {
+  DepthwiseConvWorkerTask(const DepthwiseParams& params,
+                          const RuntimeShape& input_shape, const T* input_data,
+                          const RuntimeShape& filter_shape,
+                          const T* filter_data, const RuntimeShape& bias_shape,
+                          const TS* bias_data, const RuntimeShape& output_shape,
+                          T* output_data, int thread_start, int thread_end,
+                          int thread_dim)
+      : params_(params),
+        input_shape_(input_shape),
+        input_data_(input_data),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        thread_start_(thread_start),
+        thread_end_(thread_end),
+        thread_dim_(thread_dim) {}
+
+  void Run() override {
+    DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
+                      filter_data_, bias_shape_, bias_data_, output_shape_,
+                      output_data_, thread_start_, thread_end_, thread_dim_);
+  }
+
+ private:
+  const DepthwiseParams& params_;
+  const RuntimeShape& input_shape_;
+  const T* input_data_;
+  const RuntimeShape& filter_shape_;
+  const T* filter_data_;
+  const RuntimeShape& bias_shape_;
+  const TS* bias_data_;
+  const RuntimeShape& output_shape_;
+  T* output_data_;
+  int thread_start_;
+  int thread_end_;
+  int thread_dim_;
+};
+
+inline int HowManyConvThreads(const RuntimeShape& output_shape,
+                              const RuntimeShape& filter_shape,
+                              int thread_dim) {
+  constexpr int kMinMulPerThread = 8;
+  const int output_units = output_shape.Dims(thread_dim);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int num_mul_per_unit =
+      FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
+  const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
+  int thread_count = output_units / min_units_per_thread;
+  return thread_count;
 }
 
 inline void DepthwiseConv(
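HowManyConvThreads is a coarse cap rather than a precise cost model: with kMinMulPerThread = 8 the heuristic only refuses to split work whose per-unit multiply count is tiny. For the row-split test above (output shape [3, 26, 2, 1], 3x3 filter, thread_dim = 1) it computes num_mul_per_unit = 3 * 2 * 1 * 9 = 54, hence min_units_per_thread = 8 / 54 + 1 = 1 and up to 26 candidate threads; the caller then clamps that to the gemmlowp context's max_num_threads (4 in the tests).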
@@ -2008,10 +2106,50 @@ inline void DepthwiseConv(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
-      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
-      bias_data, output_shape, output_data);
+    uint8* output_data, gemmlowp::GemmContext* gemm_context = nullptr) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int output_batches = output_shape.Dims(0);
+  const int output_rows = output_shape.Dims(1);
+  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
+  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
+  int thread_dim, thread_count, thread_dim_size;
+  if (thread_count_batch > thread_count_row) {
+    thread_dim = 0;
+    thread_dim_size = output_batches;
+    thread_count = thread_count_batch;
+  } else {
+    thread_dim = 1;
+    thread_dim_size = output_rows;
+    thread_count = thread_count_row;
+  }
+
+  const int max_threads = gemm_context ? gemm_context->max_num_threads() : 1;
+  thread_count = std::max(1, std::min(thread_count, max_threads));
+
+  if (thread_count == 1) {
+    DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
+                      filter_data, bias_shape, bias_data, output_shape,
+                      output_data, /*thread_start=*/0,
+                      /*thread_end=*/output_rows, /*thread_dim=*/1);
+  } else {
+    std::vector<gemmlowp::Task*> tasks(thread_count);
+    int thread_start = 0;
+    for (int i = 0; i < thread_count; ++i) {
+      int thread_end =
+          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+      tasks[i] = new DepthwiseConvWorkerTask<uint8, int32>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data, thread_start,
+          thread_end, thread_dim);
+      thread_start = thread_end;
+    }
+    gemm_context->workers_pool()->Execute(tasks);
+  }
 }
 
 }  // namespace optimized_ops
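The slice boundaries above are chosen so that the remaining units are always divided as evenly as possible: thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i). Splitting 26 output rows across 4 threads this way yields slices of 6, 6, 7 and 7 rows, and the last slice always ends exactly at thread_dim_size, so no unit is dropped or processed twice. Note also that gemm_context defaults to nullptr, in which case max_threads is 1 and the call degenerates to the single-threaded DepthwiseConvImpl path.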
@@ -3535,7 +3535,7 @@ inline void DepthwiseConv3x3Filter(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
   gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
   DepthwiseConvParams params;
 
@@ -3586,6 +3586,7 @@ inline void DepthwiseConv3x3Filter(
   TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
   TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
   TFLITE_DCHECK(pad_width == pad_height);
+  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
 
   const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
   const int64_t input_batch_size = params.input_row_size * params.input_height;
@@ -3619,14 +3620,35 @@ inline void DepthwiseConv3x3Filter(
   // used in gemmlowp.
   uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
 
-  for (int32 b = 0; b < batches; ++b) {
+  int batch_start = 0;
+  int batch_end = batches;
+  int row_start = 0;
+  int row_end = params.output_height;
+
+  switch (thread_dim) {
+    case 0:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, batches);
+      batch_start = thread_start;
+      batch_end = thread_end;
+      break;
+    case 1:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, params.output_height);
+      row_start = thread_start;
+      row_end = thread_end;
+      break;
+  }
+
+  for (int32 b = batch_start; b < batch_end; ++b) {
     const uint8* input_ptr = input_data + b * input_batch_size;
-    uint8* output_ptr = output_data + b * output_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size +
+                        row_start * params.output_width * params.output_depth;
 
     int32 out_x = 0;
-    int32 out_y = 0;
+    int32 out_y = row_start;
     int32 end_x = params.output_width;
-    int32 end_y = params.output_height;
+    int32 end_y = row_end;
 
     if (pad_width == 1 && pad_height == 1) {
       DepthwiseConvHandlePadding(input_ptr, filter_data, bias_data, output_ptr,
@@ -3635,8 +3657,8 @@ inline void DepthwiseConv3x3Filter(
       // Update extents now that the edges have been handled.
       out_x = 1;
       end_x = params.output_width - 1;
-      out_y = 1;
-      end_y = params.output_height - 1;
+      out_y = std::max(1, out_y);
+      end_y = std::min(params.output_height - 1, end_y);
       const int in_x = (out_x * stride_width) - pad_width;
       const int in_y = (out_y * stride_height) - pad_height;
       input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
@@ -234,9 +234,14 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
   op_params.output_shift = kDepthwiseReverseShift * output_shift;
 
-  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                bias_data, DimsToShape(output_dims), output_data);
+  const RuntimeShape output_shape = DimsToShape(output_dims);
+  const int output_height = output_shape.Dims(1);
+
+  DepthwiseConvImpl(op_params, DimsToShape(input_dims), input_data,
+                    DimsToShape(filter_dims), filter_data,
+                    DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+                    output_data, /*thread_start=*/0,
+                    /*thread_end=*/output_height, /*thread_dim=*/1);
 }
 
 inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
@@ -327,6 +327,7 @@ class SingleOpModel {
   }
 
   void SetNumThreads(int num_threads) {
+    CHECK(interpreter_ != nullptr);
     interpreter_->SetNumThreads(num_threads);
   }
 