Add gemmlowp-threadpool multithreading to the depthwiseconv implementation for the quantized path.

PiperOrigin-RevId: 239959051
This commit is contained in:
Lu Wang 2019-03-23 10:31:22 -07:00 committed by TensorFlower Gardener
parent 36f817a9f3
commit 152095e319
7 changed files with 380 additions and 42 deletions

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/gemm_support.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
@ -66,6 +67,7 @@ struct OpData {
};
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
gemm_support::IncrementUsageCounter(context);
// This is a builtin op, so we don't use the contents in 'buffer', if any.
// Instead, we allocate a new object to carry information from Prepare() to
// Eval().
@ -73,6 +75,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
}
// Cleanup counterpart of Init(): drops this op's reference count on the
// context-shared gemmlowp support object and destroys the OpData that Init()
// allocated (passed back to us as 'buffer').
void Free(TfLiteContext* context, void* buffer) {
gemm_support::DecrementUsageCounter(context);
delete reinterpret_cast<OpData*>(buffer);
}
@ -230,17 +233,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
auto filter_offset = -filter->params.zero_point;
auto output_offset = output->params.zero_point;
void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
const uint8*, const RuntimeShape&, const uint8*,
const RuntimeShape&, const int32*, const RuntimeShape&,
uint8*);
if (kernel_type == kReference) {
depthwise_conv = &reference_ops::DepthwiseConv;
} else {
depthwise_conv = &optimized_ops::DepthwiseConv;
}
DepthwiseParams op_params;
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
@ -257,11 +249,20 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
depthwise_conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
GetTensorData<int32_t>(bias), GetTensorShape(output),
GetTensorData<uint8_t>(output));
if (kernel_type == kReference) {
reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output));
} else {
gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
optimized_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output), gemm_context);
}
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,

View File

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cstdarg>
#include <initializer_list>
#include <gtest/gtest.h>
#include "absl/memory/memory.h"
#include "tensorflow/lite/interpreter.h"
@ -501,6 +502,172 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
}
// Checks the 4-thread quantized (uint8) depthwise conv against precomputed
// expected values on an input that is tall (28 rows) with a small batch (3) —
// per the test name, this is intended to exercise the path where work is
// sharded across output rows rather than across batches.
TEST_P(DepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
const int depth = 1;
const int image_width = 4;
const int image_height = 28;
const int image_batch_count = 3;
const int filter_size = 3;
const int filter_count = 1;
// Quantization ranges are [0, 255] with zero_point 0, so quantized values
// equal real values and the expected outputs below are exact.
QuantizedDepthwiseConvolutionOpModel m(
GetRegistration(),
{TensorType_UINT8,
{image_batch_count, image_height, image_width, depth},
0,
255},
{TensorType_UINT8,
{depth, filter_size, filter_size, filter_count},
0,
255},
{TensorType_UINT8, {}, 0, 255}, Padding_VALID);
// Each batch contains a horizontal band of a distinct value (1, 2, 3) so a
// wrong row/batch partition in the threaded path produces visibly wrong
// sums.
// clang-format off
m.SetInput({
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
});
// clang-format on
// The filter matrix is:
// | 1 | 2 | 3 |
// | 4 | 5 | 6 |
// | 7 | 8 | 9 |
m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
// No bias for this test.
m.SetBias({0});
// Force the multithreaded code path.
m.SetNumThreads(4);
m.Invoke();
// Expected values computed against the single-threaded reference output.
// clang-format off
EXPECT_THAT(
m.GetOutput(),
ElementsAreArray({
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 24, 24, 39, 39,
45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 21, 21, 6, 6,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 48, 48, 78, 78,
90, 90, 90, 90, 90, 90, 90, 90,
90, 90, 90, 90, 90, 90, 90, 90,
90, 90, 90, 90, 42, 42, 12, 12,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 72, 72, 117, 117,
135, 135, 135, 135, 135, 135, 135, 135,
135, 135, 135, 135, 135, 135, 135, 135,
135, 135, 135, 135, 63, 63, 18, 18,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0,
}));
// clang-format on
}
// Checks the 4-thread quantized (uint8) depthwise conv against precomputed
// expected values on an input with a large batch (6) and short images
// (4 rows) — per the test name, this is intended to exercise the path where
// work is sharded across batches rather than across output rows.
TEST_P(DepthwiseConvolutionOpTest, MultithreadOnBatchUint8GeneralTest) {
const int depth = 1;
const int image_width = 8;
const int image_height = 4;
const int image_batch_count = 6;
const int filter_size = 3;
const int filter_count = 1;
// Quantization ranges are [0, 255] with zero_point 0, so quantized values
// equal real values and the expected outputs below are exact.
QuantizedDepthwiseConvolutionOpModel m(
GetRegistration(),
{TensorType_UINT8,
{image_batch_count, image_height, image_width, depth},
0,
255},
{TensorType_UINT8,
{depth, filter_size, filter_size, filter_count},
0,
255},
{TensorType_UINT8, {}, 0, 255}, Padding_VALID);
// All six batches hold the same 4x8 image (rows of 0s then 1s), so every
// batch must yield identical output regardless of which thread computed it.
// clang-format off
m.SetInput({
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
});
// clang-format on
// The filter matrix is:
// | 1 | 2 | 3 |
// | 4 | 5 | 6 |
// | 7 | 8 | 9 |
m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
// No bias for this test.
m.SetBias({0});
// Force the multithreaded code path.
m.SetNumThreads(4);
m.Invoke();
// Expected values computed against the single-threaded reference output.
// clang-format off
EXPECT_THAT(
m.GetOutput(),
ElementsAreArray({
39, 39, 39, 39, 39, 39,
21, 21, 21, 21, 21, 21,
39, 39, 39, 39, 39, 39,
21, 21, 21, 21, 21, 21,
39, 39, 39, 39, 39, 39,
21, 21, 21, 21, 21, 21,
39, 39, 39, 39, 39, 39,
21, 21, 21, 21, 21, 21,
39, 39, 39, 39, 39, 39,
21, 21, 21, 21, 21, 21,
39, 39, 39, 39, 39, 39,
21, 21, 21, 21, 21, 21
}));
// clang-format on
}
class PerChannelQuantizedDepthwiseConvolutionOpModel
: public BaseDepthwiseConvolutionOpModel {
public:

View File

@ -139,7 +139,8 @@ inline void DispatchDepthwiseConv(
// Call kernel optimized for depthwise convolutions using 3x3 filters.
optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
#else
break;
@ -242,7 +243,8 @@ inline void DispatchDepthwiseConv(
case DepthwiseConvImplementation::kUseGenericKernel: {
optimized_ops::depthwise_conv::DepthwiseConvGeneral(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
}
case DepthwiseConvImplementation::kNone:
@ -271,13 +273,15 @@ inline void DispatchDepthwiseConv(
optimized_ops::DepthwiseConvWithRounding<
DepthwiseConvOutputRounding::kAwayFromZero>(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
case DepthwiseConvOutputRounding::kUpward:
optimized_ops::DepthwiseConvWithRounding<
DepthwiseConvOutputRounding::kUpward>(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
default:
break;

View File

@ -1662,7 +1662,7 @@ inline void DepthwiseConvGeneral(
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
@ -1684,7 +1684,7 @@ inline void DepthwiseConvGeneral(
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_rows = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
#ifdef USE_NEON
const bool shift_left = (output_shift > 0);
@ -1700,6 +1700,7 @@ inline void DepthwiseConvGeneral(
kAccBufferActualSize);
TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
// row_accum_func will point to the core accumulation function to be used
// for this DepthwiseConv op.
@ -1766,9 +1767,34 @@ inline void DepthwiseConvGeneral(
const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
// Now that we have determined row_accum_func, we can start work.
uint8* output_ptr = output_data;
for (int b = 0; b < batches; ++b) {
for (int out_y = 0; out_y < output_height; ++out_y) {
int batch_start = 0;
int batch_end = batches;
int row_start = 0;
int row_end = output_rows;
int output_ptr_offset = 0;
switch (thread_dim) {
case 0:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, batches);
batch_start = thread_start;
batch_end = thread_end;
output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
break;
case 1:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, output_rows);
row_start = thread_start;
row_end = thread_end;
output_ptr_offset = row_start * output_width * output_depth;
break;
}
uint8* output_ptr = output_data + output_ptr_offset;
int batch_step =
(output_rows + row_start - row_end) * output_width * output_depth;
for (int b = batch_start; b < batch_end; ++b) {
for (int out_y = row_start; out_y < row_end; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
const int filter_y_start =
std::max(0, (-in_y_origin + dilation_height_factor - 1) /
@ -1944,6 +1970,7 @@ inline void DepthwiseConvGeneral(
}
}
}
output_ptr += batch_step;
}
}
@ -1955,7 +1982,7 @@ inline void DepthwiseConvWithRounding(
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
const int depth_multiplier = params.depth_multiplier;
const int32 output_activation_min = params.quantized_activation_min;
@ -1991,7 +2018,8 @@ inline void DepthwiseConvWithRounding(
gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
depthwise_conv::DepthwiseConv3x3Filter(
params, input_shape, input_data, filter_shape, filter_data, bias_shape,
bias_data, output_shape, output_data);
bias_data, output_shape, output_data, thread_start, thread_end,
thread_dim);
return;
}
#endif
@ -2000,7 +2028,77 @@ inline void DepthwiseConvWithRounding(
"DepthwiseConv/8bit/General");
depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
filter_shape, filter_data, bias_shape,
bias_data, output_shape, output_data);
bias_data, output_shape, output_data,
thread_start, thread_end, thread_dim);
}
inline void DepthwiseConvImpl(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
params, input_shape, input_data, filter_shape, filter_data, bias_shape,
bias_data, output_shape, output_data, thread_start, thread_end,
thread_dim);
}
// A gemmlowp worker-pool task that computes one shard of a depthwise conv by
// calling DepthwiseConvImpl over [thread_start, thread_end) of dimension
// thread_dim (0 = batch, 1 = output row).
//
// NOTE(review): params and the shapes are stored as references, so a task
// must not outlive the caller's arguments. This looks safe only because the
// dispatcher blocks on the worker pool before returning — confirm before
// reusing this task type elsewhere.
template <typename T, typename TS>
struct DepthwiseConvWorkerTask : public gemmlowp::Task {
DepthwiseConvWorkerTask(const DepthwiseParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& filter_shape,
const T* filter_data, const RuntimeShape& bias_shape,
const TS* bias_data, const RuntimeShape& output_shape,
T* output_data, int thread_start, int thread_end,
int thread_dim)
: params_(params),
input_shape_(input_shape),
input_data_(input_data),
filter_shape_(filter_shape),
filter_data_(filter_data),
bias_shape_(bias_shape),
bias_data_(bias_data),
output_shape_(output_shape),
output_data_(output_data),
thread_start_(thread_start),
thread_end_(thread_end),
thread_dim_(thread_dim) {}
// Executed on a worker thread by the gemmlowp pool.
void Run() override {
DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
filter_data_, bias_shape_, bias_data_, output_shape_,
output_data_, thread_start_, thread_end_, thread_dim_);
}
private:
// All members are borrowed from the caller (no ownership taken).
const DepthwiseParams& params_;
const RuntimeShape& input_shape_;
const T* input_data_;
const RuntimeShape& filter_shape_;
const T* filter_data_;
const RuntimeShape& bias_shape_;
const TS* bias_data_;
const RuntimeShape& output_shape_;
T* output_data_;
int thread_start_;
int thread_end_;
int thread_dim_;
};
// Heuristically chooses how many threads to shard a depthwise conv across
// dimension `thread_dim` of the output (0 = batch, 1 = output row). Each
// thread should get at least kMinMulPerThread multiply-accumulates of work,
// so small workloads produce a small (possibly zero) count; the caller is
// expected to clamp the result into [1, max_threads].
inline int HowManyConvThreads(const RuntimeShape& output_shape,
                              const RuntimeShape& filter_shape,
                              int thread_dim) {
  constexpr int kMinMulPerThread = 8;
  const int output_units = output_shape.Dims(thread_dim);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  // Multiply-accumulates performed per unit of the sharded dimension.
  const int num_mul_per_unit =
      FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
  // Guard degenerate (zero-element) shapes: without this, num_mul_per_unit
  // could be 0 and the division below would be undefined behavior.
  if (num_mul_per_unit <= 0) return 1;
  const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
  return output_units / min_units_per_thread;
}
inline void DepthwiseConv(
@ -2008,10 +2106,50 @@ inline void DepthwiseConv(
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
params, input_shape, input_data, filter_shape, filter_data, bias_shape,
bias_data, output_shape, output_data);
uint8* output_data, gemmlowp::GemmContext* gemm_context = nullptr) {
gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int output_batches = output_shape.Dims(0);
const int output_rows = output_shape.Dims(1);
int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
int thread_dim, thread_count, thread_dim_size;
if (thread_count_batch > thread_count_row) {
thread_dim = 0;
thread_dim_size = output_batches;
thread_count = thread_count_batch;
} else {
thread_dim = 1;
thread_dim_size = output_rows;
thread_count = thread_count_row;
}
const int max_threads = gemm_context ? gemm_context->max_num_threads() : 1;
thread_count = std::max(1, std::min(thread_count, max_threads));
if (thread_count == 1) {
DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
filter_data, bias_shape, bias_data, output_shape,
output_data, /*thread_start=*/0,
/*thread_end=*/output_rows, /*thread_dim=*/1);
} else {
std::vector<gemmlowp::Task*> tasks(thread_count);
int thread_start = 0;
for (int i = 0; i < thread_count; ++i) {
int thread_end =
thread_start + (thread_dim_size - thread_start) / (thread_count - i);
tasks[i] = new DepthwiseConvWorkerTask<uint8, int32>(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data, thread_start,
thread_end, thread_dim);
thread_start = thread_end;
}
gemm_context->workers_pool()->Execute(tasks);
}
}
} // namespace optimized_ops

View File

@ -3535,7 +3535,7 @@ inline void DepthwiseConv3x3Filter(
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
uint8* output_data, int thread_start, int thread_end, int thread_dim) {
gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
DepthwiseConvParams params;
@ -3586,6 +3586,7 @@ inline void DepthwiseConv3x3Filter(
TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
TFLITE_DCHECK(pad_width == pad_height);
TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
const int64_t input_batch_size = params.input_row_size * params.input_height;
@ -3619,14 +3620,35 @@ inline void DepthwiseConv3x3Filter(
// used in gemmlowp.
uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
for (int32 b = 0; b < batches; ++b) {
int batch_start = 0;
int batch_end = batches;
int row_start = 0;
int row_end = params.output_height;
switch (thread_dim) {
case 0:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, batches);
batch_start = thread_start;
batch_end = thread_end;
break;
case 1:
TFLITE_DCHECK_GE(thread_start, 0);
TFLITE_DCHECK_LE(thread_end, params.output_height);
row_start = thread_start;
row_end = thread_end;
break;
}
for (int32 b = batch_start; b < batch_end; ++b) {
const uint8* input_ptr = input_data + b * input_batch_size;
uint8* output_ptr = output_data + b * output_batch_size;
uint8* output_ptr = output_data + b * output_batch_size +
row_start * params.output_width * params.output_depth;
int32 out_x = 0;
int32 out_y = 0;
int32 out_y = row_start;
int32 end_x = params.output_width;
int32 end_y = params.output_height;
int32 end_y = row_end;
if (pad_width == 1 && pad_height == 1) {
DepthwiseConvHandlePadding(input_ptr, filter_data, bias_data, output_ptr,
@ -3635,8 +3657,8 @@ inline void DepthwiseConv3x3Filter(
// Update extents now that the edges have been handled.
out_x = 1;
end_x = params.output_width - 1;
out_y = 1;
end_y = params.output_height - 1;
out_y = std::max(1, out_y);
end_y = std::min(params.output_height - 1, end_y);
const int in_x = (out_x * stride_width) - pad_width;
const int in_y = (out_y * stride_height) - pad_height;
input_ptr += in_y * params.input_row_size + in_x * params.input_depth;

View File

@ -234,9 +234,14 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = kDepthwiseReverseShift * output_shift;
DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
bias_data, DimsToShape(output_dims), output_data);
const RuntimeShape output_shape = DimsToShape(output_dims);
const int output_height = output_shape.Dims(1);
DepthwiseConvImpl(op_params, DimsToShape(input_dims), input_data,
DimsToShape(filter_dims), filter_data,
DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
output_data, /*thread_start=*/0,
/*thread_end=*/output_height, /*thread_dim=*/1);
}
inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,

View File

@ -327,6 +327,7 @@ class SingleOpModel {
}
// Sets the number of threads the underlying interpreter may use for op
// execution. Must only be called after the interpreter has been created.
void SetNumThreads(int num_threads) {
CHECK(interpreter_ != nullptr);
interpreter_->SetNumThreads(num_threads);
}