Depthwise convolution 3x3 per-channel int8 for dot-product ARM (13).
PiperOrigin-RevId: 295749216
Change-Id: Ieea413c1e525a06e04fe957603d167cda46e3318
Parent: 49aa204fc3
Commit: b9a0bd18a9
@@ -27,6 +27,21 @@ namespace tflite {
 namespace optimized_ops {
 namespace depthwise_conv {
 
+#ifdef USE_NEON
+inline int8x16_t util_vld1q_x8(const uint8* data_addr) {
+  return vreinterpretq_s8_u8(vld1q_u8(data_addr));
+}
+inline int8x16_t util_vld1q_x8(const int8* data_addr) {
+  return vld1q_s8(data_addr);
+}
+inline int8x8_t util_vld1_x8(const uint8* data_addr) {
+  return vreinterpret_s8_u8(vld1_u8(data_addr));
+}
+inline int8x8_t util_vld1_x8(const int8* data_addr) {
+  return vld1_s8(data_addr);
+}
+#endif
+
 #define STR(s) STR_UNEXPANDED(s)
 #define STR_UNEXPANDED(s) #s
 
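Note on the helpers added above: the paired overloads let code that is templated over the external data type load either uint8 or int8 bytes into a signed NEON register from a single call site, confining the reinterpret to one place. A minimal sketch of the intended use (the function name LoadMicroBlock is illustrative, not from the patch):

  // Works for ExternalT = uint8 or int8; overload resolution picks the
  // right load at compile time, and the result is always int8x16_t.
  template <typename ExternalT>
  inline int8x16_t LoadMicroBlock(const ExternalT* src) {
    return util_vld1q_x8(src);
  }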
@@ -5907,13 +5922,15 @@ struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
   }
 };
 
-template <>
+template <QuantizationType quantization_type>
 struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
-                      QuantizationType::kNonPerChannelUint8,
+                      quantization_type,
                       DepthwiseConvDepthMultiplication::kNoMultiplication,
                       /*max_padding=*/0> {
   static inline void PackMacroBlockNeon(
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
     TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
     TFLITE_DCHECK_EQ(function_params->padding_top, 0);
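Note: the recurring change in this patch swaps a full specialization on QuantizationType::kNonPerChannelUint8 for a partial one parameterized on quantization_type, so a single NEON body serves both the existing uint8 path and the per-channel int8 path. A sketch of the trait this leans on, assuming the int8 enumerator is named kPerChannelInt8 (only the uint8 side is visible in this diff):

  template <QuantizationType quantization_type>
  struct QuantizationTypeImpl {};

  template <>
  struct QuantizationTypeImpl<QuantizationType::kNonPerChannelUint8> {
    typedef uint8 ExternalType;  // raw input bytes are unsigned
  };

  template <>
  struct QuantizationTypeImpl<QuantizationType::kPerChannelInt8> {
    typedef int8 ExternalType;  // raw input bytes are already signed
  };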
@@ -5932,7 +5949,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     const int input_depth = function_params->input_depth;
 
     TFLITE_DCHECK_GE(depth_micro_repeats, 0);
-    constexpr uint8 kSignBit = 0x80;
+    constexpr uint8 kSignBit =
+        QuantizationTypeImpl<quantization_type>::kUint8SignBit;
     const int micro_block_size = 4 * 8;
     const int depth_advance = width_overall_micro_repeats * micro_block_size;
     const int width_advance =
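Note: judging from the literals being replaced (0x80 here, 128 for kSymmetricZeroPoint in a later hunk), the trait presumably also carries per-type constants, roughly:

  template <>
  struct QuantizationTypeImpl<QuantizationType::kNonPerChannelUint8> {
    typedef uint8 ExternalType;
    static constexpr uint8 kUint8SignBit = 0x80;        // was literal 0x80
    static constexpr int kIntSymmetricZeroPoint = 128;  // was literal 128
  };

For per-channel int8 input no conversion is needed, so the int8 specialization plausibly sets both constants to zero (an assumption; those values do not appear in this diff).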
@@ -5948,14 +5966,14 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     int8x16_t work_reg_b;
 
     // Effect subtraction of zero-point = 128 by XOR of sign bit.
-    const int8x16_t sign_bit = vdupq_n_s8(kSignBit);
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
 
     // Work through one slice, by row, at a time.
     int8* scratch_data_0 = scratch_block_data;
 
     for (int k_height = 0; k_height < block_height; ++k_height) {
-      const int8* input_data_0 =
-          reinterpret_cast<const int8*>(input_block_data);
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_data_0 = input_block_data;
       int8x16_t input_data_a;
       int8x16_t input_data_b;
       int8x16_t input_data_c;
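Note: the comment above is the whole uint8-to-int8 story: flipping the top bit of a uint8 value and reinterpreting it as int8 equals subtracting the symmetric zero point of 128. A scalar check of the identity the veorq_s8 lines depend on:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int x = 0; x <= 255; ++x) {
      const uint8_t u = static_cast<uint8_t>(x);
      const int8_t converted = static_cast<int8_t>(u ^ 0x80);
      assert(converted == x - 128);  // e.g. 200 ^ 0x80 = 72 = 200 - 128
    }
    return 0;
  }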
@@ -5976,29 +5994,27 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
       if (depth_micro_repeats >= 2) {
         i_depth += 2;
 
         //
 
-        input_data_a = vld1q_s8(input_data_0);
-        input_data_b = vld1q_s8(input_data_0 + 1 * input_depth);
-        input_data_c = vld1q_s8(input_data_0 + 2 * input_depth);
-        input_data_d = vld1q_s8(input_data_0 + 3 * input_depth);
+        input_data_a = util_vld1q_x8(input_data_0);
+        input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
+        input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
+        input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
         input_data_0 += 16;
 
         //
 
         for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-          work_reg_a = veorq_s8(work_reg_a, sign_bit);
-          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a = veorq_s8(work_reg_a, sign_bit);
+            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          }
 
           work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
           work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
           vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
 
-          input_data_a = vld1q_s8(input_data_0);
-          input_data_b = vld1q_s8(input_data_0 + 1 * input_depth);
+          input_data_a = util_vld1q_x8(input_data_0);
+          input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
           vst1q_s8(scratch_data_0, work_reg_a);
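Note: in the loop above only the zero-point handling changes; the loads and the zip-based 4x4 interleaving are shared by both paths. The XOR is now guarded so it runs only for uint8 input, and since quantization_type is a template parameter the guard is a compile-time constant, so the untaken branch should be eliminated. A scalar sketch of what one packed byte sees (Pack and its arguments are illustrative, not from the patch):

  int8 Pack(int8 loaded_byte, QuantizationType quantization_type) {
    if (quantization_type == QuantizationType::kNonPerChannelUint8) {
      // loaded_byte came from uint8 data; remove the 128 zero point.
      loaded_byte = static_cast<int8>(loaded_byte ^ 0x80);
    }
    return loaded_byte;  // int8 data passes through unchanged
  }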
@@ -6006,41 +6022,43 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
           scratch_data_0 += depth_advance;
 
-          work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
-          work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+            work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+          }
 
-          input_data_c = vld1q_s8(input_data_0 + 2 * input_depth);
-          input_data_d = vld1q_s8(input_data_0 + 3 * input_depth);
+          input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
+          input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
           vst1q_s8(scratch_data_0, work_reg_a_sp);
           vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
 
           scratch_data_0 += depth_advance;
 
           //
 
           input_data_0 += 16;
         }
 
         work_reg_a = vzip1q_s8(input_data_a, input_data_b);
         work_reg_b = vzip1q_s8(input_data_c, input_data_d);
         vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-        work_reg_a = veorq_s8(work_reg_a, sign_bit);
-        work_reg_b = veorq_s8(work_reg_b, sign_bit);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+        }
         optimized_ops_prefetch_write_l1_keep(scratch_data_0);
         optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
         vst1q_s8(scratch_data_0, work_reg_a);
         vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
         scratch_data_0 += depth_advance;
         //
 
         work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
         work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
         vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
-        work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
-        work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+          work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+        }
 
         optimized_ops_prefetch_write_l1_keep(scratch_data_0);
         optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
@@ -6063,8 +6081,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
         input_data_0 += 8;
 
         vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-        work_reg_a = veorq_s8(work_reg_a, sign_bit);
-        work_reg_b = veorq_s8(work_reg_b, sign_bit);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+        }
 
         optimized_ops_prefetch_write_l1_keep(scratch_data_0);
         optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
@@ -6082,9 +6102,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
       TFLITE_DCHECK_GT(residual_width, 0);
       TFLITE_DCHECK_LT(residual_width, 4);
       for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-        input_data_c = vdupq_n_s8(kSignBit);
+        input_data_c = vdupq_n_u8(kSignBit);
         input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
-        input_data_d = vdupq_n_s8(kSignBit);
+        input_data_d = vdupq_n_u8(kSignBit);
         if (residual_width > 1) {
           input_data_b =
               vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
@@ -6096,8 +6116,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
         work_reg_a = vzip1q_s8(input_data_a, input_data_b);
         work_reg_b = vzip1q_s8(input_data_c, input_data_d);
 
-        work_reg_a = veorq_s8(work_reg_a, sign_bit);
-        work_reg_b = veorq_s8(work_reg_b, sign_bit);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+        }
         vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
 
         optimized_ops_prefetch_write_l1_keep(scratch_data_0);
@@ -6111,6 +6133,7 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
         scratch_data_0 += width_advance;
         input_data_0 += input_depth_skip;
       }
 
       scratch_data_0 += height_advance;
       input_block_data += input_height_stride;
     }
@@ -6121,23 +6144,28 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
   static void __attribute__((noinline))
   Run(int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock<uint8>(input_block_data, function_params);
+    PreloadInputBlock(input_block_data, function_params);
     PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
   }
 };
 
-template <>
+template <QuantizationType quantization_type>
 struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
-                      QuantizationType::kNonPerChannelUint8,
+                      quantization_type,
                       DepthwiseConvDepthMultiplication::kNoMultiplication,
                       /*max_padding=*/1> {
   static inline void PackMacroBlockNeon(
       int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
-    constexpr uint8 kSignBit = 0x80;
+    constexpr uint8 kSignBit =
+        QuantizationTypeImpl<quantization_type>::kUint8SignBit;
 
     const int workspace_height_stride =
         function_params->workspace_height_stride;
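Note: dropping the explicit argument in PreloadInputBlock<uint8>(...) works because the pointer now carries the element type. Presumably PreloadInputBlock is templated along these lines (signature assumed; it is not shown in this diff):

  template <typename T>
  inline void PreloadInputBlock(
      const T* input_block_data,
      const DepthwiseConvDotProdParams* function_params);

  // T is deduced as uint8 or int8 from ExternalType, so one call site
  // serves both quantization paths.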
@@ -6157,7 +6185,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     const int padding_bottom = function_params->padding_bottom;
 
     TFLITE_DCHECK_GT(depth_micro_repeats, 0);
-    constexpr int kSymmetricZeroPoint = 128;
+    constexpr int kSymmetricZeroPoint =
+        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
 
     const int micro_block_size = 4 * 8;
     const int depth_advance = width_overall_micro_repeats * micro_block_size;
@@ -6188,7 +6217,7 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     int8x16_t work_reg_b;
 
     // Effect subtraction of zero-point = 128 by XOR of sign bit.
-    const int8x16_t sign_bit = vdupq_n_s8(kSignBit);
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
 
     // Work through one slice, by row, at a time.
     int8* scratch_data_0 = scratch_block_data;
@@ -6205,8 +6234,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     }
 
     for (int k_height = 0; k_height < copy_block_height; ++k_height) {
-      const int8* input_data_0 =
-          reinterpret_cast<const int8*>(input_block_data);
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_data_0 = input_block_data;
       int8x16_t input_data_a;
       int8x16_t input_data_b;
       int8x16_t input_data_c;
@@ -6241,29 +6270,28 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
         if (depth_micro_repeats >= 2) {
           i_depth += 2;
 
           //
 
-          input_data_a = vld1q_s8(input_data_0);
-          input_data_b = vld1q_s8(input_data_0 + 1 * input_depth);
-          input_data_c = vld1q_s8(input_data_0 + 2 * input_depth);
-          input_data_d = vld1q_s8(input_data_0 + 3 * input_depth);
+          input_data_a = util_vld1q_x8(input_data_0);
+          input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
+          input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
+          input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
           input_data_0 += 16;
 
           //
 
           for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
             work_reg_a = vzip1q_s8(input_data_a, input_data_b);
             work_reg_b = vzip1q_s8(input_data_c, input_data_d);
             vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-            work_reg_a = veorq_s8(work_reg_a, sign_bit);
-            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            if (quantization_type ==
+                QuantizationType::kNonPerChannelUint8) {
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            }
 
             work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
             work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
             vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
 
-            input_data_a = vld1q_s8(input_data_0);
-            input_data_b = vld1q_s8(input_data_0 + 1 * input_depth);
+            input_data_a = util_vld1q_x8(input_data_0);
+            input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
             vst1q_s8(scratch_data_0, work_reg_a);
@@ -6271,41 +6299,44 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
             scratch_data_0 += depth_advance;
 
-            work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
-            work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+            if (quantization_type ==
+                QuantizationType::kNonPerChannelUint8) {
+              work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+              work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+            }
 
-            input_data_c = vld1q_s8(input_data_0 + 2 * input_depth);
-            input_data_d = vld1q_s8(input_data_0 + 3 * input_depth);
+            input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
+            input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
             vst1q_s8(scratch_data_0, work_reg_a_sp);
             vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
 
             scratch_data_0 += depth_advance;
 
             //
 
             input_data_0 += 16;
           }
 
           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-          work_reg_a = veorq_s8(work_reg_a, sign_bit);
-          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a = veorq_s8(work_reg_a, sign_bit);
+            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          }
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
           vst1q_s8(scratch_data_0, work_reg_a);
           vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
           scratch_data_0 += depth_advance;
           //
 
           work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
           work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
           vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
-          work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
-          work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+            work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+          }
 
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
@@ -6328,8 +6359,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
           input_data_0 += 8;
 
           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-          work_reg_a = veorq_s8(work_reg_a, sign_bit);
-          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a = veorq_s8(work_reg_a, sign_bit);
+            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          }
 
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
@@ -6343,10 +6376,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
       } else {
         TFLITE_DCHECK_LT(adjusted_residual_width, 4);
         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-          input_data_a = vdupq_n_s8(-input_offset);
-          input_data_b = vdupq_n_s8(-input_offset);
-          input_data_c = vdupq_n_s8(-input_offset);
-          input_data_d = vdupq_n_s8(-input_offset);
+          input_data_a = vdupq_n_u8(-input_offset);
+          input_data_b = vdupq_n_u8(-input_offset);
+          input_data_c = vdupq_n_u8(-input_offset);
+          input_data_d = vdupq_n_u8(-input_offset);
           if (adjusted_residual_width > 0) {
             input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
             if (adjusted_residual_width > 1) {
@@ -6361,8 +6394,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
 
-          work_reg_a = veorq_s8(work_reg_a, sign_bit);
-          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a = veorq_s8(work_reg_a, sign_bit);
+            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          }
           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
 
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
@@ -6386,29 +6421,28 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
         if (depth_micro_repeats >= 2) {
           i_depth += 2;
 
           //
 
-          input_data_a = vdupq_n_s8(-input_offset);
-          input_data_b = vld1q_s8(input_data_0 + 1 * input_depth);
-          input_data_c = vld1q_s8(input_data_0 + 2 * input_depth);
-          input_data_d = vld1q_s8(input_data_0 + 3 * input_depth);
+          input_data_a = vdupq_n_u8(-input_offset);
+          input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
+          input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
+          input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
           input_data_0 += 16;
 
           //
 
           for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
             work_reg_a = vzip1q_s8(input_data_a, input_data_b);
             work_reg_b = vzip1q_s8(input_data_c, input_data_d);
             vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-            work_reg_a = veorq_s8(work_reg_a, sign_bit);
-            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            if (quantization_type ==
+                QuantizationType::kNonPerChannelUint8) {
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            }
 
             work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
             work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
             vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
 
-            input_data_a = vdupq_n_s8(-input_offset);
-            input_data_b = vld1q_s8(input_data_0 + 1 * input_depth);
+            input_data_a = vdupq_n_u8(-input_offset);
+            input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
             vst1q_s8(scratch_data_0, work_reg_a);
@@ -6416,41 +6450,44 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
             scratch_data_0 += depth_advance;
 
-            work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
-            work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+            if (quantization_type ==
+                QuantizationType::kNonPerChannelUint8) {
+              work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+              work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+            }
 
-            input_data_c = vld1q_s8(input_data_0 + 2 * input_depth);
-            input_data_d = vld1q_s8(input_data_0 + 3 * input_depth);
+            input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
+            input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
             vst1q_s8(scratch_data_0, work_reg_a_sp);
             vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
 
             scratch_data_0 += depth_advance;
 
             //
 
             input_data_0 += 16;
           }
 
           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-          work_reg_a = veorq_s8(work_reg_a, sign_bit);
-          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a = veorq_s8(work_reg_a, sign_bit);
+            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+          }
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
           vst1q_s8(scratch_data_0, work_reg_a);
           vst1q_s8(scratch_data_0 + 16, work_reg_b);
 
           scratch_data_0 += depth_advance;
           //
 
           work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
           work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
           vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
-          work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
-          work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+          if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+            work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
+            work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
+          }
 
           optimized_ops_prefetch_write_l1_keep(scratch_data_0);
           optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
@@ -6460,7 +6497,7 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
             scratch_data_0 += depth_advance;
           }
           for (; i_depth < depth_micro_repeats; ++i_depth) {
-            input_data_a = vdupq_n_s8(-input_offset);
+            input_data_a = vdupq_n_u8(-input_offset);
            input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
                                           input_data_b, 0);
            input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
@@ -6473,8 +6510,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
             input_data_0 += 8;
 
             vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
-            work_reg_a = veorq_s8(work_reg_a, sign_bit);
-            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            }
 
             optimized_ops_prefetch_write_l1_keep(scratch_data_0);
             optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16);
@@ -6489,10 +6528,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
           TFLITE_DCHECK_LT(adjusted_residual_width, 4);
 
           for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
-            input_data_a = vdupq_n_s8(-input_offset);
-            input_data_b = vdupq_n_s8(-input_offset);
-            input_data_c = vdupq_n_s8(-input_offset);
-            input_data_d = vdupq_n_s8(-input_offset);
+            input_data_a = vdupq_n_u8(-input_offset);
+            input_data_b = vdupq_n_u8(-input_offset);
+            input_data_c = vdupq_n_u8(-input_offset);
+            input_data_d = vdupq_n_u8(-input_offset);
             // Skip loading first column.
             if (adjusted_residual_width > 1) {
               input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
@@ -6505,8 +6544,10 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
             work_reg_a = vzip1q_s8(input_data_a, input_data_b);
             work_reg_b = vzip1q_s8(input_data_c, input_data_d);
 
-            work_reg_a = veorq_s8(work_reg_a, sign_bit);
-            work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+            }
             vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
 
             optimized_ops_prefetch_write_l1_keep(scratch_data_0);
@@ -6538,22 +6579,26 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
   static void __attribute__((noinline))
   Run(int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock<uint8>(input_block_data, function_params);
+    PreloadInputBlock(input_block_data, function_params);
     PackMacroBlockNeon(height_block_number, width_block_number,
                        input_block_data, scratch_block_data, function_params);
   }
 };
 
-template <>
+template <QuantizationType quantization_type>
 struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
-                      QuantizationType::kNonPerChannelUint8,
+                      quantization_type,
                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
                       /*max_padding=*/1> {
   static inline void PackMacroBlockNeon(
       int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
     const int workspace_height_stride =
         function_params->workspace_height_stride;
@@ -6570,7 +6615,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     const int padding_top = function_params->padding_top;
     const int padding_bottom = function_params->padding_bottom;
 
-    constexpr int kSymmetricZeroPoint = 128;
+    constexpr int kSymmetricZeroPoint =
+        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
 
     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
 
@@ -6631,7 +6677,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     int scratch_data_offset = 0;
     int input_block_offset = 0;
 
-    constexpr uint8 kSignBit = 0x80;
+    constexpr uint8 kSignBit =
+        QuantizationTypeImpl<quantization_type>::kUint8SignBit;
 
     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
    // code. Note the blocks of 4x4 are still interleaved down the depth.
@@ -6640,8 +6687,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     int8x8_t padding_mask;
 
     // Effect subtraction of zero-point = 128 by XOR of sign bit.
-    const int8x16_t sign_bit = vdupq_n_s8(kSignBit);
-    const int8x16_t padding_reg = vdupq_n_s8(-input_offset);
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
     padding_mask = vdup_n_s8(-1);
     half_work_reg = vdup_n_s8(0);
 
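Note: padding lanes are filled with -input_offset so a padded position decodes to a real input value of zero once the zero point is handled. Worked through for the uint8 path with input_offset = 128 (value chosen for illustration): unsigned negation wraps mod 256, so vdupq_n_u8(-128) duplicates the byte 0x80, and after the sign-bit XOR, 0x80 ^ 0x80 = 0x00, a zero-valued input.

  // Scalar restatement (illustrative):
  const uint8 pad_byte = static_cast<uint8>(-input_offset);  // 128 -> 0x80
  const int8 decoded = static_cast<int8>(pad_byte ^ 0x80);   // == 0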
@@ -6660,10 +6707,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
       // iteration of the main copy loop. In the case of leading width
       // padding, we unroll this specially.
       if (leading_width_padding) {
-        work_reg = vld1q_s8(reinterpret_cast<const int8*>(
-            input_block_data + input_block_offset));
+        work_reg = util_vld1q_x8(input_block_data + input_block_offset);
         work_reg = vextq_s8(padding_reg, work_reg, 15);
-        work_reg = veorq_s8(work_reg, sign_bit);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg = veorq_s8(work_reg, sign_bit);
+        }
         optimized_ops_prefetch_write_l1_keep(scratch_data);
         vst1q_s8(scratch_data, work_reg);
         copy_done += 15;
@@ -6671,9 +6719,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
       // Main copy loop.
      for (; (copy_done + 16) <= copy_size; copy_done += 16) {
-        work_reg = vld1q_s8(reinterpret_cast<const int8*>(
-            input_block_data + input_block_offset + copy_done));
-        work_reg = veorq_s8(work_reg, sign_bit);
+        work_reg =
+            util_vld1q_x8(input_block_data + input_block_offset + copy_done);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg = veorq_s8(work_reg, sign_bit);
+        }
         TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
         optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
                                              copy_done);
@@ -6681,9 +6731,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
       }
 
       if (copy_done + 8 <= copy_size) {
-        half_work_reg = vld1_s8(reinterpret_cast<const int8*>(
-            input_block_data + input_block_offset + copy_done));
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        half_work_reg =
+            util_vld1_x8(input_block_data + input_block_offset + copy_done);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
         TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
         optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
                                              copy_done);
@@ -6703,16 +6755,17 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
        // Employ overlapping-load strategy in order to load full register,
        // but use only part.
        // This has the advantage of resulting in zeros after shifting.
-        half_work_reg = vld1_s8(reinterpret_cast<const int8*>(
-            input_block_data + input_block_offset + copy_size - 8));
-
+        half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
+                                     copy_size - 8);
         half_work_reg = vreinterpret_s8_s64(
             vshl_s64(vreinterpret_s64_s8(half_work_reg),
                      vdup_n_s64(-8 * (8 - copy_remaining))));
         half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
                                 vget_low_s8(padding_reg), half_work_reg);
 
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
         TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
         optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
                                              copy_done);
@@ -6748,7 +6801,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
        half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
                                      half_work_reg, 0);
        half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
        optimized_ops_prefetch_write_l1_keep(scratch_data);
        vst1_lane_8x4(scratch_data, half_work_reg, 0);
        copy_done += 3;
@@ -6759,7 +6814,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
        half_work_reg =
            vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
                          half_work_reg, 0);
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
        TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
        optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
                                             copy_done);
@@ -6790,7 +6847,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
        half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
                                vget_low_s8(padding_reg), half_work_reg);
 
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
        TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
        optimized_ops_prefetch_write_l1_keep(scratch_data + start_width +
                                             copy_done);
@@ -6823,7 +6882,7 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
      TFLITE_DCHECK(trailing_width_padding);
 
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
-        half_work_reg = vdup_n_s8(-input_offset);
+        half_work_reg = vdup_n_u8(-input_offset);
        half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
                                         input_block_data + input_block_offset),
                                     half_work_reg, 1);
@@ -6836,7 +6895,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
                                         input_block_offset + 2),
                                     half_work_reg, 3);
 
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
        TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
        optimized_ops_prefetch_write_l1_keep(scratch_data_base +
                                             scratch_data_offset);
@@ -6865,7 +6926,7 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
      padding_mask = vreinterpret_s8_s64(vshl_s64(
          vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
      if (leading_width_padding) {
-        padding_mask = vset_lane_s8(255, padding_mask, 0);
+        padding_mask = vset_lane_u8(255, padding_mask, 0);
      }
 
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
@@ -6884,7 +6945,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
        half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
                                vget_low_s8(padding_reg), half_work_reg);
 
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
        optimized_ops_prefetch_write_l1_keep(scratch_data_base +
                                             scratch_data_offset);
@@ -6925,22 +6988,26 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
   static void __attribute__((noinline))
   Run(int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock<uint8>(input_block_data, function_params);
+    PreloadInputBlock(input_block_data, function_params);
     PackMacroBlockNeon(height_block_number, width_block_number,
                        input_block_data, scratch_block_data, function_params);
   }
 };
 
-template <>
+template <QuantizationType quantization_type>
 struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
-                      QuantizationType::kNonPerChannelUint8,
+                      quantization_type,
                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
                       /*max_padding=*/0> {
   static inline void PackMacroBlockNeon(
       int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
     const int workspace_height_stride =
         function_params->workspace_height_stride;
@@ -6980,7 +7047,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     int scratch_data_offset = 0;
     int input_block_offset = 0;
 
-    constexpr uint8 kSignBit = 0x80;
+    constexpr uint8 kSignBit =
+        QuantizationTypeImpl<quantization_type>::kUint8SignBit;
 
     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
    // code. Note the blocks of 4x4 are still interleaved down the depth.
@@ -6988,7 +7056,7 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
     int8x8_t half_work_reg;
 
     // Effect subtraction of zero-point = 128 by XOR of sign bit.
-    const int8x16_t sign_bit = vdupq_n_s8(kSignBit);
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
     half_work_reg = vdup_n_s8(0);
 
     if (copy_size >= 16) {
@@ -7002,18 +7070,22 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
      // Main copy loop.
      for (; (copy_done + 16) <= copy_size; copy_done += 16) {
-        work_reg = vld1q_s8(reinterpret_cast<const int8*>(
-            input_block_data + input_block_offset + copy_done));
-        work_reg = veorq_s8(work_reg, sign_bit);
+        work_reg =
+            util_vld1q_x8(input_block_data + input_block_offset + copy_done);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          work_reg = veorq_s8(work_reg, sign_bit);
+        }
        TFLITE_DCHECK_EQ(copy_done % 16, 0);
        optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
        vst1q_s8(scratch_data + copy_done, work_reg);
      }
 
      if (copy_done + 8 <= copy_size) {
-        half_work_reg = vld1_s8(reinterpret_cast<const int8*>(
-            input_block_data + input_block_offset + copy_done));
-        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        half_work_reg =
+            util_vld1_x8(input_block_data + input_block_offset + copy_done);
+        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        }
        TFLITE_DCHECK_EQ(copy_done % 8, 0);
        optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
        vst1_s8(scratch_data + copy_done, half_work_reg);
@@ -7032,14 +7104,16 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
      // Employ overlapping-load strategy in order to load full register,
      // but use only part.
      // This has the advantage of resulting in zeros after shifting.
-      half_work_reg = vld1_s8(reinterpret_cast<const int8*>(
-          input_block_data + input_block_offset + copy_size - 8));
+      half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
+                                   copy_size - 8);
 
      half_work_reg = vreinterpret_s8_s64(
          vshl_s64(vreinterpret_s64_s8(half_work_reg),
                   vdup_n_s64(-8 * (8 - copy_remaining))));
 
-      half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+      if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+      }
      TFLITE_DCHECK_EQ(copy_done % 8, 0);
      optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
      vst1_s8(scratch_data + copy_done, half_work_reg);
@@ -7069,7 +7143,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
      half_work_reg =
          vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
                        half_work_reg, 0);
-      half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+      if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+      }
      TFLITE_DCHECK_EQ(copy_done % 4, 0);
      optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
      vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
@@ -7096,7 +7172,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
          vshl_s64(vreinterpret_s64_s8(half_work_reg),
                   vdup_n_s64(-8 * (4 - copy_remaining))));
 
-      half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+      if (quantization_type == QuantizationType::kNonPerChannelUint8) {
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+      }
      TFLITE_DCHECK_EQ(copy_done % 4, 0);
      optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done);
      vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
@@ -7159,9 +7237,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
 
   static void __attribute__((noinline))
   Run(int32 height_block_number, int32 width_block_number,
-      const uint8* input_block_data, int8* scratch_block_data,
+      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
+          input_block_data,
+      int8* scratch_block_data,
       const DepthwiseConvDotProdParams* function_params) {
-    PreloadInputBlock<uint8>(input_block_data, function_params);
+    PreloadInputBlock(input_block_data, function_params);
     PackMacroBlockNeon(height_block_number, width_block_number,
                        input_block_data, scratch_block_data, function_params);
   }
@@ -37,18 +37,6 @@ namespace depthwise_conv {
 
 #ifdef USE_NEON
 
-inline int8x16_t util_vld1q_x8(const uint8* data_addr) {
-  return vreinterpretq_s8_u8(vld1q_u8(data_addr));
-}
-inline int8x16_t util_vld1q_x8(const int8* data_addr) {
-  return vld1q_s8(data_addr);
-}
-inline int8x8_t util_vld1_x8(const uint8* data_addr) {
-  return vreinterpret_s8_u8(vld1_u8(data_addr));
-}
-inline int8x8_t util_vld1_x8(const int8* data_addr) {
-  return vld1_s8(data_addr);
-}
 inline void util_vst1_x8(uint8* data_addr, int8x8_t reg) {
   return vst1_u8(data_addr, vreinterpret_u8_s8(reg));
 }
@@ -1999,7 +1987,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
 
    if (copy_size >= 16) {
      const int copy_remaining = (copy_size + start_width) & 0x7;
-      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      padding_mask = vreinterpret_s8_s64(vshl_s64(
+          vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
 
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
        // Work through one slice, by row, at a time.
@@ -2057,10 +2046,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
        half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
                                     copy_size - 8);
 
-        half_work_reg =
-            vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
-        half_work_reg =
-            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+        half_work_reg = vreinterpret_s8_s64(
+            vshl_s64(vreinterpret_s64_s8(half_work_reg),
+                     vdup_n_s64(-8 * (8 - copy_remaining))));
+        half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
+                                vget_low_s8(padding_reg), half_work_reg);
 
        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
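Note: this family of fixes replaces ill-typed vshl_u64 / vbsl_s8 calls on int8x8_t values with explicit vreinterpret round-trips; the bit-level behavior is unchanged. The underlying idiom: NEON vshl with a negative per-lane count shifts right, so viewing the 8-byte register as a single int64 discards the bytes that the overlapping load fetched but the copy does not want, and vbsl then refills the masked lanes from padding_reg. Worked through for copy_remaining = 3 (value chosen for illustration):

  // vshl_s64 count is -8 * (8 - 3) = -40, i.e. a 40-bit right shift:
  uint64_t reg = 0x1122334455667788;  // eight loaded bytes
  reg >>= 8 * (8 - 3);
  // reg == 0x0000000000112233: only the three wanted bytes remain.
  // (vshl_s64 shifts arithmetically, but vbsl overwrites the upper
  // lanes with padding afterwards, so the fill bits do not matter.)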
@@ -2078,7 +2068,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
      }
    } else if (copy_size >= 4) {
      const int copy_remaining = (copy_size + start_width) & 0x3;
-      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      padding_mask = vreinterpret_s8_s64(vshl_s64(
+          vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
 
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
        // Work through one slice, by row, at a time.
@@ -2130,10 +2121,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
              input_block_data + input_block_offset + copy_size - 4,
              half_work_reg, 0);
 
-        half_work_reg =
-            vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
-        half_work_reg =
-            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+        half_work_reg = vreinterpret_s8_s64(
+            vshl_s64(vreinterpret_s64_s8(half_work_reg),
+                     vdup_n_s64(-8 * (4 - copy_remaining))));
+        half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
+                                vget_low_s8(padding_reg), half_work_reg);
 
        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
@@ -2199,24 +2191,27 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
    } else {
      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
      const int copy_remaining = (copy_size + start_width) & 0x3;
-      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      padding_mask = vreinterpret_s8_s64(vshl_s64(
+          vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
      if (leading_width_padding) {
        padding_mask = vset_lane_u8(255, padding_mask, 0);
      }
 
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
        for (int i = 0; i < copy_size; ++i) {
-          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vreinterpret_s8_s64(
+              vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
          half_work_reg = vld1_lane_s8(
              reinterpret_cast<const int8*>(
                  input_block_data + input_block_offset + copy_size - 1 - i),
              half_work_reg, 0);
        }
        if (leading_width_padding) {
-          half_work_reg = vshl_n_s64(half_work_reg, 8);
+          half_work_reg = vreinterpret_s8_s64(
+              vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
        }
-        half_work_reg =
-            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+        half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
+                                vget_low_s8(padding_reg), half_work_reg);
 
        if (quantization_type == QuantizationType::kNonPerChannelUint8) {
          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
@@ -2376,8 +2371,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
      half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
                                   copy_size - 8);
 
-      half_work_reg =
-          vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+      half_work_reg = vreinterpret_s8_s64(
+          vshl_s64(vreinterpret_s64_s8(half_work_reg),
+                   vdup_n_s64(-8 * (8 - copy_remaining))));
 
      if (quantization_type == QuantizationType::kNonPerChannelUint8) {
        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
@@ -2432,8 +2428,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
              input_block_data + input_block_offset + copy_size - 4,
              half_work_reg, 0);
 
-      half_work_reg =
-          vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+      half_work_reg = vreinterpret_s8_s64(
+          vshl_s64(vreinterpret_s64_s8(half_work_reg),
+                   vdup_n_s64(-8 * (4 - copy_remaining))));
 
      if (quantization_type == QuantizationType::kNonPerChannelUint8) {
        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
@@ -2456,7 +2453,8 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
 
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
        for (int i = 0; i < copy_size; ++i) {
-          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vreinterpret_s8_s64(
+              vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
          half_work_reg = vld1_lane_s8(
              reinterpret_cast<const int8*>(
                  input_block_data + input_block_offset + copy_size - 1 - i),
              half_work_reg, 0);