DepthwiseConv, dot-product kernel code, improve workspace cache handling.
PiperOrigin-RevId: 245142197
This commit is contained in:
parent
da9a113ccd
commit
b57cabed82
@ -5351,8 +5351,8 @@ struct DepthwiseConvThroughDepth {
|
|||||||
// |start_depth| to |end_depth|. Keep this not inlined to maintain a small
|
// |start_depth| to |end_depth|. Keep this not inlined to maintain a small
|
||||||
// binary size. We use a DepthwiseConvParams struct for read only params
|
// binary size. We use a DepthwiseConvParams struct for read only params
|
||||||
// to minimize call overhead.
|
// to minimize call overhead.
|
||||||
static __attribute__((noinline)) void Run(
|
static void __attribute__((noinline))
|
||||||
const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
|
Run(const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
|
||||||
uint8* output_ptr, int64_t start_depth, int64_t end_depth,
|
uint8* output_ptr, int64_t start_depth, int64_t end_depth,
|
||||||
int64_t input_depth, int64_t input_row_size, int32 output_window_height,
|
int64_t input_depth, int64_t input_row_size, int32 output_window_height,
|
||||||
int32 output_window_width, const DepthwiseConvParams& params) {
|
int32 output_window_width, const DepthwiseConvParams& params) {
|
||||||
@ -5767,11 +5767,44 @@ inline void DepthwiseConv3x3Filter(
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Perform any necessary cache hinting and pre-writing.
|
||||||
|
template <DepthwiseConvImplementation implementation>
|
||||||
|
struct WorkspacePrefetchWrite {
|
||||||
|
static inline void Run(int8 fill_data, int size, int8* workspace) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
#if defined(USE_NEON) && defined(__aarch64__)
|
||||||
|
// Encourage the processor to keep the workspace in cache. Both the cache hint
|
||||||
|
// and some memory writes are required.
|
||||||
|
//
|
||||||
|
// This code is extremely fragile.
|
||||||
|
// Do not edit without extensive comparative performance testing.
|
||||||
|
// Do not inline without great care.
|
||||||
|
// Do not rely on results before and after getting coffee: non-thermal changes
|
||||||
|
// of more than 10% can occur with hidden underlying processor state changes.
|
||||||
|
template <>
|
||||||
|
struct WorkspacePrefetchWrite<
|
||||||
|
DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
|
||||||
|
static void __attribute__((noinline))
|
||||||
|
Run(int8 fill_data, int size, int8* workspace) {
|
||||||
|
const int8x8_t fill_data_vec = vdup_n_s8(fill_data);
|
||||||
|
int i = 0;
|
||||||
|
for (; i < (size - 15); i += 64) {
|
||||||
|
int8* ptr = workspace + i;
|
||||||
|
asm volatile("prfm pstl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||||
|
vst1_lane_u32(reinterpret_cast<uint32_t*>(ptr), fill_data_vec, 0);
|
||||||
|
}
|
||||||
|
vst1_lane_u32(reinterpret_cast<uint32_t*>(workspace + size - 4),
|
||||||
|
fill_data_vec, 0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif // USE_NEON && __aarch64__
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_DOTPROD) && !defined(GOOGLE_L4T)
|
#if defined(__ARM_FEATURE_DOTPROD) && !defined(GOOGLE_L4T)
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
|
struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
|
||||||
static void ProcessPerDepthNeon(
|
static inline void ProcessPerDepthNeon(
|
||||||
const uint8* filter_data, const int32* bias_data,
|
const uint8* filter_data, const int32* bias_data,
|
||||||
int8* shuffled_filter_data, int32* adjusted_bias_data,
|
int8* shuffled_filter_data, int32* adjusted_bias_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
@ -6098,11 +6131,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
scratch_block_data + block_height * workspace_height_stride);
|
scratch_block_data + block_height * workspace_height_stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
static void __attribute__((noinline))
|
||||||
const uint8* input_block_data,
|
Run(int32 height_block_number, int32 width_block_number,
|
||||||
int8* scratch_block_data,
|
const uint8* input_block_data, int8* scratch_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
PreloadInputBlock(input_block_data, function_params);
|
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||||
PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
|
PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -6489,11 +6522,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
scratch_block_data + block_height * workspace_height_stride);
|
scratch_block_data + block_height * workspace_height_stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
static void __attribute__((noinline))
|
||||||
const uint8* input_block_data,
|
Run(int32 height_block_number, int32 width_block_number,
|
||||||
int8* scratch_block_data,
|
const uint8* input_block_data, int8* scratch_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
PreloadInputBlock(input_block_data, function_params);
|
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||||
PackMacroBlockNeon(height_block_number, width_block_number,
|
PackMacroBlockNeon(height_block_number, width_block_number,
|
||||||
input_block_data, scratch_block_data, function_params);
|
input_block_data, scratch_block_data, function_params);
|
||||||
}
|
}
|
||||||
@ -6838,11 +6871,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
scratch_block_data + block_height * workspace_height_stride);
|
scratch_block_data + block_height * workspace_height_stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
static void __attribute__((noinline))
|
||||||
const uint8* input_block_data,
|
Run(int32 height_block_number, int32 width_block_number,
|
||||||
int8* scratch_block_data,
|
const uint8* input_block_data, int8* scratch_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
PreloadInputBlock(input_block_data, function_params);
|
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||||
PackMacroBlockNeon(height_block_number, width_block_number,
|
PackMacroBlockNeon(height_block_number, width_block_number,
|
||||||
input_block_data, scratch_block_data, function_params);
|
input_block_data, scratch_block_data, function_params);
|
||||||
}
|
}
|
||||||
@ -7055,11 +7088,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
scratch_block_data + block_height * workspace_height_stride);
|
scratch_block_data + block_height * workspace_height_stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
static void __attribute__((noinline))
|
||||||
const uint8* input_block_data,
|
Run(int32 height_block_number, int32 width_block_number,
|
||||||
int8* scratch_block_data,
|
const uint8* input_block_data, int8* scratch_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
PreloadInputBlock(input_block_data, function_params);
|
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||||
PackMacroBlockNeon(height_block_number, width_block_number,
|
PackMacroBlockNeon(height_block_number, width_block_number,
|
||||||
input_block_data, scratch_block_data, function_params);
|
input_block_data, scratch_block_data, function_params);
|
||||||
}
|
}
|
||||||
@ -7677,9 +7710,9 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
}
|
}
|
||||||
} // NOLINT(readability/fn_size) Manually unrolled.
|
} // NOLINT(readability/fn_size) Manually unrolled.
|
||||||
|
|
||||||
static inline void Run(const int8* scratch_block_data,
|
static void __attribute__((noinline))
|
||||||
const int8* filter_workspace, const int32* bias_data,
|
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||||
uint8* output_block_data,
|
const int32* bias_data, uint8* output_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||||
output_block_data, function_params);
|
output_block_data, function_params);
|
||||||
@ -8104,9 +8137,9 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
}
|
}
|
||||||
} // NOLINT(readability/fn_size) Manually unrolled.
|
} // NOLINT(readability/fn_size) Manually unrolled.
|
||||||
|
|
||||||
static inline void Run(const int8* scratch_block_data,
|
static void __attribute__((noinline))
|
||||||
const int8* filter_workspace, const int32* bias_data,
|
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||||
uint8* output_block_data,
|
const int32* bias_data, uint8* output_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||||
output_block_data, function_params);
|
output_block_data, function_params);
|
||||||
@ -8747,9 +8780,9 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
}
|
}
|
||||||
} // NOLINT(readability/fn_size) Manually unrolled.
|
} // NOLINT(readability/fn_size) Manually unrolled.
|
||||||
|
|
||||||
static inline void Run(const int8* scratch_block_data,
|
static void __attribute__((noinline))
|
||||||
const int8* filter_workspace, const int32* bias_data,
|
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||||
uint8* output_block_data,
|
const int32* bias_data, uint8* output_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||||
output_block_data, function_params);
|
output_block_data, function_params);
|
||||||
@ -9273,9 +9306,9 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void Run(const int8* scratch_block_data,
|
static void __attribute__((noinline))
|
||||||
const int8* filter_workspace, const int32* bias_data,
|
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||||
uint8* output_block_data,
|
const int32* bias_data, uint8* output_block_data,
|
||||||
const DepthwiseConvDotProdParams* function_params) {
|
const DepthwiseConvDotProdParams* function_params) {
|
||||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||||
output_block_data, function_params);
|
output_block_data, function_params);
|
||||||
@ -9751,6 +9784,15 @@ inline void DepthwiseConvDotProduct3x3(
|
|||||||
function_params.output_height_stride = output_height_stride;
|
function_params.output_height_stride = output_height_stride;
|
||||||
function_params.residual_width = residual_micro_width;
|
function_params.residual_width = residual_micro_width;
|
||||||
|
|
||||||
|
// Prefetch workspace for write, along with any necessary dummy writes.
|
||||||
|
const int max_workspace_height_stride =
|
||||||
|
16 * ((workspace_width_micro_repeats + 3) >> 2) * largest_macro_depth;
|
||||||
|
const int workspace_fill_size = std::min(
|
||||||
|
kDepthwiseConvScratchWorkspaceSize,
|
||||||
|
height_block_size * max_workspace_height_stride + kWorkspaceExtension);
|
||||||
|
WorkspacePrefetchWrite<implementation>::Run(
|
||||||
|
params.weights_offset, workspace_fill_size, macroblock_workspace);
|
||||||
|
|
||||||
// Main process.
|
// Main process.
|
||||||
//
|
//
|
||||||
// Most kernels are nested batch-height-width-depth. Here we proceed over
|
// Most kernels are nested batch-height-width-depth. Here we proceed over
|
||||||
|
Loading…
x
Reference in New Issue
Block a user