DepthwiseConv, dot-product kernel code, improve workspace cache handling.
PiperOrigin-RevId: 245142197
This commit is contained in:
parent
da9a113ccd
commit
b57cabed82
@ -5351,8 +5351,8 @@ struct DepthwiseConvThroughDepth {
|
||||
// |start_depth| to |end_depth|. Keep this not inlined to maintain a small
|
||||
// binary size. We use a DepthwiseConvParams struct for read only params
|
||||
// to minimize call overhead.
|
||||
static __attribute__((noinline)) void Run(
|
||||
const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
|
||||
static void __attribute__((noinline))
|
||||
Run(const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
|
||||
uint8* output_ptr, int64_t start_depth, int64_t end_depth,
|
||||
int64_t input_depth, int64_t input_row_size, int32 output_window_height,
|
||||
int32 output_window_width, const DepthwiseConvParams& params) {
|
||||
@ -5767,11 +5767,44 @@ inline void DepthwiseConv3x3Filter(
|
||||
|
||||
#endif
|
||||
|
||||
// Perform any necessary cache hinting and pre-writing.
|
||||
template <DepthwiseConvImplementation implementation>
|
||||
struct WorkspacePrefetchWrite {
|
||||
static inline void Run(int8 fill_data, int size, int8* workspace) {}
|
||||
};
|
||||
|
||||
#if defined(USE_NEON) && defined(__aarch64__)
|
||||
// Encourage the processor to keep the workspace in cache. Both the cache hint
|
||||
// and some memory writes are required.
|
||||
//
|
||||
// This code is extremely fragile.
|
||||
// Do not edit without extensive comparative performance testing.
|
||||
// Do not inline without great care.
|
||||
// Do not rely on results before and after getting coffee: non-thermal changes
|
||||
// of more than 10% can occur with hidden underlying processor state changes.
|
||||
template <>
|
||||
struct WorkspacePrefetchWrite<
|
||||
DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
|
||||
static void __attribute__((noinline))
|
||||
Run(int8 fill_data, int size, int8* workspace) {
|
||||
const int8x8_t fill_data_vec = vdup_n_s8(fill_data);
|
||||
int i = 0;
|
||||
for (; i < (size - 15); i += 64) {
|
||||
int8* ptr = workspace + i;
|
||||
asm volatile("prfm pstl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
vst1_lane_u32(reinterpret_cast<uint32_t*>(ptr), fill_data_vec, 0);
|
||||
}
|
||||
vst1_lane_u32(reinterpret_cast<uint32_t*>(workspace + size - 4),
|
||||
fill_data_vec, 0);
|
||||
}
|
||||
};
|
||||
#endif // USE_NEON &&__aarch64__
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD) && !defined(GOOGLE_L4T)
|
||||
|
||||
template <>
|
||||
struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
|
||||
static void ProcessPerDepthNeon(
|
||||
static inline void ProcessPerDepthNeon(
|
||||
const uint8* filter_data, const int32* bias_data,
|
||||
int8* shuffled_filter_data, int32* adjusted_bias_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
@ -6098,11 +6131,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
scratch_block_data + block_height * workspace_height_stride);
|
||||
}
|
||||
|
||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data,
|
||||
int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock(input_block_data, function_params);
|
||||
static void __attribute__((noinline))
|
||||
Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data, int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||
PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
|
||||
}
|
||||
};
|
||||
@ -6489,11 +6522,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
scratch_block_data + block_height * workspace_height_stride);
|
||||
}
|
||||
|
||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data,
|
||||
int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock(input_block_data, function_params);
|
||||
static void __attribute__((noinline))
|
||||
Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data, int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||
PackMacroBlockNeon(height_block_number, width_block_number,
|
||||
input_block_data, scratch_block_data, function_params);
|
||||
}
|
||||
@ -6838,11 +6871,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
scratch_block_data + block_height * workspace_height_stride);
|
||||
}
|
||||
|
||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data,
|
||||
int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock(input_block_data, function_params);
|
||||
static void __attribute__((noinline))
|
||||
Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data, int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||
PackMacroBlockNeon(height_block_number, width_block_number,
|
||||
input_block_data, scratch_block_data, function_params);
|
||||
}
|
||||
@ -7055,11 +7088,11 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
scratch_block_data + block_height * workspace_height_stride);
|
||||
}
|
||||
|
||||
static inline void Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data,
|
||||
int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock(input_block_data, function_params);
|
||||
static void __attribute__((noinline))
|
||||
Run(int32 height_block_number, int32 width_block_number,
|
||||
const uint8* input_block_data, int8* scratch_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
PreloadInputBlock<uint8>(input_block_data, function_params);
|
||||
PackMacroBlockNeon(height_block_number, width_block_number,
|
||||
input_block_data, scratch_block_data, function_params);
|
||||
}
|
||||
@ -7677,10 +7710,10 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
}
|
||||
} // NOLINT(readability/fn_size) Manually unrolled.
|
||||
|
||||
static inline void Run(const int8* scratch_block_data,
|
||||
const int8* filter_workspace, const int32* bias_data,
|
||||
uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
static void __attribute__((noinline))
|
||||
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||
const int32* bias_data, uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||
output_block_data, function_params);
|
||||
}
|
||||
@ -8104,10 +8137,10 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
}
|
||||
} // NOLINT(readability/fn_size) Manually unrolled.
|
||||
|
||||
static inline void Run(const int8* scratch_block_data,
|
||||
const int8* filter_workspace, const int32* bias_data,
|
||||
uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
static void __attribute__((noinline))
|
||||
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||
const int32* bias_data, uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||
output_block_data, function_params);
|
||||
}
|
||||
@ -8747,10 +8780,10 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
}
|
||||
} // NOLINT(readability/fn_size) Manually unrolled.
|
||||
|
||||
static inline void Run(const int8* scratch_block_data,
|
||||
const int8* filter_workspace, const int32* bias_data,
|
||||
uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
static void __attribute__((noinline))
|
||||
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||
const int32* bias_data, uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||
output_block_data, function_params);
|
||||
}
|
||||
@ -9273,10 +9306,10 @@ struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
|
||||
}
|
||||
}
|
||||
|
||||
static inline void Run(const int8* scratch_block_data,
|
||||
const int8* filter_workspace, const int32* bias_data,
|
||||
uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
static void __attribute__((noinline))
|
||||
Run(const int8* scratch_block_data, const int8* filter_workspace,
|
||||
const int32* bias_data, uint8* output_block_data,
|
||||
const DepthwiseConvDotProdParams* function_params) {
|
||||
KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
|
||||
output_block_data, function_params);
|
||||
}
|
||||
@ -9751,6 +9784,15 @@ inline void DepthwiseConvDotProduct3x3(
|
||||
function_params.output_height_stride = output_height_stride;
|
||||
function_params.residual_width = residual_micro_width;
|
||||
|
||||
// Prefetch workspace for write, along with any necessary dummy writes.
|
||||
const int max_workspace_height_stride =
|
||||
16 * ((workspace_width_micro_repeats + 3) >> 2) * largest_macro_depth;
|
||||
const int workspace_fill_size = std::min(
|
||||
kDepthwiseConvScratchWorkspaceSize,
|
||||
height_block_size * max_workspace_height_stride + kWorkspaceExtension);
|
||||
WorkspacePrefetchWrite<implementation>::Run(
|
||||
params.weights_offset, workspace_fill_size, macroblock_workspace);
|
||||
|
||||
// Main process.
|
||||
//
|
||||
// Most kernels are nested batch-height-width-depth. Here we proceed over
|
||||
|
Loading…
Reference in New Issue
Block a user