diff --git a/tensorflow/lite/kernels/cpu_backend_gemm.h b/tensorflow/lite/kernels/cpu_backend_gemm.h
index eccf69f19d3..6ede8d2fc49 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm.h
+++ b/tensorflow/lite/kernels/cpu_backend_gemm.h
@@ -92,6 +92,7 @@ void Gemm(const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
           const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
           const GemmParams& params, CpuBackendContext* context) {
+  gemmlowp::ScopedProfilingLabel label("cpu_backend_gemm::Gemm");
   ValidateParams(lhs_params, rhs_params, dst_params, params);
   if (dst_params.cols == 1) {
     // GEMV case: try a custom fast GEMV path.
@@ -100,6 +101,7 @@ void Gemm(const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data,
       return;
     }
   }
+  gemmlowp::ScopedProfilingLabel label2("cpu_backend_gemm::Gemm: general GEMM");
   GemmImpl::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
                 dst_data, params, context);
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
index 3b686e5a1f7..017f1660e8c 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
@@ -144,6 +144,7 @@ bool CustomGemv(
     const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data,
     const GemmParams& params, CpuBackendContext* context) {
+  gemmlowp::ScopedProfilingLabel label("cpu_backend_gemm::Gemm: CustomGemv");
   using Impl = CustomGemvImpl;
   if (lhs_params.rows < Impl::kKernelRows) {
@@ -186,8 +187,8 @@ bool CustomGemv(
 // Some NEON helper functions used by CustomGemvImpl specializations below,
 // allowing for some type genericity in them.
 
-inline int16x8x2_t LoadAndSubtractZeroPoint(const std::uint8_t* src,
-                                            std::uint8_t zero_point) {
+inline int16x8x2_t Load16AndSubtractZeroPoint(const std::uint8_t* src,
+                                              std::uint8_t zero_point) {
   uint8x16_t src_u8 = vld1q_u8(src);
   int16x8_t src_s16_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src_u8)));
   int16x8_t src_s16_1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src_u8)));
@@ -198,8 +199,8 @@ inline int16x8x2_t LoadAndSubtractZeroPoint(const std::uint8_t* src,
   return result;
 }
 
-inline int16x8x2_t LoadAndSubtractZeroPoint(const std::int8_t* src,
-                                            std::int8_t zero_point) {
+inline int16x8x2_t Load16AndSubtractZeroPoint(const std::int8_t* src,
+                                              std::int8_t zero_point) {
   int8x16_t src_s8 = vld1q_s8(src);
   int16x8_t src_s16_0 = vmovl_s8(vget_low_s8(src_s8));
   int16x8_t src_s16_1 = vmovl_s8(vget_high_s8(src_s8));
@@ -210,6 +211,22 @@ inline int16x8x2_t LoadAndSubtractZeroPoint(const std::int8_t* src,
   return result;
 }
 
+inline int16x8_t Load8AndSubtractZeroPoint(const std::uint8_t* src,
+                                           std::uint8_t zero_point) {
+  uint8x8_t src_u8 = vld1_u8(src);
+  int16x8_t src_s16 = vreinterpretq_s16_u16(vmovl_u8(src_u8));
+  int16x8_t zero_point_vec = vdupq_n_s16(zero_point);
+  return vsubq_s16(src_s16, zero_point_vec);
+}
+
+inline int16x8_t Load8AndSubtractZeroPoint(const std::int8_t* src,
+                                           std::int8_t zero_point) {
+  int8x8_t src_s8 = vld1_s8(src);
+  int16x8_t src_s16 = vmovl_s8(src_s8);
+  int16x8_t zero_point_vec = vdupq_n_s16(zero_point);
+  return vsubq_s16(src_s16, zero_point_vec);
+}
+
 inline void ClampAndStore(int32x4_t src, std::uint8_t clamp_min,
                           std::uint8_t clamp_max, std::uint8_t* dst) {
   // Narrow values down to 16 bit signed.
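For reference, the new Load8AndSubtractZeroPoint overloads are 8-wide counterparts of the renamed Load16AndSubtractZeroPoint helpers: they load 8 narrow values, widen them to int16, and subtract the zero point. The scalar sketch below (illustrative only, not part of this diff; the "_Reference" name is made up) spells out what the uint8 NEON variant computes.

#include <cstdint>

// Scalar equivalent of the uint8 Load8AndSubtractZeroPoint: widen each of the
// 8 source values to int16, then subtract the zero point, mirroring vmovl_u8
// followed by vsubq_s16 against a vdupq_n_s16 splat.
inline void Load8AndSubtractZeroPointReference(const std::uint8_t* src,
                                               std::uint8_t zero_point,
                                               std::int16_t dst[8]) {
  for (int i = 0; i < 8; ++i) {
    // Widening before the subtraction keeps the result signed and exact
    // (e.g. 0 - 255 yields -255 rather than wrapping).
    dst[i] = static_cast<std::int16_t>(src[i]) -
             static_cast<std::int16_t>(zero_point);
  }
}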
@@ -288,11 +305,12 @@ struct CustomGemvImpl
       const MatrixParams<RhsScalar>& rhs_params,
       const MatrixParams<DstScalar>& dst_params,
       const GemmParams& params) {
-    // There are no further requirements on the applicability of this kernel,
-    // beyond the left-hand-side matrix having at least kKernelRows rows,
-    // and the type requirements implied in this template partial
-    // specialization.
-    return true;
+    // The kernel processes at least 8 LHS columns at once to fill NEON
+    // registers. The leftovers-handling code at the end works by loading a
+    // partially overlapping final register by walking back by a few (<8) values
+    // to avoid running past the row's end. This relies on there being
+    // at least 8 LHS columns.
+    return lhs_params.cols >= 8;
   }
 
   static void Run(
@@ -311,6 +329,27 @@ struct CustomGemvImpl
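The new comment in the quantized kernel describes the leftovers trick: when the column count is not a multiple of the vector width, the final block is loaded from a start index walked back by fewer than 8 values so the load stays inside the row, which is why at least 8 LHS columns are now required. Below is a minimal scalar sketch of that control flow (illustrative only, not part of this diff; the element-wise operation and function name are made up, and the real kernel's leftover handling is more involved).

#include <algorithm>
#include <cstdint>

// Process n >= 8 elements in blocks of 8. The last block may overlap the
// previous one: its start index is clamped to n - 8 so the 8-wide "load"
// stays in bounds. Re-processing the overlapped elements is harmless here
// because each dst[k] depends only on src[k].
void SubtractZeroPointBlocked(const std::uint8_t* src, std::uint8_t zero_point,
                              std::int16_t* dst, int n) {
  for (int i = 0; i < n; i += 8) {
    const int start = std::min(i, n - 8);  // walk back by fewer than 8 values
    for (int j = 0; j < 8; ++j) {
      dst[start + j] = static_cast<std::int16_t>(src[start + j]) -
                       static_cast<std::int16_t>(zero_point);
    }
    if (start < i) break;  // the overlapping final block was just handled
  }
}

With n == 7 the walked-back start would be negative, which is exactly the case the new lhs_params.cols >= 8 check rules out before this kernel is selected.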