PR #27825: TFLite: Div op Neon optimization
Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/27825 Added float32 division optimized with Neon SIMD instructions. Copybara import of the project: --0840008136
by Michal W. Tarnowski <michal.tarnowski@tcl.com>: Non-broadcast Div optimized --43a06104a6
by Michal W. Tarnowski <michal.tarnowski@tcl.com>: Explicit NEON typenames removed -- 4d9297306254d... *** ROLLBACK_OF=283557872 BEGIN_PUBLIC PiperOrigin-RevId: 283616376 Change-Id: I66eeafd640d1d7342877453c52459dec731141ef
This commit is contained in:
parent
af79ee35f5
commit
d0acd1e267
@ -2718,89 +2718,6 @@ inline void BroadcastMulDispatch(
|
|||||||
input2_data, output_shape, output_data);
|
input2_data, output_shape, output_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Div(const ArithmeticParams& params,
|
|
||||||
const RuntimeShape& input1_shape, const float* input1_data,
|
|
||||||
const RuntimeShape& input2_shape, const float* input2_data,
|
|
||||||
const RuntimeShape& output_shape, float* output_data) {
|
|
||||||
gemmlowp::ScopedProfilingLabel label("Div");
|
|
||||||
const float output_activation_min = params.float_activation_min;
|
|
||||||
const float output_activation_max = params.float_activation_max;
|
|
||||||
|
|
||||||
int i = 0;
|
|
||||||
const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
|
|
||||||
#ifdef USE_NEON
|
|
||||||
// NEON does not offer division instruction, multiplication by the reciprocal
|
|
||||||
// is used instead. This parameter controls the number of Newton-Raphson
|
|
||||||
// iterations used to refine the initial estimate of the reciprocal given by
|
|
||||||
// vrecpeq_f32 instruction. Typically, two iterations are enough to match
|
|
||||||
// the float division accuracy closely.
|
|
||||||
static constexpr int kNewtonSteps = 2;
|
|
||||||
static const auto TWO_F32 = vdupq_n_f32(2.f);
|
|
||||||
const auto activation_min = vdupq_n_f32(output_activation_min);
|
|
||||||
const auto activation_max = vdupq_n_f32(output_activation_max);
|
|
||||||
for (; i <= size - 16; i += 16) {
|
|
||||||
const auto a10 = vld1q_f32(input1_data + i);
|
|
||||||
const auto a11 = vld1q_f32(input1_data + i + 4);
|
|
||||||
const auto a12 = vld1q_f32(input1_data + i + 8);
|
|
||||||
const auto a13 = vld1q_f32(input1_data + i + 12);
|
|
||||||
const auto a20 = vld1q_f32(input2_data + i);
|
|
||||||
const auto a21 = vld1q_f32(input2_data + i + 4);
|
|
||||||
const auto a22 = vld1q_f32(input2_data + i + 8);
|
|
||||||
const auto a23 = vld1q_f32(input2_data + i + 12);
|
|
||||||
|
|
||||||
auto r0 = vrecpeq_f32(a20);
|
|
||||||
auto r1 = vrecpeq_f32(a21);
|
|
||||||
auto r2 = vrecpeq_f32(a22);
|
|
||||||
auto r3 = vrecpeq_f32(a23);
|
|
||||||
for (int k = 0; k < kNewtonSteps; ++k) {
|
|
||||||
r0 = vmulq_f32(r0, vsubq_f32(TWO_F32, vmulq_f32(r0, a20)));
|
|
||||||
r1 = vmulq_f32(r1, vsubq_f32(TWO_F32, vmulq_f32(r1, a21)));
|
|
||||||
r2 = vmulq_f32(r2, vsubq_f32(TWO_F32, vmulq_f32(r2, a22)));
|
|
||||||
r3 = vmulq_f32(r3, vsubq_f32(TWO_F32, vmulq_f32(r3, a23)));
|
|
||||||
}
|
|
||||||
|
|
||||||
auto x0 = vmulq_f32(a10, r0);
|
|
||||||
auto x1 = vmulq_f32(a11, r1);
|
|
||||||
auto x2 = vmulq_f32(a12, r2);
|
|
||||||
auto x3 = vmulq_f32(a13, r3);
|
|
||||||
x0 = vmaxq_f32(activation_min, x0);
|
|
||||||
x1 = vmaxq_f32(activation_min, x1);
|
|
||||||
x2 = vmaxq_f32(activation_min, x2);
|
|
||||||
x3 = vmaxq_f32(activation_min, x3);
|
|
||||||
x0 = vminq_f32(activation_max, x0);
|
|
||||||
x1 = vminq_f32(activation_max, x1);
|
|
||||||
x2 = vminq_f32(activation_max, x2);
|
|
||||||
x3 = vminq_f32(activation_max, x3);
|
|
||||||
|
|
||||||
vst1q_f32(output_data + i, x0);
|
|
||||||
vst1q_f32(output_data + i + 4, x1);
|
|
||||||
vst1q_f32(output_data + i + 8, x2);
|
|
||||||
vst1q_f32(output_data + i + 12, x3);
|
|
||||||
}
|
|
||||||
for (; i <= size - 4; i += 4) {
|
|
||||||
const auto a1 = vld1q_f32(input1_data + i);
|
|
||||||
const auto a2 = vld1q_f32(input2_data + i);
|
|
||||||
|
|
||||||
auto r = vrecpeq_f32(a2);
|
|
||||||
for (int k = 0; k < kNewtonSteps; ++k) {
|
|
||||||
r = vmulq_f32(r, vsubq_f32(TWO_F32, vmulq_f32(r, a2)));
|
|
||||||
}
|
|
||||||
|
|
||||||
auto x = vmulq_f32(a1, r);
|
|
||||||
x = vmaxq_f32(activation_min, x);
|
|
||||||
x = vminq_f32(activation_max, x);
|
|
||||||
|
|
||||||
vst1q_f32(output_data + i, x);
|
|
||||||
}
|
|
||||||
#endif // NEON
|
|
||||||
|
|
||||||
for (; i < size; ++i) {
|
|
||||||
output_data[i] = ActivationFunctionWithMinMax(
|
|
||||||
input1_data[i] / input2_data[i], output_activation_min,
|
|
||||||
output_activation_max);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
|
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
|
||||||
// dimensionality if the runtime code does a single loop over one dimension
|
// dimensionality if the runtime code does a single loop over one dimension
|
||||||
// that handles broadcasting as the base case. The code generator would then
|
// that handles broadcasting as the base case. The code generator would then
|
||||||
|
Loading…
Reference in New Issue
Block a user