Replace explicit NEON type names (float32x4_t) with auto

This commit is contained in:
Michal W. Tarnowski 2019-04-13 22:31:47 +02:00
parent 0840008136
commit 43a06104a6

View File

@@ -3226,23 +3226,23 @@ inline void Div(const ArithmeticParams& params,
const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
#ifdef USE_NEON #ifdef USE_NEON
static constexpr int kNewtonSteps = 2; static constexpr int kNewtonSteps = 2;
static const float32x4_t TWO_F32 = vdupq_n_f32(2.f); static const auto TWO_F32 = vdupq_n_f32(2.f);
const float32x4_t activation_min = vdupq_n_f32(output_activation_min); const auto activation_min = vdupq_n_f32(output_activation_min);
const float32x4_t activation_max = vdupq_n_f32(output_activation_max); const auto activation_max = vdupq_n_f32(output_activation_max);
for (; i <= size - 16; i += 16) { for (; i <= size - 16; i += 16) {
const float32x4_t a10 = vld1q_f32(input1_data + i); const auto a10 = vld1q_f32(input1_data + i);
const float32x4_t a11 = vld1q_f32(input1_data + i + 4); const auto a11 = vld1q_f32(input1_data + i + 4);
const float32x4_t a12 = vld1q_f32(input1_data + i + 8); const auto a12 = vld1q_f32(input1_data + i + 8);
const float32x4_t a13 = vld1q_f32(input1_data + i + 12); const auto a13 = vld1q_f32(input1_data + i + 12);
const float32x4_t a20 = vld1q_f32(input2_data + i); const auto a20 = vld1q_f32(input2_data + i);
const float32x4_t a21 = vld1q_f32(input2_data + i + 4); const auto a21 = vld1q_f32(input2_data + i + 4);
const float32x4_t a22 = vld1q_f32(input2_data + i + 8); const auto a22 = vld1q_f32(input2_data + i + 8);
const float32x4_t a23 = vld1q_f32(input2_data + i + 12); const auto a23 = vld1q_f32(input2_data + i + 12);
float32x4_t r0 = vrecpeq_f32(a20); auto r0 = vrecpeq_f32(a20);
float32x4_t r1 = vrecpeq_f32(a21); auto r1 = vrecpeq_f32(a21);
float32x4_t r2 = vrecpeq_f32(a22); auto r2 = vrecpeq_f32(a22);
float32x4_t r3 = vrecpeq_f32(a23); auto r3 = vrecpeq_f32(a23);
for (int k = 0; k < kNewtonSteps; ++k) { for (int k = 0; k < kNewtonSteps; ++k) {
r0 = vmulq_f32(r0, vsubq_f32(TWO_F32, vmulq_f32(r0, a20))); r0 = vmulq_f32(r0, vsubq_f32(TWO_F32, vmulq_f32(r0, a20)));
r1 = vmulq_f32(r1, vsubq_f32(TWO_F32, vmulq_f32(r1, a21))); r1 = vmulq_f32(r1, vsubq_f32(TWO_F32, vmulq_f32(r1, a21)));
@@ -3250,10 +3250,10 @@ inline void Div(const ArithmeticParams& params,
r3 = vmulq_f32(r3, vsubq_f32(TWO_F32, vmulq_f32(r3, a23))); r3 = vmulq_f32(r3, vsubq_f32(TWO_F32, vmulq_f32(r3, a23)));
} }
float32x4_t x0 = vmulq_f32(a10, r0); auto x0 = vmulq_f32(a10, r0);
float32x4_t x1 = vmulq_f32(a11, r1); auto x1 = vmulq_f32(a11, r1);
float32x4_t x2 = vmulq_f32(a12, r2); auto x2 = vmulq_f32(a12, r2);
float32x4_t x3 = vmulq_f32(a13, r3); auto x3 = vmulq_f32(a13, r3);
x0 = vmaxq_f32(activation_min, x0); x0 = vmaxq_f32(activation_min, x0);
x1 = vmaxq_f32(activation_min, x1); x1 = vmaxq_f32(activation_min, x1);
x2 = vmaxq_f32(activation_min, x2); x2 = vmaxq_f32(activation_min, x2);
@@ -3269,15 +3269,15 @@ inline void Div(const ArithmeticParams& params,
vst1q_f32(output_data + i + 12, x3); vst1q_f32(output_data + i + 12, x3);
} }
for (; i <= size - 4; i += 4) { for (; i <= size - 4; i += 4) {
const float32x4_t a1 = vld1q_f32(input1_data + i); const auto a1 = vld1q_f32(input1_data + i);
const float32x4_t a2 = vld1q_f32(input2_data + i); const auto a2 = vld1q_f32(input2_data + i);
float32x4_t r = vrecpeq_f32(a2); auto r = vrecpeq_f32(a2);
for (int k = 0; k < kNewtonSteps; ++k) { for (int k = 0; k < kNewtonSteps; ++k) {
r = vmulq_f32(r, vsubq_f32(TWO_F32, vmulq_f32(r, a2))); r = vmulq_f32(r, vsubq_f32(TWO_F32, vmulq_f32(r, a2)));
} }
float32x4_t x = vmulq_f32(a1, r); auto x = vmulq_f32(a1, r);
x = vmaxq_f32(activation_min, x); x = vmaxq_f32(activation_min, x);
x = vminq_f32(activation_max, x); x = vminq_f32(activation_max, x);