Add x86 vector code using SSE 4.1 for MatrixBatchVectorMultiplyAccumulate and SparseMatrixBatchVectorMultiplyAccumulate, int8 versions only.
PiperOrigin-RevId: 255269233
This commit is contained in:
parent 7d02afd419
commit 869de82d13
tensorflow/lite/kernels/internal/BUILD

@@ -505,6 +505,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "sse_tensor_utils",
+    srcs = [
+        "compatibility.h",
+        "optimized/sse_tensor_utils.cc",
+    ],
+    hdrs = [
+        "optimized/sse_tensor_utils.h",
+        "optimized/sse_tensor_utils_impl.h",
+    ],
+    deps = [
+        ":cpu_check",
+        ":neon_tensor_utils",
+        ":portable_tensor_utils",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:op_macros",
+    ],
+)
+
 cc_library(
     name = "kernel_utils",
     srcs = ["kernel_utils.cc"],
@@ -572,7 +591,7 @@ cc_library(
             ":neon_tensor_utils",
         ],
         ":haswell": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
         ],
         ":ios_armv7": [
             ":neon_tensor_utils",
@@ -581,25 +600,25 @@ cc_library(
             ":neon_tensor_utils",
         ],
         ":ios_x86_64": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
        ],
         ":x86_64": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
         ],
         ":x86": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
         ],
         ":k8": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
         ],
         ":darwin": [
             ":neon_tensor_utils",
         ],
         ":darwin_x86_64": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
         ],
         ":freebsd": [
-            ":neon_tensor_utils",
+            ":sse_tensor_utils",
         ],
         "//conditions:default": [
             ":portable_tensor_utils",
@@ -808,6 +827,7 @@ cc_library(
     hdrs = [
         "optimized/cpu_check.h",
         "optimized/neon_check.h",
+        "optimized/sse_check.h",
     ],
     deps = [
         "//tensorflow/lite/kernels:cpu_backend_context",
tensorflow/lite/kernels/internal/optimized/sse_check.h (new file, 34 lines)

@@ -0,0 +1,34 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_CHECK_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_CHECK_H_

#if defined(__SSE4_1__)
// SSE 4.1 available: Use the SSE code.
#define SSE_OR_PORTABLE(funcname, ...) Sse##funcname(__VA_ARGS__)

#else

#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"

// No SSE 4.1 available: Fall back to NEON_OR_PORTABLE, potentially used with
// NEON_2_SSE translator library. As the library requires SSSE3, the fallback is
// generally using Portable code, only a narrow subset of processors supporting
// SSSE3 but no SSE4.1 is affected - but that includes the android_x86 ABI (not
// android_x86_64).
#define SSE_OR_PORTABLE(...) NEON_OR_PORTABLE(__VA_ARGS__)
#endif

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_CHECK_H_
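For illustration only (not part of this commit): a minimal, self-contained sketch of how the SSE_OR_PORTABLE dispatch above is intended to be used by the wrapper headers. The ScaleVector functions are hypothetical placeholders, and the non-SSE branch forwards to a portable stand-in rather than NEON_OR_PORTABLE so the sketch compiles on its own; build with -msse4.1 (or equivalent) to exercise the Sse path.

#include <cstdio>

#if defined(__SSE4_1__)
// Same token-pasting dispatch as sse_check.h: SSE_OR_PORTABLE(Foo, args...)
// expands to SseFoo(args...).
#define SSE_OR_PORTABLE(funcname, ...) Sse##funcname(__VA_ARGS__)
#else
// Stand-in for the NEON_OR_PORTABLE fallback, kept portable for this sketch.
#define SSE_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
#endif

// Hypothetical kernels; in TF Lite these would be the Sse/Portable versions of
// e.g. MatrixBatchVectorMultiplyAccumulate.
void SseScaleVector(const float* in, int n, float s, float* out) {
  for (int i = 0; i < n; ++i) out[i] = in[i] * s;
}
void PortableScaleVector(const float* in, int n, float s, float* out) {
  for (int i = 0; i < n; ++i) out[i] = in[i] * s;
}

int main() {
  const float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float out[4];
  // Resolved at preprocessing time to SseScaleVector or PortableScaleVector.
  SSE_OR_PORTABLE(ScaleVector, in, 4, 0.5f, out);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}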
tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc (new file, 155 lines)

@@ -0,0 +1,155 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h"

#ifdef __SSE4_1__

#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  // SSE4.1
#include <tmmintrin.h>  // SSSE3

#include "tensorflow/lite/kernels/internal/compatibility.h"

namespace tflite {
namespace tensor_utils {
namespace {

// Elementwise multiply two i8x8 vectors to i16x8, add elements pairwise and
// accumulate result to a i32x4 accumulator.
//
// Shared by the inner loop of MatrixBatchVectorMultiplyAccumulate(int8) and
// SparseMatrixBatchVectorMultiplyAccumulate(int8).
//
// x86 SSE has no i8*i8 instruction (only a u8*i8), so we need to do sign
// extension to 16 bit and do i16*i16 multiplications. There is an instruction
// to sign-extend i8x8 => i16x8 from the lower half of the register (used here),
// but there is no direct way to sign-extend the high half, only multiple
// instructions (see _mm_cmpgt_epi8 and _mm_unpackhi_epi8). Bottom line is, it
// is actually cheaper to only process 8 elements = 64b at a time.
static inline __m128i MatrixBatchVectorMultiplyAccumulateLoopBodySse(
    __m128i dotprod, __m128i a_8x8, __m128i b_8x8) {
  // Sign extend i8 => i16
  __m128i a_16x8 = _mm_cvtepi8_epi16(a_8x8);  // SSE4.1
  __m128i b_16x8 = _mm_cvtepi8_epi16(b_8x8);  // SSE4.1
  // sumprod[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1] (i = 0..3)
  __m128i sumprod_32x4 = _mm_madd_epi16(a_16x8, b_16x8);  // SSE2
  // i32x4 + i32x4
  return _mm_add_epi32(dotprod, sumprod_32x4);  // SSE2
}

// Horizontally add 4 int32 values stored in a single XMM register to int32_t.
static inline int32_t ReduceInt32x4(__m128i acc) {
  acc = _mm_hadd_epi32(acc, acc);  // SSSE3
  // This second hadd could be only 64 bit, but 64 and 128 bit hadd has same
  // latency on most CPUs, and it costs more to move. (Moving can be no-op, but
  // nevertheless is an extra instruction occupying the decoder and I cache.)
  acc = _mm_hadd_epi32(acc, acc);  // SSSE3
  // SSE4.1 intrinsic, but actually translated to SSE2 instruction (due to
  // moving from 0th element).
  return _mm_extract_epi32(acc, 0);
}

}  // namespace

void SseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride) {
  static constexpr int kBlockSize = 8;
  for (int batch = 0; batch < n_batch; ++batch) {
    const float batch_scaling_factor = scaling_factors[batch];
    // Compute dot-product for every column.
    for (int row = 0; row < m_rows; ++row, result += result_stride) {
      // Get the address of the first element of the row.
      const int8_t* row_ptr = matrix + row * m_cols;

      // Initialize the dot product sum for the row to 0.
      __m128i dotprod_32x4 = _mm_setzero_si128();  // SSE2
      // For every block of kBlockSize 8-bit elements.
      int col = 0;
      for (; col < (m_cols & ~(kBlockSize - 1)); col += kBlockSize) {
        // See comment at MatrixBatchVectorMultiplyAccumulateLoopBodySse why to
        // load only 64 bits. _mm_loadl_epi64 requires SSE2.
        const __m128i vec_8x8 =
            _mm_loadl_epi64(reinterpret_cast<const __m128i*>(vectors + col));
        const __m128i row_8x8 =
            _mm_loadl_epi64(reinterpret_cast<const __m128i*>(row_ptr + col));
        dotprod_32x4 = MatrixBatchVectorMultiplyAccumulateLoopBodySse(
            dotprod_32x4, vec_8x8, row_8x8);
      }  // for col
      // Horizontally add the 4 intermediate sum values to get the final
      // dot-prod value for this row.
      int32_t sum = ReduceInt32x4(dotprod_32x4);

      // Postamble loop.
      for (; col < m_cols; ++col) {
        sum += row_ptr[col] * vectors[col];
      }  // for col

      *result += sum * batch_scaling_factor;
    }  // for row

    vectors += m_cols;
  }  // for batch
}

void SseSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride) {
  static const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);

  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const uint8_t* ledger_ptr = ledger;
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row, result += result_stride) {
      // Initialize the dot product sum for the row to 0.
      __m128i dotprod_32x4 = _mm_setzero_si128();
      int num_nonzero_blocks = *ledger_ptr++;
      for (int i = 0; i < num_nonzero_blocks; i++) {
        const int col_index = *ledger_ptr++ * kBlockSize;
        // With sparse models, we assume the block size is 16, we can't change
        // it to 8 here to better fit SSE (see dense version). Instead, do the
        // int8x8_t computation twice.
        __m128i vec_8x8 = _mm_loadl_epi64(
            reinterpret_cast<const __m128i*>(vectors + col_index));
        __m128i row_8x8 =
            _mm_loadl_epi64(reinterpret_cast<const __m128i*>(row_ptr));
        dotprod_32x4 = MatrixBatchVectorMultiplyAccumulateLoopBodySse(
            dotprod_32x4, vec_8x8, row_8x8);
        vec_8x8 = _mm_loadl_epi64(
            reinterpret_cast<const __m128i*>(vectors + col_index + 8));
        row_8x8 =
            _mm_loadl_epi64(reinterpret_cast<const __m128i*>(row_ptr + 8));
        dotprod_32x4 = MatrixBatchVectorMultiplyAccumulateLoopBodySse(
            dotprod_32x4, vec_8x8, row_8x8);
        row_ptr += kBlockSize;
      }
      // Horizontally add the 4 intermediate sum values to get the final
      // dot-prod value for this row.
      int32_t dotprod = ReduceInt32x4(dotprod_32x4);

      *result += dotprod * batch_scaling_factor;
    }  // for row
  }  // for batch
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // __SSE4_1__
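As a standalone illustration (not part of the commit) of the arithmetic in the loop body and reduction above, the sketch below computes a single 8-element int8 dot product with the same intrinsic sequence (sign-extend with _mm_cvtepi8_epi16, pairwise multiply-add with _mm_madd_epi16, then two _mm_hadd_epi32 steps and _mm_extract_epi32) and checks it against a plain scalar loop. It assumes SSE 4.1 is enabled at compile time, e.g. with -msse4.1.

#include <smmintrin.h>  // SSE4.1 (also provides the SSE2/SSSE3 intrinsics used)

#include <cstdint>
#include <cstdio>

int main() {
  const int8_t a[8] = {1, -2, 3, -4, 5, -6, 7, -8};
  const int8_t b[8] = {8, 7, -6, 5, -4, 3, 2, -1};

  // Load 8 x int8 (64 bits) into the low half of an XMM register.
  const __m128i a_8x8 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(a));
  const __m128i b_8x8 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b));
  // Sign-extend the low 8 lanes from int8 to int16.
  const __m128i a_16x8 = _mm_cvtepi8_epi16(a_8x8);
  const __m128i b_16x8 = _mm_cvtepi8_epi16(b_8x8);
  // sumprod[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], giving four int32 lanes.
  __m128i acc = _mm_madd_epi16(a_16x8, b_16x8);
  // Horizontal reduction of the four int32 lanes, as in ReduceInt32x4.
  acc = _mm_hadd_epi32(acc, acc);
  acc = _mm_hadd_epi32(acc, acc);
  const int32_t simd_sum = _mm_extract_epi32(acc, 0);

  // Scalar reference for comparison.
  int32_t scalar_sum = 0;
  for (int i = 0; i < 8; ++i) scalar_sum += a[i] * b[i];

  std::printf("simd=%d scalar=%d\n", simd_sum, scalar_sum);  // prints identical values
  return 0;
}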
tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h (new file, 186 lines)

@@ -0,0 +1,186 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_

// Note: This file is a copy-paste version of neon_tensor_utils.h, the only
// difference is in MatrixBatchVectorMultiplyAccumulate and
// SparseMatrixBatchVectorMultiplyAccumulate (other functions do not have SSE
// implementation yet).

// Note: Most of the functions below use NEON_OR_PORTABLE, through the Intel
// NEON_2_SSE translator library. If a native SSE version of a function is
// implemented, replace the appropriate one with SSE_OR_PORTABLE.

// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/sse_check.h"
#include "tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {

void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result,
                                         int result_stride) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vector, n_batch, result, result_stride);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, result, result_stride);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vector, n_batch, result, result_stride);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride) {
  SSE_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                  m_rows, m_cols, vectors, scaling_factors, n_batch, result,
                  result_stride);
}

void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                              int v_size, float* result) {
  NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
}

void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2, int v_size,
                                        float* result) {
  NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
                   result);
}

void VectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                   const float* batch_vector, int n_batch,
                                   float* result) {
  NEON_OR_PORTABLE(VectorBatchVectorCwiseProduct, vector, v_size, batch_vector,
                   n_batch, result);
}

void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
                                             const float* batch_vector,
                                             int n_batch, float* result) {
  NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
                   batch_vector, n_batch, result);
}

float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size) {
  return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
}

void BatchVectorBatchVectorDotProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      int n_batch, float* result,
                                      int result_stride) {
  NEON_OR_PORTABLE(BatchVectorBatchVectorDotProduct, vector1, vector2, v_size,
                   n_batch, result, result_stride);
}

void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                          float* batch_vector) {
  PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector);
}

void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
                             float* batch_vector) {
  PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
}

void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
  PortableApplySigmoidToVector(vector, v_size, result);
}

void ApplyActivationToVector(const float* vector, int v_size,
                             TfLiteFusedActivation activation, float* result) {
  PortableApplyActivationToVector(vector, v_size, activation, result);
}

void CopyVector(const float* vector, int v_size, float* result) {
  PortableCopyVector(vector, v_size, result);
}

void Sub1Vector(const float* vector, int v_size, float* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

void ZeroVector(float* vector, int v_size) {
  PortableZeroVector(vector, v_size);
}

float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }

// Check if all entries of a vector are zero.
bool IsZeroVector(const float* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result) {
  NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result);
}
void ClipVector(const float* vector, int v_size, float abs_limit,
                float* result) {
  NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min_value,
                             float* max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void VectorShiftLeft(float* vector, int v_size, float shift_value) {
  NEON_OR_PORTABLE(VectorShiftLeft, vector, v_size, shift_value);
}

void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

void MeanStddevNormalization(const float* input_vector, float* output_vector,
                             int v_size, int n_batch,
                             float normalization_epsilon) {
  PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch,
                                  normalization_epsilon);
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_
tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h (new file, 48 lines)

@@ -0,0 +1,48 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_

#include <cstdint>

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

#ifdef __SSE4_1__

// Matrix multiplication for quantized values using symmetric quantization.
void SseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void SseSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

#endif  // __SSE4_1__

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_
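For reference (not part of the commit), here is a hypothetical scalar routine with the same signature as SseMatrixBatchVectorMultiplyAccumulate, spelling out the contract implied by the implementation above: matrix is a row-major m_rows x m_cols int8 matrix, vectors holds n_batch contiguous int8 vectors of length m_cols, each row/batch dot product is scaled by that batch's scaling factor, and the scaled value is accumulated into result, which advances by result_stride per row.

#include <cstdint>
#include <cstdio>

// Hypothetical scalar reference used only to illustrate the data layout and
// accumulation semantics; it is not the TF Lite implementation.
void ScalarMatrixBatchVectorMultiplyAccumulate(
    const int8_t* matrix, int m_rows, int m_cols, const int8_t* vectors,
    const float* scaling_factors, int n_batch, float* result,
    int result_stride) {
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    for (int row = 0; row < m_rows; ++row, result += result_stride) {
      int32_t dotprod = 0;
      for (int col = 0; col < m_cols; ++col) {
        dotprod += matrix[row * m_cols + col] * vectors[col];
      }
      *result += dotprod * scaling_factors[batch];  // accumulate, not overwrite
    }
  }
}

int main() {
  // 2x4 row-major int8 matrix, one batch with a single length-4 int8 vector.
  const int8_t matrix[8] = {1, 2, 3, 4, -1, -2, -3, -4};
  const int8_t vector[4] = {1, 1, 1, 1};
  const float scaling_factors[1] = {0.5f};
  float result[2] = {0.0f, 0.0f};
  ScalarMatrixBatchVectorMultiplyAccumulate(matrix, 2, 4, vector,
                                            scaling_factors, 1, result,
                                            /*result_stride=*/1);
  std::printf("%g %g\n", result[0], result[1]);  // 5 -5
  return 0;
}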
tensorflow/lite/kernels/internal/tensor_utils.cc

@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 
-#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
 
-#ifdef USE_NEON
+#if defined(__SSE4_1__)
+#include "tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h"
+#elif defined(USE_NEON)
 #include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h"
 #else
 #include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h"
-#endif  // USE_NEON
+#endif  // __SSE4_1__ or USE_NEON