Transpose op optimization

PiperOrigin-RevId: 268194603
Jaesung Chung 2019-09-10 04:12:39 -07:00 committed by TensorFlower Gardener
parent b266a849d8
commit 2a7d2c4295
5 changed files with 287 additions and 12 deletions

View File

@@ -72,7 +72,6 @@ using reference_ops::SpaceToBatchND;
using reference_ops::Split;
using reference_ops::StridedSlice;
using reference_ops::TensorFlowSplit;
using reference_ops::Transpose;
static constexpr int kDepthwiseReverseShift = -1;
@@ -4918,6 +4917,18 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
DimsToShape(output_dims), output_data);
}
template <typename T>
void Transpose(const T* input, const Dims<4>& input_dims, T* output,
const Dims<4>& output_dims, const int* permuted_axes) {
TransposeParams params;
params.perm_count = 4;
for (int i = 0; i < 4; ++i) {
params.perm[i] = 3 - permuted_axes[3 - i];
}
Transpose(params, DimsToShape(input_dims), input, DimsToShape(output_dims),
output);
}
} // namespace optimized_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
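For reference (not part of the diff): the legacy shim above flips both the position and the value of each axis index, because Dims<4> lists dimensions innermost-first while RuntimeShape/TransposeParams list them outermost-first. A minimal sketch of the mapping, using a made-up permutation:

// Hypothetical legacy permutation that swaps the two innermost Dims<4> axes.
const int permuted_axes[4] = {1, 0, 2, 3};
TransposeParams params;
params.perm_count = 4;
for (int i = 0; i < 4; ++i) {
  // Axis k in Dims<4> order corresponds to axis 3 - k in RuntimeShape order.
  params.perm[i] = 3 - permuted_axes[3 - i];
}
// params.perm is now {0, 1, 3, 2}: the same swap, expressed on the two
// trailing RuntimeShape dimensions.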

View File

@@ -98,7 +98,6 @@ using reference_ops::SpaceToBatchND;
using reference_ops::Split;
using reference_ops::StridedSlice;
using reference_ops::Sub16;
using reference_ops::Transpose;
// TODO(b/80247582) Remove this constant.
// This will be phased out as the shifts are revised with more thought. Use of a
@@ -180,6 +179,12 @@ struct TTypes {
typedef Eigen::TensorMap<
Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>>
UnalignedConstMatrix;
typedef Eigen::TensorMap<
Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
ConstTensor;
typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
Eigen::Aligned>
Tensor;
};
// TODO(b/62193649): this function is only needed as long
@@ -6694,6 +6699,171 @@ inline void Logistic16bitPercision(const LogisticParams& params,
}
}
// Transpose2DOn32bitMatrix only deals with typical 2D matrix transpose ops.
inline void Transpose2DOn32bitMatrix(const TransposeParams& params,
const RuntimeShape& input_shape,
const int32_t* input_data,
const RuntimeShape& output_shape,
int32_t* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(params.perm_count, 2);
TFLITE_DCHECK_EQ(params.perm[0], 1);
TFLITE_DCHECK_EQ(params.perm[1], 0);
const int d0 = input_shape.DimsData()[0];
const int d1 = input_shape.DimsData()[1];
#ifdef USE_NEON
const int kLines = 4;
const int kSkipSize = (kLines - 1) * d1;
#endif
const int32_t* input = input_data;
int i = 0;
#ifdef USE_NEON
for (; i <= d0 - kLines; i += kLines) {
int32_t* output = output_data + i;
const int32_t* input_ptr = input;
__builtin_prefetch(input_ptr, 0, 3);
input_ptr += d1;
__builtin_prefetch(input_ptr, 0, 3);
input_ptr += d1;
__builtin_prefetch(input_ptr, 0, 3);
input_ptr += d1;
__builtin_prefetch(input_ptr, 0, 3);
int j = 0;
for (; j <= d1 - kLines; j += kLines) {
input_ptr = input;
int32x4_t a0 = vld1q_s32(input);
input_ptr += d1;
int32x4_t a1 = vld1q_s32(input_ptr);
input_ptr += d1;
int32x4_t a2 = vld1q_s32(input_ptr);
input_ptr += d1;
int32x4_t a3 = vld1q_s32(input_ptr);
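// The vuzpq/vtrnq pairs below perform an in-register transpose of the 4x4
// block held in a0..a3: tmp3.val[0], tmp4.val[0], tmp3.val[1] and tmp4.val[1]
// end up holding columns 0, 1, 2 and 3 of the block, which are then stored as
// rows of the output.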
int32x4x2_t tmp1 = vuzpq_s32(a0, a2);
int32x4x2_t tmp2 = vuzpq_s32(a1, a3);
int32x4x2_t tmp3 = vtrnq_s32(tmp1.val[0], tmp2.val[0]);
int32x4x2_t tmp4 = vtrnq_s32(tmp1.val[1], tmp2.val[1]);
vst1q_s32(output, tmp3.val[0]);
output += d0;
vst1q_s32(output, tmp4.val[0]);
output += d0;
vst1q_s32(output, tmp3.val[1]);
output += d0;
vst1q_s32(output, tmp4.val[1]);
output += d0;
input += kLines;
}
if (j == d1) {
input += kSkipSize;
} else {
for (int p = 0; p < kLines; ++p) {
for (int q = 0; q < d1 - j; ++q) {
*(output + q * d0 + p) = *(input + p * d1 + q);
}
}
input += (d1 - j) + kSkipSize;
}
}
#endif
for (; i < d0; ++i) {
int32_t* output = output_data + i;
for (int j = 0; j < d1; ++j) {
*output = *input;
output += d0;
++input;
}
}
}
template <typename T>
inline void TransposeImpl(const TransposeParams& params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
const int unextended_output_size = unextended_input_shape.DimensionsCount();
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int input_ext_size = 4 - unextended_input_shape.DimensionsCount();
const int output_ext_size = 4 - unextended_output_size;
// The perm data is extended to match the output, each index incremented by
// the amount of front padding of the input shape.
int extended_perm[4];
for (int i = 0; i < output_ext_size; ++i) {
extended_perm[i] = i;
}
for (int i = 0; i < unextended_output_size; ++i) {
extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
}
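// For example, a 3-D transpose with perm = {1, 2, 0} on an input of shape
// {2, 3, 4} becomes perm = {0, 2, 3, 1} on the extended shape {1, 2, 3, 4},
// yielding an extended output shape of {1, 3, 4, 2}.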
Eigen::array<int, 4> p;
for (int i = 0; i < 4; ++i) p[i] = extended_perm[i];
Eigen::DSizes<Eigen::DenseIndex, 4> input_dsizes;
for (int d = 0; d < 4; d++) {
input_dsizes[d] = static_cast<Eigen::DenseIndex>(input_shape.Dims(d));
}
Eigen::DSizes<Eigen::DenseIndex, 4> output_dsizes;
for (int d = 0; d < 4; d++) {
output_dsizes[d] = static_cast<Eigen::DenseIndex>(output_shape.Dims(d));
}
auto x = typename TTypes<T, 4>::ConstTensor(input_data, input_dsizes);
auto y = typename TTypes<T, 4>::Tensor(output_data, output_dsizes);
y = x.shuffle(p);
}
template <typename T>
void Transpose(const TransposeParams& params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
const int unextended_output_size = unextended_output_shape.DimensionsCount();
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_size, 4);
TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
// The transpose kernel only rearranges values; it performs no numeric
// computation on individual elements. It is therefore safe to dispatch on the
// size of the scalar type, which keeps the total code size in a reasonable
// range.
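// For example, float and int32_t are both 4 bytes wide, so a float transpose
// can reuse the int32_t path below; the bit patterns are copied unchanged.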
switch (sizeof(T)) {
case 1:
// TODO(jaesung): Find a good 2d transpose implementation for 8-bit
// matrices.
TransposeImpl(params, unextended_input_shape,
reinterpret_cast<const int8_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int8_t*>(output_data));
break;
case 4:
if (unextended_input_shape.DimensionsCount() == 2 &&
params.perm[0] == 1 && params.perm[1] == 0) {
Transpose2DOn32bitMatrix(params, unextended_input_shape,
reinterpret_cast<const int32_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int32_t*>(output_data));
return;
}
TransposeImpl(params, unextended_input_shape,
reinterpret_cast<const int32_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int32_t*>(output_data));
break;
default:
// Reroute to the reference version if the given size is not common.
reference_ops::Transpose(params, unextended_input_shape, input_data,
unextended_output_shape, output_data);
}
}
} // namespace optimized_ops
} // namespace tflite
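A usage sketch (illustrative only, not part of the commit): the new optimized entry point can be called directly with a TransposeParams describing the permutation. The shapes, buffers, and values below are made up for the example, and it assumes optimized_ops.h is included.

// Transpose a 2x3 float matrix; float is 4 bytes wide, so this dispatches to
// the 32-bit path (and to Transpose2DOn32bitMatrix for the 2-D {1, 0} perm).
tflite::RuntimeShape input_shape({2, 3});
tflite::RuntimeShape output_shape({3, 2});
const float input[6] = {0, 1, 2, 3, 4, 5};
float output[6];
tflite::TransposeParams params;
params.perm_count = 2;
params.perm[0] = 1;
params.perm[1] = 0;
tflite::optimized_ops::Transpose(params, input_shape, input, output_shape, output);
// output now holds {0, 3, 1, 4, 2, 5}.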

View File

@@ -3046,9 +3046,11 @@ inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
}
template <typename T>
inline void TransposeImpl(const TransposeParams& params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
const int unextended_output_size = unextended_output_shape.DimensionsCount();
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_size, 4);
@@ -3096,6 +3098,42 @@ void Transpose(const TransposeParams& params,
}
}
template <typename T>
void Transpose(const TransposeParams& params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
// The transpose kernel only rearranges values; it performs no numeric
// computation on individual elements. It is therefore safe to dispatch on the
// size of the scalar type, which keeps the total code size in a reasonable
// range.
switch (sizeof(T)) {
case 1:
TransposeImpl<int8_t>(params, unextended_input_shape,
reinterpret_cast<const int8_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int8_t*>(output_data));
break;
case 2:
TransposeImpl<int16_t>(params, unextended_input_shape,
reinterpret_cast<const int16_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int16_t*>(output_data));
break;
case 4:
TransposeImpl<int32_t>(params, unextended_input_shape,
reinterpret_cast<const int32_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int32_t*>(output_data));
break;
case 8:
TransposeImpl<int64_t>(params, unextended_input_shape,
reinterpret_cast<const int64_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int64_t*>(output_data));
break;
}
}
inline void TransposeConv(
const ConvParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,

View File

@@ -13,9 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string.h>
#include <vector>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/kernel_util.h"
@ -29,6 +32,7 @@ namespace transpose {
// This file has two implementations of Transpose. // This file has two implementations of Transpose.
enum KernelType { enum KernelType {
kReference, kReference,
kGenericOptimized,
};
struct TransposeContext {
@@ -96,8 +100,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const int size = op_context.perm->dims->data[0];
TransposeParams params;
params.perm_count = size;
bool identical = true;
for (int i = 0; i < size; ++i) {
params.perm[i] = perm_data[i];
if (perm_data[i] != i) identical = false;
}
// TODO(b/140779653): Add an optimization pass in the conversion process to
// remove transpose op nodes where they do nothing like the below one.
if (identical) {
memcpy(op_context.output->data.raw, op_context.input->data.raw,
op_context.output->bytes);
return kTfLiteOk;
}
#define TF_LITE_TRANSPOSE(type, scalar) \
@@ -108,32 +122,44 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
switch (op_context.input->type) {
case kTfLiteFloat32:
if (kernel_type == kGenericOptimized) {
TF_LITE_TRANSPOSE(optimized_ops, float);
} else {
TF_LITE_TRANSPOSE(reference_ops, float);
}
break;
case kTfLiteUInt8:
if (kernel_type == kGenericOptimized) {
TF_LITE_TRANSPOSE(optimized_ops, uint8_t);
} else {
TF_LITE_TRANSPOSE(reference_ops, uint8_t);
}
break;
case kTfLiteInt8:
if (kernel_type == kGenericOptimized) {
TF_LITE_TRANSPOSE(optimized_ops, int8_t);
} else {
TF_LITE_TRANSPOSE(reference_ops, int8_t);
}
break;
case kTfLiteInt32:
if (kernel_type == kGenericOptimized) {
TF_LITE_TRANSPOSE(optimized_ops, int32_t);
} else {
TF_LITE_TRANSPOSE(reference_ops, int32_t);
}
break;
case kTfLiteInt64:
if (kernel_type == kGenericOptimized) {
TF_LITE_TRANSPOSE(optimized_ops, int64_t);
} else {
TF_LITE_TRANSPOSE(reference_ops, int64_t);
}
break;
case kTfLiteBool:
if (kernel_type == kGenericOptimized) {
TF_LITE_TRANSPOSE(optimized_ops, bool);
} else {
TF_LITE_TRANSPOSE(reference_ops, bool);
}
break;
@@ -156,7 +182,15 @@ TfLiteRegistration* Register_TRANSPOSE_REF() {
return &r;
}
TfLiteRegistration* Register_TRANSPOSE_GENERIC_OPTIMIZED() {
static TfLiteRegistration r = {nullptr, nullptr, transpose::Prepare,
transpose::Eval<transpose::kGenericOptimized>};
return &r;
}
TfLiteRegistration* Register_TRANSPOSE() {
return Register_TRANSPOSE_GENERIC_OPTIMIZED();
}
} // namespace builtin
} // namespace ops

View File

@@ -233,6 +233,28 @@ TEST(TransposeTest, Test2DInputConstTensor) {
EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
}
TEST(TransposeTest, Test2D4x4KernelTestLeftOverRightSide) {
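// The 4x6 input makes the NEON 4x4 main loop cover only columns 0-3; the
// remaining two columns exercise the scalar leftover path on the right side
// of Transpose2DOn32bitMatrix.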
TransposeOpConstModel m({4, 6}, {2}, {1, 0});
m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
m.Invoke();
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 4}));
EXPECT_THAT(m.GetOutput(),
ElementsAreArray({0, 6, 12, 18, 1, 7, 13, 19, 2, 8, 14, 20,
3, 9, 15, 21, 4, 10, 16, 22, 5, 11, 17, 23}));
}
TEST(TransposeTest, Test2D4x4KernelTest2LeftOverBottomSide) {
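// The 6x4 input leaves rows 4 and 5 over after one 4-row NEON block; they are
// handled by the scalar tail loop at the bottom of Transpose2DOn32bitMatrix.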
TransposeOpConstModel m({6, 4}, {2}, {1, 0});
m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
m.Invoke();
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 6}));
EXPECT_THAT(m.GetOutput(),
ElementsAreArray({0, 4, 8, 12, 16, 20, 1, 5, 9, 13, 17, 21,
2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
}
TEST(TransposeTest, Test2DInputDynamicTensor) {
TransposeOpDynamicModel m({3, 2}, {2});
m.SetInput({0, 1, 2, 3, 4, 5});