diff --git a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
index b9305169065..16c04d5ae98 100644
--- a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -72,7 +72,6 @@ using reference_ops::SpaceToBatchND;
 using reference_ops::Split;
 using reference_ops::StridedSlice;
 using reference_ops::TensorFlowSplit;
-using reference_ops::Transpose;
 
 static constexpr int kDepthwiseReverseShift = -1;
 
@@ -4918,6 +4917,18 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        DimsToShape(output_dims), output_data);
 }
 
+template <typename T>
+void Transpose(const T* input, const Dims<4>& input_dims, T* output,
+               const Dims<4>& output_dims, const int* permuted_axes) {
+  TransposeParams params;
+  params.perm_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    params.perm[i] = 3 - permuted_axes[3 - i];
+  }
+  Transpose(params, DimsToShape(input_dims), input, DimsToShape(output_dims),
+            output);
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
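The index gymnastics in this legacy shim are easy to misread: legacy `Dims<4>` lists dimensions innermost-first, while `RuntimeShape`/`TransposeParams` list them outermost-first, so both the positions and the values of the permutation have to be mirrored. A minimal standalone sketch of the same conversion (plain C++; the names are mine, not part of the patch):

```cpp
#include <array>
#include <cstdio>

// Legacy Dims<4> stores dimensions innermost-first; RuntimeShape stores them
// outermost-first. Converting a permutation between the two conventions
// mirrors both the position and the value, exactly as the shim above does.
std::array<int, 4> FlipPerm(const std::array<int, 4>& legacy_perm) {
  std::array<int, 4> perm;
  for (int i = 0; i < 4; ++i) {
    perm[i] = 3 - legacy_perm[3 - i];
  }
  return perm;
}

int main() {
  // Swapping the two innermost axes in Dims order ({1, 0, 2, 3}) becomes a
  // swap of the two trailing axes in RuntimeShape order ({0, 1, 3, 2}).
  for (int p : FlipPerm({1, 0, 2, 3})) std::printf("%d ", p);
  std::printf("\n");
}
```

Note that the double reversal maps the identity permutation to itself, so the shim adds nothing beyond the four-element loop.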
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index ba7b0fd2f32..e4a40a2e909 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -98,7 +98,6 @@ using reference_ops::SpaceToBatchND;
 using reference_ops::Split;
 using reference_ops::StridedSlice;
 using reference_ops::Sub16;
-using reference_ops::Transpose;
 
 // TODO(b/80247582) Remove this constant.
 // This will be phased out as the shifts are revised with more thought. Use of a
@@ -180,6 +179,12 @@ struct TTypes {
   typedef Eigen::TensorMap<
       Eigen::Tensor<const Scalar, 2, Eigen::RowMajor>>
       UnalignedConstMatrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const Scalar, 4, Eigen::RowMajor>, Eigen::Aligned>
+      ConstTensor;
+  typedef Eigen::TensorMap<Eigen::Tensor<Scalar, 4, Eigen::RowMajor>,
+                           Eigen::Aligned>
+      Tensor;
 };
 
 // TODO(b/62193649): this function is only needed as long
@@ -6694,6 +6699,171 @@ inline void Logistic16bitPercision(const LogisticParams& params,
   }
 }
 
+// Transpose2DOn32bitMatrix only deals with typical 2D matrix transpose ops.
+inline void Transpose2DOn32bitMatrix(const TransposeParams& params,
+                                     const RuntimeShape& input_shape,
+                                     const int32_t* input_data,
+                                     const RuntimeShape& output_shape,
+                                     int32_t* output_data) {
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(params.perm_count, 2);
+  TFLITE_DCHECK_EQ(params.perm[0], 1);
+  TFLITE_DCHECK_EQ(params.perm[1], 0);
+
+  const int d0 = input_shape.DimsData()[0];
+  const int d1 = input_shape.DimsData()[1];
+#ifdef USE_NEON
+  const int kLines = 4;
+  const int kSkipSize = (kLines - 1) * d1;
+#endif
+
+  const int32_t* input = input_data;
+
+  int i = 0;
+#ifdef USE_NEON
+  for (; i <= d0 - kLines; i += kLines) {
+    int32_t* output = output_data + i;
+
+    const int32_t* input_ptr = input;
+    __builtin_prefetch(input_ptr, 0, 3);
+    input_ptr += d1;
+    __builtin_prefetch(input_ptr, 0, 3);
+    input_ptr += d1;
+    __builtin_prefetch(input_ptr, 0, 3);
+    input_ptr += d1;
+    __builtin_prefetch(input_ptr, 0, 3);
+
+    int j = 0;
+    for (; j <= d1 - kLines; j += kLines) {
+      input_ptr = input;
+      int32x4_t a0 = vld1q_s32(input);
+      input_ptr += d1;
+      int32x4_t a1 = vld1q_s32(input_ptr);
+      input_ptr += d1;
+      int32x4_t a2 = vld1q_s32(input_ptr);
+      input_ptr += d1;
+      int32x4_t a3 = vld1q_s32(input_ptr);
+
+      int32x4x2_t tmp1 = vuzpq_s32(a0, a2);
+      int32x4x2_t tmp2 = vuzpq_s32(a1, a3);
+      int32x4x2_t tmp3 = vtrnq_s32(tmp1.val[0], tmp2.val[0]);
+      int32x4x2_t tmp4 = vtrnq_s32(tmp1.val[1], tmp2.val[1]);
+
+      vst1q_s32(output, tmp3.val[0]);
+      output += d0;
+      vst1q_s32(output, tmp4.val[0]);
+      output += d0;
+      vst1q_s32(output, tmp3.val[1]);
+      output += d0;
+      vst1q_s32(output, tmp4.val[1]);
+      output += d0;
+      input += kLines;
+    }
+    if (j == d1) {
+      input += kSkipSize;
+    } else {
+      for (int p = 0; p < kLines; ++p) {
+        for (int q = 0; q < d1 - j; ++q) {
+          *(output + q * d0 + p) = *(input + p * d1 + q);
+        }
+      }
+      input += (d1 - j) + kSkipSize;
+    }
+  }
+#endif
+  for (; i < d0; ++i) {
+    int32_t* output = output_data + i;
+    for (int j = 0; j < d1; ++j) {
+      *output = *input;
+      output += d0;
+      ++input;
+    }
+  }
+}
+
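The `vuzpq_s32`/`vtrnq_s32` pair above is the stock NEON recipe for an in-register 4x4 transpose of 32-bit lanes: the unzip step separates even and odd columns, and the transpose-pairs step restores them in column order. A self-contained check of that exact shuffle sequence against the scalar definition (ARM-only; this harness is mine, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

int main() {
#ifdef __ARM_NEON
  int32_t in[4][4], out[4][4];
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) in[r][c] = r * 4 + c;

  int32x4_t a0 = vld1q_s32(in[0]);
  int32x4_t a1 = vld1q_s32(in[1]);
  int32x4_t a2 = vld1q_s32(in[2]);
  int32x4_t a3 = vld1q_s32(in[3]);

  // Identical shuffle sequence to Transpose2DOn32bitMatrix above.
  int32x4x2_t tmp1 = vuzpq_s32(a0, a2);  // even/odd columns of rows 0 and 2
  int32x4x2_t tmp2 = vuzpq_s32(a1, a3);  // even/odd columns of rows 1 and 3
  int32x4x2_t tmp3 = vtrnq_s32(tmp1.val[0], tmp2.val[0]);
  int32x4x2_t tmp4 = vtrnq_s32(tmp1.val[1], tmp2.val[1]);

  vst1q_s32(out[0], tmp3.val[0]);  // former column 0
  vst1q_s32(out[1], tmp4.val[0]);  // former column 1
  vst1q_s32(out[2], tmp3.val[1]);  // former column 2
  vst1q_s32(out[3], tmp4.val[1]);  // former column 3

  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) assert(out[r][c] == in[c][r]);
#endif
  return 0;
}
```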
+template <typename T>
+inline void TransposeImpl(const TransposeParams& params,
+                          const RuntimeShape& unextended_input_shape,
+                          const T* input_data,
+                          const RuntimeShape& unextended_output_shape,
+                          T* output_data) {
+  const int unextended_output_size = unextended_output_shape.DimensionsCount();
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int input_ext_size = 4 - unextended_input_shape.DimensionsCount();
+  const int output_ext_size = 4 - unextended_output_size;
+
+  // The perm data is extended to match the output, with each index incremented
+  // by the amount of front padding added to the input shape.
+  int extended_perm[4];
+  for (int i = 0; i < output_ext_size; ++i) {
+    extended_perm[i] = i;
+  }
+  for (int i = 0; i < unextended_output_size; ++i) {
+    extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
+  }
+
+  Eigen::array<int, 4> p;
+  for (int i = 0; i < 4; ++i) p[i] = extended_perm[i];
+  Eigen::DSizes<Eigen::DenseIndex, 4> input_dsizes;
+  for (int d = 0; d < 4; d++) {
+    input_dsizes[d] = static_cast<Eigen::DenseIndex>(input_shape.Dims(d));
+  }
+  Eigen::DSizes<Eigen::DenseIndex, 4> output_dsizes;
+  for (int d = 0; d < 4; d++) {
+    output_dsizes[d] = static_cast<Eigen::DenseIndex>(output_shape.Dims(d));
+  }
+
+  auto x = typename TTypes<T>::ConstTensor(input_data, input_dsizes);
+  auto y = typename TTypes<T>::Tensor(output_data, output_dsizes);
+  y = x.shuffle(p);
+}
+
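Everything that does not hit the 2-D 32-bit fast path goes through Eigen's Tensor `shuffle`. A small standalone illustration of the shuffle semantics being relied on, using the unsupported Eigen Tensor module directly instead of the `TTypes` maps (an assumption of mine; optimized_ops.h already depends on Eigen, so the header should be available):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  // A rank-3 shape {2, 3, 4} front-padded to rank 4, as TransposeImpl does.
  Eigen::Tensor<float, 4, Eigen::RowMajor> in(1, 2, 3, 4);
  in.setRandom();

  // extended_perm for a perm of {1, 0, 2} over the original three axes.
  Eigen::array<int, 4> perm = {0, 2, 1, 3};
  Eigen::Tensor<float, 4, Eigen::RowMajor> out = in.shuffle(perm);

  // out.dimension(i) == in.dimension(perm[i]), and elements move accordingly,
  // matching how TransposeParams::perm is interpreted.
  assert(out.dimension(1) == 3 && out.dimension(2) == 2);
  assert(out(0, 2, 1, 3) == in(0, 1, 2, 3));
  return 0;
}
```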
+template <typename T>
+void Transpose(const TransposeParams& params,
+               const RuntimeShape& unextended_input_shape, const T* input_data,
+               const RuntimeShape& unextended_output_shape, T* output_data) {
+  const int unextended_output_size = unextended_output_shape.DimensionsCount();
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_size, 4);
+  TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
+
+  // The transpose kernel only rearranges values; it does no arithmetic on
+  // them, so it is safe to implement it per size of scalar type. This trick
+  // keeps the total code size in a reasonable range.
+  switch (sizeof(T)) {
+    case 1:
+      // TODO(jaesung): Find a good 2d transpose implementation for 8-bit
+      // matrices.
+      TransposeImpl<int8_t>(params, unextended_input_shape,
+                            reinterpret_cast<const int8_t*>(input_data),
+                            unextended_output_shape,
+                            reinterpret_cast<int8_t*>(output_data));
+      break;
+    case 4:
+      if (unextended_input_shape.DimensionsCount() == 2 &&
+          params.perm[0] == 1 && params.perm[1] == 0) {
+        Transpose2DOn32bitMatrix(params, unextended_input_shape,
+                                 reinterpret_cast<const int32_t*>(input_data),
+                                 unextended_output_shape,
+                                 reinterpret_cast<int32_t*>(output_data));
+        return;
+      }
+      TransposeImpl<int32_t>(params, unextended_input_shape,
+                             reinterpret_cast<const int32_t*>(input_data),
+                             unextended_output_shape,
+                             reinterpret_cast<int32_t*>(output_data));
+      break;
+    default:
+      // Reroute to the reference version if the element size is not common.
+      reference_ops::Transpose(params, unextended_input_shape, input_data,
+                               unextended_output_shape, output_data);
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index 5f2e8331f59..203cafdfbb5 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -3046,9 +3046,11 @@ inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
 }
 
 template <typename T>
-void Transpose(const TransposeParams& params,
-               const RuntimeShape& unextended_input_shape, const T* input_data,
-               const RuntimeShape& unextended_output_shape, T* output_data) {
+inline void TransposeImpl(const TransposeParams& params,
+                          const RuntimeShape& unextended_input_shape,
+                          const T* input_data,
+                          const RuntimeShape& unextended_output_shape,
+                          T* output_data) {
   const int unextended_output_size = unextended_output_shape.DimensionsCount();
   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_output_size, 4);
@@ -3096,6 +3098,42 @@ void Transpose(const TransposeParams& params,
   }
 }
 
+template <typename T>
+void Transpose(const TransposeParams& params,
+               const RuntimeShape& unextended_input_shape, const T* input_data,
+               const RuntimeShape& unextended_output_shape, T* output_data) {
+  // The transpose kernel only rearranges values; it does no arithmetic on
+  // them, so it is safe to implement it per size of scalar type. This trick
+  // keeps the total code size in a reasonable range.
+  switch (sizeof(T)) {
+    case 1:
+      TransposeImpl<int8_t>(params, unextended_input_shape,
+                            reinterpret_cast<const int8_t*>(input_data),
+                            unextended_output_shape,
+                            reinterpret_cast<int8_t*>(output_data));
+      break;
+    case 2:
+      TransposeImpl<int16_t>(params, unextended_input_shape,
+                             reinterpret_cast<const int16_t*>(input_data),
+                             unextended_output_shape,
+                             reinterpret_cast<int16_t*>(output_data));
+      break;
+
+    case 4:
+      TransposeImpl<int32_t>(params, unextended_input_shape,
+                             reinterpret_cast<const int32_t*>(input_data),
+                             unextended_output_shape,
+                             reinterpret_cast<int32_t*>(output_data));
+      break;
+    case 8:
+      TransposeImpl<int64_t>(params, unextended_input_shape,
+                             reinterpret_cast<const int64_t*>(input_data),
+                             unextended_output_shape,
+                             reinterpret_cast<int64_t*>(output_data));
+      break;
+  }
+}
+
 inline void TransposeConv(
     const ConvParams& params, const RuntimeShape& input_shape,
     const float* input_data, const RuntimeShape& filter_shape,
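Dispatching on `sizeof(T)` means `float` and `int32_t` share a single instantiation, `uint8_t` and `int8_t` another, and so on; transpose never interprets the bytes it moves, so punning the pointers is harmless here. A toy sketch of the pattern with a stand-in kernel (names are mine; it leans on the same `reinterpret_cast` punning as the kernels above):

```cpp
#include <cstdint>
#include <cstdio>

// A pure data-movement kernel can be instantiated once per element *size*
// instead of once per element *type*. ReverseCopy stands in for
// TransposeImpl; Reverse stands in for the sizeof(T) dispatch.
template <typename T>
void ReverseCopy(const T* in, T* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = in[n - 1 - i];
}

template <typename T>
void Reverse(const T* in, T* out, int n) {
  static_assert(sizeof(T) == 4, "sketch covers only 4-byte types");
  ReverseCopy<int32_t>(reinterpret_cast<const int32_t*>(in),
                       reinterpret_cast<int32_t*>(out), n);
}

int main() {
  const float in[3] = {1.5f, 2.5f, 3.5f};
  float out[3];
  Reverse(in, out, 3);  // float traffic rides the single int32_t instantiation
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // 3.5 2.5 1.5
}
```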
diff --git a/tensorflow/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc
index 33dee1ff838..c2d2cde2a68 100644
--- a/tensorflow/lite/kernels/transpose.cc
+++ b/tensorflow/lite/kernels/transpose.cc
@@ -13,9 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <string.h>
+
 #include <vector>
+
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -29,6 +32,7 @@ namespace transpose {
 // This file has two implementations of Transpose.
 enum KernelType {
   kReference,
+  kGenericOptimized,
 };
 
 struct TransposeContext {
@@ -96,8 +100,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int size = op_context.perm->dims->data[0];
   TransposeParams params;
   params.perm_count = size;
+  bool identical = true;
   for (int i = 0; i < size; ++i) {
     params.perm[i] = perm_data[i];
+    if (perm_data[i] != i) identical = false;
   }
+
+  // TODO(b/140779653): Add an optimization pass in the conversion process to
+  // remove no-op transpose nodes like the one short-circuited below.
+  if (identical) {
+    memcpy(op_context.output->data.raw, op_context.input->data.raw,
+           op_context.output->bytes);
+    return kTfLiteOk;
+  }
 
 #define TF_LITE_TRANSPOSE(type, scalar) \
@@ -108,32 +122,44 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
-      if (kernel_type == kReference) {
+      if (kernel_type == kGenericOptimized) {
+        TF_LITE_TRANSPOSE(optimized_ops, float);
+      } else {
         TF_LITE_TRANSPOSE(reference_ops, float);
       }
       break;
     case kTfLiteUInt8:
-      if (kernel_type == kReference) {
+      if (kernel_type == kGenericOptimized) {
+        TF_LITE_TRANSPOSE(optimized_ops, uint8_t);
+      } else {
         TF_LITE_TRANSPOSE(reference_ops, uint8_t);
       }
       break;
     case kTfLiteInt8:
-      if (kernel_type == kReference) {
+      if (kernel_type == kGenericOptimized) {
+        TF_LITE_TRANSPOSE(optimized_ops, int8_t);
+      } else {
         TF_LITE_TRANSPOSE(reference_ops, int8_t);
       }
       break;
     case kTfLiteInt32:
-      if (kernel_type == kReference) {
+      if (kernel_type == kGenericOptimized) {
+        TF_LITE_TRANSPOSE(optimized_ops, int32_t);
+      } else {
        TF_LITE_TRANSPOSE(reference_ops, int32_t);
       }
       break;
     case kTfLiteInt64:
-      if (kernel_type == kReference) {
+      if (kernel_type == kGenericOptimized) {
+        TF_LITE_TRANSPOSE(optimized_ops, int64_t);
+      } else {
         TF_LITE_TRANSPOSE(reference_ops, int64_t);
       }
       break;
     case kTfLiteBool:
-      if (kernel_type == kReference) {
+      if (kernel_type == kGenericOptimized) {
+        TF_LITE_TRANSPOSE(optimized_ops, bool);
+      } else {
         TF_LITE_TRANSPOSE(reference_ops, bool);
       }
       break;
@@ -156,7 +182,15 @@ TfLiteRegistration* Register_TRANSPOSE_REF() {
   return &r;
 }
 
-TfLiteRegistration* Register_TRANSPOSE() { return Register_TRANSPOSE_REF(); }
+TfLiteRegistration* Register_TRANSPOSE_GENERIC_OPTIMIZED() {
+  static TfLiteRegistration r = {nullptr, nullptr, transpose::Prepare,
+                                 transpose::Eval<transpose::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_TRANSPOSE() {
+  return Register_TRANSPOSE_GENERIC_OPTIMIZED();
+}
 
 }  // namespace builtin
 }  // namespace ops
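`Eval` is templated on `KernelType`, so each registration stamps out its own copy with the reference/optimized branch resolved at compile time. A stripped-down sketch of that registration pattern (my simplification: `int` stands in for `TfLiteStatus`, plain function pointers for `TfLiteRegistration`):

```cpp
// Because Eval is templated on the kernel flavor, kernel_type is a
// compile-time constant and the branch in each case label folds away.
enum KernelType { kReference, kGenericOptimized };

template <KernelType kernel_type>
int Eval() {
  if (kernel_type == kGenericOptimized) {
    return 1;  // would dispatch to optimized_ops::Transpose
  }
  return 0;  // would dispatch to reference_ops::Transpose
}

using EvalFn = int (*)();
EvalFn Register_TRANSPOSE_REF() { return Eval<kReference>; }
EvalFn Register_TRANSPOSE_GENERIC_OPTIMIZED() {
  return Eval<kGenericOptimized>;
}
```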
diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
index f5d8f314d6e..badbec30f7a 100644
--- a/tensorflow/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -233,6 +233,28 @@ TEST(TransposeTest, Test2DInputConstTensor) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
 }
 
+TEST(TransposeTest, Test2D4x4KernelTestLeftOverRightSide) {
+  TransposeOpConstModel m({4, 6}, {2}, {1, 0});
+  m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 4}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 6, 12, 18, 1, 7, 13, 19, 2, 8, 14, 20,
+                                3, 9, 15, 21, 4, 10, 16, 22, 5, 11, 17, 23}));
+}
+
+TEST(TransposeTest, Test2D4x4KernelTest2LeftOverBottomSide) {
+  TransposeOpConstModel m({6, 4}, {2}, {1, 0});
+  m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 6}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 4, 8, 12, 16, 20, 1, 5, 9, 13, 17, 21,
+                                2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
+}
+
 TEST(TransposeTest, Test2DInputDynamicTensor) {
   TransposeOpDynamicModel m({3, 2}, {2});
   m.SetInput({0, 1, 2, 3, 4, 5});
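Both new tests pick shapes where exactly one dimension is not a multiple of four: {4, 6} leaves two columns for the `j != d1` leftover branch inside the NEON loop, while {6, 4} leaves two rows for the trailing scalar loop. The expected arrays can be regenerated with a few lines (assuming the same row-major layout the models above use):

```cpp
#include <cstdio>

int main() {
  // Input shape {d0, d1} holds values 0..d0*d1-1 in row-major order; the
  // transposed output, also row-major, is then out[r][c] = c * d1 + r.
  const int d0 = 4, d1 = 6;  // use d0 = 6, d1 = 4 for the second test
  for (int r = 0; r < d1; ++r)
    for (int c = 0; c < d0; ++c) std::printf("%d, ", c * d1 + r);
  std::printf("\n");
  return 0;
}
```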