From 313408ba1f5c46a3248980c96399e1804fd6d436 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 12 May 2016 12:48:25 -0800
Subject: [PATCH] Rely on the tensor cost model to figure out the best strategy
 to parallelize the transpose operation instead of using a hardcoded cutoff
 point.

Change: 122193123
---
 tensorflow/core/kernels/transpose_functor_cpu.cc | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 0997b3b2839..ca4268c78d3 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -56,14 +56,7 @@ void TransposeUsingEigen(const Device& d, const Tensor& in,
   auto y = typename TTypes<T, NDIMS>::Tensor(
       reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data())),
       out->shape().AsEigenDSizes<NDIMS>());
-  auto nelem = in.NumElements();
-  static const int64 kInlineThreshold = 131072;
-  if (nelem * sizeof(T) < kInlineThreshold) {
-    // Don't bother multi-threaded transpose if 'in' is small.
-    y = x.shuffle(p);
-  } else {
-    y.device(d) = x.shuffle(p);
-  }
+  y.device(d) = x.shuffle(p);
 }
 
 }  // end namespace internal