Rely on the tensor cost model to figure out the best strategy to parallelize
the transpose operation instead of using a hardcoded cutoff point. Change: 122193123
This commit is contained in:
parent b082c4f921
commit 313408ba1f
@@ -56,14 +56,7 @@ void TransposeUsingEigen(const Device& d, const Tensor& in,
   auto y = typename TTypes<T, NDIMS>::Tensor(
       reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data())),
       out->shape().AsEigenDSizes<NDIMS>());
-  auto nelem = in.NumElements();
-  static const int64 kInlineThreshold = 131072;
-  if (nelem * sizeof(T) < kInlineThreshold) {
-    // Don't bother multi-threaded transpose if 'in' is small.
-    y = x.shuffle(p);
-  } else {
-    y.device(d) = x.shuffle(p);
-  }
+  y.device(d) = x.shuffle(p);
 }
 
 }  // end namespace internal
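After this change there is no small-tensor special case: assigning through `.device(d)` always goes through Eigen's expression evaluator, whose per-coefficient cost estimate decides whether to evaluate the shuffle on a single thread or split it across the device's thread pool. Below is a minimal standalone sketch of that pattern using plain Eigen tensors rather than TensorFlow's `TTypes` wrappers; the pool size and tensor shapes are illustrative assumptions, not values from this commit.

// Sketch only: lets Eigen's cost model pick the parallelization
// strategy for a shuffle (transpose), as the commit now does.
// Pool size and shapes are arbitrary example values.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);  // device wrapping the pool

  Eigen::Tensor<float, 2> x(256, 512);
  x.setRandom();
  Eigen::Tensor<float, 2> y(512, 256);

  // Swap the two dimensions, i.e. a 2-D transpose.
  Eigen::array<int, 2> p{{1, 0}};

  // No explicit size cutoff: the evaluator estimates the expression's
  // cost and may run it inline if the tensor is small.
  y.device(device) = x.shuffle(p);
  return 0;
}

With this pattern the 131072-byte kInlineThreshold becomes unnecessary: the cost model makes an equivalent, shape-aware decision at evaluation time instead of relying on one fixed byte count.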
|