Rely on the tensor cost model to figure out the best strategy to parallelize
the transpose operation instead of using a hardcoded cutoff point. Change: 122193123
This commit is contained in:
parent b082c4f921
commit 313408ba1f
@@ -56,14 +56,7 @@ void TransposeUsingEigen(const Device& d, const Tensor& in,
   auto y = typename TTypes<T, NDIMS>::Tensor(
       reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data())),
       out->shape().AsEigenDSizes<NDIMS>());
-  auto nelem = in.NumElements();
-  static const int64 kInlineThreshold = 131072;
-  if (nelem * sizeof(T) < kInlineThreshold) {
-    // Don't bother multi-threaded transpose if 'in' is small.
-    y = x.shuffle(p);
-  } else {
-    y.device(d) = x.shuffle(p);
-  }
+  y.device(d) = x.shuffle(p);
 }
 
 }  // end namespace internal
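After this change there is no small-tensor special case: assigning through `.device(d)` always goes through Eigen's expression evaluator, whose per-coefficient cost estimate decides whether to evaluate the shuffle on a single thread or split it across the device's thread pool. Below is a minimal standalone sketch of that pattern using plain Eigen tensors rather than TensorFlow's `TTypes` wrappers; the pool size and tensor shapes are illustrative assumptions, not values from this commit.

// Sketch only: lets Eigen's cost model pick the parallelization
// strategy for a shuffle (transpose), as the commit now does.
// Pool size and shapes are arbitrary example values.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);  // device wrapping the pool

  Eigen::Tensor<float, 2> x(256, 512);
  x.setRandom();
  Eigen::Tensor<float, 2> y(512, 256);

  // Swap the two dimensions, i.e. a 2-D transpose.
  Eigen::array<int, 2> p{{1, 0}};

  // No explicit size cutoff: the evaluator estimates the expression's
  // cost and may run it inline if the tensor is small.
  y.device(device) = x.shuffle(p);
  return 0;
}

With this pattern the 131072-byte kInlineThreshold becomes unnecessary: the cost model makes an equivalent, shape-aware decision at evaluation time instead of relying on one fixed byte count.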
|