Use BlasGemv() when autotune is not set.

PiperOrigin-RevId: 163754092
2017-07-31 15:05:49 -07:00 · 2017-07-31 15:05:49 -07:00 · d03ba54f72
commit d03ba54f72
parent 724884f1ca
1 changed files with 13 additions and 2 deletions
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@ -212,6 +212,11 @@ bool LaunchBlasGemv<Eigen::half>::IsSupported() {
  return false;
 }

+template <typename T>
+bool ShouldUseGemv(uint64 n) {
+  return (LaunchBlasGemv<T>::IsSupported() && n == 1);
+}
+
 }  // namespace

 bool GetCublasAutotuneComputationType(
@ -339,7 +344,7 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
          }
        }
        // Try BlasGemvWithProfiling
-        if (LaunchBlasGemv<T>::IsSupported() && n == 1) {
+        if (ShouldUseGemv<T>(n)) {
          LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a,
                                     transpose_a ? m : k, transpose_a ? k : m,
                                     a_ptr, b_ptr, &c_ptr, &profile_result);
@ -385,11 +390,17 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
    //  2) compute type does not support autotune;
    //  3) no algorithm is found;
    //  4) all internal kernels in autotune return invalid results.
+    //  For the following case, we use normal BlasGemv():
+    //  1) We didn't set the use_autotune flag but LaunchBlasGemv is supported
+    //     and n == 1.
+    //  2) We set the use_autotune flag and it picked up BlasGemv() and set the
+    //     algorithm_config.algorithm() to be kDefaultBlasGemv.
    if (!use_autotune || !compute_type_supported || algorithms->empty() ||
        algorithm_config.algorithm() == kNoAlgorithm ||
        algorithm_config.algorithm() == kDefaultBlasGemm ||
        algorithm_config.algorithm() == kDefaultBlasGemv) {
-      if (algorithm_config.algorithm() == kDefaultBlasGemv) {
+      if (algorithm_config.algorithm() == kDefaultBlasGemv ||
+          ShouldUseGemv<T>(n)) {
        // This is a matrix*vector multiply so use GEMV to compute A * b.
        // Here we are multiplying in the natural order, so we have to flip
        // the transposition flag to compensate for the tensor being stored