Add workaround for cublasLt known issue

- Avoids a heuristic alignment issue noted in the CUDA Release Notes.
Ben Barsdell 2020-08-04 22:26:57 +10:00
parent f7d29c94df
commit 8c0eb4b35b


@@ -488,6 +488,26 @@ cudaDataType_t GetCUDADataType(blas::DataType ty) {
      return CUDA_C_64F;
  }
}
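
// Returns the size in bytes of one element of the given data type. The
// switch covers every blas::DataType value, so no default case is needed.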
int GetDataTypeSizeBytes(blas::DataType ty) {
  switch (ty) {
    case blas::DataType::kF16:
      return 2;
    case blas::DataType::kF32:
      return 4;
    case blas::DataType::kF64:
      return 8;
    case blas::DataType::kI8:
      return 1;
    case blas::DataType::kI32:
      return 4;
    case blas::DataType::kComplexF32:
      return 8;
    case blas::DataType::kComplexF64:
      return 16;
  }
}
} // namespace
template <typename FuncT, typename... Args>
@@ -3161,22 +3181,6 @@ UniqueLayoutDesc CreateCublasLtLayoutDesc(blas::DataType data_type, uint64 rows,
  return unique_desc;
}
UniqueMatmulPreference CreateCublasLtMatmulPreference(
    size_t max_workspace_bytes) {
  cublasLtMatmulPreference_t preference;
  cublasStatus_t status = cublasLtMatmulPreferenceCreate(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) {
    VLOG(2) << "cublasLtMatmulPreferenceCreate failed: " << ToString(status);
    return nullptr;
  }
  UniqueMatmulPreference unique_preference(preference);
  if (!SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                       max_workspace_bytes)) {
    return nullptr;
  }
  return unique_preference;
}
// Helper function to allocate workspace.
port::Status AllocateWorkspace(void** workspace,
                               ScratchAllocator* scratch_allocator,
@@ -3230,6 +3234,11 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan {
  blas::DataType cd_type() const { return cd_type_; }
  blas::DataType scale_type() const { return scale_type_; }
  blas::PointerMode pointer_mode() const { return pointer_mode_; }
  int batch_count() const { return batch_count_; }
  int64 stride_a() const { return stride_a_; }
  int64 stride_b() const { return stride_b_; }
  int64 stride_c() const { return stride_c_; }
  int64 stride_d() const { return stride_d_; }

 private:
  UniqueOpDesc op_desc_;
@@ -3241,6 +3250,11 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan {
  blas::DataType cd_type_;
  blas::DataType scale_type_;
  blas::PointerMode pointer_mode_;
  int batch_count_;
  int64 stride_a_;
  int64 stride_b_;
  int64 stride_c_;
  int64 stride_d_;
};
CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan(
@@ -3261,7 +3275,12 @@ CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan(
      ab_type_(ab_type),
      cd_type_(cd_type),
      scale_type_(GetScaleType(cd_type, computation_type)),
      pointer_mode_(pointer_mode) {
      pointer_mode_(pointer_mode),
      batch_count_(batch_count),
      stride_a_(stride_a),
      stride_b_(stride_b),
      stride_c_(stride_c),
      stride_d_(stride_d) {
  uint64 rows_a = transa == blas::Transpose::kNoTranspose ? m : k;
  uint64 cols_a = transa == blas::Transpose::kNoTranspose ? k : m;
  uint64 rows_b = transb == blas::Transpose::kNoTranspose ? k : n;
@@ -3296,6 +3315,53 @@ class CUDABlasLtMatmulAlgorithm final : public blas::IBlasLtMatmulAlgorithm {
  size_t workspace_size_;
};
UniqueMatmulPreference CreateCublasLtMatmulPreference(
    const blas::IBlasLtMatmulPlan* plan,
    size_t max_workspace_bytes) {
  cublasLtMatmulPreference_t preference;
  cublasStatus_t status = cublasLtMatmulPreferenceCreate(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) {
    VLOG(2) << "cublasLtMatmulPreferenceCreate failed: " << ToString(status);
    return nullptr;
  }
  UniqueMatmulPreference unique_preference(preference);
  if (!SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                       max_workspace_bytes)) {
    return nullptr;
  }

  const auto& cuda_plan = *static_cast<const CUDABlasLtMatmulPlan*>(plan);
  if (cuda_plan.batch_count() == 0) {
    return unique_preference;
  }
  // This is a workaround for a known issue in cuBLASLt where the heuristic
  // may in rare cases select an algo that does not support the specified
  // stride. Specifying the alignment requirements manually like this avoids
  // the issue.
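  // Note: (stride & -stride) isolates the lowest set bit of the stride,
  // i.e., the largest power of two that divides it, so the product below is
  // the largest byte alignment the batch stride alone can guarantee.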
  auto get_alignment_bytes = [](int64 stride, blas::DataType dtype) {
    return (stride & -stride) * GetDataTypeSizeBytes(dtype);
  };
  if ((cuda_plan.stride_a() &&
       !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES,
                        (uint32)get_alignment_bytes(cuda_plan.stride_a(),
                                                    cuda_plan.ab_type()))) ||
      (cuda_plan.stride_b() &&
       !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES,
                        (uint32)get_alignment_bytes(cuda_plan.stride_b(),
                                                    cuda_plan.ab_type()))) ||
      (cuda_plan.stride_c() &&
       !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES,
                        (uint32)get_alignment_bytes(cuda_plan.stride_c(),
                                                    cuda_plan.cd_type()))) ||
      (cuda_plan.stride_d() &&
       !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES,
                        (uint32)get_alignment_bytes(cuda_plan.stride_d(),
                                                    cuda_plan.cd_type())))) {
    return nullptr;
  }

  return unique_preference;
}
} // namespace
#endif // CUDA_VERSION >= 11000
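
To make the workaround's arithmetic concrete, here is a minimal standalone
sketch (not part of the commit) of the alignment computation, using the
element sizes that GetDataTypeSizeBytes returns; the helper name
AlignmentBytes is hypothetical:

#include <cstdint>
#include <cstdio>

// Byte alignment guaranteed by a batch stride alone, given the element size:
// stride & -stride keeps only the lowest set bit of stride, i.e., the largest
// power of two dividing it.
static int64_t AlignmentBytes(int64_t stride, int size_bytes) {
  return (stride & -stride) * size_bytes;
}

int main() {
  // stride of 384 = 2^7 * 3 fp16 elements (2 bytes each): 128 * 2 = 256.
  std::printf("%lld\n", static_cast<long long>(AlignmentBytes(384, 2)));
  // stride of 100 = 2^2 * 25 fp32 elements (4 bytes each): 4 * 4 = 16.
  std::printf("%lld\n", static_cast<long long>(AlignmentBytes(100, 4)));
  return 0;
}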
@@ -3327,7 +3393,7 @@ bool CUDABlas::GetBlasLtMatmulAlgorithms(
    out_algorithms) {
#if CUDA_VERSION >= 11000
  UniqueMatmulPreference preference =
      CreateCublasLtMatmulPreference(max_workspace_size);
      CreateCublasLtMatmulPreference(plan, max_workspace_size);
  if (!preference) return false;

  std::vector<cublasLtMatmulHeuristicResult_t> results(max_algorithm_count);