From 5bf0bab331720d2b1cd1ff862b646bdef45b7206 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 7 May 2020 20:55:49 +0000 Subject: [PATCH] [ROCm] Fix for ROCm CSB breakage on 200507 The following PR/commit introduces a build error on the ROCm platform https://github.com/tensorflow/tensorflow/pull/38802 The error is caused by a call to the `CsrgemmBufferSize` routine which only exists on the CUDA side. The call to it was not guarded by the same #if block that guards the function declaration + definition. Adding the missing #if block fixes the issue. This PR also adds some explicit `GOOGLE_CUDA &&` and `|| TENSORFLOW_USE_ROCM` conditions to some `#if` to make things clear. --- tensorflow/core/kernels/cuda_sparse.h | 10 +++++----- tensorflow/core/kernels/sparse/mat_mul_op.cc | 16 +++++----------- .../core/kernels/sparse/sparse_mat_mul_op.cc | 16 +++++++++------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/kernels/cuda_sparse.h b/tensorflow/core/kernels/cuda_sparse.h index eb69469b615..2d41cc72421 100644 --- a/tensorflow/core/kernels/cuda_sparse.h +++ b/tensorflow/core/kernels/cuda_sparse.h @@ -259,7 +259,7 @@ class GpuSparse { // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-coo2csr. Status Coo2csr(const int* cooRowInd, int nnz, int m, int* csrRowPtr) const; -#if CUDA_VERSION < 10020 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM // Sparse-dense matrix multiplication C = alpha * op(A) * op(B) + beta * C, // where A is a sparse matrix in CSR format, B and C are dense tall // matrices. This routine allows transposition of matrix B, which @@ -311,7 +311,7 @@ class GpuSparse { // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv_mergepath // // **NOTE** This is an in-place operation for data in y. -#if CUDA_VERSION < 10020 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM template Status Csrmv(gpusparseOperation_t transA, int m, int n, int nnz, const Scalar* alpha_host, const gpusparseMatDescr_t descrA, @@ -366,7 +366,7 @@ class GpuSparse { Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC, void* workspace); -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) // Computes sparse-sparse matrix multiplication of matrices // stored in CSR format. This is part zero: calculate required workspace // size. @@ -383,7 +383,7 @@ class GpuSparse { // output. csrSortedRowPtrC must be preallocated on device with // m + 1 entries. See: // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm. -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM Status CsrgemmNnz(gpusparseOperation_t transA, gpusparseOperation_t transB, int m, int k, int n, const gpusparseMatDescr_t descrA, int nnzA, const int* csrSortedRowPtrA, @@ -408,7 +408,7 @@ class GpuSparse { // addition. csrValC and csrColIndC must be allocated on the device // with nnzTotalDevHostPtr entries (as calculated by CsrgemmNnz). See: // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm. -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM template Status Csrgemm(gpusparseOperation_t transA, gpusparseOperation_t transB, int m, int k, int n, const gpusparseMatDescr_t descrA, diff --git a/tensorflow/core/kernels/sparse/mat_mul_op.cc b/tensorflow/core/kernels/sparse/mat_mul_op.cc index a0834800446..50fa0ec88ea 100644 --- a/tensorflow/core/kernels/sparse/mat_mul_op.cc +++ b/tensorflow/core/kernels/sparse/mat_mul_op.cc @@ -728,12 +728,14 @@ namespace { template struct GPUDataType; +// GPUDataType templates are currently not instantiated in the ROCm flow +// So leaving out the #elif TENSORFLOW_USE_ROCM blocks for now +// hipblas library is not (yet) being pulled in via rocm_configure.bzl +// so cannot reference tyeps from hipblas headers here template <> struct GPUDataType { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_R_16F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_R_16F; #endif }; @@ -741,8 +743,6 @@ template <> struct GPUDataType { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_R_32F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_R_32F; #endif }; @@ -750,8 +750,6 @@ template <> struct GPUDataType> { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_C_32F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_C_32F; #endif }; @@ -759,8 +757,6 @@ template <> struct GPUDataType { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_R_64F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_R_64F; #endif }; @@ -768,8 +764,6 @@ template <> struct GPUDataType> { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_C_64F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_C_64F; #endif }; @@ -957,7 +951,7 @@ class CSRSparseMatrixMatVec { const int n = a.dense_shape_host(1); const int nnz = a.values.size(); DCHECK_EQ(nnz, a.col_ind.size()); -#if CUDA_VERSION >= 10020 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10020) TF_RETURN_IF_ERROR(cuda_sparse.Csrmv(transA_, m, n, nnz, &alpha, a.values.data(), a.row_ptr.data(), a.col_ind.data(), x, &beta, y)); diff --git a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc index 7325d5f6873..fb652e13d15 100644 --- a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc +++ b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc @@ -417,7 +417,7 @@ class CSRSparseMatMulGPUOp : public OpKernel { } auto b_input_dense_shape = b_input_matrix->dense_shape().vec(); -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) size_t maxWorkspaceSize = 0; for (int i = 0; i < batch_size; ++i) { // Calculate maximum workspace size over batch. @@ -558,7 +558,7 @@ struct CSRSparseSparseMatrixMatMul initialized_(false), transpose_a_(transpose_a), adjoint_a_(adjoint_a), -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM transpose_b_(transpose_b) { #else transpose_b_(transpose_b), @@ -573,7 +573,7 @@ struct CSRSparseSparseMatrixMatMul : GPUSPARSE(OPERATION_NON_TRANSPOSE); } -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) ~CSRSparseSparseMatrixMatMul() { if (initialized_) { cusparseDestroyCsrgemm2Info(info_); @@ -591,7 +591,7 @@ struct CSRSparseSparseMatrixMatMul TF_RETURN_IF_ERROR(descrA_.Initialize()); TF_RETURN_IF_ERROR(descrB_.Initialize()); TF_RETURN_IF_ERROR(descrC_.Initialize()); -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsrgemm2Info(&info_)); #endif initialized_ = true; @@ -600,6 +600,7 @@ struct CSRSparseSparseMatrixMatMul Status GetWorkspaceSize(const ConstCSRComponent& a, const ConstCSRComponent& b, size_t* bufferSize) { +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) DCHECK(initialized_); const int m = a.dense_shape_host(a.dense_shape_host.size() - (transpose_a_ ? 1 : 2)); @@ -621,6 +622,7 @@ struct CSRSparseSparseMatrixMatMul m, n, k, descrA_.descr(), nnzA, a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB, b.row_ptr.data(), b.col_ind.data(), info_, bufferSize)); +#endif return Status::OK(); } @@ -650,7 +652,7 @@ struct CSRSparseSparseMatrixMatMul *output_nnz = -1; -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM TF_RETURN_IF_ERROR(cuda_sparse_.CsrgemmNnz( transA_, transB_, m, n, k, descrA_.descr(), nnzA, a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB, b.row_ptr.data(), @@ -693,7 +695,7 @@ struct CSRSparseSparseMatrixMatMul b.dense_shape_host(b.dense_shape_host.size() - (transpose_b_ ? 2 : 1)); DCHECK_EQ(n, c->dense_shape_host(c->dense_shape_host.size() - 1)); -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM TF_RETURN_IF_ERROR(cuda_sparse_.Csrgemm( transA_, transB_, m, k, n, descrA_.descr(), nnzA, a.values.data(), a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB, @@ -732,7 +734,7 @@ struct CSRSparseSparseMatrixMatMul GpuSparseMatrixDescriptor descrC_; gpusparseOperation_t transA_; gpusparseOperation_t transB_; -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) csrgemm2Info_t info_; #endif };