diff --git a/tensorflow/core/kernels/cuda_sparse.h b/tensorflow/core/kernels/cuda_sparse.h
index eb69469b615..2d41cc72421 100644
--- a/tensorflow/core/kernels/cuda_sparse.h
+++ b/tensorflow/core/kernels/cuda_sparse.h
@@ -259,7 +259,7 @@ class GpuSparse {
   // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-coo2csr.
   Status Coo2csr(const int* cooRowInd, int nnz, int m, int* csrRowPtr) const;
 
-#if CUDA_VERSION < 10020
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM
   // Sparse-dense matrix multiplication C = alpha * op(A) * op(B) + beta * C,
   // where A is a sparse matrix in CSR format, B and C are dense tall
   // matrices. This routine allows transposition of matrix B, which
@@ -311,7 +311,7 @@ class GpuSparse {
   // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv_mergepath
   //
   // **NOTE** This is an in-place operation for data in y.
-#if CUDA_VERSION < 10020
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM
   template <typename Scalar>
   Status Csrmv(gpusparseOperation_t transA, int m, int n, int nnz,
                const Scalar* alpha_host, const gpusparseMatDescr_t descrA,
@@ -366,7 +366,7 @@ class GpuSparse {
                  Scalar* csrSortedValC, int* csrSortedRowPtrC,
                  int* csrSortedColIndC, void* workspace);
 
-#if CUDA_VERSION >= 10000
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
   // Computes sparse-sparse matrix multiplication of matrices
   // stored in CSR format. This is part zero: calculate required workspace
   // size.
@@ -383,7 +383,7 @@ class GpuSparse {
   // output. csrSortedRowPtrC must be preallocated on device with
   // m + 1 entries. See:
   // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm.
-#if CUDA_VERSION < 10000
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM
   Status CsrgemmNnz(gpusparseOperation_t transA, gpusparseOperation_t transB,
                     int m, int k, int n, const gpusparseMatDescr_t descrA,
                     int nnzA, const int* csrSortedRowPtrA,
@@ -408,7 +408,7 @@ class GpuSparse {
   // addition. csrValC and csrColIndC must be allocated on the device
   // with nnzTotalDevHostPtr entries (as calculated by CsrgemmNnz). See:
   // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm.
-#if CUDA_VERSION < 10000
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM
   template <typename Scalar>
   Status Csrgemm(gpusparseOperation_t transA, gpusparseOperation_t transB,
                  int m, int k, int n, const gpusparseMatDescr_t descrA,
diff --git a/tensorflow/core/kernels/sparse/mat_mul_op.cc b/tensorflow/core/kernels/sparse/mat_mul_op.cc
index a0834800446..50fa0ec88ea 100644
--- a/tensorflow/core/kernels/sparse/mat_mul_op.cc
+++ b/tensorflow/core/kernels/sparse/mat_mul_op.cc
@@ -728,12 +728,14 @@ namespace {
 template <typename T>
 struct GPUDataType;
 
+// GPUDataType templates are currently not instantiated in the ROCm flow,
+// so the #elif TENSORFLOW_USE_ROCM blocks are left out for now.
+// The hipblas library is not (yet) being pulled in via rocm_configure.bzl,
+// so types from the hipblas headers cannot be referenced here.
 template <>
 struct GPUDataType<Eigen::half> {
 #if GOOGLE_CUDA
   static constexpr cudaDataType_t type = CUDA_R_16F;
-#elif TENSORFLOW_USE_ROCM
-  static constexpr hipblasDataType_t type = HIPBLAS_R_16F;
 #endif
 };
 
@@ -741,8 +743,6 @@ template <>
 struct GPUDataType<float> {
 #if GOOGLE_CUDA
   static constexpr cudaDataType_t type = CUDA_R_32F;
-#elif TENSORFLOW_USE_ROCM
-  static constexpr hipblasDataType_t type = HIPBLAS_R_32F;
 #endif
 };
 
@@ -750,8 +750,6 @@ template <>
 struct GPUDataType<std::complex<float>> {
 #if GOOGLE_CUDA
   static constexpr cudaDataType_t type = CUDA_C_32F;
-#elif TENSORFLOW_USE_ROCM
-  static constexpr hipblasDataType_t type = HIPBLAS_C_32F;
 #endif
 };
 
@@ -759,8 +757,6 @@ template <>
 struct GPUDataType<double> {
 #if GOOGLE_CUDA
   static constexpr cudaDataType_t type = CUDA_R_64F;
-#elif TENSORFLOW_USE_ROCM
-  static constexpr hipblasDataType_t type = HIPBLAS_R_64F;
 #endif
 };
 
@@ -768,8 +764,6 @@ template <>
 struct GPUDataType<std::complex<double>> {
 #if GOOGLE_CUDA
   static constexpr cudaDataType_t type = CUDA_C_64F;
-#elif TENSORFLOW_USE_ROCM
-  static constexpr hipblasDataType_t type = HIPBLAS_C_64F;
 #endif
 };
 
@@ -957,7 +951,7 @@ class CSRSparseMatrixMatVec<GPUDevice, T> {
     const int n = a.dense_shape_host(1);
     const int nnz = a.values.size();
     DCHECK_EQ(nnz, a.col_ind.size());
-#if CUDA_VERSION >= 10020
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10020)
     TF_RETURN_IF_ERROR(cuda_sparse.Csrmv(transA_, m, n, nnz, &alpha,
                                          a.values.data(), a.row_ptr.data(),
                                          a.col_ind.data(), x, &beta, y));
diff --git a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
index 7325d5f6873..fb652e13d15 100644
--- a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
@@ -417,7 +417,7 @@ class CSRSparseMatMulGPUOp : public OpKernel {
     }
     auto b_input_dense_shape = b_input_matrix->dense_shape().vec<int64>();
 
-#if CUDA_VERSION >= 10000
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
     size_t maxWorkspaceSize = 0;
     for (int i = 0; i < batch_size; ++i) {
       // Calculate maximum workspace size over batch.
@@ -558,7 +558,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
         initialized_(false),
         transpose_a_(transpose_a),
         adjoint_a_(adjoint_a),
-#if CUDA_VERSION < 10000
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM
         transpose_b_(transpose_b) {
 #else
         transpose_b_(transpose_b),
@@ -573,7 +573,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
                           : GPUSPARSE(OPERATION_NON_TRANSPOSE);
   }
 
-#if CUDA_VERSION >= 10000
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
   ~CSRSparseSparseMatrixMatMul() {
     if (initialized_) {
       cusparseDestroyCsrgemm2Info(info_);
@@ -591,7 +591,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
     TF_RETURN_IF_ERROR(descrA_.Initialize());
     TF_RETURN_IF_ERROR(descrB_.Initialize());
     TF_RETURN_IF_ERROR(descrC_.Initialize());
-#if CUDA_VERSION >= 10000
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
     TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsrgemm2Info(&info_));
 #endif
     initialized_ = true;
@@ -600,6 +600,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
 
   Status GetWorkspaceSize(const ConstCSRComponent<T>& a,
                           const ConstCSRComponent<T>& b, size_t* bufferSize) {
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
     DCHECK(initialized_);
     const int m = a.dense_shape_host(a.dense_shape_host.size() -
                                      (transpose_a_ ? 1 : 2));
@@ -621,6 +622,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
         m, n, k, descrA_.descr(), nnzA, a.row_ptr.data(), a.col_ind.data(),
         descrB_.descr(), nnzB, b.row_ptr.data(), b.col_ind.data(), info_,
         bufferSize));
+#endif
 
     return Status::OK();
   }
@@ -650,7 +652,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
 
     *output_nnz = -1;
 
-#if CUDA_VERSION < 10000
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM
     TF_RETURN_IF_ERROR(cuda_sparse_.CsrgemmNnz(
         transA_, transB_, m, n, k, descrA_.descr(), nnzA, a.row_ptr.data(),
         a.col_ind.data(), descrB_.descr(), nnzB, b.row_ptr.data(),
@@ -693,7 +695,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
         b.dense_shape_host(b.dense_shape_host.size() - (transpose_b_ ? 2 : 1));
     DCHECK_EQ(n, c->dense_shape_host(c->dense_shape_host.size() - 1));
 
-#if CUDA_VERSION < 10000
+#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM
     TF_RETURN_IF_ERROR(cuda_sparse_.Csrgemm(
         transA_, transB_, m, k, n, descrA_.descr(), nnzA, a.values.data(),
         a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB,
@@ -732,7 +734,7 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
   GpuSparseMatrixDescriptor descrC_;
   gpusparseOperation_t transA_;
   gpusparseOperation_t transB_;
-#if CUDA_VERSION >= 10000
+#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
   csrgemm2Info_t info_;
 #endif
 };
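
For reference, below is a minimal, self-contained sketch of the guard pattern the patch converges on. Only the macro spellings (GOOGLE_CUDA, TENSORFLOW_USE_ROCM, CUDA_VERSION) are taken from the hunks above; the SPARSE_GEMM_USES_CSRGEMM2 macro and the comments are illustrative and are not part of the TensorFlow sources.

// Sketch only: shows how the rewritten guards select between the legacy
// csrgemm path (old CUDA, or ROCm/hipSPARSE) and the csrgemm2 path
// (CUDA >= 10.0). On a ROCm build CUDA_VERSION is not set by the CUDA
// toolkit headers, which is why every version comparison is now qualified
// with GOOGLE_CUDA instead of testing CUDA_VERSION alone.
#if GOOGLE_CUDA && (CUDA_VERSION >= 10000)
// cuSPARSE 10.x path: csrgemm2 with an explicit info object and workspace.
#define SPARSE_GEMM_USES_CSRGEMM2 1
#elif (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM
// Older cuSPARSE or hipSPARSE path: legacy csrgemm, no workspace needed.
#define SPARSE_GEMM_USES_CSRGEMM2 0
#endif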
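The mat_mul_op.cc hunks keep only the CUDA specializations of the GPUDataType traits. The following hedged sketch restates the same traits idiom outside the TensorFlow sources; GPUDataTypeSketch, gpuDataType_t, and ValueTypeFor are made-up stand-ins for GPUDataType, cudaDataType_t, and the CUDA_R_32F-style enumerators used in the real code.

#include <complex>

// Stand-in for the library enum (cudaDataType_t in the real code).
enum gpuDataType_t { R_16F, R_32F, R_64F, C_32F, C_64F };

// Primary template intentionally left undefined: using an unsupported
// element type becomes a compile-time error.
template <typename T>
struct GPUDataTypeSketch;

template <>
struct GPUDataTypeSketch<float> {
  static constexpr gpuDataType_t type = R_32F;
};

template <>
struct GPUDataTypeSketch<double> {
  static constexpr gpuDataType_t type = R_64F;
};

template <>
struct GPUDataTypeSketch<std::complex<float>> {
  static constexpr gpuDataType_t type = C_32F;
};

// A templated kernel wrapper can then pass GPUDataTypeSketch<T>::type as the
// runtime "value type" argument of a sparse BLAS call.
template <typename T>
constexpr gpuDataType_t ValueTypeFor() {
  return GPUDataTypeSketch<T>::type;
}

static_assert(ValueTypeFor<double>() == R_64F, "double maps to R_64F");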