Merge pull request #34800 from ROCmSoftwarePlatform:google_upstream_rocm_csr_sparse_matrix_support

PiperOrigin-RevId: 289617600
Change-Id: Ic1aa3714126d7b867295ae386b6be643c1dc83e4
Committed by TensorFlower Gardener on 2020-01-14 03:19:11 -08:00
commit 0e2b5a9d2a
35 changed files with 1031 additions and 424 deletions


@ -3480,14 +3480,18 @@ tf_kernel_library(
tf_kernel_library(
name = "cuda_sparse",
srcs = ["cuda_sparse.cc"],
srcs = if_cuda(["cuda_sparse.cc"]) + if_rocm(["rocm_sparse.cc"]),
hdrs = ["cuda_sparse.h"],
deps = [
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core/kernels:cuda_solvers",
] + if_cuda([
"//tensorflow/stream_executor/cuda:cusparse_lib",
] + if_cuda(["@cub_archive//:cub"]),
"@cub_archive//:cub",
]) + if_rocm([
"@local_config_rocm//rocm:hipsparse",
]),
)
LINALG_DEPS = [


@ -69,7 +69,7 @@ inline typename CudaComplexT<T>::type* AsCudaComplex(T* p) {
}
// A set of initialized handles to the underlying Cuda libraries used by
// CudaSparse. We maintain one such set of handles per unique stream.
// GpuSparse. We maintain one such set of handles per unique stream.
class CudaSparseHandles {
public:
explicit CudaSparseHandles(cudaStream_t stream)
@ -96,8 +96,8 @@ class CudaSparseHandles {
Status Initialize() {
if (initialized_) return Status::OK();
TF_RETURN_IF_CUSPARSE_ERROR(cusparseCreate(&cusparse_handle_));
TF_RETURN_IF_CUSPARSE_ERROR(cusparseSetStream(cusparse_handle_, stream_));
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreate(&cusparse_handle_));
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseSetStream(cusparse_handle_, stream_));
initialized_ = true;
return Status::OK();
}
@ -149,7 +149,7 @@ HandleMap* GetHandleMapSingleton() {
} // namespace
CudaSparse::CudaSparse(OpKernelContext* context)
GpuSparse::GpuSparse(OpKernelContext* context)
: initialized_(false), context_(context) {
auto cuda_stream_ptr =
reinterpret_cast<const cudaStream_t*>(context->op_device_context()
@ -157,25 +157,24 @@ CudaSparse::CudaSparse(OpKernelContext* context)
->implementation()
->GpuStreamMemberHack());
DCHECK(cuda_stream_ptr);
cuda_stream_ = *cuda_stream_ptr;
gpu_stream_ = *cuda_stream_ptr;
}
Status CudaSparse::Initialize() {
Status GpuSparse::Initialize() {
HandleMap* handle_map = GetHandleMapSingleton();
DCHECK(handle_map);
mutex_lock lock(handle_map_mutex);
auto it = handle_map->find(cuda_stream_);
auto it = handle_map->find(gpu_stream_);
if (it == handle_map->end()) {
LOG(INFO) << "Creating CudaSparse handles for stream " << cuda_stream_;
LOG(INFO) << "Creating CudaSparse handles for stream " << gpu_stream_;
// Previously unseen Cuda stream. Initialize a set of Cuda sparse library
// handles for it.
CudaSparseHandles new_handles(cuda_stream_);
CudaSparseHandles new_handles(gpu_stream_);
TF_RETURN_IF_ERROR(new_handles.Initialize());
it =
handle_map->insert(std::make_pair(cuda_stream_, std::move(new_handles)))
.first;
it = handle_map->insert(std::make_pair(gpu_stream_, std::move(new_handles)))
.first;
}
cusparse_handle_ = &it->second.handle();
gpusparse_handle_ = &it->second.handle();
initialized_ = true;
return Status::OK();
}
@ -205,32 +204,32 @@ template <typename Scalar, typename SparseFn>
static inline Status GtsvImpl(SparseFn op, cusparseHandle_t cusparse_handle,
int m, int n, const Scalar* dl, const Scalar* d,
const Scalar* du, Scalar* B, int ldb) {
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, n, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(B), ldb));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, n, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(B), ldb));
return Status::OK();
}
#define GTSV_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv<Scalar>(int m, int n, const Scalar* dl, \
const Scalar* d, const Scalar* du, \
Scalar* B, int ldb) const { \
DCHECK(initialized_); \
return GtsvImpl(SPARSE_FN(gtsv, sparse_prefix), *cusparse_handle_, m, n, \
dl, d, du, B, ldb); \
#define GTSV_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Gtsv<Scalar>(int m, int n, const Scalar* dl, \
const Scalar* d, const Scalar* du, Scalar* B, \
int ldb) const { \
DCHECK(initialized_); \
return GtsvImpl(SPARSE_FN(gtsv, sparse_prefix), *gpusparse_handle_, m, n, \
dl, d, du, B, ldb); \
}
TF_CALL_LAPACK_TYPES(GTSV_INSTANCE);
#define GTSV_NO_PIVOT_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::GtsvNoPivot<Scalar>(int m, int n, const Scalar* dl, \
const Scalar* d, const Scalar* du, \
Scalar* B, int ldb) const { \
DCHECK(initialized_); \
return GtsvImpl(SPARSE_FN(gtsv_nopivot, sparse_prefix), *cusparse_handle_, \
m, n, dl, d, du, B, ldb); \
#define GTSV_NO_PIVOT_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::GtsvNoPivot<Scalar>(int m, int n, const Scalar* dl, \
const Scalar* d, const Scalar* du, \
Scalar* B, int ldb) const { \
DCHECK(initialized_); \
return GtsvImpl(SPARSE_FN(gtsv_nopivot, sparse_prefix), \
*gpusparse_handle_, m, n, dl, d, du, B, ldb); \
}
TF_CALL_LAPACK_TYPES(GTSV_NO_PIVOT_INSTANCE);
@ -242,20 +241,20 @@ static inline Status GtsvStridedBatchImpl(SparseFn op,
const Scalar* d, const Scalar* du,
Scalar* x, int batchCount,
int batchStride) {
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(x), batchCount, batchStride));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(x), batchCount, batchStride));
return Status::OK();
}
#define GTSV_STRIDED_BATCH_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::GtsvStridedBatch<Scalar>( \
Status GpuSparse::GtsvStridedBatch<Scalar>( \
int m, const Scalar* dl, const Scalar* d, const Scalar* du, Scalar* x, \
int batchCount, int batchStride) const { \
DCHECK(initialized_); \
return GtsvStridedBatchImpl(SPARSE_FN(gtsvStridedBatch, sparse_prefix), \
*cusparse_handle_, m, dl, d, du, x, \
*gpusparse_handle_, m, dl, d, du, x, \
batchCount, batchStride); \
}
@ -266,32 +265,32 @@ static inline Status Gtsv2Impl(SparseFn op, cusparseHandle_t cusparse_handle,
int m, int n, const Scalar* dl, const Scalar* d,
const Scalar* du, Scalar* B, int ldb,
void* pBuffer) {
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, n, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(B), ldb, pBuffer));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, n, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(B), ldb, pBuffer));
return Status::OK();
}
#define GTSV2_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv2<Scalar>(int m, int n, const Scalar* dl, \
const Scalar* d, const Scalar* du, \
Scalar* B, int ldb, void* pBuffer) const { \
DCHECK(initialized_); \
return Gtsv2Impl(SPARSE_FN(gtsv2, sparse_prefix), *cusparse_handle_, m, n, \
dl, d, du, B, ldb, pBuffer); \
#define GTSV2_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Gtsv2<Scalar>(int m, int n, const Scalar* dl, \
const Scalar* d, const Scalar* du, \
Scalar* B, int ldb, void* pBuffer) const { \
DCHECK(initialized_); \
return Gtsv2Impl(SPARSE_FN(gtsv2, sparse_prefix), *gpusparse_handle_, m, \
n, dl, d, du, B, ldb, pBuffer); \
}
TF_CALL_LAPACK_TYPES(GTSV2_INSTANCE);
#define GTSV2_NO_PIVOT_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv2NoPivot<Scalar>( \
int m, int n, const Scalar* dl, const Scalar* d, const Scalar* du, \
Scalar* B, int ldb, void* pBuffer) const { \
DCHECK(initialized_); \
return Gtsv2Impl(SPARSE_FN(gtsv2_nopivot, sparse_prefix), \
*cusparse_handle_, m, n, dl, d, du, B, ldb, pBuffer); \
#define GTSV2_NO_PIVOT_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Gtsv2NoPivot<Scalar>( \
int m, int n, const Scalar* dl, const Scalar* d, const Scalar* du, \
Scalar* B, int ldb, void* pBuffer) const { \
DCHECK(initialized_); \
return Gtsv2Impl(SPARSE_FN(gtsv2_nopivot, sparse_prefix), \
*gpusparse_handle_, m, n, dl, d, du, B, ldb, pBuffer); \
}
TF_CALL_LAPACK_TYPES(GTSV2_NO_PIVOT_INSTANCE);
@ -303,34 +302,34 @@ static inline Status Gtsv2BufferSizeExtImpl(SparseFn op,
const Scalar* d, const Scalar* du,
const Scalar* B, int ldb,
size_t* bufferSizeInBytes) {
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, n, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(B), ldb, bufferSizeInBytes));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, n, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(B), ldb, bufferSizeInBytes));
return Status::OK();
}
#define GTSV2_BUFFER_SIZE_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv2BufferSizeExt<Scalar>( \
int m, int n, const Scalar* dl, const Scalar* d, const Scalar* du, \
const Scalar* B, int ldb, size_t* bufferSizeInBytes) const { \
DCHECK(initialized_); \
return Gtsv2BufferSizeExtImpl( \
SPARSE_FN(gtsv2_bufferSizeExt, sparse_prefix), *cusparse_handle_, m, \
n, dl, d, du, B, ldb, bufferSizeInBytes); \
#define GTSV2_BUFFER_SIZE_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Gtsv2BufferSizeExt<Scalar>( \
int m, int n, const Scalar* dl, const Scalar* d, const Scalar* du, \
const Scalar* B, int ldb, size_t* bufferSizeInBytes) const { \
DCHECK(initialized_); \
return Gtsv2BufferSizeExtImpl( \
SPARSE_FN(gtsv2_bufferSizeExt, sparse_prefix), *gpusparse_handle_, m, \
n, dl, d, du, B, ldb, bufferSizeInBytes); \
}
TF_CALL_LAPACK_TYPES(GTSV2_BUFFER_SIZE_INSTANCE);
#define GTSV2_NO_PIVOT_BUFFER_SIZE_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv2NoPivotBufferSizeExt<Scalar>( \
Status GpuSparse::Gtsv2NoPivotBufferSizeExt<Scalar>( \
int m, int n, const Scalar* dl, const Scalar* d, const Scalar* du, \
const Scalar* B, int ldb, size_t* bufferSizeInBytes) const { \
DCHECK(initialized_); \
return Gtsv2BufferSizeExtImpl( \
SPARSE_FN(gtsv2_nopivot_bufferSizeExt, sparse_prefix), \
*cusparse_handle_, m, n, dl, d, du, B, ldb, bufferSizeInBytes); \
*gpusparse_handle_, m, n, dl, d, du, B, ldb, bufferSizeInBytes); \
}
TF_CALL_LAPACK_TYPES(GTSV2_NO_PIVOT_BUFFER_SIZE_INSTANCE);
@ -342,7 +341,7 @@ static inline Status Gtsv2StridedBatchImpl(SparseFn op,
const Scalar* d, const Scalar* du,
Scalar* x, int batchCount,
int batchStride, void* pBuffer) {
TF_RETURN_IF_CUSPARSE_ERROR(op(
TF_RETURN_IF_GPUSPARSE_ERROR(op(
cusparse_handle, m, AsCudaComplex(dl), AsCudaComplex(d),
AsCudaComplex(du), AsCudaComplex(x), batchCount, batchStride, pBuffer));
return Status::OK();
@ -350,12 +349,12 @@ static inline Status Gtsv2StridedBatchImpl(SparseFn op,
#define GTSV2_STRIDED_BATCH_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv2StridedBatch<Scalar>( \
Status GpuSparse::Gtsv2StridedBatch<Scalar>( \
int m, const Scalar* dl, const Scalar* d, const Scalar* du, Scalar* x, \
int batchCount, int batchStride, void* pBuffer) const { \
DCHECK(initialized_); \
return Gtsv2StridedBatchImpl(SPARSE_FN(gtsv2StridedBatch, sparse_prefix), \
*cusparse_handle_, m, dl, d, du, x, \
*gpusparse_handle_, m, dl, d, du, x, \
batchCount, batchStride, pBuffer); \
}
@ -366,30 +365,30 @@ static inline Status Gtsv2StridedBatchBufferSizeImpl(
SparseFn op, cusparseHandle_t cusparse_handle, int m, const Scalar* dl,
const Scalar* d, const Scalar* du, const Scalar* x, int batchCount,
int batchStride, size_t* bufferSizeInBytes) {
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(x), batchCount, batchStride,
bufferSizeInBytes));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, AsCudaComplex(dl),
AsCudaComplex(d), AsCudaComplex(du),
AsCudaComplex(x), batchCount, batchStride,
bufferSizeInBytes));
return Status::OK();
}
#define GTSV2_STRIDED_BATCH_BUFFER_SIZE_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Gtsv2StridedBatchBufferSizeExt<Scalar>( \
Status GpuSparse::Gtsv2StridedBatchBufferSizeExt<Scalar>( \
int m, const Scalar* dl, const Scalar* d, const Scalar* du, \
const Scalar* x, int batchCount, int batchStride, \
size_t* bufferSizeInBytes) const { \
DCHECK(initialized_); \
return Gtsv2StridedBatchBufferSizeImpl( \
SPARSE_FN(gtsv2StridedBatch_bufferSizeExt, sparse_prefix), \
*cusparse_handle_, m, dl, d, du, x, batchCount, batchStride, \
*gpusparse_handle_, m, dl, d, du, x, batchCount, batchStride, \
bufferSizeInBytes); \
}
TF_CALL_LAPACK_TYPES(GTSV2_STRIDED_BATCH_BUFFER_SIZE_INSTANCE);
Status CudaSparse::Coo2csr(const int* cooRowInd, int nnz, int m,
int* csrRowPtr) const {
Status GpuSparse::Coo2csr(const int* cooRowInd, int nnz, int m,
int* csrRowPtr) const {
// cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
// const int *cooRowInd,
// int nnz,
@ -398,14 +397,14 @@ Status CudaSparse::Coo2csr(const int* cooRowInd, int nnz, int m,
// cusparseIndexBase_t
// idxBase);
DCHECK(initialized_);
TF_RETURN_IF_CUSPARSE_ERROR(cusparseXcoo2csr(*cusparse_handle_, cooRowInd,
nnz, m, csrRowPtr,
CUSPARSE_INDEX_BASE_ZERO));
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseXcoo2csr(*gpusparse_handle_, cooRowInd,
nnz, m, csrRowPtr,
CUSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}
Status CudaSparse::Csr2coo(const int* csrRowPtr, int nnz, int m,
int* cooRowInd) const {
Status GpuSparse::Csr2coo(const int* csrRowPtr, int nnz, int m,
int* cooRowInd) const {
// cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
// const int *csrRowPtr,
// int nnz,
@ -414,26 +413,26 @@ Status CudaSparse::Csr2coo(const int* csrRowPtr, int nnz, int m,
// cusparseIndexBase_t
// idxBase);
DCHECK(initialized_);
TF_RETURN_IF_CUSPARSE_ERROR(cusparseXcsr2coo(*cusparse_handle_, csrRowPtr,
nnz, m, cooRowInd,
CUSPARSE_INDEX_BASE_ZERO));
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseXcsr2coo(*gpusparse_handle_, csrRowPtr,
nnz, m, cooRowInd,
CUSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}
Status CudaSparse::CsrgeamNnz(int m, int n, const cusparseMatDescr_t descrA,
int nnzA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA,
const cusparseMatDescr_t descrB, int nnzB,
const int* csrSortedRowPtrB,
const int* csrSortedColIndB,
const cusparseMatDescr_t descrC,
int* csrSortedRowPtrC, int* nnzTotalDevHostPtr) {
Status GpuSparse::CsrgeamNnz(int m, int n, const cusparseMatDescr_t descrA,
int nnzA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA,
const cusparseMatDescr_t descrB, int nnzB,
const int* csrSortedRowPtrB,
const int* csrSortedColIndB,
const cusparseMatDescr_t descrC,
int* csrSortedRowPtrC, int* nnzTotalDevHostPtr) {
DCHECK(initialized_);
DCHECK(nnzTotalDevHostPtr != nullptr);
TF_RETURN_IF_CUSPARSE_ERROR(cusparseXcsrgeamNnz(
*cusparse_handle_, m, n, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA,
descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, descrC,
csrSortedRowPtrC, nnzTotalDevHostPtr));
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseXcsrgeamNnz(
*gpusparse_handle_, m, n, descrA, nnzA, csrSortedRowPtrA,
csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB,
descrC, csrSortedRowPtrC, nnzTotalDevHostPtr));
return Status::OK();
}
@ -452,7 +451,7 @@ static inline Status CsrmmImpl(
// const float* csrSortedValA, const int* csrSortedRowPtrA,
// const int* csrSortedColIndA, const float* B, int ldb, const float*
// beta, float* C, int ldc);
TF_RETURN_IF_CUSPARSE_ERROR(op(
TF_RETURN_IF_GPUSPARSE_ERROR(op(
cusparse_handle, transA, transB, m, n, k, nnz, AsCudaComplex(alpha_host),
descrA, AsCudaComplex(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
AsCudaComplex(B), ldb, AsCudaComplex(beta_host), AsCudaComplex(C), ldc));
@ -461,7 +460,7 @@ static inline Status CsrmmImpl(
#define CSRMM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Csrmm<Scalar>( \
Status GpuSparse::Csrmm<Scalar>( \
cusparseOperation_t transA, cusparseOperation_t transB, int m, int n, \
int k, int nnz, const Scalar* alpha_host, \
const cusparseMatDescr_t descrA, const Scalar* csrSortedValA, \
@ -470,7 +469,7 @@ static inline Status CsrmmImpl(
const { \
DCHECK(initialized_); \
return CsrmmImpl(SPARSE_FN(csrmm2, sparse_prefix), context_, \
*cusparse_handle_, transA, transB, m, n, k, nnz, \
*gpusparse_handle_, transA, transB, m, n, k, nnz, \
alpha_host, descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, B, ldb, beta_host, C, ldc); \
}
@ -484,7 +483,7 @@ static inline Status CsrmvImpl(
const cusparseMatDescr_t descrA, const Scalar* csrSortedValA,
const int* csrSortedRowPtrA, const int* csrSortedColIndA, const Scalar* x,
const Scalar* beta_host, Scalar* y) {
TF_RETURN_IF_CUSPARSE_ERROR(
TF_RETURN_IF_GPUSPARSE_ERROR(
op(cusparse_handle, transA, m, n, nnz, AsCudaComplex(alpha_host), descrA,
AsCudaComplex(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
AsCudaComplex(x), AsCudaComplex(beta_host), AsCudaComplex(y)));
@ -494,7 +493,7 @@ static inline Status CsrmvImpl(
// TODO(ebrevdo,rmlarsen): Use csrmv_mp for all cases when available in CUDA 9.
#define CSRMV_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Csrmv<Scalar>( \
Status GpuSparse::Csrmv<Scalar>( \
cusparseOperation_t transA, int m, int n, int nnz, \
const Scalar* alpha_host, const cusparseMatDescr_t descrA, \
const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
@ -503,12 +502,12 @@ static inline Status CsrmvImpl(
DCHECK(initialized_); \
if (transA == CUSPARSE_OPERATION_NON_TRANSPOSE) { \
return CsrmvImpl(SPARSE_FN(csrmv_mp, sparse_prefix), context_, \
*cusparse_handle_, transA, m, n, nnz, alpha_host, \
*gpusparse_handle_, transA, m, n, nnz, alpha_host, \
descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, x, beta_host, y); \
} else { \
return CsrmvImpl(SPARSE_FN(csrmv, sparse_prefix), context_, \
*cusparse_handle_, transA, m, n, nnz, alpha_host, \
*gpusparse_handle_, transA, m, n, nnz, alpha_host, \
descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, x, beta_host, y); \
} \
@ -526,7 +525,7 @@ static inline Status CsrgeamImpl(
const int* csrSortedRowPtrB, const int* csrSortedColIndB,
const cusparseMatDescr_t descrC, Scalar* csrSortedValC,
int* csrSortedRowPtrC, int* csrSortedColIndC) {
TF_RETURN_IF_CUSPARSE_ERROR(
TF_RETURN_IF_GPUSPARSE_ERROR(
op(cusparse_handle, m, n, AsCudaComplex(alpha), descrA, nnzA,
AsCudaComplex(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
AsCudaComplex(beta), descrB, nnzB, AsCudaComplex(csrSortedValB),
@ -537,7 +536,7 @@ static inline Status CsrgeamImpl(
#define CSRGEAM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Csrgeam<Scalar>( \
Status GpuSparse::Csrgeam<Scalar>( \
int m, int n, const Scalar* alpha, const cusparseMatDescr_t descrA, \
int nnzA, const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
const int* csrSortedColIndA, const Scalar* beta, \
@ -547,7 +546,7 @@ static inline Status CsrgeamImpl(
int* csrSortedRowPtrC, int* csrSortedColIndC) { \
DCHECK(initialized_); \
return CsrgeamImpl(SPARSE_FN(csrgeam, sparse_prefix), context_, \
*cusparse_handle_, m, n, alpha, descrA, nnzA, \
*gpusparse_handle_, m, n, alpha, descrA, nnzA, \
csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, \
beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, \
csrSortedColIndB, descrC, csrSortedValC, \
@ -556,7 +555,7 @@ static inline Status CsrgeamImpl(
TF_CALL_LAPACK_TYPES(CSRGEAM_INSTANCE);
Status CudaSparse::CsrgemmNnz(
Status GpuSparse::CsrgemmNnz(
cusparseOperation_t transA, cusparseOperation_t transB, int m, int k, int n,
const cusparseMatDescr_t descrA, int nnzA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
@ -565,8 +564,8 @@ Status CudaSparse::CsrgemmNnz(
int* nnzTotalDevHostPtr) {
DCHECK(initialized_);
DCHECK(nnzTotalDevHostPtr != nullptr);
TF_RETURN_IF_CUSPARSE_ERROR(cusparseXcsrgemmNnz(
*cusparse_handle_, transA, transB, m, k, n, descrA, nnzA,
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseXcsrgemmNnz(
*gpusparse_handle_, transA, transB, m, k, n, descrA, nnzA,
csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
csrSortedColIndB, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr));
return Status::OK();
@ -582,7 +581,7 @@ static inline Status CsrgemmImpl(
const int* csrSortedRowPtrB, const int* csrSortedColIndB,
const cusparseMatDescr_t descrC, Scalar* csrSortedValC,
int* csrSortedRowPtrC, int* csrSortedColIndC) {
TF_RETURN_IF_CUSPARSE_ERROR(
TF_RETURN_IF_GPUSPARSE_ERROR(
op(cusparse_handle, transA, transB, m, k, n, descrA, nnzA,
AsCudaComplex(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
descrB, nnzB, AsCudaComplex(csrSortedValB), csrSortedRowPtrB,
@ -593,7 +592,7 @@ static inline Status CsrgemmImpl(
#define CSRGEMM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Csrgemm<Scalar>( \
Status GpuSparse::Csrgemm<Scalar>( \
cusparseOperation_t transA, cusparseOperation_t transB, int m, int k, \
int n, const cusparseMatDescr_t descrA, int nnzA, \
const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
@ -603,7 +602,7 @@ static inline Status CsrgemmImpl(
Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC) { \
DCHECK(initialized_); \
return CsrgemmImpl(SPARSE_FN(csrgemm, sparse_prefix), context_, \
*cusparse_handle_, transA, transB, m, k, n, descrA, \
*gpusparse_handle_, transA, transB, m, k, n, descrA, \
nnzA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, descrB, nnzB, csrSortedValB, \
csrSortedRowPtrB, csrSortedColIndB, descrC, \
@ -620,12 +619,12 @@ static inline Status Csru2csrImpl(SparseFnT op, BufferSizeFnT buffer_size_op,
const cusparseMatDescr_t descrA,
Scalar* csrVal, const int* csrRowPtr,
int* csrColInd) {
CudaSparseCsrSortingConversionInfo info;
GpuSparseCsrSortingConversionInfo info;
TF_RETURN_IF_ERROR(info.Initialize());
size_t pBufferSizeInBytes = 0;
TF_RETURN_IF_CUSPARSE_ERROR(
TF_RETURN_IF_GPUSPARSE_ERROR(
buffer_size_op(cusparse_handle, m, n, nnz, AsCudaComplex(csrVal),
csrRowPtr, csrColInd, info.info(), &pBufferSizeInBytes));
@ -636,22 +635,22 @@ static inline Status Csru2csrImpl(SparseFnT op, BufferSizeFnT buffer_size_op,
auto pBuffer = pBuffer_t.flat<int8>();
DCHECK(pBuffer.data() != nullptr);
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, n, nnz, descrA,
AsCudaComplex(csrVal), csrRowPtr, csrColInd,
info.info(), pBuffer.data()));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, n, nnz, descrA,
AsCudaComplex(csrVal), csrRowPtr, csrColInd,
info.info(), pBuffer.data()));
return Status::OK();
}
#define CSRU2CSR_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Csru2csr<Scalar>( \
Status GpuSparse::Csru2csr<Scalar>( \
int m, int n, int nnz, const cusparseMatDescr_t descrA, Scalar* csrVal, \
const int* csrRowPtr, int* csrColInd) { \
DCHECK(initialized_); \
return Csru2csrImpl(SPARSE_FN(csru2csr, sparse_prefix), \
BUFSIZE_FN(csru2csr, sparse_prefix), context_, \
*cusparse_handle_, m, n, nnz, descrA, csrVal, \
*gpusparse_handle_, m, n, nnz, descrA, csrVal, \
csrRowPtr, csrColInd); \
}
@ -664,22 +663,22 @@ static inline Status Csr2cscImpl(SparseFnT op, OpKernelContext* context,
const int* csrRowPtr, const int* csrColInd,
Scalar* cscVal, int* cscRowInd, int* cscColPtr,
const cusparseAction_t copyValues) {
TF_RETURN_IF_CUSPARSE_ERROR(op(cusparse_handle, m, n, nnz,
AsCudaComplex(csrVal), csrRowPtr, csrColInd,
AsCudaComplex(cscVal), cscRowInd, cscColPtr,
copyValues, CUSPARSE_INDEX_BASE_ZERO));
TF_RETURN_IF_GPUSPARSE_ERROR(op(cusparse_handle, m, n, nnz,
AsCudaComplex(csrVal), csrRowPtr, csrColInd,
AsCudaComplex(cscVal), cscRowInd, cscColPtr,
copyValues, CUSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}
#define CSR2CSC_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status CudaSparse::Csr2csc<Scalar>( \
Status GpuSparse::Csr2csc<Scalar>( \
int m, int n, int nnz, const Scalar* csrVal, const int* csrRowPtr, \
const int* csrColInd, Scalar* cscVal, int* cscRowInd, int* cscColPtr, \
const cusparseAction_t copyValues) { \
DCHECK(initialized_); \
return Csr2cscImpl(SPARSE_FN(csr2csc, sparse_prefix), context_, \
*cusparse_handle_, m, n, nnz, csrVal, csrRowPtr, \
*gpusparse_handle_, m, n, nnz, csrVal, csrRowPtr, \
csrColInd, cscVal, cscRowInd, cscColPtr, copyValues); \
}
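// For reference: in cuSPARSE the copyValues argument above is typically
// CUSPARSE_ACTION_NUMERIC (convert both the sparsity pattern and the values)
// or CUSPARSE_ACTION_SYMBOLIC (convert the pattern only); converting an
// m x n CSR matrix to CSC this way is equivalent to materializing its
// transpose in CSR form.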


@ -16,15 +16,38 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_CUDA_SPARSE_H_
#define TENSORFLOW_CORE_KERNELS_CUDA_SPARSE_H_
// This header declares the class CudaSparse, which contains wrappers of
// This header declares the class GpuSparse, which contains wrappers of
// cuSparse libraries for use in TensorFlow kernels.
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include <functional>
#include <vector>
#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cusparse.h"
using gpusparseStatus_t = cusparseStatus_t;
using gpusparseOperation_t = cusparseOperation_t;
using gpusparseMatDescr_t = cusparseMatDescr_t;
using gpusparseAction_t = cusparseAction_t;
using gpusparseHandle_t = cusparseHandle_t;
using gpuStream_t = cudaStream_t;
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/hipsparse/hipsparse.h"
using gpusparseStatus_t = hipsparseStatus_t;
using gpusparseOperation_t = hipsparseOperation_t;
using gpusparseMatDescr_t = hipsparseMatDescr_t;
using gpusparseAction_t = hipsparseAction_t;
using gpusparseHandle_t = hipsparseHandle_t;
using gpuStream_t = hipStream_t;
#endif
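// With these aliases, the wrappers declared below are written once against
// the gpusparse* / gpuStream_t names and compile against either cuSPARSE
// (CUDA) or hipSPARSE (ROCm).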
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
@ -40,13 +63,15 @@ limitations under the License.
namespace tensorflow {
inline string ConvertCUSparseErrorToString(const cusparseStatus_t status) {
inline string ConvertGPUSparseErrorToString(const gpusparseStatus_t status) {
switch (status) {
#define STRINGIZE(q) #q
#define RETURN_IF_STATUS(err) \
case err: \
return STRINGIZE(err);
#if GOOGLE_CUDA
RETURN_IF_STATUS(CUSPARSE_STATUS_SUCCESS)
RETURN_IF_STATUS(CUSPARSE_STATUS_NOT_INITIALIZED)
RETURN_IF_STATUS(CUSPARSE_STATUS_ALLOC_FAILED)
@ -57,27 +82,62 @@ inline string ConvertCUSparseErrorToString(const cusparseStatus_t status) {
RETURN_IF_STATUS(CUSPARSE_STATUS_INTERNAL_ERROR)
RETURN_IF_STATUS(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED)
#undef RETURN_IF_STATUS
#undef STRINGIZE
default:
return strings::StrCat("Unknown CUSPARSE error: ",
static_cast<int>(status));
#elif TENSORFLOW_USE_ROCM
RETURN_IF_STATUS(HIPSPARSE_STATUS_SUCCESS)
RETURN_IF_STATUS(HIPSPARSE_STATUS_NOT_INITIALIZED)
RETURN_IF_STATUS(HIPSPARSE_STATUS_ALLOC_FAILED)
RETURN_IF_STATUS(HIPSPARSE_STATUS_INVALID_VALUE)
RETURN_IF_STATUS(HIPSPARSE_STATUS_ARCH_MISMATCH)
RETURN_IF_STATUS(HIPSPARSE_STATUS_MAPPING_ERROR)
RETURN_IF_STATUS(HIPSPARSE_STATUS_EXECUTION_FAILED)
RETURN_IF_STATUS(HIPSPARSE_STATUS_INTERNAL_ERROR)
RETURN_IF_STATUS(HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED)
RETURN_IF_STATUS(HIPSPARSE_STATUS_ZERO_PIVOT)
default:
return strings::StrCat("Unknown hipSPARSE error: ",
static_cast<int>(status));
#endif
#undef RETURN_IF_STATUS
#undef STRINGIZE
}
}
#define TF_RETURN_IF_CUSPARSE_ERROR(expr) \
#if GOOGLE_CUDA
#define TF_RETURN_IF_GPUSPARSE_ERROR(expr) \
do { \
auto status = (expr); \
if (TF_PREDICT_FALSE(status != CUSPARSE_STATUS_SUCCESS)) { \
return errors::Internal(__FILE__, ":", __LINE__, " (", TF_STR(expr), \
"): cuSparse call failed with status ", \
ConvertCUSparseErrorToString(status)); \
ConvertGPUSparseErrorToString(status)); \
} \
} while (0)
inline cusparseOperation_t TransposeAndConjugateToCuSparseOp(bool transpose,
bool conjugate,
Status* status) {
#elif TENSORFLOW_USE_ROCM
#define TF_RETURN_IF_GPUSPARSE_ERROR(expr) \
do { \
auto status = (expr); \
if (TF_PREDICT_FALSE(status != HIPSPARSE_STATUS_SUCCESS)) { \
return errors::Internal(__FILE__, ":", __LINE__, " (", TF_STR(expr), \
"): hipSPARSE call failed with status ", \
ConvertGPUSparseErrorToString(status)); \
} \
} while (0)
#endif
inline gpusparseOperation_t TransposeAndConjugateToGpuSparseOp(bool transpose,
bool conjugate,
Status* status) {
#if GOOGLE_CUDA
if (transpose) {
return conjugate ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
: CUSPARSE_OPERATION_TRANSPOSE;
@ -89,25 +149,38 @@ inline cusparseOperation_t TransposeAndConjugateToCuSparseOp(bool transpose,
}
return CUSPARSE_OPERATION_NON_TRANSPOSE;
}
#elif TENSORFLOW_USE_ROCM
if (transpose) {
return conjugate ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
: HIPSPARSE_OPERATION_TRANSPOSE;
} else {
if (conjugate) {
DCHECK(status != nullptr);
*status = errors::InvalidArgument(
"Conjugate == True and transpose == False is not supported.");
}
return HIPSPARSE_OPERATION_NON_TRANSPOSE;
}
#endif
}
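// Minimal usage sketch for TransposeAndConjugateToGpuSparseOp above, assuming
// a kernel with context `ctx` and boolean attributes `transpose_a` /
// `adjoint_a` (hypothetical names):
//
//   Status status;
//   const gpusparseOperation_t transA =
//       TransposeAndConjugateToGpuSparseOp(transpose_a, adjoint_a, &status);
//   OP_REQUIRES_OK(ctx, status);  // rejects conjugate-without-transpose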
// The CudaSparse class provides a simplified templated API for cuSparse
// The GpuSparse class provides a simplified templated API for cuSparse
// (http://docs.nvidia.com/cuda/cusparse/index.html).
// An object of this class wraps static cuSparse instances,
// and will launch Cuda kernels on the stream wrapped by the GPU device
// in the OpKernelContext provided to the constructor.
//
// Notice: All the computational member functions are asynchronous and simply
// launch one or more Cuda kernels on the Cuda stream wrapped by the CudaSparse
// launch one or more Cuda kernels on the Cuda stream wrapped by the GpuSparse
// object.
class CudaSparse {
class GpuSparse {
public:
// This object stores a pointer to context, which must outlive it.
explicit CudaSparse(OpKernelContext* context);
virtual ~CudaSparse() {}
explicit GpuSparse(OpKernelContext* context);
virtual ~GpuSparse() {}
// This initializes the CudaSparse class if it hasn't
// This initializes the GpuSparse class if it hasn't
// been initialized yet. All following public methods require the
// class has been initialized. Can be run multiple times; all
// subsequent calls after the first have no effect.
@ -218,9 +291,9 @@ class CudaSparse {
//
// **NOTE** This is an in-place operation for data in C.
template <typename Scalar>
Status Csrmm(cusparseOperation_t transA, cusparseOperation_t transB, int m,
Status Csrmm(gpusparseOperation_t transA, gpusparseOperation_t transB, int m,
int n, int k, int nnz, const Scalar* alpha_host,
const cusparseMatDescr_t descrA, const Scalar* csrSortedValA,
const gpusparseMatDescr_t descrA, const Scalar* csrSortedValA,
const int* csrSortedRowPtrA, const int* csrSortedColIndA,
const Scalar* B, int ldb, const Scalar* beta_host, Scalar* C,
int ldc) const;
@ -231,8 +304,8 @@ class CudaSparse {
//
// **NOTE** This is an in-place operation for data in y.
template <typename Scalar>
Status Csrmv(cusparseOperation_t transA, int m, int n, int nnz,
const Scalar* alpha_host, const cusparseMatDescr_t descrA,
Status Csrmv(gpusparseOperation_t transA, int m, int n, int nnz,
const Scalar* alpha_host, const gpusparseMatDescr_t descrA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const Scalar* x,
const Scalar* beta_host, Scalar* y) const;
@ -242,11 +315,11 @@ class CudaSparse {
// output. csrSortedRowPtrC must be preallocated on device with
// m + 1 entries. See:
// http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgeam.
Status CsrgeamNnz(int m, int n, const cusparseMatDescr_t descrA, int nnzA,
Status CsrgeamNnz(int m, int n, const gpusparseMatDescr_t descrA, int nnzA,
const int* csrSortedRowPtrA, const int* csrSortedColIndA,
const cusparseMatDescr_t descrB, int nnzB,
const gpusparseMatDescr_t descrB, int nnzB,
const int* csrSortedRowPtrB, const int* csrSortedColIndB,
const cusparseMatDescr_t descrC, int* csrSortedRowPtrC,
const gpusparseMatDescr_t descrC, int* csrSortedRowPtrC,
int* nnzTotalDevHostPtr);
// Computes sparse - sparse matrix addition of matrices
@ -256,12 +329,12 @@ class CudaSparse {
// http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgeam.
template <typename Scalar>
Status Csrgeam(int m, int n, const Scalar* alpha,
const cusparseMatDescr_t descrA, int nnzA,
const gpusparseMatDescr_t descrA, int nnzA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const Scalar* beta,
const cusparseMatDescr_t descrB, int nnzB,
const gpusparseMatDescr_t descrB, int nnzB,
const Scalar* csrSortedValB, const int* csrSortedRowPtrB,
const int* csrSortedColIndB, const cusparseMatDescr_t descrC,
const int* csrSortedColIndB, const gpusparseMatDescr_t descrC,
Scalar* csrSortedValC, int* csrSortedRowPtrC,
int* csrSortedColIndC);
@ -270,13 +343,13 @@ class CudaSparse {
// output. csrSortedRowPtrC must be preallocated on device with
// m + 1 entries. See:
// http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm.
Status CsrgemmNnz(cusparseOperation_t transA, cusparseOperation_t transB,
int m, int k, int n, const cusparseMatDescr_t descrA,
Status CsrgemmNnz(gpusparseOperation_t transA, gpusparseOperation_t transB,
int m, int k, int n, const gpusparseMatDescr_t descrA,
int nnzA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA,
const cusparseMatDescr_t descrB, int nnzB,
const gpusparseMatDescr_t descrB, int nnzB,
const int* csrSortedRowPtrB, const int* csrSortedColIndB,
const cusparseMatDescr_t descrC, int* csrSortedRowPtrC,
const gpusparseMatDescr_t descrC, int* csrSortedRowPtrC,
int* nnzTotalDevHostPtr);
// Computes sparse - sparse matrix matmul of matrices
@ -285,19 +358,20 @@ class CudaSparse {
// with nnzTotalDevHostPtr entries (as calculated by CsrgemmNnz). See:
// http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm.
template <typename Scalar>
Status Csrgemm(cusparseOperation_t transA, cusparseOperation_t transB, int m,
int k, int n, const cusparseMatDescr_t descrA, int nnzA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const cusparseMatDescr_t descrB,
int nnzB, const Scalar* csrSortedValB,
const int* csrSortedRowPtrB, const int* csrSortedColIndB,
const cusparseMatDescr_t descrC, Scalar* csrSortedValC,
int* csrSortedRowPtrC, int* csrSortedColIndC);
Status Csrgemm(gpusparseOperation_t transA, gpusparseOperation_t transB,
int m, int k, int n, const gpusparseMatDescr_t descrA,
int nnzA, const Scalar* csrSortedValA,
const int* csrSortedRowPtrA, const int* csrSortedColIndA,
const gpusparseMatDescr_t descrB, int nnzB,
const Scalar* csrSortedValB, const int* csrSortedRowPtrB,
const int* csrSortedColIndB, const gpusparseMatDescr_t descrC,
Scalar* csrSortedValC, int* csrSortedRowPtrC,
int* csrSortedColIndC);
// In-place reordering of unsorted CSR to sorted CSR.
// http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csru2csr
template <typename Scalar>
Status Csru2csr(int m, int n, int nnz, const cusparseMatDescr_t descrA,
Status Csru2csr(int m, int n, int nnz, const gpusparseMatDescr_t descrA,
Scalar* csrVal, const int* csrRowPtr, int* csrColInd);
// Converts from CSR to CSC format (equivalently, transpose).
@ -306,30 +380,30 @@ class CudaSparse {
Status Csr2csc(int m, int n, int nnz, const Scalar* csrVal,
const int* csrRowPtr, const int* csrColInd, Scalar* cscVal,
int* cscRowInd, int* cscColPtr,
const cusparseAction_t copyValues);
const gpusparseAction_t copyValues);
private:
bool initialized_;
OpKernelContext *context_; // not owned.
cudaStream_t cuda_stream_;
cusparseHandle_t *cusparse_handle_; // not owned.
gpuStream_t gpu_stream_;
gpusparseHandle_t* gpusparse_handle_; // not owned.
TF_DISALLOW_COPY_AND_ASSIGN(CudaSparse);
TF_DISALLOW_COPY_AND_ASSIGN(GpuSparse);
};
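// Minimal usage sketch for the GpuSparse class above, assuming an
// OpKernelContext `ctx` and device buffers `csr_row_ptr` (m + 1 ints) and
// `coo_row_ind` (nnz ints) for an m x n matrix (hypothetical names):
//
//   GpuSparse gpu_sparse(ctx);
//   TF_RETURN_IF_ERROR(gpu_sparse.Initialize());
//   // CSR row pointers -> COO row indices.
//   TF_RETURN_IF_ERROR(gpu_sparse.Csr2coo(csr_row_ptr, nnz, m, coo_row_ind));
//
// The sparse-sparse routines follow a two-phase pattern: first call the
// corresponding *Nnz method (e.g. CsrgeamNnz) with csrSortedRowPtrC
// preallocated to m + 1 entries to obtain the output nonzero count via
// nnzTotalDevHostPtr, then allocate csrSortedValC / csrSortedColIndC with
// that many entries and call the compute method (e.g. Csrgeam).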
// A wrapper class to ensure that a CUDA sparse matrix descriptor is initialized
// only once. For more details on the descriptor (cusparseMatDescr_t), see:
// only once. For more details on the descriptor (gpusparseMatDescr_t), see:
// https://docs.nvidia.com/cuda/cusparse/index.html#cusparsematdescrt
class CudaSparseMatrixDescriptor {
class GpuSparseMatrixDescriptor {
public:
explicit CudaSparseMatrixDescriptor() : initialized_(false) {}
explicit GpuSparseMatrixDescriptor() : initialized_(false) {}
CudaSparseMatrixDescriptor(CudaSparseMatrixDescriptor&& rhs)
GpuSparseMatrixDescriptor(GpuSparseMatrixDescriptor&& rhs)
: initialized_(rhs.initialized_), descr_(std::move(rhs.descr_)) {
rhs.initialized_ = false;
}
CudaSparseMatrixDescriptor& operator=(CudaSparseMatrixDescriptor&& rhs) {
GpuSparseMatrixDescriptor& operator=(GpuSparseMatrixDescriptor&& rhs) {
if (this == &rhs) return *this;
Release();
initialized_ = rhs.initialized_;
@ -338,23 +412,27 @@ class CudaSparseMatrixDescriptor {
return *this;
}
~CudaSparseMatrixDescriptor() { Release(); }
~GpuSparseMatrixDescriptor() { Release(); }
// Initializes the underlying descriptor. Will fail on the second call if
// called more than once.
Status Initialize() {
DCHECK(!initialized_);
TF_RETURN_IF_CUSPARSE_ERROR(cusparseCreateMatDescr(&descr_));
#if GOOGLE_CUDA
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(&descr_));
#elif TENSORFLOW_USE_ROCM
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseCreateMatDescr(&descr_));
#endif
initialized_ = true;
return Status::OK();
}
cusparseMatDescr_t& descr() {
gpusparseMatDescr_t& descr() {
DCHECK(initialized_);
return descr_;
}
const cusparseMatDescr_t& descr() const {
const gpusparseMatDescr_t& descr() const {
DCHECK(initialized_);
return descr_;
}
@ -362,31 +440,37 @@ class CudaSparseMatrixDescriptor {
private:
void Release() {
if (initialized_) {
#if GOOGLE_CUDA
cusparseDestroyMatDescr(descr_);
#elif TENSORFLOW_USE_ROCM
hipsparseDestroyMatDescr(descr_);
#endif
initialized_ = false;
}
}
bool initialized_;
cusparseMatDescr_t descr_;
gpusparseMatDescr_t descr_;
TF_DISALLOW_COPY_AND_ASSIGN(CudaSparseMatrixDescriptor);
TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseMatrixDescriptor);
};
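// Minimal usage sketch for GpuSparseMatrixDescriptor (hypothetical variable
// name):
//
//   GpuSparseMatrixDescriptor descrA;
//   TF_RETURN_IF_ERROR(descrA.Initialize());
//   // descrA.descr() is then passed to GpuSparse methods such as Csrmm,
//   // CsrgeamNnz or Csrgeam.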
#if GOOGLE_CUDA
// A wrapper class to ensure that an unsorted/sorted CSR conversion information
// struct (csru2csrInfo_t) is initialized only once. See:
// https://docs.nvidia.com/cuda/cusparse/index.html#csru2csr
class CudaSparseCsrSortingConversionInfo {
class GpuSparseCsrSortingConversionInfo {
public:
explicit CudaSparseCsrSortingConversionInfo() : initialized_(false) {}
explicit GpuSparseCsrSortingConversionInfo() : initialized_(false) {}
CudaSparseCsrSortingConversionInfo(CudaSparseCsrSortingConversionInfo&& rhs)
GpuSparseCsrSortingConversionInfo(GpuSparseCsrSortingConversionInfo&& rhs)
: initialized_(rhs.initialized_), info_(std::move(rhs.info_)) {
rhs.initialized_ = false;
}
CudaSparseCsrSortingConversionInfo& operator=(
CudaSparseCsrSortingConversionInfo&& rhs) {
GpuSparseCsrSortingConversionInfo& operator=(
GpuSparseCsrSortingConversionInfo&& rhs) {
if (this == &rhs) return *this;
Release();
initialized_ = rhs.initialized_;
@ -395,13 +479,13 @@ class CudaSparseCsrSortingConversionInfo {
return *this;
}
~CudaSparseCsrSortingConversionInfo() { Release(); }
~GpuSparseCsrSortingConversionInfo() { Release(); }
// Initializes the underlying info. Will fail on the second call if called
// more than once.
Status Initialize() {
DCHECK(!initialized_);
TF_RETURN_IF_CUSPARSE_ERROR(cusparseCreateCsru2csrInfo(&info_));
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsru2csrInfo(&info_));
initialized_ = true;
return Status::OK();
}
@ -427,11 +511,13 @@ class CudaSparseCsrSortingConversionInfo {
bool initialized_;
csru2csrInfo_t info_;
TF_DISALLOW_COPY_AND_ASSIGN(CudaSparseCsrSortingConversionInfo);
TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseCsrSortingConversionInfo);
};
} // namespace tensorflow
#endif // GOOGLE_CUDA
} // namespace tensorflow
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif // TENSORFLOW_CORE_KERNELS_CUDA_SPARSE_H_


@ -0,0 +1,330 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if TENSORFLOW_USE_ROCM
#include <complex>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace {
// A set of initialized handles to the underlying ROCm libraries used by
// GpuSparse. We maintain one such set of handles per unique stream.
class HipSparseHandles {
public:
explicit HipSparseHandles(hipStream_t stream)
: initialized_(false), stream_(stream) {}
HipSparseHandles(HipSparseHandles&& rhs)
: initialized_(rhs.initialized_),
stream_(std::move(rhs.stream_)),
hipsparse_handle_(rhs.hipsparse_handle_) {
rhs.initialized_ = false;
}
HipSparseHandles& operator=(HipSparseHandles&& rhs) {
if (this == &rhs) return *this;
Release();
stream_ = std::move(rhs.stream_);
hipsparse_handle_ = std::move(rhs.hipsparse_handle_);
initialized_ = rhs.initialized_;
rhs.initialized_ = false;
return *this;
}
~HipSparseHandles() { Release(); }
Status Initialize() {
if (initialized_) return Status::OK();
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseCreate(&hipsparse_handle_));
TF_RETURN_IF_GPUSPARSE_ERROR(
hipsparseSetStream(hipsparse_handle_, stream_));
initialized_ = true;
return Status::OK();
}
hipsparseHandle_t& handle() {
DCHECK(initialized_);
return hipsparse_handle_;
}
const hipsparseHandle_t& handle() const {
DCHECK(initialized_);
return hipsparse_handle_;
}
private:
void Release() {
if (initialized_) {
// This should never return anything other than success
auto err = hipsparseDestroy(hipsparse_handle_);
DCHECK(err == HIPSPARSE_STATUS_SUCCESS)
<< "Failed to destroy hipSPARSE instance.";
initialized_ = false;
}
}
bool initialized_;
hipStream_t stream_;
hipsparseHandle_t hipsparse_handle_;
TF_DISALLOW_COPY_AND_ASSIGN(HipSparseHandles);
};
// TODO(ebrevdo): Replace global mutex guarding CudaSparseHandles
// lookup with one of:
// 1. Adding the handle to the CudaStream structure; do the lookup there.
// 2. Add a thread-local cusparse, set it to the current stream
// upon each call.
// #1 seems like the cleanest option but will need to wait until this
// is moved into TF core.
static mutex handle_map_mutex(LINKER_INITIALIZED);
using HandleMap = std::unordered_map<hipStream_t, HipSparseHandles>;
// Returns a singleton map used for storing initialized handles for each unique
// ROCm stream.
HandleMap* GetHandleMapSingleton() {
static HandleMap* cm = new HandleMap;
return cm;
}
} // namespace
GpuSparse::GpuSparse(OpKernelContext* context)
: initialized_(false), context_(context) {
auto hip_stream_ptr =
reinterpret_cast<const hipStream_t*>(context->op_device_context()
->stream()
->implementation()
->GpuStreamMemberHack());
DCHECK(hip_stream_ptr);
gpu_stream_ = *hip_stream_ptr;
}
Status GpuSparse::Initialize() {
HandleMap* handle_map = GetHandleMapSingleton();
DCHECK(handle_map);
mutex_lock lock(handle_map_mutex);
auto it = handle_map->find(gpu_stream_);
if (it == handle_map->end()) {
LOG(INFO) << "Creating GpuSparse handles for stream " << gpu_stream_;
// Previously unseen ROCm stream. Initialize a set of ROCm sparse library
// handles for it.
HipSparseHandles new_handles(gpu_stream_);
TF_RETURN_IF_ERROR(new_handles.Initialize());
it = handle_map->insert(std::make_pair(gpu_stream_, std::move(new_handles)))
.first;
}
gpusparse_handle_ = &it->second.handle();
initialized_ = true;
return Status::OK();
}
// Macro that specializes a sparse method for the two numeric types currently
// supported on ROCm (float and double).
#define TF_CALL_HIP_LAPACK_TYPES(m) m(float, S) m(double, D)
// Macros to construct hipsparse method names.
#define SPARSE_FN(method, sparse_prefix) hipsparse##sparse_prefix##method
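// For example, SPARSE_FN(csrmm2, S) expands to hipsparseScsrmm2, and
// TF_CALL_HIP_LAPACK_TYPES(CSRMM_INSTANCE) below instantiates Csrmm<float>
// and Csrmm<double> against the S- and D-prefixed hipSPARSE entry points.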
Status GpuSparse::Coo2csr(const int* cooRowInd, int nnz, int m,
int* csrRowPtr) const {
DCHECK(initialized_);
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseXcoo2csr(*gpusparse_handle_, cooRowInd,
nnz, m, csrRowPtr,
HIPSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}
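// Worked example with hypothetical values: for an m = 4 row matrix with
// nnz = 4 and cooRowInd = {0, 0, 1, 3}, Coo2csr produces
// csrRowPtr = {0, 2, 3, 3, 4}; csrRowPtr[i + 1] - csrRowPtr[i] is the number
// of nonzeros in row i, and csrRowPtr[m] == nnz. Csr2coo below performs the
// inverse conversion.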
Status GpuSparse::Csr2coo(const int* csrRowPtr, int nnz, int m,
int* cooRowInd) const {
DCHECK(initialized_);
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseXcsr2coo(*gpusparse_handle_, csrRowPtr,
nnz, m, cooRowInd,
HIPSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}
template <typename Scalar, typename SparseFnT>
static inline Status CsrmmImpl(
SparseFnT op, OpKernelContext* context, hipsparseHandle_t hipsparse_handle,
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n,
int k, int nnz, const Scalar* alpha_host, const hipsparseMatDescr_t descrA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const Scalar* B, int ldb,
const Scalar* beta_host, Scalar* C, int ldc) {
TF_RETURN_IF_GPUSPARSE_ERROR(op(hipsparse_handle, transA, transB, m, n, k,
nnz, alpha_host, descrA, csrSortedValA,
csrSortedRowPtrA, csrSortedColIndA, B, ldb,
beta_host, C, ldc));
return Status::OK();
}
#define CSRMM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csrmm<Scalar>( \
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n, \
int k, int nnz, const Scalar* alpha_host, \
const hipsparseMatDescr_t descrA, const Scalar* csrSortedValA, \
const int* csrSortedRowPtrA, const int* csrSortedColIndA, \
const Scalar* B, int ldb, const Scalar* beta_host, Scalar* C, int ldc) \
const { \
DCHECK(initialized_); \
return CsrmmImpl(SPARSE_FN(csrmm2, sparse_prefix), context_, \
*gpusparse_handle_, transA, transB, m, n, k, nnz, \
alpha_host, descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, B, ldb, beta_host, C, ldc); \
}
TF_CALL_HIP_LAPACK_TYPES(CSRMM_INSTANCE);
template <typename Scalar, typename SparseFnT>
static inline Status CsrmvImpl(SparseFnT op, OpKernelContext* context,
hipsparseHandle_t hipsparse_handle,
hipsparseOperation_t transA, int m, int n,
int nnz, const Scalar* alpha_host,
const hipsparseMatDescr_t descrA,
const Scalar* csrSortedValA,
const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const Scalar* x,
const Scalar* beta_host, Scalar* y) {
TF_RETURN_IF_GPUSPARSE_ERROR(
op(hipsparse_handle, transA, m, n, nnz, alpha_host, descrA, csrSortedValA,
csrSortedRowPtrA, csrSortedColIndA, x, beta_host, y));
return Status::OK();
}
// TODO(ebrevdo,rmlarsen): Use csrmv_mp for all cases when available in CUDA 9.
#define CSRMV_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csrmv<Scalar>( \
hipsparseOperation_t transA, int m, int n, int nnz, \
const Scalar* alpha_host, const hipsparseMatDescr_t descrA, \
const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
const int* csrSortedColIndA, const Scalar* x, const Scalar* beta_host, \
Scalar* y) const { \
DCHECK(initialized_); \
return CsrmvImpl(SPARSE_FN(csrmv, sparse_prefix), context_, \
*gpusparse_handle_, transA, m, n, nnz, alpha_host, \
descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, x, beta_host, y); \
}
TF_CALL_HIP_LAPACK_TYPES(CSRMV_INSTANCE);
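// Note on the Csrmv instantiation above: unlike the CUDA path, which
// dispatches to csrmv_mp for the non-transposed case, the ROCm path always
// routes through hipsparse{S,D}csrmv.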
Status GpuSparse::CsrgemmNnz(
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n,
int k, const hipsparseMatDescr_t descrA, int nnzA,
const int* csrSortedRowPtrA, const int* csrSortedColIndA,
const hipsparseMatDescr_t descrB, int nnzB, const int* csrSortedRowPtrB,
const int* csrSortedColIndB, const hipsparseMatDescr_t descrC,
int* csrSortedRowPtrC, int* nnzTotalDevHostPtr) {
DCHECK(initialized_);
DCHECK(nnzTotalDevHostPtr != nullptr);
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseXcsrgemmNnz(
*gpusparse_handle_, transA, transB, m, n, k, descrA, nnzA,
csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
csrSortedColIndB, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr));
return Status::OK();
}
template <typename Scalar, typename SparseFnT>
static inline Status CsrgemmImpl(
SparseFnT op, OpKernelContext* context, hipsparseHandle_t hipsparse_handle,
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n,
int k, const hipsparseMatDescr_t descrA, int nnzA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const hipsparseMatDescr_t descrB, int nnzB,
const Scalar* csrSortedValB, const int* csrSortedRowPtrB,
const int* csrSortedColIndB, const hipsparseMatDescr_t descrC,
Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC) {
TF_RETURN_IF_GPUSPARSE_ERROR(
op(hipsparse_handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedValB,
csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC,
csrSortedRowPtrC, csrSortedColIndC));
return Status::OK();
}
#define CSRGEMM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csrgemm<Scalar>( \
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n, \
int k, const hipsparseMatDescr_t descrA, int nnzA, \
const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
const int* csrSortedColIndA, const hipsparseMatDescr_t descrB, int nnzB, \
const Scalar* csrSortedValB, const int* csrSortedRowPtrB, \
const int* csrSortedColIndB, const hipsparseMatDescr_t descrC, \
Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC) { \
DCHECK(initialized_); \
return CsrgemmImpl(SPARSE_FN(csrgemm, sparse_prefix), context_, \
*gpusparse_handle_, transA, transB, m, n, k, descrA, \
nnzA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, descrB, nnzB, csrSortedValB, \
csrSortedRowPtrB, csrSortedColIndB, descrC, \
csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); \
}
TF_CALL_HIP_LAPACK_TYPES(CSRGEMM_INSTANCE);
template <typename Scalar, typename SparseFnT>
static inline Status Csr2cscImpl(SparseFnT op, OpKernelContext* context,
hipsparseHandle_t hipsparse_handle, int m,
int n, int nnz, const Scalar* csrVal,
const int* csrRowPtr, const int* csrColInd,
Scalar* cscVal, int* cscRowInd, int* cscColPtr,
const hipsparseAction_t copyValues) {
TF_RETURN_IF_GPUSPARSE_ERROR(
op(hipsparse_handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
cscRowInd, cscColPtr, copyValues, HIPSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}
#define CSR2CSC_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csr2csc<Scalar>( \
int m, int n, int nnz, const Scalar* csrVal, const int* csrRowPtr, \
const int* csrColInd, Scalar* cscVal, int* cscRowInd, int* cscColPtr, \
const hipsparseAction_t copyValues) { \
DCHECK(initialized_); \
return Csr2cscImpl(SPARSE_FN(csr2csc, sparse_prefix), context_, \
*gpusparse_handle_, m, n, nnz, csrVal, csrRowPtr, \
csrColInd, cscVal, cscRowInd, cscColPtr, copyValues); \
}
TF_CALL_HIP_LAPACK_TYPES(CSR2CSC_INSTANCE);
} // namespace tensorflow
#endif // TENSORFLOW_USE_ROCM


@ -2,10 +2,10 @@
load(
"//tensorflow:tensorflow.bzl",
"if_cuda_or_rocm",
"tf_cc_test",
"tf_kernel_library",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
package(
default_visibility = ["//visibility:public"],
@ -77,7 +77,7 @@ tf_kernel_library(
"//tensorflow/core/kernels:scatter_nd_op",
"//tensorflow/core/kernels:slice_op",
"//tensorflow/core/kernels:transpose_functor",
] + if_cuda([
] + if_cuda_or_rocm([
"//tensorflow/core/kernels:cuda_solvers",
"//tensorflow/core/kernels:cuda_sparse",
]),


@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -31,7 +31,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#include "tensorflow/core/kernels/fill_functor.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -233,8 +233,10 @@ class CSRAddOp : public OpKernel {
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif
#undef REGISTER_GPU
@ -246,7 +248,7 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(
#undef REGISTER
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace functor {
template <typename T>
struct CSRSparseMatrixAdd<GPUDevice, T>
@ -324,10 +326,10 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
private:
OpKernelContext* ctx_;
CudaSparse cuda_sparse_;
CudaSparseMatrixDescriptor descrA_;
CudaSparseMatrixDescriptor descrB_;
CudaSparseMatrixDescriptor descrC_;
GpuSparse cuda_sparse_;
GpuSparseMatrixDescriptor descrA_;
GpuSparseMatrixDescriptor descrB_;
GpuSparseMatrixDescriptor descrC_;
const T alpha_;
const T beta_;
bool initialized_;
@ -337,6 +339,6 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow


@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -31,7 +31,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -92,12 +92,12 @@ REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(
CONJ_VARIANT_UNARY_OP, DEVICE_CPU, CSRSparseMatrix,
(CSRSparseMatrixUnaryHelper<CPUDevice, CSRSparseMatrixConjFunctor>));
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(
CONJ_VARIANT_UNARY_OP, DEVICE_GPU, CSRSparseMatrix,
(CSRSparseMatrixUnaryHelper<GPUDevice, CSRSparseMatrixConjFunctor>));
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow


@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -33,7 +33,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -220,19 +220,21 @@ REGISTER_CPU(double)
REGISTER_CPU(complex64)
REGISTER_CPU(complex128)
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER_CPU
#undef REGISTER_GPU
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace functor {
template <>
@ -256,6 +258,6 @@ extern template struct CSRSparseMatrixToCOOSparseMatrix<GPUDevice>;
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow


@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -31,7 +31,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -205,18 +205,20 @@ class CSRSparseMatrixToSparseTensorGPUOp : public OpKernel {
.HostMemory("dense_shape"), \
CSRSparseMatrixToSparseTensorGPUOp<GPUDevice, T>);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER_GPU
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace functor {
template <>
@ -240,7 +242,7 @@ extern template struct CSRSparseMatrixToCOOSparseMatrix<GPUDevice>;
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("CSRSparseMatrixToSparseTensor") \


@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -32,13 +32,18 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#endif
#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/stream_executor/rocm/rocm_activation.h"
using ::perftools::gputools::rocm::ScopedActivateExecutorContext;
#endif
namespace tensorflow {
@ -138,7 +143,7 @@ REGISTER_CPU(complex128)
#undef REGISTER_CPU
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Device, typename T>
class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
@ -356,8 +361,10 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
REGISTER_GPU(GPU, float)
REGISTER_GPU(GPU, double)
#if GOOGLE_CUDA
REGISTER_GPU(GPU, complex64)
REGISTER_GPU(GPU, complex128)
#endif
namespace functor {
@ -380,7 +387,7 @@ struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
Status operator()(OpKernelContext* c, const int rows, const int cols,
TTypes<int>::UnalignedVec coo_row_ind,
TTypes<int>::UnalignedVec csr_row_ptr) {
CudaSparse cuda_sparse(c);
GpuSparse cuda_sparse(c);
TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
return cuda_sparse.Coo2csr(coo_row_ind.data(),
/*nnz*/ coo_row_ind.size(),
@ -391,7 +398,7 @@ extern template struct COOSparseMatrixToCSRSparseMatrix<GPUDevice>;
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER_GPU

View File

@ -13,15 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#if GOOGLE_CUDA
#include "third_party/cub/device/device_histogram.cuh"
#include "third_party/cub/iterator/counting_input_iterator.cuh"
#include "third_party/cub/iterator/transform_input_iterator.cuh"
#include "third_party/gpus/cuda/include/cusparse.h"
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/hipcub/hipcub.hpp"
#endif
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
@ -32,6 +36,12 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#if GOOGLE_CUDA
namespace gpuprim = ::cub;
#elif TENSORFLOW_USE_ROCM
namespace gpuprim = ::hipcub;
#endif
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
@ -65,9 +75,9 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
DCHECK_EQ(indices.dimension(1), 3); // batch, row, col
const int rank = indices.dimension(1);
cub::CountingInputIterator<int> row_counter(0);
cub::TransformInputIterator<int, StridedDataReader,
cub::CountingInputIterator<int>>
gpuprim::CountingInputIterator<int> row_counter(0);
gpuprim::TransformInputIterator<int, StridedDataReader,
gpuprim::CountingInputIterator<int>>
indices_first_column(row_counter,
StridedDataReader(indices.data(), rank));
@ -76,7 +86,7 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
DCHECK_NE(indices.data(), nullptr);
DCHECK_NE(nnz_per_batch.data(), nullptr);
auto first_success = cub::DeviceHistogram::HistogramEven(
auto first_success = gpuprim::DeviceHistogram::HistogramEven(
/*d_temp_storage*/ nullptr,
/*temp_storage_bytes&*/ temp_storage_bytes,
/*d_samples*/ indices_first_column,
@ -87,12 +97,12 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
/*num_samples*/ total_nnz,
/*stream*/ cu_stream);
if (first_success != cudaSuccess) {
if (first_success != gpuSuccess) {
return errors::Internal(
"SparseTensorToCSRSparseMatrix: Could not launch "
"cub::DeviceHistogram::HistogramEven "
"gpuprim::DeviceHistogram::HistogramEven "
"to calculate temp_storage_bytes, status: ",
cudaGetErrorString(first_success));
GpuGetErrorString(first_success));
}
Tensor temp_storage;
@ -100,7 +110,7 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
&temp_storage));
DCHECK_NE(temp_storage.flat<int8>().data(), nullptr);
auto second_success = cub::DeviceHistogram::HistogramEven(
auto second_success = gpuprim::DeviceHistogram::HistogramEven(
/*d_temp_storage*/ temp_storage.flat<int8>().data(),
/*temp_storage_bytes&*/ temp_storage_bytes,
/*d_samples*/ indices_first_column,
@ -111,12 +121,12 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
/*num_samples*/ total_nnz,
/*stream*/ cu_stream);
if (second_success != cudaSuccess) {
if (second_success != gpuSuccess) {
return errors::Internal(
"SparseTensorToCSRSparseMatrix: Could not launch "
"cub::DeviceHistogram::HistogramEven "
"gpuprim::DeviceHistogram::HistogramEven "
"to count nnz entries per batch. temp_storage_bytes: ",
temp_storage_bytes, ", status: ", cudaGetErrorString(second_success));
temp_storage_bytes, ", status: ", GpuGetErrorString(second_success));
}
return Status::OK();
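
Taken together, the two HistogramEven calls above are CUB/hipCUB's standard two-phase pattern: the first call passes a null temp-storage pointer so the library only reports the scratch size, and the second call with real scratch space does the counting. A condensed sketch of that sequence (illustrative only; it relies on the gpuprim alias and the TensorFlow GPU helpers already included in this file, and the helper name CountNonzerosPerBatch plus its raw-pointer arguments are hypothetical):

// Illustrative sketch of the query-then-run gpuprim pattern; not TF code.
Status CountNonzerosPerBatch(OpKernelContext* c, const int* d_batch_of_nnz,
                             int total_nnz, int* d_counts, int batch_size,
                             gpuStream_t stream) {
  size_t temp_storage_bytes = 0;
  // Phase 1: d_temp_storage == nullptr, so only temp_storage_bytes is set.
  auto err = gpuprim::DeviceHistogram::HistogramEven(
      /*d_temp_storage*/ nullptr, temp_storage_bytes,
      /*d_samples*/ d_batch_of_nnz, /*d_histogram*/ d_counts,
      /*num_levels*/ batch_size + 1, /*lower_level*/ 0,
      /*upper_level*/ batch_size, /*num_samples*/ total_nnz, stream);
  if (err != gpuSuccess) {
    return errors::Internal("HistogramEven size query failed: ",
                            GpuGetErrorString(err));
  }
  Tensor temp_storage;
  TF_RETURN_IF_ERROR(c->allocate_temp(
      DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
      &temp_storage));
  // Phase 2: the same call with real scratch space performs the counting.
  err = gpuprim::DeviceHistogram::HistogramEven(
      /*d_temp_storage*/ temp_storage.flat<int8>().data(), temp_storage_bytes,
      /*d_samples*/ d_batch_of_nnz, /*d_histogram*/ d_counts,
      /*num_levels*/ batch_size + 1, /*lower_level*/ 0,
      /*upper_level*/ batch_size, /*num_samples*/ total_nnz, stream);
  if (err != gpuSuccess) {
    return errors::Internal("HistogramEven failed: ", GpuGetErrorString(err));
  }
  return Status::OK();
}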
@ -128,11 +138,11 @@ template <>
Status CSRSparseMatrixToCOOSparseMatrix<GPUDevice>::operator()(
OpKernelContext* c, TTypes<const int>::UnalignedVec csr_row_ptr,
TTypes<int>::UnalignedVec coo_row_ind) {
CudaSparse cuda_sparse(c);
GpuSparse gpu_sparse(c);
const int nnz = coo_row_ind.size();
TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
TF_RETURN_IF_ERROR(gpu_sparse.Initialize());
const int m = csr_row_ptr.size() - 1; // rows
return cuda_sparse.Csr2coo(csr_row_ptr.data(), nnz, m, coo_row_ind.data());
return gpu_sparse.Csr2coo(csr_row_ptr.data(), nnz, m, coo_row_ind.data());
}
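
For readers new to the two index formats: Csr2coo (presumably cusparseXcsr2coo / hipsparseXcsr2coo underneath) expands the compressed row pointer of length m + 1 back into one row index per nonzero. A host-side reference of that mapping (illustrative sketch, not the GPU code path; the function name is hypothetical):

#include <vector>

// Host-side reference for the CSR -> COO row expansion (illustrative only):
// row i owns the nonzeros in [csr_row_ptr[i], csr_row_ptr[i + 1]).
std::vector<int> Csr2CooRowsReference(const std::vector<int>& csr_row_ptr) {
  std::vector<int> coo_row_ind;
  const int m = static_cast<int>(csr_row_ptr.size()) - 1;
  for (int row = 0; row < m; ++row) {
    for (int k = csr_row_ptr[row]; k < csr_row_ptr[row + 1]; ++k) {
      coo_row_ind.push_back(row);
    }
  }
  // Example: csr_row_ptr = {0, 2, 2, 5} -> coo_row_ind = {0, 0, 2, 2, 2}.
  return coo_row_ind;
}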
template <int stride>
@ -140,7 +150,7 @@ __global__ void SparseTensorToCOOMatrixKernel(const int64* indices,
int* coo_rows_out,
int* coo_cols_out, int size) {
const int offset = (stride == 3) ? 1 : 0;
CUDA_1D_KERNEL_LOOP(i, size) {
GPU_1D_KERNEL_LOOP(i, size) {
coo_rows_out[i] = static_cast<int>(ldg(indices + i * stride + offset));
coo_cols_out[i] = static_cast<int>(ldg(indices + i * stride + offset + 1));
}
@ -157,20 +167,22 @@ void SparseTensorToCOOSparseMatrix<GPUDevice>::operator()(
const int size = coo_row_ind.dimension(0);
GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
if (stride == 2) {
SparseTensorToCOOMatrixKernel<2>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
indices.data(), coo_row_ind.data(), coo_col_ind.data(), size);
TF_CHECK_OK(GpuLaunchKernel(SparseTensorToCOOMatrixKernel<2>,
config.block_count, config.thread_per_block, 0,
d.stream(), indices.data(), coo_row_ind.data(),
coo_col_ind.data(), size));
} else {
SparseTensorToCOOMatrixKernel<3>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
indices.data(), coo_row_ind.data(), coo_col_ind.data(), size);
TF_CHECK_OK(GpuLaunchKernel(SparseTensorToCOOMatrixKernel<3>,
config.block_count, config.thread_per_block, 0,
d.stream(), indices.data(), coo_row_ind.data(),
coo_col_ind.data(), size));
}
}
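
The launches above all follow the same conversion: the CUDA-only <<<...>>> syntax becomes a TF_CHECK_OK(GpuLaunchKernel(...)) call that compiles on both CUDA and ROCm. A minimal sketch of the portable pattern (illustrative kernel and helper names, assuming tensorflow/core/util/gpu_kernel_helper.h is included):

// Illustrative only: a trivial kernel and its portable launch.
__global__ void ScaleValuesKernel(const float* in, float* out, float alpha,
                                  int size) {
  GPU_1D_KERNEL_LOOP(i, size) { out[i] = alpha * ldg(in + i); }
}

Status LaunchScaleValues(const Eigen::GpuDevice& d, const float* in,
                         float* out, float alpha, int size) {
  GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
  // Arguments after the stream are forwarded to the kernel in order.
  return GpuLaunchKernel(ScaleValuesKernel, config.block_count,
                         config.thread_per_block, /*shared_memory_bytes=*/0,
                         d.stream(), in, out, alpha, size);
}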
__global__ void COOMatrixToSparseTensorKernel2D(const int* coo_rows,
const int* coo_cols,
int64* indices_out, int size) {
CUDA_1D_KERNEL_LOOP(i, size) {
GPU_1D_KERNEL_LOOP(i, size) {
indices_out[i * 2] = static_cast<int64>(ldg(coo_rows + i));
indices_out[i * 2 + 1] = static_cast<int64>(ldg(coo_cols + i));
}
@ -203,7 +215,7 @@ __global__ void COOMatrixToSparseTensorKernel3D(
}
__syncthreads();
CUDA_1D_KERNEL_LOOP(i, size) {
GPU_1D_KERNEL_LOOP(i, size) {
// TODO(ebrevdo): Consider special casing batch_size <= 3,
// alternatively doing linear instead of binary search. Requires
// some benchmarks.
@ -231,9 +243,10 @@ Status COOSparseMatrixToSparseTensor<GPUDevice>::operator()(
DCHECK_EQ(size, indices.dimension(0));
if (ndims == 2) {
GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
COOMatrixToSparseTensorKernel2D<<<config.block_count,
config.thread_per_block, 0, d.stream()>>>(
coo_row_ind.data(), coo_col_ind.data(), indices.data(), size);
TF_CHECK_OK(GpuLaunchKernel(COOMatrixToSparseTensorKernel2D,
config.block_count, config.thread_per_block, 0,
d.stream(), coo_row_ind.data(),
coo_col_ind.data(), indices.data(), size));
return Status::OK();
} else {
const int batch_size = host_dense_shape(0);
@ -246,11 +259,11 @@ Status COOSparseMatrixToSparseTensor<GPUDevice>::operator()(
GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
// shared memory stores the batch pointers.
const size_t shared_memory_size = sizeof(int) * (batch_size + 1);
COOMatrixToSparseTensorKernel3D<<<config.block_count,
config.thread_per_block,
shared_memory_size, d.stream()>>>(
coo_row_ind.data(), coo_col_ind.data(), indices.data(),
batch_ptr_copy.data(), batch_size, size);
TF_CHECK_OK(
GpuLaunchKernel(COOMatrixToSparseTensorKernel3D, config.block_count,
config.thread_per_block, shared_memory_size, d.stream(),
coo_row_ind.data(), coo_col_ind.data(), indices.data(),
batch_ptr_copy.data(), batch_size, size));
return Status::OK();
}
}
@ -274,7 +287,7 @@ __global__ void CSRSparseMatrixBatchMulVecKernel3D(
}
__syncthreads();
CUDA_1D_KERNEL_LOOP(i, total_nnz) {
GPU_1D_KERNEL_LOOP(i, total_nnz) {
const int b = BinarySearchRange(local_batch_ptr, batch_size, i);
c_values[i] = ldg(a_values + i) * local_batch_values[b];
}
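
The 3D kernels above map a flat nonzero index to its batch by binary-searching the shared-memory copy of the batch pointers. A host-side equivalent of that lookup (illustrative sketch; the device helper BinarySearchRange itself is not shown in this diff):

#include <algorithm>
#include <vector>

// Nonzero i belongs to batch b iff batch_ptr[b] <= i < batch_ptr[b + 1],
// where batch_ptr has batch_size + 1 monotonically increasing entries.
int BatchOfNonzeroReference(const std::vector<int>& batch_ptr, int i) {
  auto it = std::upper_bound(batch_ptr.begin(), batch_ptr.end(), i);
  return static_cast<int>(it - batch_ptr.begin()) - 1;
}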
@ -316,10 +329,10 @@ Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx,
const size_t shared_memory_size =
(sizeof(int) * (batch_size + 1) // local batch_pointers.
+ sizeof(T) * batch_size); // local copy of b.
CSRSparseMatrixBatchMulVecKernel3D<T>
<<<config.block_count, config.thread_per_block, shared_memory_size,
d.stream()>>>(a_values.data(), b.data(), c_values.data(),
batch_ptr_copy.data(), batch_size, total_nnz);
TF_CHECK_OK(GpuLaunchKernel(
CSRSparseMatrixBatchMulVecKernel3D<T>, config.block_count,
config.thread_per_block, shared_memory_size, d.stream(), a_values.data(),
b.data(), c_values.data(), batch_ptr_copy.data(), batch_size, total_nnz));
return Status::OK();
}
@ -374,7 +387,7 @@ __global__ void CSRSparseMatrixSoftmaxKernel2D(const int rows,
// algorithm to distribute the work in case the row sizes are
// uneven:
// http://images.nvidia.com/events/sc15/pdfs/sc15-Merge-Based-Parallel-Sparse-Matrix-Vector-Multiplication-merrill.pdf
CUDA_1D_KERNEL_LOOP(row, rows) {
GPU_1D_KERNEL_LOOP(row, rows) {
CalculateRowSoftmax(ldg(row_ptr + row), ldg(row_ptr + row + 1), logits,
softmax);
}
@ -382,7 +395,7 @@ __global__ void CSRSparseMatrixSoftmaxKernel2D(const int rows,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CopyFromGpuDeviceArrayToLocal(
GpuDeviceArrayStruct<int> cuda_ptr_s, int* local_ptr, int length) {
#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
const int* cuda_ptr = GetGpuDeviceArrayOnDevice(&cuda_ptr_s);
for (int i = threadIdx.x; i < length; i += blockDim.x) {
local_ptr[i] = cuda_ptr[i];
@ -404,7 +417,7 @@ __global__ void CSRSparseMatrixSoftmaxKernel3D(
CopyFromGpuDeviceArrayToLocal(std::move(batch_ptr_s), local_batch_ptr,
batch_size + 1);
CUDA_1D_KERNEL_LOOP(i, size) {
GPU_1D_KERNEL_LOOP(i, size) {
const int batch = i / rows;
const int row = i % rows;
const int batch_offset = local_batch_ptr[batch];
@ -431,10 +444,10 @@ Status CSRSparseMatrixSoftmaxGPUImpl(OpKernelContext* ctx,
const int rows = host_dense_shape(0);
DCHECK_EQ(rows, row_ptr.size() - 1);
GpuLaunchConfig config = GetGpuLaunchConfig(rows /*size*/, d);
CSRSparseMatrixSoftmaxKernel2D<T>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
rows /*size*/, row_ptr.data(), logits_values.data(),
softmax_values.data());
TF_CHECK_OK(GpuLaunchKernel(CSRSparseMatrixSoftmaxKernel2D<T>,
config.block_count, config.thread_per_block, 0,
d.stream(), rows /*size*/, row_ptr.data(),
logits_values.data(), softmax_values.data()));
} else {
const int batch_size = host_dense_shape(0);
const int rows = host_dense_shape(1);
@ -452,10 +465,11 @@ Status CSRSparseMatrixSoftmaxGPUImpl(OpKernelContext* ctx,
GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
// shared memory stores the batch pointers.
const size_t shared_memory_size = sizeof(int) * (batch_size + 1);
CSRSparseMatrixSoftmaxKernel3D<T>
<<<config.block_count, config.thread_per_block, shared_memory_size,
d.stream()>>>(size, rows, batch_ptr_copy.data(), row_ptr.data(),
logits_values.data(), softmax_values.data());
TF_CHECK_OK(GpuLaunchKernel(CSRSparseMatrixSoftmaxKernel3D<T>,
config.block_count, config.thread_per_block,
shared_memory_size, d.stream(), size, rows,
batch_ptr_copy.data(), row_ptr.data(),
logits_values.data(), softmax_values.data()));
}
return Status::OK();
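
Each CSR row is normalized independently by CalculateRowSoftmax over its slice of values. A host-side reference of the per-row computation (illustrative sketch: it assumes the usual max-shifted softmax, since the device helper's body is not shown in this diff):

#include <algorithm>
#include <cmath>

// Softmax over the CSR row slice [begin, end) of logits, written to softmax.
void RowSoftmaxReference(int begin, int end, const float* logits,
                         float* softmax) {
  if (begin == end) return;  // Empty row: nothing to normalize.
  float row_max = logits[begin];
  for (int i = begin + 1; i < end; ++i) row_max = std::max(row_max, logits[i]);
  float sum = 0.0f;
  for (int i = begin; i < end; ++i) {
    softmax[i] = std::exp(logits[i] - row_max);
    sum += softmax[i];
  }
  for (int i = begin; i < end; ++i) softmax[i] /= sum;
}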
@ -549,7 +563,7 @@ __global__ void CSRSparseMatrixSoftmaxGradKernel2D(
// algorithm to distribute the work in case the row sizes are
// uneven:
// http://images.nvidia.com/events/sc15/pdfs/sc15-Merge-Based-Parallel-Sparse-Matrix-Vector-Multiplication-merrill.pdf
CUDA_1D_KERNEL_LOOP(row, rows) {
GPU_1D_KERNEL_LOOP(row, rows) {
CalculateRowSoftmaxGrad(
ldg(softmax_row_ptr + row) /*softmax_begin*/,
ldg(softmax_row_ptr + row + 1) /*softmax_end*/, softmax_col_ind,
@ -579,7 +593,7 @@ __global__ void CSRSparseMatrixSoftmaxGradKernel3D(
#define SOFTMAX_BATCH_PTR(i) local_batch_ptr[i];
#define GRAD_SOFTMAX_BATCH_PTR(i) local_batch_ptr[batch_size + 1 + i];
CUDA_1D_KERNEL_LOOP(i, size) {
GPU_1D_KERNEL_LOOP(i, size) {
const int batch = i / rows;
const int row = i % rows;
const int softmax_batch_offset = SOFTMAX_BATCH_PTR(batch);
@ -625,12 +639,12 @@ Status CSRSparseMatrixSoftmaxGradGPUImpl(
DCHECK_EQ(rows + 1, softmax_row_ptr.size());
DCHECK_EQ(rows + 1, grad_softmax_row_ptr.size());
GpuLaunchConfig config = GetGpuLaunchConfig(rows /*size*/, d);
CSRSparseMatrixSoftmaxGradKernel2D<T>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
rows /*size*/, softmax_row_ptr.data(), softmax_col_ind.data(),
softmax_values.data(), grad_softmax_row_ptr.data(),
grad_softmax_col_ind.data(), grad_softmax_values.data(),
gradient_values.data());
TF_CHECK_OK(GpuLaunchKernel(
CSRSparseMatrixSoftmaxGradKernel2D<T>, config.block_count,
config.thread_per_block, 0, d.stream(), rows /*size*/,
softmax_row_ptr.data(), softmax_col_ind.data(), softmax_values.data(),
grad_softmax_row_ptr.data(), grad_softmax_col_ind.data(),
grad_softmax_values.data(), gradient_values.data()));
} else {
const int batch_size = host_dense_shape(0);
const int rows = host_dense_shape(1);
@ -656,13 +670,13 @@ Status CSRSparseMatrixSoftmaxGradGPUImpl(
// shared memory stores two copies of batch pointers: one for the
// softmax CSR matrix, one for the grad_softmax CSR matrix.
const size_t shared_memory_size = 2 * sizeof(int) * (batch_size + 1);
CSRSparseMatrixSoftmaxGradKernel3D<T>
<<<config.block_count, config.thread_per_block, shared_memory_size,
d.stream()>>>(size, rows, softmax_and_grad_batch_ptr_copy.data(),
softmax_row_ptr.data(), softmax_col_ind.data(),
softmax_values.data(), grad_softmax_row_ptr.data(),
grad_softmax_col_ind.data(),
grad_softmax_values.data(), gradient_values.data());
TF_CHECK_OK(GpuLaunchKernel(
CSRSparseMatrixSoftmaxGradKernel3D<T>, config.block_count,
config.thread_per_block, shared_memory_size, d.stream(), size, rows,
softmax_and_grad_batch_ptr_copy.data(), softmax_row_ptr.data(),
softmax_col_ind.data(), softmax_values.data(),
grad_softmax_row_ptr.data(), grad_softmax_col_ind.data(),
grad_softmax_values.data(), gradient_values.data()));
}
return Status::OK();
@ -687,4 +701,4 @@ DEFINE_SOFTMAX_GRAD_GPU(double);
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -36,7 +36,7 @@ limitations under the License.
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/threadpool.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -694,7 +694,7 @@ REGISTER_CPU(complex128)
#undef REGISTER_CPU
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU(T) \
REGISTER_KERNEL_BUILDER( \
@ -703,14 +703,16 @@ REGISTER_CPU(complex128)
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace functor {
@ -723,7 +725,7 @@ class CSRSparseMatrixMatMul<GPUDevice, T> {
Status Compute(OpKernelContext* ctx, const ConstCSRComponent<T>& a,
typename TTypes<T>::UnalignedConstMatrix b,
typename TTypes<T>::UnalignedMatrix c) {
CudaSparse cuda_sparse(ctx);
GpuSparse cuda_sparse(ctx);
TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
{
// Use Csrmm to calculate:
@ -741,19 +743,34 @@ class CSRSparseMatrixMatMul<GPUDevice, T> {
// transA must be non-transpose if transB is transpose (cusparse
// limitation).
const cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
#if GOOGLE_CUDA
const gpusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
#elif TENSORFLOW_USE_ROCM
const gpusparseOperation_t transA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
#endif
// transB: b is row-major, and cusparse requires col-major b (or
// equivalently transB == transpose). this version is actually more
// efficient.
const cusparseOperation_t transB = CUSPARSE_OPERATION_TRANSPOSE;
#if GOOGLE_CUDA
const gpusparseOperation_t transB = CUSPARSE_OPERATION_TRANSPOSE;
cusparseMatDescr_t descrA;
TF_RETURN_IF_CUSPARSE_ERROR(cusparseCreateMatDescr(&descrA));
TF_RETURN_IF_CUSPARSE_ERROR(
gpusparseMatDescr_t descrA;
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(&descrA));
TF_RETURN_IF_GPUSPARSE_ERROR(
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
TF_RETURN_IF_CUSPARSE_ERROR(
TF_RETURN_IF_GPUSPARSE_ERROR(
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));
#elif TENSORFLOW_USE_ROCM
const gpusparseOperation_t transB = HIPSPARSE_OPERATION_TRANSPOSE;
gpusparseMatDescr_t descrA;
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseCreateMatDescr(&descrA));
TF_RETURN_IF_GPUSPARSE_ERROR(
hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL));
TF_RETURN_IF_GPUSPARSE_ERROR(
hipsparseSetMatIndexBase(descrA, HIPSPARSE_INDEX_BASE_ZERO));
#endif
// A is (m, k), Bt is (ldb, k) and Ct is (ldc, n)
const int k = b.dimension(0);
@ -796,13 +813,13 @@ template <typename T>
class CSRSparseMatrixMatVec<GPUDevice, T> {
public:
CSRSparseMatrixMatVec(bool transpose_a, bool conjugate_a)
: transA_(TransposeAndConjugateToCuSparseOp(transpose_a, conjugate_a,
&status_)) {}
: transA_(TransposeAndConjugateToGpuSparseOp(transpose_a, conjugate_a,
&status_)) {}
Status Compute(OpKernelContext* ctx, const ConstCSRComponent<T>& a,
const T* x, T* y) {
TF_RETURN_IF_ERROR(status_);
CudaSparse cuda_sparse(ctx);
GpuSparse cuda_sparse(ctx);
TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
{
// Use Csrmv to calculate:
@ -815,12 +832,20 @@ class CSRSparseMatrixMatVec<GPUDevice, T> {
const T alpha = 1;
const T beta = 0;
cusparseMatDescr_t descrA;
TF_RETURN_IF_CUSPARSE_ERROR(cusparseCreateMatDescr(&descrA));
TF_RETURN_IF_CUSPARSE_ERROR(
gpusparseMatDescr_t descrA;
#if GOOGLE_CUDA
TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(&descrA));
TF_RETURN_IF_GPUSPARSE_ERROR(
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
TF_RETURN_IF_CUSPARSE_ERROR(
TF_RETURN_IF_GPUSPARSE_ERROR(
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));
#elif TENSORFLOW_USE_ROCM
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseCreateMatDescr(&descrA));
TF_RETURN_IF_GPUSPARSE_ERROR(
hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL));
TF_RETURN_IF_GPUSPARSE_ERROR(
hipsparseSetMatIndexBase(descrA, HIPSPARSE_INDEX_BASE_ZERO));
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
const int m = a.dense_shape_host(0);
const int n = a.dense_shape_host(1);
@ -836,11 +861,11 @@ class CSRSparseMatrixMatVec<GPUDevice, T> {
private:
Status status_;
const cusparseOperation_t transA_;
const gpusparseOperation_t transA_;
};
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow
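
The cuSPARSE and hipSPARSE branches above differ only in their prefixes; both build a general, zero-indexed matrix descriptor. A consolidated sketch of that setup (illustrative; MakeGeneralZeroBasedDescr is a hypothetical helper, not a TensorFlow or vendor API, and it uses only the calls already shown above):

// Illustrative consolidation of the per-platform descriptor setup.
Status MakeGeneralZeroBasedDescr(gpusparseMatDescr_t* descr) {
#if GOOGLE_CUDA
  TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(descr));
  TF_RETURN_IF_GPUSPARSE_ERROR(
      cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  TF_RETURN_IF_GPUSPARSE_ERROR(
      cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO));
#elif TENSORFLOW_USE_ROCM
  TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseCreateMatDescr(descr));
  TF_RETURN_IF_GPUSPARSE_ERROR(
      hipsparseSetMatType(*descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
  TF_RETURN_IF_GPUSPARSE_ERROR(
      hipsparseSetMatIndexBase(*descr, HIPSPARSE_INDEX_BASE_ZERO));
#endif
  return Status::OK();
}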

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -28,7 +28,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -101,22 +101,24 @@ class CSRMulOp : public OpKernel {
Name("SparseMatrixMul").Device(DEVICE_##DEV).TypeConstraint<T>("T"), \
CSRMulOp<DEV##Device, T>);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU(T) REGISTER(GPU, T)
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace functor {
@ -159,13 +161,15 @@ class CSRSparseMatrixMulScalar<GPUDevice, T> {
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#if GOOGLE_CUDA
DECLARE_GPU_SPEC(std::complex<float>);
DECLARE_GPU_SPEC(std::complex<double>);
#endif
#undef DECLARE_GPU_SPEC
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -28,7 +28,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -67,11 +67,11 @@ class CSRNNZOp : public OpKernel {
REGISTER(CPU)
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER(GPU)
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER

View File

@ -19,7 +19,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_sparse.h"
#define EIGEN_USE_GPU
#endif
@ -84,7 +84,7 @@ class CSRSoftmaxOp : public OpKernel {
}
};
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(DEV, T) \
REGISTER_KERNEL_BUILDER(Name("SparseMatrixSoftmax") \
.Device(DEVICE_##DEV) \
@ -110,7 +110,7 @@ DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Device, typename T>
class CSRSoftmaxGradOp : public OpKernel {
@ -193,7 +193,7 @@ class CSRSoftmaxGradOp : public OpKernel {
}
};
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(DEV, T) \
REGISTER_KERNEL_BUILDER(Name("SparseMatrixSoftmaxGrad") \
.Device(DEVICE_##DEV) \
@ -220,6 +220,6 @@ DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -35,7 +35,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -498,22 +498,24 @@ REGISTER_CPU(complex128)
.TypeConstraint<T>("type"), \
CSRSparseMatMulGPUOp<DEV##Device, T>);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU(T) REGISTER(GPU, T)
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif // GOOGLE_CUDA
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace functor {
template <typename T>
struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
@ -527,11 +529,20 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
adjoint_a_(adjoint_a),
transpose_b_(transpose_b) {
// TODO(ebrevdo): Figure out why transposed implementations crash cuSparse.
#if GOOGLE_CUDA
transA_ = transpose_a ? (adjoint_a ? CUSPARSE_OPERATION_TRANSPOSE
: CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE)
: CUSPARSE_OPERATION_NON_TRANSPOSE;
transB_ = transpose_b ? CUSPARSE_OPERATION_TRANSPOSE
: CUSPARSE_OPERATION_NON_TRANSPOSE;
#elif TENSORFLOW_USE_ROCM
transA_ = transpose_a
? (adjoint_a ? HIPSPARSE_OPERATION_TRANSPOSE
: HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE)
: HIPSPARSE_OPERATION_NON_TRANSPOSE;
transB_ = transpose_b ? HIPSPARSE_OPERATION_TRANSPOSE
: HIPSPARSE_OPERATION_NON_TRANSPOSE;
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
Status Initialize() {
@ -630,20 +641,20 @@ struct CSRSparseSparseMatrixMatMul<GPUDevice, T>
private:
OpKernelContext* ctx_;
CudaSparse cuda_sparse_;
GpuSparse cuda_sparse_;
bool initialized_;
bool transpose_a_;
bool adjoint_a_;
bool transpose_b_;
CudaSparseMatrixDescriptor descrA_;
CudaSparseMatrixDescriptor descrB_;
CudaSparseMatrixDescriptor descrC_;
cusparseOperation_t transA_;
cusparseOperation_t transB_;
GpuSparseMatrixDescriptor descrA_;
GpuSparseMatrixDescriptor descrB_;
GpuSparseMatrixDescriptor descrC_;
gpusparseOperation_t transA_;
gpusparseOperation_t transB_;
};
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

View File

@ -18,7 +18,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -29,7 +29,7 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#endif
@ -116,12 +116,14 @@ REGISTER(CPU, double)
REGISTER(CPU, complex64)
REGISTER(CPU, complex128)
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER(GPU, float)
REGISTER(GPU, double)
#if GOOGLE_CUDA
REGISTER(GPU, complex64)
REGISTER(GPU, complex128)
#endif
#undef REGISTER
@ -139,12 +141,14 @@ namespace functor {
DECLARE_GPU_SPEC(int32);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#if GOOGLE_CUDA
DECLARE_GPU_SPEC(complex64);
DECLARE_GPU_SPEC(complex128);
#endif
#undef DECLARE_GPU_SPEC
} // namespace functor
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -30,13 +30,18 @@ limitations under the License.
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#endif
#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/stream_executor/rocm/rocm_activation.h"
using ::perftools::gputools::rocm::ScopedActivateExecutorContext;
#endif
namespace tensorflow {
@ -104,7 +109,7 @@ class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel {
}
};
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Device, typename T>
class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
@ -302,7 +307,7 @@ struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
Status operator()(OpKernelContext* c, const int rows, const int cols,
TTypes<int>::UnalignedVec coo_row_ind,
TTypes<int>::UnalignedVec csr_row_ptr) {
CudaSparse cuda_sparse(c);
GpuSparse cuda_sparse(c);
TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
return cuda_sparse.Coo2csr(coo_row_ind.data(),
/*nnz*/ coo_row_ind.size(),
@ -322,12 +327,14 @@ extern template struct COOSparseMatrixToCSRSparseMatrix<GPUDevice>;
REGISTER_GPU(float)
REGISTER_GPU(double)
#if GOOGLE_CUDA
REGISTER_GPU(complex64)
REGISTER_GPU(complex128)
#endif
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("SparseTensorToCSRSparseMatrix") \

View File

@ -19,7 +19,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cuda_sparse.h"
#define EIGEN_USE_GPU
#endif
@ -132,9 +132,12 @@ REGISTER_TRANSPOSE(CPU, double)
REGISTER_TRANSPOSE(CPU, complex64)
REGISTER_TRANSPOSE(CPU, complex128)
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER_TRANSPOSE(GPU, float)
REGISTER_TRANSPOSE(GPU, double)
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#if GOOGLE_CUDA
REGISTER_TRANSPOSE(GPU, complex64)
REGISTER_TRANSPOSE(GPU, complex128)
#endif // GOOGLE_CUDA
@ -250,16 +253,20 @@ struct CSRSparseMatrixTransposeComponent<CPUDevice, T> {
}
};
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
struct CSRSparseMatrixTransposeComponent<GPUDevice, T> {
Status operator()(OpKernelContext* ctx, const ConstCSRComponent<T>& x,
CSRComponent<T>* y) {
TF_RETURN_IF_ERROR(ValidateTransposeInputs(x, *y));
CudaSparse cuda_sparse(ctx);
GpuSparse cuda_sparse(ctx);
TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
const cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
#if GOOGLE_CUDA
const gpusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
#elif TENSORFLOW_USE_ROCM
const gpusparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
const int rank = x.dense_shape_host.size();
const int m = x.row_ptr.size() - 1;
const int n = x.dense_shape_host(rank - 1);
@ -279,7 +286,7 @@ struct CSRSparseMatrixTransposeComponent<GPUDevice, T> {
return Status::OK();
}
};
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace functor
} // namespace tensorflow

View File

@ -15,7 +15,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif
@ -74,7 +74,7 @@ Status CSRSparseMatrixZerosLikeHelper(OpKernelContext* ctx,
} // namespace
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(DEV) \
REGISTER_KERNEL_BUILDER(Name("SparseMatrixZeros") \
.Device(DEVICE_##DEV) \
@ -88,6 +88,6 @@ REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(
CSRSparseMatrixZerosLikeHelper<GPUDevice>);
#undef REGISTER
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

View File

@ -18,7 +18,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

View File

@ -156,7 +156,7 @@ class TridiagonalSolveOpGpuLinalg : public LinearAlgebraOp<Scalar> {
k);
return;
}
std::unique_ptr<CudaSparse> cusparse_solver(new CudaSparse(context));
std::unique_ptr<GpuSparse> cusparse_solver(new GpuSparse(context));
OP_REQUIRES_OK(context, cusparse_solver->Initialize());
if (k == 1) {
// rhs is copied into x, then gtsv replaces x with solution.
@ -196,20 +196,20 @@ class TridiagonalSolveOpGpuLinalg : public LinearAlgebraOp<Scalar> {
}
void SolveWithGtsv(OpKernelContext* context,
std::unique_ptr<CudaSparse>& cusparse_solver,
std::unique_ptr<GpuSparse>& cusparse_solver,
const Scalar* superdiag, const Scalar* diag,
const Scalar* subdiag, Scalar* rhs, const int num_eqs,
const int num_rhs) const {
#if CUDA_VERSION < 9000
auto function = pivoting_ ? &CudaSparse::Gtsv<Scalar>
: &CudaSparse::GtsvNoPivot<Scalar>;
auto function =
pivoting_ ? &GpuSparse::Gtsv<Scalar> : &GpuSparse::GtsvNoPivot<Scalar>;
OP_REQUIRES_OK(
context, (cusparse_solver.get()->*function)(
num_eqs, num_rhs, subdiag, diag, superdiag, rhs, num_eqs));
#else
auto buffer_function = pivoting_
? &CudaSparse::Gtsv2BufferSizeExt<Scalar>
: &CudaSparse::Gtsv2NoPivotBufferSizeExt<Scalar>;
? &GpuSparse::Gtsv2BufferSizeExt<Scalar>
: &GpuSparse::Gtsv2NoPivotBufferSizeExt<Scalar>;
size_t buffer_size;
OP_REQUIRES_OK(context, (cusparse_solver.get()->*buffer_function)(
num_eqs, num_rhs, subdiag, diag, superdiag, rhs,
@ -220,8 +220,8 @@ class TridiagonalSolveOpGpuLinalg : public LinearAlgebraOp<Scalar> {
context->allocate_temp(DT_UINT8, temp_shape, &temp_tensor));
void* buffer = temp_tensor.flat<std::uint8_t>().data();
auto solver_function = pivoting_ ? &CudaSparse::Gtsv2<Scalar>
: &CudaSparse::Gtsv2NoPivot<Scalar>;
auto solver_function = pivoting_ ? &GpuSparse::Gtsv2<Scalar>
: &GpuSparse::Gtsv2NoPivot<Scalar>;
OP_REQUIRES_OK(context, (cusparse_solver.get()->*solver_function)(
num_eqs, num_rhs, subdiag, diag, superdiag, rhs,
num_eqs, buffer));
@ -315,7 +315,7 @@ class TridiagonalSolveOpGpu : public OpKernel {
rhs.flat<Scalar>().size());
Scalar* x = output->flat<Scalar>().data();
std::unique_ptr<CudaSparse> cusparse_solver(new CudaSparse(context));
std::unique_ptr<GpuSparse> cusparse_solver(new GpuSparse(context));
OP_REQUIRES_OK(context, cusparse_solver->Initialize());
#if CUDA_VERSION < 9000
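
On CUDA 9.0 and newer, the Gtsv2 path above also queries a scratch-buffer size before solving. A condensed sketch of that call sequence (illustrative; the exact wrapper signatures are assumed from the calls shown above, and SolveWithGtsv2Sketch is a hypothetical helper):

// Illustrative sketch of the gtsv2 buffer-query-then-solve sequence.
template <typename Scalar>
Status SolveWithGtsv2Sketch(OpKernelContext* context, GpuSparse* solver,
                            bool pivoting, const Scalar* superdiag,
                            const Scalar* diag, const Scalar* subdiag,
                            Scalar* rhs, int num_eqs, int num_rhs) {
  auto buffer_fn = pivoting ? &GpuSparse::Gtsv2BufferSizeExt<Scalar>
                            : &GpuSparse::Gtsv2NoPivotBufferSizeExt<Scalar>;
  size_t buffer_size = 0;
  // Query how much scratch space gtsv2 needs for this problem size.
  TF_RETURN_IF_ERROR((solver->*buffer_fn)(num_eqs, num_rhs, subdiag, diag,
                                          superdiag, rhs, num_eqs,
                                          &buffer_size));
  Tensor temp_tensor;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DT_UINT8, TensorShape({static_cast<int64>(buffer_size)}), &temp_tensor));
  void* buffer = temp_tensor.flat<std::uint8_t>().data();
  auto solver_fn = pivoting ? &GpuSparse::Gtsv2<Scalar>
                            : &GpuSparse::Gtsv2NoPivot<Scalar>;
  // rhs is overwritten in place with the solution.
  return (solver->*solver_fn)(num_eqs, num_rhs, subdiag, diag, superdiag, rhs,
                              num_eqs, buffer);
}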

View File

@ -28,7 +28,6 @@ cuda_py_test(
size = "medium",
srcs = ["csr_sparse_matrix_test.py"],
main = "csr_sparse_matrix_test.py",
tags = ["no_rocm"],
deps = [
"//tensorflow/python/ops/linalg/sparse",
],
@ -40,7 +39,6 @@ cuda_py_test(
srcs = ["csr_sparse_matrix_ops_test.py"],
main = "csr_sparse_matrix_ops_test.py",
shard_count = 10,
tags = ["no_rocm"],
deps = [
"//tensorflow/python/ops/linalg/sparse",
"//tensorflow/python/ops/linalg/sparse:gen_sparse_csr_matrix_ops",
@ -53,7 +51,6 @@ cuda_py_test(
srcs = ["csr_sparse_matrix_grad_test.py"],
main = "csr_sparse_matrix_grad_test.py",
shard_count = 50,
tags = ["no_rocm"],
deps = [
"//tensorflow/python/ops/linalg/sparse",
],
@ -65,7 +62,6 @@ cuda_py_test(
srcs = ["csr_sparse_matrix_dense_mat_mul_grad_test.py"],
main = "csr_sparse_matrix_dense_mat_mul_grad_test.py",
shard_count = 50,
tags = ["no_rocm"],
deps = [
"//tensorflow/python/ops/linalg/sparse",
],
@ -77,7 +73,6 @@ cuda_py_test(
srcs = ["csr_sparse_matrix_sparse_mat_mul_grad_test.py"],
main = "csr_sparse_matrix_sparse_mat_mul_grad_test.py",
shard_count = 50,
tags = ["no_rocm"],
deps = [
"//tensorflow/python/ops/linalg/sparse",
],

View File

@ -106,7 +106,11 @@ class CSRSparseMatrixDenseMatMulGradTest(test.TestCase):
# These tests are refactored from sparse_csr_matrix_grad_test to keep its size
# "medium".
for dtype in (np.float32, np.complex64):
dtypes_to_test = [np.float32]
if not test.is_built_with_rocm():
# complex type is not supported on the ROCm platform
dtypes_to_test += [np.complex64]
for dtype in dtypes_to_test:
for (t_a, t_b, adj_a, adj_b, t_out,
conj_out) in itertools.product(*(([False, True],) * 6)):

View File

@ -84,6 +84,9 @@ class CSRSparseMatrixGradTest(test.TestCase):
if not self._gpu_available:
return
if test.is_built_with_rocm():
self.skipTest("sparse-matrix-add op not supported on ROCm")
sparsify = lambda m: m * (m > 0)
for dense_shape in ([53, 65, 127], [127, 65]):
a_mats_val = sparsify(np.random.randn(*dense_shape))

View File

@ -432,6 +432,9 @@ class CSRSparseMatrixOpsTest(test.TestCase):
if not self._gpu_available:
return
if test.is_built_with_rocm():
self.skipTest("sparse-matrix-add op not supported on ROCm")
a_indices = np.array([[0, 0], [2, 3]])
a_values = np.array([1.0, 5.0]).astype(np.float32)
a_dense_shape = [5, 6]
@ -469,6 +472,9 @@ class CSRSparseMatrixOpsTest(test.TestCase):
if not self._gpu_available:
return
if test.is_built_with_rocm():
self.skipTest("sparse-matrix-add op not supported on ROCm")
sparsify = lambda m: m * (m > 0)
dense_shape = [53, 65, 127]
a_mats = sparsify(np.random.randn(*dense_shape)).astype(np.float32)
@ -511,6 +517,9 @@ class CSRSparseMatrixOpsTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testSparseMatrixMatMulConjugateOutput(self):
if test.is_built_with_rocm():
self.skipTest("complex type not supported on ROCm")
for shapes in [[(5, 6), (6, 1)], [(5, 6), (6, 2)]]:
a_indices = np.array([[0, 0], [2, 3]])
a_values = np.array([1.0 + 1.j, 5.0 - 2.j]).astype(np.complex64)
@ -533,8 +542,19 @@ class CSRSparseMatrixOpsTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testLargeBatchSparseMatrixMatMul(self):
dtypes_to_test = [np.float32]
if not test.is_built_with_rocm():
# complex types are not supported on the ROCm platform
dtypes_to_test += [np.complex64]
if test.is_built_with_rocm():
# TODO(rocm): fix this
# This test is currently failing on the ROCm platform
# Re-enable it once the fix is available
self.skipTest("hipSPARSE all failure on the ROCm platform")
sparsify = lambda m: m * (m > 0)
for dtype in np.float32, np.complex64:
for dtype in dtypes_to_test:
for (transpose_a, transpose_b) in ((False, False), (False, True),
(True, False), (True, True)):
for (adjoint_a, adjoint_b) in ((False, False), (False, True),
@ -584,8 +604,19 @@ class CSRSparseMatrixOpsTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testLargeBatchSparseMatrixMatMulTransposed(self):
dtypes_to_test = [np.float32]
if not test.is_built_with_rocm():
# complex types are not supported on the ROCm platform
dtypes_to_test += [np.complex64]
if test.is_built_with_rocm():
# TODO(rocm): fix this
# This test is currently failing on the ROCm platform
# Re-enable it once the fix is available
self.skipTest("hipSPARSE all failure on the ROCm platform")
sparsify = lambda m: m * (m > 0)
for dtype in np.float32, np.complex64:
for dtype in dtypes_to_test:
for (transpose_a, transpose_b) in ((False, False), (False, True),
(True, False), (True, True)):
for (adjoint_a, adjoint_b) in ((False, False), (False, True),
@ -636,6 +667,10 @@ class CSRSparseMatrixOpsTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testLargeBatchSparseMatrixMatMulConjugate(self):
if test.is_built_with_rocm():
# complex types are not yet supported on the ROCm platform
self.skipTest("complex type not supported on ROCm")
sparsify = lambda m: m * (m > 0)
a_dense_shape = [53, 65, 127]
b_dense_shape = [53, 127, 67]
@ -767,6 +802,10 @@ class CSRSparseMatrixOpsTest(test.TestCase):
if not self._gpu_available:
return
if test.is_built_with_rocm():
# sparse-matrix-add op is not yet supported on the ROCm platform
self.skipTest("sparse-matrix-add op not supported on ROCm")
sparsify = lambda m: m * (m > 0)
dense_shape = [53, 65, 127]
matrices = [
@ -1154,9 +1193,10 @@ class CSRSparseMatrixOpsTest(test.TestCase):
] #
]).astype(np.complex128)
data_types = [
dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128
]
data_types = [dtypes.float32, dtypes.float64]
if not test.is_built_with_rocm():
# complex type is not supported on the ROCm platform
data_types += [dtypes.complex64, dtypes.complex128]
for dtype in data_types:
sparse_matrix = dense_to_csr_sparse_matrix(
math_ops.cast(dense_mat, dtype))

View File

@ -154,7 +154,11 @@ class SparseMatrixMatmulTest(test.TestCase):
sparsify = lambda m: m * (m > 0)
dense_shape_a = [5, 13, 7] if transpose_a or adjoint_a else [5, 7, 13]
dense_shape_b = [5, 15, 13] if transpose_b or adjoint_b else [5, 13, 15]
for dtype in np.float32, np.complex64:
dtypes_to_test = [np.float32]
if not test.is_built_with_rocm():
# complex type is not supported on the ROCm platform
dtypes_to_test += [np.complex64]
for dtype in dtypes_to_test:
a_mats = sparsify((np.random.randn(*dense_shape_a) +
1.j * np.random.randn(*dense_shape_a))).astype(dtype)
b_mats = sparsify((np.random.randn(*dense_shape_b) +
@ -194,7 +198,11 @@ class SparseMatrixMatmulTest(test.TestCase):
sparsify = lambda m: m * (m > 0)
dense_shape_a = [5, 13, 7] if transpose_a or adjoint_a else [5, 7, 13]
dense_shape_b = [5, 15, 13] if transpose_b or adjoint_b else [5, 13, 15]
for dtype in np.float32, np.complex64:
dtypes_to_test = [np.float32]
if not test.is_built_with_rocm():
# complex type is not supported on the ROCm platform
dtypes_to_test += [np.complex64]
for dtype in dtypes_to_test:
a_mats = sparsify((np.random.randn(*dense_shape_a) +
1.j * np.random.randn(*dense_shape_a))).astype(dtype)
b_mats = (np.random.randn(*dense_shape_b) +
@ -231,7 +239,11 @@ class SparseMatrixMatmulTest(test.TestCase):
sparsify = lambda m: m * (m > 0)
dense_shape_a = [5, 13, 7] if transpose_a or adjoint_a else [5, 7, 13]
dense_shape_b = [5, 15, 13] if transpose_b or adjoint_b else [5, 13, 15]
for dtype in np.float32, np.complex64:
dtypes_to_test = [np.float32]
if not test.is_built_with_rocm():
# complex type is not supported on the ROCm platform
dtypes_to_test += [np.complex64]
for dtype in dtypes_to_test:
a_mats = (np.random.randn(*dense_shape_a) +
1.j * np.random.randn(*dense_shape_a)).astype(dtype)
b_mats = sparsify((np.random.randn(*dense_shape_b) +

View File

@ -137,4 +137,11 @@ cc_library(
],
)
cc_import(
name = "hipsparse",
hdrs = glob(["rocm/include/hipsparse/**",]),
shared_library = "rocm/lib/%{hipsparse_lib}",
visibility = ["//visibility:public"],
)
%{copy_rules}

View File

@ -16,6 +16,6 @@ limitations under the License.
#ifndef ROCM_ROCM_CONFIG_H_
#define ROCM_ROCM_CONFIG_H_
#define TF_ROCM_TOOLKIT_PATH "/opt/rocm"
#define TF_ROCM_TOOLKIT_PATH "%{rocm_toolkit_path}"
#endif // ROCM_ROCM_CONFIG_H_

View File

@ -191,50 +191,50 @@ def _rocm_include_path(repository_ctx, rocm_config):
inc_dirs.append(rocm_config.rocm_toolkit_path + "/include")
# Add HSA headers
inc_dirs.append("/opt/rocm/hsa/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hsa/include")
# Add HIP headers
inc_dirs.append("/opt/rocm/include/hip")
inc_dirs.append("/opt/rocm/include/hip/hcc_detail")
inc_dirs.append("/opt/rocm/hip/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/include/hip")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/include/hip/hcc_detail")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hip/include")
# Add HIP-Clang headers
inc_dirs.append("/opt/rocm/llvm/lib/clang/8.0/include")
inc_dirs.append("/opt/rocm/llvm/lib/clang/9.0.0/include")
inc_dirs.append("/opt/rocm/llvm/lib/clang/10.0.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/8.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/9.0.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
# Add rocrand and hiprand headers
inc_dirs.append("/opt/rocm/rocrand/include")
inc_dirs.append("/opt/rocm/hiprand/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/rocrand/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hiprand/include")
# Add rocfft headers
inc_dirs.append("/opt/rocm/rocfft/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/rocfft/include")
# Add rocBLAS headers
inc_dirs.append("/opt/rocm/rocblas/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/rocblas/include")
# Add MIOpen headers
inc_dirs.append("/opt/rocm/miopen/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/miopen/include")
# Add RCCL headers
inc_dirs.append("/opt/rocm/rccl/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/rccl/include")
# Add hcc headers
inc_dirs.append("/opt/rocm/hcc/include")
inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/7.0.0/include/")
inc_dirs.append("/opt/rocm/hcc/lib/clang/7.0.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/7.0.0/include/")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/7.0.0/include")
# Newer hcc builds use/are based off of clang 8.0.0.
inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/8.0.0/include/")
inc_dirs.append("/opt/rocm/hcc/lib/clang/8.0.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/8.0.0/include/")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/8.0.0/include")
# Support hcc based off clang 9.0.0, included in ROCm2.2
inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/9.0.0/include/")
inc_dirs.append("/opt/rocm/hcc/lib/clang/9.0.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/9.0.0/include/")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/9.0.0/include")
# Support hcc based off clang 10.0.0, included in ROCm2.8
inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/10.0.0/include/")
inc_dirs.append("/opt/rocm/hcc/lib/clang/10.0.0/include")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")
return inc_dirs
@ -300,11 +300,12 @@ def _hipcc_env(repository_ctx):
repository_ctx.os.environ[name].strip() + "\";")
return hipcc_env.strip()
def _hipcc_is_hipclang(repository_ctx):
def _hipcc_is_hipclang(repository_ctx, rocm_config):
"""Returns if hipcc is based on hip-clang toolchain.
Args:
repository_ctx: The repository context.
rocm_config: The ROCm config, whose rocm_toolkit_path locates the HIP installation.
Returns:
A string "True" if hipcc is based on hip-clang toolchain.
@ -319,7 +320,7 @@ def _hipcc_is_hipclang(repository_ctx):
# grep for "HIP_COMPILER=clang" in /opt/rocm/hip/lib/.hipInfo
grep_result = _execute(
repository_ctx,
["grep", "HIP_COMPILER=clang", "/opt/rocm/hip/lib/.hipInfo"],
["grep", "HIP_COMPILER=clang", rocm_config.rocm_toolkit_path + "/hip/lib/.hipInfo"],
empty_stdout_fine = True,
)
result = grep_result.stdout.strip()
@ -327,13 +328,14 @@ def _hipcc_is_hipclang(repository_ctx):
return "True"
return "False"
def _if_hipcc_is_hipclang(repository_ctx, if_true, if_false = []):
def _if_hipcc_is_hipclang(repository_ctx, rocm_config, if_true, if_false = []):
"""
Returns either the if_true or if_false arg based on whether hipcc
is based on the hip-clang toolchain
Args :
repository_ctx: The repository context.
rocm_config: The ROCm config, whose rocm_toolkit_path locates the HIP installation.
if_true : value to return if hipcc is hip-clang based
if_false : value to return if hipcc is not hip-clang based
(optional, defaults to empty list)
@ -341,7 +343,7 @@ def _if_hipcc_is_hipclang(repository_ctx, if_true, if_false = []):
Returns :
either the if_true arg or the if_false arg
"""
if _hipcc_is_hipclang(repository_ctx) == "True":
if _hipcc_is_hipclang(repository_ctx, rocm_config) == "True":
return if_true
return if_false
@ -478,6 +480,11 @@ def _find_libs(repository_ctx, rocm_config):
repository_ctx,
rocm_config.rocm_toolkit_path + "/rccl",
),
"hipsparse": _find_rocm_lib(
"hipsparse",
repository_ctx,
rocm_config.rocm_toolkit_path + "/hipsparse",
),
}
def _get_rocm_config(repository_ctx):
@ -558,6 +565,7 @@ def _create_dummy_repository(repository_ctx):
"%{rccl_lib}": _lib_name("rccl"),
"%{rocfft_lib}": _lib_name("rocfft"),
"%{hiprand_lib}": _lib_name("hiprand"),
"%{hipsparse_lib}": _lib_name("hipsparse"),
"%{copy_rules}": "",
"%{rocm_headers}": "",
},
@ -703,6 +711,12 @@ def _create_local_rocm_repository(repository_ctx):
src_dir = rocm_toolkit_path + "/rccl/include",
out_dir = "rocm/include/rccl",
),
make_copy_dir_rule(
repository_ctx,
name = "hipsparse-include",
src_dir = rocm_toolkit_path + "/hipsparse/include",
out_dir = "rocm/include/hipsparse",
),
]
rocm_libs = _find_libs(repository_ctx, rocm_config)
@ -740,16 +754,19 @@ def _create_local_rocm_repository(repository_ctx):
"%{hiprand_lib}": rocm_libs["hiprand"].file_name,
"%{miopen_lib}": rocm_libs["miopen"].file_name,
"%{rccl_lib}": rocm_libs["rccl"].file_name,
"%{hipsparse_lib}": rocm_libs["hipsparse"].file_name,
"%{copy_rules}": "\n".join(copy_rules),
"%{rocm_headers}": ('":rocm-include",\n' +
'":rocfft-include",\n' +
'":rocblas-include",\n' +
'":miopen-include",\n' +
'":rccl-include",'),
'":rccl-include",\n' +
'":hipsparse-include",'),
},
)
# Set up crosstool/
cc = find_cc(repository_ctx)
host_compiler_includes = get_cxx_inc_directories(repository_ctx, cc)
@ -762,7 +779,7 @@ def _create_local_rocm_repository(repository_ctx):
rocm_defines["%{host_compiler_prefix}"] = host_compiler_prefix
rocm_defines["%{linker_bin_path}"] = "/opt/rocm/hcc/compiler/bin"
rocm_defines["%{linker_bin_path}"] = rocm_config.rocm_toolkit_path + "/hcc/compiler/bin"
# For gcc, do not canonicalize system header paths; some versions of gcc
# pick the shortest possible path for system includes when creating the
@ -775,7 +792,7 @@ def _create_local_rocm_repository(repository_ctx):
"-DTENSORFLOW_USE_ROCM=1",
"-D__HIP_PLATFORM_HCC__",
"-DEIGEN_USE_HIP",
] + _if_hipcc_is_hipclang(repository_ctx, [
] + _if_hipcc_is_hipclang(repository_ctx, rocm_config, [
#
# define "TENSORFLOW_COMPILER_IS_HIP_CLANG" when we are using clang
# based hipcc to compile/build tensorflow
@ -815,14 +832,14 @@ def _create_local_rocm_repository(repository_ctx):
"crosstool:clang/bin/crosstool_wrapper_driver_rocm",
{
"%{cpu_compiler}": str(cc),
"%{hipcc_path}": "/opt/rocm/bin/hipcc",
"%{hipcc_path}": rocm_config.rocm_toolkit_path + "/bin/hipcc",
"%{hipcc_env}": _hipcc_env(repository_ctx),
"%{hipcc_is_hipclang}": _hipcc_is_hipclang(repository_ctx),
"%{rocr_runtime_path}": "/opt/rocm/lib",
"%{hipcc_is_hipclang}": _hipcc_is_hipclang(repository_ctx, rocm_config),
"%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib",
"%{rocr_runtime_library}": "hsa-runtime64",
"%{hip_runtime_path}": "/opt/rocm/hip/lib",
"%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib",
"%{hip_runtime_library}": "hip_hcc",
"%{hcc_runtime_path}": "/opt/rocm/hcc/lib",
"%{hcc_runtime_path}": rocm_config.rocm_toolkit_path + "/hcc/lib",
"%{hcc_runtime_library}": "mcwamp",
"%{crosstool_verbose}": _crosstool_verbose(repository_ctx),
"%{gcc_host_compiler_path}": str(cc),

View File

@ -9,5 +9,5 @@ container_digests = {
"cuda10.1-cudnn7-centos6": "sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9",
"cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432",
"cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:f8e15f08cb501e5f2de3dc450f614609fd3ed19bde74b153fa66d14b2307610c",
"rocm-ubuntu16.04": "sha256:d5cd4120cff3d2a452378aad03746ff5f24699d86cf695c20ee96f366e42975f",
"rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe",
}

View File

@ -72,7 +72,7 @@ def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = No
docker_toolchain_autoconfig(
name = name,
base = base,
bazel_version = "0.29.1",
bazel_version = "1.2.1",
build_bazel_src = build_bazel_src,
config_repos = config_repos,
env = env,

View File

@ -15,6 +15,7 @@ cc_library(
name = "rocm_headers",
hdrs = [
"rocm/rocm_config.h",
":hipsparse-include",
":miopen-include",
":rccl-include",
":rocblas-include",
@ -141,6 +142,13 @@ cc_library(
],
)
cc_import(
name = "hipsparse",
hdrs = glob(["rocm/include/hipsparse/**"]),
shared_library = "rocm/lib/libhipsparse.so",
visibility = ["//visibility:public"],
)
genrule(
name = "rocm-include",
outs = [
@ -175,6 +183,7 @@ genrule(
"rocm/include/hcc/clang-c/CXErrorCode.h",
"rocm/include/hcc/clang-c/CXString.h",
"rocm/include/hcc/clang-c/Documentation.h",
"rocm/include/hcc/clang-c/FatalErrorHandler.h",
"rocm/include/hcc/clang-c/Index.h",
"rocm/include/hcc/clang-c/Platform.h",
"rocm/include/hcc/coordinate",
@ -275,12 +284,14 @@ genrule(
"rocm/include/hip/hcc_detail/hip_prof_str.h",
"rocm/include/hip/hcc_detail/hip_runtime.h",
"rocm/include/hip/hcc_detail/hip_runtime_api.h",
"rocm/include/hip/hcc_detail/hip_runtime_prof.h",
"rocm/include/hip/hcc_detail/hip_surface_types.h",
"rocm/include/hip/hcc_detail/hip_texture_types.h",
"rocm/include/hip/hcc_detail/hip_vector_types.h",
"rocm/include/hip/hcc_detail/hiprtc.h",
"rocm/include/hip/hcc_detail/host_defines.h",
"rocm/include/hip/hcc_detail/hsa_helpers.hpp",
"rocm/include/hip/hcc_detail/library_types.h",
"rocm/include/hip/hcc_detail/llvm_intrinsics.h",
"rocm/include/hip/hcc_detail/macro_based_grid_launch.hpp",
"rocm/include/hip/hcc_detail/math_functions.h",
@ -292,6 +303,7 @@ genrule(
"rocm/include/hip/hip_common.h",
"rocm/include/hip/hip_complex.h",
"rocm/include/hip/hip_cooperative_groups.h",
"rocm/include/hip/hip_ext.h",
"rocm/include/hip/hip_fp16.h",
"rocm/include/hip/hip_hcc.h",
"rocm/include/hip/hip_profile.h",
@ -300,6 +312,7 @@ genrule(
"rocm/include/hip/hip_texture_types.h",
"rocm/include/hip/hip_vector_types.h",
"rocm/include/hip/hiprtc.h",
"rocm/include/hip/library_types.h",
"rocm/include/hip/math_functions.h",
"rocm/include/hip/nvcc_detail/channel_descriptor.h",
"rocm/include/hip/nvcc_detail/hip_complex.h",
@ -441,7 +454,6 @@ genrule(
"rocm/include/ocml.h",
"rocm/include/opencl1.2-c.pch",
"rocm/include/opencl2.0-c.pch",
"rocm/include/profiler/CXLActivityLogger/CXLActivityLogger.h",
"rocm/include/rccl.h",
"rocm/include/rocalution.hpp",
"rocm/include/rocblas-auxiliary.h",
@ -583,6 +595,7 @@ genrule(
"rocm/include/rocrand/rocrand_xorwow.h",
"rocm/include/rocrand/rocrand_xorwow_precomputed.h",
"rocm/include/rocsparse-auxiliary.h",
"rocm/include/rocsparse-complex-types.h",
"rocm/include/rocsparse-export.h",
"rocm/include/rocsparse-functions.h",
"rocm/include/rocsparse-types.h",
@ -1468,6 +1481,16 @@ genrule(
cmd = """cp -rLf "/opt/rocm/rccl/include/." "$(@D)/" """,
)
genrule(
name = "hipsparse-include",
outs = [
"rocm/include/hipsparse/hipsparse-export.h",
"rocm/include/hipsparse/hipsparse-version.h",
"rocm/include/hipsparse/hipsparse.h",
],
cmd = """cp -rLf "/opt/rocm/hipsparse/include/." "$(@D)/rocm/include/hipsparse/" """,
)
genrule(
name = "rocm-lib",
outs = [
@ -1477,11 +1500,13 @@ genrule(
"rocm/lib/libhiprand.so",
"rocm/lib/libMIOpen.so",
"rocm/lib/librccl.so",
"rocm/lib/libhipsparse.so",
],
cmd = """cp -f "/opt/rocm/hip/lib/libhip_hcc.so" "$(location rocm/lib/libhip_hcc.so)" && \
cp -f "/opt/rocm/rocblas/lib/librocblas.so.0.1" "$(location rocm/lib/librocblas.so)" && \
cp -f "/opt/rocm/rocfft/lib/librocfft.so.0.1" "$(location rocm/lib/librocfft.so)" && \
cp -f "/opt/rocm/hiprand/lib/libhiprand.so.1.1" "$(location rocm/lib/libhiprand.so)" && \
cp -f "/opt/rocm/miopen/lib/libMIOpen.so.1" "$(location rocm/lib/libMIOpen.so)" && \
cp -f "/opt/rocm/rccl/lib/librccl.so" "$(location rocm/lib/librccl.so)" """,
cp -f "/opt/rocm/rccl/lib/librccl.so" "$(location rocm/lib/librccl.so)" && \
cp -f "/opt/rocm/hipsparse/lib/libhipsparse.so.0.1" "$(location rocm/lib/libhipsparse.so)" """,
)