[SE] Use absl mutex instead of TF mutex.
Fix layering violation. PiperOrigin-RevId: 245851033
This commit is contained in:
parent
9904676451
commit
481748b5c9
@ -93,6 +93,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
"@com_google_absl//absl/types:span",
|
||||
],
|
||||
@ -140,6 +141,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
"@com_google_absl//absl/types:span",
|
||||
],
|
||||
@ -152,6 +154,7 @@ cc_library(
|
||||
deps = [
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -199,6 +202,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
"@com_google_absl//absl/types:span",
|
||||
],
|
||||
@ -231,6 +235,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"//third_party/eigen3",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -259,6 +264,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -295,6 +301,7 @@ cc_library(
|
||||
":stream_header",
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -311,6 +318,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -399,6 +407,7 @@ cc_library(
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
"@com_google_absl//absl/types:span",
|
||||
],
|
||||
@ -440,10 +449,10 @@ cc_library(
|
||||
":plugin",
|
||||
":stream_executor_headers",
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -536,6 +545,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
"@com_google_absl//absl/types:span",
|
||||
],
|
||||
@ -612,6 +622,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
"@com_google_absl//absl/types:span",
|
||||
],
|
||||
|
@ -59,9 +59,11 @@ cc_library(
|
||||
"//tensorflow/stream_executor/platform",
|
||||
],
|
||||
) + tf_additional_cuda_platform_deps() + [
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
alwayslink = True, # Registers itself with the MultiPlatformManager.
|
||||
)
|
||||
@ -112,6 +114,7 @@ cc_library(
|
||||
"//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub"],
|
||||
"//conditions:default": ["//tensorflow/core:cuda"],
|
||||
}) + [
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
@ -223,7 +226,10 @@ cc_library(
|
||||
] + if_static(
|
||||
["@local_config_cuda//cuda:cublas"],
|
||||
[":cublas_stub"],
|
||||
)) + ["@com_google_absl//absl/strings:str_format"],
|
||||
)) + [
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
alwayslink = True,
|
||||
)
|
||||
|
||||
@ -306,7 +312,7 @@ cc_library(
|
||||
]) + tf_additional_cudnn_plugin_deps() + if_cuda_is_configured(if_static(
|
||||
["@local_config_cuda//cuda:cudnn"],
|
||||
[":cudnn_stub"],
|
||||
)),
|
||||
)) + ["@com_google_absl//absl/synchronization"],
|
||||
alwayslink = True,
|
||||
)
|
||||
|
||||
|
@ -402,7 +402,7 @@ template <typename FuncT, typename... Args>
|
||||
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
|
||||
bool pointer_mode_host, bool err_on_failure,
|
||||
bool use_tensor_op_math, Args... args) {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
CHECK(blas_ != nullptr);
|
||||
if (!SetStream(stream)) {
|
||||
|
@ -20,9 +20,9 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/blas.h"
|
||||
#include "tensorflow/stream_executor/host_or_device_scalar.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
@ -142,8 +142,8 @@ class CUDABlas : public blas::BlasSupport {
|
||||
const T &beta, DeviceMemory<T> *y, int incy,
|
||||
blas::ProfileResult *output_profile_result);
|
||||
|
||||
// mutex that guards the cuBLAS handle for this device.
|
||||
mutex mu_;
|
||||
// Guards the cuBLAS handle for this device.
|
||||
absl::Mutex mu_;
|
||||
|
||||
// GpuExecutor which instantiated this CUDABlas.
|
||||
// Immutable post-initialization.
|
||||
|
@ -133,8 +133,8 @@ class CudnnHandle {
|
||||
public:
|
||||
// Takes ownership of the executor context and the lock to access cuDNN
|
||||
// using handle.
|
||||
CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
|
||||
cudnnHandle_t handle)
|
||||
CudnnHandle(gpu::ScopedActivateExecutorContext context,
|
||||
std::unique_ptr<absl::MutexLock> lock, cudnnHandle_t handle)
|
||||
: context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
|
||||
|
||||
// Returns cuDNN handle. To be passed directly to cuDNN APIs, don't keep
|
||||
@ -143,7 +143,7 @@ class CudnnHandle {
|
||||
|
||||
private:
|
||||
gpu::ScopedActivateExecutorContext context_;
|
||||
mutex_lock lock_;
|
||||
std::unique_ptr<absl::MutexLock> lock_;
|
||||
cudnnHandle_t handle_; // Not owned.
|
||||
};
|
||||
|
||||
@ -160,7 +160,7 @@ class CudnnAccess {
|
||||
explicit CudnnAccess(cudnnHandle_t handle) : handle_(handle) {}
|
||||
|
||||
~CudnnAccess() {
|
||||
mutex_lock lock(mutex_);
|
||||
absl::MutexLock lock(&mutex_);
|
||||
cudnnDestroy(handle_);
|
||||
}
|
||||
|
||||
@ -181,7 +181,8 @@ class CudnnAccess {
|
||||
// therefore a bad idea (performance wise) to call any cuDNN APIs that
|
||||
// enqueue work in the stream.
|
||||
CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
|
||||
mutex_lock lock(mutex_);
|
||||
auto lock = absl::make_unique<absl::MutexLock>(&mutex_);
|
||||
mutex_.AssertHeld();
|
||||
gpu::ScopedActivateExecutorContext context(executor);
|
||||
CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
|
||||
const auto status = cudnnSetStream(handle_, cu_stream);
|
||||
@ -191,7 +192,7 @@ class CudnnAccess {
|
||||
|
||||
private:
|
||||
// Guards the enqueueing of cuDNN operations via the handle_ below.
|
||||
mutex mutex_;
|
||||
absl::Mutex mutex_;
|
||||
|
||||
// cuDNN library handle.
|
||||
cudnnHandle_t handle_ GUARDED_BY(mutex_); // Owned.
|
||||
|
@ -22,7 +22,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
|
||||
#include "tensorflow/stream_executor/dnn.h"
|
||||
#include "tensorflow/stream_executor/lib/status.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
#include "tensorflow/stream_executor/temporary_device_memory.h"
|
||||
|
@ -23,11 +23,13 @@ limitations under the License.
|
||||
#include <utility>
|
||||
|
||||
#include "absl/base/casts.h"
|
||||
#include "absl/base/const_init.h"
|
||||
#include "absl/container/inlined_vector.h"
|
||||
#include "absl/debugging/leak_check.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "absl/synchronization/notification.h"
|
||||
#include "cuda/include/cuda_runtime_api.h"
|
||||
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
|
||||
@ -38,7 +40,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/lib/static_threadlocal.h"
|
||||
#include "tensorflow/stream_executor/lib/threadpool.h"
|
||||
#include "tensorflow/stream_executor/platform/logging.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
|
||||
@ -65,14 +66,14 @@ class CreatedContexts {
|
||||
public:
|
||||
// Returns whether context is a member of the live set.
|
||||
static bool Has(CUcontext context) {
|
||||
tf_shared_lock lock(mu_);
|
||||
absl::ReaderMutexLock lock(&mu_);
|
||||
return Live()->find(context) != Live()->end();
|
||||
}
|
||||
|
||||
// Adds context to the live set, or returns it if it's already present.
|
||||
static GpuContext* Add(CUcontext context) {
|
||||
CHECK(context != nullptr);
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
auto insert_result = Live()->insert(std::make_pair(context, nullptr));
|
||||
auto it = insert_result.first;
|
||||
if (insert_result.second) {
|
||||
@ -85,7 +86,7 @@ class CreatedContexts {
|
||||
// Removes context from the live set.
|
||||
static void Remove(CUcontext context) {
|
||||
CHECK(context != nullptr);
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
auto it = Live()->find(context);
|
||||
CHECK(it != Live()->end()) << context;
|
||||
Live()->erase(it);
|
||||
@ -100,11 +101,11 @@ class CreatedContexts {
|
||||
}
|
||||
|
||||
// Lock that guards access-to/mutation-of the live set.
|
||||
static mutex mu_;
|
||||
static absl::Mutex mu_;
|
||||
static int64 next_id_;
|
||||
};
|
||||
|
||||
/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
|
||||
/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
|
||||
/* static */ int64 CreatedContexts::next_id_ = 1; // 0 means "no context"
|
||||
|
||||
// Formats CUresult to output prettified values into a log stream.
|
||||
@ -322,17 +323,10 @@ static port::Status InternalInit() {
|
||||
/* static */ port::Status GpuDriver::Init() {
|
||||
// Cached return value from calling InternalInit(), as cuInit need only be
|
||||
// called once, but GpuDriver::Init may be called many times.
|
||||
static port::Status init_retval;
|
||||
static bool set = false;
|
||||
static mutex* init_mu = new mutex;
|
||||
|
||||
mutex_lock lock(*init_mu);
|
||||
if (!set) {
|
||||
init_retval = InternalInit();
|
||||
set = true;
|
||||
}
|
||||
|
||||
return init_retval;
|
||||
static port::Status* init_retval = [] {
|
||||
return new port::Status(InternalInit());
|
||||
}();
|
||||
return *init_retval;
|
||||
}
|
||||
|
||||
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
|
||||
|
@ -272,7 +272,7 @@ bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
|
||||
VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
|
||||
|
||||
if (spec.has_cuda_cubin_in_memory()) {
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
kernelname = &spec.cuda_cubin_in_memory().kernelname();
|
||||
const char *cubin = spec.cuda_cubin_in_memory().bytes();
|
||||
if (!LoadModuleFromCuBin(cubin, &module)) {
|
||||
@ -295,7 +295,7 @@ bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
|
||||
return false;
|
||||
}
|
||||
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
if (!LoadModuleFromPtx(ptx, &module)) {
|
||||
return false;
|
||||
}
|
||||
@ -343,7 +343,7 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
|
||||
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
|
||||
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
|
||||
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
|
||||
if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
|
||||
VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
|
||||
@ -362,7 +362,7 @@ bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
|
||||
// ModuleHandle::id().
|
||||
CUmodule cu_module;
|
||||
if (spec.has_cuda_cubin_in_memory()) {
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
if (!LoadModuleFromCuBin(
|
||||
reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
|
||||
&cu_module)) {
|
||||
@ -380,7 +380,7 @@ bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
|
||||
return false;
|
||||
}
|
||||
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
|
||||
return false;
|
||||
}
|
||||
@ -394,7 +394,7 @@ bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
|
||||
|
||||
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
|
||||
const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
return UnloadGpuBinary(gpu_binary);
|
||||
}
|
||||
|
||||
@ -428,7 +428,7 @@ bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
|
||||
// whether we've done an occupancy check on this kernel before isn't free
|
||||
// (because we have to synchronize), so we only do this at -v 2+.
|
||||
if (VLOG_IS_ON(2)) {
|
||||
mutex_lock lock(launched_kernels_mu_);
|
||||
absl::MutexLock lock(&launched_kernels_mu_);
|
||||
if (!launched_kernels_.count(cufunc)) {
|
||||
VlogOccupancyInfo(kernel, thread_dims, block_dims);
|
||||
// TODO(rspringer): Remove elements from launched_kernels_...if we ever
|
||||
@ -892,7 +892,7 @@ bool GpuExecutor::GetSymbol(const string& symbol_name,
|
||||
};
|
||||
|
||||
{ // give limited scope to mutex_lock
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
if (static_cast<bool>(module_handle)) {
|
||||
auto it = gpu_binary_to_module_.find(module_handle.id());
|
||||
CHECK(it != gpu_binary_to_module_.end());
|
||||
|
@ -15,6 +15,7 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/stream_executor/cuda/cuda_platform.h"
|
||||
|
||||
#include "absl/base/const_init.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
@ -75,30 +76,25 @@ CudaPlatform::~CudaPlatform() {}
|
||||
void CudaPlatform::InspectNumaNodes() {
|
||||
// To get NUMA node information, we need to create all executors, so we can
|
||||
// examine their device descriptions to see their bus assignments.
|
||||
static bool initialized = false;
|
||||
static mutex numa_mutex(LINKER_INITIALIZED);
|
||||
mutex_lock lock(numa_mutex);
|
||||
if (initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
StreamExecutorConfig config;
|
||||
for (int i = 0; i < VisibleDeviceCount(); i++) {
|
||||
config.ordinal = i;
|
||||
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
|
||||
if (i == 0) {
|
||||
// NUMA nodes may not start at 0, so set the minimum node based on the
|
||||
// first executor we see.
|
||||
min_numa_node_ = exec->GetDeviceDescription().numa_node();
|
||||
limit_numa_node_ = min_numa_node_ + 1;
|
||||
} else {
|
||||
min_numa_node_ =
|
||||
std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
|
||||
limit_numa_node_ = std::max(limit_numa_node_,
|
||||
exec->GetDeviceDescription().numa_node() + 1);
|
||||
static std::once_flag once;
|
||||
std::call_once(once, [&] {
|
||||
StreamExecutorConfig config;
|
||||
for (int i = 0; i < VisibleDeviceCount(); i++) {
|
||||
config.ordinal = i;
|
||||
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
|
||||
if (i == 0) {
|
||||
// NUMA nodes may not start at 0, so set the minimum node based on the
|
||||
// first executor we see.
|
||||
min_numa_node_ = exec->GetDeviceDescription().numa_node();
|
||||
limit_numa_node_ = min_numa_node_ + 1;
|
||||
} else {
|
||||
min_numa_node_ =
|
||||
std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
|
||||
limit_numa_node_ = std::max(
|
||||
limit_numa_node_, exec->GetDeviceDescription().numa_node() + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
initialized = true;
|
||||
});
|
||||
}
|
||||
|
||||
int CudaPlatform::BusCount() {
|
||||
|
@ -24,7 +24,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/multi_platform_manager.h"
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_internal.h"
|
||||
|
@ -71,7 +71,7 @@ GpuRng::~GpuRng() {
|
||||
}
|
||||
|
||||
bool GpuRng::Init() {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
CHECK(rng_ == nullptr);
|
||||
|
||||
cuda::ScopedActivateExecutorContext sac(parent_);
|
||||
@ -106,7 +106,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
|
||||
|
||||
template <typename T>
|
||||
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
static_assert(ComplexIsConsecutiveFloats(),
|
||||
"std::complex values are not stored as consecutive values");
|
||||
|
||||
@ -164,7 +164,7 @@ bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
|
||||
ElemT stddev,
|
||||
DeviceMemory<ElemT>* v,
|
||||
FuncT func) {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
if (!SetStream(stream)) {
|
||||
return false;
|
||||
@ -197,7 +197,7 @@ bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
|
||||
}
|
||||
|
||||
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
CHECK(rng_ != nullptr);
|
||||
|
||||
if (!CheckSeed(seed, seed_bytes)) {
|
||||
|
@ -16,6 +16,7 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/executor_cache.h"
|
||||
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
|
||||
namespace stream_executor {
|
||||
|
||||
@ -32,7 +33,7 @@ port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
|
||||
|
||||
Entry* entry = nullptr;
|
||||
{
|
||||
mutex_lock lock{mutex_};
|
||||
absl::MutexLock lock{&mutex_};
|
||||
entry = &cache_[config.ordinal];
|
||||
// Release the map lock; the address of 'entry' is stable because
|
||||
// std::map guarantees reference stability.
|
||||
@ -41,7 +42,7 @@ port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
|
||||
// Acquire the per-Entry mutex without holding the map mutex. Initializing
|
||||
// an Executor may be expensive, so we want to allow concurrent
|
||||
// initialization of different entries.
|
||||
mutex_lock lock{entry->configurations_mutex};
|
||||
absl::MutexLock lock{&entry->configurations_mutex};
|
||||
for (const auto& iter : entry->configurations) {
|
||||
if (iter.first.plugin_config == config.plugin_config &&
|
||||
iter.first.device_options == config.device_options) {
|
||||
@ -66,7 +67,7 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
|
||||
const StreamExecutorConfig& config) {
|
||||
Entry* entry = nullptr;
|
||||
{
|
||||
tf_shared_lock lock{mutex_};
|
||||
absl::ReaderMutexLock lock{&mutex_};
|
||||
auto it = cache_.find(config.ordinal);
|
||||
if (it != cache_.end()) {
|
||||
entry = &it->second;
|
||||
@ -77,7 +78,7 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
|
||||
config.ordinal));
|
||||
}
|
||||
}
|
||||
tf_shared_lock lock{entry->configurations_mutex};
|
||||
absl::ReaderMutexLock lock{&entry->configurations_mutex};
|
||||
if (entry->configurations.empty()) {
|
||||
return port::Status(
|
||||
port::error::NOT_FOUND,
|
||||
@ -96,12 +97,12 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
|
||||
}
|
||||
|
||||
void ExecutorCache::DestroyAllExecutors() {
|
||||
mutex_lock lock{mutex_};
|
||||
absl::MutexLock lock{&mutex_};
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
ExecutorCache::Entry::~Entry() {
|
||||
mutex_lock lock{configurations_mutex};
|
||||
absl::MutexLock lock{&configurations_mutex};
|
||||
configurations.clear();
|
||||
}
|
||||
|
||||
|
@ -19,9 +19,9 @@ limitations under the License.
|
||||
#include <functional>
|
||||
#include <map>
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/lib/status.h"
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
|
||||
|
||||
namespace stream_executor {
|
||||
@ -54,11 +54,11 @@ class ExecutorCache {
|
||||
struct Entry {
|
||||
~Entry();
|
||||
|
||||
// Mutex that locks the contents of each entry. The 'mutex_' of the
|
||||
// Mutex that guards the contents of each entry. The 'mutex_' of the
|
||||
// ExecutorCache class protects both the 'cache_' and the existence of each
|
||||
// Entry, but not the Entry's contents. 'configurations_mutex' protects the
|
||||
// contents of the entry after 'mutex_' has been dropped.
|
||||
mutex configurations_mutex;
|
||||
absl::Mutex configurations_mutex;
|
||||
|
||||
// Vector of cached {config, executor} pairs.
|
||||
std::vector<
|
||||
@ -69,7 +69,7 @@ class ExecutorCache {
|
||||
// Maps ordinal number to a list of cached executors for that ordinal.
|
||||
// We key off of ordinal (instead of just looking up all fields in the
|
||||
// StreamExecutorConfig) for a slight improvement in lookup time.
|
||||
mutex mutex_;
|
||||
absl::Mutex mutex_;
|
||||
std::map<int, Entry> cache_ GUARDED_BY(mutex_);
|
||||
|
||||
SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
|
||||
|
@ -98,6 +98,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
@ -126,6 +127,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor:plugin_registry",
|
||||
"//tensorflow/stream_executor:rng",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -26,12 +26,12 @@ limitations under the License.
|
||||
#include <unordered_map>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/event.h"
|
||||
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
|
||||
#include "tensorflow/stream_executor/lib/status.h"
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_internal.h"
|
||||
@ -287,7 +287,7 @@ class GpuExecutor : public internal::StreamExecutorInterface {
|
||||
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
|
||||
|
||||
// Guards the on-disk-module mapping.
|
||||
mutex disk_modules_mu_;
|
||||
absl::Mutex disk_modules_mu_;
|
||||
|
||||
// Mapping from filename to GPUModuleHandle, if it was already retrieved.
|
||||
// Multiple GPUFunctionHandle are usually obtained from a single
|
||||
@ -296,7 +296,7 @@ class GpuExecutor : public internal::StreamExecutorInterface {
|
||||
std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
|
||||
|
||||
// Guards the in-memory-module mapping.
|
||||
mutex in_memory_modules_mu_;
|
||||
absl::Mutex in_memory_modules_mu_;
|
||||
|
||||
std::map<const char*, GpuModuleHandle> in_memory_modules_
|
||||
GUARDED_BY(in_memory_modules_mu_);
|
||||
@ -309,7 +309,7 @@ class GpuExecutor : public internal::StreamExecutorInterface {
|
||||
gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
|
||||
|
||||
// Guards the launched kernel set.
|
||||
mutex launched_kernels_mu_;
|
||||
absl::Mutex launched_kernels_mu_;
|
||||
|
||||
// Keeps track of the set of launched kernels. Currently used to suppress the
|
||||
// occupancy check on subsequent launches.
|
||||
|
@ -16,14 +16,13 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
|
||||
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/gpu/gpu_types.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
#include "tensorflow/stream_executor/rng.h"
|
||||
|
||||
#include "tensorflow/stream_executor/gpu/gpu_types.h"
|
||||
|
||||
namespace stream_executor {
|
||||
|
||||
class Stream;
|
||||
@ -83,8 +82,8 @@ class GpuRng : public rng::RngSupport {
|
||||
// with random number generation.
|
||||
bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
|
||||
|
||||
// mutex that guards the gpu rng library handle for this device.
|
||||
mutex mu_;
|
||||
// Guards the gpu rng library handle for this device.
|
||||
absl::Mutex mu_;
|
||||
|
||||
// GpuExecutor which instantiated this GpuRng.
|
||||
// Immutable post-initialization.
|
||||
|
@ -52,6 +52,7 @@ cc_library(
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
alwayslink = True, # Registers itself with the MultiPlatformManager.
|
||||
)
|
||||
@ -67,6 +68,7 @@ cc_library(
|
||||
deps = [
|
||||
"//tensorflow/stream_executor:kernel",
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -27,7 +27,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/multi_platform_manager.h"
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
|
||||
|
@ -38,15 +38,15 @@ bool HostStream::EnqueueTask(std::function<void()> task) {
|
||||
// should guarantee that all tasks are destroyed.
|
||||
task = std::function<void()>();
|
||||
{
|
||||
mutex_lock lock(stream->mu_);
|
||||
absl::MutexLock lock(&stream->mu_);
|
||||
--stream->pending_tasks_;
|
||||
}
|
||||
stream->completion_condition_.notify_all();
|
||||
stream->completion_condition_.SignalAll();
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
++pending_tasks_;
|
||||
}
|
||||
host_executor_->Schedule(NotifiedTask{this, std::move(task)});
|
||||
@ -54,9 +54,9 @@ bool HostStream::EnqueueTask(std::function<void()> task) {
|
||||
}
|
||||
|
||||
void HostStream::BlockUntilDone() {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
while (pending_tasks_ != 0) {
|
||||
completion_condition_.wait(lock);
|
||||
completion_condition_.Wait(&mu_);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@ limitations under the License.
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/lib/threadpool.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_internal.h"
|
||||
|
||||
@ -45,9 +46,9 @@ class HostStream : public internal::StreamInterface {
|
||||
static const int kExecutorThreads = 1;
|
||||
std::unique_ptr<port::ThreadPool> host_executor_;
|
||||
|
||||
mutex mu_;
|
||||
absl::Mutex mu_;
|
||||
int pending_tasks_ GUARDED_BY(mu_) = 0;
|
||||
condition_variable completion_condition_;
|
||||
absl::CondVar completion_condition_;
|
||||
};
|
||||
|
||||
} // namespace host
|
||||
|
@ -94,7 +94,7 @@ const char *CudaPtxInMemory::default_text() const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
auto ptx = ptx_by_compute_capability_.begin()->second;
|
||||
// Check if there is an entry in decompressed ptx table.
|
||||
@ -128,7 +128,7 @@ const char *CudaPtxInMemory::text(int compute_capability_major,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
// Check if there is an entry in decompressed ptx table.
|
||||
auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
|
||||
|
@ -47,13 +47,13 @@ limitations under the License.
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/logging.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
namespace stream_executor {
|
||||
@ -206,7 +206,7 @@ class CudaPtxInMemory : public KernelLoaderSpec {
|
||||
// Stores all decompressed ptx strings, with original ptx string as keys.
|
||||
// It is marked as mutable for lazy decompression.
|
||||
mutable std::map<const char *, string> decompressed_ptx_;
|
||||
mutable mutex mu_;
|
||||
mutable absl::Mutex mu_;
|
||||
|
||||
// Defines the minimum compute capability possible. Used when PTX has no
|
||||
// compute capability specified (in the single-PTX constructor).
|
||||
|
@ -16,7 +16,6 @@ cc_library(
|
||||
name = "platform",
|
||||
textual_hdrs = [
|
||||
"logging.h",
|
||||
"mutex.h",
|
||||
"platform.h",
|
||||
"port.h",
|
||||
"thread_annotations.h",
|
||||
|
@ -6,7 +6,6 @@ cc_library(
|
||||
name = "platform",
|
||||
textual_hdrs = [
|
||||
"initialize.h",
|
||||
"mutex.h",
|
||||
],
|
||||
deps = ["//tensorflow/core:lib"],
|
||||
)
|
||||
|
@ -20,13 +20,13 @@ limitations under the License.
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
|
||||
|
||||
#include <vector>
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/lib/status.h"
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
namespace stream_executor {
|
||||
namespace internal {
|
||||
|
@ -1,40 +0,0 @@
|
||||
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
|
||||
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
|
||||
namespace stream_executor {
|
||||
|
||||
#undef mutex_lock
|
||||
#undef tf_shared_lock
|
||||
|
||||
using tensorflow::ConditionResult;
|
||||
using tensorflow::WaitForMilliseconds;
|
||||
using tensorflow::condition_variable;
|
||||
using tensorflow::mutex;
|
||||
using tensorflow::mutex_lock;
|
||||
using tensorflow::tf_shared_lock;
|
||||
|
||||
#define mutex_lock(x) static_assert(0, "mutex_lock_decl_missing_var_name");
|
||||
#define tf_shared_lock(x) \
|
||||
static_assert(0, "tf_shared_lock_decl_missing_var_name");
|
||||
|
||||
} // namespace stream_executor
|
||||
|
||||
#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
|
@ -1,27 +0,0 @@
|
||||
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
|
||||
|
||||
#include "tensorflow/stream_executor/platform/platform.h"
|
||||
|
||||
#if defined(PLATFORM_GOOGLE)
|
||||
#include "tensorflow/stream_executor/platform/google/mutex.h"
|
||||
#else
|
||||
#include "tensorflow/stream_executor/platform/default/mutex.h"
|
||||
#endif
|
||||
|
||||
#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
|
@ -19,7 +19,6 @@ limitations under the License.
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
|
||||
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
|
||||
namespace stream_executor {
|
||||
@ -38,9 +37,6 @@ using tensorflow::uint64;
|
||||
using std::string;
|
||||
#endif
|
||||
|
||||
using tensorflow::LinkerInitialized;
|
||||
using tensorflow::LINKER_INITIALIZED;
|
||||
|
||||
#define SE_FALLTHROUGH_INTENDED TF_FALLTHROUGH_INTENDED
|
||||
|
||||
} // namespace stream_executor
|
||||
|
@ -15,8 +15,10 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
|
||||
#include "absl/base/const_init.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/lib/error.h"
|
||||
#include "tensorflow/stream_executor/multi_platform_manager.h"
|
||||
|
||||
@ -44,9 +46,9 @@ string PluginKindString(PluginKind plugin_kind) {
|
||||
PluginRegistry::DefaultFactories::DefaultFactories() :
|
||||
blas(kNullPlugin), dnn(kNullPlugin), fft(kNullPlugin), rng(kNullPlugin) { }
|
||||
|
||||
static mutex& GetPluginRegistryMutex() {
|
||||
static mutex* mu = new mutex;
|
||||
return *mu;
|
||||
static absl::Mutex& GetPluginRegistryMutex() {
|
||||
static absl::Mutex mu(absl::kConstInit);
|
||||
return mu;
|
||||
}
|
||||
|
||||
/* static */ PluginRegistry* PluginRegistry::instance_ = nullptr;
|
||||
@ -54,7 +56,7 @@ static mutex& GetPluginRegistryMutex() {
|
||||
PluginRegistry::PluginRegistry() {}
|
||||
|
||||
/* static */ PluginRegistry* PluginRegistry::Instance() {
|
||||
mutex_lock lock{GetPluginRegistryMutex()};
|
||||
absl::MutexLock lock{&GetPluginRegistryMutex()};
|
||||
if (instance_ == nullptr) {
|
||||
instance_ = new PluginRegistry();
|
||||
}
|
||||
@ -70,7 +72,7 @@ template <typename FACTORY_TYPE>
|
||||
port::Status PluginRegistry::RegisterFactoryInternal(
|
||||
PluginId plugin_id, const string& plugin_name, FACTORY_TYPE factory,
|
||||
std::map<PluginId, FACTORY_TYPE>* factories) {
|
||||
mutex_lock lock{GetPluginRegistryMutex()};
|
||||
absl::MutexLock lock{&GetPluginRegistryMutex()};
|
||||
|
||||
if (factories->find(plugin_id) != factories->end()) {
|
||||
return port::Status(
|
||||
|
@ -25,7 +25,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/lib/status.h"
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/plugin.h"
|
||||
#include "tensorflow/stream_executor/rng.h"
|
||||
|
||||
|
@ -382,7 +382,7 @@ template <typename FuncT, typename... Args>
|
||||
bool ROCMBlas::DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
|
||||
bool pointer_mode_host, bool err_on_failure,
|
||||
Args... args) {
|
||||
mutex_lock lock{mu_};
|
||||
absl::MutexLock lock{&mu_};
|
||||
|
||||
CHECK(blas_ != nullptr);
|
||||
if (!SetStream(stream)) {
|
||||
|
@ -20,8 +20,8 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/blas.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
@ -141,7 +141,7 @@ class ROCMBlas : public blas::BlasSupport {
|
||||
blas::ProfileResult *output_profile_result);
|
||||
|
||||
// mutex that guards the rocBLAS handle for this device.
|
||||
mutex mu_;
|
||||
absl::Mutex mu_;
|
||||
|
||||
// GpuExecutor which instantiated this ROCMBlas.
|
||||
// Immutable post-initialization.
|
||||
|
@ -98,8 +98,8 @@ class MIOpenHandle {
|
||||
public:
|
||||
// Takes ownership of the executor context and the lock to access MIOpen
|
||||
// using handle.
|
||||
MIOpenHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
|
||||
miopenHandle_t handle)
|
||||
MIOpenHandle(gpu::ScopedActivateExecutorContext context,
|
||||
std::unique_ptr<absl::MutexLock> lock, miopenHandle_t handle)
|
||||
: context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
|
||||
|
||||
// Returns MIOpen handle. To be passed directly to MIOpen APIs, don't keep
|
||||
@ -108,7 +108,7 @@ class MIOpenHandle {
|
||||
|
||||
private:
|
||||
gpu::ScopedActivateExecutorContext context_;
|
||||
mutex_lock lock_;
|
||||
std::unique_ptr<absl::MutexLock> lock_;
|
||||
miopenHandle_t handle_; // Not owned.
|
||||
};
|
||||
|
||||
@ -318,7 +318,7 @@ class CachedFusionPlans {
|
||||
miopenFusionPlanDescriptor_t* fusion_plan,
|
||||
miopenFusionDirection_t fusion_direction,
|
||||
miopenTensorDescriptor_t input_descriptor) {
|
||||
mutex_lock lock{cached_plans_mutex};
|
||||
absl::MutexLock lock{&cached_plans_mutex};
|
||||
|
||||
bool found_cached_plan = false;
|
||||
|
||||
@ -342,7 +342,7 @@ class CachedFusionPlans {
|
||||
|
||||
// Need to figure out the right place to call this routine.
|
||||
static void Clear() {
|
||||
mutex_lock lock{cached_plans_mutex};
|
||||
absl::MutexLock lock{&cached_plans_mutex};
|
||||
|
||||
for (auto it : cached_plans) {
|
||||
auto status = wrap::miopenDestroyFusionPlan(it.second);
|
||||
@ -359,19 +359,19 @@ class CachedFusionPlans {
|
||||
|
||||
// Is the Fusion plan corresponding to this hash unsupported.
|
||||
static bool IsUnsupportedFusionPlan(uint64 hash) {
|
||||
mutex_lock lock{cached_plans_mutex};
|
||||
absl::MutexLock lock{&cached_plans_mutex};
|
||||
return unsupported_plans.count(hash) > 0;
|
||||
}
|
||||
|
||||
// Mark the given hash value as corresponding to an unsupported fusion plan.
|
||||
static void MarkFusionPlanUnsupported(uint64 hash) {
|
||||
mutex_lock lock{cached_plans_mutex};
|
||||
absl::MutexLock lock{&cached_plans_mutex};
|
||||
unsupported_plans.insert(hash);
|
||||
}
|
||||
|
||||
private:
|
||||
// Mutex to guard access to all data within this class.
|
||||
static mutex cached_plans_mutex;
|
||||
static absl::Mutex cached_plans_mutex;
|
||||
|
||||
// Map of hash-value to MIOpen Fusion plan descriptors.
|
||||
// Need to be able share this across more than one stream and hence static.
|
||||
@ -382,7 +382,7 @@ class CachedFusionPlans {
|
||||
static std::set<uint64> unsupported_plans;
|
||||
};
|
||||
|
||||
mutex CachedFusionPlans::cached_plans_mutex;
|
||||
absl::Mutex CachedFusionPlans::cached_plans_mutex;
|
||||
std::map<uint64, miopenFusionPlanDescriptor_t> CachedFusionPlans::cached_plans;
|
||||
std::set<uint64> CachedFusionPlans::unsupported_plans;
|
||||
|
||||
@ -449,7 +449,7 @@ class MIOpenAccess {
|
||||
explicit MIOpenAccess(miopenHandle_t handle) : handle_(handle) {}
|
||||
|
||||
~MIOpenAccess() {
|
||||
mutex_lock lock(mutex_);
|
||||
absl::MutexLock lock(&mutex_);
|
||||
wrap::miopenDestroy(handle_);
|
||||
}
|
||||
|
||||
@ -468,7 +468,8 @@ class MIOpenAccess {
|
||||
// therefore a bad idea (performance wise) to call any MIOpen APIs that
|
||||
// enqueue work in the stream.
|
||||
MIOpenHandle GetHandle(GpuExecutor* executor, Stream* stream) {
|
||||
mutex_lock lock(mutex_);
|
||||
auto lock = absl::make_unique<absl::MutexLock>(&mutex_);
|
||||
mutex_.AssertHeld();
|
||||
gpu::ScopedActivateExecutorContext context(executor);
|
||||
hipStream_t hip_stream = stream ? AsGpuStreamValue(stream) : nullptr;
|
||||
auto status = wrap::miopenSetStream(handle_, hip_stream);
|
||||
@ -478,7 +479,7 @@ class MIOpenAccess {
|
||||
|
||||
private:
|
||||
// Guards the enqueueing of MIOpen operations via the handle_ below.
|
||||
mutex mutex_;
|
||||
absl::Mutex mutex_;
|
||||
|
||||
// MIOpen library handle.
|
||||
miopenHandle_t handle_ GUARDED_BY(mutex_); // Owned.
|
||||
|
@ -19,9 +19,9 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/dnn.h"
|
||||
#include "tensorflow/stream_executor/lib/status.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
#include "tensorflow/stream_executor/temporary_device_memory.h"
|
||||
|
@ -24,6 +24,7 @@ limitations under the License.
|
||||
#include "absl/container/inlined_vector.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "absl/synchronization/notification.h"
|
||||
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
|
||||
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
|
||||
@ -34,7 +35,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/lib/static_threadlocal.h"
|
||||
#include "tensorflow/stream_executor/lib/threadpool.h"
|
||||
#include "tensorflow/stream_executor/platform/logging.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
|
||||
|
||||
@ -305,17 +305,10 @@ static port::Status InternalInit() {
|
||||
/* static */ port::Status GpuDriver::Init() {
|
||||
// Cached return value from calling InternalInit(), as hipInit need only be
|
||||
// called once, but GpuDriver::Init may be called many times.
|
||||
static port::Status init_retval;
|
||||
static bool set = false;
|
||||
static mutex* init_mu = new mutex;
|
||||
|
||||
mutex_lock lock(*init_mu);
|
||||
if (!set) {
|
||||
init_retval = InternalInit();
|
||||
set = true;
|
||||
}
|
||||
|
||||
return init_retval;
|
||||
static port::Status* init_retval = [&] {
|
||||
init_retval = new Status(InternalInit());
|
||||
}();
|
||||
return *init_retval;
|
||||
}
|
||||
|
||||
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
|
||||
|
@ -114,7 +114,7 @@ GpuExecutor::~GpuExecutor() {
|
||||
}
|
||||
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
|
||||
const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
return UnloadGpuBinary(gpu_binary);
|
||||
}
|
||||
|
||||
@ -236,7 +236,7 @@ bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
|
||||
kernelname = &spec.cuda_cubin_in_memory().kernelname();
|
||||
|
||||
const char* hsaco = spec.cuda_cubin_in_memory().bytes();
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
module = in_memory_modules_[hsaco];
|
||||
|
||||
if (module == nullptr) {
|
||||
@ -294,7 +294,7 @@ bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
|
||||
// whether we've done an occupancy check on this kernel before isn't free
|
||||
// (because we have to synchronize), so we only do this at -v 2+.
|
||||
if (VLOG_IS_ON(2)) {
|
||||
mutex_lock lock(launched_kernels_mu_);
|
||||
absl::MutexLock lock(&launched_kernels_mu_);
|
||||
if (!launched_kernels_.count(hipfunc)) {
|
||||
VlogOccupancyInfo(kernel, thread_dims, block_dims);
|
||||
// TODO(rspringer): Remove elements from launched_kernels_...if we ever
|
||||
@ -366,7 +366,7 @@ bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
|
||||
hipModule_t hip_module = nullptr;
|
||||
// TODO(ROCm): Need generic term instead of cubin/cuda/ptx
|
||||
if (spec.has_cuda_cubin_in_memory()) {
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
if (!LoadModuleFromHsaco(
|
||||
reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
|
||||
&hip_module)) {
|
||||
@ -765,8 +765,8 @@ bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
|
||||
bool GpuExecutor::GetSymbol(const string& symbol_name,
|
||||
ModuleHandle module_handle, void** mem,
|
||||
size_t* bytes) {
|
||||
{ // give limited scope to mutex_lock
|
||||
mutex_lock lock{disk_modules_mu_};
|
||||
{ // give limited scope to lock
|
||||
absl::MutexLock lock{&disk_modules_mu_};
|
||||
for (auto& it : disk_modules_) {
|
||||
if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
|
||||
reinterpret_cast<hipDeviceptr_t*>(mem),
|
||||
@ -776,8 +776,8 @@ bool GpuExecutor::GetSymbol(const string& symbol_name,
|
||||
}
|
||||
}
|
||||
|
||||
{ // give limited scope to mutex_lock
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
{ // give limited scope to lock
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
for (auto& it : in_memory_modules_) {
|
||||
if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
|
||||
reinterpret_cast<hipDeviceptr_t*>(mem),
|
||||
@ -787,8 +787,8 @@ bool GpuExecutor::GetSymbol(const string& symbol_name,
|
||||
}
|
||||
}
|
||||
|
||||
{ // give limited scope to mutex_lock
|
||||
mutex_lock lock{in_memory_modules_mu_};
|
||||
{ // give limited scope to lock
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
if (static_cast<bool>(module_handle)) {
|
||||
auto it = gpu_binary_to_module_.find(module_handle.id());
|
||||
CHECK(it != gpu_binary_to_module_.end());
|
||||
|
@ -38,30 +38,25 @@ ROCmPlatform::~ROCmPlatform() {}
|
||||
void ROCmPlatform::InspectNumaNodes() {
|
||||
// To get NUMA node information, we need to create all executors, so we can
|
||||
// examine their device descriptions to see their bus assignments.
|
||||
static bool initialized = false;
|
||||
static mutex numa_mutex(LINKER_INITIALIZED);
|
||||
mutex_lock lock(numa_mutex);
|
||||
if (initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
StreamExecutorConfig config;
|
||||
for (int i = 0; i < VisibleDeviceCount(); i++) {
|
||||
config.ordinal = i;
|
||||
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
|
||||
if (i == 0) {
|
||||
// NUMA nodes may not start at 0, so set the minimum node based on the
|
||||
// first executor we see.
|
||||
min_numa_node_ = exec->GetDeviceDescription().numa_node();
|
||||
limit_numa_node_ = min_numa_node_ + 1;
|
||||
} else {
|
||||
min_numa_node_ =
|
||||
std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
|
||||
limit_numa_node_ = std::max(limit_numa_node_,
|
||||
exec->GetDeviceDescription().numa_node() + 1);
|
||||
std::once_flag once;
|
||||
std::call_once(once, [&] {
|
||||
StreamExecutorConfig config;
|
||||
for (int i = 0; i < VisibleDeviceCount(); i++) {
|
||||
config.ordinal = i;
|
||||
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
|
||||
if (i == 0) {
|
||||
// NUMA nodes may not start at 0, so set the minimum node based on the
|
||||
// first executor we see.
|
||||
min_numa_node_ = exec->GetDeviceDescription().numa_node();
|
||||
limit_numa_node_ = min_numa_node_ + 1;
|
||||
} else {
|
||||
min_numa_node_ =
|
||||
std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
|
||||
limit_numa_node_ = std::max(
|
||||
limit_numa_node_, exec->GetDeviceDescription().numa_node() + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
initialized = true;
|
||||
});
|
||||
}
|
||||
|
||||
int ROCmPlatform::BusCount() {
|
||||
|
@ -19,11 +19,11 @@ limitations under the License.
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/stream_executor/executor_cache.h"
|
||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||
#include "tensorflow/stream_executor/multi_platform_manager.h"
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_internal.h"
|
||||
@ -90,7 +90,7 @@ class ROCmPlatform : public Platform {
|
||||
string name_;
|
||||
|
||||
// mutex that guards internal state.
|
||||
mutable mutex mu_;
|
||||
mutable absl::Mutex mu_;
|
||||
|
||||
// Cache of created executors.
|
||||
ExecutorCache executor_cache_;
|
||||
|
@ -126,7 +126,7 @@ GpuRng::~GpuRng() {
|
||||
}
|
||||
|
||||
bool GpuRng::Init() {
|
||||
mutex_lock lock{mu_};
|
||||
absl::MutexLock lock{&mu_};
|
||||
CHECK(rng_ == nullptr);
|
||||
|
||||
hiprandStatus_t ret =
|
||||
@ -161,7 +161,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
|
||||
|
||||
template <typename T>
|
||||
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
|
||||
mutex_lock lock{mu_};
|
||||
absl::MutexLock lock{&mu_};
|
||||
static_assert(ComplexIsConsecutiveFloats(),
|
||||
"std::complex values are not stored as consecutive values");
|
||||
|
||||
@ -220,7 +220,7 @@ bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
|
||||
ElemT stddev,
|
||||
DeviceMemory<ElemT>* v,
|
||||
FuncT func) {
|
||||
mutex_lock lock{mu_};
|
||||
absl::MutexLock lock{&mu_};
|
||||
|
||||
if (!SetStream(stream)) {
|
||||
return false;
|
||||
@ -252,7 +252,7 @@ bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
|
||||
}
|
||||
|
||||
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
|
||||
mutex_lock lock{mu_};
|
||||
absl::MutexLock lock{&mu_};
|
||||
CHECK(rng_ != nullptr);
|
||||
|
||||
if (!CheckSeed(seed, seed_bytes)) {
|
||||
|
@ -290,7 +290,7 @@ port::Status Stream::RefreshStatus() {
|
||||
Stream &Stream::Init() {
|
||||
VLOG_CALL();
|
||||
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
CHECK_EQ(false, allocated_)
|
||||
<< "stream appears to already have been initialized";
|
||||
CHECK(!ok_) << "stream should be in !ok() state pre-initialization";
|
||||
@ -1736,7 +1736,7 @@ Stream &Stream::ThenMemcpyH2DQuantized(
|
||||
}
|
||||
|
||||
Stream *Stream::GetOrCreateSubStream() {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
// Look for the first reusable sub_stream that is ok, dropping !ok sub_streams
|
||||
// we encounter along the way.
|
||||
@ -1783,7 +1783,7 @@ Stream *Stream::GetOrCreateSubStream() {
|
||||
}
|
||||
|
||||
void Stream::ReturnSubStream(Stream *sub_stream) {
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
// Look for the sub-stream.
|
||||
for (int64 index = 0; index < sub_streams_.size(); ++index) {
|
||||
@ -5285,7 +5285,7 @@ void Stream::CheckStatus(port::Status status) {
|
||||
return;
|
||||
}
|
||||
LOG(ERROR) << status;
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
ok_ = false;
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@ limitations under the License.
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/stream_executor/blas.h"
|
||||
#include "tensorflow/stream_executor/device_memory.h"
|
||||
@ -35,7 +36,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/kernel.h"
|
||||
#include "tensorflow/stream_executor/launch_dim.h"
|
||||
#include "tensorflow/stream_executor/lib/array_slice.h"
|
||||
#include "tensorflow/stream_executor/platform/mutex.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/platform/thread_annotations.h"
|
||||
#include "tensorflow/stream_executor/temporary_memory_manager.h"
|
||||
@ -1964,7 +1964,7 @@ class Stream {
|
||||
friend class ocl::CLBlas; // for parent_.
|
||||
|
||||
bool InErrorState() const LOCKS_EXCLUDED(mu_) {
|
||||
tf_shared_lock lock(mu_);
|
||||
absl::ReaderMutexLock lock(&mu_);
|
||||
return !ok_;
|
||||
}
|
||||
|
||||
@ -1974,7 +1974,7 @@ class Stream {
|
||||
if (operation_retcode) {
|
||||
return;
|
||||
}
|
||||
mutex_lock lock(mu_);
|
||||
absl::MutexLock lock(&mu_);
|
||||
ok_ = false;
|
||||
}
|
||||
|
||||
@ -1998,7 +1998,7 @@ class Stream {
|
||||
|
||||
// mutex that guards the allocation / error state flags.
|
||||
// Mutable so that it can be obtained via const reader lock.
|
||||
mutable mutex mu_;
|
||||
mutable absl::Mutex mu_;
|
||||
|
||||
// Whether Init() was successfully called to allocate this stream on the
|
||||
// underlying platform. It simply flips from 0 to 1 with a sanity check.
|
||||
|
@ -23,6 +23,7 @@ limitations under the License.
#include <memory>
#include <utility>

#include "absl/base/const_init.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/notification.h"
@ -92,7 +93,7 @@ class ScopedTracer {
void Trace(CallbackT callback, TraceArgsT... args) {
{
// Instance tracers held in a block to limit the lock lifetime.
tf_shared_lock lock{stream_exec_->mu_};
absl::ReaderMutexLock lock{&stream_exec_->mu_};
for (TraceListener *listener : stream_exec_->listeners_) {
(listener->*callback)(correlation_id_,
std::forward<TraceArgsT>(args)...);
@ -121,7 +122,7 @@ MakeScopedTracer(StreamExecutor *stream_exec, BeginCallT begin_call,
auto tracer = MakeScopedTracer(this, &LOC ## Begin, \
&LOC ## Complete, ## __VA_ARGS__);

/* static */ mutex StreamExecutor::static_mu_{LINKER_INITIALIZED};
/* static */ absl::Mutex StreamExecutor::static_mu_{absl::kConstInit};

// Get per-device memory limit in bytes. Returns 0 if
// TF_PER_DEVICE_MEMORY_LIMIT_MB environment variable is not set.
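absl::kConstInit replaces the linker-initialized TF mutex for objects with static storage duration: the mutex is constant-initialized, so it is safe to lock before any dynamic initializers run. A minimal sketch of the pattern, using a hypothetical id counter rather than static_mu_:

#include "absl/base/const_init.h"
#include "absl/synchronization/mutex.h"

// Constant-initialized at compile time, so it can be locked safely even from
// other static initializers that run before main().
static absl::Mutex counter_mu(absl::kConstInit);
static int counter = 0;

int NextId() {
  absl::MutexLock lock(&counter_mu);
  return ++counter;
}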
@ -217,7 +218,7 @@ void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
}

void StreamExecutor::GetMemAllocs(std::map<void *, AllocRecord> *records_out) {
tf_shared_lock lock(mu_);
absl::ReaderMutexLock lock(&mu_);
*records_out = mem_allocs_;
}

@ -247,7 +248,7 @@ port::Status StreamExecutor::SetDeviceSharedMemoryConfig(
}

const DeviceDescription &StreamExecutor::GetDeviceDescription() const {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (device_description_ != nullptr) {
return *device_description_;
}
@ -393,7 +394,7 @@ StreamExecutor::createRnnStateTensorDescriptor(int num_layer, int batch_size,
}

dnn::DnnSupport *StreamExecutor::AsDnn() {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (dnn_ != nullptr) {
return dnn_.get();
}
@ -403,7 +404,7 @@ dnn::DnnSupport *StreamExecutor::AsDnn() {
}

blas::BlasSupport *StreamExecutor::AsBlas() {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (blas_ != nullptr) {
return blas_.get();
}
@ -413,7 +414,7 @@ blas::BlasSupport *StreamExecutor::AsBlas() {
}

fft::FftSupport *StreamExecutor::AsFft() {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (fft_ != nullptr) {
return fft_.get();
}
@ -423,7 +424,7 @@ fft::FftSupport *StreamExecutor::AsFft() {
}

rng::RngSupport *StreamExecutor::AsRng() {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (rng_ != nullptr) {
return rng_.get();
}
@ -781,7 +782,7 @@ void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {

void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
if (FLAGS_check_device_leaks && opaque != nullptr && bytes != 0) {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
mem_allocs_[opaque] = AllocRecord{
bytes, ""};
mem_alloc_bytes_ += bytes;
@ -790,7 +791,7 @@ void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {

void StreamExecutor::EraseAllocRecord(void *opaque) {
if (FLAGS_check_device_leaks && opaque != nullptr) {
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (mem_allocs_.find(opaque) == mem_allocs_.end()) {
LOG(ERROR) << "Deallocating unknown pointer: " << opaque;
} else {
@ -804,7 +805,7 @@ void StreamExecutor::EnableTracing(bool enabled) { tracing_enabled_ = enabled; }

void StreamExecutor::RegisterTraceListener(TraceListener *listener) {
{
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (listeners_.find(listener) != listeners_.end()) {
LOG(INFO) << "Attempt to register already-registered listener, "
<< listener;
@ -818,7 +819,7 @@ void StreamExecutor::RegisterTraceListener(TraceListener *listener) {

bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) {
{
mutex_lock lock(mu_);
absl::MutexLock lock(&mu_);
if (listeners_.find(listener) == listeners_.end()) {
LOG(INFO) << "Attempt to unregister unknown listener, " << listener;
return false;
@ -839,7 +840,7 @@ void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) {
if (tracing_enabled_) {
{
// instance tracers held in a block to limit the lock lifetime.
tf_shared_lock lock(mu_);
absl::ReaderMutexLock lock(&mu_);
for (TraceListener *listener : listeners_) {
(listener->*trace_call)(std::forward<ArgsT>(args)...);
}
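SubmitTrace and Trace above both hold the reader lock only inside an inner block, so the listener iteration runs under the shared lock while any follow-up work runs unlocked. A minimal sketch of that shape, with a hypothetical Notifier class standing in for StreamExecutor:

#include <functional>
#include <utility>
#include <vector>

#include "absl/synchronization/mutex.h"

class Notifier {
 public:
  void AddListener(std::function<void(int)> cb) {
    absl::MutexLock lock(&mu_);
    listeners_.push_back(std::move(cb));
  }

  void Notify(int event) {
    {
      // Shared lock held only for the iteration, mirroring the block scoping
      // used in SubmitTrace.
      absl::ReaderMutexLock lock(&mu_);
      for (const auto& cb : listeners_) cb(event);
    }
    // Any post-notification work proceeds without holding mu_.
  }

 private:
  mutable absl::Mutex mu_;
  std::vector<std::function<void(int)>> listeners_;
};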
@ -23,13 +23,13 @@ limitations under the License.
#include <vector>

#include "absl/base/macros.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/optional.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/lib/threadpool.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/rng.h"
@ -619,13 +619,13 @@ class StreamExecutor {
void SubmitTrace(TraceCallT trace_call, ArgsT&&... args);

// Reader/writer lock for class-static StreamExecutor members.
static mutex static_mu_;
static absl::Mutex static_mu_;

// Reader/writer lock for mutable data structures on this StreamExecutor.
//
// Mutable so that caching functions (like DeviceDescription, AsBlas, etc.)
// can acquire the lock on their first (mutating) call as well.
mutable mutex mu_;
mutable absl::Mutex mu_;

// Reference to the platform that created this executor.
const Platform *platform_;
@ -25,7 +25,7 @@ namespace stream_executor {
namespace internal {

void TemporaryMemoryManager::ForceDeallocateAll() {
mutex_lock lock(mutex_);
absl::MutexLock lock(&mutex_);
VLOG(1) << "force-deallocating " << records_.size() << " remaining records";
for (auto it = records_.begin(); it != records_.end(); ++it) {
DeviceMemoryBase device_memory = it->first;
@ -35,7 +35,7 @@ void TemporaryMemoryManager::ForceDeallocateAll() {

void TemporaryMemoryManager::MarkFinalized(
const DeviceMemoryBase& device_memory, uint64 generation, bool must_exist) {
mutex_lock lock(mutex_);
absl::MutexLock lock(&mutex_);
auto it = records_.find(device_memory);
if (it == records_.end()) {
if (must_exist) {
@ -48,7 +48,7 @@ void TemporaryMemoryManager::MarkFinalized(
}

void TemporaryMemoryManager::DeallocateFinalizedTemporaries() {
mutex_lock lock(mutex_);
absl::MutexLock lock(&mutex_);
int deallocated_count = 0;
for (auto it = records_.begin(); it != records_.end();) {
if (it->second.finalized) {
@ -65,7 +65,7 @@ void TemporaryMemoryManager::DeallocateFinalizedTemporaries() {

bool TemporaryMemoryManager::IsFinalized(const DeviceMemoryBase& device_memory,
uint64 allocation_generation) const {
mutex_lock lock(mutex_);
absl::MutexLock lock(&mutex_);
auto it = records_.find(device_memory);
if (it == records_.end()) {
return true; // If there's no record present it's vacuously finalized.
@ -81,7 +81,7 @@ bool TemporaryMemoryManager::IsFinalized(const DeviceMemoryBase& device_memory,

bool TemporaryMemoryManager::HasAllocated(const DeviceMemoryBase& device_memory,
uint64 generation) const {
mutex_lock lock(mutex_);
absl::MutexLock lock(&mutex_);
auto it = records_.find(device_memory);
if (it == records_.end()) {
return false;
@ -106,7 +106,7 @@ TemporaryMemoryManager::AllocateArrayBase(uint64 element_count,
// Add the record before instantiating the device memory instance so we can
// check the allocation invariant at TemporaryDeviceMemory construction time.
{
mutex_lock lock(mutex_);
absl::MutexLock lock(&mutex_);
generation = ++generation_;
DCHECK(records_.find(device_memory) == records_.end());
records_[device_memory] = {generation,
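TemporaryMemoryManager serializes every access to records_ behind an exclusive lock, including the pass in DeallocateFinalizedTemporaries() that erases finalized entries while iterating. A minimal sketch of that erase-under-lock loop, with a hypothetical RecordTable class and RecordMap type in place of the real record bookkeeping:

#include <cstdint>
#include <map>

#include "absl/base/thread_annotations.h"
#include "absl/synchronization/mutex.h"

struct Record {
  bool finalized = false;
};
using RecordMap = std::map<std::uintptr_t, Record>;

class RecordTable {
 public:
  void MarkFinalized(std::uintptr_t key) {
    absl::MutexLock lock(&mutex_);
    records_[key].finalized = true;
  }

  // Erase finalized entries in one pass; the exclusive lock keeps the map
  // stable while the iterator is advanced past erased elements.
  int DeallocateFinalized() {
    absl::MutexLock lock(&mutex_);
    int deallocated = 0;
    for (auto it = records_.begin(); it != records_.end();) {
      if (it->second.finalized) {
        it = records_.erase(it);
        ++deallocated;
      } else {
        ++it;
      }
    }
    return deallocated;
  }

 private:
  mutable absl::Mutex mutex_;
  RecordMap records_ ABSL_GUARDED_BY(mutex_);
};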
@ -24,10 +24,10 @@ limitations under the License.
#include <map>
#include <memory>

#include "absl/synchronization/mutex.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/temporary_device_memory.h"

@ -108,7 +108,7 @@ class TemporaryMemoryManager {
uint64 element_count, uint64 element_size);

// Mutex to guard temporary record state.
mutable mutex mutex_;
mutable absl::Mutex mutex_;

// Mapping from device memory to the current (live) temporary memory record.
//