From 642db2faf55e3ca7acd06ea236e9d47f63190718 Mon Sep 17 00:00:00 2001
From: Anna R
Date: Tue, 11 Aug 2020 19:56:45 -0700
Subject: [PATCH] Remove SharedMemoryConfig since it is not used anywhere.

PiperOrigin-RevId: 326154532
Change-Id: I13be21f577226c48c7e0d5bfc7efb787f0422e85
---
 .../xla/service/interpreter/executor.h        | 10 --
 tensorflow/stream_executor/BUILD              | 12 ---
 .../stream_executor/cuda/cuda_gpu_executor.cc | 98 ++++++-------------
 tensorflow/stream_executor/gpu/gpu_executor.h |  4 -
 .../stream_executor/host/host_gpu_executor.h  | 14 ---
 .../stream_executor/rocm/rocm_gpu_executor.cc | 71 +++-----------
 .../stream_executor/shared_memory_config.h    | 34 -------
 .../stream_executor_internal.h                |  4 -
 .../stream_executor/stream_executor_pimpl.cc  | 19 +---
 .../stream_executor/stream_executor_pimpl.h   | 23 ++---
 tensorflow/stream_executor/tpu/tpu_executor.h | 12 +--
 11 files changed, 54 insertions(+), 247 deletions(-)
 delete mode 100644 tensorflow/stream_executor/shared_memory_config.h

diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index 9e4bdeb2b2d..9416b11a07e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/rng.h"
-#include "tensorflow/stream_executor/shared_memory_config.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@@ -182,15 +181,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
     return true;
   }
 
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
-    return SharedMemoryConfig::kDefault;
-  }
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
-    return port::Status{port::error::UNIMPLEMENTED,
-                        "Shared memory not supported"};
-  }
-
   std::unique_ptr<internal::EventInterface> CreateEventImplementation()
       override {
     return nullptr;
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 871576f6cef..22aa60a70a4 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -67,7 +67,6 @@ cc_library(
         "plugin.h",
         "plugin_registry.h",
         "rng.h",
-        "shared_memory_config.h",
         "stream_executor_pimpl.h",
         "temporary_device_memory.h",
         "temporary_memory_manager.h",
@@ -123,7 +122,6 @@ cc_library(
         "multi_platform_manager.h",
         "platform.h",
         "plugin_registry.h",
-        "shared_memory_config.h",
        "stream_executor.h",
        "stream_executor_internal.h",
        "timer.h",
@@ -173,11 +171,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "shared_memory_config",
-    hdrs = ["shared_memory_config.h"],
-)
-
 # Aliases for backwards compatibility.
 alias(
     name = "stream_header",
@@ -343,7 +336,6 @@ cc_library(
         "kernel_cache_config.h",
         "kernel_spec.h",
         "platform.h",
-        "shared_memory_config.h",
         "stream.h",
         "stream_executor_internal.h",
         "trace_listener.h",
@@ -455,7 +447,6 @@ cc_library(
         "stream_executor_internal.cc",
     ],
     hdrs = [
-        "shared_memory_config.h",
        "stream_executor_internal.h",
     ],
     deps = [
@@ -484,7 +475,6 @@ cc_library(
         "dnn.h",
         "kernel.h",
         "kernel_cache_config.h",
-        "shared_memory_config.h",
         "stream_executor_pimpl.h",
     ],
     visibility = ["//visibility:public"],
@@ -569,7 +559,6 @@ cc_library(
         "plugin.h",
         "plugin_registry.h",
         "rng.h",
-        "shared_memory_config.h",
         "stream.h",
         "stream_executor.h",
         "stream_executor_internal.h",
@@ -619,7 +608,6 @@ cc_library(
         "plugin.h",
         "plugin_registry.h",
         "rng.h",
-        "shared_memory_config.h",
         "stream.h",
         "stream_executor.h",
         "stream_executor_internal.h",
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 79a027f1255..d649d00ded9 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -101,12 +101,12 @@ static GpuTimer* AsGpuTimer(Timer* timer) {
 // N.B. we must lose constness in order to pass a suitable type to the existing
 // libcuda APIs, so the caller should take care to only pass the result of const
 // GPU memory conversions to libcuda functions which will honor constness.
-static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) {
+static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
   return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
 }
 
 // See description on const version above.
-static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
+static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
   return AsCudaDevicePtr(*gpu_mem);
 }
 
@@ -225,11 +225,11 @@ port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
   if (*module == nullptr) {
     TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
     module_refcount = 1;
-    VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
             << " as module " << *module;
   } else {
     ++module_refcount;
-    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
             << " is already loaded as module " << *module;
   }
   gpu_binary_to_module_[cubin] = {*module, module_refcount};
@@ -242,12 +242,12 @@ port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
   if (*module == nullptr) {
     TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
-    VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
             << *module;
     module_refcount = 1;
   } else {
     ++module_refcount;
-    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
             << " is already loaded as module " << module;
   }
   gpu_binary_to_module_[ptx] = {*module, module_refcount};
@@ -271,7 +271,7 @@ port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
   if (spec.has_cuda_cubin_in_memory()) {
     absl::MutexLock lock{&in_memory_modules_mu_};
     kernelname = &spec.cuda_cubin_in_memory().kernelname();
-    const char *cubin = spec.cuda_cubin_in_memory().bytes();
+    const char* cubin = spec.cuda_cubin_in_memory().bytes();
     TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
     kernel_to_gpu_binary_[kernel] = cubin;
   } else if (spec.has_cuda_ptx_in_memory()) {
@@ -281,7 +281,7 @@ port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
       return port::InternalError("Compute capability not set");
port::InternalError("Compute capability not set"); } - const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_); + const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_); if (ptx == nullptr) { ptx = spec.cuda_ptx_in_memory().default_text(); } @@ -318,8 +318,8 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { VLOG(3) << "No loaded CUDA module for " << gpu_binary; return false; } - auto &module = module_it->second.first; - auto &refcount = module_it->second.second; + auto& module = module_it->second.first; + auto& refcount = module_it->second.second; VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount; if (--refcount == 0) { VLOG(3) << "Unloading CUDA module " << module; @@ -355,8 +355,8 @@ port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec, TF_RETURN_IF_ERROR(LoadModuleFromCuBin( reinterpret_cast(spec.cuda_cubin_in_memory().data()), &cu_module)); - *module_handle = ModuleHandle(const_cast( - static_cast(spec.cuda_cubin_in_memory().data()))); + *module_handle = ModuleHandle(const_cast( + static_cast(spec.cuda_cubin_in_memory().data()))); return port::Status::OK(); } else if (spec.has_cuda_ptx_in_memory()) { if (cc_major_ == 0 && cc_minor_ == 0) { @@ -370,15 +370,15 @@ port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec, absl::MutexLock lock{&in_memory_modules_mu_}; TF_RETURN_IF_ERROR( LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)); - *module_handle = ModuleHandle(const_cast( - static_cast(spec.cuda_ptx_in_memory()))); + *module_handle = ModuleHandle( + const_cast(static_cast(spec.cuda_ptx_in_memory()))); return port::Status::OK(); } return port::InternalError("No method of loading CUDA module provided"); } bool GpuExecutor::UnloadModule(ModuleHandle module_handle) { - const char *gpu_binary = reinterpret_cast(module_handle.id()); + const char* gpu_binary = reinterpret_cast(module_handle.id()); absl::MutexLock lock{&in_memory_modules_mu_}; return UnloadGpuBinary(gpu_binary); } @@ -425,7 +425,7 @@ port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims, cufunc, cuda_kernel->GetGpuCacheConfig())); } - void **kernel_params = const_cast(args.argument_addresses().data()); + void** kernel_params = const_cast(args.argument_addresses().data()); return GpuDriver::LaunchKernel( context_, cufunc, block_dims.x, block_dims.y, block_dims.z, thread_dims.x, @@ -454,7 +454,7 @@ void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel, return; } - const DeviceDescription &device_description = + const DeviceDescription& device_description = kernel.parent()->GetDeviceDescription(); const GpuKernel* cuda_kernel = AsGpuKernel(&kernel); @@ -522,7 +522,7 @@ DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) { void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes, uint64 size_bytes) { // offset and size are in bytes, so char* works as the pointer type. 
-  return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
+  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
 }
 
 void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
@@ -662,8 +662,8 @@ bool GpuExecutor::HostCallback(Stream* stream,
 /* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                     CUresult status,
                                                     void* data) {
-  std::function<void()> *callback =
-      reinterpret_cast<std::function<void()> *>(data);
+  std::function<void()>* callback =
+      reinterpret_cast<std::function<void()>*>(data);
   (*callback)();
   delete callback;
 }
@@ -744,7 +744,7 @@ port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
 }
 
 blas::BlasSupport* GpuExecutor::CreateBlas() {
-  PluginRegistry *registry = PluginRegistry::Instance();
+  PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::BlasFactory> status =
       registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                         plugin_config_.blas());
@@ -758,7 +758,7 @@ blas::BlasSupport* GpuExecutor::CreateBlas() {
 }
 
 dnn::DnnSupport* GpuExecutor::CreateDnn() {
-  PluginRegistry *registry = PluginRegistry::Instance();
+  PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::DnnFactory> status =
       registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.dnn());
@@ -772,7 +772,7 @@ dnn::DnnSupport* GpuExecutor::CreateDnn() {
 }
 
 fft::FftSupport* GpuExecutor::CreateFft() {
-  PluginRegistry *registry = PluginRegistry::Instance();
+  PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::FftFactory> status =
       registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.fft());
@@ -786,7 +786,7 @@ fft::FftSupport* GpuExecutor::CreateFft() {
 }
 
 rng::RngSupport* GpuExecutor::CreateRng() {
-  PluginRegistry *registry = PluginRegistry::Instance();
+  PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::RngFactory> status =
       registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.rng());
@@ -812,47 +812,6 @@ port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
   return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
 }
 
-SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
-  port::StatusOr<CUsharedconfig> cuda_config =
-      GpuDriver::ContextGetSharedMemConfig(context_);
-  if (!cuda_config.ok()) {
-    // Don't log; the failed call will log necessary output.
-    return SharedMemoryConfig::kDefault;
-  }
-
-  switch (cuda_config.ValueOrDie()) {
-    case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE:
-      return SharedMemoryConfig::kDefault;
-    case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:
-      return SharedMemoryConfig::kFourByte;
-    case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE:
-      return SharedMemoryConfig::kEightByte;
-    default:
-      LOG(FATAL) << "Invalid shared memory configuration returned: "
-                 << cuda_config.ValueOrDie();
-  }
-}
-
-port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
-    SharedMemoryConfig config) {
-  CUsharedconfig cuda_config;
-  switch (config) {
-    case SharedMemoryConfig::kDefault:
-      cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
-      break;
-    case SharedMemoryConfig::kFourByte:
-      cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
-      break;
-    case SharedMemoryConfig::kEightByte:
-      cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE;
-      break;
-    default:
-      LOG(FATAL) << "Invalid shared memory configuration specified: "
-                 << static_cast<int>(config);
-  }
-  return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
-}
-
 bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
   return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
 }
@@ -875,7 +834,7 @@ bool GpuExecutor::GetSymbol(const std::string& symbol_name,
     return lookup_in_module(it->second.first);
   }
 
-  for (auto &it : gpu_binary_to_module_) {
+  for (auto& it : gpu_binary_to_module_) {
     if (lookup_in_module(it.second.first)) {
       return true;
     }
@@ -963,7 +922,7 @@ static int TryToReadNumaNode(const std::string& pci_bus_id,
   // We have to use fopen/fread here so that the device properties can be
   // populated before InitGoogle procedure has been completed (at which point we
   // could use the file::* utilities).
-  FILE *file = fopen(filename.c_str(), "r");
+  FILE* file = fopen(filename.c_str(), "r");
   if (file == nullptr) {
     LOG(ERROR) << "could not open file to read NUMA node: " << filename
                << "\nYour kernel may have been built without NUMA support.";
@@ -980,8 +939,9 @@ static int TryToReadNumaNode(const std::string& pci_bus_id,
   if (port::safe_strto32(content, &value)) {
     if (value < 0) {  // See http://b/18228951 for details on this path.
       LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
-                << value << "), but there must be at least one NUMA node"
-                   ", so returning NUMA node zero";
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
       fclose(file);
       return 0;
     }
diff --git a/tensorflow/stream_executor/gpu/gpu_executor.h b/tensorflow/stream_executor/gpu/gpu_executor.h
index fc4ea0e0ab2..edc015c6126 100644
--- a/tensorflow/stream_executor/gpu/gpu_executor.h
+++ b/tensorflow/stream_executor/gpu/gpu_executor.h
@@ -188,10 +188,6 @@ class GpuExecutor : public internal::StreamExecutorInterface {
 
   bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
 
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
-
   bool DeviceMemoryUsage(int64* free, int64* total) const override;
 
   // Searches for the symbol and returns a device pointer and size.
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 9b896fe06f8..953f8ced47f 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -148,20 +148,6 @@ class HostExecutor : public internal::StreamExecutorInterface {
     return true;
   }
 
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
-    LOG(INFO) << "Shared memory configuration is unsupported for host "
-              << "executors.";
-    return SharedMemoryConfig::kDefault;
-  }
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
-    std::string error_msg{
-        "Shared memory configuration is unsupported for host "
-        "executors."};
-    LOG(INFO) << error_msg;
-    return port::Status(port::error::UNIMPLEMENTED, error_msg);
-  }
-
   bool SupportsBlas() const override;
   blas::BlasSupport *CreateBlas() override;
 
diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
index fd3b5f19913..2a85cb820ed 100644
--- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
@@ -720,47 +720,6 @@ port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
   return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
 }
 
-SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
-  port::StatusOr<hipSharedMemConfig> rocm_config =
-      GpuDriver::ContextGetSharedMemConfig(context_);
-  if (!rocm_config.ok()) {
-    // Don't log; the failed call will log necessary output.
-    return SharedMemoryConfig::kDefault;
-  }
-
-  switch (rocm_config.ValueOrDie()) {
-    case hipSharedMemBankSizeDefault:
-      return SharedMemoryConfig::kDefault;
-    case hipSharedMemBankSizeFourByte:
-      return SharedMemoryConfig::kFourByte;
-    case hipSharedMemBankSizeEightByte:
-      return SharedMemoryConfig::kEightByte;
-    default:
-      LOG(FATAL) << "Invalid shared memory configuration returned: "
-                 << rocm_config.ValueOrDie();
-  }
-}
-
-port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
-    SharedMemoryConfig config) {
-  hipSharedMemConfig rocm_config;
-  switch (config) {
-    case SharedMemoryConfig::kDefault:
-      rocm_config = hipSharedMemBankSizeDefault;
-      break;
-    case SharedMemoryConfig::kFourByte:
-      rocm_config = hipSharedMemBankSizeFourByte;
-      break;
-    case SharedMemoryConfig::kEightByte:
-      rocm_config = hipSharedMemBankSizeEightByte;
-      break;
-    default:
-      LOG(FATAL) << "Invalid shared memory configuration specified: "
-                 << static_cast<int>(config);
-  }
-  return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
-}
-
 bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
   return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
 }
@@ -768,24 +727,24 @@ bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
 bool GpuExecutor::GetSymbol(const string& symbol_name,
                             ModuleHandle module_handle, void** mem,
                             size_t* bytes) {
-    absl::MutexLock lock{&in_memory_modules_mu_};
-    if (static_cast<bool>(module_handle)) {
-      auto it = gpu_binary_to_module_.find(module_handle.id());
-      CHECK(it != gpu_binary_to_module_.end());
-      if (GpuDriver::GetModuleSymbol(
-              context_, it->second.first, symbol_name.c_str(),
-              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
-        return true;
-      }
+  absl::MutexLock lock{&in_memory_modules_mu_};
+  if (static_cast<bool>(module_handle)) {
+    auto it = gpu_binary_to_module_.find(module_handle.id());
+    CHECK(it != gpu_binary_to_module_.end());
+    if (GpuDriver::GetModuleSymbol(
+            context_, it->second.first, symbol_name.c_str(),
+            reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+      return true;
     }
+  }
 
-    for (auto& it : gpu_binary_to_module_) {
-      if (GpuDriver::GetModuleSymbol(
-              context_, it.second.first, symbol_name.c_str(),
-              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
-        return true;
-      }
+  for (auto& it : gpu_binary_to_module_) {
+    if (GpuDriver::GetModuleSymbol(
+            context_, it.second.first, symbol_name.c_str(),
+            reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+      return true;
     }
+  }
 
   LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
   return false;
diff --git a/tensorflow/stream_executor/shared_memory_config.h b/tensorflow/stream_executor/shared_memory_config.h
deleted file mode 100644
index 7cbeb3bcd91..00000000000
--- a/tensorflow/stream_executor/shared_memory_config.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This file defines a uniform interface to configuration options for shared
-// memory for supported devices. As with many StreamExecutor-supported features,
-// support for the options defined herein is device-dependent.
-#ifndef TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
-#define TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
-
-namespace stream_executor {
-
-// SharedMemoryConfig enum describes potential widths of shared memory banks for
-// a device or kernel.
-enum class SharedMemoryConfig {
-  kDefault,    // Use the device default configuration.
-  kFourByte,   // Sets shared memory banks to be four bytes wide.
-  kEightByte,  // Sets shared memory banks to be eight bytes wide.
-};
-
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 408b4fc8207..437338085b3 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -44,7 +44,6 @@ limitations under the License.
#include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/plugin_registry.h" -#include "tensorflow/stream_executor/shared_memory_config.h" #include "tensorflow/stream_executor/trace_listener.h" namespace stream_executor { @@ -267,9 +266,6 @@ class StreamExecutorInterface { virtual int PlatformDeviceCount() = 0; virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0; virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0; - virtual SharedMemoryConfig GetDeviceSharedMemoryConfig() = 0; - virtual port::Status SetDeviceSharedMemoryConfig( - SharedMemoryConfig config) = 0; virtual int64 GetDeviceLoad() { return -1; } diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index d23f1472e33..db4e8f9b694 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -230,23 +230,6 @@ port::Status StreamExecutor::EnablePeerAccessTo(StreamExecutor *other) { return implementation_->EnablePeerAccessTo(other->implementation_.get()); } -SharedMemoryConfig StreamExecutor::GetDeviceSharedMemoryConfig() { - return implementation_->GetDeviceSharedMemoryConfig(); -} - -port::Status StreamExecutor::SetDeviceSharedMemoryConfig( - SharedMemoryConfig config) { - if (config != SharedMemoryConfig::kDefault && - config != SharedMemoryConfig::kFourByte && - config != SharedMemoryConfig::kEightByte) { - std::string error_msg = absl::StrFormat( - "Invalid shared memory config specified: %d", static_cast(config)); - LOG(ERROR) << error_msg; - return port::Status(port::error::INVALID_ARGUMENT, error_msg); - } - return implementation_->SetDeviceSharedMemoryConfig(config); -} - const DeviceDescription &StreamExecutor::GetDeviceDescription() const { absl::MutexLock lock(&mu_); if (device_description_ != nullptr) { @@ -858,7 +841,7 @@ absl::optional StreamExecutor::GetAllocatorStats() { } template -void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) { +void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&...args) { if (tracing_enabled_) { { // instance tracers held in a block to limit the lock lifetime. diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index f7f69f78e89..b9b118ca42c 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -35,7 +35,6 @@ limitations under the License. #include "tensorflow/stream_executor/platform/logging.h" #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/rng.h" -#include "tensorflow/stream_executor/shared_memory_config.h" #include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/trace_listener.h" @@ -54,8 +53,8 @@ struct AllocRecord { }; // Forward declaration of private friend class. -template +template class ScopedTracer; // A StreamExecutor manages a single device, in terms of executing work (kernel @@ -322,14 +321,6 @@ class StreamExecutor { // this is more an up-front test as to whether it's expressly forbidden. bool CanEnablePeerAccessTo(StreamExecutor *other); - // Gets the preferred shared memory configuration for the device to which this - // executor is bound. 
-  SharedMemoryConfig GetDeviceSharedMemoryConfig();
-
-  // Sets the preferred shared memory configuration for the device to which this
-  // executor is bound.
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config);
-
   // Obtains metadata about the underlying device.
   // The value is cached on first use.
   const DeviceDescription &GetDeviceDescription() const;
@@ -507,12 +498,12 @@ class StreamExecutor {
   // To register a listener for all executors for a given platform, see
   // Platform::RegisterTraceListener().
   // Does not take ownership of listener.
-  void RegisterTraceListener(TraceListener* listener);
+  void RegisterTraceListener(TraceListener *listener);
 
   // Removes a TraceListener from this StreamExecutor instance.
   // Returns false (and logs) in cases where the argument listener was not
   // previously registered.
-  bool UnregisterTraceListener(TraceListener* listener);
+  bool UnregisterTraceListener(TraceListener *listener);
 
   // Return allocator statistics.
   absl::optional<AllocatorStats> GetAllocatorStats();
@@ -522,8 +513,8 @@ class StreamExecutor {
   StreamExecutorMemoryAllocator *GetAllocator() { return &allocator_; }
 
  private:
-  template <typename BeginCallT, typename CompleteCallT,
-            typename ReturnT, typename... BeginArgsT>
+  template <typename BeginCallT, typename CompleteCallT, typename ReturnT,
+            typename... BeginArgsT>
   friend class ScopedTracer;
   friend class Event;
   friend class Stream;
@@ -648,7 +639,7 @@ class StreamExecutor {
   // Calls the relevant TraceListener routine to begin tracing for the specified
   // asynchronous method.
   template <typename TraceCallT, typename... ArgsT>
-  void SubmitTrace(TraceCallT trace_call, ArgsT&&... args);
+  void SubmitTrace(TraceCallT trace_call, ArgsT &&...args);
 
   // Reader/writer lock for class-static StreamExecutor members.
   static absl::Mutex static_mu_;
diff --git a/tensorflow/stream_executor/tpu/tpu_executor.h b/tensorflow/stream_executor/tpu/tpu_executor.h
index faeae86da9b..2430a350463 100644
--- a/tensorflow/stream_executor/tpu/tpu_executor.h
+++ b/tensorflow/stream_executor/tpu/tpu_executor.h
@@ -96,8 +96,7 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
   void DequeueOutfeed(int32 outfeed_queue_index, absl::Span<uint8> bytes,
                       StatusCallback done);
 
-  Status EnqueueInfeed(int32 infeed_queue_index,
-                       absl::Span<const uint8> bytes);
+  Status EnqueueInfeed(int32 infeed_queue_index, absl::Span<const uint8> bytes);
 
   absl::optional<stream_executor::AllocatorStats> GetAllocatorStats() override;
 
@@ -175,10 +174,6 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
     LOG(FATAL) << "Not yet implemented";
   }
 
-  stream_executor::SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
-    LOG(FATAL) << "not yet implemented";
-  }
-
   void* GetSubBuffer(DeviceMemoryBase* parent, uint64 offset,
                      uint64 size) override {
     LOG(FATAL) << "not yet implemented";
@@ -197,10 +192,7 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
   bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override {
     LOG(FATAL) << "not yet implemented";
   }
-  Status SetDeviceSharedMemoryConfig(
-      stream_executor::SharedMemoryConfig config) override {
-    LOG(FATAL) << "not yet implemented";
-  }
+
   void* HostMemoryAllocate(uint64 size) override {
     LOG(FATAL) << "not yet implemented";
   }
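
Migration note: SharedMemoryConfig was a thin enum wrapper around the
context-wide shared memory bank-size setting of the CUDA driver (and the
matching hipSharedMemConfig values on ROCm), as the deleted switch statements
above show. Code that still needs this behavior can call the driver API that
GpuExecutor::SetDeviceSharedMemoryConfig used to wrap. Below is a minimal
standalone sketch, not part of this change: it assumes the CUDA driver API
(cuda.h) directly rather than StreamExecutor, and the check() helper is a
hypothetical convenience added only for illustration.

    #include <cuda.h>
    #include <cstdio>
    #include <cstdlib>

    // Aborts with the CUDA error name if a driver call fails.
    static void check(CUresult r, const char* what) {
      if (r != CUDA_SUCCESS) {
        const char* name = nullptr;
        cuGetErrorName(r, &name);
        std::fprintf(stderr, "%s failed: %s\n", what, name ? name : "unknown");
        std::abort();
      }
    }

    int main() {
      check(cuInit(0), "cuInit");
      CUdevice dev;
      check(cuDeviceGet(&dev, 0), "cuDeviceGet");
      CUcontext ctx;
      check(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");

      // Equivalent of SetDeviceSharedMemoryConfig(SharedMemoryConfig::kEightByte):
      // widen shared memory banks to eight bytes for kernels in this context.
      check(cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE),
            "cuCtxSetSharedMemConfig");

      // Equivalent of GetDeviceSharedMemoryConfig(): read the setting back.
      CUsharedconfig cfg;
      check(cuCtxGetSharedMemConfig(&cfg), "cuCtxGetSharedMemConfig");
      std::printf("shared memory bank size config: %d\n", static_cast<int>(cfg));

      check(cuCtxDestroy(ctx), "cuCtxDestroy");
      return 0;
    }

Since the setting only affected compute-capability 3.x (Kepler) parts and no
StreamExecutor client ever called the wrapper, dropping the abstraction rather
than porting it to every executor (host, TPU, interpreter) is the simpler
choice; callers with Kepler-specific tuning needs can use the driver directly
as sketched above.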