Remove SharedMemoryConfig since it is not used anywhere.
PiperOrigin-RevId: 326154532 Change-Id: I13be21f577226c48c7e0d5bfc7efb787f0422e85
This commit is contained in:
parent
9636571807
commit
642db2faf5
@ -38,7 +38,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/launch_dim.h"
|
||||
#include "tensorflow/stream_executor/plugin.h"
|
||||
#include "tensorflow/stream_executor/rng.h"
|
||||
#include "tensorflow/stream_executor/shared_memory_config.h"
|
||||
#include "tensorflow/stream_executor/stream.h"
|
||||
#include "tensorflow/stream_executor/stream_executor.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_internal.h"
|
||||
@ -182,15 +181,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
|
||||
return true;
|
||||
}
|
||||
|
||||
SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
|
||||
return SharedMemoryConfig::kDefault;
|
||||
}
|
||||
|
||||
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
|
||||
return port::Status{port::error::UNIMPLEMENTED,
|
||||
"Shared memory not supported"};
|
||||
}
|
||||
|
||||
std::unique_ptr<internal::EventInterface> CreateEventImplementation()
|
||||
override {
|
||||
return nullptr;
|
||||
|
@ -67,7 +67,6 @@ cc_library(
|
||||
"plugin.h",
|
||||
"plugin_registry.h",
|
||||
"rng.h",
|
||||
"shared_memory_config.h",
|
||||
"stream_executor_pimpl.h",
|
||||
"temporary_device_memory.h",
|
||||
"temporary_memory_manager.h",
|
||||
@ -123,7 +122,6 @@ cc_library(
|
||||
"multi_platform_manager.h",
|
||||
"platform.h",
|
||||
"plugin_registry.h",
|
||||
"shared_memory_config.h",
|
||||
"stream_executor.h",
|
||||
"stream_executor_internal.h",
|
||||
"timer.h",
|
||||
@ -173,11 +171,6 @@ cc_library(
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "shared_memory_config",
|
||||
hdrs = ["shared_memory_config.h"],
|
||||
)
|
||||
|
||||
# Aliases for backwards compatibility.
|
||||
alias(
|
||||
name = "stream_header",
|
||||
@ -343,7 +336,6 @@ cc_library(
|
||||
"kernel_cache_config.h",
|
||||
"kernel_spec.h",
|
||||
"platform.h",
|
||||
"shared_memory_config.h",
|
||||
"stream.h",
|
||||
"stream_executor_internal.h",
|
||||
"trace_listener.h",
|
||||
@ -455,7 +447,6 @@ cc_library(
|
||||
"stream_executor_internal.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"shared_memory_config.h",
|
||||
"stream_executor_internal.h",
|
||||
],
|
||||
deps = [
|
||||
@ -484,7 +475,6 @@ cc_library(
|
||||
"dnn.h",
|
||||
"kernel.h",
|
||||
"kernel_cache_config.h",
|
||||
"shared_memory_config.h",
|
||||
"stream_executor_pimpl.h",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
@ -569,7 +559,6 @@ cc_library(
|
||||
"plugin.h",
|
||||
"plugin_registry.h",
|
||||
"rng.h",
|
||||
"shared_memory_config.h",
|
||||
"stream.h",
|
||||
"stream_executor.h",
|
||||
"stream_executor_internal.h",
|
||||
@ -619,7 +608,6 @@ cc_library(
|
||||
"plugin.h",
|
||||
"plugin_registry.h",
|
||||
"rng.h",
|
||||
"shared_memory_config.h",
|
||||
"stream.h",
|
||||
"stream_executor.h",
|
||||
"stream_executor_internal.h",
|
||||
|
@ -101,12 +101,12 @@ static GpuTimer* AsGpuTimer(Timer* timer) {
|
||||
// N.B. we must lose constness in order to pass a suitable type to the existing
|
||||
// libcuda APIs, so the caller should take care to only pass the result of const
|
||||
// GPU memory conversions to libcuda functions which will honor constness.
|
||||
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) {
|
||||
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
|
||||
return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
|
||||
}
|
||||
|
||||
// See description on const version above.
|
||||
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
|
||||
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
|
||||
return AsCudaDevicePtr(*gpu_mem);
|
||||
}
|
||||
|
||||
@ -225,11 +225,11 @@ port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
|
||||
if (*module == nullptr) {
|
||||
TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
|
||||
module_refcount = 1;
|
||||
VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
|
||||
VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
|
||||
<< " as module " << *module;
|
||||
} else {
|
||||
++module_refcount;
|
||||
VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
|
||||
VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
|
||||
<< " is already loaded as module " << *module;
|
||||
}
|
||||
gpu_binary_to_module_[cubin] = {*module, module_refcount};
|
||||
@ -242,12 +242,12 @@ port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
|
||||
|
||||
if (*module == nullptr) {
|
||||
TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
|
||||
VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
|
||||
VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
|
||||
<< *module;
|
||||
module_refcount = 1;
|
||||
} else {
|
||||
++module_refcount;
|
||||
VLOG(3) << "PTX " << static_cast<const void *>(ptx)
|
||||
VLOG(3) << "PTX " << static_cast<const void*>(ptx)
|
||||
<< " is already loaded as module " << module;
|
||||
}
|
||||
gpu_binary_to_module_[ptx] = {*module, module_refcount};
|
||||
@ -271,7 +271,7 @@ port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
|
||||
if (spec.has_cuda_cubin_in_memory()) {
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
kernelname = &spec.cuda_cubin_in_memory().kernelname();
|
||||
const char *cubin = spec.cuda_cubin_in_memory().bytes();
|
||||
const char* cubin = spec.cuda_cubin_in_memory().bytes();
|
||||
TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
|
||||
kernel_to_gpu_binary_[kernel] = cubin;
|
||||
} else if (spec.has_cuda_ptx_in_memory()) {
|
||||
@ -281,7 +281,7 @@ port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
|
||||
return port::InternalError("Compute capability not set");
|
||||
}
|
||||
|
||||
const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
|
||||
const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
|
||||
if (ptx == nullptr) {
|
||||
ptx = spec.cuda_ptx_in_memory().default_text();
|
||||
}
|
||||
@ -318,8 +318,8 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
|
||||
VLOG(3) << "No loaded CUDA module for " << gpu_binary;
|
||||
return false;
|
||||
}
|
||||
auto &module = module_it->second.first;
|
||||
auto &refcount = module_it->second.second;
|
||||
auto& module = module_it->second.first;
|
||||
auto& refcount = module_it->second.second;
|
||||
VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
|
||||
if (--refcount == 0) {
|
||||
VLOG(3) << "Unloading CUDA module " << module;
|
||||
@ -355,8 +355,8 @@ port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
|
||||
TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
|
||||
reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
|
||||
&cu_module));
|
||||
*module_handle = ModuleHandle(const_cast<void *>(
|
||||
static_cast<const void *>(spec.cuda_cubin_in_memory().data())));
|
||||
*module_handle = ModuleHandle(const_cast<void*>(
|
||||
static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
|
||||
return port::Status::OK();
|
||||
} else if (spec.has_cuda_ptx_in_memory()) {
|
||||
if (cc_major_ == 0 && cc_minor_ == 0) {
|
||||
@ -370,15 +370,15 @@ port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
TF_RETURN_IF_ERROR(
|
||||
LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
|
||||
*module_handle = ModuleHandle(const_cast<void *>(
|
||||
static_cast<const void *>(spec.cuda_ptx_in_memory())));
|
||||
*module_handle = ModuleHandle(
|
||||
const_cast<void*>(static_cast<const void*>(spec.cuda_ptx_in_memory())));
|
||||
return port::Status::OK();
|
||||
}
|
||||
return port::InternalError("No method of loading CUDA module provided");
|
||||
}
|
||||
|
||||
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
|
||||
const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
|
||||
const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
|
||||
absl::MutexLock lock{&in_memory_modules_mu_};
|
||||
return UnloadGpuBinary(gpu_binary);
|
||||
}
|
||||
@ -425,7 +425,7 @@ port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
|
||||
cufunc, cuda_kernel->GetGpuCacheConfig()));
|
||||
}
|
||||
|
||||
void **kernel_params = const_cast<void **>(args.argument_addresses().data());
|
||||
void** kernel_params = const_cast<void**>(args.argument_addresses().data());
|
||||
|
||||
return GpuDriver::LaunchKernel(
|
||||
context_, cufunc, block_dims.x, block_dims.y, block_dims.z, thread_dims.x,
|
||||
@ -454,7 +454,7 @@ void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
|
||||
return;
|
||||
}
|
||||
|
||||
const DeviceDescription &device_description =
|
||||
const DeviceDescription& device_description =
|
||||
kernel.parent()->GetDeviceDescription();
|
||||
|
||||
const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
|
||||
@ -522,7 +522,7 @@ DeviceMemoryBase GpuExecutor::Allocate(uint64 size, int64 memory_space) {
|
||||
void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
|
||||
uint64 size_bytes) {
|
||||
// offset and size are in bytes, so char* works as the pointer type.
|
||||
return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
|
||||
return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
|
||||
}
|
||||
|
||||
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
|
||||
@ -662,8 +662,8 @@ bool GpuExecutor::HostCallback(Stream* stream,
|
||||
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
|
||||
CUresult status,
|
||||
void* data) {
|
||||
std::function<void()> *callback =
|
||||
reinterpret_cast<std::function<void()> *>(data);
|
||||
std::function<void()>* callback =
|
||||
reinterpret_cast<std::function<void()>*>(data);
|
||||
(*callback)();
|
||||
delete callback;
|
||||
}
|
||||
@ -744,7 +744,7 @@ port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
|
||||
}
|
||||
|
||||
blas::BlasSupport* GpuExecutor::CreateBlas() {
|
||||
PluginRegistry *registry = PluginRegistry::Instance();
|
||||
PluginRegistry* registry = PluginRegistry::Instance();
|
||||
port::StatusOr<PluginRegistry::BlasFactory> status =
|
||||
registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
|
||||
plugin_config_.blas());
|
||||
@ -758,7 +758,7 @@ blas::BlasSupport* GpuExecutor::CreateBlas() {
|
||||
}
|
||||
|
||||
dnn::DnnSupport* GpuExecutor::CreateDnn() {
|
||||
PluginRegistry *registry = PluginRegistry::Instance();
|
||||
PluginRegistry* registry = PluginRegistry::Instance();
|
||||
port::StatusOr<PluginRegistry::DnnFactory> status =
|
||||
registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
|
||||
plugin_config_.dnn());
|
||||
@ -772,7 +772,7 @@ dnn::DnnSupport* GpuExecutor::CreateDnn() {
|
||||
}
|
||||
|
||||
fft::FftSupport* GpuExecutor::CreateFft() {
|
||||
PluginRegistry *registry = PluginRegistry::Instance();
|
||||
PluginRegistry* registry = PluginRegistry::Instance();
|
||||
port::StatusOr<PluginRegistry::FftFactory> status =
|
||||
registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
|
||||
plugin_config_.fft());
|
||||
@ -786,7 +786,7 @@ fft::FftSupport* GpuExecutor::CreateFft() {
|
||||
}
|
||||
|
||||
rng::RngSupport* GpuExecutor::CreateRng() {
|
||||
PluginRegistry *registry = PluginRegistry::Instance();
|
||||
PluginRegistry* registry = PluginRegistry::Instance();
|
||||
port::StatusOr<PluginRegistry::RngFactory> status =
|
||||
registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
|
||||
plugin_config_.rng());
|
||||
@ -812,47 +812,6 @@ port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
|
||||
return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
|
||||
}
|
||||
|
||||
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
|
||||
port::StatusOr<CUsharedconfig> cuda_config =
|
||||
GpuDriver::ContextGetSharedMemConfig(context_);
|
||||
if (!cuda_config.ok()) {
|
||||
// Don't log; the failed call will log necessary output.
|
||||
return SharedMemoryConfig::kDefault;
|
||||
}
|
||||
|
||||
switch (cuda_config.ValueOrDie()) {
|
||||
case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE:
|
||||
return SharedMemoryConfig::kDefault;
|
||||
case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:
|
||||
return SharedMemoryConfig::kFourByte;
|
||||
case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE:
|
||||
return SharedMemoryConfig::kEightByte;
|
||||
default:
|
||||
LOG(FATAL) << "Invalid shared memory configuration returned: "
|
||||
<< cuda_config.ValueOrDie();
|
||||
}
|
||||
}
|
||||
|
||||
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
|
||||
SharedMemoryConfig config) {
|
||||
CUsharedconfig cuda_config;
|
||||
switch (config) {
|
||||
case SharedMemoryConfig::kDefault:
|
||||
cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
|
||||
break;
|
||||
case SharedMemoryConfig::kFourByte:
|
||||
cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
|
||||
break;
|
||||
case SharedMemoryConfig::kEightByte:
|
||||
cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Invalid shared memory configuration specified: "
|
||||
<< static_cast<int>(config);
|
||||
}
|
||||
return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
|
||||
}
|
||||
|
||||
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
|
||||
return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
|
||||
}
|
||||
@ -875,7 +834,7 @@ bool GpuExecutor::GetSymbol(const std::string& symbol_name,
|
||||
return lookup_in_module(it->second.first);
|
||||
}
|
||||
|
||||
for (auto &it : gpu_binary_to_module_) {
|
||||
for (auto& it : gpu_binary_to_module_) {
|
||||
if (lookup_in_module(it.second.first)) {
|
||||
return true;
|
||||
}
|
||||
@ -963,7 +922,7 @@ static int TryToReadNumaNode(const std::string& pci_bus_id,
|
||||
// We have to use fopen/fread here so that the device properties can be
|
||||
// populated before InitGoogle procedure has been completed (at which point we
|
||||
// could use the file::* utilities).
|
||||
FILE *file = fopen(filename.c_str(), "r");
|
||||
FILE* file = fopen(filename.c_str(), "r");
|
||||
if (file == nullptr) {
|
||||
LOG(ERROR) << "could not open file to read NUMA node: " << filename
|
||||
<< "\nYour kernel may have been built without NUMA support.";
|
||||
@ -980,7 +939,8 @@ static int TryToReadNumaNode(const std::string& pci_bus_id,
|
||||
if (port::safe_strto32(content, &value)) {
|
||||
if (value < 0) { // See http://b/18228951 for details on this path.
|
||||
LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
|
||||
<< value << "), but there must be at least one NUMA node"
|
||||
<< value
|
||||
<< "), but there must be at least one NUMA node"
|
||||
", so returning NUMA node zero";
|
||||
fclose(file);
|
||||
return 0;
|
||||
|
@ -188,10 +188,6 @@ class GpuExecutor : public internal::StreamExecutorInterface {
|
||||
|
||||
bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
|
||||
|
||||
SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
|
||||
|
||||
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
|
||||
|
||||
bool DeviceMemoryUsage(int64* free, int64* total) const override;
|
||||
|
||||
// Search for the symbol and returns a device pointer and size.
|
||||
|
@ -148,20 +148,6 @@ class HostExecutor : public internal::StreamExecutorInterface {
|
||||
return true;
|
||||
}
|
||||
|
||||
SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
|
||||
LOG(INFO) << "Shared memory configuration is unsupported for host "
|
||||
<< "executors.";
|
||||
return SharedMemoryConfig::kDefault;
|
||||
}
|
||||
|
||||
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
|
||||
std::string error_msg{
|
||||
"Shared memory configuration is unsupported for host "
|
||||
"executors."};
|
||||
LOG(INFO) << error_msg;
|
||||
return port::Status(port::error::UNIMPLEMENTED, error_msg);
|
||||
}
|
||||
|
||||
bool SupportsBlas() const override;
|
||||
blas::BlasSupport *CreateBlas() override;
|
||||
|
||||
|
@ -720,47 +720,6 @@ port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
|
||||
return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
|
||||
}
|
||||
|
||||
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
|
||||
port::StatusOr<hipSharedMemConfig> rocm_config =
|
||||
GpuDriver::ContextGetSharedMemConfig(context_);
|
||||
if (!rocm_config.ok()) {
|
||||
// Don't log; the failed call will log necessary output.
|
||||
return SharedMemoryConfig::kDefault;
|
||||
}
|
||||
|
||||
switch (rocm_config.ValueOrDie()) {
|
||||
case hipSharedMemBankSizeDefault:
|
||||
return SharedMemoryConfig::kDefault;
|
||||
case hipSharedMemBankSizeFourByte:
|
||||
return SharedMemoryConfig::kFourByte;
|
||||
case hipSharedMemBankSizeEightByte:
|
||||
return SharedMemoryConfig::kEightByte;
|
||||
default:
|
||||
LOG(FATAL) << "Invalid shared memory configuration returned: "
|
||||
<< rocm_config.ValueOrDie();
|
||||
}
|
||||
}
|
||||
|
||||
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
|
||||
SharedMemoryConfig config) {
|
||||
hipSharedMemConfig rocm_config;
|
||||
switch (config) {
|
||||
case SharedMemoryConfig::kDefault:
|
||||
rocm_config = hipSharedMemBankSizeDefault;
|
||||
break;
|
||||
case SharedMemoryConfig::kFourByte:
|
||||
rocm_config = hipSharedMemBankSizeFourByte;
|
||||
break;
|
||||
case SharedMemoryConfig::kEightByte:
|
||||
rocm_config = hipSharedMemBankSizeEightByte;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Invalid shared memory configuration specified: "
|
||||
<< static_cast<int>(config);
|
||||
}
|
||||
return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
|
||||
}
|
||||
|
||||
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
|
||||
return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
|
||||
}
|
||||
|
@ -1,34 +0,0 @@
|
||||
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// This file defines a uniform interface to configuration options for shared
|
||||
// memory for supported devices. As with many StreamExecutor-supported features,
|
||||
// support for the options defined herein is device-dependent.
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
|
||||
|
||||
namespace stream_executor {
|
||||
|
||||
// SharedMemoryConfig enum describes potential widths of shared memory banks for
|
||||
// a device or kernel.
|
||||
enum class SharedMemoryConfig {
|
||||
kDefault, // Use the device default configuration.
|
||||
kFourByte, // Sets shared memory banks to be four bytes wide.
|
||||
kEightByte, // Sets shared memory banks to be eight bytes wide.
|
||||
};
|
||||
|
||||
} // namespace stream_executor
|
||||
|
||||
#endif // TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
|
@ -44,7 +44,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/platform.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||
#include "tensorflow/stream_executor/shared_memory_config.h"
|
||||
#include "tensorflow/stream_executor/trace_listener.h"
|
||||
|
||||
namespace stream_executor {
|
||||
@ -267,9 +266,6 @@ class StreamExecutorInterface {
|
||||
virtual int PlatformDeviceCount() = 0;
|
||||
virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
|
||||
virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
|
||||
virtual SharedMemoryConfig GetDeviceSharedMemoryConfig() = 0;
|
||||
virtual port::Status SetDeviceSharedMemoryConfig(
|
||||
SharedMemoryConfig config) = 0;
|
||||
|
||||
virtual int64 GetDeviceLoad() { return -1; }
|
||||
|
||||
|
@ -230,23 +230,6 @@ port::Status StreamExecutor::EnablePeerAccessTo(StreamExecutor *other) {
|
||||
return implementation_->EnablePeerAccessTo(other->implementation_.get());
|
||||
}
|
||||
|
||||
SharedMemoryConfig StreamExecutor::GetDeviceSharedMemoryConfig() {
|
||||
return implementation_->GetDeviceSharedMemoryConfig();
|
||||
}
|
||||
|
||||
port::Status StreamExecutor::SetDeviceSharedMemoryConfig(
|
||||
SharedMemoryConfig config) {
|
||||
if (config != SharedMemoryConfig::kDefault &&
|
||||
config != SharedMemoryConfig::kFourByte &&
|
||||
config != SharedMemoryConfig::kEightByte) {
|
||||
std::string error_msg = absl::StrFormat(
|
||||
"Invalid shared memory config specified: %d", static_cast<int>(config));
|
||||
LOG(ERROR) << error_msg;
|
||||
return port::Status(port::error::INVALID_ARGUMENT, error_msg);
|
||||
}
|
||||
return implementation_->SetDeviceSharedMemoryConfig(config);
|
||||
}
|
||||
|
||||
const DeviceDescription &StreamExecutor::GetDeviceDescription() const {
|
||||
absl::MutexLock lock(&mu_);
|
||||
if (device_description_ != nullptr) {
|
||||
@ -858,7 +841,7 @@ absl::optional<AllocatorStats> StreamExecutor::GetAllocatorStats() {
|
||||
}
|
||||
|
||||
template <typename TraceCallT, typename... ArgsT>
|
||||
void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) {
|
||||
void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&...args) {
|
||||
if (tracing_enabled_) {
|
||||
{
|
||||
// instance tracers held in a block to limit the lock lifetime.
|
||||
|
@ -35,7 +35,6 @@ limitations under the License.
|
||||
#include "tensorflow/stream_executor/platform/logging.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
#include "tensorflow/stream_executor/rng.h"
|
||||
#include "tensorflow/stream_executor/shared_memory_config.h"
|
||||
#include "tensorflow/stream_executor/stream.h"
|
||||
#include "tensorflow/stream_executor/stream_executor_internal.h"
|
||||
#include "tensorflow/stream_executor/trace_listener.h"
|
||||
@ -54,8 +53,8 @@ struct AllocRecord {
|
||||
};
|
||||
|
||||
// Forward declaration of private friend class.
|
||||
template <typename BeginCallT, typename CompleteCallT,
|
||||
typename ReturnT, typename... BeginArgsT>
|
||||
template <typename BeginCallT, typename CompleteCallT, typename ReturnT,
|
||||
typename... BeginArgsT>
|
||||
class ScopedTracer;
|
||||
|
||||
// A StreamExecutor manages a single device, in terms of executing work (kernel
|
||||
@ -322,14 +321,6 @@ class StreamExecutor {
|
||||
// this is more an up-front test as to whether it's expressly forbidden.
|
||||
bool CanEnablePeerAccessTo(StreamExecutor *other);
|
||||
|
||||
// Gets the preferred shared memory configuration for the device to which this
|
||||
// executor is bound.
|
||||
SharedMemoryConfig GetDeviceSharedMemoryConfig();
|
||||
|
||||
// Sets the preferred shared memory configuration for the device to which this
|
||||
// executor is bound.
|
||||
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config);
|
||||
|
||||
// Obtains metadata about the underlying device.
|
||||
// The value is cached on first use.
|
||||
const DeviceDescription &GetDeviceDescription() const;
|
||||
@ -507,12 +498,12 @@ class StreamExecutor {
|
||||
// To register a listener for all executors for a given platform, see
|
||||
// Platform::RegisterTraceListener().
|
||||
// Does not take ownership of listener.
|
||||
void RegisterTraceListener(TraceListener* listener);
|
||||
void RegisterTraceListener(TraceListener *listener);
|
||||
|
||||
// Removes a TraceListener from this StreamExecutor instance.
|
||||
// Returns false (and logs) in cases where the argument listener was not
|
||||
// previously registered.
|
||||
bool UnregisterTraceListener(TraceListener* listener);
|
||||
bool UnregisterTraceListener(TraceListener *listener);
|
||||
|
||||
// Return allocator statistics.
|
||||
absl::optional<AllocatorStats> GetAllocatorStats();
|
||||
@ -522,8 +513,8 @@ class StreamExecutor {
|
||||
StreamExecutorMemoryAllocator *GetAllocator() { return &allocator_; }
|
||||
|
||||
private:
|
||||
template <typename BeginCallT, typename CompleteCallT,
|
||||
typename ReturnT, typename... BeginArgsT>
|
||||
template <typename BeginCallT, typename CompleteCallT, typename ReturnT,
|
||||
typename... BeginArgsT>
|
||||
friend class ScopedTracer;
|
||||
friend class Event;
|
||||
friend class Stream;
|
||||
@ -648,7 +639,7 @@ class StreamExecutor {
|
||||
// Calls the relevant TraceListener routine to begin tracing for the specified
|
||||
// asynchronous method.
|
||||
template <typename TraceCallT, typename... ArgsT>
|
||||
void SubmitTrace(TraceCallT trace_call, ArgsT&&... args);
|
||||
void SubmitTrace(TraceCallT trace_call, ArgsT &&...args);
|
||||
|
||||
// Reader/writer lock for class-static StreamExecutor members.
|
||||
static absl::Mutex static_mu_;
|
||||
|
@ -96,8 +96,7 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
|
||||
void DequeueOutfeed(int32 outfeed_queue_index, absl::Span<uint8> bytes,
|
||||
StatusCallback done);
|
||||
|
||||
Status EnqueueInfeed(int32 infeed_queue_index,
|
||||
absl::Span<const uint8> bytes);
|
||||
Status EnqueueInfeed(int32 infeed_queue_index, absl::Span<const uint8> bytes);
|
||||
|
||||
absl::optional<stream_executor::AllocatorStats> GetAllocatorStats() override;
|
||||
|
||||
@ -175,10 +174,6 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
|
||||
LOG(FATAL) << "Not yet implemented";
|
||||
}
|
||||
|
||||
stream_executor::SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
|
||||
LOG(FATAL) << "not yet implemented";
|
||||
}
|
||||
|
||||
void* GetSubBuffer(DeviceMemoryBase* parent, uint64 offset,
|
||||
uint64 size) override {
|
||||
LOG(FATAL) << "not yet implemented";
|
||||
@ -197,10 +192,7 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
|
||||
bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override {
|
||||
LOG(FATAL) << "not yet implemented";
|
||||
}
|
||||
Status SetDeviceSharedMemoryConfig(
|
||||
stream_executor::SharedMemoryConfig config) override {
|
||||
LOG(FATAL) << "not yet implemented";
|
||||
}
|
||||
|
||||
void* HostMemoryAllocate(uint64 size) override {
|
||||
LOG(FATAL) << "not yet implemented";
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user