Finish migrating {Tf|Platform}GpuId to {Tf|Platform}DeviceId
PiperOrigin-RevId: 361252995 Change-Id: I818798fc00efe7b98c35145ce067204d9e023895
This commit is contained in:
parent
fbd744dfdb
commit
d7634bbfaf
@ -43,15 +43,15 @@ static xla::StatusOr<absl::optional<std::set<int>>> ParseVisibleDeviceList(
|
|||||||
}
|
}
|
||||||
const std::vector<string> visible_devices =
|
const std::vector<string> visible_devices =
|
||||||
absl::StrSplit(visible_device_list, ',');
|
absl::StrSplit(visible_device_list, ',');
|
||||||
for (const string& platform_gpu_id_str : visible_devices) {
|
for (const string& platform_device_id_str : visible_devices) {
|
||||||
int32 platform_gpu_id;
|
int32 platform_device_id;
|
||||||
if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
|
if (!absl::SimpleAtoi(platform_device_id_str, &platform_device_id)) {
|
||||||
return errors::InvalidArgument(
|
return errors::InvalidArgument(
|
||||||
"Could not parse entry in 'visible_device_list': '",
|
"Could not parse entry in 'visible_device_list': '",
|
||||||
platform_gpu_id_str,
|
platform_device_id_str,
|
||||||
"'. visible_device_list = ", visible_device_list);
|
"'. visible_device_list = ", visible_device_list);
|
||||||
}
|
}
|
||||||
gpu_ids.insert(platform_gpu_id);
|
gpu_ids.insert(platform_device_id);
|
||||||
}
|
}
|
||||||
return {{gpu_ids}};
|
return {{gpu_ids}};
|
||||||
}
|
}
|
||||||
|
@ -102,19 +102,21 @@ struct EdgePtrCompare {
|
|||||||
// TODO(laigd): instead of deciding the device here, the converter should accept
|
// TODO(laigd): instead of deciding the device here, the converter should accept
|
||||||
// a device name as one of the conversion parameter so users can control on
|
// a device name as one of the conversion parameter so users can control on
|
||||||
// which device they want to run the conversion.
|
// which device they want to run the conversion.
|
||||||
std::pair<TfGpuId, PlatformGpuId> GetFirstValidDeviceId() {
|
std::pair<TfDeviceId, PlatformDeviceId> GetFirstValidDeviceId() {
|
||||||
for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
|
for (int tf_device_id_value = 0; tf_device_id_value < 100;
|
||||||
TfGpuId tf_gpu_id(tf_gpu_id_value);
|
++tf_device_id_value) {
|
||||||
PlatformGpuId platform_gpu_id;
|
TfDeviceId tf_device_id(tf_device_id_value);
|
||||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
PlatformDeviceId platform_device_id;
|
||||||
|
Status s =
|
||||||
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
|
VLOG(1) << "Found TF GPU " << tf_device_id.value() << " at cuda device "
|
||||||
<< platform_gpu_id.value();
|
<< platform_device_id.value();
|
||||||
return std::make_pair(tf_gpu_id, platform_gpu_id);
|
return std::make_pair(tf_device_id, platform_device_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG(ERROR) << "Could not find any TF GPUs";
|
LOG(ERROR) << "Could not find any TF GPUs";
|
||||||
return std::make_pair(TfGpuId(-1), PlatformGpuId(-1));
|
return std::make_pair(TfDeviceId(-1), PlatformDeviceId(-1));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns false for const nodes (we intend to drop control edges from those).
|
// Returns false for const nodes (we intend to drop control edges from those).
|
||||||
@ -266,14 +268,14 @@ Status GetEngineInfo(const Graph* g,
|
|||||||
}
|
}
|
||||||
info->device = DeviceNameUtils::ParsedNameToString(segment_device);
|
info->device = DeviceNameUtils::ParsedNameToString(segment_device);
|
||||||
} else {
|
} else {
|
||||||
TfGpuId tf_gpu_id;
|
TfDeviceId tf_device_id;
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
|
std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
|
||||||
if (tf_gpu_id.value() >= 0) {
|
if (tf_device_id.value() >= 0) {
|
||||||
DeviceNameUtils::ParsedName parsed_name;
|
DeviceNameUtils::ParsedName parsed_name;
|
||||||
parsed_name.type = "GPU";
|
parsed_name.type = "GPU";
|
||||||
parsed_name.has_type = true;
|
parsed_name.has_type = true;
|
||||||
parsed_name.id = tf_gpu_id.value();
|
parsed_name.id = tf_device_id.value();
|
||||||
parsed_name.has_id = true;
|
parsed_name.has_id = true;
|
||||||
info->device = DeviceNameUtils::ParsedNameToString(parsed_name);
|
info->device = DeviceNameUtils::ParsedNameToString(parsed_name);
|
||||||
} else {
|
} else {
|
||||||
@ -640,17 +642,17 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
|
|||||||
if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
|
if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
|
||||||
engine.device.empty()) {
|
engine.device.empty()) {
|
||||||
// If device is not set, use the first found GPU device for the conversion.
|
// If device is not set, use the first found GPU device for the conversion.
|
||||||
TfGpuId tf_gpu_id;
|
TfDeviceId tf_device_id;
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
|
std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
|
||||||
cuda_device_id = platform_gpu_id.value();
|
cuda_device_id = platform_device_id.value();
|
||||||
if (cuda_device_id >= 0) {
|
if (cuda_device_id >= 0) {
|
||||||
GPUOptions gpu_options;
|
GPUOptions gpu_options;
|
||||||
// If the TF to Cuda gpu id mapping exist, the device and corresponding
|
// If the TF to Cuda gpu id mapping exist, the device and corresponding
|
||||||
// allocator must have been initialized already, so the
|
// allocator must have been initialized already, so the
|
||||||
// GetGPUAllocator() call won't create a new allocator.
|
// GetGPUAllocator() call won't create a new allocator.
|
||||||
dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
|
dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
|
||||||
gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
|
gpu_options, tf_device_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
|
||||||
}
|
}
|
||||||
return std::make_pair(cuda_device_id, dev_allocator);
|
return std::make_pair(cuda_device_id, dev_allocator);
|
||||||
}
|
}
|
||||||
|
@ -1044,25 +1044,25 @@ Status TRTEngineOp::AllocateCalibrationResources(
|
|||||||
}
|
}
|
||||||
cres->calibrator_.reset(
|
cres->calibrator_.reset(
|
||||||
new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
|
new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
|
||||||
const int platform_gpu_id =
|
const int platform_device_id =
|
||||||
ctx->device()->tensorflow_gpu_device_info()->gpu_id;
|
ctx->device()->tensorflow_gpu_device_info()->gpu_id;
|
||||||
if (platform_gpu_id < 0) {
|
if (platform_device_id < 0) {
|
||||||
LOG(ERROR) << "Can't get gpu_device_info from context->device()";
|
LOG(ERROR) << "Can't get gpu_device_info from context->device()";
|
||||||
return errors::InvalidArgument(
|
return errors::InvalidArgument(
|
||||||
"Context->device doesn't contain device info!");
|
"Context->device doesn't contain device info!");
|
||||||
}
|
}
|
||||||
|
|
||||||
cache_res->Ref();
|
cache_res->Ref();
|
||||||
cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id,
|
cres->thr_.reset(new std::thread([this, cres, shapes, platform_device_id,
|
||||||
cache_res]() {
|
cache_res]() {
|
||||||
core::ScopedUnref sc(cache_res);
|
core::ScopedUnref sc(cache_res);
|
||||||
|
|
||||||
VLOG(1) << "Starting calibration thread on device " << platform_gpu_id
|
VLOG(1) << "Starting calibration thread on device " << platform_device_id
|
||||||
<< ", Calibration Resource @ " << cres;
|
<< ", Calibration Resource @ " << cres;
|
||||||
auto err = cudaSetDevice(platform_gpu_id);
|
auto err = cudaSetDevice(platform_device_id);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
// TODO(aaroey): should return error here.
|
// TODO(aaroey): should return error here.
|
||||||
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
|
LOG(ERROR) << "Couldn't set cuda device to " << platform_device_id
|
||||||
<< " in calibration thread";
|
<< " in calibration thread";
|
||||||
}
|
}
|
||||||
std::vector<PartialTensorShape> partial_shapes(shapes.begin(),
|
std::vector<PartialTensorShape> partial_shapes(shapes.begin(),
|
||||||
|
@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
|
|||||||
DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
|
DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
|
||||||
gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
|
gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
|
||||||
gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
|
gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
|
||||||
GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
|
GPUOptions(), TfDeviceId(0), memory_limit, /*peer_gpu_ids=*/{});
|
||||||
host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
|
host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,11 +26,11 @@ limitations under the License.
|
|||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
|
|
||||||
GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
|
GPUcudaMallocAllocator::GPUcudaMallocAllocator(
|
||||||
PlatformGpuId platform_gpu_id)
|
Allocator* allocator, PlatformDeviceId platform_device_id)
|
||||||
: base_allocator_(allocator) {
|
: base_allocator_(allocator) {
|
||||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||||
platform_gpu_id)
|
platform_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ namespace tensorflow {
|
|||||||
class GPUcudaMallocAllocator : public Allocator {
|
class GPUcudaMallocAllocator : public Allocator {
|
||||||
public:
|
public:
|
||||||
explicit GPUcudaMallocAllocator(Allocator* allocator,
|
explicit GPUcudaMallocAllocator(Allocator* allocator,
|
||||||
PlatformGpuId platform_gpu_id);
|
PlatformDeviceId platform_device_id);
|
||||||
~GPUcudaMallocAllocator() override;
|
~GPUcudaMallocAllocator() override;
|
||||||
string Name() override { return "gpu_debug"; }
|
string Name() override { return "gpu_debug"; }
|
||||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||||
|
@ -42,12 +42,12 @@ static std::string GetCudaErrorMessage(CUresult result) {
|
|||||||
#endif // GOOGLE_CUDA
|
#endif // GOOGLE_CUDA
|
||||||
|
|
||||||
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
||||||
PlatformGpuId platform_gpu_id, size_t pool_size, bool reserve_memory,
|
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
|
||||||
bool compute_stats)
|
bool compute_stats)
|
||||||
: name_(absl::StrCat("gpu_async_", platform_gpu_id.value())) {
|
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
|
||||||
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
|
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
|
||||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||||
platform_gpu_id)
|
platform_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
// Initialized here as it only exist if compiled with a recent
|
// Initialized here as it only exist if compiled with a recent
|
||||||
// enough CUDA.
|
// enough CUDA.
|
||||||
@ -56,7 +56,7 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
|||||||
// WAR an CUDA 11.2 driver bug for multiple-GPU. It currently
|
// WAR an CUDA 11.2 driver bug for multiple-GPU. It currently
|
||||||
// request that the context on GPU 0 is initialized. Which isn't the
|
// request that the context on GPU 0 is initialized. Which isn't the
|
||||||
// case for TF+horovod.
|
// case for TF+horovod.
|
||||||
if (platform_gpu_id.value() > 0) {
|
if (platform_device_id.value() > 0) {
|
||||||
CUcontext pctx; // We loose track of it. But this is fine.
|
CUcontext pctx; // We loose track of it. But this is fine.
|
||||||
if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
|
if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
|
||||||
LOG(FATAL) // Crash OK.
|
LOG(FATAL) // Crash OK.
|
||||||
@ -65,9 +65,10 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
|||||||
|
|
||||||
se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
|
se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
|
||||||
int cuda_malloc_async_supported;
|
int cuda_malloc_async_supported;
|
||||||
if (auto status = cuDeviceGetAttribute(
|
if (auto status =
|
||||||
&cuda_malloc_async_supported,
|
cuDeviceGetAttribute(&cuda_malloc_async_supported,
|
||||||
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, platform_gpu_id.value()))
|
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
|
||||||
|
platform_device_id.value()))
|
||||||
LOG(FATAL) << // Crash OK.
|
LOG(FATAL) << // Crash OK.
|
||||||
"Failed to get device attribute: " << GetCudaErrorMessage(status);
|
"Failed to get device attribute: " << GetCudaErrorMessage(status);
|
||||||
if (!cuda_malloc_async_supported)
|
if (!cuda_malloc_async_supported)
|
||||||
@ -79,12 +80,13 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
|||||||
LOG(FATAL) // Crash OK.
|
LOG(FATAL) // Crash OK.
|
||||||
<< "Failed to create CUDA stream: " << GetCudaErrorMessage(status);
|
<< "Failed to create CUDA stream: " << GetCudaErrorMessage(status);
|
||||||
|
|
||||||
if (auto status = cuDeviceGetDefaultMemPool(&pool_, platform_gpu_id.value()))
|
if (auto status =
|
||||||
|
cuDeviceGetDefaultMemPool(&pool_, platform_device_id.value()))
|
||||||
LOG(FATAL) << // Crash OK.
|
LOG(FATAL) << // Crash OK.
|
||||||
"Failed to get default CUDA pool: " << GetCudaErrorMessage(status);
|
"Failed to get default CUDA pool: " << GetCudaErrorMessage(status);
|
||||||
|
|
||||||
VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
|
VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
|
||||||
<< platform_gpu_id.value() << " with pool size of: " << pool_size
|
<< platform_device_id.value() << " with pool size of: " << pool_size
|
||||||
<< " this ptr: " << this;
|
<< " this ptr: " << this;
|
||||||
uint64_t pool_size_64 = pool_size;
|
uint64_t pool_size_64 = pool_size;
|
||||||
if (auto status = cuMemPoolSetAttribute(
|
if (auto status = cuMemPoolSetAttribute(
|
||||||
|
@ -64,7 +64,7 @@ namespace tensorflow {
|
|||||||
// driver can return the excess memory to other processes.
|
// driver can return the excess memory to other processes.
|
||||||
class GpuCudaMallocAsyncAllocator : public Allocator {
|
class GpuCudaMallocAsyncAllocator : public Allocator {
|
||||||
public:
|
public:
|
||||||
explicit GpuCudaMallocAsyncAllocator(PlatformGpuId platform_gpu_id,
|
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
|
||||||
size_t pool_size,
|
size_t pool_size,
|
||||||
bool reserve_memory = false,
|
bool reserve_memory = false,
|
||||||
bool compute_stats = false);
|
bool compute_stats = false);
|
||||||
|
@ -76,10 +76,10 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
|
|||||||
// GPUDebugAllocator
|
// GPUDebugAllocator
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
|
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
|
||||||
PlatformGpuId platform_gpu_id)
|
PlatformDeviceId platform_device_id)
|
||||||
: base_allocator_(allocator) {
|
: base_allocator_(allocator) {
|
||||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||||
platform_gpu_id)
|
platform_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,10 +155,10 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
|
|||||||
// GPUNanResetAllocator
|
// GPUNanResetAllocator
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
|
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
|
||||||
PlatformGpuId platform_gpu_id)
|
PlatformDeviceId platform_device_id)
|
||||||
: base_allocator_(allocator) {
|
: base_allocator_(allocator) {
|
||||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||||
platform_gpu_id)
|
platform_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ namespace tensorflow {
|
|||||||
class GPUDebugAllocator : public Allocator {
|
class GPUDebugAllocator : public Allocator {
|
||||||
public:
|
public:
|
||||||
explicit GPUDebugAllocator(Allocator* allocator,
|
explicit GPUDebugAllocator(Allocator* allocator,
|
||||||
PlatformGpuId platform_gpu_id);
|
PlatformDeviceId platform_device_id);
|
||||||
~GPUDebugAllocator() override;
|
~GPUDebugAllocator() override;
|
||||||
string Name() override { return "gpu_debug"; }
|
string Name() override { return "gpu_debug"; }
|
||||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||||
@ -64,7 +64,7 @@ class GPUDebugAllocator : public Allocator {
|
|||||||
class GPUNanResetAllocator : public Allocator {
|
class GPUNanResetAllocator : public Allocator {
|
||||||
public:
|
public:
|
||||||
explicit GPUNanResetAllocator(Allocator* allocator,
|
explicit GPUNanResetAllocator(Allocator* allocator,
|
||||||
PlatformGpuId platform_gpu_id);
|
PlatformDeviceId platform_device_id);
|
||||||
~GPUNanResetAllocator() override;
|
~GPUNanResetAllocator() override;
|
||||||
string Name() override { return "gpu_nan_reset"; }
|
string Name() override { return "gpu_nan_reset"; }
|
||||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||||
|
@ -37,7 +37,7 @@ limitations under the License.
|
|||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
se::StreamExecutor* ExecutorForPlatformGpuId(
|
se::StreamExecutor* ExecutorForPlatformDeviceId(
|
||||||
PlatformDeviceId platform_device_id) {
|
PlatformDeviceId platform_device_id) {
|
||||||
return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||||
platform_device_id)
|
platform_device_id)
|
||||||
@ -45,12 +45,12 @@ se::StreamExecutor* ExecutorForPlatformGpuId(
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
|
TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||||
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
|
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
|
||||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
|
|
||||||
for (int s : {8}) {
|
for (int s : {8}) {
|
||||||
std::vector<int64> cpu_array(s);
|
std::vector<int64> cpu_array(s);
|
||||||
@ -72,13 +72,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
|
|||||||
for (int s : {8, 211}) {
|
for (int s : {8, 211}) {
|
||||||
EXPECT_DEATH(
|
EXPECT_DEATH(
|
||||||
{
|
{
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||||
DeviceMemAllocator* sub_allocator =
|
DeviceMemAllocator* sub_allocator =
|
||||||
new DeviceMemAllocator(stream_exec, platform_gpu_id,
|
new DeviceMemAllocator(stream_exec, platform_device_id,
|
||||||
false /*use_unified_memory*/, {}, {});
|
false /*use_unified_memory*/, {}, {});
|
||||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
|
|
||||||
std::vector<int64> cpu_array(s);
|
std::vector<int64> cpu_array(s);
|
||||||
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
|
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
|
||||||
@ -108,13 +108,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
|
|||||||
for (int s : {8, 22}) {
|
for (int s : {8, 22}) {
|
||||||
EXPECT_DEATH(
|
EXPECT_DEATH(
|
||||||
{
|
{
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||||
DeviceMemAllocator* sub_allocator =
|
DeviceMemAllocator* sub_allocator =
|
||||||
new DeviceMemAllocator(stream_exec, platform_gpu_id,
|
new DeviceMemAllocator(stream_exec, platform_device_id,
|
||||||
false /*use_unified_memory*/, {}, {});
|
false /*use_unified_memory*/, {}, {});
|
||||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
|
|
||||||
std::vector<int64> cpu_array(s);
|
std::vector<int64> cpu_array(s);
|
||||||
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
|
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
|
||||||
@ -141,12 +141,12 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(GPUDebugAllocatorTest, ResetToNan) {
|
TEST(GPUDebugAllocatorTest, ResetToNan) {
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||||
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
|
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
|
||||||
GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
|
|
||||||
std::vector<float> cpu_array(1024);
|
std::vector<float> cpu_array(1024);
|
||||||
std::vector<float> cpu_array_result(1024);
|
std::vector<float> cpu_array_result(1024);
|
||||||
@ -183,15 +183,15 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
|
TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||||
// NaN reset must be the outer-most allocator.
|
// NaN reset must be the outer-most allocator.
|
||||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||||
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
|
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
|
||||||
GPUNanResetAllocator a(
|
GPUNanResetAllocator a(
|
||||||
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id),
|
platform_device_id),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
|
|
||||||
std::vector<float> cpu_array(1024);
|
std::vector<float> cpu_array(1024);
|
||||||
std::vector<float> cpu_array_result(1024);
|
std::vector<float> cpu_array_result(1024);
|
||||||
@ -228,24 +228,24 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(GPUDebugAllocatorTest, TracksSizes) {
|
TEST(GPUDebugAllocatorTest, TracksSizes) {
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||||
ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
|
ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
|
||||||
false /*use_unified_memory*/, {}, {});
|
false /*use_unified_memory*/, {}, {});
|
||||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
EXPECT_EQ(true, a.TracksAllocationSizes());
|
EXPECT_EQ(true, a.TracksAllocationSizes());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
|
TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
|
||||||
const PlatformGpuId platform_gpu_id(0);
|
const PlatformDeviceId platform_device_id(0);
|
||||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||||
ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
|
ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
|
||||||
false /*use_unified_memory*/, {}, {});
|
false /*use_unified_memory*/, {}, {});
|
||||||
GPUNanResetAllocator a(
|
GPUNanResetAllocator a(
|
||||||
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||||
platform_gpu_id),
|
platform_device_id),
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
|
float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
|
||||||
EXPECT_EQ(4, a.RequestedSize(t1));
|
EXPECT_EQ(4, a.RequestedSize(t1));
|
||||||
EXPECT_EQ(256, a.AllocatedSize(t1));
|
EXPECT_EQ(256, a.AllocatedSize(t1));
|
||||||
|
@ -120,7 +120,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
|
|||||||
}
|
}
|
||||||
~EigenGpuStreamDevice() override {}
|
~EigenGpuStreamDevice() override {}
|
||||||
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
|
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
|
||||||
TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
|
TfDeviceId tf_device_id, ::tensorflow::Allocator* alloc,
|
||||||
char* scratch) {
|
char* scratch) {
|
||||||
if (LogMemory::IsEnabled()) {
|
if (LogMemory::IsEnabled()) {
|
||||||
operation_ = context->op_kernel().name() + "/EigenAllocator";
|
operation_ = context->op_kernel().name() + "/EigenAllocator";
|
||||||
@ -132,9 +132,10 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
|
|||||||
reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
|
reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
|
||||||
stream_ = gpu_stream;
|
stream_ = gpu_stream;
|
||||||
allocator_ = alloc;
|
allocator_ = alloc;
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
TF_CHECK_OK(
|
||||||
device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()];
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||||
|
device_prop_ = &Eigen::m_deviceProperties[platform_device_id.value()];
|
||||||
}
|
}
|
||||||
|
|
||||||
const gpuStream_t& stream() const override { return *stream_; }
|
const gpuStream_t& stream() const override { return *stream_; }
|
||||||
@ -233,18 +234,18 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
|
|||||||
class BaseGPUDevice::StreamGroupFactory {
|
class BaseGPUDevice::StreamGroupFactory {
|
||||||
public:
|
public:
|
||||||
// Returns the unique stream group for use with the stream defined by
|
// Returns the unique stream group for use with the stream defined by
|
||||||
// {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
|
// {tf_device_id, stream_group_within_gpu}, creating it if it does not yet
|
||||||
// exist.
|
// exist.
|
||||||
// This function is thread safe.
|
// This function is thread safe.
|
||||||
BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
|
BaseGPUDevice::StreamGroup* GetOrCreate(TfDeviceId tf_device_id,
|
||||||
int stream_group_within_gpu,
|
int stream_group_within_gpu,
|
||||||
se::StreamExecutor* executor,
|
se::StreamExecutor* executor,
|
||||||
const GPUOptions& options) {
|
const GPUOptions& options) {
|
||||||
mutex_lock guard(lock_);
|
mutex_lock guard(lock_);
|
||||||
StreamGroup* group =
|
StreamGroup* group =
|
||||||
&streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
|
&streams_[key_type(tf_device_id.value(), stream_group_within_gpu)];
|
||||||
if (!group->compute) {
|
if (!group->compute) {
|
||||||
int priority = GetPriority(tf_gpu_id.value(), options);
|
int priority = GetPriority(tf_device_id.value(), options);
|
||||||
group->priority = priority;
|
group->priority = priority;
|
||||||
group->compute = GetStream(executor, priority);
|
group->compute = GetStream(executor, priority);
|
||||||
group->compute->Init();
|
group->compute->Init();
|
||||||
@ -339,8 +340,8 @@ class BaseGPUDevice::StreamGroupFactory {
|
|||||||
private:
|
private:
|
||||||
// Returns priority for the given virtual GPU id from the session options.
|
// Returns priority for the given virtual GPU id from the session options.
|
||||||
// Returns 0 if no virtual devices are specified.
|
// Returns 0 if no virtual devices are specified.
|
||||||
int GetPriority(int tf_gpu_id, const GPUOptions& options) {
|
int GetPriority(int tf_device_id, const GPUOptions& options) {
|
||||||
int id = tf_gpu_id;
|
int id = tf_device_id;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int priority = 0;
|
int priority = 0;
|
||||||
while (i < options.experimental().virtual_devices_size()) {
|
while (i < options.experimental().virtual_devices_size()) {
|
||||||
@ -378,7 +379,7 @@ class BaseGPUDevice::StreamGroupFactory {
|
|||||||
|
|
||||||
BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
|
BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
|
||||||
Bytes memory_limit, const DeviceLocality& locality,
|
Bytes memory_limit, const DeviceLocality& locality,
|
||||||
TfGpuId tf_gpu_id,
|
TfDeviceId tf_device_id,
|
||||||
const string& physical_device_desc,
|
const string& physical_device_desc,
|
||||||
Allocator* gpu_allocator, Allocator* cpu_allocator,
|
Allocator* gpu_allocator, Allocator* cpu_allocator,
|
||||||
bool sync_every_op)
|
bool sync_every_op)
|
||||||
@ -388,7 +389,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
|
|||||||
gpu_allocator_(gpu_allocator),
|
gpu_allocator_(gpu_allocator),
|
||||||
cpu_allocator_(cpu_allocator),
|
cpu_allocator_(cpu_allocator),
|
||||||
scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
|
scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
|
||||||
tf_gpu_id_(tf_gpu_id),
|
tf_device_id_(tf_device_id),
|
||||||
sync_every_op_(sync_every_op) {
|
sync_every_op_(sync_every_op) {
|
||||||
GPUProcessState::singleton()->EnableGPUDevice();
|
GPUProcessState::singleton()->EnableGPUDevice();
|
||||||
}
|
}
|
||||||
@ -410,7 +411,8 @@ Status BaseGPUDevice::InitScratchBuffers() {
|
|||||||
Allocator::kAllocatorAlignment, scratch_buffer_size);
|
Allocator::kAllocatorAlignment, scratch_buffer_size);
|
||||||
if (scratch_buffer == nullptr) {
|
if (scratch_buffer == nullptr) {
|
||||||
return errors::FailedPrecondition(
|
return errors::FailedPrecondition(
|
||||||
"Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
|
"Failed to allocate scratch buffer for device ",
|
||||||
|
tf_device_id_.value());
|
||||||
}
|
}
|
||||||
se::DeviceMemory<char> mem(
|
se::DeviceMemory<char> mem(
|
||||||
se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
|
se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
|
||||||
@ -423,16 +425,16 @@ Status BaseGPUDevice::InitScratchBuffers() {
|
|||||||
|
|
||||||
Status BaseGPUDevice::Init(const SessionOptions& options) {
|
Status BaseGPUDevice::Init(const SessionOptions& options) {
|
||||||
auto executor_status = DeviceIdUtil::ExecutorForTfDeviceId(
|
auto executor_status = DeviceIdUtil::ExecutorForTfDeviceId(
|
||||||
DEVICE_GPU, GPUMachineManager(), tf_gpu_id_);
|
DEVICE_GPU, GPUMachineManager(), tf_device_id_);
|
||||||
if (!executor_status.status().ok()) {
|
if (!executor_status.status().ok()) {
|
||||||
return errors::Internal("Failed to get StreamExecutor for device ",
|
return errors::Internal("Failed to get StreamExecutor for device ",
|
||||||
tf_gpu_id_.value());
|
tf_device_id_.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
executor_ = executor_status.ValueOrDie();
|
executor_ = executor_status.ValueOrDie();
|
||||||
|
|
||||||
stream_ = StreamGroupFactory::Global().GetOrCreate(
|
stream_ = StreamGroupFactory::Global().GetOrCreate(
|
||||||
tf_gpu_id_, 0, executor_, options.config.gpu_options());
|
tf_device_id_, 0, executor_, options.config.gpu_options());
|
||||||
device_context_ =
|
device_context_ =
|
||||||
new GPUDeviceContext(0, stream_->compute,
|
new GPUDeviceContext(0, stream_->compute,
|
||||||
#if TENSORFLOW_USE_ROCM
|
#if TENSORFLOW_USE_ROCM
|
||||||
@ -461,7 +463,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
|||||||
// The GPUKernelTracker will use this SharedCounter, instead of
|
// The GPUKernelTracker will use this SharedCounter, instead of
|
||||||
// owning its own.
|
// owning its own.
|
||||||
timing_counter =
|
timing_counter =
|
||||||
GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id_);
|
GPUProcessState::singleton()->GPUAllocatorCounter(tf_device_id_);
|
||||||
DCHECK(timing_counter);
|
DCHECK(timing_counter);
|
||||||
}
|
}
|
||||||
kernel_tracker_.reset(new GPUKernelTracker(
|
kernel_tracker_.reset(new GPUKernelTracker(
|
||||||
@ -473,10 +475,10 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
|||||||
gpu_device_info_->stream = stream_->compute;
|
gpu_device_info_->stream = stream_->compute;
|
||||||
gpu_device_info_->default_context = device_context_;
|
gpu_device_info_->default_context = device_context_;
|
||||||
gpu_device_info_->event_mgr = em_;
|
gpu_device_info_->event_mgr = em_;
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
|
||||||
gpu_device_info_->gpu_id = platform_gpu_id.value();
|
gpu_device_info_->gpu_id = platform_device_id.value();
|
||||||
set_tensorflow_gpu_device_info(gpu_device_info_);
|
set_tensorflow_gpu_device_info(gpu_device_info_);
|
||||||
|
|
||||||
// Whether and how the GPU device uses its own threadpool.
|
// Whether and how the GPU device uses its own threadpool.
|
||||||
@ -505,7 +507,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
|||||||
// TODO(zhengxq): pin the thread to the same socket of the target GPU.
|
// TODO(zhengxq): pin the thread to the same socket of the target GPU.
|
||||||
thread_pool_.reset(new thread::ThreadPool(
|
thread_pool_.reset(new thread::ThreadPool(
|
||||||
options.env, ThreadOptions(),
|
options.env, ThreadOptions(),
|
||||||
strings::StrCat("gpu_private_", tf_gpu_id_.value()),
|
strings::StrCat("gpu_private_", tf_device_id_.value()),
|
||||||
static_cast<int32>(gpu_thread_count),
|
static_cast<int32>(gpu_thread_count),
|
||||||
!options.config.experimental().disable_thread_spinning(),
|
!options.config.experimental().disable_thread_spinning(),
|
||||||
/*allocator=*/nullptr));
|
/*allocator=*/nullptr));
|
||||||
@ -531,8 +533,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
|||||||
string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
|
string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
|
||||||
const int& stream_id) {
|
const int& stream_id) {
|
||||||
return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
|
return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
|
||||||
" on GPU ", tf_gpu_id_.value(), " stream[", stream_id,
|
" on GPU ", tf_device_id_.value(), " stream[",
|
||||||
"]");
|
stream_id, "]");
|
||||||
}
|
}
|
||||||
|
|
||||||
void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
|
void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
|
||||||
@ -624,8 +626,8 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
|
|||||||
const auto stream_id = gpu_device_context->stream_id();
|
const auto stream_id = gpu_device_context->stream_id();
|
||||||
|
|
||||||
VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
|
VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
|
||||||
<< op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
|
<< op_kernel->type_string() << " on GPU" << tf_device_id_
|
||||||
<< stream_id << "]";
|
<< " stream[" << stream_id << "]";
|
||||||
|
|
||||||
ScopedActivateExecutorContext scoped_activation{stream->parent()};
|
ScopedActivateExecutorContext scoped_activation{stream->parent()};
|
||||||
op_kernel->ComputeAsync(context, std::move(done));
|
op_kernel->ComputeAsync(context, std::move(done));
|
||||||
@ -763,10 +765,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
|
|||||||
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
|
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
|
||||||
|
|
||||||
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
|
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
|
||||||
TfGpuId tf_gpu_id, Allocator* base_allocator,
|
TfDeviceId tf_device_id, Allocator* base_allocator,
|
||||||
char* scratch) {
|
char* scratch) {
|
||||||
stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator,
|
stream_device_.Reinitialize(context, gpu_stream, tf_device_id,
|
||||||
scratch);
|
base_allocator, scratch);
|
||||||
}
|
}
|
||||||
|
|
||||||
const Eigen::GpuDevice& device() const override { return device_; }
|
const Eigen::GpuDevice& device() const override { return device_; }
|
||||||
@ -777,8 +779,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Parse 'visible_device_list' into a list of platform GPU ids.
|
// Parse 'visible_device_list' into a list of platform GPU ids.
|
||||||
Status ParseVisibleDeviceList(const string& visible_device_list,
|
Status ParseVisibleDeviceList(
|
||||||
std::vector<PlatformGpuId>* visible_gpu_order) {
|
const string& visible_device_list,
|
||||||
|
std::vector<PlatformDeviceId>* visible_gpu_order) {
|
||||||
visible_gpu_order->clear();
|
visible_gpu_order->clear();
|
||||||
se::Platform* gpu_manager = GPUMachineManager();
|
se::Platform* gpu_manager = GPUMachineManager();
|
||||||
|
|
||||||
@ -793,28 +796,28 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
|
|||||||
} else {
|
} else {
|
||||||
const std::vector<string> order_str =
|
const std::vector<string> order_str =
|
||||||
str_util::Split(visible_device_list, ',');
|
str_util::Split(visible_device_list, ',');
|
||||||
for (const string& platform_gpu_id_str : order_str) {
|
for (const string& platform_device_id_str : order_str) {
|
||||||
int32 platform_gpu_id;
|
int32 platform_device_id;
|
||||||
if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
|
if (!strings::safe_strto32(platform_device_id_str, &platform_device_id)) {
|
||||||
return errors::InvalidArgument(
|
return errors::InvalidArgument(
|
||||||
"Could not parse entry in 'visible_device_list': '",
|
"Could not parse entry in 'visible_device_list': '",
|
||||||
platform_gpu_id_str,
|
platform_device_id_str,
|
||||||
"'. visible_device_list = ", visible_device_list);
|
"'. visible_device_list = ", visible_device_list);
|
||||||
}
|
}
|
||||||
if (platform_gpu_id < 0 ||
|
if (platform_device_id < 0 ||
|
||||||
platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
|
platform_device_id >= gpu_manager->VisibleDeviceCount()) {
|
||||||
return errors::InvalidArgument(
|
return errors::InvalidArgument(
|
||||||
"'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
|
"'visible_device_list' listed an invalid GPU id '",
|
||||||
"' but visible device count is ",
|
platform_device_id, "' but visible device count is ",
|
||||||
gpu_manager->VisibleDeviceCount());
|
gpu_manager->VisibleDeviceCount());
|
||||||
}
|
}
|
||||||
visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id));
|
visible_gpu_order->push_back(PlatformDeviceId(platform_device_id));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate no repeats.
|
// Validate no repeats.
|
||||||
std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(),
|
std::set<PlatformDeviceId> visible_device_set(visible_gpu_order->begin(),
|
||||||
visible_gpu_order->end());
|
visible_gpu_order->end());
|
||||||
if (visible_device_set.size() != visible_gpu_order->size()) {
|
if (visible_device_set.size() != visible_gpu_order->size()) {
|
||||||
return errors::InvalidArgument(
|
return errors::InvalidArgument(
|
||||||
"visible_device_list contained a duplicate entry: ",
|
"visible_device_list contained a duplicate entry: ",
|
||||||
@ -825,8 +828,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
|
|||||||
|
|
||||||
Status VerifyVirtualDeviceSettings(
|
Status VerifyVirtualDeviceSettings(
|
||||||
const size_t num_gpus_to_use, const GPUOptions& gpu_options,
|
const size_t num_gpus_to_use, const GPUOptions& gpu_options,
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||||
const std::vector<PlatformGpuId>& valid_platform_gpu_ids,
|
const std::vector<PlatformDeviceId>& valid_platform_device_ids,
|
||||||
const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
|
const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
|
||||||
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
|
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
|
||||||
CHECK(!virtual_devices.empty());
|
CHECK(!virtual_devices.empty());
|
||||||
@ -849,11 +852,11 @@ Status VerifyVirtualDeviceSettings(
|
|||||||
" #GPUs in visible_device_list: ", visible_gpu_order.size(),
|
" #GPUs in visible_device_list: ", visible_gpu_order.size(),
|
||||||
" virtual_devices.size(): ", virtual_devices.size());
|
" virtual_devices.size(): ", virtual_devices.size());
|
||||||
}
|
}
|
||||||
if (valid_platform_gpu_ids.size() != virtual_devices.size()) {
|
if (valid_platform_device_ids.size() != virtual_devices.size()) {
|
||||||
return errors::Unknown(
|
return errors::Unknown(
|
||||||
"The number of valid GPUs doesn't match the number of elements in "
|
"The number of valid GPUs doesn't match the number of elements in "
|
||||||
"the virtual_devices list.",
|
"the virtual_devices list.",
|
||||||
" #valid GPUs: ", valid_platform_gpu_ids.size(),
|
" #valid GPUs: ", valid_platform_device_ids.size(),
|
||||||
" virtual_devices.size(): ", virtual_devices.size());
|
" virtual_devices.size(): ", virtual_devices.size());
|
||||||
}
|
}
|
||||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||||
@ -882,7 +885,7 @@ Status VerifyVirtualDeviceSettings(
|
|||||||
i, " memory_limit_mb size: ", memory_limit_mb.size(),
|
i, " memory_limit_mb size: ", memory_limit_mb.size(),
|
||||||
" and priority size: ", priority.size());
|
" and priority size: ", priority.size());
|
||||||
}
|
}
|
||||||
const int gpu_id = valid_platform_gpu_ids[i].value();
|
const int gpu_id = valid_platform_device_ids[i].value();
|
||||||
auto it = supported_priority_ranges.find(gpu_id);
|
auto it = supported_priority_ranges.find(gpu_id);
|
||||||
if (it == supported_priority_ranges.end()) {
|
if (it == supported_priority_ranges.end()) {
|
||||||
return errors::Internal(
|
return errors::Internal(
|
||||||
@ -950,19 +953,19 @@ int64 MinSystemMemory(int64 available_memory, int cc_major) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the memory limit for the virtual device being created on GPU with
|
// Get the memory limit for the virtual device being created on GPU with
|
||||||
// 'platform_gpu_id', when that virtual device is the only virtual device being
|
// 'platform_device_id', when that virtual device is the only virtual device
|
||||||
// created on that GPU.
|
// being created on that GPU.
|
||||||
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
|
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
|
||||||
PlatformGpuId platform_gpu_id,
|
PlatformDeviceId platform_device_id,
|
||||||
int64* memory_limit) {
|
int64* memory_limit) {
|
||||||
int64 total_memory = 0;
|
int64 total_memory = 0;
|
||||||
int64 available_memory = 0;
|
int64 available_memory = 0;
|
||||||
se::StreamExecutor* se = DeviceIdUtil::ExecutorForPlatformDeviceId(
|
se::StreamExecutor* se = DeviceIdUtil::ExecutorForPlatformDeviceId(
|
||||||
GPUMachineManager(), platform_gpu_id)
|
GPUMachineManager(), platform_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
|
if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
|
||||||
return errors::Unknown("Failed to query available memory for GPU ",
|
return errors::Unknown("Failed to query available memory for GPU ",
|
||||||
platform_gpu_id.value());
|
platform_device_id.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 allocated_memory = 0;
|
int64 allocated_memory = 0;
|
||||||
@ -1037,7 +1040,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
|
|||||||
DCHECK_EQ(stream_id, 0);
|
DCHECK_EQ(stream_id, 0);
|
||||||
const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
|
const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
|
||||||
stream_->compute->implementation()->GpuStreamMemberHack());
|
stream_->compute->implementation()->GpuStreamMemberHack());
|
||||||
concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator,
|
concrete_device->Reinitialize(context, gpu_stream, tf_device_id_, allocator,
|
||||||
scratch_);
|
scratch_);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1093,7 +1096,7 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
|
|||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<PlatformGpuId> visible_gpu_order(device_count);
|
std::vector<PlatformDeviceId> visible_gpu_order(device_count);
|
||||||
std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
|
std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
|
||||||
TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &cached_device_ids_));
|
TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &cached_device_ids_));
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
@ -1101,9 +1104,9 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
|
|||||||
|
|
||||||
Status BaseGPUDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
|
Status BaseGPUDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
|
||||||
TF_RETURN_IF_ERROR(CacheDeviceIds());
|
TF_RETURN_IF_ERROR(CacheDeviceIds());
|
||||||
for (PlatformGpuId platform_gpu_id : cached_device_ids_) {
|
for (PlatformDeviceId platform_device_id : cached_device_ids_) {
|
||||||
const string device_name =
|
const string device_name =
|
||||||
strings::StrCat("/physical_device:GPU:", platform_gpu_id.value());
|
strings::StrCat("/physical_device:GPU:", platform_device_id.value());
|
||||||
devices->push_back(device_name);
|
devices->push_back(device_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1117,14 +1120,15 @@ Status BaseGPUDeviceFactory::GetDeviceDetails(
|
|||||||
if (device_index < 0 || device_index > cached_device_ids_.size()) {
|
if (device_index < 0 || device_index > cached_device_ids_.size()) {
|
||||||
return errors::Internal("Invalid device index: ", device_index);
|
return errors::Internal("Invalid device index: ", device_index);
|
||||||
}
|
}
|
||||||
PlatformGpuId platform_gpu_id = cached_device_ids_[device_index];
|
PlatformDeviceId platform_device_id = cached_device_ids_[device_index];
|
||||||
|
|
||||||
TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
|
TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
|
||||||
se::Platform* gpu_manager = GPUMachineManager();
|
se::Platform* gpu_manager = GPUMachineManager();
|
||||||
if (gpu_manager == nullptr) {
|
if (gpu_manager == nullptr) {
|
||||||
return errors::Internal("Cannot get GPUMachineManager");
|
return errors::Internal("Cannot get GPUMachineManager");
|
||||||
}
|
}
|
||||||
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
|
auto desc_status =
|
||||||
|
gpu_manager->DescriptionForDevice(platform_device_id.value());
|
||||||
if (!desc_status.ok()) {
|
if (!desc_status.ok()) {
|
||||||
return desc_status.status();
|
return desc_status.status();
|
||||||
}
|
}
|
||||||
@ -1159,8 +1163,8 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
num_gpus_to_use = iter->second;
|
num_gpus_to_use = iter->second;
|
||||||
}
|
}
|
||||||
const auto& gpu_options = options.config.gpu_options();
|
const auto& gpu_options = options.config.gpu_options();
|
||||||
std::vector<PlatformGpuId> visible_gpu_order;
|
std::vector<PlatformDeviceId> visible_gpu_order;
|
||||||
std::vector<PlatformGpuId> valid_platform_gpu_ids;
|
std::vector<PlatformDeviceId> valid_platform_device_ids;
|
||||||
// If we aren't going to use any GPUs, don't initialize them.
|
// If we aren't going to use any GPUs, don't initialize them.
|
||||||
// We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
|
// We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
|
||||||
// because it treats an empty gpu_options.visible_device_list as 'all GPUs
|
// because it treats an empty gpu_options.visible_device_list as 'all GPUs
|
||||||
@ -1188,13 +1192,13 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
}
|
}
|
||||||
|
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids));
|
GetValidDeviceIds(visible_gpu_order, &valid_platform_device_ids));
|
||||||
}
|
}
|
||||||
if (num_gpus_to_use > valid_platform_gpu_ids.size()) {
|
if (num_gpus_to_use > valid_platform_device_ids.size()) {
|
||||||
num_gpus_to_use = valid_platform_gpu_ids.size();
|
num_gpus_to_use = valid_platform_device_ids.size();
|
||||||
}
|
}
|
||||||
std::map<int, std::pair<int, int>> supported_priority_ranges;
|
std::map<int, std::pair<int, int>> supported_priority_ranges;
|
||||||
if (!valid_platform_gpu_ids.empty()) {
|
if (!valid_platform_device_ids.empty()) {
|
||||||
// Save the original device.
|
// Save the original device.
|
||||||
int original_device = 0;
|
int original_device = 0;
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
@ -1213,18 +1217,18 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
|
|
||||||
// Force to implicitly initialize CUDA runtime on each valid GPU before
|
// Force to implicitly initialize CUDA runtime on each valid GPU before
|
||||||
// CreateGPUDevice().
|
// CreateGPUDevice().
|
||||||
for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
|
for (PlatformDeviceId platform_device_id : valid_platform_device_ids) {
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
err = cudaSetDevice(platform_gpu_id.value());
|
err = cudaSetDevice(platform_device_id.value());
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
return errors::Internal(
|
return errors::Internal(
|
||||||
"cudaSetDevice() on GPU:", platform_gpu_id.value(),
|
"cudaSetDevice() on GPU:", platform_device_id.value(),
|
||||||
" failed. Status: ", cudaGetErrorString(err));
|
" failed. Status: ", cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
err = cudaFree(nullptr);
|
err = cudaFree(nullptr);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
return errors::Internal("CUDA runtime implicit initialization on GPU:",
|
return errors::Internal("CUDA runtime implicit initialization on GPU:",
|
||||||
platform_gpu_id.value(),
|
platform_device_id.value(),
|
||||||
" failed. Status: ", cudaGetErrorString(err));
|
" failed. Status: ", cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
int priority_low, priority_high;
|
int priority_low, priority_high;
|
||||||
@ -1237,19 +1241,19 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
VLOG(1) << "Cuda stream priority range on GPU(" << original_device
|
VLOG(1) << "Cuda stream priority range on GPU(" << original_device
|
||||||
<< "): " << priority_high << "," << priority_low;
|
<< "): " << priority_high << "," << priority_low;
|
||||||
supported_priority_ranges.insert(
|
supported_priority_ranges.insert(
|
||||||
std::make_pair(platform_gpu_id.value(),
|
std::make_pair(platform_device_id.value(),
|
||||||
std::make_pair(priority_low, priority_high)));
|
std::make_pair(priority_low, priority_high)));
|
||||||
#elif TENSORFLOW_USE_ROCM
|
#elif TENSORFLOW_USE_ROCM
|
||||||
err = hipSetDevice(platform_gpu_id.value());
|
err = hipSetDevice(platform_device_id.value());
|
||||||
if (err != hipSuccess) {
|
if (err != hipSuccess) {
|
||||||
return errors::Internal(
|
return errors::Internal(
|
||||||
"hipSetDevice() on GPU:", platform_gpu_id.value(),
|
"hipSetDevice() on GPU:", platform_device_id.value(),
|
||||||
" failed. Status: ", hipGetErrorString(err));
|
" failed. Status: ", hipGetErrorString(err));
|
||||||
}
|
}
|
||||||
err = hipFree(nullptr);
|
err = hipFree(nullptr);
|
||||||
if (err != hipSuccess) {
|
if (err != hipSuccess) {
|
||||||
return errors::Internal("ROCm runtime implicit initialization on GPU:",
|
return errors::Internal("ROCm runtime implicit initialization on GPU:",
|
||||||
platform_gpu_id.value(),
|
platform_device_id.value(),
|
||||||
" failed. Status: ", hipGetErrorString(err));
|
" failed. Status: ", hipGetErrorString(err));
|
||||||
}
|
}
|
||||||
int priority_low, priority_high;
|
int priority_low, priority_high;
|
||||||
@ -1262,7 +1266,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
VLOG(1) << "HIP stream priority range on GPU(" << original_device
|
VLOG(1) << "HIP stream priority range on GPU(" << original_device
|
||||||
<< "): " << priority_high << "," << priority_low;
|
<< "): " << priority_high << "," << priority_low;
|
||||||
supported_priority_ranges.insert(
|
supported_priority_ranges.insert(
|
||||||
std::make_pair(platform_gpu_id.value(),
|
std::make_pair(platform_device_id.value(),
|
||||||
std::make_pair(priority_low, priority_high)));
|
std::make_pair(priority_low, priority_high)));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -1306,9 +1310,9 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
LOG(INFO) << line_buf;
|
LOG(INFO) << line_buf;
|
||||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||||
line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
|
line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
|
||||||
PlatformGpuId gpu_id_i = visible_gpu_order[i];
|
PlatformDeviceId gpu_id_i = visible_gpu_order[i];
|
||||||
for (int j = 0; j < visible_gpu_order.size(); ++j) {
|
for (int j = 0; j < visible_gpu_order.size(); ++j) {
|
||||||
PlatformGpuId gpu_id_j = visible_gpu_order[j];
|
PlatformDeviceId gpu_id_j = visible_gpu_order[j];
|
||||||
if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
|
if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
|
||||||
im.directed_links.end()) {
|
im.directed_links.end()) {
|
||||||
line_buf.append("Y ");
|
line_buf.append("Y ");
|
||||||
@ -1323,22 +1327,23 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
|
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
|
||||||
if (!virtual_devices.empty()) {
|
if (!virtual_devices.empty()) {
|
||||||
TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
|
TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
|
||||||
num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids,
|
num_gpus_to_use, gpu_options, visible_gpu_order,
|
||||||
supported_priority_ranges));
|
valid_platform_device_ids, supported_priority_ranges));
|
||||||
// We've verified that num_gpus_to_use >= virtual_devices.size().
|
// We've verified that num_gpus_to_use >= virtual_devices.size().
|
||||||
num_gpus_to_use = virtual_devices.size();
|
num_gpus_to_use = virtual_devices.size();
|
||||||
CHECK(gpu_options.visible_device_list().empty() ||
|
CHECK(gpu_options.visible_device_list().empty() ||
|
||||||
valid_platform_gpu_ids == visible_gpu_order);
|
valid_platform_device_ids == visible_gpu_order);
|
||||||
}
|
}
|
||||||
int next_tf_gpu_id = 0;
|
int next_tf_device_id = 0;
|
||||||
std::vector<int64> memory_limit_bytes;
|
std::vector<int64> memory_limit_bytes;
|
||||||
for (int i = 0; i < num_gpus_to_use; ++i) {
|
for (int i = 0; i < num_gpus_to_use; ++i) {
|
||||||
const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i];
|
const PlatformDeviceId platform_device_id = valid_platform_device_ids[i];
|
||||||
if (virtual_devices.empty() ||
|
if (virtual_devices.empty() ||
|
||||||
virtual_devices.Get(i).memory_limit_mb_size() == 0) {
|
virtual_devices.Get(i).memory_limit_mb_size() == 0) {
|
||||||
int64 single_virtual_device_memory_limit = 0;
|
int64 single_virtual_device_memory_limit = 0;
|
||||||
TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
|
TF_RETURN_IF_ERROR(
|
||||||
gpu_options, platform_gpu_id, &single_virtual_device_memory_limit));
|
SingleVirtualDeviceMemoryLimit(gpu_options, platform_device_id,
|
||||||
|
&single_virtual_device_memory_limit));
|
||||||
memory_limit_bytes.push_back(single_virtual_device_memory_limit);
|
memory_limit_bytes.push_back(single_virtual_device_memory_limit);
|
||||||
} else {
|
} else {
|
||||||
const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
|
const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
|
||||||
@ -1347,36 +1352,37 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
|||||||
return static_cast<int64>(mb) * (1ll << 20);
|
return static_cast<int64>(mb) * (1ll << 20);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
while (next_tf_gpu_id < memory_limit_bytes.size()) {
|
while (next_tf_device_id < memory_limit_bytes.size()) {
|
||||||
TfGpuId tf_gpu_id(next_tf_gpu_id);
|
TfDeviceId tf_device_id(next_tf_device_id);
|
||||||
++next_tf_gpu_id;
|
++next_tf_device_id;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(GpuIdManager::InsertTfPlatformDeviceIdPair(
|
||||||
GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id));
|
tf_device_id, platform_device_id));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const int num_tf_gpus = next_tf_gpu_id;
|
const int num_tf_gpus = next_tf_device_id;
|
||||||
|
|
||||||
LocalityMap device_localities;
|
LocalityMap device_localities;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
|
GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
|
||||||
|
|
||||||
// Build the GPUDevices
|
// Build the GPUDevices
|
||||||
CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
|
CHECK_EQ(next_tf_device_id, memory_limit_bytes.size());
|
||||||
for (int di = 0; di < num_tf_gpus; ++di) {
|
for (int di = 0; di < num_tf_gpus; ++di) {
|
||||||
TfGpuId tf_gpu_id(di);
|
TfDeviceId tf_device_id(di);
|
||||||
int64 bytes = memory_limit_bytes[di];
|
int64 bytes = memory_limit_bytes[di];
|
||||||
auto it = device_localities.find(tf_gpu_id);
|
auto it = device_localities.find(tf_device_id);
|
||||||
if (it == device_localities.end()) {
|
if (it == device_localities.end()) {
|
||||||
return errors::Internal("Failed to find DeviceLocality for GPU device ",
|
return errors::Internal("Failed to find DeviceLocality for GPU device ",
|
||||||
tf_gpu_id.value());
|
tf_device_id.value());
|
||||||
}
|
}
|
||||||
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
|
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_device_id,
|
||||||
it->second, num_tf_gpus, devices));
|
bytes, it->second, num_tf_gpus,
|
||||||
|
devices));
|
||||||
}
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
|
static string GetShortDeviceDescription(PlatformDeviceId platform_device_id,
|
||||||
const se::DeviceDescription& desc) {
|
const se::DeviceDescription& desc) {
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
int cc_major;
|
int cc_major;
|
||||||
@ -1386,54 +1392,56 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
|
|||||||
cc_minor = 0;
|
cc_minor = 0;
|
||||||
}
|
}
|
||||||
// LINT.IfChange
|
// LINT.IfChange
|
||||||
return strings::StrCat("device: ", platform_gpu_id.value(),
|
return strings::StrCat("device: ", platform_device_id.value(),
|
||||||
", name: ", desc.name(),
|
", name: ", desc.name(),
|
||||||
", pci bus id: ", desc.pci_bus_id(),
|
", pci bus id: ", desc.pci_bus_id(),
|
||||||
", compute capability: ", cc_major, ".", cc_minor);
|
", compute capability: ", cc_major, ".", cc_minor);
|
||||||
// LINT.ThenChange(//tensorflow/python/framework/gpu_util.py)
|
// LINT.ThenChange(//tensorflow/python/framework/gpu_util.py)
|
||||||
#elif TENSORFLOW_USE_ROCM
|
#elif TENSORFLOW_USE_ROCM
|
||||||
return strings::StrCat("device: ", platform_gpu_id.value(),
|
return strings::StrCat("device: ", platform_device_id.value(),
|
||||||
", name: ", desc.name(),
|
", name: ", desc.name(),
|
||||||
", pci bus id: ", desc.pci_bus_id());
|
", pci bus id: ", desc.pci_bus_id());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
Status BaseGPUDeviceFactory::CreateGPUDevice(
|
Status BaseGPUDeviceFactory::CreateGPUDevice(
|
||||||
const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
|
const SessionOptions& options, const string& name_prefix,
|
||||||
int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
|
TfDeviceId tf_device_id, int64 memory_limit,
|
||||||
|
const DeviceLocality& dev_locality, size_t num_tf_gpus,
|
||||||
std::vector<std::unique_ptr<Device>>* devices) {
|
std::vector<std::unique_ptr<Device>>* devices) {
|
||||||
CHECK_GE(tf_gpu_id.value(), 0);
|
CHECK_GE(tf_device_id.value(), 0);
|
||||||
const string device_name =
|
const string device_name =
|
||||||
strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
|
strings::StrCat(name_prefix, "/device:GPU:", tf_device_id.value());
|
||||||
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||||
tf_gpu_id);
|
tf_device_id);
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||||
int numa_node = dev_locality.numa_node();
|
int numa_node = dev_locality.numa_node();
|
||||||
|
|
||||||
se::Platform* gpu_manager = GPUMachineManager();
|
se::Platform* gpu_manager = GPUMachineManager();
|
||||||
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
|
auto desc_status =
|
||||||
|
gpu_manager->DescriptionForDevice(platform_device_id.value());
|
||||||
if (!desc_status.ok()) {
|
if (!desc_status.ok()) {
|
||||||
return desc_status.status();
|
return desc_status.status();
|
||||||
}
|
}
|
||||||
auto desc = desc_status.ConsumeValueOrDie();
|
auto desc = desc_status.ConsumeValueOrDie();
|
||||||
|
|
||||||
std::vector<TfGpuId> peer_gpu_ids;
|
std::vector<TfDeviceId> peer_gpu_ids;
|
||||||
peer_gpu_ids.reserve(num_tf_gpus);
|
peer_gpu_ids.reserve(num_tf_gpus);
|
||||||
for (int id = 0; id < num_tf_gpus; ++id) {
|
for (int id = 0; id < num_tf_gpus; ++id) {
|
||||||
TfGpuId peer_tf_gpu_id(id);
|
TfDeviceId peer_tf_device_id(id);
|
||||||
if (peer_tf_gpu_id != tf_gpu_id) {
|
if (peer_tf_device_id != tf_device_id) {
|
||||||
peer_gpu_ids.push_back(peer_tf_gpu_id);
|
peer_gpu_ids.push_back(peer_tf_device_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GPUProcessState* process_state = GPUProcessState::singleton();
|
GPUProcessState* process_state = GPUProcessState::singleton();
|
||||||
Allocator* gpu_allocator = process_state->GetGPUAllocator(
|
Allocator* gpu_allocator = process_state->GetGPUAllocator(
|
||||||
options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
|
options.config.gpu_options(), tf_device_id, memory_limit, peer_gpu_ids);
|
||||||
if (gpu_allocator == nullptr) {
|
if (gpu_allocator == nullptr) {
|
||||||
return errors::Internal("Failed to get memory allocator for TF GPU ",
|
return errors::Internal("Failed to get memory allocator for TF GPU ",
|
||||||
tf_gpu_id.value(), " with ", memory_limit,
|
tf_device_id.value(), " with ", memory_limit,
|
||||||
" bytes of memory.");
|
" bytes of memory.");
|
||||||
}
|
}
|
||||||
absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
|
absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
|
||||||
@ -1441,7 +1449,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
|
|||||||
return errors::Internal("No allocator statistics");
|
return errors::Internal("No allocator statistics");
|
||||||
}
|
}
|
||||||
// 'memory_limit' is the required memory size, but if the allocator with
|
// 'memory_limit' is the required memory size, but if the allocator with
|
||||||
// given tf_gpu_id was created before, we'll use it instead of creating a
|
// given tf_device_id was created before, we'll use it instead of creating a
|
||||||
// new one (as TF gpu device is a shared resource), in which case the actual
|
// new one (as TF gpu device is a shared resource), in which case the actual
|
||||||
// memory limit represented by 'stats.bytes_limit' used by that allocator
|
// memory limit represented by 'stats.bytes_limit' used by that allocator
|
||||||
// may be different (which should be an error).
|
// may be different (which should be an error).
|
||||||
@ -1451,11 +1459,11 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
|
|||||||
int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
|
int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
|
||||||
std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
|
std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
|
||||||
options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
|
options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
|
||||||
tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, *desc),
|
tf_device_id, GetShortDeviceDescription(platform_device_id, *desc),
|
||||||
gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
|
gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
|
||||||
LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
|
LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
|
||||||
<< (bytes_limit >> 20) << " MB memory) -> physical GPU ("
|
<< (bytes_limit >> 20) << " MB memory) -> physical GPU ("
|
||||||
<< GetShortDeviceDescription(platform_gpu_id, *desc) << ")";
|
<< GetShortDeviceDescription(platform_device_id, *desc) << ")";
|
||||||
TF_RETURN_IF_ERROR(gpu_device->Init(options));
|
TF_RETURN_IF_ERROR(gpu_device->Init(options));
|
||||||
devices->push_back(std::move(gpu_device));
|
devices->push_back(std::move(gpu_device));
|
||||||
|
|
||||||
@ -1463,13 +1471,13 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
|
|||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>>
|
std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
|
||||||
GetPeerAccessMap(se::Platform* platform,
|
GetPeerAccessMap(se::Platform* platform,
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||||
std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map(
|
std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
|
||||||
new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>);
|
map(new std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>);
|
||||||
for (PlatformGpuId platform_gpu_i : visible_gpu_order) {
|
for (PlatformDeviceId platform_gpu_i : visible_gpu_order) {
|
||||||
for (PlatformGpuId platform_gpu_j : visible_gpu_order) {
|
for (PlatformDeviceId platform_gpu_j : visible_gpu_order) {
|
||||||
se::StreamExecutor* from =
|
se::StreamExecutor* from =
|
||||||
DeviceIdUtil::ExecutorForPlatformDeviceId(platform, platform_gpu_i)
|
DeviceIdUtil::ExecutorForPlatformDeviceId(platform, platform_gpu_i)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
@ -1487,7 +1495,7 @@ GetPeerAccessMap(se::Platform* platform,
|
|||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||||
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
|
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
|
||||||
// The default interconnect map is obtained from the StreamExecutor.
|
// The default interconnect map is obtained from the StreamExecutor.
|
||||||
auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
|
auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
|
||||||
@ -1495,8 +1503,8 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
|||||||
InterconnectMap& imap = maps->at(0);
|
InterconnectMap& imap = maps->at(0);
|
||||||
imap.name = "StreamExecutor";
|
imap.name = "StreamExecutor";
|
||||||
imap.strength = InterconnectMap::kStreamExecutorStrength;
|
imap.strength = InterconnectMap::kStreamExecutorStrength;
|
||||||
for (PlatformGpuId gpu_id_i : visible_gpu_order) {
|
for (PlatformDeviceId gpu_id_i : visible_gpu_order) {
|
||||||
for (PlatformGpuId gpu_id_j : visible_gpu_order) {
|
for (PlatformDeviceId gpu_id_j : visible_gpu_order) {
|
||||||
if (gpu_id_i == gpu_id_j) continue;
|
if (gpu_id_i == gpu_id_j) continue;
|
||||||
if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
|
if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
|
||||||
imap.directed_links.insert({gpu_id_i, gpu_id_j});
|
imap.directed_links.insert({gpu_id_i, gpu_id_j});
|
||||||
@ -1509,21 +1517,21 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
|||||||
Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||||
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
|
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
|
||||||
LocalityMap* localities) {
|
LocalityMap* localities) {
|
||||||
std::vector<TfGpuId> all_tf_gpu_ids;
|
std::vector<TfDeviceId> all_tf_device_ids;
|
||||||
all_tf_gpu_ids.reserve(num_tf_gpus);
|
all_tf_device_ids.reserve(num_tf_gpus);
|
||||||
for (int i = 0; i < num_tf_gpus; ++i) {
|
for (int i = 0; i < num_tf_gpus; ++i) {
|
||||||
all_tf_gpu_ids.push_back(TfGpuId(i));
|
all_tf_device_ids.push_back(TfDeviceId(i));
|
||||||
}
|
}
|
||||||
for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
|
for (TfDeviceId tf_device_id : all_tf_device_ids) {
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||||
// Get GPU bus_id from its reported NUMA affinity. Because GPUs are
|
// Get GPU bus_id from its reported NUMA affinity. Because GPUs are
|
||||||
// virtualized in some environments, we can't just use the GPU id.
|
// virtualized in some environments, we can't just use the GPU id.
|
||||||
// NUMA locales are indexed from 0, buses are indexed from 1.
|
// NUMA locales are indexed from 0, buses are indexed from 1.
|
||||||
se::Platform* gpu_manager = GPUMachineManager();
|
se::Platform* gpu_manager = GPUMachineManager();
|
||||||
auto desc_status =
|
auto desc_status =
|
||||||
gpu_manager->DescriptionForDevice(platform_gpu_id.value());
|
gpu_manager->DescriptionForDevice(platform_device_id.value());
|
||||||
if (!desc_status.ok()) {
|
if (!desc_status.ok()) {
|
||||||
return desc_status.status();
|
return desc_status.status();
|
||||||
}
|
}
|
||||||
@ -1537,7 +1545,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
|||||||
// trouble may manifest as slower than expected performance, or
|
// trouble may manifest as slower than expected performance, or
|
||||||
// outright failures.
|
// outright failures.
|
||||||
LOG(INFO) << "Could not identify NUMA node of platform GPU id "
|
LOG(INFO) << "Could not identify NUMA node of platform GPU id "
|
||||||
<< platform_gpu_id
|
<< platform_device_id
|
||||||
<< ", defaulting to 0. Your kernel may not have been built "
|
<< ", defaulting to 0. Your kernel may not have been built "
|
||||||
<< "with NUMA support.";
|
<< "with NUMA support.";
|
||||||
numa_node = 0;
|
numa_node = 0;
|
||||||
@ -1549,11 +1557,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
|||||||
// Set LocalLinks from InterconnectMaps.
|
// Set LocalLinks from InterconnectMaps.
|
||||||
LocalLinks* links = dev_locality.mutable_links();
|
LocalLinks* links = dev_locality.mutable_links();
|
||||||
for (const InterconnectMap& imap : interconnects) {
|
for (const InterconnectMap& imap : interconnects) {
|
||||||
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
|
for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
|
||||||
PlatformGpuId platform_gpu_dst;
|
PlatformDeviceId platform_gpu_dst;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
|
GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
|
||||||
if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) !=
|
if (imap.directed_links.find({platform_device_id, platform_gpu_dst}) !=
|
||||||
imap.directed_links.end()) {
|
imap.directed_links.end()) {
|
||||||
InterconnectLink* ilink = links->add_link();
|
InterconnectLink* ilink = links->add_link();
|
||||||
ilink->set_device_id(tf_gpu_dst.value());
|
ilink->set_device_id(tf_gpu_dst.value());
|
||||||
@ -1565,12 +1573,12 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
|||||||
|
|
||||||
// If this is one of multiple virtual GPUs on the same physical GPU
|
// If this is one of multiple virtual GPUs on the same physical GPU
|
||||||
// add high strength links to the others.
|
// add high strength links to the others.
|
||||||
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
|
for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
|
||||||
if (tf_gpu_id == tf_gpu_dst) continue;
|
if (tf_device_id == tf_gpu_dst) continue;
|
||||||
PlatformGpuId platform_gpu_dst;
|
PlatformDeviceId platform_gpu_dst;
|
||||||
TF_RETURN_IF_ERROR(
|
TF_RETURN_IF_ERROR(
|
||||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
|
GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
|
||||||
if (platform_gpu_id == platform_gpu_dst) {
|
if (platform_device_id == platform_gpu_dst) {
|
||||||
InterconnectLink* ilink = links->add_link();
|
InterconnectLink* ilink = links->add_link();
|
||||||
ilink->set_device_id(tf_gpu_dst.value());
|
ilink->set_device_id(tf_gpu_dst.value());
|
||||||
ilink->set_type("SAME_DEVICE");
|
ilink->set_type("SAME_DEVICE");
|
||||||
@ -1578,10 +1586,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
(*localities)[tf_gpu_id] = dev_locality;
|
(*localities)[tf_device_id] = dev_locality;
|
||||||
VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
|
VLOG(1) << "GPUDevice PlatformDeviceId " << platform_device_id
|
||||||
<< tf_gpu_id << " on bus " << dev_locality.bus_id()
|
<< " TfDeviceId " << tf_device_id << " on bus "
|
||||||
<< " numa: " << numa_node << " pci: " << desc->pci_bus_id()
|
<< dev_locality.bus_id() << " numa: " << numa_node
|
||||||
|
<< " pci: " << desc->pci_bus_id()
|
||||||
<< " DeviceLocality: " << dev_locality.DebugString();
|
<< " DeviceLocality: " << dev_locality.DebugString();
|
||||||
}
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
@ -1589,7 +1598,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
|||||||
|
|
||||||
static int GetDefaultMinGPUMultiprocessorCount(
|
static int GetDefaultMinGPUMultiprocessorCount(
|
||||||
se::Platform* gpu_manager,
|
se::Platform* gpu_manager,
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||||
static const int kDefaultMinGPUMultiprocessorCount = 8;
|
static const int kDefaultMinGPUMultiprocessorCount = 8;
|
||||||
|
|
||||||
// Find the highest multi-processor count across all visible GPUs.
|
// Find the highest multi-processor count across all visible GPUs.
|
||||||
@ -1614,7 +1623,7 @@ static int GetDefaultMinGPUMultiprocessorCount(
|
|||||||
|
|
||||||
static int GetMinGPUMultiprocessorCount(
|
static int GetMinGPUMultiprocessorCount(
|
||||||
se::Platform* gpu_manager,
|
se::Platform* gpu_manager,
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||||
const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
|
const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
|
||||||
|
|
||||||
if (tf_min_gpu_core_count == nullptr ||
|
if (tf_min_gpu_core_count == nullptr ||
|
||||||
@ -1704,14 +1713,14 @@ std::vector<int> GetSupportedAMDGPUISAVersions() {
|
|||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
Status BaseGPUDeviceFactory::EnablePeerAccess(
|
Status BaseGPUDeviceFactory::EnablePeerAccess(
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||||
se::Platform* gpu_manager = GPUMachineManager();
|
se::Platform* gpu_manager = GPUMachineManager();
|
||||||
int possible_peer_count = 0;
|
int possible_peer_count = 0;
|
||||||
int enabled_peer_count = 0;
|
int enabled_peer_count = 0;
|
||||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||||
const PlatformGpuId platform_gpu_i = visible_gpu_order[i];
|
const PlatformDeviceId platform_gpu_i = visible_gpu_order[i];
|
||||||
for (int j = 0; j < visible_gpu_order.size(); ++j) {
|
for (int j = 0; j < visible_gpu_order.size(); ++j) {
|
||||||
const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
|
const PlatformDeviceId platform_gpu_j = visible_gpu_order[j];
|
||||||
// We have already validated that ExecutorForDevice() calls return OK.
|
// We have already validated that ExecutorForDevice() calls return OK.
|
||||||
se::StreamExecutor* from =
|
se::StreamExecutor* from =
|
||||||
DeviceIdUtil::ExecutorForPlatformDeviceId(gpu_manager, platform_gpu_i)
|
DeviceIdUtil::ExecutorForPlatformDeviceId(gpu_manager, platform_gpu_i)
|
||||||
@ -1748,8 +1757,8 @@ Status BaseGPUDeviceFactory::EnablePeerAccess(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||||
std::vector<PlatformGpuId>* ids) {
|
std::vector<PlatformDeviceId>* ids) {
|
||||||
se::Platform* gpu_manager = GPUMachineManager();
|
se::Platform* gpu_manager = GPUMachineManager();
|
||||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||||
int visible_gpu_id = visible_gpu_order[i].value();
|
int visible_gpu_id = visible_gpu_order[i].value();
|
||||||
@ -1834,7 +1843,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
|||||||
|
|
||||||
// Filter out devices that don't have the right capability or power.
|
// Filter out devices that don't have the right capability or power.
|
||||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||||
const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
|
const PlatformDeviceId visible_gpu_id = visible_gpu_order[i];
|
||||||
auto description_status =
|
auto description_status =
|
||||||
gpu_manager->DescriptionForDevice(visible_gpu_id.value());
|
gpu_manager->DescriptionForDevice(visible_gpu_id.value());
|
||||||
if (!description_status.ok()) {
|
if (!description_status.ok()) {
|
||||||
@ -1904,7 +1913,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
|||||||
if (!ids->empty()) {
|
if (!ids->empty()) {
|
||||||
std::vector<int> raw_ids(ids->size());
|
std::vector<int> raw_ids(ids->size());
|
||||||
std::transform(ids->begin(), ids->end(), raw_ids.begin(),
|
std::transform(ids->begin(), ids->end(), raw_ids.begin(),
|
||||||
[](PlatformGpuId id) -> int { return id.value(); });
|
[](PlatformDeviceId id) -> int { return id.value(); });
|
||||||
LOG(INFO) << "Adding visible gpu devices: " << absl::StrJoin(raw_ids, ", ");
|
LOG(INFO) << "Adding visible gpu devices: " << absl::StrJoin(raw_ids, ", ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -53,7 +53,8 @@ class BaseGPUDevice : public LocalDevice {
|
|||||||
public:
|
public:
|
||||||
BaseGPUDevice(const SessionOptions& options, const std::string& name,
|
BaseGPUDevice(const SessionOptions& options, const std::string& name,
|
||||||
Bytes memory_limit, const DeviceLocality& locality,
|
Bytes memory_limit, const DeviceLocality& locality,
|
||||||
TfGpuId tf_gpu_id, const std::string& physical_device_desc,
|
TfDeviceId tf_device_id,
|
||||||
|
const std::string& physical_device_desc,
|
||||||
Allocator* gpu_allocator, Allocator* cpu_allocator,
|
Allocator* gpu_allocator, Allocator* cpu_allocator,
|
||||||
bool sync_every_op);
|
bool sync_every_op);
|
||||||
|
|
||||||
@ -87,9 +88,10 @@ class BaseGPUDevice : public LocalDevice {
|
|||||||
// Returns the platform GPU id of this device within the native driver system;
|
// Returns the platform GPU id of this device within the native driver system;
|
||||||
// e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
|
// e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
|
||||||
int gpu_id() const {
|
int gpu_id() const {
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
|
TF_CHECK_OK(
|
||||||
return platform_gpu_id.value();
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
|
||||||
|
return platform_device_id.value();
|
||||||
}
|
}
|
||||||
|
|
||||||
// The executor that provides control for the device; e.g., for CUDA this
|
// The executor that provides control for the device; e.g., for CUDA this
|
||||||
@ -146,7 +148,7 @@ class BaseGPUDevice : public LocalDevice {
|
|||||||
GPUDeviceContext* device_context_;
|
GPUDeviceContext* device_context_;
|
||||||
GpuDeviceInfo* gpu_device_info_ = nullptr;
|
GpuDeviceInfo* gpu_device_info_ = nullptr;
|
||||||
mutex trace_mu_;
|
mutex trace_mu_;
|
||||||
TfGpuId tf_gpu_id_;
|
TfDeviceId tf_device_id_;
|
||||||
const bool sync_every_op_ = false;
|
const bool sync_every_op_ = false;
|
||||||
EventMgr* em_ = nullptr;
|
EventMgr* em_ = nullptr;
|
||||||
std::unique_ptr<thread::ThreadPool> thread_pool_;
|
std::unique_ptr<thread::ThreadPool> thread_pool_;
|
||||||
@ -325,53 +327,56 @@ class BaseGPUDeviceFactory : public DeviceFactory {
|
|||||||
int32 strength;
|
int32 strength;
|
||||||
static const int kSameDeviceStrength;
|
static const int kSameDeviceStrength;
|
||||||
static const int kStreamExecutorStrength;
|
static const int kStreamExecutorStrength;
|
||||||
std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links;
|
std::set<std::pair<PlatformDeviceId, PlatformDeviceId>> directed_links;
|
||||||
};
|
};
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// Populates *maps with interconnect maps for all local direct access
|
// Populates *maps with interconnect maps for all local direct access
|
||||||
// pathways between GPUs.
|
// pathways between GPUs.
|
||||||
virtual Status GetInterconnectMaps(
|
virtual Status GetInterconnectMaps(
|
||||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||||
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
|
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
|
||||||
|
|
||||||
struct TfGpuIdHash {
|
struct TfDeviceIdHash {
|
||||||
std::size_t operator()(const TfGpuId& id) const noexcept {
|
std::size_t operator()(const TfDeviceId& id) const noexcept {
|
||||||
return std::hash<int>{}(id.value());
|
return std::hash<int>{}(id.value());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
|
typedef std::unordered_map<TfDeviceId, DeviceLocality, TfDeviceIdHash>
|
||||||
|
LocalityMap;
|
||||||
// Populates *localities with the DeviceLocality descriptor for
|
// Populates *localities with the DeviceLocality descriptor for
|
||||||
// every TfGpuId.
|
// every TfDeviceId.
|
||||||
virtual Status GetDeviceLocalities(
|
virtual Status GetDeviceLocalities(
|
||||||
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
|
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
|
||||||
LocalityMap* localities);
|
LocalityMap* localities);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
|
// Creates a BaseGPUDevice associated with 'tf_device_id', allocates
|
||||||
// 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
|
// (strictly) 'memory_limit' bytes of GPU memory to it, and adds it to the
|
||||||
// vector.
|
// 'devices' vector.
|
||||||
Status CreateGPUDevice(const SessionOptions& options,
|
Status CreateGPUDevice(const SessionOptions& options,
|
||||||
const std::string& name_prefix, TfGpuId tf_gpu_id,
|
const std::string& name_prefix,
|
||||||
int64 memory_limit, const DeviceLocality& dev_locality,
|
TfDeviceId tf_device_id, int64 memory_limit,
|
||||||
size_t num_tf_gpus,
|
const DeviceLocality& dev_locality, size_t num_tf_gpus,
|
||||||
std::vector<std::unique_ptr<Device>>* devices);
|
std::vector<std::unique_ptr<Device>>* devices);
|
||||||
|
|
||||||
virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
|
virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
|
||||||
const SessionOptions& options, const string& name, Bytes memory_limit,
|
const SessionOptions& options, const string& name, Bytes memory_limit,
|
||||||
const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
|
const DeviceLocality& dev_locality, TfDeviceId tf_device_id,
|
||||||
const string& physical_device_desc, Allocator* gpu_allocator,
|
const string& physical_device_desc, Allocator* gpu_allocator,
|
||||||
Allocator* cpu_allocator) = 0;
|
Allocator* cpu_allocator) = 0;
|
||||||
|
|
||||||
Status EnablePeerAccess(const std::vector<PlatformGpuId>& visible_gpu_order);
|
Status EnablePeerAccess(
|
||||||
|
const std::vector<PlatformDeviceId>& visible_gpu_order);
|
||||||
|
|
||||||
// Returns into 'ids' the list of valid platform GPU ids, in the order that
|
// Returns into 'ids' the list of valid platform GPU ids, in the order that
|
||||||
// they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
|
// they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
|
||||||
// based upon 'visible_gpu_order' which was generated by parsing
|
// based upon 'visible_gpu_order' which was generated by parsing
|
||||||
// GPUOptions::visible_device_list which is a comma-separated list of CUDA or
|
// GPUOptions::visible_device_list which is a comma-separated list of CUDA or
|
||||||
// ROCm GPU ids.
|
// ROCm GPU ids.
|
||||||
Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
|
Status GetValidDeviceIds(
|
||||||
std::vector<PlatformGpuId>* ids);
|
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||||
|
std::vector<PlatformDeviceId>* ids);
|
||||||
|
|
||||||
// Cache the valid device IDs if not already cached. Cached IDs are stored in
|
// Cache the valid device IDs if not already cached. Cached IDs are stored in
|
||||||
// field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to
|
// field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to
|
||||||
@ -379,14 +384,14 @@ class BaseGPUDeviceFactory : public DeviceFactory {
|
|||||||
// devices should be treated as visible, like ListPhysicalDevices.
|
// devices should be treated as visible, like ListPhysicalDevices.
|
||||||
Status CacheDeviceIds();
|
Status CacheDeviceIds();
|
||||||
|
|
||||||
// visible_gpu_initialized_[platform_gpu_id] is true if visible GPU
|
// visible_gpu_initialized_[platform_device_id] is true if visible GPU
|
||||||
// platform_gpu_id has been initialized by the process.
|
// platform_device_id has been initialized by the process.
|
||||||
std::unordered_map<int, bool> visible_gpu_initialized_;
|
std::unordered_map<int, bool> visible_gpu_initialized_;
|
||||||
|
|
||||||
// Cached device IDs, as returned by GetValidDeviceIds when every physical
|
// Cached device IDs, as returned by GetValidDeviceIds when every physical
|
||||||
// device is visible. Cache should not be used if some devices are not
|
// device is visible. Cache should not be used if some devices are not
|
||||||
// visible.
|
// visible.
|
||||||
std::vector<PlatformGpuId> cached_device_ids_;
|
std::vector<PlatformDeviceId> cached_device_ids_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -30,9 +30,9 @@ class GPUDevice : public BaseGPUDevice {
|
|||||||
public:
|
public:
|
||||||
GPUDevice(const SessionOptions& options, const string& name,
|
GPUDevice(const SessionOptions& options, const string& name,
|
||||||
Bytes memory_limit, const DeviceLocality& locality,
|
Bytes memory_limit, const DeviceLocality& locality,
|
||||||
TfGpuId tf_gpu_id, const string& physical_device_desc,
|
TfDeviceId tf_device_id, const string& physical_device_desc,
|
||||||
Allocator* gpu_allocator, Allocator* cpu_allocator)
|
Allocator* gpu_allocator, Allocator* cpu_allocator)
|
||||||
: BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
|
: BaseGPUDevice(options, name, memory_limit, locality, tf_device_id,
|
||||||
physical_device_desc, gpu_allocator, cpu_allocator,
|
physical_device_desc, gpu_allocator, cpu_allocator,
|
||||||
false /* sync every op */) {
|
false /* sync every op */) {
|
||||||
if (options.config.has_gpu_options()) {
|
if (options.config.has_gpu_options()) {
|
||||||
@ -63,11 +63,11 @@ class GPUDeviceFactory : public BaseGPUDeviceFactory {
|
|||||||
private:
|
private:
|
||||||
std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
|
std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
|
||||||
const SessionOptions& options, const string& name, Bytes memory_limit,
|
const SessionOptions& options, const string& name, Bytes memory_limit,
|
||||||
const DeviceLocality& locality, TfGpuId tf_gpu_id,
|
const DeviceLocality& locality, TfDeviceId tf_device_id,
|
||||||
const string& physical_device_desc, Allocator* gpu_allocator,
|
const string& physical_device_desc, Allocator* gpu_allocator,
|
||||||
Allocator* cpu_allocator) override {
|
Allocator* cpu_allocator) override {
|
||||||
return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
|
return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
|
||||||
tf_gpu_id, physical_device_desc,
|
tf_device_id, physical_device_desc,
|
||||||
gpu_allocator, cpu_allocator);
|
gpu_allocator, cpu_allocator);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -30,7 +30,7 @@ namespace tensorflow {
|
|||||||
namespace {
|
namespace {
|
||||||
const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
|
const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
|
||||||
|
|
||||||
int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
|
int64 GetTotalGPUMemory(PlatformDeviceId gpu_id) {
|
||||||
se::StreamExecutor* se =
|
se::StreamExecutor* se =
|
||||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
@ -40,7 +40,7 @@ int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
|
|||||||
return total_memory;
|
return total_memory;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status GetComputeCapability(PlatformGpuId gpu_id, int* cc_major,
|
Status GetComputeCapability(PlatformDeviceId gpu_id, int* cc_major,
|
||||||
int* cc_minor) {
|
int* cc_minor) {
|
||||||
se::StreamExecutor* se =
|
se::StreamExecutor* se =
|
||||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||||
@ -350,7 +350,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
|
|||||||
// error.
|
// error.
|
||||||
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
|
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
|
||||||
int cc_major, cc_minor;
|
int cc_major, cc_minor;
|
||||||
TF_ASSERT_OK(GetComputeCapability(PlatformGpuId(0), &cc_major, &cc_minor));
|
TF_ASSERT_OK(GetComputeCapability(PlatformDeviceId(0), &cc_major, &cc_minor));
|
||||||
// Exit early while running on Pascal or later GPUs.
|
// Exit early while running on Pascal or later GPUs.
|
||||||
if (cc_major >= 6) {
|
if (cc_major >= 6) {
|
||||||
return;
|
return;
|
||||||
@ -371,10 +371,10 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
|
|||||||
// more memory than what is available on the device.
|
// more memory than what is available on the device.
|
||||||
TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
|
TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
|
||||||
static constexpr double kGpuMemoryFraction = 1.2;
|
static constexpr double kGpuMemoryFraction = 1.2;
|
||||||
static constexpr PlatformGpuId kPlatformGpuId(0);
|
static constexpr PlatformDeviceId kPlatformDeviceId(0);
|
||||||
|
|
||||||
int cc_major, cc_minor;
|
int cc_major, cc_minor;
|
||||||
TF_ASSERT_OK(GetComputeCapability(kPlatformGpuId, &cc_major, &cc_minor));
|
TF_ASSERT_OK(GetComputeCapability(kPlatformDeviceId, &cc_major, &cc_minor));
|
||||||
// Exit early if running on pre-Pascal GPUs.
|
// Exit early if running on pre-Pascal GPUs.
|
||||||
if (cc_major < 6) {
|
if (cc_major < 6) {
|
||||||
LOG(INFO)
|
LOG(INFO)
|
||||||
@ -389,8 +389,9 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
|
|||||||
ASSERT_EQ(1, devices.size());
|
ASSERT_EQ(1, devices.size());
|
||||||
|
|
||||||
int64 memory_limit = devices[0]->attributes().memory_limit();
|
int64 memory_limit = devices[0]->attributes().memory_limit();
|
||||||
ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kPlatformGpuId) *
|
ASSERT_EQ(memory_limit,
|
||||||
kGpuMemoryFraction));
|
static_cast<int64>(GetTotalGPUMemory(kPlatformDeviceId) *
|
||||||
|
kGpuMemoryFraction));
|
||||||
|
|
||||||
AllocatorAttributes allocator_attributes = AllocatorAttributes();
|
AllocatorAttributes allocator_attributes = AllocatorAttributes();
|
||||||
allocator_attributes.set_gpu_compatible(true);
|
allocator_attributes.set_gpu_compatible(true);
|
||||||
|
@ -17,13 +17,6 @@ limitations under the License.
|
|||||||
|
|
||||||
#include "tensorflow/core/common_runtime/device/device_id.h"
|
#include "tensorflow/core/common_runtime/device/device_id.h"
|
||||||
|
|
||||||
namespace tensorflow {
|
// TODO(sanjoy): Delete the header and forward the references.
|
||||||
|
|
||||||
// TODO(annarev): remove these aliases after all references are updated
|
|
||||||
// to use device ids.
|
|
||||||
typedef TfDeviceId TfGpuId;
|
|
||||||
typedef PlatformDeviceId PlatformGpuId;
|
|
||||||
|
|
||||||
} // namespace tensorflow
|
|
||||||
|
|
||||||
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
|
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
|
||||||
|
@ -20,16 +20,16 @@ limitations under the License.
|
|||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
|
|
||||||
Status GpuIdManager::InsertTfPlatformGpuIdPair(
|
Status GpuIdManager::InsertTfPlatformDeviceIdPair(
|
||||||
TfDeviceId tf_gpu_id, PlatformDeviceId platform_gpu_id) {
|
TfDeviceId tf_device_id, PlatformDeviceId platform_device_id) {
|
||||||
return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_gpu_id,
|
return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_device_id,
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status GpuIdManager::TfToPlatformGpuId(TfDeviceId tf_gpu_id,
|
Status GpuIdManager::TfToPlatformDeviceId(
|
||||||
PlatformDeviceId* platform_gpu_id) {
|
TfDeviceId tf_device_id, PlatformDeviceId* platform_device_id) {
|
||||||
return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_gpu_id,
|
return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_device_id,
|
||||||
platform_gpu_id);
|
platform_device_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
void GpuIdManager::TestOnlyReset() { DeviceIdManager::TestOnlyReset(); }
|
void GpuIdManager::TestOnlyReset() { DeviceIdManager::TestOnlyReset(); }
|
||||||
|
@ -21,17 +21,18 @@ limitations under the License.
|
|||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
|
|
||||||
// Class that maintains a map from TfGpuId to PlatformGpuId, and manages the
|
// Class that maintains a map from TfDeviceId to PlatformDeviceId, and manages
|
||||||
// translation between them.
|
// the translation between them.
|
||||||
class GpuIdManager {
|
class GpuIdManager {
|
||||||
public:
|
public:
|
||||||
// Adds a mapping from tf_gpu_id to platform_gpu_id.
|
// Adds a mapping from tf_device_id to platform_device_id.
|
||||||
static Status InsertTfPlatformGpuIdPair(TfDeviceId tf_gpu_id,
|
static Status InsertTfPlatformDeviceIdPair(
|
||||||
PlatformDeviceId platform_gpu_id);
|
TfDeviceId tf_device_id, PlatformDeviceId platform_device_id);
|
||||||
|
|
||||||
// Gets the platform_gpu_id associated with tf_gpu_id. Returns OK if found.
|
// Gets the platform_device_id associated with tf_device_id. Returns OK if
|
||||||
static Status TfToPlatformGpuId(TfDeviceId tf_gpu_id,
|
// found.
|
||||||
PlatformDeviceId* platform_gpu_id);
|
static Status TfToPlatformDeviceId(TfDeviceId tf_device_id,
|
||||||
|
PlatformDeviceId* platform_device_id);
|
||||||
|
|
||||||
// Clears the map. Used in unit tests only.
|
// Clears the map. Used in unit tests only.
|
||||||
static void TestOnlyReset();
|
static void TestOnlyReset();
|
||||||
|
@ -83,10 +83,10 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
|
|||||||
process_state_ = ProcessState::singleton();
|
process_state_ = ProcessState::singleton();
|
||||||
}
|
}
|
||||||
|
|
||||||
int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
|
int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
|
||||||
// Return the NUMA node associated with the GPU's StreamExecutor.
|
// Return the NUMA node associated with the GPU's StreamExecutor.
|
||||||
se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
|
se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
|
||||||
DEVICE_GPU, GPUMachineManager(), tf_gpu_id)
|
DEVICE_GPU, GPUMachineManager(), tf_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
int numa_node = se->GetDeviceDescription().numa_node();
|
int numa_node = se->GetDeviceDescription().numa_node();
|
||||||
// bus_id must be non-negative. If the numa_node is not known,
|
// bus_id must be non-negative. If the numa_node is not known,
|
||||||
@ -96,11 +96,11 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
|
|||||||
|
|
||||||
// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
|
// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
|
||||||
static SubAllocator* CreateSubAllocator(
|
static SubAllocator* CreateSubAllocator(
|
||||||
const GPUOptions& options, PlatformGpuId platform_gpu_id,
|
const GPUOptions& options, PlatformDeviceId platform_device_id,
|
||||||
const std::vector<SubAllocator::Visitor>& alloc_visitors,
|
const std::vector<SubAllocator::Visitor>& alloc_visitors,
|
||||||
size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
|
size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
|
||||||
auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||||
platform_gpu_id)
|
platform_device_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
|
|
||||||
// FIXME(imintz): Observed OOM issues when using the virtual memory
|
// FIXME(imintz): Observed OOM issues when using the virtual memory
|
||||||
@ -110,21 +110,21 @@ static SubAllocator* CreateSubAllocator(
|
|||||||
// TODO(imintz): Remove the cuMemAlloc capability of this allocator.
|
// TODO(imintz): Remove the cuMemAlloc capability of this allocator.
|
||||||
if (options.per_process_gpu_memory_fraction() > 1.0 ||
|
if (options.per_process_gpu_memory_fraction() > 1.0 ||
|
||||||
options.experimental().use_unified_memory()) {
|
options.experimental().use_unified_memory()) {
|
||||||
return new DeviceMemAllocator(executor, platform_gpu_id,
|
return new DeviceMemAllocator(executor, platform_device_id,
|
||||||
/*use_unified_memory=*/true, alloc_visitors,
|
/*use_unified_memory=*/true, alloc_visitors,
|
||||||
{});
|
{});
|
||||||
} else {
|
} else {
|
||||||
auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
|
auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
|
||||||
executor->implementation()->GpuContextHack());
|
executor->implementation()->GpuContextHack());
|
||||||
|
|
||||||
absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
|
absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
|
||||||
platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
|
platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
|
||||||
for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
|
for (const TfDeviceId tf_device_id : peer_gpu_ids) {
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||||
platform_peer_gpu_ids.insert(platform_gpu_id);
|
platform_peer_gpu_ids.insert(platform_device_id);
|
||||||
}
|
}
|
||||||
std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
|
std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
|
||||||
platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
|
platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
|
||||||
|
|
||||||
// Adjust virtual address space to be slightly larger than the physical
|
// Adjust virtual address space to be slightly larger than the physical
|
||||||
@ -133,7 +133,7 @@ static SubAllocator* CreateSubAllocator(
|
|||||||
// TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
|
// TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
|
||||||
// the va space.
|
// the va space.
|
||||||
return GpuVirtualMemAllocator::Create(
|
return GpuVirtualMemAllocator::Create(
|
||||||
alloc_visitors, {}, *gpu_context, platform_gpu_id,
|
alloc_visitors, {}, *gpu_context, platform_device_id,
|
||||||
/*virtual_address_space_size=*/total_bytes * 2,
|
/*virtual_address_space_size=*/total_bytes * 2,
|
||||||
platform_peer_gpu_ids_vec)
|
platform_peer_gpu_ids_vec)
|
||||||
.ValueOrDie()
|
.ValueOrDie()
|
||||||
@ -141,7 +141,7 @@ static SubAllocator* CreateSubAllocator(
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
return new DeviceMemAllocator(
|
return new DeviceMemAllocator(
|
||||||
executor, platform_gpu_id,
|
executor, platform_device_id,
|
||||||
(options.per_process_gpu_memory_fraction() > 1.0 ||
|
(options.per_process_gpu_memory_fraction() > 1.0 ||
|
||||||
options.experimental().use_unified_memory()),
|
options.experimental().use_unified_memory()),
|
||||||
alloc_visitors, {});
|
alloc_visitors, {});
|
||||||
@ -149,21 +149,21 @@ static SubAllocator* CreateSubAllocator(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Allocator* GPUProcessState::GetGPUAllocator(
|
Allocator* GPUProcessState::GetGPUAllocator(
|
||||||
const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
|
const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
|
||||||
const std::vector<TfGpuId>& peer_gpu_ids) {
|
const std::vector<TfDeviceId>& peer_gpu_ids) {
|
||||||
CHECK(process_state_);
|
CHECK(process_state_);
|
||||||
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
|
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
|
||||||
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
|
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
|
||||||
const string& allocator_type = options.allocator_type();
|
const string& allocator_type = options.allocator_type();
|
||||||
mutex_lock lock(mu_);
|
mutex_lock lock(mu_);
|
||||||
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||||
tf_gpu_id);
|
tf_device_id);
|
||||||
|
|
||||||
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
||||||
gpu_allocators_.resize(tf_gpu_id.value() + 1);
|
gpu_allocators_.resize(tf_device_id.value() + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
|
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
|
||||||
if (allocator_parts.allocator == nullptr) {
|
if (allocator_parts.allocator == nullptr) {
|
||||||
// Validate allocator types.
|
// Validate allocator types.
|
||||||
if (!allocator_type.empty() && allocator_type != "BFC") {
|
if (!allocator_type.empty() && allocator_type != "BFC") {
|
||||||
@ -171,19 +171,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
TF_CHECK_OK(
|
||||||
int bus_id = BusIdForGPU(tf_gpu_id);
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||||
|
int bus_id = BusIdForGPU(tf_device_id);
|
||||||
DCHECK_GE(bus_id, 0);
|
DCHECK_GE(bus_id, 0);
|
||||||
while (bus_id >= gpu_visitors_.size()) {
|
while (bus_id >= gpu_visitors_.size()) {
|
||||||
gpu_visitors_.push_back({});
|
gpu_visitors_.push_back({});
|
||||||
}
|
}
|
||||||
auto* sub_allocator =
|
auto* sub_allocator =
|
||||||
CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
|
CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
|
||||||
total_bytes, peer_gpu_ids);
|
total_bytes, peer_gpu_ids);
|
||||||
GPUBFCAllocator* gpu_bfc_allocator =
|
GPUBFCAllocator* gpu_bfc_allocator = new GPUBFCAllocator(
|
||||||
new GPUBFCAllocator(sub_allocator, total_bytes, options,
|
sub_allocator, total_bytes, options,
|
||||||
strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
|
strings::StrCat("GPU_", tf_device_id.value(), "_bfc"));
|
||||||
Allocator* gpu_allocator = gpu_bfc_allocator;
|
Allocator* gpu_allocator = gpu_bfc_allocator;
|
||||||
SharedCounter* timing_counter = nullptr;
|
SharedCounter* timing_counter = nullptr;
|
||||||
if (options.experimental().timestamped_allocator()) {
|
if (options.experimental().timestamped_allocator()) {
|
||||||
@ -195,29 +196,30 @@ Allocator* GPUProcessState::GetGPUAllocator(
|
|||||||
// distinctive patterns on both ends of allocated memory.
|
// distinctive patterns on both ends of allocated memory.
|
||||||
if (UseCudaMemoryGuardAllocator()) {
|
if (UseCudaMemoryGuardAllocator()) {
|
||||||
LOG(INFO) << "Using memory guard allocator for GPU.";
|
LOG(INFO) << "Using memory guard allocator for GPU.";
|
||||||
gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
|
gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_device_id);
|
||||||
gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
|
gpu_allocator =
|
||||||
|
new GPUNanResetAllocator(gpu_allocator, platform_device_id);
|
||||||
} else if (UseCudaMallocAllocator()) {
|
} else if (UseCudaMallocAllocator()) {
|
||||||
LOG(INFO) << "Using CUDA malloc allocator for GPU.";
|
LOG(INFO) << "Using CUDA malloc allocator for GPU.";
|
||||||
// If true, passes all allocation requests through to cudaMalloc
|
// If true, passes all allocation requests through to cudaMalloc
|
||||||
// useful for doing memory debugging with tools like cuda-memcheck
|
// useful for doing memory debugging with tools like cuda-memcheck
|
||||||
// **WARNING** probably will not work in a multi-gpu scenario
|
// **WARNING** probably will not work in a multi-gpu scenario
|
||||||
gpu_allocator =
|
gpu_allocator =
|
||||||
new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
|
new GPUcudaMallocAllocator(gpu_allocator, platform_device_id);
|
||||||
} else if (UseCudaMallocAsyncAllocator()) {
|
} else if (UseCudaMallocAsyncAllocator()) {
|
||||||
LOG(INFO) << "Using CUDA malloc Async allocator for GPU.";
|
LOG(INFO) << "Using CUDA malloc Async allocator for GPU.";
|
||||||
// If true, passes all allocation requests through to cudaMallocAsync
|
// If true, passes all allocation requests through to cudaMallocAsync
|
||||||
// TODO: useful for doing memory debugging with tools like cuda-memcheck
|
// TODO: useful for doing memory debugging with tools like cuda-memcheck
|
||||||
// TODO: **WARNING** probably will not work in a multi-gpu scenario
|
// TODO: **WARNING** probably will not work in a multi-gpu scenario
|
||||||
gpu_allocator =
|
gpu_allocator =
|
||||||
new GpuCudaMallocAsyncAllocator(platform_gpu_id, total_bytes);
|
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
Allocator* recording_allocator = nullptr;
|
Allocator* recording_allocator = nullptr;
|
||||||
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
|
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
|
||||||
ProcessState::MemDesc md;
|
ProcessState::MemDesc md;
|
||||||
md.loc = ProcessState::MemDesc::GPU;
|
md.loc = ProcessState::MemDesc::GPU;
|
||||||
md.dev_index = platform_gpu_id.value();
|
md.dev_index = platform_device_id.value();
|
||||||
md.gpu_registered = false;
|
md.gpu_registered = false;
|
||||||
md.nic_registered = true;
|
md.nic_registered = true;
|
||||||
recording_allocator = new internal::RecordingAllocator(
|
recording_allocator = new internal::RecordingAllocator(
|
||||||
@ -240,20 +242,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
|
|||||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||||
}
|
}
|
||||||
|
|
||||||
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
|
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
|
||||||
DCHECK(process_state_);
|
DCHECK(process_state_);
|
||||||
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
|
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
|
||||||
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
|
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
|
||||||
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||||
tf_gpu_id);
|
tf_device_id);
|
||||||
mutex_lock l(mu_);
|
mutex_lock l(mu_);
|
||||||
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
||||||
LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
|
LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
|
||||||
<< " but only have " << gpu_allocators_.size();
|
<< " but only have " << gpu_allocators_.size();
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
|
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
|
||||||
if (allocator_parts.counter.get() == nullptr) {
|
if (allocator_parts.counter.get() == nullptr) {
|
||||||
SharedCounter* timing_counter = new SharedCounter;
|
SharedCounter* timing_counter = new SharedCounter;
|
||||||
allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
|
allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
|
||||||
@ -303,7 +305,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
|
|||||||
for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
|
for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
|
||||||
if (gpu_allocators_[i].allocator != nullptr) {
|
if (gpu_allocators_[i].allocator != nullptr) {
|
||||||
se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||||
TfGpuId(i))
|
TfDeviceId(i))
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -72,18 +72,18 @@ class GPUProcessState {
|
|||||||
//
|
//
|
||||||
// 'total_bytes' is the total number of bytes that should be made
|
// 'total_bytes' is the total number of bytes that should be made
|
||||||
// available to the allocator. The first call to this function for
|
// available to the allocator. The first call to this function for
|
||||||
// a given tf_gpu_id creates the allocator, so only the total_bytes
|
// a given tf_device_id creates the allocator, so only the total_bytes
|
||||||
// used on that first call is used.
|
// used on that first call is used.
|
||||||
//
|
//
|
||||||
// "Allocator type" describes the type of algorithm to use for the
|
// "Allocator type" describes the type of algorithm to use for the
|
||||||
// underlying allocator. REQUIRES: Must be a valid type (see
|
// underlying allocator. REQUIRES: Must be a valid type (see
|
||||||
// config.proto for the list of supported strings.).
|
// config.proto for the list of supported strings.).
|
||||||
//
|
//
|
||||||
// REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
|
// REQUIRES: tf_device_id must be a valid id for a BaseGPUDevice available in
|
||||||
// current system environment. Otherwise returns nullptr.
|
// the current system environment. Otherwise returns nullptr.
|
||||||
virtual Allocator* GetGPUAllocator(const GPUOptions& options,
|
virtual Allocator* GetGPUAllocator(
|
||||||
TfGpuId tf_gpu_id, size_t total_bytes,
|
const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
|
||||||
const std::vector<TfGpuId>& peer_gpu_ids);
|
const std::vector<TfDeviceId>& peer_gpu_ids);
|
||||||
|
|
||||||
int NumGPUAllocators() {
|
int NumGPUAllocators() {
|
||||||
mutex_lock l(mu_);
|
mutex_lock l(mu_);
|
||||||
@ -115,9 +115,9 @@ class GPUProcessState {
|
|||||||
const SubAllocator::Visitor& visitor);
|
const SubAllocator::Visitor& visitor);
|
||||||
|
|
||||||
// Returns bus_id for the given GPU id.
|
// Returns bus_id for the given GPU id.
|
||||||
virtual int BusIdForGPU(TfGpuId tf_gpu_id);
|
virtual int BusIdForGPU(TfDeviceId tf_device_id);
|
||||||
|
|
||||||
SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
|
SharedCounter* GPUAllocatorCounter(TfDeviceId tf_device_id);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// GPUProcessState is a singleton that should not normally be deleted except
|
// GPUProcessState is a singleton that should not normally be deleted except
|
||||||
|
@ -44,7 +44,7 @@ StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
|
Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
|
||||||
PlatformGpuId gpu_id) {
|
PlatformDeviceId gpu_id) {
|
||||||
TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
|
TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
|
||||||
SupportsVirtualAddressManagement(device));
|
SupportsVirtualAddressManagement(device));
|
||||||
if (!supports_virtual_address_management) {
|
if (!supports_virtual_address_management) {
|
||||||
@ -59,11 +59,11 @@ Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
|
|||||||
|
|
||||||
/* static */ stream_executor::port::StatusOr<
|
/* static */ stream_executor::port::StatusOr<
|
||||||
std::unique_ptr<GpuVirtualMemAllocator>>
|
std::unique_ptr<GpuVirtualMemAllocator>>
|
||||||
GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
|
GpuVirtualMemAllocator::Create(
|
||||||
const std::vector<Visitor>& free_visitors,
|
const std::vector<Visitor>& alloc_visitors,
|
||||||
GpuContext& gpu_context, PlatformGpuId gpu_id,
|
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
|
||||||
size_t virtual_address_space_size,
|
PlatformDeviceId gpu_id, size_t virtual_address_space_size,
|
||||||
const std::vector<PlatformGpuId>& peer_gpu_ids) {
|
const std::vector<PlatformDeviceId>& peer_gpu_ids) {
|
||||||
std::vector<GpuDeviceHandle> access_gpu_handles;
|
std::vector<GpuDeviceHandle> access_gpu_handles;
|
||||||
access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
|
access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
|
||||||
|
|
||||||
@ -111,7 +111,8 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
|
|||||||
GpuVirtualMemAllocator::GpuVirtualMemAllocator(
|
GpuVirtualMemAllocator::GpuVirtualMemAllocator(
|
||||||
const std::vector<Visitor>& alloc_visitors,
|
const std::vector<Visitor>& alloc_visitors,
|
||||||
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
|
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
|
||||||
PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
|
PlatformDeviceId gpu_id,
|
||||||
|
const std::vector<GpuDeviceHandle> access_gpu_handles,
|
||||||
GpuDriver::VmemSpan vmem, size_t granularity)
|
GpuDriver::VmemSpan vmem, size_t granularity)
|
||||||
: SubAllocator(alloc_visitors, free_visitors),
|
: SubAllocator(alloc_visitors, free_visitors),
|
||||||
gpu_context_(gpu_context),
|
gpu_context_(gpu_context),
|
||||||
|
@ -44,9 +44,9 @@ class GpuVirtualMemAllocator : public SubAllocator {
|
|||||||
std::unique_ptr<GpuVirtualMemAllocator>>
|
std::unique_ptr<GpuVirtualMemAllocator>>
|
||||||
Create(const std::vector<Visitor>& alloc_visitors,
|
Create(const std::vector<Visitor>& alloc_visitors,
|
||||||
const std::vector<Visitor>& free_visitors,
|
const std::vector<Visitor>& free_visitors,
|
||||||
stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
|
stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
|
||||||
size_t virtual_address_space_size,
|
size_t virtual_address_space_size,
|
||||||
const std::vector<PlatformGpuId>& peer_gpu_ids);
|
const std::vector<PlatformDeviceId>& peer_gpu_ids);
|
||||||
~GpuVirtualMemAllocator() override;
|
~GpuVirtualMemAllocator() override;
|
||||||
|
|
||||||
// Allocates memory at least as large as requested by num_bytes. Will be
|
// Allocates memory at least as large as requested by num_bytes. Will be
|
||||||
@ -74,12 +74,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
|
|||||||
GpuVirtualMemAllocator(
|
GpuVirtualMemAllocator(
|
||||||
const std::vector<Visitor>& alloc_visitors,
|
const std::vector<Visitor>& alloc_visitors,
|
||||||
const std::vector<Visitor>& free_visitors,
|
const std::vector<Visitor>& free_visitors,
|
||||||
stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
|
stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
|
||||||
std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
|
std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
|
||||||
stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
|
stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
|
||||||
|
|
||||||
stream_executor::gpu::GpuContext& gpu_context_;
|
stream_executor::gpu::GpuContext& gpu_context_;
|
||||||
PlatformGpuId gpu_id_;
|
PlatformDeviceId gpu_id_;
|
||||||
|
|
||||||
// Peer access is configured at mmap time so the allocator must be aware of
|
// Peer access is configured at mmap time so the allocator must be aware of
|
||||||
// all gpus that may want to read the memory. This list also includes the
|
// all gpus that may want to read the memory. This list also includes the
|
||||||
|
@ -35,7 +35,7 @@ constexpr size_t k2MiB{2 << 20};
|
|||||||
|
|
||||||
// Creates an allocator with 8 MiB of virtual address space.
|
// Creates an allocator with 8 MiB of virtual address space.
|
||||||
std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
|
std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
|
||||||
PlatformGpuId gpu_id(0);
|
PlatformDeviceId gpu_id(0);
|
||||||
auto executor =
|
auto executor =
|
||||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
@ -48,7 +48,7 @@ std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(GpuVirtualMemAllocatorTest, SimpleAlloc) {
|
TEST(GpuVirtualMemAllocatorTest, SimpleAlloc) {
|
||||||
PlatformGpuId gpu_id(0);
|
PlatformDeviceId gpu_id(0);
|
||||||
auto executor =
|
auto executor =
|
||||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||||
.ValueOrDie();
|
.ValueOrDie();
|
||||||
|
@ -92,14 +92,15 @@ Status SingleMachine::Provision() {
|
|||||||
return errors::InvalidArgument(
|
return errors::InvalidArgument(
|
||||||
strings::StrCat("Not able to parse GPU device name: ", dev.name()));
|
strings::StrCat("Not able to parse GPU device name: ", dev.name()));
|
||||||
}
|
}
|
||||||
TfGpuId tf_gpu_id(parsed.id);
|
TfDeviceId tf_device_id(parsed.id);
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
Status s =
|
||||||
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return errors::Unavailable("Unknown TF GPU device with id ",
|
return errors::Unavailable("Unknown TF GPU device with id ",
|
||||||
tf_gpu_id.value(), ": ", s.ToString());
|
tf_device_id.value(), ": ", s.ToString());
|
||||||
}
|
}
|
||||||
attr = GetLocalGPUInfo(platform_gpu_id);
|
attr = GetLocalGPUInfo(platform_device_id);
|
||||||
} else if (dev.device_type().find("XLA") == string::npos) {
|
} else if (dev.device_type().find("XLA") == string::npos) {
|
||||||
// Filter out the fake XLA devices to avoid double counting the actual
|
// Filter out the fake XLA devices to avoid double counting the actual
|
||||||
// hardware resources that are available.
|
// hardware resources that are available.
|
||||||
|
@ -74,14 +74,14 @@ DeviceProperties GetLocalCPUInfo() {
|
|||||||
return device;
|
return device;
|
||||||
}
|
}
|
||||||
|
|
||||||
DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
|
DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id) {
|
||||||
DeviceProperties device;
|
DeviceProperties device;
|
||||||
device.set_type("GPU");
|
device.set_type("GPU");
|
||||||
|
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
cudaDeviceProp properties;
|
cudaDeviceProp properties;
|
||||||
cudaError_t error =
|
cudaError_t error =
|
||||||
cudaGetDeviceProperties(&properties, platform_gpu_id.value());
|
cudaGetDeviceProperties(&properties, platform_device_id.value());
|
||||||
if (error != cudaSuccess) {
|
if (error != cudaSuccess) {
|
||||||
device.set_type("UNKNOWN");
|
device.set_type("UNKNOWN");
|
||||||
LOG(ERROR) << "Failed to get device properties, error code: " << error;
|
LOG(ERROR) << "Failed to get device properties, error code: " << error;
|
||||||
@ -117,7 +117,7 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
|
|||||||
#elif TENSORFLOW_USE_ROCM
|
#elif TENSORFLOW_USE_ROCM
|
||||||
hipDeviceProp_t properties;
|
hipDeviceProp_t properties;
|
||||||
hipError_t error =
|
hipError_t error =
|
||||||
hipGetDeviceProperties(&properties, platform_gpu_id.value());
|
hipGetDeviceProperties(&properties, platform_device_id.value());
|
||||||
if (error != hipSuccess) {
|
if (error != hipSuccess) {
|
||||||
device.set_type("UNKNOWN");
|
device.set_type("UNKNOWN");
|
||||||
LOG(ERROR) << "Failed to get device properties, error code: " << error;
|
LOG(ERROR) << "Failed to get device properties, error code: " << error;
|
||||||
@ -156,16 +156,17 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
|
|||||||
return GetLocalCPUInfo();
|
return GetLocalCPUInfo();
|
||||||
} else if (device.type == "GPU") {
|
} else if (device.type == "GPU") {
|
||||||
if (device.has_id) {
|
if (device.has_id) {
|
||||||
TfGpuId tf_gpu_id(device.id);
|
TfDeviceId tf_device_id(device.id);
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
Status s =
|
||||||
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
LOG(ERROR) << s;
|
LOG(ERROR) << s;
|
||||||
return unknown;
|
return unknown;
|
||||||
}
|
}
|
||||||
return GetLocalGPUInfo(platform_gpu_id);
|
return GetLocalGPUInfo(platform_device_id);
|
||||||
} else {
|
} else {
|
||||||
return GetLocalGPUInfo(PlatformGpuId(0));
|
return GetLocalGPUInfo(PlatformDeviceId(0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return unknown;
|
return unknown;
|
||||||
|
@ -28,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
|
|||||||
|
|
||||||
// Returns the DeviceProperties for the specified GPU attached to the server on
|
// Returns the DeviceProperties for the specified GPU attached to the server on
|
||||||
// which grappler is running.
|
// which grappler is running.
|
||||||
DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id);
|
DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id);
|
||||||
|
|
||||||
// Returns the DeviceProperties of the specified device
|
// Returns the DeviceProperties of the specified device
|
||||||
DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
|
DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
|
||||||
|
@ -33,11 +33,11 @@ TEST(UtilsTest, GetLocalGPUInfo) {
|
|||||||
DeviceProperties properties;
|
DeviceProperties properties;
|
||||||
|
|
||||||
// Invalid platform GPU ID.
|
// Invalid platform GPU ID.
|
||||||
properties = GetLocalGPUInfo(PlatformGpuId(100));
|
properties = GetLocalGPUInfo(PlatformDeviceId(100));
|
||||||
EXPECT_EQ("UNKNOWN", properties.type());
|
EXPECT_EQ("UNKNOWN", properties.type());
|
||||||
|
|
||||||
// Succeed when a valid platform GPU id was inserted.
|
// Succeed when a valid platform GPU id was inserted.
|
||||||
properties = GetLocalGPUInfo(PlatformGpuId(0));
|
properties = GetLocalGPUInfo(PlatformDeviceId(0));
|
||||||
EXPECT_EQ("GPU", properties.type());
|
EXPECT_EQ("GPU", properties.type());
|
||||||
EXPECT_EQ("NVIDIA", properties.vendor());
|
EXPECT_EQ("NVIDIA", properties.vendor());
|
||||||
#elif TENSORFLOW_USE_ROCM
|
#elif TENSORFLOW_USE_ROCM
|
||||||
@ -45,21 +45,21 @@ TEST(UtilsTest, GetLocalGPUInfo) {
|
|||||||
DeviceProperties properties;
|
DeviceProperties properties;
|
||||||
|
|
||||||
// Invalid platform GPU ID.
|
// Invalid platform GPU ID.
|
||||||
properties = GetLocalGPUInfo(PlatformGpuId(100));
|
properties = GetLocalGPUInfo(PlatformDeviceId(100));
|
||||||
EXPECT_EQ("UNKNOWN", properties.type());
|
EXPECT_EQ("UNKNOWN", properties.type());
|
||||||
|
|
||||||
// Succeed when a valid platform GPU id was inserted.
|
// Succeed when a valid platform GPU id was inserted.
|
||||||
properties = GetLocalGPUInfo(PlatformGpuId(0));
|
properties = GetLocalGPUInfo(PlatformDeviceId(0));
|
||||||
EXPECT_EQ("GPU", properties.type());
|
EXPECT_EQ("GPU", properties.type());
|
||||||
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
|
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
|
||||||
#else
|
#else
|
||||||
LOG(INFO) << "CUDA is not enabled.";
|
LOG(INFO) << "CUDA is not enabled.";
|
||||||
DeviceProperties properties;
|
DeviceProperties properties;
|
||||||
|
|
||||||
properties = GetLocalGPUInfo(PlatformGpuId(0));
|
properties = GetLocalGPUInfo(PlatformDeviceId(0));
|
||||||
EXPECT_EQ("GPU", properties.type());
|
EXPECT_EQ("GPU", properties.type());
|
||||||
|
|
||||||
properties = GetLocalGPUInfo(PlatformGpuId(100));
|
properties = GetLocalGPUInfo(PlatformDeviceId(100));
|
||||||
EXPECT_EQ("GPU", properties.type());
|
EXPECT_EQ("GPU", properties.type());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -97,14 +97,14 @@ TEST(UtilsTest, GetDeviceInfo) {
|
|||||||
|
|
||||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||||
// Invalid platform GPU id.
|
// Invalid platform GPU id.
|
||||||
TF_ASSERT_OK(
|
TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(
|
||||||
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
|
TfDeviceId(0), PlatformDeviceId(100)));
|
||||||
properties = GetDeviceInfo(device);
|
properties = GetDeviceInfo(device);
|
||||||
EXPECT_EQ("UNKNOWN", properties.type());
|
EXPECT_EQ("UNKNOWN", properties.type());
|
||||||
|
|
||||||
// Valid platform GPU id.
|
// Valid platform GPU id.
|
||||||
TF_ASSERT_OK(
|
TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(TfDeviceId(1),
|
||||||
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
|
PlatformDeviceId(0)));
|
||||||
device.id = 1;
|
device.id = 1;
|
||||||
properties = GetDeviceInfo(device);
|
properties = GetDeviceInfo(device);
|
||||||
EXPECT_EQ("GPU", properties.type());
|
EXPECT_EQ("GPU", properties.type());
|
||||||
|
@ -241,14 +241,15 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
|
|||||||
DeviceNameUtils::ParsedName parsed;
|
DeviceNameUtils::ParsedName parsed;
|
||||||
if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
|
if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
|
||||||
if (parsed.type == "GPU") {
|
if (parsed.type == "GPU") {
|
||||||
TfGpuId tf_gpu_id(parsed.id);
|
TfDeviceId tf_device_id(parsed.id);
|
||||||
PlatformGpuId platform_gpu_id;
|
PlatformDeviceId platform_device_id;
|
||||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
Status s =
|
||||||
|
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
// We are probably running simulation without linking cuda libraries.
|
// We are probably running simulation without linking cuda libraries.
|
||||||
platform_gpu_id = PlatformGpuId(parsed.id);
|
platform_device_id = PlatformDeviceId(parsed.id);
|
||||||
}
|
}
|
||||||
return GetLocalGPUInfo(platform_gpu_id);
|
return GetLocalGPUInfo(platform_device_id);
|
||||||
} else if (parsed.type == "CPU") {
|
} else if (parsed.type == "CPU") {
|
||||||
return GetLocalCPUInfo();
|
return GetLocalCPUInfo();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user