Finish migrating {Tf|Platform}GpuId to {Tf|Platform}DeviceId

PiperOrigin-RevId: 361252995
Change-Id: I818798fc00efe7b98c35145ce067204d9e023895
Sanjoy Das 2021-03-05 16:56:23 -08:00 committed by TensorFlower Gardener
parent fbd744dfdb
commit d7634bbfaf
28 changed files with 398 additions and 379 deletions

View File

@@ -43,15 +43,15 @@ static xla::StatusOr<absl::optional<std::set<int>>> ParseVisibleDeviceList(
   }
   const std::vector<string> visible_devices =
       absl::StrSplit(visible_device_list, ',');
-  for (const string& platform_gpu_id_str : visible_devices) {
-    int32 platform_gpu_id;
-    if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
+  for (const string& platform_device_id_str : visible_devices) {
+    int32 platform_device_id;
+    if (!absl::SimpleAtoi(platform_device_id_str, &platform_device_id)) {
       return errors::InvalidArgument(
           "Could not parse entry in 'visible_device_list': '",
-          platform_gpu_id_str,
+          platform_device_id_str,
           "'. visible_device_list = ", visible_device_list);
     }
-    gpu_ids.insert(platform_gpu_id);
+    gpu_ids.insert(platform_device_id);
  }
  return {{gpu_ids}};
 }
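Both this copy and the near-duplicate in gpu_device.cc further down follow the same recipe: split the list on commas, parse each token, and fail loudly on garbage. For reference, a minimal standalone sketch of that behavior using only Abseil (hypothetical main(), not TensorFlow code):

    // Standalone sketch of visible_device_list parsing; Abseil only.
    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    #include "absl/strings/numbers.h"
    #include "absl/strings/str_split.h"

    int main() {
      const std::string visible_device_list = "0,2";  // hypothetical input
      const std::vector<std::string> visible_devices =
          absl::StrSplit(visible_device_list, ',');
      std::set<int> gpu_ids;
      for (const std::string& id_str : visible_devices) {
        int id = 0;
        if (!absl::SimpleAtoi(id_str, &id)) {
          std::cerr << "Could not parse entry: '" << id_str << "'\n";
          return 1;
        }
        gpu_ids.insert(id);
      }
      for (int id : gpu_ids) std::cout << "platform device id: " << id << "\n";
      return 0;
    }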

View File

@@ -102,19 +102,21 @@ struct EdgePtrCompare {
 // TODO(laigd): instead of deciding the device here, the converter should accept
 // a device name as one of the conversion parameter so users can control on
 // which device they want to run the conversion.
-std::pair<TfGpuId, PlatformGpuId> GetFirstValidDeviceId() {
-  for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
-    TfGpuId tf_gpu_id(tf_gpu_id_value);
-    PlatformGpuId platform_gpu_id;
-    Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
+std::pair<TfDeviceId, PlatformDeviceId> GetFirstValidDeviceId() {
+  for (int tf_device_id_value = 0; tf_device_id_value < 100;
+       ++tf_device_id_value) {
+    TfDeviceId tf_device_id(tf_device_id_value);
+    PlatformDeviceId platform_device_id;
+    Status s =
+        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
     if (s.ok()) {
-      VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
-              << platform_gpu_id.value();
-      return std::make_pair(tf_gpu_id, platform_gpu_id);
+      VLOG(1) << "Found TF GPU " << tf_device_id.value() << " at cuda device "
+              << platform_device_id.value();
+      return std::make_pair(tf_device_id, platform_device_id);
     }
   }
   LOG(ERROR) << "Could not find any TF GPUs";
-  return std::make_pair(TfGpuId(-1), PlatformGpuId(-1));
+  return std::make_pair(TfDeviceId(-1), PlatformDeviceId(-1));
 }

 // Returns false for const nodes (we intend to drop control edges from those).
@@ -266,14 +268,14 @@ Status GetEngineInfo(const Graph* g,
     }
     info->device = DeviceNameUtils::ParsedNameToString(segment_device);
   } else {
-    TfGpuId tf_gpu_id;
-    PlatformGpuId platform_gpu_id;
-    std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
-    if (tf_gpu_id.value() >= 0) {
+    TfDeviceId tf_device_id;
+    PlatformDeviceId platform_device_id;
+    std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
+    if (tf_device_id.value() >= 0) {
       DeviceNameUtils::ParsedName parsed_name;
       parsed_name.type = "GPU";
       parsed_name.has_type = true;
-      parsed_name.id = tf_gpu_id.value();
+      parsed_name.id = tf_device_id.value();
       parsed_name.has_id = true;
       info->device = DeviceNameUtils::ParsedNameToString(parsed_name);
     } else {
@@ -640,17 +642,17 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
   if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
       engine.device.empty()) {
     // If device is not set, use the first found GPU device for the conversion.
-    TfGpuId tf_gpu_id;
-    PlatformGpuId platform_gpu_id;
-    std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
-    cuda_device_id = platform_gpu_id.value();
+    TfDeviceId tf_device_id;
+    PlatformDeviceId platform_device_id;
+    std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
+    cuda_device_id = platform_device_id.value();
     if (cuda_device_id >= 0) {
       GPUOptions gpu_options;
       // If the TF to Cuda gpu id mapping exist, the device and corresponding
       // allocator must have been initialized already, so the
       // GetGPUAllocator() call won't create a new allocator.
       dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
-          gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
+          gpu_options, tf_device_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
     }
     return std::make_pair(cuda_device_id, dev_allocator);
   }
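Worth keeping in mind while reading this diff: TfDeviceId is TensorFlow's per-process virtual ordinal, while PlatformDeviceId is the ordinal the CUDA/ROCm driver sees, and GpuIdManager holds the mapping between them. TensorFlow makes the two spaces incompatible at compile time via distinct gtl::IntType wrappers. A minimal sketch of that strong-typedef pattern (hypothetical DeviceId template, not the real gtl::IntType):

    // Sketch: tag types make TF and platform ids distinct at compile time.
    #include <cassert>
    #include <map>

    template <typename Tag>
    class DeviceId {
     public:
      explicit DeviceId(int value) : value_(value) {}
      int value() const { return value_; }
      bool operator<(DeviceId other) const { return value_ < other.value_; }
      bool operator==(DeviceId other) const { return value_ == other.value_; }
     private:
      int value_;
    };

    struct TfTag {};
    struct PlatformTag {};
    using TfDeviceId = DeviceId<TfTag>;              // TF's virtual ordinal
    using PlatformDeviceId = DeviceId<PlatformTag>;  // CUDA/ROCm ordinal

    int main() {
      // With visible_device_list = "2,1", TF device 0 is platform device 2.
      std::map<TfDeviceId, PlatformDeviceId> tf_to_platform{
          {TfDeviceId(0), PlatformDeviceId(2)},
          {TfDeviceId(1), PlatformDeviceId(1)}};
      assert(tf_to_platform.at(TfDeviceId(0)) == PlatformDeviceId(2));
      // tf_to_platform.at(PlatformDeviceId(0));  // would not compile: wrong id space
      return 0;
    }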

View File

@@ -1044,25 +1044,25 @@ Status TRTEngineOp::AllocateCalibrationResources(
   }
   cres->calibrator_.reset(
       new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
-  const int platform_gpu_id =
+  const int platform_device_id =
       ctx->device()->tensorflow_gpu_device_info()->gpu_id;
-  if (platform_gpu_id < 0) {
+  if (platform_device_id < 0) {
     LOG(ERROR) << "Can't get gpu_device_info from context->device()";
     return errors::InvalidArgument(
         "Context->device doesn't contain device info!");
   }

   cache_res->Ref();
-  cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id,
+  cres->thr_.reset(new std::thread([this, cres, shapes, platform_device_id,
                                     cache_res]() {
     core::ScopedUnref sc(cache_res);

-    VLOG(1) << "Starting calibration thread on device " << platform_gpu_id
+    VLOG(1) << "Starting calibration thread on device " << platform_device_id
             << ", Calibration Resource @ " << cres;
-    auto err = cudaSetDevice(platform_gpu_id);
+    auto err = cudaSetDevice(platform_device_id);
     if (err != cudaSuccess) {
       // TODO(aaroey): should return error here.
-      LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
+      LOG(ERROR) << "Couldn't set cuda device to " << platform_device_id
                  << " in calibration thread";
     }
     std::vector<PartialTensorShape> partial_shapes(shapes.begin(),
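The calibration thread calls cudaSetDevice() itself because the CUDA runtime's current device is per-thread state, and it must be set to the platform id, not the TF virtual id. A hedged standalone sketch of the same pattern:

    // Sketch: each std::thread gets its own CUDA current-device setting.
    #include <cstdio>
    #include <thread>

    #include <cuda_runtime.h>

    void Worker(int platform_device_id) {
      // Must be the ordinal the driver sees, not a framework-level virtual id.
      cudaError_t err = cudaSetDevice(platform_device_id);
      if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaSetDevice(%d): %s\n", platform_device_id,
                     cudaGetErrorString(err));
        return;
      }
      void* buf = nullptr;
      if (cudaMalloc(&buf, 1 << 20) == cudaSuccess) {  // lands on that device
        cudaFree(buf);
      }
    }

    int main() {
      std::thread t(Worker, /*platform_device_id=*/0);
      t.join();
      return 0;
    }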

View File

@@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
         DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
     gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
     gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
-        GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
+        GPUOptions(), TfDeviceId(0), memory_limit, /*peer_gpu_ids=*/{});
     host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
   }

View File

@@ -26,11 +26,11 @@ limitations under the License.
 namespace tensorflow {

-GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
-                                               PlatformGpuId platform_gpu_id)
+GPUcudaMallocAllocator::GPUcudaMallocAllocator(
+    Allocator* allocator, PlatformDeviceId platform_device_id)
     : base_allocator_(allocator) {
   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                           platform_gpu_id)
+                                                           platform_device_id)
                      .ValueOrDie();
 }

View File

@@ -32,7 +32,7 @@ namespace tensorflow {
 class GPUcudaMallocAllocator : public Allocator {
  public:
   explicit GPUcudaMallocAllocator(Allocator* allocator,
-                                  PlatformGpuId platform_gpu_id);
+                                  PlatformDeviceId platform_device_id);
   ~GPUcudaMallocAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
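GPUcudaMallocAllocator, like GPUDebugAllocator and GPUNanResetAllocator further down, is a decorator over a base Allocator: it keeps a non-owning base_allocator_ pointer and forwards or intercepts AllocateRaw/DeallocateRaw. A self-contained sketch of that shape (hypothetical, heavily simplified from TF's Allocator interface):

    // Sketch of the wrapping-allocator (decorator) pattern.
    #include <cstddef>
    #include <cstdlib>
    #include <iostream>
    #include <string>

    class Allocator {  // hypothetical minimal interface
     public:
      virtual ~Allocator() = default;
      virtual std::string Name() = 0;
      virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
      virtual void DeallocateRaw(void* ptr) = 0;
    };

    class MallocAllocator : public Allocator {
     public:
      std::string Name() override { return "malloc"; }
      void* AllocateRaw(size_t /*alignment*/, size_t num_bytes) override {
        return std::malloc(num_bytes);
      }
      void DeallocateRaw(void* ptr) override { std::free(ptr); }
    };

    // Decorator: forwards to a base allocator and adds behavior (here, logging).
    class LoggingAllocator : public Allocator {
     public:
      explicit LoggingAllocator(Allocator* base) : base_(base) {}
      std::string Name() override { return "logging_" + base_->Name(); }
      void* AllocateRaw(size_t alignment, size_t num_bytes) override {
        void* p = base_->AllocateRaw(alignment, num_bytes);
        std::cout << "alloc " << num_bytes << " -> " << p << "\n";
        return p;
      }
      void DeallocateRaw(void* ptr) override {
        std::cout << "free " << ptr << "\n";
        base_->DeallocateRaw(ptr);
      }
     private:
      Allocator* base_;  // not owned, mirroring base_allocator_ above
    };

    int main() {
      MallocAllocator base;
      LoggingAllocator a(&base);
      void* p = a.AllocateRaw(64, 128);
      a.DeallocateRaw(p);
      return 0;
    }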

View File

@@ -42,12 +42,12 @@ static std::string GetCudaErrorMessage(CUresult result) {
 #endif  // GOOGLE_CUDA

 GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
-    PlatformGpuId platform_gpu_id, size_t pool_size, bool reserve_memory,
+    PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
     bool compute_stats)
-    : name_(absl::StrCat("gpu_async_", platform_gpu_id.value())) {
+    : name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
 #if TF_CUDA_MALLOC_ASYNC_SUPPORTED
   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                           platform_gpu_id)
+                                                           platform_device_id)
                      .ValueOrDie();
   // Initialized here as it only exist if compiled with a recent
   // enough CUDA.
@@ -56,7 +56,7 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
   // WAR an CUDA 11.2 driver bug for multiple-GPU. It currently
   // request that the context on GPU 0 is initialized. Which isn't the
   // case for TF+horovod.
-  if (platform_gpu_id.value() > 0) {
+  if (platform_device_id.value() > 0) {
     CUcontext pctx;  // We loose track of it. But this is fine.
     if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
       LOG(FATAL)  // Crash OK.
@@ -65,9 +65,10 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
   se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   int cuda_malloc_async_supported;
-  if (auto status = cuDeviceGetAttribute(
-          &cuda_malloc_async_supported,
-          CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, platform_gpu_id.value()))
+  if (auto status =
+          cuDeviceGetAttribute(&cuda_malloc_async_supported,
+                               CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
+                               platform_device_id.value()))
     LOG(FATAL) <<  // Crash OK.
         "Failed to get device attribute: " << GetCudaErrorMessage(status);
   if (!cuda_malloc_async_supported)
@@ -79,12 +80,13 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
     LOG(FATAL)  // Crash OK.
         << "Failed to create CUDA stream: " << GetCudaErrorMessage(status);
-  if (auto status = cuDeviceGetDefaultMemPool(&pool_, platform_gpu_id.value()))
+  if (auto status =
+          cuDeviceGetDefaultMemPool(&pool_, platform_device_id.value()))
     LOG(FATAL) <<  // Crash OK.
         "Failed to get default CUDA pool: " << GetCudaErrorMessage(status);
   VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
-          << platform_gpu_id.value() << " with pool size of: " << pool_size
+          << platform_device_id.value() << " with pool size of: " << pool_size
           << " this ptr: " << this;
   uint64_t pool_size_64 = pool_size;
   if (auto status = cuMemPoolSetAttribute(
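This allocator drives the CUDA 11.2+ stream-ordered allocator through the driver API (cuDeviceGetDefaultMemPool, cuMemPoolSetAttribute). The same machinery is reachable from the runtime API, which makes for a shorter illustration; a sketch assuming CUDA 11.2 or newer:

    // Sketch of stream-ordered allocation (CUDA >= 11.2), runtime API flavor.
    #include <cstdio>

    #include <cuda_runtime.h>

    int main() {
      int device = 0;  // platform device id
      if (cudaSetDevice(device) != cudaSuccess) return 1;

      cudaStream_t stream;
      if (cudaStreamCreate(&stream) != cudaSuccess) return 1;

      // Allocation and free are ordered with respect to work on `stream`,
      // and are served from the device's default memory pool.
      void* ptr = nullptr;
      cudaError_t err = cudaMallocAsync(&ptr, 1 << 20, stream);
      if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaMallocAsync: %s\n", cudaGetErrorString(err));
        return 1;
      }
      cudaFreeAsync(ptr, stream);
      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
      return 0;
    }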

View File

@@ -64,7 +64,7 @@ namespace tensorflow {
 // driver can return the excess memory to other processes.
 class GpuCudaMallocAsyncAllocator : public Allocator {
  public:
-  explicit GpuCudaMallocAsyncAllocator(PlatformGpuId platform_gpu_id,
+  explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
                                        size_t pool_size,
                                        bool reserve_memory = false,
                                        bool compute_stats = false);

View File

@@ -76,10 +76,10 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 // GPUDebugAllocator
 // -----------------------------------------------------------------------------
 GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
-                                     PlatformGpuId platform_gpu_id)
+                                     PlatformDeviceId platform_device_id)
     : base_allocator_(allocator) {
   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                           platform_gpu_id)
+                                                           platform_device_id)
                      .ValueOrDie();
 }

@@ -155,10 +155,10 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
 // GPUNanResetAllocator
 // -----------------------------------------------------------------------------
 GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
-                                           PlatformGpuId platform_gpu_id)
+                                           PlatformDeviceId platform_device_id)
     : base_allocator_(allocator) {
   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                           platform_gpu_id)
+                                                           platform_device_id)
                      .ValueOrDie();
 }
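GPUNanResetAllocator's idea is to fill buffers with quiet NaNs on allocation and again on deallocation, so reads of uninitialized or freed float memory surface as NaNs instead of plausible stale values. A CPU-side sketch of the mechanism:

    // CPU-side sketch of the NaN-poisoning idea behind GPUNanResetAllocator.
    #include <cmath>
    #include <cstdlib>
    #include <iostream>
    #include <limits>

    float* AllocateFloats(size_t n) {
      float* p = static_cast<float*>(std::malloc(n * sizeof(float)));
      for (size_t i = 0; i < n; ++i) {
        p[i] = std::numeric_limits<float>::quiet_NaN();  // poison fresh memory
      }
      return p;
    }

    void DeallocateFloats(float* p, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        p[i] = std::numeric_limits<float>::quiet_NaN();  // poison on free too
      }
      std::free(p);
    }

    int main() {
      float* buf = AllocateFloats(4);
      // Forgetting to initialize buf[0] now shows up as NaN, not stale data.
      std::cout << std::boolalpha << std::isnan(buf[0]) << "\n";  // true
      DeallocateFloats(buf, 4);
      return 0;
    }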

View File

@@ -34,7 +34,7 @@ namespace tensorflow {
 class GPUDebugAllocator : public Allocator {
  public:
   explicit GPUDebugAllocator(Allocator* allocator,
-                             PlatformGpuId platform_gpu_id);
+                             PlatformDeviceId platform_device_id);
   ~GPUDebugAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;

@@ -64,7 +64,7 @@ class GPUDebugAllocator : public Allocator {
 class GPUNanResetAllocator : public Allocator {
  public:
   explicit GPUNanResetAllocator(Allocator* allocator,
-                                PlatformGpuId platform_gpu_id);
+                                PlatformDeviceId platform_device_id);
   ~GPUNanResetAllocator() override;
   string Name() override { return "gpu_nan_reset"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;

View File

@@ -37,7 +37,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {

-se::StreamExecutor* ExecutorForPlatformGpuId(
+se::StreamExecutor* ExecutorForPlatformDeviceId(
     PlatformDeviceId platform_device_id) {
   return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                    platform_device_id)
@@ -45,12 +45,12 @@ se::StreamExecutor* ExecutorForPlatformGpuId(
 }

 TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
-  const PlatformGpuId platform_gpu_id(0);
-  auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
+  const PlatformDeviceId platform_device_id(0);
+  auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
   DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
+      stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
   GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                      platform_gpu_id);
+                      platform_device_id);
   for (int s : {8}) {
     std::vector<int64> cpu_array(s);
@@ -72,13 +72,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
   for (int s : {8, 211}) {
     EXPECT_DEATH(
         {
-          const PlatformGpuId platform_gpu_id(0);
-          auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
+          const PlatformDeviceId platform_device_id(0);
+          auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
           DeviceMemAllocator* sub_allocator =
-              new DeviceMemAllocator(stream_exec, platform_gpu_id,
+              new DeviceMemAllocator(stream_exec, platform_device_id,
                                      false /*use_unified_memory*/, {}, {});
           GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                              platform_gpu_id);
+                              platform_device_id);
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@@ -108,13 +108,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
   for (int s : {8, 22}) {
     EXPECT_DEATH(
         {
-          const PlatformGpuId platform_gpu_id(0);
-          auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
+          const PlatformDeviceId platform_device_id(0);
+          auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
           DeviceMemAllocator* sub_allocator =
-              new DeviceMemAllocator(stream_exec, platform_gpu_id,
+              new DeviceMemAllocator(stream_exec, platform_device_id,
                                      false /*use_unified_memory*/, {}, {});
           GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                              platform_gpu_id);
+                              platform_device_id);
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@@ -141,12 +141,12 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
 }

 TEST(GPUDebugAllocatorTest, ResetToNan) {
-  const PlatformGpuId platform_gpu_id(0);
-  auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
+  const PlatformDeviceId platform_device_id(0);
+  auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
   DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
+      stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                         platform_gpu_id);
+                         platform_device_id);
   std::vector<float> cpu_array(1024);
   std::vector<float> cpu_array_result(1024);
@@ -183,15 +183,15 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 }

 TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
-  const PlatformGpuId platform_gpu_id(0);
-  auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
+  const PlatformDeviceId platform_device_id(0);
+  auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
   // NaN reset must be the outer-most allocator.
   DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
+      stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
       new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                            platform_gpu_id),
-      platform_gpu_id);
+                            platform_device_id),
+      platform_device_id);
   std::vector<float> cpu_array(1024);
   std::vector<float> cpu_array_result(1024);
@@ -228,24 +228,24 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 }

 TEST(GPUDebugAllocatorTest, TracksSizes) {
-  const PlatformGpuId platform_gpu_id(0);
+  const PlatformDeviceId platform_device_id(0);
   DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
+      ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
       false /*use_unified_memory*/, {}, {});
   GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                      platform_gpu_id);
+                      platform_device_id);
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }

 TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
-  const PlatformGpuId platform_gpu_id(0);
+  const PlatformDeviceId platform_device_id(0);
   DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
+      ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
       false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
       new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
-                            platform_gpu_id),
-      platform_gpu_id);
+                            platform_device_id),
+      platform_device_id);
   float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
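The EXPECT_DEATH cases above exercise GPUDebugAllocator's guard regions: each allocation is padded with a known header and footer pattern, and deallocation CHECK-fails if either was clobbered. A host-memory sketch of that mechanism:

    // Host-side sketch of header/footer overwrite detection.
    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    constexpr int64_t kGuard = 0xdeadbeefdeadbeefLL;
    constexpr size_t kGuardBytes = sizeof(kGuard);

    void* GuardedAlloc(size_t num_bytes) {
      char* base = static_cast<char*>(std::malloc(num_bytes + 2 * kGuardBytes));
      std::memcpy(base, &kGuard, kGuardBytes);                             // header
      std::memcpy(base + kGuardBytes + num_bytes, &kGuard, kGuardBytes);   // footer
      return base + kGuardBytes;  // user pointer starts after the header
    }

    void GuardedFree(void* ptr, size_t num_bytes) {
      char* base = static_cast<char*>(ptr) - kGuardBytes;
      int64_t header, footer;
      std::memcpy(&header, base, kGuardBytes);
      std::memcpy(&footer, base + kGuardBytes + num_bytes, kGuardBytes);
      assert(header == kGuard && "header overwritten");  // CHECK in TF proper
      assert(footer == kGuard && "footer overwritten");
      std::free(base);
    }

    int main() {
      void* p = GuardedAlloc(32);
      std::memset(p, 0, 32);  // in-bounds writes are fine
      GuardedFree(p, 32);     // writing byte 32 or beyond would trip the assert
      return 0;
    }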

View File

@ -120,7 +120,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
} }
~EigenGpuStreamDevice() override {} ~EigenGpuStreamDevice() override {}
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream, void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc, TfDeviceId tf_device_id, ::tensorflow::Allocator* alloc,
char* scratch) { char* scratch) {
if (LogMemory::IsEnabled()) { if (LogMemory::IsEnabled()) {
operation_ = context->op_kernel().name() + "/EigenAllocator"; operation_ = context->op_kernel().name() + "/EigenAllocator";
@ -132,9 +132,10 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize); reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
stream_ = gpu_stream; stream_ = gpu_stream;
allocator_ = alloc; allocator_ = alloc;
PlatformGpuId platform_gpu_id; PlatformDeviceId platform_device_id;
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); TF_CHECK_OK(
device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()]; GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
device_prop_ = &Eigen::m_deviceProperties[platform_device_id.value()];
} }
const gpuStream_t& stream() const override { return *stream_; } const gpuStream_t& stream() const override { return *stream_; }
@ -233,18 +234,18 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
class BaseGPUDevice::StreamGroupFactory { class BaseGPUDevice::StreamGroupFactory {
public: public:
// Returns the unique stream group for use with the stream defined by // Returns the unique stream group for use with the stream defined by
// {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet // {tf_device_id, stream_group_within_gpu}, creating it if it does not yet
// exist. // exist.
// This function is thread safe. // This function is thread safe.
BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id, BaseGPUDevice::StreamGroup* GetOrCreate(TfDeviceId tf_device_id,
int stream_group_within_gpu, int stream_group_within_gpu,
se::StreamExecutor* executor, se::StreamExecutor* executor,
const GPUOptions& options) { const GPUOptions& options) {
mutex_lock guard(lock_); mutex_lock guard(lock_);
StreamGroup* group = StreamGroup* group =
&streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; &streams_[key_type(tf_device_id.value(), stream_group_within_gpu)];
if (!group->compute) { if (!group->compute) {
int priority = GetPriority(tf_gpu_id.value(), options); int priority = GetPriority(tf_device_id.value(), options);
group->priority = priority; group->priority = priority;
group->compute = GetStream(executor, priority); group->compute = GetStream(executor, priority);
group->compute->Init(); group->compute->Init();
@ -339,8 +340,8 @@ class BaseGPUDevice::StreamGroupFactory {
private: private:
// Returns priority for the given virtual GPU id from the session options. // Returns priority for the given virtual GPU id from the session options.
// Returns 0 if no virtual devices are specified. // Returns 0 if no virtual devices are specified.
int GetPriority(int tf_gpu_id, const GPUOptions& options) { int GetPriority(int tf_device_id, const GPUOptions& options) {
int id = tf_gpu_id; int id = tf_device_id;
int i = 0; int i = 0;
int priority = 0; int priority = 0;
while (i < options.experimental().virtual_devices_size()) { while (i < options.experimental().virtual_devices_size()) {
@ -378,7 +379,7 @@ class BaseGPUDevice::StreamGroupFactory {
BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name, BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality, Bytes memory_limit, const DeviceLocality& locality,
TfGpuId tf_gpu_id, TfDeviceId tf_device_id,
const string& physical_device_desc, const string& physical_device_desc,
Allocator* gpu_allocator, Allocator* cpu_allocator, Allocator* gpu_allocator, Allocator* cpu_allocator,
bool sync_every_op) bool sync_every_op)
@ -388,7 +389,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
gpu_allocator_(gpu_allocator), gpu_allocator_(gpu_allocator),
cpu_allocator_(cpu_allocator), cpu_allocator_(cpu_allocator),
scoped_allocator_mgr_(new ScopedAllocatorMgr(name)), scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
tf_gpu_id_(tf_gpu_id), tf_device_id_(tf_device_id),
sync_every_op_(sync_every_op) { sync_every_op_(sync_every_op) {
GPUProcessState::singleton()->EnableGPUDevice(); GPUProcessState::singleton()->EnableGPUDevice();
} }
@ -410,7 +411,8 @@ Status BaseGPUDevice::InitScratchBuffers() {
Allocator::kAllocatorAlignment, scratch_buffer_size); Allocator::kAllocatorAlignment, scratch_buffer_size);
if (scratch_buffer == nullptr) { if (scratch_buffer == nullptr) {
return errors::FailedPrecondition( return errors::FailedPrecondition(
"Failed to allocate scratch buffer for device ", tf_gpu_id_.value()); "Failed to allocate scratch buffer for device ",
tf_device_id_.value());
} }
se::DeviceMemory<char> mem( se::DeviceMemory<char> mem(
se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size)); se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
@ -423,16 +425,16 @@ Status BaseGPUDevice::InitScratchBuffers() {
Status BaseGPUDevice::Init(const SessionOptions& options) { Status BaseGPUDevice::Init(const SessionOptions& options) {
auto executor_status = DeviceIdUtil::ExecutorForTfDeviceId( auto executor_status = DeviceIdUtil::ExecutorForTfDeviceId(
DEVICE_GPU, GPUMachineManager(), tf_gpu_id_); DEVICE_GPU, GPUMachineManager(), tf_device_id_);
if (!executor_status.status().ok()) { if (!executor_status.status().ok()) {
return errors::Internal("Failed to get StreamExecutor for device ", return errors::Internal("Failed to get StreamExecutor for device ",
tf_gpu_id_.value()); tf_device_id_.value());
} }
executor_ = executor_status.ValueOrDie(); executor_ = executor_status.ValueOrDie();
stream_ = StreamGroupFactory::Global().GetOrCreate( stream_ = StreamGroupFactory::Global().GetOrCreate(
tf_gpu_id_, 0, executor_, options.config.gpu_options()); tf_device_id_, 0, executor_, options.config.gpu_options());
device_context_ = device_context_ =
new GPUDeviceContext(0, stream_->compute, new GPUDeviceContext(0, stream_->compute,
#if TENSORFLOW_USE_ROCM #if TENSORFLOW_USE_ROCM
@ -461,7 +463,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
// The GPUKernelTracker will use this SharedCounter, instead of // The GPUKernelTracker will use this SharedCounter, instead of
// owning its own. // owning its own.
timing_counter = timing_counter =
GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id_); GPUProcessState::singleton()->GPUAllocatorCounter(tf_device_id_);
DCHECK(timing_counter); DCHECK(timing_counter);
} }
kernel_tracker_.reset(new GPUKernelTracker( kernel_tracker_.reset(new GPUKernelTracker(
@ -473,10 +475,10 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
gpu_device_info_->stream = stream_->compute; gpu_device_info_->stream = stream_->compute;
gpu_device_info_->default_context = device_context_; gpu_device_info_->default_context = device_context_;
gpu_device_info_->event_mgr = em_; gpu_device_info_->event_mgr = em_;
PlatformGpuId platform_gpu_id; PlatformDeviceId platform_device_id;
TF_RETURN_IF_ERROR( TF_RETURN_IF_ERROR(
GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
gpu_device_info_->gpu_id = platform_gpu_id.value(); gpu_device_info_->gpu_id = platform_device_id.value();
set_tensorflow_gpu_device_info(gpu_device_info_); set_tensorflow_gpu_device_info(gpu_device_info_);
// Whether and how the GPU device uses its own threadpool. // Whether and how the GPU device uses its own threadpool.
@ -505,7 +507,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
// TODO(zhengxq): pin the thread to the same socket of the target GPU. // TODO(zhengxq): pin the thread to the same socket of the target GPU.
thread_pool_.reset(new thread::ThreadPool( thread_pool_.reset(new thread::ThreadPool(
options.env, ThreadOptions(), options.env, ThreadOptions(),
strings::StrCat("gpu_private_", tf_gpu_id_.value()), strings::StrCat("gpu_private_", tf_device_id_.value()),
static_cast<int32>(gpu_thread_count), static_cast<int32>(gpu_thread_count),
!options.config.experimental().disable_thread_spinning(), !options.config.experimental().disable_thread_spinning(),
/*allocator=*/nullptr)); /*allocator=*/nullptr));
@ -531,8 +533,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel, string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
const int& stream_id) { const int& stream_id) {
return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(), return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
" on GPU ", tf_gpu_id_.value(), " stream[", stream_id, " on GPU ", tf_device_id_.value(), " stream[",
"]"); stream_id, "]");
} }
void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
@ -624,8 +626,8 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
const auto stream_id = gpu_device_context->stream_id(); const auto stream_id = gpu_device_context->stream_id();
VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op " VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
<< op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream[" << op_kernel->type_string() << " on GPU" << tf_device_id_
<< stream_id << "]"; << " stream[" << stream_id << "]";
ScopedActivateExecutorContext scoped_activation{stream->parent()}; ScopedActivateExecutorContext scoped_activation{stream->parent()};
op_kernel->ComputeAsync(context, std::move(done)); op_kernel->ComputeAsync(context, std::move(done));
@ -763,10 +765,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
ConcretePerOpGpuDevice() : device_(&stream_device_) {} ConcretePerOpGpuDevice() : device_(&stream_device_) {}
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream, void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
TfGpuId tf_gpu_id, Allocator* base_allocator, TfDeviceId tf_device_id, Allocator* base_allocator,
char* scratch) { char* scratch) {
stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator, stream_device_.Reinitialize(context, gpu_stream, tf_device_id,
scratch); base_allocator, scratch);
} }
const Eigen::GpuDevice& device() const override { return device_; } const Eigen::GpuDevice& device() const override { return device_; }
@ -777,8 +779,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
}; };
// Parse 'visible_device_list' into a list of platform GPU ids. // Parse 'visible_device_list' into a list of platform GPU ids.
Status ParseVisibleDeviceList(const string& visible_device_list, Status ParseVisibleDeviceList(
std::vector<PlatformGpuId>* visible_gpu_order) { const string& visible_device_list,
std::vector<PlatformDeviceId>* visible_gpu_order) {
visible_gpu_order->clear(); visible_gpu_order->clear();
se::Platform* gpu_manager = GPUMachineManager(); se::Platform* gpu_manager = GPUMachineManager();
@ -793,28 +796,28 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
} else { } else {
const std::vector<string> order_str = const std::vector<string> order_str =
str_util::Split(visible_device_list, ','); str_util::Split(visible_device_list, ',');
for (const string& platform_gpu_id_str : order_str) { for (const string& platform_device_id_str : order_str) {
int32 platform_gpu_id; int32 platform_device_id;
if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) { if (!strings::safe_strto32(platform_device_id_str, &platform_device_id)) {
return errors::InvalidArgument( return errors::InvalidArgument(
"Could not parse entry in 'visible_device_list': '", "Could not parse entry in 'visible_device_list': '",
platform_gpu_id_str, platform_device_id_str,
"'. visible_device_list = ", visible_device_list); "'. visible_device_list = ", visible_device_list);
} }
if (platform_gpu_id < 0 || if (platform_device_id < 0 ||
platform_gpu_id >= gpu_manager->VisibleDeviceCount()) { platform_device_id >= gpu_manager->VisibleDeviceCount()) {
return errors::InvalidArgument( return errors::InvalidArgument(
"'visible_device_list' listed an invalid GPU id '", platform_gpu_id, "'visible_device_list' listed an invalid GPU id '",
"' but visible device count is ", platform_device_id, "' but visible device count is ",
gpu_manager->VisibleDeviceCount()); gpu_manager->VisibleDeviceCount());
} }
visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id)); visible_gpu_order->push_back(PlatformDeviceId(platform_device_id));
} }
} }
// Validate no repeats. // Validate no repeats.
std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(), std::set<PlatformDeviceId> visible_device_set(visible_gpu_order->begin(),
visible_gpu_order->end()); visible_gpu_order->end());
if (visible_device_set.size() != visible_gpu_order->size()) { if (visible_device_set.size() != visible_gpu_order->size()) {
return errors::InvalidArgument( return errors::InvalidArgument(
"visible_device_list contained a duplicate entry: ", "visible_device_list contained a duplicate entry: ",
@ -825,8 +828,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
Status VerifyVirtualDeviceSettings( Status VerifyVirtualDeviceSettings(
const size_t num_gpus_to_use, const GPUOptions& gpu_options, const size_t num_gpus_to_use, const GPUOptions& gpu_options,
const std::vector<PlatformGpuId>& visible_gpu_order, const std::vector<PlatformDeviceId>& visible_gpu_order,
const std::vector<PlatformGpuId>& valid_platform_gpu_ids, const std::vector<PlatformDeviceId>& valid_platform_device_ids,
const std::map<int, std::pair<int, int>>& supported_priority_ranges) { const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
const auto& virtual_devices = gpu_options.experimental().virtual_devices(); const auto& virtual_devices = gpu_options.experimental().virtual_devices();
CHECK(!virtual_devices.empty()); CHECK(!virtual_devices.empty());
@ -849,11 +852,11 @@ Status VerifyVirtualDeviceSettings(
" #GPUs in visible_device_list: ", visible_gpu_order.size(), " #GPUs in visible_device_list: ", visible_gpu_order.size(),
" virtual_devices.size(): ", virtual_devices.size()); " virtual_devices.size(): ", virtual_devices.size());
} }
if (valid_platform_gpu_ids.size() != virtual_devices.size()) { if (valid_platform_device_ids.size() != virtual_devices.size()) {
return errors::Unknown( return errors::Unknown(
"The number of valid GPUs doesn't match the number of elements in " "The number of valid GPUs doesn't match the number of elements in "
"the virtual_devices list.", "the virtual_devices list.",
" #valid GPUs: ", valid_platform_gpu_ids.size(), " #valid GPUs: ", valid_platform_device_ids.size(),
" virtual_devices.size(): ", virtual_devices.size()); " virtual_devices.size(): ", virtual_devices.size());
} }
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@ -882,7 +885,7 @@ Status VerifyVirtualDeviceSettings(
i, " memory_limit_mb size: ", memory_limit_mb.size(), i, " memory_limit_mb size: ", memory_limit_mb.size(),
" and priority size: ", priority.size()); " and priority size: ", priority.size());
} }
const int gpu_id = valid_platform_gpu_ids[i].value(); const int gpu_id = valid_platform_device_ids[i].value();
auto it = supported_priority_ranges.find(gpu_id); auto it = supported_priority_ranges.find(gpu_id);
if (it == supported_priority_ranges.end()) { if (it == supported_priority_ranges.end()) {
return errors::Internal( return errors::Internal(
@ -950,19 +953,19 @@ int64 MinSystemMemory(int64 available_memory, int cc_major) {
} }
// Get the memory limit for the virtual device being created on GPU with // Get the memory limit for the virtual device being created on GPU with
// 'platform_gpu_id', when that virtual device is the only virtual device being // 'platform_device_id', when that virtual device is the only virtual device
// created on that GPU. // being created on that GPU.
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options, Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
PlatformGpuId platform_gpu_id, PlatformDeviceId platform_device_id,
int64* memory_limit) { int64* memory_limit) {
int64 total_memory = 0; int64 total_memory = 0;
int64 available_memory = 0; int64 available_memory = 0;
se::StreamExecutor* se = DeviceIdUtil::ExecutorForPlatformDeviceId( se::StreamExecutor* se = DeviceIdUtil::ExecutorForPlatformDeviceId(
GPUMachineManager(), platform_gpu_id) GPUMachineManager(), platform_device_id)
.ValueOrDie(); .ValueOrDie();
if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) { if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
return errors::Unknown("Failed to query available memory for GPU ", return errors::Unknown("Failed to query available memory for GPU ",
platform_gpu_id.value()); platform_device_id.value());
} }
int64 allocated_memory = 0; int64 allocated_memory = 0;
@ -1037,7 +1040,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
DCHECK_EQ(stream_id, 0); DCHECK_EQ(stream_id, 0);
const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>( const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
stream_->compute->implementation()->GpuStreamMemberHack()); stream_->compute->implementation()->GpuStreamMemberHack());
concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator, concrete_device->Reinitialize(context, gpu_stream, tf_device_id_, allocator,
scratch_); scratch_);
} }
@ -1093,7 +1096,7 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
return Status::OK(); return Status::OK();
} }
std::vector<PlatformGpuId> visible_gpu_order(device_count); std::vector<PlatformDeviceId> visible_gpu_order(device_count);
std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0); std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &cached_device_ids_)); TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &cached_device_ids_));
return Status::OK(); return Status::OK();
@ -1101,9 +1104,9 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
Status BaseGPUDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) { Status BaseGPUDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
TF_RETURN_IF_ERROR(CacheDeviceIds()); TF_RETURN_IF_ERROR(CacheDeviceIds());
for (PlatformGpuId platform_gpu_id : cached_device_ids_) { for (PlatformDeviceId platform_device_id : cached_device_ids_) {
const string device_name = const string device_name =
strings::StrCat("/physical_device:GPU:", platform_gpu_id.value()); strings::StrCat("/physical_device:GPU:", platform_device_id.value());
devices->push_back(device_name); devices->push_back(device_name);
} }
@ -1117,14 +1120,15 @@ Status BaseGPUDeviceFactory::GetDeviceDetails(
if (device_index < 0 || device_index > cached_device_ids_.size()) { if (device_index < 0 || device_index > cached_device_ids_.size()) {
return errors::Internal("Invalid device index: ", device_index); return errors::Internal("Invalid device index: ", device_index);
} }
PlatformGpuId platform_gpu_id = cached_device_ids_[device_index]; PlatformDeviceId platform_device_id = cached_device_ids_[device_index];
TF_RETURN_IF_ERROR(ValidateGPUMachineManager()); TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
se::Platform* gpu_manager = GPUMachineManager(); se::Platform* gpu_manager = GPUMachineManager();
if (gpu_manager == nullptr) { if (gpu_manager == nullptr) {
return errors::Internal("Cannot get GPUMachineManager"); return errors::Internal("Cannot get GPUMachineManager");
} }
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value()); auto desc_status =
gpu_manager->DescriptionForDevice(platform_device_id.value());
if (!desc_status.ok()) { if (!desc_status.ok()) {
return desc_status.status(); return desc_status.status();
} }
@ -1159,8 +1163,8 @@ Status BaseGPUDeviceFactory::CreateDevices(
num_gpus_to_use = iter->second; num_gpus_to_use = iter->second;
} }
const auto& gpu_options = options.config.gpu_options(); const auto& gpu_options = options.config.gpu_options();
std::vector<PlatformGpuId> visible_gpu_order; std::vector<PlatformDeviceId> visible_gpu_order;
std::vector<PlatformGpuId> valid_platform_gpu_ids; std::vector<PlatformDeviceId> valid_platform_device_ids;
// If we aren't going to use any GPUs, don't initialize them. // If we aren't going to use any GPUs, don't initialize them.
// We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0, // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
// because it treats an empty gpu_options.visible_device_list as 'all GPUs // because it treats an empty gpu_options.visible_device_list as 'all GPUs
@ -1188,13 +1192,13 @@ Status BaseGPUDeviceFactory::CreateDevices(
} }
TF_RETURN_IF_ERROR( TF_RETURN_IF_ERROR(
GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids)); GetValidDeviceIds(visible_gpu_order, &valid_platform_device_ids));
} }
if (num_gpus_to_use > valid_platform_gpu_ids.size()) { if (num_gpus_to_use > valid_platform_device_ids.size()) {
num_gpus_to_use = valid_platform_gpu_ids.size(); num_gpus_to_use = valid_platform_device_ids.size();
} }
std::map<int, std::pair<int, int>> supported_priority_ranges; std::map<int, std::pair<int, int>> supported_priority_ranges;
if (!valid_platform_gpu_ids.empty()) { if (!valid_platform_device_ids.empty()) {
// Save the original device. // Save the original device.
int original_device = 0; int original_device = 0;
#if GOOGLE_CUDA #if GOOGLE_CUDA
@ -1213,18 +1217,18 @@ Status BaseGPUDeviceFactory::CreateDevices(
// Force to implicitly initialize CUDA runtime on each valid GPU before // Force to implicitly initialize CUDA runtime on each valid GPU before
// CreateGPUDevice(). // CreateGPUDevice().
for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) { for (PlatformDeviceId platform_device_id : valid_platform_device_ids) {
#if GOOGLE_CUDA #if GOOGLE_CUDA
err = cudaSetDevice(platform_gpu_id.value()); err = cudaSetDevice(platform_device_id.value());
if (err != cudaSuccess) { if (err != cudaSuccess) {
return errors::Internal( return errors::Internal(
"cudaSetDevice() on GPU:", platform_gpu_id.value(), "cudaSetDevice() on GPU:", platform_device_id.value(),
" failed. Status: ", cudaGetErrorString(err)); " failed. Status: ", cudaGetErrorString(err));
} }
err = cudaFree(nullptr); err = cudaFree(nullptr);
if (err != cudaSuccess) { if (err != cudaSuccess) {
return errors::Internal("CUDA runtime implicit initialization on GPU:", return errors::Internal("CUDA runtime implicit initialization on GPU:",
platform_gpu_id.value(), platform_device_id.value(),
" failed. Status: ", cudaGetErrorString(err)); " failed. Status: ", cudaGetErrorString(err));
} }
int priority_low, priority_high; int priority_low, priority_high;
@ -1237,19 +1241,19 @@ Status BaseGPUDeviceFactory::CreateDevices(
VLOG(1) << "Cuda stream priority range on GPU(" << original_device VLOG(1) << "Cuda stream priority range on GPU(" << original_device
<< "): " << priority_high << "," << priority_low; << "): " << priority_high << "," << priority_low;
supported_priority_ranges.insert( supported_priority_ranges.insert(
std::make_pair(platform_gpu_id.value(), std::make_pair(platform_device_id.value(),
std::make_pair(priority_low, priority_high))); std::make_pair(priority_low, priority_high)));
#elif TENSORFLOW_USE_ROCM #elif TENSORFLOW_USE_ROCM
err = hipSetDevice(platform_gpu_id.value()); err = hipSetDevice(platform_device_id.value());
if (err != hipSuccess) { if (err != hipSuccess) {
return errors::Internal( return errors::Internal(
"hipSetDevice() on GPU:", platform_gpu_id.value(), "hipSetDevice() on GPU:", platform_device_id.value(),
" failed. Status: ", hipGetErrorString(err)); " failed. Status: ", hipGetErrorString(err));
} }
err = hipFree(nullptr); err = hipFree(nullptr);
if (err != hipSuccess) { if (err != hipSuccess) {
return errors::Internal("ROCm runtime implicit initialization on GPU:", return errors::Internal("ROCm runtime implicit initialization on GPU:",
platform_gpu_id.value(), platform_device_id.value(),
" failed. Status: ", hipGetErrorString(err)); " failed. Status: ", hipGetErrorString(err));
} }
int priority_low, priority_high; int priority_low, priority_high;
@ -1262,7 +1266,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
VLOG(1) << "HIP stream priority range on GPU(" << original_device VLOG(1) << "HIP stream priority range on GPU(" << original_device
<< "): " << priority_high << "," << priority_low; << "): " << priority_high << "," << priority_low;
supported_priority_ranges.insert( supported_priority_ranges.insert(
std::make_pair(platform_gpu_id.value(), std::make_pair(platform_device_id.value(),
std::make_pair(priority_low, priority_high))); std::make_pair(priority_low, priority_high)));
#endif #endif
} }
@ -1306,9 +1310,9 @@ Status BaseGPUDeviceFactory::CreateDevices(
LOG(INFO) << line_buf; LOG(INFO) << line_buf;
for (int i = 0; i < visible_gpu_order.size(); ++i) { for (int i = 0; i < visible_gpu_order.size(); ++i) {
line_buf = strings::StrCat(visible_gpu_order[i].value(), ": "); line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
PlatformGpuId gpu_id_i = visible_gpu_order[i]; PlatformDeviceId gpu_id_i = visible_gpu_order[i];
for (int j = 0; j < visible_gpu_order.size(); ++j) { for (int j = 0; j < visible_gpu_order.size(); ++j) {
PlatformGpuId gpu_id_j = visible_gpu_order[j]; PlatformDeviceId gpu_id_j = visible_gpu_order[j];
if (im.directed_links.find({gpu_id_i, gpu_id_j}) != if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
im.directed_links.end()) { im.directed_links.end()) {
line_buf.append("Y "); line_buf.append("Y ");
@ -1323,22 +1327,23 @@ Status BaseGPUDeviceFactory::CreateDevices(
const auto& virtual_devices = gpu_options.experimental().virtual_devices(); const auto& virtual_devices = gpu_options.experimental().virtual_devices();
if (!virtual_devices.empty()) { if (!virtual_devices.empty()) {
TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids, num_gpus_to_use, gpu_options, visible_gpu_order,
supported_priority_ranges)); valid_platform_device_ids, supported_priority_ranges));
// We've verified that num_gpus_to_use >= virtual_devices.size(). // We've verified that num_gpus_to_use >= virtual_devices.size().
num_gpus_to_use = virtual_devices.size(); num_gpus_to_use = virtual_devices.size();
CHECK(gpu_options.visible_device_list().empty() || CHECK(gpu_options.visible_device_list().empty() ||
valid_platform_gpu_ids == visible_gpu_order); valid_platform_device_ids == visible_gpu_order);
} }
int next_tf_gpu_id = 0; int next_tf_device_id = 0;
std::vector<int64> memory_limit_bytes; std::vector<int64> memory_limit_bytes;
for (int i = 0; i < num_gpus_to_use; ++i) { for (int i = 0; i < num_gpus_to_use; ++i) {
const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i]; const PlatformDeviceId platform_device_id = valid_platform_device_ids[i];
if (virtual_devices.empty() || if (virtual_devices.empty() ||
virtual_devices.Get(i).memory_limit_mb_size() == 0) { virtual_devices.Get(i).memory_limit_mb_size() == 0) {
int64 single_virtual_device_memory_limit = 0; int64 single_virtual_device_memory_limit = 0;
TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit( TF_RETURN_IF_ERROR(
gpu_options, platform_gpu_id, &single_virtual_device_memory_limit)); SingleVirtualDeviceMemoryLimit(gpu_options, platform_device_id,
&single_virtual_device_memory_limit));
memory_limit_bytes.push_back(single_virtual_device_memory_limit); memory_limit_bytes.push_back(single_virtual_device_memory_limit);
} else { } else {
const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
@ -1347,36 +1352,37 @@ Status BaseGPUDeviceFactory::CreateDevices(
return static_cast<int64>(mb) * (1ll << 20); return static_cast<int64>(mb) * (1ll << 20);
}); });
} }
while (next_tf_gpu_id < memory_limit_bytes.size()) { while (next_tf_device_id < memory_limit_bytes.size()) {
TfGpuId tf_gpu_id(next_tf_gpu_id); TfDeviceId tf_device_id(next_tf_device_id);
++next_tf_gpu_id; ++next_tf_device_id;
TF_RETURN_IF_ERROR( TF_RETURN_IF_ERROR(GpuIdManager::InsertTfPlatformDeviceIdPair(
GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id)); tf_device_id, platform_device_id));
} }
} }
const int num_tf_gpus = next_tf_gpu_id; const int num_tf_gpus = next_tf_device_id;
LocalityMap device_localities; LocalityMap device_localities;
TF_RETURN_IF_ERROR( TF_RETURN_IF_ERROR(
GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities)); GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
// Build the GPUDevices // Build the GPUDevices
CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size()); CHECK_EQ(next_tf_device_id, memory_limit_bytes.size());
for (int di = 0; di < num_tf_gpus; ++di) { for (int di = 0; di < num_tf_gpus; ++di) {
TfGpuId tf_gpu_id(di); TfDeviceId tf_device_id(di);
int64 bytes = memory_limit_bytes[di]; int64 bytes = memory_limit_bytes[di];
auto it = device_localities.find(tf_gpu_id); auto it = device_localities.find(tf_device_id);
if (it == device_localities.end()) { if (it == device_localities.end()) {
return errors::Internal("Failed to find DeviceLocality for GPU device ", return errors::Internal("Failed to find DeviceLocality for GPU device ",
tf_gpu_id.value()); tf_device_id.value());
} }
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_device_id,
it->second, num_tf_gpus, devices)); bytes, it->second, num_tf_gpus,
devices));
} }
return Status::OK(); return Status::OK();
} }
static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id, static string GetShortDeviceDescription(PlatformDeviceId platform_device_id,
const se::DeviceDescription& desc) { const se::DeviceDescription& desc) {
#if GOOGLE_CUDA #if GOOGLE_CUDA
int cc_major; int cc_major;
@ -1386,54 +1392,56 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
cc_minor = 0; cc_minor = 0;
} }
// LINT.IfChange // LINT.IfChange
return strings::StrCat("device: ", platform_gpu_id.value(), return strings::StrCat("device: ", platform_device_id.value(),
", name: ", desc.name(), ", name: ", desc.name(),
", pci bus id: ", desc.pci_bus_id(), ", pci bus id: ", desc.pci_bus_id(),
", compute capability: ", cc_major, ".", cc_minor); ", compute capability: ", cc_major, ".", cc_minor);
// LINT.ThenChange(//tensorflow/python/framework/gpu_util.py) // LINT.ThenChange(//tensorflow/python/framework/gpu_util.py)
#elif TENSORFLOW_USE_ROCM #elif TENSORFLOW_USE_ROCM
return strings::StrCat("device: ", platform_gpu_id.value(), return strings::StrCat("device: ", platform_device_id.value(),
", name: ", desc.name(), ", name: ", desc.name(),
", pci bus id: ", desc.pci_bus_id()); ", pci bus id: ", desc.pci_bus_id());
#endif #endif
} }
 Status BaseGPUDeviceFactory::CreateGPUDevice(
-    const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
-    int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
+    const SessionOptions& options, const string& name_prefix,
+    TfDeviceId tf_device_id, int64 memory_limit,
+    const DeviceLocality& dev_locality, size_t num_tf_gpus,
     std::vector<std::unique_ptr<Device>>* devices) {
-  CHECK_GE(tf_gpu_id.value(), 0);
+  CHECK_GE(tf_device_id.value(), 0);
   const string device_name =
-      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
+      strings::StrCat(name_prefix, "/device:GPU:", tf_device_id.value());
   DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
-                                     tf_gpu_id);
-  PlatformGpuId platform_gpu_id;
+                                     tf_device_id);
+  PlatformDeviceId platform_device_id;
   TF_RETURN_IF_ERROR(
-      GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
+      GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
   int numa_node = dev_locality.numa_node();
   se::Platform* gpu_manager = GPUMachineManager();
-  auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
+  auto desc_status =
+      gpu_manager->DescriptionForDevice(platform_device_id.value());
   if (!desc_status.ok()) {
     return desc_status.status();
   }
   auto desc = desc_status.ConsumeValueOrDie();
-  std::vector<TfGpuId> peer_gpu_ids;
+  std::vector<TfDeviceId> peer_gpu_ids;
   peer_gpu_ids.reserve(num_tf_gpus);
   for (int id = 0; id < num_tf_gpus; ++id) {
-    TfGpuId peer_tf_gpu_id(id);
-    if (peer_tf_gpu_id != tf_gpu_id) {
-      peer_gpu_ids.push_back(peer_tf_gpu_id);
+    TfDeviceId peer_tf_device_id(id);
+    if (peer_tf_device_id != tf_device_id) {
+      peer_gpu_ids.push_back(peer_tf_device_id);
     }
   }
   GPUProcessState* process_state = GPUProcessState::singleton();
   Allocator* gpu_allocator = process_state->GetGPUAllocator(
-      options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
+      options.config.gpu_options(), tf_device_id, memory_limit, peer_gpu_ids);
   if (gpu_allocator == nullptr) {
     return errors::Internal("Failed to get memory allocator for TF GPU ",
-                            tf_gpu_id.value(), " with ", memory_limit,
+                            tf_device_id.value(), " with ", memory_limit,
                             " bytes of memory.");
   }
   absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
@@ -1441,7 +1449,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
     return errors::Internal("No allocator statistics");
   }
   // 'memory_limit' is the required memory size, but if the allocator with
-  // given tf_gpu_id was created before, we'll use it instead of creating a
+  // given tf_device_id was created before, we'll use it instead of creating a
   // new one (as TF gpu device is a shared resource), in which case the actual
   // memory limit represented by 'stats.bytes_limit' used by that allocator
   // may be different (which should be an error).
@@ -1451,11 +1459,11 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
   int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
   std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
       options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
-      tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, *desc),
+      tf_device_id, GetShortDeviceDescription(platform_device_id, *desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
   LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
             << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
-            << GetShortDeviceDescription(platform_gpu_id, *desc) << ")";
+            << GetShortDeviceDescription(platform_device_id, *desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(std::move(gpu_device));
@@ -1463,13 +1471,13 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
 }
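The comment about a pre-existing allocator is load-bearing: GPUProcessState caches allocators by the TF-side device id, so only the first request's byte limit takes effect. A minimal sketch of that behavior, assuming a CUDA build in which TfDeviceId(0) is already mapped to a platform device:

    GPUOptions gpu_options;
    GPUProcessState* ps = GPUProcessState::singleton();
    // First call creates the BFC allocator with a 1 GiB limit.
    Allocator* a = ps->GetGPUAllocator(gpu_options, TfDeviceId(0),
                                       /*total_bytes=*/1LL << 30,
                                       /*peer_gpu_ids=*/{});
    // A second call for the same TF device returns that same allocator; the
    // 2 GiB request below does not resize it, which is why the factory code
    // above cross-checks stats->bytes_limit against its own memory_limit.
    Allocator* b = ps->GetGPUAllocator(gpu_options, TfDeviceId(0),
                                       /*total_bytes=*/2LL << 30,
                                       /*peer_gpu_ids=*/{});
    // a == b.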
 namespace {
-std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>>
+std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
 GetPeerAccessMap(se::Platform* platform,
-                 const std::vector<PlatformGpuId>& visible_gpu_order) {
-  std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map(
-      new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>);
-  for (PlatformGpuId platform_gpu_i : visible_gpu_order) {
-    for (PlatformGpuId platform_gpu_j : visible_gpu_order) {
+                 const std::vector<PlatformDeviceId>& visible_gpu_order) {
+  std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
+      map(new std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>);
+  for (PlatformDeviceId platform_gpu_i : visible_gpu_order) {
+    for (PlatformDeviceId platform_gpu_j : visible_gpu_order) {
       se::StreamExecutor* from =
           DeviceIdUtil::ExecutorForPlatformDeviceId(platform, platform_gpu_i)
               .ValueOrDie();
@@ -1487,7 +1495,7 @@ GetPeerAccessMap(se::Platform* platform,
 }  // namespace
 Status BaseGPUDeviceFactory::GetInterconnectMaps(
-    const std::vector<PlatformGpuId>& visible_gpu_order,
+    const std::vector<PlatformDeviceId>& visible_gpu_order,
     se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
   // The default interconnect map is obtained from the StreamExecutor.
   auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
@@ -1495,8 +1503,8 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
   InterconnectMap& imap = maps->at(0);
   imap.name = "StreamExecutor";
   imap.strength = InterconnectMap::kStreamExecutorStrength;
-  for (PlatformGpuId gpu_id_i : visible_gpu_order) {
-    for (PlatformGpuId gpu_id_j : visible_gpu_order) {
+  for (PlatformDeviceId gpu_id_i : visible_gpu_order) {
+    for (PlatformDeviceId gpu_id_j : visible_gpu_order) {
       if (gpu_id_i == gpu_id_j) continue;
       if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
         imap.directed_links.insert({gpu_id_i, gpu_id_j});
@@ -1509,21 +1517,21 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
 Status BaseGPUDeviceFactory::GetDeviceLocalities(
     int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
     LocalityMap* localities) {
-  std::vector<TfGpuId> all_tf_gpu_ids;
-  all_tf_gpu_ids.reserve(num_tf_gpus);
+  std::vector<TfDeviceId> all_tf_device_ids;
+  all_tf_device_ids.reserve(num_tf_gpus);
   for (int i = 0; i < num_tf_gpus; ++i) {
-    all_tf_gpu_ids.push_back(TfGpuId(i));
+    all_tf_device_ids.push_back(TfDeviceId(i));
   }
-  for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
-    PlatformGpuId platform_gpu_id;
+  for (TfDeviceId tf_device_id : all_tf_device_ids) {
+    PlatformDeviceId platform_device_id;
     TF_RETURN_IF_ERROR(
-        GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
+        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
     // Get GPU bus_id from its reported NUMA affinity. Because GPUs are
     // virtualized in some environments, we can't just use the GPU id.
     // NUMA locales are indexed from 0, buses are indexed from 1.
     se::Platform* gpu_manager = GPUMachineManager();
     auto desc_status =
-        gpu_manager->DescriptionForDevice(platform_gpu_id.value());
+        gpu_manager->DescriptionForDevice(platform_device_id.value());
     if (!desc_status.ok()) {
       return desc_status.status();
     }
@@ -1537,7 +1545,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
       // trouble may manifest as slower than expected performance, or
       // outright failures.
       LOG(INFO) << "Could not identify NUMA node of platform GPU id "
-                << platform_gpu_id
+                << platform_device_id
                 << ", defaulting to 0. Your kernel may not have been built "
                 << "with NUMA support.";
       numa_node = 0;
@@ -1549,11 +1557,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     // Set LocalLinks from InterconnectMaps.
     LocalLinks* links = dev_locality.mutable_links();
     for (const InterconnectMap& imap : interconnects) {
-      for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
-        PlatformGpuId platform_gpu_dst;
+      for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
+        PlatformDeviceId platform_gpu_dst;
         TF_RETURN_IF_ERROR(
-            GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
-        if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) !=
+            GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
+        if (imap.directed_links.find({platform_device_id, platform_gpu_dst}) !=
             imap.directed_links.end()) {
           InterconnectLink* ilink = links->add_link();
           ilink->set_device_id(tf_gpu_dst.value());
@@ -1565,12 +1573,12 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     // If this is one of multiple virtual GPUs on the same physical GPU
     // add high strength links to the others.
-    for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
-      if (tf_gpu_id == tf_gpu_dst) continue;
-      PlatformGpuId platform_gpu_dst;
+    for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
+      if (tf_device_id == tf_gpu_dst) continue;
+      PlatformDeviceId platform_gpu_dst;
       TF_RETURN_IF_ERROR(
-          GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
-      if (platform_gpu_id == platform_gpu_dst) {
+          GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
+      if (platform_device_id == platform_gpu_dst) {
         InterconnectLink* ilink = links->add_link();
         ilink->set_device_id(tf_gpu_dst.value());
         ilink->set_type("SAME_DEVICE");
@@ -1578,10 +1586,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
       }
     }
-    (*localities)[tf_gpu_id] = dev_locality;
-    VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
-            << tf_gpu_id << " on bus " << dev_locality.bus_id()
-            << " numa: " << numa_node << " pci: " << desc->pci_bus_id()
+    (*localities)[tf_device_id] = dev_locality;
+    VLOG(1) << "GPUDevice PlatformDeviceId " << platform_device_id
+            << " TfDeviceId " << tf_device_id << " on bus "
+            << dev_locality.bus_id() << " numa: " << numa_node
+            << " pci: " << desc->pci_bus_id()
             << " DeviceLocality: " << dev_locality.DebugString();
   }
   return Status::OK();
@@ -1589,7 +1598,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
 static int GetDefaultMinGPUMultiprocessorCount(
     se::Platform* gpu_manager,
-    const std::vector<PlatformGpuId>& visible_gpu_order) {
+    const std::vector<PlatformDeviceId>& visible_gpu_order) {
   static const int kDefaultMinGPUMultiprocessorCount = 8;
   // Find the highest multi-processor count across all visible GPUs.
@@ -1614,7 +1623,7 @@ static int GetDefaultMinGPUMultiprocessorCount(
 static int GetMinGPUMultiprocessorCount(
     se::Platform* gpu_manager,
-    const std::vector<PlatformGpuId>& visible_gpu_order) {
+    const std::vector<PlatformDeviceId>& visible_gpu_order) {
   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
   if (tf_min_gpu_core_count == nullptr ||
@@ -1704,14 +1713,14 @@ std::vector<int> GetSupportedAMDGPUISAVersions() {
 }  // namespace
 Status BaseGPUDeviceFactory::EnablePeerAccess(
-    const std::vector<PlatformGpuId>& visible_gpu_order) {
+    const std::vector<PlatformDeviceId>& visible_gpu_order) {
   se::Platform* gpu_manager = GPUMachineManager();
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const PlatformGpuId platform_gpu_i = visible_gpu_order[i];
+    const PlatformDeviceId platform_gpu_i = visible_gpu_order[i];
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
+      const PlatformDeviceId platform_gpu_j = visible_gpu_order[j];
       // We have already validated that ExecutorForDevice() calls return OK.
       se::StreamExecutor* from =
           DeviceIdUtil::ExecutorForPlatformDeviceId(gpu_manager, platform_gpu_i)
@@ -1748,8 +1757,8 @@ Status BaseGPUDeviceFactory::EnablePeerAccess(
 }
 Status BaseGPUDeviceFactory::GetValidDeviceIds(
-    const std::vector<PlatformGpuId>& visible_gpu_order,
-    std::vector<PlatformGpuId>* ids) {
+    const std::vector<PlatformDeviceId>& visible_gpu_order,
+    std::vector<PlatformDeviceId>* ids) {
   se::Platform* gpu_manager = GPUMachineManager();
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
     int visible_gpu_id = visible_gpu_order[i].value();
@@ -1834,7 +1843,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   // Filter out devices that don't have the right capability or power.
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
+    const PlatformDeviceId visible_gpu_id = visible_gpu_order[i];
     auto description_status =
         gpu_manager->DescriptionForDevice(visible_gpu_id.value());
     if (!description_status.ok()) {
@@ -1904,7 +1913,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   if (!ids->empty()) {
     std::vector<int> raw_ids(ids->size());
     std::transform(ids->begin(), ids->end(), raw_ids.begin(),
-                   [](PlatformGpuId id) -> int { return id.value(); });
+                   [](PlatformDeviceId id) -> int { return id.value(); });
     LOG(INFO) << "Adding visible gpu devices: " << absl::StrJoin(raw_ids, ", ");
   }

View File

@@ -53,7 +53,8 @@ class BaseGPUDevice : public LocalDevice {
  public:
   BaseGPUDevice(const SessionOptions& options, const std::string& name,
                 Bytes memory_limit, const DeviceLocality& locality,
-                TfGpuId tf_gpu_id, const std::string& physical_device_desc,
+                TfDeviceId tf_device_id,
+                const std::string& physical_device_desc,
                 Allocator* gpu_allocator, Allocator* cpu_allocator,
                 bool sync_every_op);
@@ -87,9 +88,10 @@ class BaseGPUDevice : public LocalDevice {
   // Returns the platform GPU id of this device within the native driver system;
   // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
   int gpu_id() const {
-    PlatformGpuId platform_gpu_id;
-    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
-    return platform_gpu_id.value();
+    PlatformDeviceId platform_device_id;
+    TF_CHECK_OK(
+        GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
+    return platform_device_id.value();
   }
   // The executor that provides control for the device; e.g., for CUDA this
@@ -146,7 +148,7 @@ class BaseGPUDevice : public LocalDevice {
   GPUDeviceContext* device_context_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
   mutex trace_mu_;
-  TfGpuId tf_gpu_id_;
+  TfDeviceId tf_device_id_;
   const bool sync_every_op_ = false;
   EventMgr* em_ = nullptr;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
@@ -325,53 +327,56 @@ class BaseGPUDeviceFactory : public DeviceFactory {
     int32 strength;
     static const int kSameDeviceStrength;
     static const int kStreamExecutorStrength;
-    std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links;
+    std::set<std::pair<PlatformDeviceId, PlatformDeviceId>> directed_links;
   };
  protected:
   // Populates *maps with interconnect maps for all local direct access
   // pathways between GPUs.
   virtual Status GetInterconnectMaps(
-      const std::vector<PlatformGpuId>& visible_gpu_order,
+      const std::vector<PlatformDeviceId>& visible_gpu_order,
       se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
-  struct TfGpuIdHash {
-    std::size_t operator()(const TfGpuId& id) const noexcept {
+  struct TfDeviceIdHash {
+    std::size_t operator()(const TfDeviceId& id) const noexcept {
       return std::hash<int>{}(id.value());
     }
   };
-  typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
+  typedef std::unordered_map<TfDeviceId, DeviceLocality, TfDeviceIdHash>
+      LocalityMap;
   // Populates *localities with the DeviceLocality descriptor for
-  // every TfGpuId.
+  // every TfDeviceId.
   virtual Status GetDeviceLocalities(
      int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
      LocalityMap* localities);
  private:
-  // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
-  // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
-  // vector.
+  // Creates a BaseGPUDevice associated with 'tf_device_id', allocates
+  // (strictly) 'memory_limit' bytes of GPU memory to it, and adds it to the
+  // 'devices' vector.
   Status CreateGPUDevice(const SessionOptions& options,
-                         const std::string& name_prefix, TfGpuId tf_gpu_id,
-                         int64 memory_limit, const DeviceLocality& dev_locality,
-                         size_t num_tf_gpus,
+                         const std::string& name_prefix,
+                         TfDeviceId tf_device_id, int64 memory_limit,
+                         const DeviceLocality& dev_locality, size_t num_tf_gpus,
                          std::vector<std::unique_ptr<Device>>* devices);
   virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
       const SessionOptions& options, const string& name, Bytes memory_limit,
-      const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
+      const DeviceLocality& dev_locality, TfDeviceId tf_device_id,
      const string& physical_device_desc, Allocator* gpu_allocator,
      Allocator* cpu_allocator) = 0;
-  Status EnablePeerAccess(const std::vector<PlatformGpuId>& visible_gpu_order);
+  Status EnablePeerAccess(
+      const std::vector<PlatformDeviceId>& visible_gpu_order);
   // Returns into 'ids' the list of valid platform GPU ids, in the order that
   // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
   // based upon 'visible_gpu_order' which was generated by parsing
   // GPUOptions::visible_device_list which is a comma-separated list of CUDA or
   // ROCm GPU ids.
-  Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
-                           std::vector<PlatformGpuId>* ids);
+  Status GetValidDeviceIds(
+      const std::vector<PlatformDeviceId>& visible_gpu_order,
+      std::vector<PlatformDeviceId>* ids);
   // Cache the valid device IDs if not already cached. Cached IDs are stored in
   // field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to
@@ -379,14 +384,14 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // devices should be treated as visible, like ListPhysicalDevices.
   Status CacheDeviceIds();
-  // visible_gpu_initialized_[platform_gpu_id] is true if visible GPU
-  // platform_gpu_id has been initialized by the process.
+  // visible_gpu_initialized_[platform_device_id] is true if visible GPU
+  // platform_device_id has been initialized by the process.
   std::unordered_map<int, bool> visible_gpu_initialized_;
   // Cached device IDs, as returned by GetValidDeviceIds when every physical
   // device is visible. Cache should not be used if some devices are not
   // visible.
-  std::vector<PlatformGpuId> cached_device_ids_;
+  std::vector<PlatformDeviceId> cached_device_ids_;
 };
 }  // namespace tensorflow

View File

@@ -30,9 +30,9 @@ class GPUDevice : public BaseGPUDevice {
  public:
   GPUDevice(const SessionOptions& options, const string& name,
             Bytes memory_limit, const DeviceLocality& locality,
-            TfGpuId tf_gpu_id, const string& physical_device_desc,
+            TfDeviceId tf_device_id, const string& physical_device_desc,
             Allocator* gpu_allocator, Allocator* cpu_allocator)
-      : BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
+      : BaseGPUDevice(options, name, memory_limit, locality, tf_device_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
                       false /* sync every op */) {
     if (options.config.has_gpu_options()) {
@@ -63,11 +63,11 @@ class GPUDeviceFactory : public BaseGPUDeviceFactory {
  private:
   std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
       const SessionOptions& options, const string& name, Bytes memory_limit,
-      const DeviceLocality& locality, TfGpuId tf_gpu_id,
+      const DeviceLocality& locality, TfDeviceId tf_device_id,
       const string& physical_device_desc, Allocator* gpu_allocator,
       Allocator* cpu_allocator) override {
     return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
-                                        tf_gpu_id, physical_device_desc,
+                                        tf_device_id, physical_device_desc,
                                         gpu_allocator, cpu_allocator);
   }
 };
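For completeness, the concrete factory above is what the runtime instantiates for "/device:GPU:*" devices; the registration line sits outside this diff's hunks, but it is typically hooked up with TensorFlow's device-factory macro along these lines (the priority value here is illustrative, not confirmed by this commit):

    // Hypothetical registration sketch; not part of the hunks shown above.
    REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory, 210);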

View File

@@ -30,7 +30,7 @@ namespace tensorflow {
 namespace {
 const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
-int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
+int64 GetTotalGPUMemory(PlatformDeviceId gpu_id) {
   se::StreamExecutor* se =
       DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
           .ValueOrDie();
@@ -40,7 +40,7 @@ int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
   return total_memory;
 }
-Status GetComputeCapability(PlatformGpuId gpu_id, int* cc_major,
+Status GetComputeCapability(PlatformDeviceId gpu_id, int* cc_major,
                             int* cc_minor) {
   se::StreamExecutor* se =
       DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
@@ -350,7 +350,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
 // error.
 TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
   int cc_major, cc_minor;
-  TF_ASSERT_OK(GetComputeCapability(PlatformGpuId(0), &cc_major, &cc_minor));
+  TF_ASSERT_OK(GetComputeCapability(PlatformDeviceId(0), &cc_major, &cc_minor));
   // Exit early while running on Pascal or later GPUs.
   if (cc_major >= 6) {
     return;
@@ -371,10 +371,10 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
 // more memory than what is available on the device.
 TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   static constexpr double kGpuMemoryFraction = 1.2;
-  static constexpr PlatformGpuId kPlatformGpuId(0);
+  static constexpr PlatformDeviceId kPlatformDeviceId(0);
   int cc_major, cc_minor;
-  TF_ASSERT_OK(GetComputeCapability(kPlatformGpuId, &cc_major, &cc_minor));
+  TF_ASSERT_OK(GetComputeCapability(kPlatformDeviceId, &cc_major, &cc_minor));
   // Exit early if running on pre-Pascal GPUs.
   if (cc_major < 6) {
     LOG(INFO)
@@ -389,8 +389,9 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   ASSERT_EQ(1, devices.size());
   int64 memory_limit = devices[0]->attributes().memory_limit();
-  ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kPlatformGpuId) *
-                                             kGpuMemoryFraction));
+  ASSERT_EQ(memory_limit,
+            static_cast<int64>(GetTotalGPUMemory(kPlatformDeviceId) *
+                               kGpuMemoryFraction));
   AllocatorAttributes allocator_attributes = AllocatorAttributes();
   allocator_attributes.set_gpu_compatible(true);

View File

@@ -17,13 +17,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device/device_id.h"
-namespace tensorflow {
-// TODO(annarev): remove these aliases after all references are updated
-// to use device ids.
-typedef TfDeviceId TfGpuId;
-typedef PlatformDeviceId PlatformGpuId;
-}  // namespace tensorflow
+// TODO(sanjoy): Delete the header and forward the references.
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
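With the aliases deleted, any straggler still spelling the old names fails to compile, which is what makes the migration stick. The mechanical rewrite for downstream code is simply:

    // Before this commit, callers could still write (via the aliases):
    //   TfGpuId tf_id(0);
    //   PlatformGpuId platform_id(0);
    // After it, the device-generic types are spelled directly:
    TfDeviceId tf_id(0);
    PlatformDeviceId platform_id(0);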

View File

@@ -20,16 +20,16 @@ limitations under the License.
 namespace tensorflow {
-Status GpuIdManager::InsertTfPlatformGpuIdPair(
-    TfDeviceId tf_gpu_id, PlatformDeviceId platform_gpu_id) {
-  return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_gpu_id,
-                                                       platform_gpu_id);
+Status GpuIdManager::InsertTfPlatformDeviceIdPair(
+    TfDeviceId tf_device_id, PlatformDeviceId platform_device_id) {
+  return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_device_id,
+                                                       platform_device_id);
 }
-Status GpuIdManager::TfToPlatformGpuId(TfDeviceId tf_gpu_id,
-                                       PlatformDeviceId* platform_gpu_id) {
-  return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_gpu_id,
-                                               platform_gpu_id);
+Status GpuIdManager::TfToPlatformDeviceId(
+    TfDeviceId tf_device_id, PlatformDeviceId* platform_device_id) {
+  return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_device_id,
+                                               platform_device_id);
 }
 void GpuIdManager::TestOnlyReset() { DeviceIdManager::TestOnlyReset(); }

View File

@@ -21,17 +21,18 @@ limitations under the License.
 namespace tensorflow {
-// Class that maintains a map from TfGpuId to PlatformGpuId, and manages the
-// translation between them.
+// Class that maintains a map from TfDeviceId to PlatformDeviceId, and manages
+// the translation between them.
 class GpuIdManager {
  public:
-  // Adds a mapping from tf_gpu_id to platform_gpu_id.
-  static Status InsertTfPlatformGpuIdPair(TfDeviceId tf_gpu_id,
-                                          PlatformDeviceId platform_gpu_id);
-  // Gets the platform_gpu_id associated with tf_gpu_id. Returns OK if found.
-  static Status TfToPlatformGpuId(TfDeviceId tf_gpu_id,
-                                  PlatformDeviceId* platform_gpu_id);
+  // Adds a mapping from tf_device_id to platform_device_id.
+  static Status InsertTfPlatformDeviceIdPair(
+      TfDeviceId tf_device_id, PlatformDeviceId platform_device_id);
+  // Gets the platform_device_id associated with tf_device_id. Returns OK if
+  // found.
+  static Status TfToPlatformDeviceId(TfDeviceId tf_device_id,
+                                     PlatformDeviceId* platform_device_id);
   // Clears the map. Used in unit tests only.
   static void TestOnlyReset();
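Taken together with the .cc change above, the renamed API round-trips as in this self-contained sketch (the id values are arbitrary):

    // Register TF GPU 0 as platform (CUDA/ROCm) ordinal 2, then look it up.
    TF_CHECK_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(TfDeviceId(0),
                                                           PlatformDeviceId(2)));
    PlatformDeviceId platform_device_id;
    TF_CHECK_OK(
        GpuIdManager::TfToPlatformDeviceId(TfDeviceId(0), &platform_device_id));
    CHECK_EQ(platform_device_id.value(), 2);
    GpuIdManager::TestOnlyReset();  // Clear the map to keep unit tests hermetic.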

View File

@@ -83,10 +83,10 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
   process_state_ = ProcessState::singleton();
 }
-int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
+int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
   // Return the NUMA node associated with the GPU's StreamExecutor.
   se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
-                               DEVICE_GPU, GPUMachineManager(), tf_gpu_id)
+                               DEVICE_GPU, GPUMachineManager(), tf_device_id)
                                .ValueOrDie();
   int numa_node = se->GetDeviceDescription().numa_node();
   // bus_id must be non-negative. If the numa_node is not known,
@@ -96,11 +96,11 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
 // NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
 static SubAllocator* CreateSubAllocator(
-    const GPUOptions& options, PlatformGpuId platform_gpu_id,
+    const GPUOptions& options, PlatformDeviceId platform_device_id,
     const std::vector<SubAllocator::Visitor>& alloc_visitors,
-    size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
+    size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
   auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                            platform_gpu_id)
+                                                            platform_device_id)
                       .ValueOrDie();
   // FIXME(imintz): Observed OOM issues when using the virtual memory
@@ -110,21 +110,21 @@ static SubAllocator* CreateSubAllocator(
   // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
   if (options.per_process_gpu_memory_fraction() > 1.0 ||
       options.experimental().use_unified_memory()) {
-    return new DeviceMemAllocator(executor, platform_gpu_id,
+    return new DeviceMemAllocator(executor, platform_device_id,
                                   /*use_unified_memory=*/true, alloc_visitors,
                                   {});
   } else {
     auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
         executor->implementation()->GpuContextHack());
-    absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
+    absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
     platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
-    for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
-      PlatformGpuId platform_gpu_id;
-      TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
-      platform_peer_gpu_ids.insert(platform_gpu_id);
+    for (const TfDeviceId tf_device_id : peer_gpu_ids) {
+      PlatformDeviceId platform_device_id;
+      TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
+      platform_peer_gpu_ids.insert(platform_device_id);
     }
-    std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
+    std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
         platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
     // Adjust virtual address space to be slightly larger than the physical
@@ -133,7 +133,7 @@ static SubAllocator* CreateSubAllocator(
     // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
     // the va space.
     return GpuVirtualMemAllocator::Create(
-               alloc_visitors, {}, *gpu_context, platform_gpu_id,
+               alloc_visitors, {}, *gpu_context, platform_device_id,
                /*virtual_address_space_size=*/total_bytes * 2,
                platform_peer_gpu_ids_vec)
         .ValueOrDie()
@@ -141,7 +141,7 @@ static SubAllocator* CreateSubAllocator(
 }
 #else
   return new DeviceMemAllocator(
-      executor, platform_gpu_id,
+      executor, platform_device_id,
       (options.per_process_gpu_memory_fraction() > 1.0 ||
        options.experimental().use_unified_memory()),
       alloc_visitors, {});
@@ -149,21 +149,21 @@ static SubAllocator* CreateSubAllocator(
 }
 Allocator* GPUProcessState::GetGPUAllocator(
-    const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
-    const std::vector<TfGpuId>& peer_gpu_ids) {
+    const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
+    const std::vector<TfDeviceId>& peer_gpu_ids) {
   CHECK(process_state_);
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   const string& allocator_type = options.allocator_type();
   mutex_lock lock(mu_);
   DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
-                                     tf_gpu_id);
-  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
-    gpu_allocators_.resize(tf_gpu_id.value() + 1);
+                                     tf_device_id);
+  if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    gpu_allocators_.resize(tf_device_id.value() + 1);
   }
-  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
   if (allocator_parts.allocator == nullptr) {
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
@@ -171,19 +171,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
       return nullptr;
     }
-    PlatformGpuId platform_gpu_id;
-    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
-    int bus_id = BusIdForGPU(tf_gpu_id);
+    PlatformDeviceId platform_device_id;
+    TF_CHECK_OK(
+        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
+    int bus_id = BusIdForGPU(tf_device_id);
     DCHECK_GE(bus_id, 0);
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
     auto* sub_allocator =
-        CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
+        CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
                            total_bytes, peer_gpu_ids);
-    GPUBFCAllocator* gpu_bfc_allocator =
-        new GPUBFCAllocator(sub_allocator, total_bytes, options,
-                            strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
+    GPUBFCAllocator* gpu_bfc_allocator = new GPUBFCAllocator(
+        sub_allocator, total_bytes, options,
+        strings::StrCat("GPU_", tf_device_id.value(), "_bfc"));
     Allocator* gpu_allocator = gpu_bfc_allocator;
     SharedCounter* timing_counter = nullptr;
     if (options.experimental().timestamped_allocator()) {
@@ -195,29 +196,30 @@ Allocator* GPUProcessState::GetGPUAllocator(
     // distinctive patterns on both ends of allocated memory.
     if (UseCudaMemoryGuardAllocator()) {
       LOG(INFO) << "Using memory guard allocator for GPU.";
-      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
-      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
+      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_device_id);
+      gpu_allocator =
+          new GPUNanResetAllocator(gpu_allocator, platform_device_id);
     } else if (UseCudaMallocAllocator()) {
       LOG(INFO) << "Using CUDA malloc allocator for GPU.";
       // If true, passes all allocation requests through to cudaMalloc
       // useful for doing memory debugging with tools like cuda-memcheck
       // **WARNING** probably will not work in a multi-gpu scenario
       gpu_allocator =
-          new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
+          new GPUcudaMallocAllocator(gpu_allocator, platform_device_id);
     } else if (UseCudaMallocAsyncAllocator()) {
       LOG(INFO) << "Using CUDA malloc Async allocator for GPU.";
       // If true, passes all allocation requests through to cudaMallocAsync
       // TODO: useful for doing memory debugging with tools like cuda-memcheck
       // TODO: **WARNING** probably will not work in a multi-gpu scenario
       gpu_allocator =
-          new GpuCudaMallocAsyncAllocator(platform_gpu_id, total_bytes);
+          new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
     }
     Allocator* recording_allocator = nullptr;
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::GPU;
-      md.dev_index = platform_gpu_id.value();
+      md.dev_index = platform_device_id.value();
       md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
@@ -240,20 +242,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
-SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
+SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
   DCHECK(process_state_);
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
-                                     tf_gpu_id);
+                                     tf_device_id);
   mutex_lock l(mu_);
-  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
-    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
+  if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
                << " but only have " << gpu_allocators_.size();
     return nullptr;
   }
-  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
   if (allocator_parts.counter.get() == nullptr) {
     SharedCounter* timing_counter = new SharedCounter;
     allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
@@ -303,7 +305,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
     if (gpu_allocators_[i].allocator != nullptr) {
       se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
-                                               TfGpuId(i))
+                                               TfDeviceId(i))
                .ValueOrDie();
       break;
     }

View File

@@ -72,18 +72,18 @@ class GPUProcessState {
   //
   // 'total_bytes' is the total number of bytes that should be made
   // available to the allocator. The first call to this function for
-  // a given tf_gpu_id creates the allocator, so only the total_bytes
+  // a given tf_device_id creates the allocator, so only the total_bytes
   // used on that first call is used.
   //
   // "Allocator type" describes the type of algorithm to use for the
   // underlying allocator. REQUIRES: Must be a valid type (see
   // config.proto for the list of supported strings.).
   //
-  // REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
-  // current system environment. Otherwise returns nullptr.
-  virtual Allocator* GetGPUAllocator(const GPUOptions& options,
-                                     TfGpuId tf_gpu_id, size_t total_bytes,
-                                     const std::vector<TfGpuId>& peer_gpu_ids);
+  // REQUIRES: tf_device_id must be a valid id for a BaseGPUDevice available in
+  // the current system environment. Otherwise returns nullptr.
+  virtual Allocator* GetGPUAllocator(
+      const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
+      const std::vector<TfDeviceId>& peer_gpu_ids);
   int NumGPUAllocators() {
     mutex_lock l(mu_);
@@ -115,9 +115,9 @@ class GPUProcessState {
                               const SubAllocator::Visitor& visitor);
   // Returns bus_id for the given GPU id.
-  virtual int BusIdForGPU(TfGpuId tf_gpu_id);
-  SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
+  virtual int BusIdForGPU(TfDeviceId tf_device_id);
+  SharedCounter* GPUAllocatorCounter(TfDeviceId tf_device_id);
  protected:
   // GPUProcessState is a singleton that should not normally be deleted except
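Every public entry point in this header is now keyed by the TF-side id rather than the platform ordinal; a short usage sketch, assuming a CUDA build where TF GPU 0 exists and has been registered with GpuIdManager:

    GPUProcessState* ps = GPUProcessState::singleton();
    // Bus id is derived from the executor's NUMA node (see BusIdForGPU above).
    int bus_id = ps->BusIdForGPU(TfDeviceId(0));
    // May be nullptr if no allocator has been created for this id yet.
    SharedCounter* counter = ps->GPUAllocatorCounter(TfDeviceId(0));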

View File

@@ -44,7 +44,7 @@ StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
 }
 Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
-                                            PlatformGpuId gpu_id) {
+                                            PlatformDeviceId gpu_id) {
   TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
                       SupportsVirtualAddressManagement(device));
   if (!supports_virtual_address_management) {
@@ -59,11 +59,11 @@ Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
 /* static */ stream_executor::port::StatusOr<
     std::unique_ptr<GpuVirtualMemAllocator>>
-GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
-                               const std::vector<Visitor>& free_visitors,
-                               GpuContext& gpu_context, PlatformGpuId gpu_id,
-                               size_t virtual_address_space_size,
-                               const std::vector<PlatformGpuId>& peer_gpu_ids) {
+GpuVirtualMemAllocator::Create(
+    const std::vector<Visitor>& alloc_visitors,
+    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
+    PlatformDeviceId gpu_id, size_t virtual_address_space_size,
+    const std::vector<PlatformDeviceId>& peer_gpu_ids) {
   std::vector<GpuDeviceHandle> access_gpu_handles;
   access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
@@ -111,7 +111,8 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
 GpuVirtualMemAllocator::GpuVirtualMemAllocator(
     const std::vector<Visitor>& alloc_visitors,
     const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
-    PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
+    PlatformDeviceId gpu_id,
+    const std::vector<GpuDeviceHandle> access_gpu_handles,
     GpuDriver::VmemSpan vmem, size_t granularity)
     : SubAllocator(alloc_visitors, free_visitors),
       gpu_context_(gpu_context),

View File

@@ -44,9 +44,9 @@ class GpuVirtualMemAllocator : public SubAllocator {
       std::unique_ptr<GpuVirtualMemAllocator>>
   Create(const std::vector<Visitor>& alloc_visitors,
          const std::vector<Visitor>& free_visitors,
-         stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
+         stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
          size_t virtual_address_space_size,
-         const std::vector<PlatformGpuId>& peer_gpu_ids);
+         const std::vector<PlatformDeviceId>& peer_gpu_ids);
   ~GpuVirtualMemAllocator() override;
   // Allocates memory at least as large as requested by num_bytes. Will be
@@ -74,12 +74,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
   GpuVirtualMemAllocator(
       const std::vector<Visitor>& alloc_visitors,
       const std::vector<Visitor>& free_visitors,
-      stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
+      stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
       std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
       stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
   stream_executor::gpu::GpuContext& gpu_context_;
-  PlatformGpuId gpu_id_;
+  PlatformDeviceId gpu_id_;
   // Peer access is configured at mmap time so the allocator must be aware of
   // all gpus that may want to read the memory. This list also includes the

View File

@@ -35,7 +35,7 @@ constexpr size_t k2MiB{2 << 20};
 // Creates an allocator with 8 MiB of virtual address space.
 std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
-  PlatformGpuId gpu_id(0);
+  PlatformDeviceId gpu_id(0);
   auto executor =
       DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
           .ValueOrDie();
@@ -48,7 +48,7 @@ std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
 }
 TEST(GpuVirtualMemAllocatorTest, SimpleAlloc) {
-  PlatformGpuId gpu_id(0);
+  PlatformDeviceId gpu_id(0);
   auto executor =
       DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
           .ValueOrDie();

View File

@@ -92,14 +92,15 @@ Status SingleMachine::Provision() {
         return errors::InvalidArgument(
             strings::StrCat("Not able to parse GPU device name: ", dev.name()));
       }
-      TfGpuId tf_gpu_id(parsed.id);
-      PlatformGpuId platform_gpu_id;
-      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
+      TfDeviceId tf_device_id(parsed.id);
+      PlatformDeviceId platform_device_id;
+      Status s =
+          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
       if (!s.ok()) {
         return errors::Unavailable("Unknown TF GPU device with id ",
-                                   tf_gpu_id.value(), ": ", s.ToString());
+                                   tf_device_id.value(), ": ", s.ToString());
       }
-      attr = GetLocalGPUInfo(platform_gpu_id);
+      attr = GetLocalGPUInfo(platform_device_id);
     } else if (dev.device_type().find("XLA") == string::npos) {
       // Filter out the fake XLA devices to avoid double counting the actual
       // hardware resources that are available.

View File

@@ -74,14 +74,14 @@ DeviceProperties GetLocalCPUInfo() {
   return device;
 }
-DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
+DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id) {
   DeviceProperties device;
   device.set_type("GPU");
 #if GOOGLE_CUDA
   cudaDeviceProp properties;
   cudaError_t error =
-      cudaGetDeviceProperties(&properties, platform_gpu_id.value());
+      cudaGetDeviceProperties(&properties, platform_device_id.value());
   if (error != cudaSuccess) {
     device.set_type("UNKNOWN");
     LOG(ERROR) << "Failed to get device properties, error code: " << error;
@@ -117,7 +117,7 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
 #elif TENSORFLOW_USE_ROCM
   hipDeviceProp_t properties;
   hipError_t error =
-      hipGetDeviceProperties(&properties, platform_gpu_id.value());
+      hipGetDeviceProperties(&properties, platform_device_id.value());
   if (error != hipSuccess) {
     device.set_type("UNKNOWN");
     LOG(ERROR) << "Failed to get device properties, error code: " << error;
@@ -156,16 +156,17 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
     return GetLocalCPUInfo();
   } else if (device.type == "GPU") {
     if (device.has_id) {
-      TfGpuId tf_gpu_id(device.id);
-      PlatformGpuId platform_gpu_id;
-      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
+      TfDeviceId tf_device_id(device.id);
+      PlatformDeviceId platform_device_id;
+      Status s =
+          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
       if (!s.ok()) {
         LOG(ERROR) << s;
         return unknown;
       }
-      return GetLocalGPUInfo(platform_gpu_id);
+      return GetLocalGPUInfo(platform_device_id);
     } else {
-      return GetLocalGPUInfo(PlatformGpuId(0));
+      return GetLocalGPUInfo(PlatformDeviceId(0));
     }
   }
   return unknown;

View File

@@ -28,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id);
+DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id);
 // Returns the DeviceProperties of the specified device
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
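The tests in the next file pin down the fallback contract of these helpers; in short, an out-of-range platform id degrades to an "UNKNOWN" device rather than an error. A sketch, assuming a machine with fewer than 101 GPUs:

    DeviceProperties p = GetLocalGPUInfo(PlatformDeviceId(100));
    // p.type() == "UNKNOWN"; callers branch on the type string.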

View File

@@ -33,11 +33,11 @@ TEST(UtilsTest, GetLocalGPUInfo) {
   DeviceProperties properties;
   // Invalid platform GPU ID.
-  properties = GetLocalGPUInfo(PlatformGpuId(100));
+  properties = GetLocalGPUInfo(PlatformDeviceId(100));
   EXPECT_EQ("UNKNOWN", properties.type());
   // Succeed when a valid platform GPU id was inserted.
-  properties = GetLocalGPUInfo(PlatformGpuId(0));
+  properties = GetLocalGPUInfo(PlatformDeviceId(0));
   EXPECT_EQ("GPU", properties.type());
   EXPECT_EQ("NVIDIA", properties.vendor());
 #elif TENSORFLOW_USE_ROCM
@@ -45,21 +45,21 @@ TEST(UtilsTest, GetLocalGPUInfo) {
   DeviceProperties properties;
   // Invalid platform GPU ID.
-  properties = GetLocalGPUInfo(PlatformGpuId(100));
+  properties = GetLocalGPUInfo(PlatformDeviceId(100));
   EXPECT_EQ("UNKNOWN", properties.type());
   // Succeed when a valid platform GPU id was inserted.
-  properties = GetLocalGPUInfo(PlatformGpuId(0));
+  properties = GetLocalGPUInfo(PlatformDeviceId(0));
   EXPECT_EQ("GPU", properties.type());
   EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
 #else
   LOG(INFO) << "CUDA is not enabled.";
   DeviceProperties properties;
-  properties = GetLocalGPUInfo(PlatformGpuId(0));
+  properties = GetLocalGPUInfo(PlatformDeviceId(0));
   EXPECT_EQ("GPU", properties.type());
-  properties = GetLocalGPUInfo(PlatformGpuId(100));
+  properties = GetLocalGPUInfo(PlatformDeviceId(100));
   EXPECT_EQ("GPU", properties.type());
 #endif
 }
@@ -97,14 +97,14 @@ TEST(UtilsTest, GetDeviceInfo) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   // Invalid platform GPU id.
-  TF_ASSERT_OK(
-      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
+  TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(
+      TfDeviceId(0), PlatformDeviceId(100)));
   properties = GetDeviceInfo(device);
   EXPECT_EQ("UNKNOWN", properties.type());
   // Valid platform GPU id.
-  TF_ASSERT_OK(
-      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
+  TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(TfDeviceId(1),
+                                                          PlatformDeviceId(0)));
   device.id = 1;
   properties = GetDeviceInfo(device);
   EXPECT_EQ("GPU", properties.type());

View File

@@ -241,14 +241,15 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceNameUtils::ParsedName parsed;
   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
-      TfGpuId tf_gpu_id(parsed.id);
-      PlatformGpuId platform_gpu_id;
-      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
+      TfDeviceId tf_device_id(parsed.id);
+      PlatformDeviceId platform_device_id;
+      Status s =
+          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
       if (!s.ok()) {
         // We are probably running simulation without linking cuda libraries.
-        platform_gpu_id = PlatformGpuId(parsed.id);
+        platform_device_id = PlatformDeviceId(parsed.id);
       }
-      return GetLocalGPUInfo(platform_gpu_id);
+      return GetLocalGPUInfo(platform_device_id);
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }