Finish migrating {Tf|Platform}GpuId to {Tf|Platform}DeviceId

PiperOrigin-RevId: 361252995
Change-Id: I818798fc00efe7b98c35145ce067204d9e023895
Authored by Sanjoy Das on 2021-03-05 16:56:23 -08:00; committed by TensorFlower Gardener
parent fbd744dfdb
commit d7634bbfaf
28 changed files with 398 additions and 379 deletions
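The rename is mechanical but touches call sites across the GPU runtime. A minimal before/after sketch of the pattern, illustrative only and based on the GpuIdManager helpers that appear in the diffs below (the id value 0 is arbitrary):

// Before: GPU-specific aliases and helper names.
TfGpuId tf_gpu_id(0);
PlatformGpuId platform_gpu_id;
TF_RETURN_IF_ERROR(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));

// After: device-generic id types and the renamed helper.
TfDeviceId tf_device_id(0);
PlatformDeviceId platform_device_id;
TF_RETURN_IF_ERROR(
    GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));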


@ -43,15 +43,15 @@ static xla::StatusOr<absl::optional<std::set<int>>> ParseVisibleDeviceList(
}
const std::vector<string> visible_devices =
absl::StrSplit(visible_device_list, ',');
for (const string& platform_gpu_id_str : visible_devices) {
int32 platform_gpu_id;
if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
for (const string& platform_device_id_str : visible_devices) {
int32 platform_device_id;
if (!absl::SimpleAtoi(platform_device_id_str, &platform_device_id)) {
return errors::InvalidArgument(
"Could not parse entry in 'visible_device_list': '",
platform_gpu_id_str,
platform_device_id_str,
"'. visible_device_list = ", visible_device_list);
}
gpu_ids.insert(platform_gpu_id);
gpu_ids.insert(platform_device_id);
}
return {{gpu_ids}};
}
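A hypothetical call into this parser, for reference (the device list value "0,2" is made up):

xla::StatusOr<absl::optional<std::set<int>>> parsed =
    ParseVisibleDeviceList("0,2");
// On success, *parsed.ValueOrDie() is the set {0, 2}.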


@ -102,19 +102,21 @@ struct EdgePtrCompare {
// TODO(laigd): instead of deciding the device here, the converter should accept
// a device name as one of the conversion parameters so users can control
// which device they want to run the conversion on.
std::pair<TfGpuId, PlatformGpuId> GetFirstValidDeviceId() {
for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
TfGpuId tf_gpu_id(tf_gpu_id_value);
PlatformGpuId platform_gpu_id;
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
std::pair<TfDeviceId, PlatformDeviceId> GetFirstValidDeviceId() {
for (int tf_device_id_value = 0; tf_device_id_value < 100;
++tf_device_id_value) {
TfDeviceId tf_device_id(tf_device_id_value);
PlatformDeviceId platform_device_id;
Status s =
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
if (s.ok()) {
VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
<< platform_gpu_id.value();
return std::make_pair(tf_gpu_id, platform_gpu_id);
VLOG(1) << "Found TF GPU " << tf_device_id.value() << " at cuda device "
<< platform_device_id.value();
return std::make_pair(tf_device_id, platform_device_id);
}
}
LOG(ERROR) << "Could not find any TF GPUs";
return std::make_pair(TfGpuId(-1), PlatformGpuId(-1));
return std::make_pair(TfDeviceId(-1), PlatformDeviceId(-1));
}
// Returns false for const nodes (we intend to drop control edges from those).
@ -266,14 +268,14 @@ Status GetEngineInfo(const Graph* g,
}
info->device = DeviceNameUtils::ParsedNameToString(segment_device);
} else {
TfGpuId tf_gpu_id;
PlatformGpuId platform_gpu_id;
std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
if (tf_gpu_id.value() >= 0) {
TfDeviceId tf_device_id;
PlatformDeviceId platform_device_id;
std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
if (tf_device_id.value() >= 0) {
DeviceNameUtils::ParsedName parsed_name;
parsed_name.type = "GPU";
parsed_name.has_type = true;
parsed_name.id = tf_gpu_id.value();
parsed_name.id = tf_device_id.value();
parsed_name.has_id = true;
info->device = DeviceNameUtils::ParsedNameToString(parsed_name);
} else {
@ -640,17 +642,17 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
engine.device.empty()) {
// If device is not set, use the first found GPU device for the conversion.
TfGpuId tf_gpu_id;
PlatformGpuId platform_gpu_id;
std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
cuda_device_id = platform_gpu_id.value();
TfDeviceId tf_device_id;
PlatformDeviceId platform_device_id;
std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
cuda_device_id = platform_device_id.value();
if (cuda_device_id >= 0) {
GPUOptions gpu_options;
// If the TF to CUDA gpu id mapping exists, the device and corresponding
// allocator must have been initialized already, so the
// GetGPUAllocator() call won't create a new allocator.
dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
gpu_options, tf_device_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
}
return std::make_pair(cuda_device_id, dev_allocator);
}


@ -1044,25 +1044,25 @@ Status TRTEngineOp::AllocateCalibrationResources(
}
cres->calibrator_.reset(
new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
const int platform_gpu_id =
const int platform_device_id =
ctx->device()->tensorflow_gpu_device_info()->gpu_id;
if (platform_gpu_id < 0) {
if (platform_device_id < 0) {
LOG(ERROR) << "Can't get gpu_device_info from context->device()";
return errors::InvalidArgument(
"Context->device doesn't contain device info!");
}
cache_res->Ref();
cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id,
cres->thr_.reset(new std::thread([this, cres, shapes, platform_device_id,
cache_res]() {
core::ScopedUnref sc(cache_res);
VLOG(1) << "Starting calibration thread on device " << platform_gpu_id
VLOG(1) << "Starting calibration thread on device " << platform_device_id
<< ", Calibration Resource @ " << cres;
auto err = cudaSetDevice(platform_gpu_id);
auto err = cudaSetDevice(platform_device_id);
if (err != cudaSuccess) {
// TODO(aaroey): should return error here.
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
LOG(ERROR) << "Couldn't set cuda device to " << platform_device_id
<< " in calibration thread";
}
std::vector<PartialTensorShape> partial_shapes(shapes.begin(),


@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
GPUOptions(), TfDeviceId(0), memory_limit, /*peer_gpu_ids=*/{});
host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
}


@ -26,11 +26,11 @@ limitations under the License.
namespace tensorflow {
GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
PlatformGpuId platform_gpu_id)
GPUcudaMallocAllocator::GPUcudaMallocAllocator(
Allocator* allocator, PlatformDeviceId platform_device_id)
: base_allocator_(allocator) {
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
platform_gpu_id)
platform_device_id)
.ValueOrDie();
}


@ -32,7 +32,7 @@ namespace tensorflow {
class GPUcudaMallocAllocator : public Allocator {
public:
explicit GPUcudaMallocAllocator(Allocator* allocator,
PlatformGpuId platform_gpu_id);
PlatformDeviceId platform_device_id);
~GPUcudaMallocAllocator() override;
string Name() override { return "gpu_debug"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;


@ -42,12 +42,12 @@ static std::string GetCudaErrorMessage(CUresult result) {
#endif // GOOGLE_CUDA
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
PlatformGpuId platform_gpu_id, size_t pool_size, bool reserve_memory,
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
bool compute_stats)
: name_(absl::StrCat("gpu_async_", platform_gpu_id.value())) {
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
platform_gpu_id)
platform_device_id)
.ValueOrDie();
// Initialized here as it only exists if compiled with a recent
// enough CUDA.
@ -56,7 +56,7 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
// WAR a CUDA 11.2 driver bug for multiple-GPU setups. It currently
// requires that the context on GPU 0 is initialized, which isn't the
// case for TF+horovod.
if (platform_gpu_id.value() > 0) {
if (platform_device_id.value() > 0) {
CUcontext pctx; // We lose track of it. But this is fine.
if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
LOG(FATAL) // Crash OK.
@ -65,9 +65,10 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
int cuda_malloc_async_supported;
if (auto status = cuDeviceGetAttribute(
&cuda_malloc_async_supported,
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, platform_gpu_id.value()))
if (auto status =
cuDeviceGetAttribute(&cuda_malloc_async_supported,
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
platform_device_id.value()))
LOG(FATAL) << // Crash OK.
"Failed to get device attribute: " << GetCudaErrorMessage(status);
if (!cuda_malloc_async_supported)
@ -79,12 +80,13 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
LOG(FATAL) // Crash OK.
<< "Failed to create CUDA stream: " << GetCudaErrorMessage(status);
if (auto status = cuDeviceGetDefaultMemPool(&pool_, platform_gpu_id.value()))
if (auto status =
cuDeviceGetDefaultMemPool(&pool_, platform_device_id.value()))
LOG(FATAL) << // Crash OK.
"Failed to get default CUDA pool: " << GetCudaErrorMessage(status);
VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
<< platform_gpu_id.value() << " with pool size of: " << pool_size
<< platform_device_id.value() << " with pool size of: " << pool_size
<< " this ptr: " << this;
uint64_t pool_size_64 = pool_size;
if (auto status = cuMemPoolSetAttribute(


@ -64,7 +64,7 @@ namespace tensorflow {
// driver can return the excess memory to other processes.
class GpuCudaMallocAsyncAllocator : public Allocator {
public:
explicit GpuCudaMallocAsyncAllocator(PlatformGpuId platform_gpu_id,
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
size_t pool_size,
bool reserve_memory = false,
bool compute_stats = false);


@ -76,10 +76,10 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
// GPUDebugAllocator
// -----------------------------------------------------------------------------
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
PlatformGpuId platform_gpu_id)
PlatformDeviceId platform_device_id)
: base_allocator_(allocator) {
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
platform_gpu_id)
platform_device_id)
.ValueOrDie();
}
@ -155,10 +155,10 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
PlatformGpuId platform_gpu_id)
PlatformDeviceId platform_device_id)
: base_allocator_(allocator) {
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
platform_gpu_id)
platform_device_id)
.ValueOrDie();
}


@ -34,7 +34,7 @@ namespace tensorflow {
class GPUDebugAllocator : public Allocator {
public:
explicit GPUDebugAllocator(Allocator* allocator,
PlatformGpuId platform_gpu_id);
PlatformDeviceId platform_device_id);
~GPUDebugAllocator() override;
string Name() override { return "gpu_debug"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@ -64,7 +64,7 @@ class GPUDebugAllocator : public Allocator {
class GPUNanResetAllocator : public Allocator {
public:
explicit GPUNanResetAllocator(Allocator* allocator,
PlatformGpuId platform_gpu_id);
PlatformDeviceId platform_device_id);
~GPUNanResetAllocator() override;
string Name() override { return "gpu_nan_reset"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;


@ -37,7 +37,7 @@ limitations under the License.
namespace tensorflow {
namespace {
se::StreamExecutor* ExecutorForPlatformGpuId(
se::StreamExecutor* ExecutorForPlatformDeviceId(
PlatformDeviceId platform_device_id) {
return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
platform_device_id)
@ -45,12 +45,12 @@ se::StreamExecutor* ExecutorForPlatformGpuId(
}
TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
const PlatformGpuId platform_gpu_id(0);
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
const PlatformDeviceId platform_device_id(0);
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id);
platform_device_id);
for (int s : {8}) {
std::vector<int64> cpu_array(s);
@ -72,13 +72,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
for (int s : {8, 211}) {
EXPECT_DEATH(
{
const PlatformGpuId platform_gpu_id(0);
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
const PlatformDeviceId platform_device_id(0);
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
DeviceMemAllocator* sub_allocator =
new DeviceMemAllocator(stream_exec, platform_gpu_id,
new DeviceMemAllocator(stream_exec, platform_device_id,
false /*use_unified_memory*/, {}, {});
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id);
platform_device_id);
std::vector<int64> cpu_array(s);
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@ -108,13 +108,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
for (int s : {8, 22}) {
EXPECT_DEATH(
{
const PlatformGpuId platform_gpu_id(0);
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
const PlatformDeviceId platform_device_id(0);
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
DeviceMemAllocator* sub_allocator =
new DeviceMemAllocator(stream_exec, platform_gpu_id,
new DeviceMemAllocator(stream_exec, platform_device_id,
false /*use_unified_memory*/, {}, {});
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id);
platform_device_id);
std::vector<int64> cpu_array(s);
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@ -141,12 +141,12 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
}
TEST(GPUDebugAllocatorTest, ResetToNan) {
const PlatformGpuId platform_gpu_id(0);
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
const PlatformDeviceId platform_device_id(0);
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id);
platform_device_id);
std::vector<float> cpu_array(1024);
std::vector<float> cpu_array_result(1024);
@ -183,15 +183,15 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
}
TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
const PlatformGpuId platform_gpu_id(0);
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
const PlatformDeviceId platform_device_id(0);
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
// NaN reset must be the outer-most allocator.
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id),
platform_gpu_id);
platform_device_id),
platform_device_id);
std::vector<float> cpu_array(1024);
std::vector<float> cpu_array_result(1024);
@ -228,24 +228,24 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
}
TEST(GPUDebugAllocatorTest, TracksSizes) {
const PlatformGpuId platform_gpu_id(0);
const PlatformDeviceId platform_device_id(0);
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
false /*use_unified_memory*/, {}, {});
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id);
platform_device_id);
EXPECT_EQ(true, a.TracksAllocationSizes());
}
TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
const PlatformGpuId platform_gpu_id(0);
const PlatformDeviceId platform_device_id(0);
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
platform_gpu_id),
platform_gpu_id);
platform_device_id),
platform_device_id);
float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
EXPECT_EQ(4, a.RequestedSize(t1));
EXPECT_EQ(256, a.AllocatedSize(t1));


@ -120,7 +120,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
}
~EigenGpuStreamDevice() override {}
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
TfDeviceId tf_device_id, ::tensorflow::Allocator* alloc,
char* scratch) {
if (LogMemory::IsEnabled()) {
operation_ = context->op_kernel().name() + "/EigenAllocator";
@ -132,9 +132,10 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
stream_ = gpu_stream;
allocator_ = alloc;
PlatformGpuId platform_gpu_id;
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()];
PlatformDeviceId platform_device_id;
TF_CHECK_OK(
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
device_prop_ = &Eigen::m_deviceProperties[platform_device_id.value()];
}
const gpuStream_t& stream() const override { return *stream_; }
@ -233,18 +234,18 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
class BaseGPUDevice::StreamGroupFactory {
public:
// Returns the unique stream group for use with the stream defined by
// {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
// {tf_device_id, stream_group_within_gpu}, creating it if it does not yet
// exist.
// This function is thread safe.
BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
BaseGPUDevice::StreamGroup* GetOrCreate(TfDeviceId tf_device_id,
int stream_group_within_gpu,
se::StreamExecutor* executor,
const GPUOptions& options) {
mutex_lock guard(lock_);
StreamGroup* group =
&streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
&streams_[key_type(tf_device_id.value(), stream_group_within_gpu)];
if (!group->compute) {
int priority = GetPriority(tf_gpu_id.value(), options);
int priority = GetPriority(tf_device_id.value(), options);
group->priority = priority;
group->compute = GetStream(executor, priority);
group->compute->Init();
@ -339,8 +340,8 @@ class BaseGPUDevice::StreamGroupFactory {
private:
// Returns priority for the given virtual GPU id from the session options.
// Returns 0 if no virtual devices are specified.
int GetPriority(int tf_gpu_id, const GPUOptions& options) {
int id = tf_gpu_id;
int GetPriority(int tf_device_id, const GPUOptions& options) {
int id = tf_device_id;
int i = 0;
int priority = 0;
while (i < options.experimental().virtual_devices_size()) {
@ -378,7 +379,7 @@ class BaseGPUDevice::StreamGroupFactory {
BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
TfGpuId tf_gpu_id,
TfDeviceId tf_device_id,
const string& physical_device_desc,
Allocator* gpu_allocator, Allocator* cpu_allocator,
bool sync_every_op)
@ -388,7 +389,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
gpu_allocator_(gpu_allocator),
cpu_allocator_(cpu_allocator),
scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
tf_gpu_id_(tf_gpu_id),
tf_device_id_(tf_device_id),
sync_every_op_(sync_every_op) {
GPUProcessState::singleton()->EnableGPUDevice();
}
@ -410,7 +411,8 @@ Status BaseGPUDevice::InitScratchBuffers() {
Allocator::kAllocatorAlignment, scratch_buffer_size);
if (scratch_buffer == nullptr) {
return errors::FailedPrecondition(
"Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
"Failed to allocate scratch buffer for device ",
tf_device_id_.value());
}
se::DeviceMemory<char> mem(
se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
@ -423,16 +425,16 @@ Status BaseGPUDevice::InitScratchBuffers() {
Status BaseGPUDevice::Init(const SessionOptions& options) {
auto executor_status = DeviceIdUtil::ExecutorForTfDeviceId(
DEVICE_GPU, GPUMachineManager(), tf_gpu_id_);
DEVICE_GPU, GPUMachineManager(), tf_device_id_);
if (!executor_status.status().ok()) {
return errors::Internal("Failed to get StreamExecutor for device ",
tf_gpu_id_.value());
tf_device_id_.value());
}
executor_ = executor_status.ValueOrDie();
stream_ = StreamGroupFactory::Global().GetOrCreate(
tf_gpu_id_, 0, executor_, options.config.gpu_options());
tf_device_id_, 0, executor_, options.config.gpu_options());
device_context_ =
new GPUDeviceContext(0, stream_->compute,
#if TENSORFLOW_USE_ROCM
@ -461,7 +463,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
// The GPUKernelTracker will use this SharedCounter, instead of
// owning its own.
timing_counter =
GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id_);
GPUProcessState::singleton()->GPUAllocatorCounter(tf_device_id_);
DCHECK(timing_counter);
}
kernel_tracker_.reset(new GPUKernelTracker(
@ -473,10 +475,10 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
gpu_device_info_->stream = stream_->compute;
gpu_device_info_->default_context = device_context_;
gpu_device_info_->event_mgr = em_;
PlatformGpuId platform_gpu_id;
PlatformDeviceId platform_device_id;
TF_RETURN_IF_ERROR(
GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
gpu_device_info_->gpu_id = platform_gpu_id.value();
GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
gpu_device_info_->gpu_id = platform_device_id.value();
set_tensorflow_gpu_device_info(gpu_device_info_);
// Whether and how the GPU device uses its own threadpool.
@ -505,7 +507,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
// TODO(zhengxq): pin the thread to the same socket of the target GPU.
thread_pool_.reset(new thread::ThreadPool(
options.env, ThreadOptions(),
strings::StrCat("gpu_private_", tf_gpu_id_.value()),
strings::StrCat("gpu_private_", tf_device_id_.value()),
static_cast<int32>(gpu_thread_count),
!options.config.experimental().disable_thread_spinning(),
/*allocator=*/nullptr));
@ -531,8 +533,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
const int& stream_id) {
return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
" on GPU ", tf_gpu_id_.value(), " stream[", stream_id,
"]");
" on GPU ", tf_device_id_.value(), " stream[",
stream_id, "]");
}
void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
@ -624,8 +626,8 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
const auto stream_id = gpu_device_context->stream_id();
VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
<< op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
<< stream_id << "]";
<< op_kernel->type_string() << " on GPU" << tf_device_id_
<< " stream[" << stream_id << "]";
ScopedActivateExecutorContext scoped_activation{stream->parent()};
op_kernel->ComputeAsync(context, std::move(done));
@ -763,10 +765,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
TfGpuId tf_gpu_id, Allocator* base_allocator,
TfDeviceId tf_device_id, Allocator* base_allocator,
char* scratch) {
stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator,
scratch);
stream_device_.Reinitialize(context, gpu_stream, tf_device_id,
base_allocator, scratch);
}
const Eigen::GpuDevice& device() const override { return device_; }
@ -777,8 +779,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
};
// Parse 'visible_device_list' into a list of platform GPU ids.
Status ParseVisibleDeviceList(const string& visible_device_list,
std::vector<PlatformGpuId>* visible_gpu_order) {
Status ParseVisibleDeviceList(
const string& visible_device_list,
std::vector<PlatformDeviceId>* visible_gpu_order) {
visible_gpu_order->clear();
se::Platform* gpu_manager = GPUMachineManager();
@ -793,28 +796,28 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
} else {
const std::vector<string> order_str =
str_util::Split(visible_device_list, ',');
for (const string& platform_gpu_id_str : order_str) {
int32 platform_gpu_id;
if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
for (const string& platform_device_id_str : order_str) {
int32 platform_device_id;
if (!strings::safe_strto32(platform_device_id_str, &platform_device_id)) {
return errors::InvalidArgument(
"Could not parse entry in 'visible_device_list': '",
platform_gpu_id_str,
platform_device_id_str,
"'. visible_device_list = ", visible_device_list);
}
if (platform_gpu_id < 0 ||
platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
if (platform_device_id < 0 ||
platform_device_id >= gpu_manager->VisibleDeviceCount()) {
return errors::InvalidArgument(
"'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
"' but visible device count is ",
"'visible_device_list' listed an invalid GPU id '",
platform_device_id, "' but visible device count is ",
gpu_manager->VisibleDeviceCount());
}
visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id));
visible_gpu_order->push_back(PlatformDeviceId(platform_device_id));
}
}
// Validate no repeats.
std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(),
visible_gpu_order->end());
std::set<PlatformDeviceId> visible_device_set(visible_gpu_order->begin(),
visible_gpu_order->end());
if (visible_device_set.size() != visible_gpu_order->size()) {
return errors::InvalidArgument(
"visible_device_list contained a duplicate entry: ",
@ -825,8 +828,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
Status VerifyVirtualDeviceSettings(
const size_t num_gpus_to_use, const GPUOptions& gpu_options,
const std::vector<PlatformGpuId>& visible_gpu_order,
const std::vector<PlatformGpuId>& valid_platform_gpu_ids,
const std::vector<PlatformDeviceId>& visible_gpu_order,
const std::vector<PlatformDeviceId>& valid_platform_device_ids,
const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
CHECK(!virtual_devices.empty());
@ -849,11 +852,11 @@ Status VerifyVirtualDeviceSettings(
" #GPUs in visible_device_list: ", visible_gpu_order.size(),
" virtual_devices.size(): ", virtual_devices.size());
}
if (valid_platform_gpu_ids.size() != virtual_devices.size()) {
if (valid_platform_device_ids.size() != virtual_devices.size()) {
return errors::Unknown(
"The number of valid GPUs doesn't match the number of elements in "
"the virtual_devices list.",
" #valid GPUs: ", valid_platform_gpu_ids.size(),
" #valid GPUs: ", valid_platform_device_ids.size(),
" virtual_devices.size(): ", virtual_devices.size());
}
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@ -882,7 +885,7 @@ Status VerifyVirtualDeviceSettings(
i, " memory_limit_mb size: ", memory_limit_mb.size(),
" and priority size: ", priority.size());
}
const int gpu_id = valid_platform_gpu_ids[i].value();
const int gpu_id = valid_platform_device_ids[i].value();
auto it = supported_priority_ranges.find(gpu_id);
if (it == supported_priority_ranges.end()) {
return errors::Internal(
@ -950,19 +953,19 @@ int64 MinSystemMemory(int64 available_memory, int cc_major) {
}
// Get the memory limit for the virtual device being created on GPU with
// 'platform_gpu_id', when that virtual device is the only virtual device being
// created on that GPU.
// 'platform_device_id', when that virtual device is the only virtual device
// being created on that GPU.
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
PlatformGpuId platform_gpu_id,
PlatformDeviceId platform_device_id,
int64* memory_limit) {
int64 total_memory = 0;
int64 available_memory = 0;
se::StreamExecutor* se = DeviceIdUtil::ExecutorForPlatformDeviceId(
GPUMachineManager(), platform_gpu_id)
GPUMachineManager(), platform_device_id)
.ValueOrDie();
if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
return errors::Unknown("Failed to query available memory for GPU ",
platform_gpu_id.value());
platform_device_id.value());
}
int64 allocated_memory = 0;
@ -1037,7 +1040,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
DCHECK_EQ(stream_id, 0);
const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
stream_->compute->implementation()->GpuStreamMemberHack());
concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator,
concrete_device->Reinitialize(context, gpu_stream, tf_device_id_, allocator,
scratch_);
}
@ -1093,7 +1096,7 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
return Status::OK();
}
std::vector<PlatformGpuId> visible_gpu_order(device_count);
std::vector<PlatformDeviceId> visible_gpu_order(device_count);
std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &cached_device_ids_));
return Status::OK();
@ -1101,9 +1104,9 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
Status BaseGPUDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
TF_RETURN_IF_ERROR(CacheDeviceIds());
for (PlatformGpuId platform_gpu_id : cached_device_ids_) {
for (PlatformDeviceId platform_device_id : cached_device_ids_) {
const string device_name =
strings::StrCat("/physical_device:GPU:", platform_gpu_id.value());
strings::StrCat("/physical_device:GPU:", platform_device_id.value());
devices->push_back(device_name);
}
@ -1117,14 +1120,15 @@ Status BaseGPUDeviceFactory::GetDeviceDetails(
if (device_index < 0 || device_index > cached_device_ids_.size()) {
return errors::Internal("Invalid device index: ", device_index);
}
PlatformGpuId platform_gpu_id = cached_device_ids_[device_index];
PlatformDeviceId platform_device_id = cached_device_ids_[device_index];
TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
se::Platform* gpu_manager = GPUMachineManager();
if (gpu_manager == nullptr) {
return errors::Internal("Cannot get GPUMachineManager");
}
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
auto desc_status =
gpu_manager->DescriptionForDevice(platform_device_id.value());
if (!desc_status.ok()) {
return desc_status.status();
}
@ -1159,8 +1163,8 @@ Status BaseGPUDeviceFactory::CreateDevices(
num_gpus_to_use = iter->second;
}
const auto& gpu_options = options.config.gpu_options();
std::vector<PlatformGpuId> visible_gpu_order;
std::vector<PlatformGpuId> valid_platform_gpu_ids;
std::vector<PlatformDeviceId> visible_gpu_order;
std::vector<PlatformDeviceId> valid_platform_device_ids;
// If we aren't going to use any GPUs, don't initialize them.
// We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
// because it treats an empty gpu_options.visible_device_list as 'all GPUs
@ -1188,13 +1192,13 @@ Status BaseGPUDeviceFactory::CreateDevices(
}
TF_RETURN_IF_ERROR(
GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids));
GetValidDeviceIds(visible_gpu_order, &valid_platform_device_ids));
}
if (num_gpus_to_use > valid_platform_gpu_ids.size()) {
num_gpus_to_use = valid_platform_gpu_ids.size();
if (num_gpus_to_use > valid_platform_device_ids.size()) {
num_gpus_to_use = valid_platform_device_ids.size();
}
std::map<int, std::pair<int, int>> supported_priority_ranges;
if (!valid_platform_gpu_ids.empty()) {
if (!valid_platform_device_ids.empty()) {
// Save the original device.
int original_device = 0;
#if GOOGLE_CUDA
@ -1213,18 +1217,18 @@ Status BaseGPUDeviceFactory::CreateDevices(
// Force to implicitly initialize CUDA runtime on each valid GPU before
// CreateGPUDevice().
for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
for (PlatformDeviceId platform_device_id : valid_platform_device_ids) {
#if GOOGLE_CUDA
err = cudaSetDevice(platform_gpu_id.value());
err = cudaSetDevice(platform_device_id.value());
if (err != cudaSuccess) {
return errors::Internal(
"cudaSetDevice() on GPU:", platform_gpu_id.value(),
"cudaSetDevice() on GPU:", platform_device_id.value(),
" failed. Status: ", cudaGetErrorString(err));
}
err = cudaFree(nullptr);
if (err != cudaSuccess) {
return errors::Internal("CUDA runtime implicit initialization on GPU:",
platform_gpu_id.value(),
platform_device_id.value(),
" failed. Status: ", cudaGetErrorString(err));
}
int priority_low, priority_high;
@ -1237,19 +1241,19 @@ Status BaseGPUDeviceFactory::CreateDevices(
VLOG(1) << "Cuda stream priority range on GPU(" << original_device
<< "): " << priority_high << "," << priority_low;
supported_priority_ranges.insert(
std::make_pair(platform_gpu_id.value(),
std::make_pair(platform_device_id.value(),
std::make_pair(priority_low, priority_high)));
#elif TENSORFLOW_USE_ROCM
err = hipSetDevice(platform_gpu_id.value());
err = hipSetDevice(platform_device_id.value());
if (err != hipSuccess) {
return errors::Internal(
"hipSetDevice() on GPU:", platform_gpu_id.value(),
"hipSetDevice() on GPU:", platform_device_id.value(),
" failed. Status: ", hipGetErrorString(err));
}
err = hipFree(nullptr);
if (err != hipSuccess) {
return errors::Internal("ROCm runtime implicit initialization on GPU:",
platform_gpu_id.value(),
platform_device_id.value(),
" failed. Status: ", hipGetErrorString(err));
}
int priority_low, priority_high;
@ -1262,7 +1266,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
VLOG(1) << "HIP stream priority range on GPU(" << original_device
<< "): " << priority_high << "," << priority_low;
supported_priority_ranges.insert(
std::make_pair(platform_gpu_id.value(),
std::make_pair(platform_device_id.value(),
std::make_pair(priority_low, priority_high)));
#endif
}
@ -1306,9 +1310,9 @@ Status BaseGPUDeviceFactory::CreateDevices(
LOG(INFO) << line_buf;
for (int i = 0; i < visible_gpu_order.size(); ++i) {
line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
PlatformGpuId gpu_id_i = visible_gpu_order[i];
PlatformDeviceId gpu_id_i = visible_gpu_order[i];
for (int j = 0; j < visible_gpu_order.size(); ++j) {
PlatformGpuId gpu_id_j = visible_gpu_order[j];
PlatformDeviceId gpu_id_j = visible_gpu_order[j];
if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
im.directed_links.end()) {
line_buf.append("Y ");
@ -1323,22 +1327,23 @@ Status BaseGPUDeviceFactory::CreateDevices(
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
if (!virtual_devices.empty()) {
TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids,
supported_priority_ranges));
num_gpus_to_use, gpu_options, visible_gpu_order,
valid_platform_device_ids, supported_priority_ranges));
// We've verified that num_gpus_to_use >= virtual_devices.size().
num_gpus_to_use = virtual_devices.size();
CHECK(gpu_options.visible_device_list().empty() ||
valid_platform_gpu_ids == visible_gpu_order);
valid_platform_device_ids == visible_gpu_order);
}
int next_tf_gpu_id = 0;
int next_tf_device_id = 0;
std::vector<int64> memory_limit_bytes;
for (int i = 0; i < num_gpus_to_use; ++i) {
const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i];
const PlatformDeviceId platform_device_id = valid_platform_device_ids[i];
if (virtual_devices.empty() ||
virtual_devices.Get(i).memory_limit_mb_size() == 0) {
int64 single_virtual_device_memory_limit = 0;
TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
gpu_options, platform_gpu_id, &single_virtual_device_memory_limit));
TF_RETURN_IF_ERROR(
SingleVirtualDeviceMemoryLimit(gpu_options, platform_device_id,
&single_virtual_device_memory_limit));
memory_limit_bytes.push_back(single_virtual_device_memory_limit);
} else {
const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
@ -1347,36 +1352,37 @@ Status BaseGPUDeviceFactory::CreateDevices(
return static_cast<int64>(mb) * (1ll << 20);
});
}
while (next_tf_gpu_id < memory_limit_bytes.size()) {
TfGpuId tf_gpu_id(next_tf_gpu_id);
++next_tf_gpu_id;
TF_RETURN_IF_ERROR(
GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id));
while (next_tf_device_id < memory_limit_bytes.size()) {
TfDeviceId tf_device_id(next_tf_device_id);
++next_tf_device_id;
TF_RETURN_IF_ERROR(GpuIdManager::InsertTfPlatformDeviceIdPair(
tf_device_id, platform_device_id));
}
}
const int num_tf_gpus = next_tf_gpu_id;
const int num_tf_gpus = next_tf_device_id;
LocalityMap device_localities;
TF_RETURN_IF_ERROR(
GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
// Build the GPUDevices
CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
CHECK_EQ(next_tf_device_id, memory_limit_bytes.size());
for (int di = 0; di < num_tf_gpus; ++di) {
TfGpuId tf_gpu_id(di);
TfDeviceId tf_device_id(di);
int64 bytes = memory_limit_bytes[di];
auto it = device_localities.find(tf_gpu_id);
auto it = device_localities.find(tf_device_id);
if (it == device_localities.end()) {
return errors::Internal("Failed to find DeviceLocality for GPU device ",
tf_gpu_id.value());
tf_device_id.value());
}
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
it->second, num_tf_gpus, devices));
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_device_id,
bytes, it->second, num_tf_gpus,
devices));
}
return Status::OK();
}
static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
static string GetShortDeviceDescription(PlatformDeviceId platform_device_id,
const se::DeviceDescription& desc) {
#if GOOGLE_CUDA
int cc_major;
@ -1386,54 +1392,56 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
cc_minor = 0;
}
// LINT.IfChange
return strings::StrCat("device: ", platform_gpu_id.value(),
return strings::StrCat("device: ", platform_device_id.value(),
", name: ", desc.name(),
", pci bus id: ", desc.pci_bus_id(),
", compute capability: ", cc_major, ".", cc_minor);
// LINT.ThenChange(//tensorflow/python/framework/gpu_util.py)
#elif TENSORFLOW_USE_ROCM
return strings::StrCat("device: ", platform_gpu_id.value(),
return strings::StrCat("device: ", platform_device_id.value(),
", name: ", desc.name(),
", pci bus id: ", desc.pci_bus_id());
#endif
}
Status BaseGPUDeviceFactory::CreateGPUDevice(
const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
const SessionOptions& options, const string& name_prefix,
TfDeviceId tf_device_id, int64 memory_limit,
const DeviceLocality& dev_locality, size_t num_tf_gpus,
std::vector<std::unique_ptr<Device>>* devices) {
CHECK_GE(tf_gpu_id.value(), 0);
CHECK_GE(tf_device_id.value(), 0);
const string device_name =
strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
strings::StrCat(name_prefix, "/device:GPU:", tf_device_id.value());
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
tf_gpu_id);
PlatformGpuId platform_gpu_id;
tf_device_id);
PlatformDeviceId platform_device_id;
TF_RETURN_IF_ERROR(
GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
int numa_node = dev_locality.numa_node();
se::Platform* gpu_manager = GPUMachineManager();
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
auto desc_status =
gpu_manager->DescriptionForDevice(platform_device_id.value());
if (!desc_status.ok()) {
return desc_status.status();
}
auto desc = desc_status.ConsumeValueOrDie();
std::vector<TfGpuId> peer_gpu_ids;
std::vector<TfDeviceId> peer_gpu_ids;
peer_gpu_ids.reserve(num_tf_gpus);
for (int id = 0; id < num_tf_gpus; ++id) {
TfGpuId peer_tf_gpu_id(id);
if (peer_tf_gpu_id != tf_gpu_id) {
peer_gpu_ids.push_back(peer_tf_gpu_id);
TfDeviceId peer_tf_device_id(id);
if (peer_tf_device_id != tf_device_id) {
peer_gpu_ids.push_back(peer_tf_device_id);
}
}
GPUProcessState* process_state = GPUProcessState::singleton();
Allocator* gpu_allocator = process_state->GetGPUAllocator(
options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
options.config.gpu_options(), tf_device_id, memory_limit, peer_gpu_ids);
if (gpu_allocator == nullptr) {
return errors::Internal("Failed to get memory allocator for TF GPU ",
tf_gpu_id.value(), " with ", memory_limit,
tf_device_id.value(), " with ", memory_limit,
" bytes of memory.");
}
absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
@ -1441,7 +1449,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
return errors::Internal("No allocator statistics");
}
// 'memory_limit' is the required memory size, but if the allocator with
// given tf_gpu_id was created before, we'll use it instead of creating a
// given tf_device_id was created before, we'll use it instead of creating a
// new one (as TF gpu device is a shared resource), in which case the actual
// memory limit represented by 'stats.bytes_limit' used by that allocator
// may be different (which should be an error).
@ -1451,11 +1459,11 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, *desc),
tf_device_id, GetShortDeviceDescription(platform_device_id, *desc),
gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
<< (bytes_limit >> 20) << " MB memory) -> physical GPU ("
<< GetShortDeviceDescription(platform_gpu_id, *desc) << ")";
<< GetShortDeviceDescription(platform_device_id, *desc) << ")";
TF_RETURN_IF_ERROR(gpu_device->Init(options));
devices->push_back(std::move(gpu_device));
@ -1463,13 +1471,13 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
}
namespace {
std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>>
std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
GetPeerAccessMap(se::Platform* platform,
const std::vector<PlatformGpuId>& visible_gpu_order) {
std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map(
new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>);
for (PlatformGpuId platform_gpu_i : visible_gpu_order) {
for (PlatformGpuId platform_gpu_j : visible_gpu_order) {
const std::vector<PlatformDeviceId>& visible_gpu_order) {
std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
map(new std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>);
for (PlatformDeviceId platform_gpu_i : visible_gpu_order) {
for (PlatformDeviceId platform_gpu_j : visible_gpu_order) {
se::StreamExecutor* from =
DeviceIdUtil::ExecutorForPlatformDeviceId(platform, platform_gpu_i)
.ValueOrDie();
@ -1487,7 +1495,7 @@ GetPeerAccessMap(se::Platform* platform,
} // namespace
Status BaseGPUDeviceFactory::GetInterconnectMaps(
const std::vector<PlatformGpuId>& visible_gpu_order,
const std::vector<PlatformDeviceId>& visible_gpu_order,
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
// The default interconnect map is obtained from the StreamExecutor.
auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
@ -1495,8 +1503,8 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
InterconnectMap& imap = maps->at(0);
imap.name = "StreamExecutor";
imap.strength = InterconnectMap::kStreamExecutorStrength;
for (PlatformGpuId gpu_id_i : visible_gpu_order) {
for (PlatformGpuId gpu_id_j : visible_gpu_order) {
for (PlatformDeviceId gpu_id_i : visible_gpu_order) {
for (PlatformDeviceId gpu_id_j : visible_gpu_order) {
if (gpu_id_i == gpu_id_j) continue;
if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
imap.directed_links.insert({gpu_id_i, gpu_id_j});
@ -1509,21 +1517,21 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
Status BaseGPUDeviceFactory::GetDeviceLocalities(
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
LocalityMap* localities) {
std::vector<TfGpuId> all_tf_gpu_ids;
all_tf_gpu_ids.reserve(num_tf_gpus);
std::vector<TfDeviceId> all_tf_device_ids;
all_tf_device_ids.reserve(num_tf_gpus);
for (int i = 0; i < num_tf_gpus; ++i) {
all_tf_gpu_ids.push_back(TfGpuId(i));
all_tf_device_ids.push_back(TfDeviceId(i));
}
for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
PlatformGpuId platform_gpu_id;
for (TfDeviceId tf_device_id : all_tf_device_ids) {
PlatformDeviceId platform_device_id;
TF_RETURN_IF_ERROR(
GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
// Get GPU bus_id from its reported NUMA affinity. Because GPUs are
// virtualized in some environments, we can't just use the GPU id.
// NUMA locales are indexed from 0, buses are indexed from 1.
se::Platform* gpu_manager = GPUMachineManager();
auto desc_status =
gpu_manager->DescriptionForDevice(platform_gpu_id.value());
gpu_manager->DescriptionForDevice(platform_device_id.value());
if (!desc_status.ok()) {
return desc_status.status();
}
@ -1537,7 +1545,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
// trouble may manifest as slower than expected performance, or
// outright failures.
LOG(INFO) << "Could not identify NUMA node of platform GPU id "
<< platform_gpu_id
<< platform_device_id
<< ", defaulting to 0. Your kernel may not have been built "
<< "with NUMA support.";
numa_node = 0;
@ -1549,11 +1557,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
// Set LocalLinks from InterconnectMaps.
LocalLinks* links = dev_locality.mutable_links();
for (const InterconnectMap& imap : interconnects) {
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
PlatformGpuId platform_gpu_dst;
for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
PlatformDeviceId platform_gpu_dst;
TF_RETURN_IF_ERROR(
GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) !=
GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
if (imap.directed_links.find({platform_device_id, platform_gpu_dst}) !=
imap.directed_links.end()) {
InterconnectLink* ilink = links->add_link();
ilink->set_device_id(tf_gpu_dst.value());
@ -1565,12 +1573,12 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
// If this is one of multiple virtual GPUs on the same physical GPU
// add high strength links to the others.
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
if (tf_gpu_id == tf_gpu_dst) continue;
PlatformGpuId platform_gpu_dst;
for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
if (tf_device_id == tf_gpu_dst) continue;
PlatformDeviceId platform_gpu_dst;
TF_RETURN_IF_ERROR(
GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
if (platform_gpu_id == platform_gpu_dst) {
GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
if (platform_device_id == platform_gpu_dst) {
InterconnectLink* ilink = links->add_link();
ilink->set_device_id(tf_gpu_dst.value());
ilink->set_type("SAME_DEVICE");
@ -1578,10 +1586,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
}
}
(*localities)[tf_gpu_id] = dev_locality;
VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
<< tf_gpu_id << " on bus " << dev_locality.bus_id()
<< " numa: " << numa_node << " pci: " << desc->pci_bus_id()
(*localities)[tf_device_id] = dev_locality;
VLOG(1) << "GPUDevice PlatformDeviceId " << platform_device_id
<< " TfDeviceId " << tf_device_id << " on bus "
<< dev_locality.bus_id() << " numa: " << numa_node
<< " pci: " << desc->pci_bus_id()
<< " DeviceLocality: " << dev_locality.DebugString();
}
return Status::OK();
@ -1589,7 +1598,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
static int GetDefaultMinGPUMultiprocessorCount(
se::Platform* gpu_manager,
const std::vector<PlatformGpuId>& visible_gpu_order) {
const std::vector<PlatformDeviceId>& visible_gpu_order) {
static const int kDefaultMinGPUMultiprocessorCount = 8;
// Find the highest multi-processor count across all visible GPUs.
@ -1614,7 +1623,7 @@ static int GetDefaultMinGPUMultiprocessorCount(
static int GetMinGPUMultiprocessorCount(
se::Platform* gpu_manager,
const std::vector<PlatformGpuId>& visible_gpu_order) {
const std::vector<PlatformDeviceId>& visible_gpu_order) {
const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
if (tf_min_gpu_core_count == nullptr ||
@ -1704,14 +1713,14 @@ std::vector<int> GetSupportedAMDGPUISAVersions() {
} // namespace
Status BaseGPUDeviceFactory::EnablePeerAccess(
const std::vector<PlatformGpuId>& visible_gpu_order) {
const std::vector<PlatformDeviceId>& visible_gpu_order) {
se::Platform* gpu_manager = GPUMachineManager();
int possible_peer_count = 0;
int enabled_peer_count = 0;
for (int i = 0; i < visible_gpu_order.size(); ++i) {
const PlatformGpuId platform_gpu_i = visible_gpu_order[i];
const PlatformDeviceId platform_gpu_i = visible_gpu_order[i];
for (int j = 0; j < visible_gpu_order.size(); ++j) {
const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
const PlatformDeviceId platform_gpu_j = visible_gpu_order[j];
// We have already validated that ExecutorForDevice() calls return OK.
se::StreamExecutor* from =
DeviceIdUtil::ExecutorForPlatformDeviceId(gpu_manager, platform_gpu_i)
@ -1748,8 +1757,8 @@ Status BaseGPUDeviceFactory::EnablePeerAccess(
}
Status BaseGPUDeviceFactory::GetValidDeviceIds(
const std::vector<PlatformGpuId>& visible_gpu_order,
std::vector<PlatformGpuId>* ids) {
const std::vector<PlatformDeviceId>& visible_gpu_order,
std::vector<PlatformDeviceId>* ids) {
se::Platform* gpu_manager = GPUMachineManager();
for (int i = 0; i < visible_gpu_order.size(); ++i) {
int visible_gpu_id = visible_gpu_order[i].value();
@ -1834,7 +1843,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
// Filter out devices that don't have the right capability or power.
for (int i = 0; i < visible_gpu_order.size(); ++i) {
const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
const PlatformDeviceId visible_gpu_id = visible_gpu_order[i];
auto description_status =
gpu_manager->DescriptionForDevice(visible_gpu_id.value());
if (!description_status.ok()) {
@ -1904,7 +1913,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
if (!ids->empty()) {
std::vector<int> raw_ids(ids->size());
std::transform(ids->begin(), ids->end(), raw_ids.begin(),
[](PlatformGpuId id) -> int { return id.value(); });
[](PlatformDeviceId id) -> int { return id.value(); });
LOG(INFO) << "Adding visible gpu devices: " << absl::StrJoin(raw_ids, ", ");
}


@ -53,7 +53,8 @@ class BaseGPUDevice : public LocalDevice {
public:
BaseGPUDevice(const SessionOptions& options, const std::string& name,
Bytes memory_limit, const DeviceLocality& locality,
TfGpuId tf_gpu_id, const std::string& physical_device_desc,
TfDeviceId tf_device_id,
const std::string& physical_device_desc,
Allocator* gpu_allocator, Allocator* cpu_allocator,
bool sync_every_op);
@ -87,9 +88,10 @@ class BaseGPUDevice : public LocalDevice {
// Returns the platform GPU id of this device within the native driver system;
// e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
int gpu_id() const {
PlatformGpuId platform_gpu_id;
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
return platform_gpu_id.value();
PlatformDeviceId platform_device_id;
TF_CHECK_OK(
GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
return platform_device_id.value();
}
// The executor that provides control for the device; e.g., for CUDA this
@ -146,7 +148,7 @@ class BaseGPUDevice : public LocalDevice {
GPUDeviceContext* device_context_;
GpuDeviceInfo* gpu_device_info_ = nullptr;
mutex trace_mu_;
TfGpuId tf_gpu_id_;
TfDeviceId tf_device_id_;
const bool sync_every_op_ = false;
EventMgr* em_ = nullptr;
std::unique_ptr<thread::ThreadPool> thread_pool_;
@ -325,53 +327,56 @@ class BaseGPUDeviceFactory : public DeviceFactory {
int32 strength;
static const int kSameDeviceStrength;
static const int kStreamExecutorStrength;
std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links;
std::set<std::pair<PlatformDeviceId, PlatformDeviceId>> directed_links;
};
protected:
// Populates *maps with interconnect maps for all local direct access
// pathways between GPUs.
virtual Status GetInterconnectMaps(
const std::vector<PlatformGpuId>& visible_gpu_order,
const std::vector<PlatformDeviceId>& visible_gpu_order,
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
struct TfGpuIdHash {
std::size_t operator()(const TfGpuId& id) const noexcept {
struct TfDeviceIdHash {
std::size_t operator()(const TfDeviceId& id) const noexcept {
return std::hash<int>{}(id.value());
}
};
typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
typedef std::unordered_map<TfDeviceId, DeviceLocality, TfDeviceIdHash>
LocalityMap;
// Populates *localities with the DeviceLocality descriptor for
// every TfGpuId.
// every TfDeviceId.
virtual Status GetDeviceLocalities(
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
LocalityMap* localities);
private:
// Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
// 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
// vector.
// Creates a BaseGPUDevice associated with 'tf_device_id', allocates
// (strictly) 'memory_limit' bytes of GPU memory to it, and adds it to the
// 'devices' vector.
Status CreateGPUDevice(const SessionOptions& options,
const std::string& name_prefix, TfGpuId tf_gpu_id,
int64 memory_limit, const DeviceLocality& dev_locality,
size_t num_tf_gpus,
const std::string& name_prefix,
TfDeviceId tf_device_id, int64 memory_limit,
const DeviceLocality& dev_locality, size_t num_tf_gpus,
std::vector<std::unique_ptr<Device>>* devices);
virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
const SessionOptions& options, const string& name, Bytes memory_limit,
const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
const DeviceLocality& dev_locality, TfDeviceId tf_device_id,
const string& physical_device_desc, Allocator* gpu_allocator,
Allocator* cpu_allocator) = 0;
Status EnablePeerAccess(const std::vector<PlatformGpuId>& visible_gpu_order);
Status EnablePeerAccess(
const std::vector<PlatformDeviceId>& visible_gpu_order);
// Returns into 'ids' the list of valid platform GPU ids, in the order that
// they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
// based upon 'visible_gpu_order' which was generated by parsing
// GPUOptions::visible_device_list which is a comma-separated list of CUDA or
// ROCm GPU ids.
Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
std::vector<PlatformGpuId>* ids);
Status GetValidDeviceIds(
const std::vector<PlatformDeviceId>& visible_gpu_order,
std::vector<PlatformDeviceId>* ids);
// Cache the valid device IDs if not already cached. Cached IDs are stored in
// field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to
@ -379,14 +384,14 @@ class BaseGPUDeviceFactory : public DeviceFactory {
// devices should be treated as visible, like ListPhysicalDevices.
Status CacheDeviceIds();
// visible_gpu_initialized_[platform_gpu_id] is true if visible GPU
// platform_gpu_id has been initialized by the process.
// visible_gpu_initialized_[platform_device_id] is true if visible GPU
// platform_device_id has been initialized by the process.
std::unordered_map<int, bool> visible_gpu_initialized_;
// Cached device IDs, as returned by GetValidDeviceIds when every physical
// device is visible. Cache should not be used if some devices are not
// visible.
std::vector<PlatformGpuId> cached_device_ids_;
std::vector<PlatformDeviceId> cached_device_ids_;
};
} // namespace tensorflow
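As the GetValidDeviceIds comment above notes, the order of 'visible_gpu_order' (parsed from GPUOptions::visible_device_list) determines how platform ids map to TF device ids. A hypothetical illustration (the list value "2,0" is made up):

// For GPUOptions::visible_device_list = "2,0":
std::vector<PlatformDeviceId> visible_gpu_order = {PlatformDeviceId(2),
                                                   PlatformDeviceId(0)};
// GetValidDeviceIds keeps this order, so after device creation
// TfDeviceId(0) ("/device:GPU:0") maps to PlatformDeviceId(2) and
// TfDeviceId(1) ("/device:GPU:1") maps to PlatformDeviceId(0).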


@ -30,9 +30,9 @@ class GPUDevice : public BaseGPUDevice {
public:
GPUDevice(const SessionOptions& options, const string& name,
Bytes memory_limit, const DeviceLocality& locality,
TfGpuId tf_gpu_id, const string& physical_device_desc,
TfDeviceId tf_device_id, const string& physical_device_desc,
Allocator* gpu_allocator, Allocator* cpu_allocator)
: BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
: BaseGPUDevice(options, name, memory_limit, locality, tf_device_id,
physical_device_desc, gpu_allocator, cpu_allocator,
false /* sync every op */) {
if (options.config.has_gpu_options()) {
@ -63,11 +63,11 @@ class GPUDeviceFactory : public BaseGPUDeviceFactory {
private:
std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
const SessionOptions& options, const string& name, Bytes memory_limit,
const DeviceLocality& locality, TfGpuId tf_gpu_id,
const DeviceLocality& locality, TfDeviceId tf_device_id,
const string& physical_device_desc, Allocator* gpu_allocator,
Allocator* cpu_allocator) override {
return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
tf_gpu_id, physical_device_desc,
tf_device_id, physical_device_desc,
gpu_allocator, cpu_allocator);
}
};
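For context, the concrete factory is wired into the device layer by a registration macro in the same file; a one-line sketch (the priority value is an assumption, not taken from this diff):
REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory, 210);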

View File

@ -30,7 +30,7 @@ namespace tensorflow {
namespace {
const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
int64 GetTotalGPUMemory(PlatformDeviceId gpu_id) {
se::StreamExecutor* se =
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
.ValueOrDie();
@ -40,7 +40,7 @@ int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
return total_memory;
}
Status GetComputeCapability(PlatformGpuId gpu_id, int* cc_major,
Status GetComputeCapability(PlatformDeviceId gpu_id, int* cc_major,
int* cc_minor) {
se::StreamExecutor* se =
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
@ -350,7 +350,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
// error.
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
int cc_major, cc_minor;
TF_ASSERT_OK(GetComputeCapability(PlatformGpuId(0), &cc_major, &cc_minor));
TF_ASSERT_OK(GetComputeCapability(PlatformDeviceId(0), &cc_major, &cc_minor));
// Exit early while running on Pascal or later GPUs.
if (cc_major >= 6) {
return;
@ -371,10 +371,10 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
// more memory than what is available on the device.
TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
static constexpr double kGpuMemoryFraction = 1.2;
static constexpr PlatformGpuId kPlatformGpuId(0);
static constexpr PlatformDeviceId kPlatformDeviceId(0);
int cc_major, cc_minor;
TF_ASSERT_OK(GetComputeCapability(kPlatformGpuId, &cc_major, &cc_minor));
TF_ASSERT_OK(GetComputeCapability(kPlatformDeviceId, &cc_major, &cc_minor));
// Exit early if running on pre-Pascal GPUs.
if (cc_major < 6) {
LOG(INFO)
@ -389,8 +389,9 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
ASSERT_EQ(1, devices.size());
int64 memory_limit = devices[0]->attributes().memory_limit();
ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kPlatformGpuId) *
kGpuMemoryFraction));
ASSERT_EQ(memory_limit,
static_cast<int64>(GetTotalGPUMemory(kPlatformDeviceId) *
kGpuMemoryFraction));
AllocatorAttributes allocator_attributes = AllocatorAttributes();
allocator_attributes.set_gpu_compatible(true);

View File

@ -17,13 +17,6 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device/device_id.h"
namespace tensorflow {
// TODO(annarev): remove these aliases after all references are updated
// to use device ids.
typedef TfDeviceId TfGpuId;
typedef PlatformDeviceId PlatformGpuId;
} // namespace tensorflow
// TODO(sanjoy): Delete the header and forward the references.
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
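With the TfGpuId/PlatformGpuId aliases removed, call sites spell out the device-id types directly. A representative before/after sketch, following the patterns used elsewhere in this change:
// Before (through the now-removed aliases):
//   TfGpuId tf_gpu_id(0);
//   PlatformGpuId platform_gpu_id;
//   TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
//
// After:
TfDeviceId tf_device_id(0);
PlatformDeviceId platform_device_id;
TF_CHECK_OK(
    GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));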

View File

@ -20,16 +20,16 @@ limitations under the License.
namespace tensorflow {
Status GpuIdManager::InsertTfPlatformGpuIdPair(
TfDeviceId tf_gpu_id, PlatformDeviceId platform_gpu_id) {
return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_gpu_id,
platform_gpu_id);
Status GpuIdManager::InsertTfPlatformDeviceIdPair(
TfDeviceId tf_device_id, PlatformDeviceId platform_device_id) {
return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_device_id,
platform_device_id);
}
Status GpuIdManager::TfToPlatformGpuId(TfDeviceId tf_gpu_id,
PlatformDeviceId* platform_gpu_id) {
return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_gpu_id,
platform_gpu_id);
Status GpuIdManager::TfToPlatformDeviceId(
TfDeviceId tf_device_id, PlatformDeviceId* platform_device_id) {
return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_device_id,
platform_device_id);
}
void GpuIdManager::TestOnlyReset() { DeviceIdManager::TestOnlyReset(); }

View File

@ -21,17 +21,18 @@ limitations under the License.
namespace tensorflow {
// Class that maintains a map from TfGpuId to PlatformGpuId, and manages the
// translation between them.
// Class that maintains a map from TfDeviceId to PlatformDeviceId, and manages
// the translation between them.
class GpuIdManager {
public:
// Adds a mapping from tf_gpu_id to platform_gpu_id.
static Status InsertTfPlatformGpuIdPair(TfDeviceId tf_gpu_id,
PlatformDeviceId platform_gpu_id);
// Adds a mapping from tf_device_id to platform_device_id.
static Status InsertTfPlatformDeviceIdPair(
TfDeviceId tf_device_id, PlatformDeviceId platform_device_id);
// Gets the platform_gpu_id associated with tf_gpu_id. Returns OK if found.
static Status TfToPlatformGpuId(TfDeviceId tf_gpu_id,
PlatformDeviceId* platform_gpu_id);
// Gets the platform_device_id associated with tf_device_id. Returns OK if
// found.
static Status TfToPlatformDeviceId(TfDeviceId tf_device_id,
PlatformDeviceId* platform_device_id);
// Clears the map. Used in unit tests only.
static void TestOnlyReset();
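A hedged usage sketch for the class above; the helper name is hypothetical, and the error path mirrors how callers in this change report an unknown TF device id.
Status LogPlatformIdForGpu0() {
  TfDeviceId tf_device_id(0);
  PlatformDeviceId platform_device_id;
  Status s =
      GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
  if (!s.ok()) {
    return errors::Unavailable("Unknown TF GPU device with id ",
                               tf_device_id.value(), ": ", s.ToString());
  }
  VLOG(1) << "TF GPU 0 is platform device " << platform_device_id.value();
  return Status::OK();
}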

View File

@ -83,10 +83,10 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
process_state_ = ProcessState::singleton();
}
int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
// Return the NUMA node associated with the GPU's StreamExecutor.
se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
DEVICE_GPU, GPUMachineManager(), tf_gpu_id)
DEVICE_GPU, GPUMachineManager(), tf_device_id)
.ValueOrDie();
int numa_node = se->GetDeviceDescription().numa_node();
// bus_id must be non-negative. If the numa_node is not known,
@ -96,11 +96,11 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
static SubAllocator* CreateSubAllocator(
const GPUOptions& options, PlatformGpuId platform_gpu_id,
const GPUOptions& options, PlatformDeviceId platform_device_id,
const std::vector<SubAllocator::Visitor>& alloc_visitors,
size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
platform_gpu_id)
platform_device_id)
.ValueOrDie();
// FIXME(imintz): Observed OOM issues when using the virtual memory
@ -110,21 +110,21 @@ static SubAllocator* CreateSubAllocator(
// TODO(imintz): Remove the cuMemAlloc capability of this allocator.
if (options.per_process_gpu_memory_fraction() > 1.0 ||
options.experimental().use_unified_memory()) {
return new DeviceMemAllocator(executor, platform_gpu_id,
return new DeviceMemAllocator(executor, platform_device_id,
/*use_unified_memory=*/true, alloc_visitors,
{});
} else {
auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
executor->implementation()->GpuContextHack());
absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
PlatformGpuId platform_gpu_id;
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
platform_peer_gpu_ids.insert(platform_gpu_id);
for (const TfDeviceId tf_device_id : peer_gpu_ids) {
PlatformDeviceId platform_device_id;
TF_CHECK_OK(
    GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
platform_peer_gpu_ids.insert(platform_device_id);
}
std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
// Adjust virtual address space to be slightly larger than the physical
@ -133,7 +133,7 @@ static SubAllocator* CreateSubAllocator(
// TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
// the va space.
return GpuVirtualMemAllocator::Create(
alloc_visitors, {}, *gpu_context, platform_gpu_id,
alloc_visitors, {}, *gpu_context, platform_device_id,
/*virtual_address_space_size=*/total_bytes * 2,
platform_peer_gpu_ids_vec)
.ValueOrDie()
@ -141,7 +141,7 @@ static SubAllocator* CreateSubAllocator(
}
#else
return new DeviceMemAllocator(
executor, platform_gpu_id,
executor, platform_device_id,
(options.per_process_gpu_memory_fraction() > 1.0 ||
options.experimental().use_unified_memory()),
alloc_visitors, {});
@ -149,21 +149,21 @@ static SubAllocator* CreateSubAllocator(
}
Allocator* GPUProcessState::GetGPUAllocator(
const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
const std::vector<TfGpuId>& peer_gpu_ids) {
const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
const std::vector<TfDeviceId>& peer_gpu_ids) {
CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
const string& allocator_type = options.allocator_type();
mutex_lock lock(mu_);
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
tf_gpu_id);
tf_device_id);
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
gpu_allocators_.resize(tf_gpu_id.value() + 1);
if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
gpu_allocators_.resize(tf_device_id.value() + 1);
}
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
if (allocator_parts.allocator == nullptr) {
// Validate allocator types.
if (!allocator_type.empty() && allocator_type != "BFC") {
@ -171,19 +171,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
return nullptr;
}
PlatformGpuId platform_gpu_id;
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
int bus_id = BusIdForGPU(tf_gpu_id);
PlatformDeviceId platform_device_id;
TF_CHECK_OK(
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
int bus_id = BusIdForGPU(tf_device_id);
DCHECK_GE(bus_id, 0);
while (bus_id >= gpu_visitors_.size()) {
gpu_visitors_.push_back({});
}
auto* sub_allocator =
CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
total_bytes, peer_gpu_ids);
GPUBFCAllocator* gpu_bfc_allocator =
new GPUBFCAllocator(sub_allocator, total_bytes, options,
strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
GPUBFCAllocator* gpu_bfc_allocator = new GPUBFCAllocator(
sub_allocator, total_bytes, options,
strings::StrCat("GPU_", tf_device_id.value(), "_bfc"));
Allocator* gpu_allocator = gpu_bfc_allocator;
SharedCounter* timing_counter = nullptr;
if (options.experimental().timestamped_allocator()) {
@ -195,29 +196,30 @@ Allocator* GPUProcessState::GetGPUAllocator(
// distinctive patterns on both ends of allocated memory.
if (UseCudaMemoryGuardAllocator()) {
LOG(INFO) << "Using memory guard allocator for GPU.";
gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_device_id);
gpu_allocator =
new GPUNanResetAllocator(gpu_allocator, platform_device_id);
} else if (UseCudaMallocAllocator()) {
LOG(INFO) << "Using CUDA malloc allocator for GPU.";
// If true, passes all allocation requests through to cudaMalloc
// useful for doing memory debugging with tools like cuda-memcheck
// **WARNING** probably will not work in a multi-gpu scenario
gpu_allocator =
new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
new GPUcudaMallocAllocator(gpu_allocator, platform_device_id);
} else if (UseCudaMallocAsyncAllocator()) {
LOG(INFO) << "Using CUDA malloc Async allocator for GPU.";
// If true, passes all allocation requests through to cudaMallocAsync
// TODO: useful for doing memory debugging with tools like cuda-memcheck
// TODO: **WARNING** probably will not work in a multi-gpu scenario
gpu_allocator =
new GpuCudaMallocAsyncAllocator(platform_gpu_id, total_bytes);
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
}
Allocator* recording_allocator = nullptr;
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
ProcessState::MemDesc md;
md.loc = ProcessState::MemDesc::GPU;
md.dev_index = platform_gpu_id.value();
md.dev_index = platform_device_id.value();
md.gpu_registered = false;
md.nic_registered = true;
recording_allocator = new internal::RecordingAllocator(
@ -240,20 +242,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
tf_gpu_id);
tf_device_id);
mutex_lock l(mu_);
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
<< " but only have " << gpu_allocators_.size();
return nullptr;
}
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
if (allocator_parts.counter.get() == nullptr) {
SharedCounter* timing_counter = new SharedCounter;
allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
@ -303,7 +305,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
if (gpu_allocators_[i].allocator != nullptr) {
se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
TfGpuId(i))
TfDeviceId(i))
.ValueOrDie();
break;
}
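A hedged sketch of requesting an allocator through the API above; the 1 GiB limit and empty peer list are arbitrary example values, and the snippet is assumed to run after GPU devices have been created.
GPUProcessState* process_state = GPUProcessState::singleton();
TfDeviceId tf_device_id(0);
Allocator* gpu_allocator = process_state->GetGPUAllocator(
    GPUOptions(), tf_device_id, /*total_bytes=*/1ull << 30,
    /*peer_gpu_ids=*/{});
const int bus_id = process_state->BusIdForGPU(tf_device_id);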

View File

@ -72,18 +72,18 @@ class GPUProcessState {
//
// 'total_bytes' is the total number of bytes that should be made
// available to the allocator. The first call to this function for
// a given tf_gpu_id creates the allocator, so only the total_bytes
// a given tf_device_id creates the allocator, so only the total_bytes
// value from that first call takes effect.
//
// "Allocator type" describes the type of algorithm to use for the
// underlying allocator. REQUIRES: Must be a valid type (see
// config.proto for the list of supported strings).
//
// REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
// current system environment. Otherwise returns nullptr.
virtual Allocator* GetGPUAllocator(const GPUOptions& options,
TfGpuId tf_gpu_id, size_t total_bytes,
const std::vector<TfGpuId>& peer_gpu_ids);
// REQUIRES: tf_device_id must be a valid id for a BaseGPUDevice available in
// the current system environment. Otherwise returns nullptr.
virtual Allocator* GetGPUAllocator(
const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
const std::vector<TfDeviceId>& peer_gpu_ids);
int NumGPUAllocators() {
mutex_lock l(mu_);
@ -115,9 +115,9 @@ class GPUProcessState {
const SubAllocator::Visitor& visitor);
// Returns bus_id for the given GPU id.
virtual int BusIdForGPU(TfGpuId tf_gpu_id);
virtual int BusIdForGPU(TfDeviceId tf_device_id);
SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
SharedCounter* GPUAllocatorCounter(TfDeviceId tf_device_id);
protected:
// GPUProcessState is a singleton that should not normally be deleted except

View File

@ -44,7 +44,7 @@ StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
}
Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
PlatformGpuId gpu_id) {
PlatformDeviceId gpu_id) {
TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
SupportsVirtualAddressManagement(device));
if (!supports_virtual_address_management) {
@ -59,11 +59,11 @@ Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
/* static */ stream_executor::port::StatusOr<
std::unique_ptr<GpuVirtualMemAllocator>>
GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
const std::vector<Visitor>& free_visitors,
GpuContext& gpu_context, PlatformGpuId gpu_id,
size_t virtual_address_space_size,
const std::vector<PlatformGpuId>& peer_gpu_ids) {
GpuVirtualMemAllocator::Create(
const std::vector<Visitor>& alloc_visitors,
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
PlatformDeviceId gpu_id, size_t virtual_address_space_size,
const std::vector<PlatformDeviceId>& peer_gpu_ids) {
std::vector<GpuDeviceHandle> access_gpu_handles;
access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
@ -111,7 +111,8 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
GpuVirtualMemAllocator::GpuVirtualMemAllocator(
const std::vector<Visitor>& alloc_visitors,
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
PlatformDeviceId gpu_id,
const std::vector<GpuDeviceHandle> access_gpu_handles,
GpuDriver::VmemSpan vmem, size_t granularity)
: SubAllocator(alloc_visitors, free_visitors),
gpu_context_(gpu_context),

View File

@ -44,9 +44,9 @@ class GpuVirtualMemAllocator : public SubAllocator {
std::unique_ptr<GpuVirtualMemAllocator>>
Create(const std::vector<Visitor>& alloc_visitors,
const std::vector<Visitor>& free_visitors,
stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
size_t virtual_address_space_size,
const std::vector<PlatformGpuId>& peer_gpu_ids);
const std::vector<PlatformDeviceId>& peer_gpu_ids);
~GpuVirtualMemAllocator() override;
// Allocates memory at least as large as requested by num_bytes. Will be
@ -74,12 +74,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
GpuVirtualMemAllocator(
const std::vector<Visitor>& alloc_visitors,
const std::vector<Visitor>& free_visitors,
stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
stream_executor::gpu::GpuContext& gpu_context_;
PlatformGpuId gpu_id_;
PlatformDeviceId gpu_id_;
// Peer access is configured at mmap time so the allocator must be aware of
// all gpus that may want to read the memory. This list also includes the

View File

@ -35,7 +35,7 @@ constexpr size_t k2MiB{2 << 20};
// Creates an allocator with 8 MiB of virtual address space.
std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
PlatformGpuId gpu_id(0);
PlatformDeviceId gpu_id(0);
auto executor =
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
.ValueOrDie();
@ -48,7 +48,7 @@ std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
}
TEST(GpuVirtualMemAllocatorTest, SimpleAlloc) {
PlatformGpuId gpu_id(0);
PlatformDeviceId gpu_id(0);
auto executor =
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
.ValueOrDie();
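The elided helper bodies boil down to a Create call along these lines; this is a sketch consistent with the 8 MiB comment above, not the literal elided code, and it reuses the 'executor' and 'gpu_id' locals from the surrounding helper.
// Obtain the GpuContext from the executor, then create the allocator over an
// 8 MiB virtual span with no peer devices.
auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
    executor->implementation()->GpuContextHack());
auto allocator = GpuVirtualMemAllocator::Create(
                     /*alloc_visitors=*/{}, /*free_visitors=*/{}, *gpu_context,
                     gpu_id, /*virtual_address_space_size=*/4 * k2MiB,
                     /*peer_gpu_ids=*/{})
                     .ValueOrDie();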

View File

@ -92,14 +92,15 @@ Status SingleMachine::Provision() {
return errors::InvalidArgument(
strings::StrCat("Not able to parse GPU device name: ", dev.name()));
}
TfGpuId tf_gpu_id(parsed.id);
PlatformGpuId platform_gpu_id;
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
TfDeviceId tf_device_id(parsed.id);
PlatformDeviceId platform_device_id;
Status s =
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
if (!s.ok()) {
return errors::Unavailable("Unknown TF GPU device with id ",
tf_gpu_id.value(), ": ", s.ToString());
tf_device_id.value(), ": ", s.ToString());
}
attr = GetLocalGPUInfo(platform_gpu_id);
attr = GetLocalGPUInfo(platform_device_id);
} else if (dev.device_type().find("XLA") == string::npos) {
// Filter out the fake XLA devices to avoid double counting the actual
// hardware resources that are available.

View File

@ -74,14 +74,14 @@ DeviceProperties GetLocalCPUInfo() {
return device;
}
DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id) {
DeviceProperties device;
device.set_type("GPU");
#if GOOGLE_CUDA
cudaDeviceProp properties;
cudaError_t error =
cudaGetDeviceProperties(&properties, platform_gpu_id.value());
cudaGetDeviceProperties(&properties, platform_device_id.value());
if (error != cudaSuccess) {
device.set_type("UNKNOWN");
LOG(ERROR) << "Failed to get device properties, error code: " << error;
@ -117,7 +117,7 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
#elif TENSORFLOW_USE_ROCM
hipDeviceProp_t properties;
hipError_t error =
hipGetDeviceProperties(&properties, platform_gpu_id.value());
hipGetDeviceProperties(&properties, platform_device_id.value());
if (error != hipSuccess) {
device.set_type("UNKNOWN");
LOG(ERROR) << "Failed to get device properties, error code: " << error;
@ -156,16 +156,17 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
return GetLocalCPUInfo();
} else if (device.type == "GPU") {
if (device.has_id) {
TfGpuId tf_gpu_id(device.id);
PlatformGpuId platform_gpu_id;
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
TfDeviceId tf_device_id(device.id);
PlatformDeviceId platform_device_id;
Status s =
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
if (!s.ok()) {
LOG(ERROR) << s;
return unknown;
}
return GetLocalGPUInfo(platform_gpu_id);
return GetLocalGPUInfo(platform_device_id);
} else {
return GetLocalGPUInfo(PlatformGpuId(0));
return GetLocalGPUInfo(PlatformDeviceId(0));
}
}
return unknown;

View File

@ -28,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
// Returns the DeviceProperties for the specified GPU attached to the server on
// which grappler is running.
DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id);
DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id);
// Returns the DeviceProperties of the specified device
DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
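A hedged sketch of how these helpers are typically called (assumed to compile inside the tensorflow::grappler namespace; the device name is an arbitrary example):
DeviceProperties gpu0 = GetLocalGPUInfo(PlatformDeviceId(0));
DeviceNameUtils::ParsedName parsed;
if (DeviceNameUtils::ParseFullName(
        "/job:localhost/replica:0/task:0/device:GPU:0", &parsed)) {
  DeviceProperties props = GetDeviceInfo(parsed);
  VLOG(1) << props.type();  // "GPU", or "UNKNOWN" if the lookup failed.
}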

View File

@ -33,11 +33,11 @@ TEST(UtilsTest, GetLocalGPUInfo) {
DeviceProperties properties;
// Invalid platform GPU ID.
properties = GetLocalGPUInfo(PlatformGpuId(100));
properties = GetLocalGPUInfo(PlatformDeviceId(100));
EXPECT_EQ("UNKNOWN", properties.type());
// Succeed when a valid platform GPU id was inserted.
properties = GetLocalGPUInfo(PlatformGpuId(0));
properties = GetLocalGPUInfo(PlatformDeviceId(0));
EXPECT_EQ("GPU", properties.type());
EXPECT_EQ("NVIDIA", properties.vendor());
#elif TENSORFLOW_USE_ROCM
@ -45,21 +45,21 @@ TEST(UtilsTest, GetLocalGPUInfo) {
DeviceProperties properties;
// Invalid platform GPU ID.
properties = GetLocalGPUInfo(PlatformGpuId(100));
properties = GetLocalGPUInfo(PlatformDeviceId(100));
EXPECT_EQ("UNKNOWN", properties.type());
// Succeed when a valid platform GPU id was inserted.
properties = GetLocalGPUInfo(PlatformGpuId(0));
properties = GetLocalGPUInfo(PlatformDeviceId(0));
EXPECT_EQ("GPU", properties.type());
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
#else
LOG(INFO) << "CUDA is not enabled.";
DeviceProperties properties;
properties = GetLocalGPUInfo(PlatformGpuId(0));
properties = GetLocalGPUInfo(PlatformDeviceId(0));
EXPECT_EQ("GPU", properties.type());
properties = GetLocalGPUInfo(PlatformGpuId(100));
properties = GetLocalGPUInfo(PlatformDeviceId(100));
EXPECT_EQ("GPU", properties.type());
#endif
}
@ -97,14 +97,14 @@ TEST(UtilsTest, GetDeviceInfo) {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Invalid platform GPU id.
TF_ASSERT_OK(
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(
TfDeviceId(0), PlatformDeviceId(100)));
properties = GetDeviceInfo(device);
EXPECT_EQ("UNKNOWN", properties.type());
// Valid platform GPU id.
TF_ASSERT_OK(
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(TfDeviceId(1),
PlatformDeviceId(0)));
device.id = 1;
properties = GetDeviceInfo(device);
EXPECT_EQ("GPU", properties.type());

View File

@ -241,14 +241,15 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
DeviceNameUtils::ParsedName parsed;
if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
if (parsed.type == "GPU") {
TfGpuId tf_gpu_id(parsed.id);
PlatformGpuId platform_gpu_id;
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
TfDeviceId tf_device_id(parsed.id);
PlatformDeviceId platform_device_id;
Status s =
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
if (!s.ok()) {
// We are probably running a simulation without linking the CUDA libraries.
platform_gpu_id = PlatformGpuId(parsed.id);
platform_device_id = PlatformDeviceId(parsed.id);
}
return GetLocalGPUInfo(platform_gpu_id);
return GetLocalGPUInfo(platform_device_id);
} else if (parsed.type == "CPU") {
return GetLocalCPUInfo();
}
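A minimal usage sketch for the string-based overload above; the device name is an arbitrary example, and the fallback path makes it usable even when no CUDA mapping has been registered.
DeviceProperties props = GetDeviceInfo("/device:GPU:0");
if (props.type() == "GPU") {
  VLOG(1) << "GPU vendor: " << props.vendor();
}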