Finish migrating {Tf|Platform}GpuId to {Tf|Platform}DeviceId
PiperOrigin-RevId: 361252995 Change-Id: I818798fc00efe7b98c35145ce067204d9e023895
This commit is contained in:
parent
fbd744dfdb
commit
d7634bbfaf
@ -43,15 +43,15 @@ static xla::StatusOr<absl::optional<std::set<int>>> ParseVisibleDeviceList(
|
||||
}
|
||||
const std::vector<string> visible_devices =
|
||||
absl::StrSplit(visible_device_list, ',');
|
||||
for (const string& platform_gpu_id_str : visible_devices) {
|
||||
int32 platform_gpu_id;
|
||||
if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
|
||||
for (const string& platform_device_id_str : visible_devices) {
|
||||
int32 platform_device_id;
|
||||
if (!absl::SimpleAtoi(platform_device_id_str, &platform_device_id)) {
|
||||
return errors::InvalidArgument(
|
||||
"Could not parse entry in 'visible_device_list': '",
|
||||
platform_gpu_id_str,
|
||||
platform_device_id_str,
|
||||
"'. visible_device_list = ", visible_device_list);
|
||||
}
|
||||
gpu_ids.insert(platform_gpu_id);
|
||||
gpu_ids.insert(platform_device_id);
|
||||
}
|
||||
return {{gpu_ids}};
|
||||
}
|
||||
|
@ -102,19 +102,21 @@ struct EdgePtrCompare {
|
||||
// TODO(laigd): instead of deciding the device here, the converter should accept
|
||||
// a device name as one of the conversion parameters so users can control on
|
||||
// which device they want to run the conversion.
|
||||
std::pair<TfGpuId, PlatformGpuId> GetFirstValidDeviceId() {
|
||||
for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
|
||||
TfGpuId tf_gpu_id(tf_gpu_id_value);
|
||||
PlatformGpuId platform_gpu_id;
|
||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
||||
std::pair<TfDeviceId, PlatformDeviceId> GetFirstValidDeviceId() {
|
||||
for (int tf_device_id_value = 0; tf_device_id_value < 100;
|
||||
++tf_device_id_value) {
|
||||
TfDeviceId tf_device_id(tf_device_id_value);
|
||||
PlatformDeviceId platform_device_id;
|
||||
Status s =
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||
if (s.ok()) {
|
||||
VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
|
||||
<< platform_gpu_id.value();
|
||||
return std::make_pair(tf_gpu_id, platform_gpu_id);
|
||||
VLOG(1) << "Found TF GPU " << tf_device_id.value() << " at cuda device "
|
||||
<< platform_device_id.value();
|
||||
return std::make_pair(tf_device_id, platform_device_id);
|
||||
}
|
||||
}
|
||||
LOG(ERROR) << "Could not find any TF GPUs";
|
||||
return std::make_pair(TfGpuId(-1), PlatformGpuId(-1));
|
||||
return std::make_pair(TfDeviceId(-1), PlatformDeviceId(-1));
|
||||
}
|
||||
|
||||
// Returns false for const nodes (we intend to drop control edges from those).
|
||||
@ -266,14 +268,14 @@ Status GetEngineInfo(const Graph* g,
|
||||
}
|
||||
info->device = DeviceNameUtils::ParsedNameToString(segment_device);
|
||||
} else {
|
||||
TfGpuId tf_gpu_id;
|
||||
PlatformGpuId platform_gpu_id;
|
||||
std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
|
||||
if (tf_gpu_id.value() >= 0) {
|
||||
TfDeviceId tf_device_id;
|
||||
PlatformDeviceId platform_device_id;
|
||||
std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
|
||||
if (tf_device_id.value() >= 0) {
|
||||
DeviceNameUtils::ParsedName parsed_name;
|
||||
parsed_name.type = "GPU";
|
||||
parsed_name.has_type = true;
|
||||
parsed_name.id = tf_gpu_id.value();
|
||||
parsed_name.id = tf_device_id.value();
|
||||
parsed_name.has_id = true;
|
||||
info->device = DeviceNameUtils::ParsedNameToString(parsed_name);
|
||||
} else {
|
||||
@ -640,17 +642,17 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
|
||||
if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
|
||||
engine.device.empty()) {
|
||||
// If device is not set, use the first found GPU device for the conversion.
|
||||
TfGpuId tf_gpu_id;
|
||||
PlatformGpuId platform_gpu_id;
|
||||
std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId();
|
||||
cuda_device_id = platform_gpu_id.value();
|
||||
TfDeviceId tf_device_id;
|
||||
PlatformDeviceId platform_device_id;
|
||||
std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId();
|
||||
cuda_device_id = platform_device_id.value();
|
||||
if (cuda_device_id >= 0) {
|
||||
GPUOptions gpu_options;
|
||||
// If the TF to Cuda gpu id mapping exists, the device and corresponding
|
||||
// allocator must have been initialized already, so the
|
||||
// GetGPUAllocator() call won't create a new allocator.
|
||||
dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
|
||||
gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
|
||||
gpu_options, tf_device_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
|
||||
}
|
||||
return std::make_pair(cuda_device_id, dev_allocator);
|
||||
}
|
||||
|
@ -1044,25 +1044,25 @@ Status TRTEngineOp::AllocateCalibrationResources(
|
||||
}
|
||||
cres->calibrator_.reset(
|
||||
new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
|
||||
const int platform_gpu_id =
|
||||
const int platform_device_id =
|
||||
ctx->device()->tensorflow_gpu_device_info()->gpu_id;
|
||||
if (platform_gpu_id < 0) {
|
||||
if (platform_device_id < 0) {
|
||||
LOG(ERROR) << "Can't get gpu_device_info from context->device()";
|
||||
return errors::InvalidArgument(
|
||||
"Context->device doesn't contain device info!");
|
||||
}
|
||||
|
||||
cache_res->Ref();
|
||||
cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id,
|
||||
cres->thr_.reset(new std::thread([this, cres, shapes, platform_device_id,
|
||||
cache_res]() {
|
||||
core::ScopedUnref sc(cache_res);
|
||||
|
||||
VLOG(1) << "Starting calibration thread on device " << platform_gpu_id
|
||||
VLOG(1) << "Starting calibration thread on device " << platform_device_id
|
||||
<< ", Calibration Resource @ " << cres;
|
||||
auto err = cudaSetDevice(platform_gpu_id);
|
||||
auto err = cudaSetDevice(platform_device_id);
|
||||
if (err != cudaSuccess) {
|
||||
// TODO(aaroey): should return error here.
|
||||
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
|
||||
LOG(ERROR) << "Couldn't set cuda device to " << platform_device_id
|
||||
<< " in calibration thread";
|
||||
}
|
||||
std::vector<PartialTensorShape> partial_shapes(shapes.begin(),
|
||||
|
@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
|
||||
DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
|
||||
gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
|
||||
gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
|
||||
GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
|
||||
GPUOptions(), TfDeviceId(0), memory_limit, /*peer_gpu_ids=*/{});
|
||||
host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
|
||||
}
|
||||
|
||||
|
@ -26,11 +26,11 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
|
||||
PlatformGpuId platform_gpu_id)
|
||||
GPUcudaMallocAllocator::GPUcudaMallocAllocator(
|
||||
Allocator* allocator, PlatformDeviceId platform_device_id)
|
||||
: base_allocator_(allocator) {
|
||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||
platform_gpu_id)
|
||||
platform_device_id)
|
||||
.ValueOrDie();
|
||||
}
|
||||
|
||||
|
@ -32,7 +32,7 @@ namespace tensorflow {
|
||||
class GPUcudaMallocAllocator : public Allocator {
|
||||
public:
|
||||
explicit GPUcudaMallocAllocator(Allocator* allocator,
|
||||
PlatformGpuId platform_gpu_id);
|
||||
PlatformDeviceId platform_device_id);
|
||||
~GPUcudaMallocAllocator() override;
|
||||
string Name() override { return "gpu_debug"; }
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||
|
@ -42,12 +42,12 @@ static std::string GetCudaErrorMessage(CUresult result) {
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
||||
PlatformGpuId platform_gpu_id, size_t pool_size, bool reserve_memory,
|
||||
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
|
||||
bool compute_stats)
|
||||
: name_(absl::StrCat("gpu_async_", platform_gpu_id.value())) {
|
||||
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
|
||||
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
|
||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||
platform_gpu_id)
|
||||
platform_device_id)
|
||||
.ValueOrDie();
|
||||
// Initialized here as it only exists if compiled with a recent
|
||||
// enough CUDA.
|
||||
@ -56,7 +56,7 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
||||
// WAR a CUDA 11.2 driver bug for multiple-GPU. It currently
|
||||
// requests that the context on GPU 0 is initialized. Which isn't the
|
||||
// case for TF+horovod.
|
||||
if (platform_gpu_id.value() > 0) {
|
||||
if (platform_device_id.value() > 0) {
|
||||
CUcontext pctx; // We lose track of it. But this is fine.
|
||||
if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
|
||||
LOG(FATAL) // Crash OK.
|
||||
@ -65,9 +65,10 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
||||
|
||||
se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
|
||||
int cuda_malloc_async_supported;
|
||||
if (auto status = cuDeviceGetAttribute(
|
||||
&cuda_malloc_async_supported,
|
||||
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, platform_gpu_id.value()))
|
||||
if (auto status =
|
||||
cuDeviceGetAttribute(&cuda_malloc_async_supported,
|
||||
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
|
||||
platform_device_id.value()))
|
||||
LOG(FATAL) << // Crash OK.
|
||||
"Failed to get device attribute: " << GetCudaErrorMessage(status);
|
||||
if (!cuda_malloc_async_supported)
|
||||
@ -79,12 +80,13 @@ GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
|
||||
LOG(FATAL) // Crash OK.
|
||||
<< "Failed to create CUDA stream: " << GetCudaErrorMessage(status);
|
||||
|
||||
if (auto status = cuDeviceGetDefaultMemPool(&pool_, platform_gpu_id.value()))
|
||||
if (auto status =
|
||||
cuDeviceGetDefaultMemPool(&pool_, platform_device_id.value()))
|
||||
LOG(FATAL) << // Crash OK.
|
||||
"Failed to get default CUDA pool: " << GetCudaErrorMessage(status);
|
||||
|
||||
VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
|
||||
<< platform_gpu_id.value() << " with pool size of: " << pool_size
|
||||
<< platform_device_id.value() << " with pool size of: " << pool_size
|
||||
<< " this ptr: " << this;
|
||||
uint64_t pool_size_64 = pool_size;
|
||||
if (auto status = cuMemPoolSetAttribute(
|
||||
|
@ -64,7 +64,7 @@ namespace tensorflow {
|
||||
// driver can return the excess memory to other processes.
|
||||
class GpuCudaMallocAsyncAllocator : public Allocator {
|
||||
public:
|
||||
explicit GpuCudaMallocAsyncAllocator(PlatformGpuId platform_gpu_id,
|
||||
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
|
||||
size_t pool_size,
|
||||
bool reserve_memory = false,
|
||||
bool compute_stats = false);
|
||||
|
@ -76,10 +76,10 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
|
||||
// GPUDebugAllocator
|
||||
// -----------------------------------------------------------------------------
|
||||
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
|
||||
PlatformGpuId platform_gpu_id)
|
||||
PlatformDeviceId platform_device_id)
|
||||
: base_allocator_(allocator) {
|
||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||
platform_gpu_id)
|
||||
platform_device_id)
|
||||
.ValueOrDie();
|
||||
}
|
||||
|
||||
@ -155,10 +155,10 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
|
||||
// GPUNanResetAllocator
|
||||
// -----------------------------------------------------------------------------
|
||||
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
|
||||
PlatformGpuId platform_gpu_id)
|
||||
PlatformDeviceId platform_device_id)
|
||||
: base_allocator_(allocator) {
|
||||
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||
platform_gpu_id)
|
||||
platform_device_id)
|
||||
.ValueOrDie();
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,7 @@ namespace tensorflow {
|
||||
class GPUDebugAllocator : public Allocator {
|
||||
public:
|
||||
explicit GPUDebugAllocator(Allocator* allocator,
|
||||
PlatformGpuId platform_gpu_id);
|
||||
PlatformDeviceId platform_device_id);
|
||||
~GPUDebugAllocator() override;
|
||||
string Name() override { return "gpu_debug"; }
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||
@ -64,7 +64,7 @@ class GPUDebugAllocator : public Allocator {
|
||||
class GPUNanResetAllocator : public Allocator {
|
||||
public:
|
||||
explicit GPUNanResetAllocator(Allocator* allocator,
|
||||
PlatformGpuId platform_gpu_id);
|
||||
PlatformDeviceId platform_device_id);
|
||||
~GPUNanResetAllocator() override;
|
||||
string Name() override { return "gpu_nan_reset"; }
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||
|
@ -37,7 +37,7 @@ limitations under the License.
|
||||
namespace tensorflow {
|
||||
namespace {
|
||||
|
||||
se::StreamExecutor* ExecutorForPlatformGpuId(
|
||||
se::StreamExecutor* ExecutorForPlatformDeviceId(
|
||||
PlatformDeviceId platform_device_id) {
|
||||
return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||
platform_device_id)
|
||||
@ -45,12 +45,12 @@ se::StreamExecutor* ExecutorForPlatformGpuId(
|
||||
}
|
||||
|
||||
TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
|
||||
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
|
||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id);
|
||||
platform_device_id);
|
||||
|
||||
for (int s : {8}) {
|
||||
std::vector<int64> cpu_array(s);
|
||||
@ -72,13 +72,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
|
||||
for (int s : {8, 211}) {
|
||||
EXPECT_DEATH(
|
||||
{
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||
DeviceMemAllocator* sub_allocator =
|
||||
new DeviceMemAllocator(stream_exec, platform_gpu_id,
|
||||
new DeviceMemAllocator(stream_exec, platform_device_id,
|
||||
false /*use_unified_memory*/, {}, {});
|
||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id);
|
||||
platform_device_id);
|
||||
|
||||
std::vector<int64> cpu_array(s);
|
||||
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
|
||||
@ -108,13 +108,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
|
||||
for (int s : {8, 22}) {
|
||||
EXPECT_DEATH(
|
||||
{
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||
DeviceMemAllocator* sub_allocator =
|
||||
new DeviceMemAllocator(stream_exec, platform_gpu_id,
|
||||
new DeviceMemAllocator(stream_exec, platform_device_id,
|
||||
false /*use_unified_memory*/, {}, {});
|
||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id);
|
||||
platform_device_id);
|
||||
|
||||
std::vector<int64> cpu_array(s);
|
||||
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
|
||||
@ -141,12 +141,12 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
|
||||
}
|
||||
|
||||
TEST(GPUDebugAllocatorTest, ResetToNan) {
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
|
||||
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
|
||||
GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id);
|
||||
platform_device_id);
|
||||
|
||||
std::vector<float> cpu_array(1024);
|
||||
std::vector<float> cpu_array_result(1024);
|
||||
@ -183,15 +183,15 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
|
||||
}
|
||||
|
||||
TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
auto stream_exec = ExecutorForPlatformGpuId(platform_gpu_id);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
auto stream_exec = ExecutorForPlatformDeviceId(platform_device_id);
|
||||
// NaN reset must be the outer-most allocator.
|
||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||
stream_exec, platform_gpu_id, false /*use_unified_memory*/, {}, {});
|
||||
stream_exec, platform_device_id, false /*use_unified_memory*/, {}, {});
|
||||
GPUNanResetAllocator a(
|
||||
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id),
|
||||
platform_gpu_id);
|
||||
platform_device_id),
|
||||
platform_device_id);
|
||||
|
||||
std::vector<float> cpu_array(1024);
|
||||
std::vector<float> cpu_array_result(1024);
|
||||
@ -228,24 +228,24 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
|
||||
}
|
||||
|
||||
TEST(GPUDebugAllocatorTest, TracksSizes) {
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||
ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
|
||||
ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
|
||||
false /*use_unified_memory*/, {}, {});
|
||||
GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id);
|
||||
platform_device_id);
|
||||
EXPECT_EQ(true, a.TracksAllocationSizes());
|
||||
}
|
||||
|
||||
TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
|
||||
const PlatformGpuId platform_gpu_id(0);
|
||||
const PlatformDeviceId platform_device_id(0);
|
||||
DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
|
||||
ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
|
||||
ExecutorForPlatformDeviceId(platform_device_id), platform_device_id,
|
||||
false /*use_unified_memory*/, {}, {});
|
||||
GPUNanResetAllocator a(
|
||||
new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
|
||||
platform_gpu_id),
|
||||
platform_gpu_id);
|
||||
platform_device_id),
|
||||
platform_device_id);
|
||||
float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
|
||||
EXPECT_EQ(4, a.RequestedSize(t1));
|
||||
EXPECT_EQ(256, a.AllocatedSize(t1));
|
||||
|
@ -120,7 +120,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
|
||||
}
|
||||
~EigenGpuStreamDevice() override {}
|
||||
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
|
||||
TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
|
||||
TfDeviceId tf_device_id, ::tensorflow::Allocator* alloc,
|
||||
char* scratch) {
|
||||
if (LogMemory::IsEnabled()) {
|
||||
operation_ = context->op_kernel().name() + "/EigenAllocator";
|
||||
@ -132,9 +132,10 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
|
||||
reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
|
||||
stream_ = gpu_stream;
|
||||
allocator_ = alloc;
|
||||
PlatformGpuId platform_gpu_id;
|
||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
||||
device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()];
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_CHECK_OK(
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||
device_prop_ = &Eigen::m_deviceProperties[platform_device_id.value()];
|
||||
}
|
||||
|
||||
const gpuStream_t& stream() const override { return *stream_; }
|
||||
@ -233,18 +234,18 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
|
||||
class BaseGPUDevice::StreamGroupFactory {
|
||||
public:
|
||||
// Returns the unique stream group for use with the stream defined by
|
||||
// {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
|
||||
// {tf_device_id, stream_group_within_gpu}, creating it if it does not yet
|
||||
// exist.
|
||||
// This function is thread safe.
|
||||
BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
|
||||
BaseGPUDevice::StreamGroup* GetOrCreate(TfDeviceId tf_device_id,
|
||||
int stream_group_within_gpu,
|
||||
se::StreamExecutor* executor,
|
||||
const GPUOptions& options) {
|
||||
mutex_lock guard(lock_);
|
||||
StreamGroup* group =
|
||||
&streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
|
||||
&streams_[key_type(tf_device_id.value(), stream_group_within_gpu)];
|
||||
if (!group->compute) {
|
||||
int priority = GetPriority(tf_gpu_id.value(), options);
|
||||
int priority = GetPriority(tf_device_id.value(), options);
|
||||
group->priority = priority;
|
||||
group->compute = GetStream(executor, priority);
|
||||
group->compute->Init();
|
||||
@ -339,8 +340,8 @@ class BaseGPUDevice::StreamGroupFactory {
|
||||
private:
|
||||
// Returns priority for the given virtual GPU id from the session options.
|
||||
// Returns 0 if no virtual devices are specified.
|
||||
int GetPriority(int tf_gpu_id, const GPUOptions& options) {
|
||||
int id = tf_gpu_id;
|
||||
int GetPriority(int tf_device_id, const GPUOptions& options) {
|
||||
int id = tf_device_id;
|
||||
int i = 0;
|
||||
int priority = 0;
|
||||
while (i < options.experimental().virtual_devices_size()) {
|
||||
@ -378,7 +379,7 @@ class BaseGPUDevice::StreamGroupFactory {
|
||||
|
||||
BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
|
||||
Bytes memory_limit, const DeviceLocality& locality,
|
||||
TfGpuId tf_gpu_id,
|
||||
TfDeviceId tf_device_id,
|
||||
const string& physical_device_desc,
|
||||
Allocator* gpu_allocator, Allocator* cpu_allocator,
|
||||
bool sync_every_op)
|
||||
@ -388,7 +389,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
|
||||
gpu_allocator_(gpu_allocator),
|
||||
cpu_allocator_(cpu_allocator),
|
||||
scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
|
||||
tf_gpu_id_(tf_gpu_id),
|
||||
tf_device_id_(tf_device_id),
|
||||
sync_every_op_(sync_every_op) {
|
||||
GPUProcessState::singleton()->EnableGPUDevice();
|
||||
}
|
||||
@ -410,7 +411,8 @@ Status BaseGPUDevice::InitScratchBuffers() {
|
||||
Allocator::kAllocatorAlignment, scratch_buffer_size);
|
||||
if (scratch_buffer == nullptr) {
|
||||
return errors::FailedPrecondition(
|
||||
"Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
|
||||
"Failed to allocate scratch buffer for device ",
|
||||
tf_device_id_.value());
|
||||
}
|
||||
se::DeviceMemory<char> mem(
|
||||
se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
|
||||
@ -423,16 +425,16 @@ Status BaseGPUDevice::InitScratchBuffers() {
|
||||
|
||||
Status BaseGPUDevice::Init(const SessionOptions& options) {
|
||||
auto executor_status = DeviceIdUtil::ExecutorForTfDeviceId(
|
||||
DEVICE_GPU, GPUMachineManager(), tf_gpu_id_);
|
||||
DEVICE_GPU, GPUMachineManager(), tf_device_id_);
|
||||
if (!executor_status.status().ok()) {
|
||||
return errors::Internal("Failed to get StreamExecutor for device ",
|
||||
tf_gpu_id_.value());
|
||||
tf_device_id_.value());
|
||||
}
|
||||
|
||||
executor_ = executor_status.ValueOrDie();
|
||||
|
||||
stream_ = StreamGroupFactory::Global().GetOrCreate(
|
||||
tf_gpu_id_, 0, executor_, options.config.gpu_options());
|
||||
tf_device_id_, 0, executor_, options.config.gpu_options());
|
||||
device_context_ =
|
||||
new GPUDeviceContext(0, stream_->compute,
|
||||
#if TENSORFLOW_USE_ROCM
|
||||
@ -461,7 +463,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
||||
// The GPUKernelTracker will use this SharedCounter, instead of
|
||||
// owning its own.
|
||||
timing_counter =
|
||||
GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id_);
|
||||
GPUProcessState::singleton()->GPUAllocatorCounter(tf_device_id_);
|
||||
DCHECK(timing_counter);
|
||||
}
|
||||
kernel_tracker_.reset(new GPUKernelTracker(
|
||||
@ -473,10 +475,10 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
||||
gpu_device_info_->stream = stream_->compute;
|
||||
gpu_device_info_->default_context = device_context_;
|
||||
gpu_device_info_->event_mgr = em_;
|
||||
PlatformGpuId platform_gpu_id;
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
|
||||
gpu_device_info_->gpu_id = platform_gpu_id.value();
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
|
||||
gpu_device_info_->gpu_id = platform_device_id.value();
|
||||
set_tensorflow_gpu_device_info(gpu_device_info_);
|
||||
|
||||
// Whether and how the GPU device uses its own threadpool.
|
||||
@ -505,7 +507,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
||||
// TODO(zhengxq): pin the thread to the same socket of the target GPU.
|
||||
thread_pool_.reset(new thread::ThreadPool(
|
||||
options.env, ThreadOptions(),
|
||||
strings::StrCat("gpu_private_", tf_gpu_id_.value()),
|
||||
strings::StrCat("gpu_private_", tf_device_id_.value()),
|
||||
static_cast<int32>(gpu_thread_count),
|
||||
!options.config.experimental().disable_thread_spinning(),
|
||||
/*allocator=*/nullptr));
|
||||
@ -531,8 +533,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
|
||||
string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
|
||||
const int& stream_id) {
|
||||
return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
|
||||
" on GPU ", tf_gpu_id_.value(), " stream[", stream_id,
|
||||
"]");
|
||||
" on GPU ", tf_device_id_.value(), " stream[",
|
||||
stream_id, "]");
|
||||
}
|
||||
|
||||
void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
|
||||
@ -624,8 +626,8 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
|
||||
const auto stream_id = gpu_device_context->stream_id();
|
||||
|
||||
VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
|
||||
<< op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
|
||||
<< stream_id << "]";
|
||||
<< op_kernel->type_string() << " on GPU" << tf_device_id_
|
||||
<< " stream[" << stream_id << "]";
|
||||
|
||||
ScopedActivateExecutorContext scoped_activation{stream->parent()};
|
||||
op_kernel->ComputeAsync(context, std::move(done));
|
||||
@ -763,10 +765,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
|
||||
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
|
||||
|
||||
void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
|
||||
TfGpuId tf_gpu_id, Allocator* base_allocator,
|
||||
TfDeviceId tf_device_id, Allocator* base_allocator,
|
||||
char* scratch) {
|
||||
stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator,
|
||||
scratch);
|
||||
stream_device_.Reinitialize(context, gpu_stream, tf_device_id,
|
||||
base_allocator, scratch);
|
||||
}
|
||||
|
||||
const Eigen::GpuDevice& device() const override { return device_; }
|
||||
@ -777,8 +779,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
|
||||
};
|
||||
|
||||
// Parse 'visible_device_list' into a list of platform GPU ids.
|
||||
Status ParseVisibleDeviceList(const string& visible_device_list,
|
||||
std::vector<PlatformGpuId>* visible_gpu_order) {
|
||||
Status ParseVisibleDeviceList(
|
||||
const string& visible_device_list,
|
||||
std::vector<PlatformDeviceId>* visible_gpu_order) {
|
||||
visible_gpu_order->clear();
|
||||
se::Platform* gpu_manager = GPUMachineManager();
|
||||
|
||||
@ -793,28 +796,28 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
|
||||
} else {
|
||||
const std::vector<string> order_str =
|
||||
str_util::Split(visible_device_list, ',');
|
||||
for (const string& platform_gpu_id_str : order_str) {
|
||||
int32 platform_gpu_id;
|
||||
if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
|
||||
for (const string& platform_device_id_str : order_str) {
|
||||
int32 platform_device_id;
|
||||
if (!strings::safe_strto32(platform_device_id_str, &platform_device_id)) {
|
||||
return errors::InvalidArgument(
|
||||
"Could not parse entry in 'visible_device_list': '",
|
||||
platform_gpu_id_str,
|
||||
platform_device_id_str,
|
||||
"'. visible_device_list = ", visible_device_list);
|
||||
}
|
||||
if (platform_gpu_id < 0 ||
|
||||
platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
|
||||
if (platform_device_id < 0 ||
|
||||
platform_device_id >= gpu_manager->VisibleDeviceCount()) {
|
||||
return errors::InvalidArgument(
|
||||
"'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
|
||||
"' but visible device count is ",
|
||||
"'visible_device_list' listed an invalid GPU id '",
|
||||
platform_device_id, "' but visible device count is ",
|
||||
gpu_manager->VisibleDeviceCount());
|
||||
}
|
||||
visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id));
|
||||
visible_gpu_order->push_back(PlatformDeviceId(platform_device_id));
|
||||
}
|
||||
}
|
||||
|
||||
// Validate no repeats.
|
||||
std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(),
|
||||
visible_gpu_order->end());
|
||||
std::set<PlatformDeviceId> visible_device_set(visible_gpu_order->begin(),
|
||||
visible_gpu_order->end());
|
||||
if (visible_device_set.size() != visible_gpu_order->size()) {
|
||||
return errors::InvalidArgument(
|
||||
"visible_device_list contained a duplicate entry: ",
|
||||
@ -825,8 +828,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
|
||||
|
||||
Status VerifyVirtualDeviceSettings(
|
||||
const size_t num_gpus_to_use, const GPUOptions& gpu_options,
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
||||
const std::vector<PlatformGpuId>& valid_platform_gpu_ids,
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||
const std::vector<PlatformDeviceId>& valid_platform_device_ids,
|
||||
const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
|
||||
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
|
||||
CHECK(!virtual_devices.empty());
|
||||
@ -849,11 +852,11 @@ Status VerifyVirtualDeviceSettings(
|
||||
" #GPUs in visible_device_list: ", visible_gpu_order.size(),
|
||||
" virtual_devices.size(): ", virtual_devices.size());
|
||||
}
|
||||
if (valid_platform_gpu_ids.size() != virtual_devices.size()) {
|
||||
if (valid_platform_device_ids.size() != virtual_devices.size()) {
|
||||
return errors::Unknown(
|
||||
"The number of valid GPUs doesn't match the number of elements in "
|
||||
"the virtual_devices list.",
|
||||
" #valid GPUs: ", valid_platform_gpu_ids.size(),
|
||||
" #valid GPUs: ", valid_platform_device_ids.size(),
|
||||
" virtual_devices.size(): ", virtual_devices.size());
|
||||
}
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
@ -882,7 +885,7 @@ Status VerifyVirtualDeviceSettings(
|
||||
i, " memory_limit_mb size: ", memory_limit_mb.size(),
|
||||
" and priority size: ", priority.size());
|
||||
}
|
||||
const int gpu_id = valid_platform_gpu_ids[i].value();
|
||||
const int gpu_id = valid_platform_device_ids[i].value();
|
||||
auto it = supported_priority_ranges.find(gpu_id);
|
||||
if (it == supported_priority_ranges.end()) {
|
||||
return errors::Internal(
|
||||
@ -950,19 +953,19 @@ int64 MinSystemMemory(int64 available_memory, int cc_major) {
|
||||
}
|
||||
|
||||
// Get the memory limit for the virtual device being created on GPU with
|
||||
// 'platform_gpu_id', when that virtual device is the only virtual device being
|
||||
// created on that GPU.
|
||||
// 'platform_device_id', when that virtual device is the only virtual device
|
||||
// being created on that GPU.
|
||||
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
|
||||
PlatformGpuId platform_gpu_id,
|
||||
PlatformDeviceId platform_device_id,
|
||||
int64* memory_limit) {
|
||||
int64 total_memory = 0;
|
||||
int64 available_memory = 0;
|
||||
se::StreamExecutor* se = DeviceIdUtil::ExecutorForPlatformDeviceId(
|
||||
GPUMachineManager(), platform_gpu_id)
|
||||
GPUMachineManager(), platform_device_id)
|
||||
.ValueOrDie();
|
||||
if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
|
||||
return errors::Unknown("Failed to query available memory for GPU ",
|
||||
platform_gpu_id.value());
|
||||
platform_device_id.value());
|
||||
}
|
||||
|
||||
int64 allocated_memory = 0;
|
||||
@ -1037,7 +1040,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
|
||||
DCHECK_EQ(stream_id, 0);
|
||||
const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
|
||||
stream_->compute->implementation()->GpuStreamMemberHack());
|
||||
concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator,
|
||||
concrete_device->Reinitialize(context, gpu_stream, tf_device_id_, allocator,
|
||||
scratch_);
|
||||
}
|
||||
|
||||
@ -1093,7 +1096,7 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::vector<PlatformGpuId> visible_gpu_order(device_count);
|
||||
std::vector<PlatformDeviceId> visible_gpu_order(device_count);
|
||||
std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
|
||||
TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &cached_device_ids_));
|
||||
return Status::OK();
|
||||
@ -1101,9 +1104,9 @@ Status BaseGPUDeviceFactory::CacheDeviceIds() {
|
||||
|
||||
Status BaseGPUDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
|
||||
TF_RETURN_IF_ERROR(CacheDeviceIds());
|
||||
for (PlatformGpuId platform_gpu_id : cached_device_ids_) {
|
||||
for (PlatformDeviceId platform_device_id : cached_device_ids_) {
|
||||
const string device_name =
|
||||
strings::StrCat("/physical_device:GPU:", platform_gpu_id.value());
|
||||
strings::StrCat("/physical_device:GPU:", platform_device_id.value());
|
||||
devices->push_back(device_name);
|
||||
}
|
||||
|
||||
@ -1117,14 +1120,15 @@ Status BaseGPUDeviceFactory::GetDeviceDetails(
|
||||
if (device_index < 0 || device_index > cached_device_ids_.size()) {
|
||||
return errors::Internal("Invalid device index: ", device_index);
|
||||
}
|
||||
PlatformGpuId platform_gpu_id = cached_device_ids_[device_index];
|
||||
PlatformDeviceId platform_device_id = cached_device_ids_[device_index];
|
||||
|
||||
TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
|
||||
se::Platform* gpu_manager = GPUMachineManager();
|
||||
if (gpu_manager == nullptr) {
|
||||
return errors::Internal("Cannot get GPUMachineManager");
|
||||
}
|
||||
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
|
||||
auto desc_status =
|
||||
gpu_manager->DescriptionForDevice(platform_device_id.value());
|
||||
if (!desc_status.ok()) {
|
||||
return desc_status.status();
|
||||
}
|
||||
@ -1159,8 +1163,8 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
num_gpus_to_use = iter->second;
|
||||
}
|
||||
const auto& gpu_options = options.config.gpu_options();
|
||||
std::vector<PlatformGpuId> visible_gpu_order;
|
||||
std::vector<PlatformGpuId> valid_platform_gpu_ids;
|
||||
std::vector<PlatformDeviceId> visible_gpu_order;
|
||||
std::vector<PlatformDeviceId> valid_platform_device_ids;
|
||||
// If we aren't going to use any GPUs, don't initialize them.
|
||||
// We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
|
||||
// because it treats an empty gpu_options.visible_device_list as 'all GPUs
|
||||
@ -1188,13 +1192,13 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
}
|
||||
|
||||
TF_RETURN_IF_ERROR(
|
||||
GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids));
|
||||
GetValidDeviceIds(visible_gpu_order, &valid_platform_device_ids));
|
||||
}
|
||||
if (num_gpus_to_use > valid_platform_gpu_ids.size()) {
|
||||
num_gpus_to_use = valid_platform_gpu_ids.size();
|
||||
if (num_gpus_to_use > valid_platform_device_ids.size()) {
|
||||
num_gpus_to_use = valid_platform_device_ids.size();
|
||||
}
|
||||
std::map<int, std::pair<int, int>> supported_priority_ranges;
|
||||
if (!valid_platform_gpu_ids.empty()) {
|
||||
if (!valid_platform_device_ids.empty()) {
|
||||
// Save the original device.
|
||||
int original_device = 0;
|
||||
#if GOOGLE_CUDA
|
||||
@ -1213,18 +1217,18 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
|
||||
// Force to implicitly initialize CUDA runtime on each valid GPU before
|
||||
// CreateGPUDevice().
|
||||
for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
|
||||
for (PlatformDeviceId platform_device_id : valid_platform_device_ids) {
|
||||
#if GOOGLE_CUDA
|
||||
err = cudaSetDevice(platform_gpu_id.value());
|
||||
err = cudaSetDevice(platform_device_id.value());
|
||||
if (err != cudaSuccess) {
|
||||
return errors::Internal(
|
||||
"cudaSetDevice() on GPU:", platform_gpu_id.value(),
|
||||
"cudaSetDevice() on GPU:", platform_device_id.value(),
|
||||
" failed. Status: ", cudaGetErrorString(err));
|
||||
}
|
||||
err = cudaFree(nullptr);
|
||||
if (err != cudaSuccess) {
|
||||
return errors::Internal("CUDA runtime implicit initialization on GPU:",
|
||||
platform_gpu_id.value(),
|
||||
platform_device_id.value(),
|
||||
" failed. Status: ", cudaGetErrorString(err));
|
||||
}
|
||||
int priority_low, priority_high;
|
||||
@ -1237,19 +1241,19 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
VLOG(1) << "Cuda stream priority range on GPU(" << original_device
|
||||
<< "): " << priority_high << "," << priority_low;
|
||||
supported_priority_ranges.insert(
|
||||
std::make_pair(platform_gpu_id.value(),
|
||||
std::make_pair(platform_device_id.value(),
|
||||
std::make_pair(priority_low, priority_high)));
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
err = hipSetDevice(platform_gpu_id.value());
|
||||
err = hipSetDevice(platform_device_id.value());
|
||||
if (err != hipSuccess) {
|
||||
return errors::Internal(
|
||||
"hipSetDevice() on GPU:", platform_gpu_id.value(),
|
||||
"hipSetDevice() on GPU:", platform_device_id.value(),
|
||||
" failed. Status: ", hipGetErrorString(err));
|
||||
}
|
||||
err = hipFree(nullptr);
|
||||
if (err != hipSuccess) {
|
||||
return errors::Internal("ROCm runtime implicit initialization on GPU:",
|
||||
platform_gpu_id.value(),
|
||||
platform_device_id.value(),
|
||||
" failed. Status: ", hipGetErrorString(err));
|
||||
}
|
||||
int priority_low, priority_high;
|
||||
@ -1262,7 +1266,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
VLOG(1) << "HIP stream priority range on GPU(" << original_device
|
||||
<< "): " << priority_high << "," << priority_low;
|
||||
supported_priority_ranges.insert(
|
||||
std::make_pair(platform_gpu_id.value(),
|
||||
std::make_pair(platform_device_id.value(),
|
||||
std::make_pair(priority_low, priority_high)));
|
||||
#endif
|
||||
}
|
||||
@ -1306,9 +1310,9 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
LOG(INFO) << line_buf;
|
||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||
line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
|
||||
PlatformGpuId gpu_id_i = visible_gpu_order[i];
|
||||
PlatformDeviceId gpu_id_i = visible_gpu_order[i];
|
||||
for (int j = 0; j < visible_gpu_order.size(); ++j) {
|
||||
PlatformGpuId gpu_id_j = visible_gpu_order[j];
|
||||
PlatformDeviceId gpu_id_j = visible_gpu_order[j];
|
||||
if (im.directed_links.find({gpu_id_i, gpu_id_j}) !=
|
||||
im.directed_links.end()) {
|
||||
line_buf.append("Y ");
|
||||
@ -1323,22 +1327,23 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
const auto& virtual_devices = gpu_options.experimental().virtual_devices();
|
||||
if (!virtual_devices.empty()) {
|
||||
TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
|
||||
num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids,
|
||||
supported_priority_ranges));
|
||||
num_gpus_to_use, gpu_options, visible_gpu_order,
|
||||
valid_platform_device_ids, supported_priority_ranges));
|
||||
// We've verified that num_gpus_to_use >= virtual_devices.size().
|
||||
num_gpus_to_use = virtual_devices.size();
|
||||
CHECK(gpu_options.visible_device_list().empty() ||
|
||||
valid_platform_gpu_ids == visible_gpu_order);
|
||||
valid_platform_device_ids == visible_gpu_order);
|
||||
}
|
||||
int next_tf_gpu_id = 0;
|
||||
int next_tf_device_id = 0;
|
||||
std::vector<int64> memory_limit_bytes;
|
||||
for (int i = 0; i < num_gpus_to_use; ++i) {
|
||||
const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i];
|
||||
const PlatformDeviceId platform_device_id = valid_platform_device_ids[i];
|
||||
if (virtual_devices.empty() ||
|
||||
virtual_devices.Get(i).memory_limit_mb_size() == 0) {
|
||||
int64 single_virtual_device_memory_limit = 0;
|
||||
TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
|
||||
gpu_options, platform_gpu_id, &single_virtual_device_memory_limit));
|
||||
TF_RETURN_IF_ERROR(
|
||||
SingleVirtualDeviceMemoryLimit(gpu_options, platform_device_id,
|
||||
&single_virtual_device_memory_limit));
|
||||
memory_limit_bytes.push_back(single_virtual_device_memory_limit);
|
||||
} else {
|
||||
const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
|
||||
@ -1347,36 +1352,37 @@ Status BaseGPUDeviceFactory::CreateDevices(
|
||||
return static_cast<int64>(mb) * (1ll << 20);
|
||||
});
|
||||
}
|
||||
while (next_tf_gpu_id < memory_limit_bytes.size()) {
|
||||
TfGpuId tf_gpu_id(next_tf_gpu_id);
|
||||
++next_tf_gpu_id;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id));
|
||||
while (next_tf_device_id < memory_limit_bytes.size()) {
|
||||
TfDeviceId tf_device_id(next_tf_device_id);
|
||||
++next_tf_device_id;
|
||||
TF_RETURN_IF_ERROR(GpuIdManager::InsertTfPlatformDeviceIdPair(
|
||||
tf_device_id, platform_device_id));
|
||||
}
|
||||
}
|
||||
const int num_tf_gpus = next_tf_gpu_id;
|
||||
const int num_tf_gpus = next_tf_device_id;
|
||||
|
||||
LocalityMap device_localities;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
|
||||
|
||||
// Build the GPUDevices
|
||||
CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
|
||||
CHECK_EQ(next_tf_device_id, memory_limit_bytes.size());
|
||||
for (int di = 0; di < num_tf_gpus; ++di) {
|
||||
TfGpuId tf_gpu_id(di);
|
||||
TfDeviceId tf_device_id(di);
|
||||
int64 bytes = memory_limit_bytes[di];
|
||||
auto it = device_localities.find(tf_gpu_id);
|
||||
auto it = device_localities.find(tf_device_id);
|
||||
if (it == device_localities.end()) {
|
||||
return errors::Internal("Failed to find DeviceLocality for GPU device ",
|
||||
tf_gpu_id.value());
|
||||
tf_device_id.value());
|
||||
}
|
||||
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
|
||||
it->second, num_tf_gpus, devices));
|
||||
TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_device_id,
|
||||
bytes, it->second, num_tf_gpus,
|
||||
devices));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
|
||||
static string GetShortDeviceDescription(PlatformDeviceId platform_device_id,
|
||||
const se::DeviceDescription& desc) {
|
||||
#if GOOGLE_CUDA
|
||||
int cc_major;
|
||||
@ -1386,54 +1392,56 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
|
||||
cc_minor = 0;
|
||||
}
|
||||
// LINT.IfChange
|
||||
return strings::StrCat("device: ", platform_gpu_id.value(),
|
||||
return strings::StrCat("device: ", platform_device_id.value(),
|
||||
", name: ", desc.name(),
|
||||
", pci bus id: ", desc.pci_bus_id(),
|
||||
", compute capability: ", cc_major, ".", cc_minor);
|
||||
// LINT.ThenChange(//tensorflow/python/framework/gpu_util.py)
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
return strings::StrCat("device: ", platform_gpu_id.value(),
|
||||
return strings::StrCat("device: ", platform_device_id.value(),
|
||||
", name: ", desc.name(),
|
||||
", pci bus id: ", desc.pci_bus_id());
|
||||
#endif
|
||||
}
|
||||
|
||||
Status BaseGPUDeviceFactory::CreateGPUDevice(
|
||||
const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
|
||||
int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
|
||||
const SessionOptions& options, const string& name_prefix,
|
||||
TfDeviceId tf_device_id, int64 memory_limit,
|
||||
const DeviceLocality& dev_locality, size_t num_tf_gpus,
|
||||
std::vector<std::unique_ptr<Device>>* devices) {
|
||||
CHECK_GE(tf_gpu_id.value(), 0);
|
||||
CHECK_GE(tf_device_id.value(), 0);
|
||||
const string device_name =
|
||||
strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
|
||||
strings::StrCat(name_prefix, "/device:GPU:", tf_device_id.value());
|
||||
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||
tf_gpu_id);
|
||||
PlatformGpuId platform_gpu_id;
|
||||
tf_device_id);
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||
int numa_node = dev_locality.numa_node();
|
||||
|
||||
se::Platform* gpu_manager = GPUMachineManager();
|
||||
auto desc_status = gpu_manager->DescriptionForDevice(platform_gpu_id.value());
|
||||
auto desc_status =
|
||||
gpu_manager->DescriptionForDevice(platform_device_id.value());
|
||||
if (!desc_status.ok()) {
|
||||
return desc_status.status();
|
||||
}
|
||||
auto desc = desc_status.ConsumeValueOrDie();
|
||||
|
||||
std::vector<TfGpuId> peer_gpu_ids;
|
||||
std::vector<TfDeviceId> peer_gpu_ids;
|
||||
peer_gpu_ids.reserve(num_tf_gpus);
|
||||
for (int id = 0; id < num_tf_gpus; ++id) {
|
||||
TfGpuId peer_tf_gpu_id(id);
|
||||
if (peer_tf_gpu_id != tf_gpu_id) {
|
||||
peer_gpu_ids.push_back(peer_tf_gpu_id);
|
||||
TfDeviceId peer_tf_device_id(id);
|
||||
if (peer_tf_device_id != tf_device_id) {
|
||||
peer_gpu_ids.push_back(peer_tf_device_id);
|
||||
}
|
||||
}
|
||||
|
||||
GPUProcessState* process_state = GPUProcessState::singleton();
|
||||
Allocator* gpu_allocator = process_state->GetGPUAllocator(
|
||||
options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
|
||||
options.config.gpu_options(), tf_device_id, memory_limit, peer_gpu_ids);
|
||||
if (gpu_allocator == nullptr) {
|
||||
return errors::Internal("Failed to get memory allocator for TF GPU ",
|
||||
tf_gpu_id.value(), " with ", memory_limit,
|
||||
tf_device_id.value(), " with ", memory_limit,
|
||||
" bytes of memory.");
|
||||
}
|
||||
absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
|
||||
@ -1441,7 +1449,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
|
||||
return errors::Internal("No allocator statistics");
|
||||
}
|
||||
// 'memory_limit' is the required memory size, but if the allocator with
|
||||
// given tf_gpu_id was created before, we'll use it instead of creating a
|
||||
// given tf_device_id was created before, we'll use it instead of creating a
|
||||
// new one (as TF gpu device is a shared resource), in which case the actual
|
||||
// memory limit represented by 'stats.bytes_limit' used by that allocator
|
||||
// may be different (which should be an error).
|
||||
@ -1451,11 +1459,11 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
|
||||
int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
|
||||
std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
|
||||
options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
|
||||
tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, *desc),
|
||||
tf_device_id, GetShortDeviceDescription(platform_device_id, *desc),
|
||||
gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
|
||||
LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
|
||||
<< (bytes_limit >> 20) << " MB memory) -> physical GPU ("
|
||||
<< GetShortDeviceDescription(platform_gpu_id, *desc) << ")";
|
||||
<< GetShortDeviceDescription(platform_device_id, *desc) << ")";
|
||||
TF_RETURN_IF_ERROR(gpu_device->Init(options));
|
||||
devices->push_back(std::move(gpu_device));
|
||||
|
||||
@ -1463,13 +1471,13 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
|
||||
}
|
||||
|
||||
namespace {
|
||||
std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>>
|
||||
std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
|
||||
GetPeerAccessMap(se::Platform* platform,
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
||||
std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map(
|
||||
new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>);
|
||||
for (PlatformGpuId platform_gpu_i : visible_gpu_order) {
|
||||
for (PlatformGpuId platform_gpu_j : visible_gpu_order) {
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||
std::unique_ptr<std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>>
|
||||
map(new std::map<std::pair<PlatformDeviceId, PlatformDeviceId>, bool>);
|
||||
for (PlatformDeviceId platform_gpu_i : visible_gpu_order) {
|
||||
for (PlatformDeviceId platform_gpu_j : visible_gpu_order) {
|
||||
se::StreamExecutor* from =
|
||||
DeviceIdUtil::ExecutorForPlatformDeviceId(platform, platform_gpu_i)
|
||||
.ValueOrDie();
|
||||
@ -1487,7 +1495,7 @@ GetPeerAccessMap(se::Platform* platform,
|
||||
} // namespace
|
||||
|
||||
Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
|
||||
// The default interconnect map is obtained from the StreamExecutor.
|
||||
auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
|
||||
@ -1495,8 +1503,8 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
||||
InterconnectMap& imap = maps->at(0);
|
||||
imap.name = "StreamExecutor";
|
||||
imap.strength = InterconnectMap::kStreamExecutorStrength;
|
||||
for (PlatformGpuId gpu_id_i : visible_gpu_order) {
|
||||
for (PlatformGpuId gpu_id_j : visible_gpu_order) {
|
||||
for (PlatformDeviceId gpu_id_i : visible_gpu_order) {
|
||||
for (PlatformDeviceId gpu_id_j : visible_gpu_order) {
|
||||
if (gpu_id_i == gpu_id_j) continue;
|
||||
if ((*access_map)[{gpu_id_i, gpu_id_j}]) {
|
||||
imap.directed_links.insert({gpu_id_i, gpu_id_j});
|
||||
@ -1509,21 +1517,21 @@ Status BaseGPUDeviceFactory::GetInterconnectMaps(
|
||||
Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
|
||||
LocalityMap* localities) {
|
||||
std::vector<TfGpuId> all_tf_gpu_ids;
|
||||
all_tf_gpu_ids.reserve(num_tf_gpus);
|
||||
std::vector<TfDeviceId> all_tf_device_ids;
|
||||
all_tf_device_ids.reserve(num_tf_gpus);
|
||||
for (int i = 0; i < num_tf_gpus; ++i) {
|
||||
all_tf_gpu_ids.push_back(TfGpuId(i));
|
||||
all_tf_device_ids.push_back(TfDeviceId(i));
|
||||
}
|
||||
for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
|
||||
PlatformGpuId platform_gpu_id;
|
||||
for (TfDeviceId tf_device_id : all_tf_device_ids) {
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||
// Get GPU bus_id from its reported NUMA affinity. Because GPUs are
|
||||
// virtualized in some environments, we can't just use the GPU id.
|
||||
// NUMA locales are indexed from 0, buses are indexed from 1.
|
||||
se::Platform* gpu_manager = GPUMachineManager();
|
||||
auto desc_status =
|
||||
gpu_manager->DescriptionForDevice(platform_gpu_id.value());
|
||||
gpu_manager->DescriptionForDevice(platform_device_id.value());
|
||||
if (!desc_status.ok()) {
|
||||
return desc_status.status();
|
||||
}
|
||||
@ -1537,7 +1545,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||
// trouble may manifest as slower than expected performance, or
|
||||
// outright failures.
|
||||
LOG(INFO) << "Could not identify NUMA node of platform GPU id "
|
||||
<< platform_gpu_id
|
||||
<< platform_device_id
|
||||
<< ", defaulting to 0. Your kernel may not have been built "
|
||||
<< "with NUMA support.";
|
||||
numa_node = 0;
|
||||
@ -1549,11 +1557,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||
// Set LocalLinks from InterconnectMaps.
|
||||
LocalLinks* links = dev_locality.mutable_links();
|
||||
for (const InterconnectMap& imap : interconnects) {
|
||||
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
|
||||
PlatformGpuId platform_gpu_dst;
|
||||
for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
|
||||
PlatformDeviceId platform_gpu_dst;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
|
||||
if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) !=
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
|
||||
if (imap.directed_links.find({platform_device_id, platform_gpu_dst}) !=
|
||||
imap.directed_links.end()) {
|
||||
InterconnectLink* ilink = links->add_link();
|
||||
ilink->set_device_id(tf_gpu_dst.value());
|
||||
@ -1565,12 +1573,12 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||
|
||||
// If this is one of multiple virtual GPUs on the same physical GPU
|
||||
// add high strength links to the others.
|
||||
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
|
||||
if (tf_gpu_id == tf_gpu_dst) continue;
|
||||
PlatformGpuId platform_gpu_dst;
|
||||
for (TfDeviceId tf_gpu_dst : all_tf_device_ids) {
|
||||
if (tf_device_id == tf_gpu_dst) continue;
|
||||
PlatformDeviceId platform_gpu_dst;
|
||||
TF_RETURN_IF_ERROR(
|
||||
GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst));
|
||||
if (platform_gpu_id == platform_gpu_dst) {
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_gpu_dst, &platform_gpu_dst));
|
||||
if (platform_device_id == platform_gpu_dst) {
|
||||
InterconnectLink* ilink = links->add_link();
|
||||
ilink->set_device_id(tf_gpu_dst.value());
|
||||
ilink->set_type("SAME_DEVICE");
|
||||
@ -1578,10 +1586,11 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||
}
|
||||
}
|
||||
|
||||
(*localities)[tf_gpu_id] = dev_locality;
|
||||
VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId "
|
||||
<< tf_gpu_id << " on bus " << dev_locality.bus_id()
|
||||
<< " numa: " << numa_node << " pci: " << desc->pci_bus_id()
|
||||
(*localities)[tf_device_id] = dev_locality;
|
||||
VLOG(1) << "GPUDevice PlatformDeviceId " << platform_device_id
|
||||
<< " TfDeviceId " << tf_device_id << " on bus "
|
||||
<< dev_locality.bus_id() << " numa: " << numa_node
|
||||
<< " pci: " << desc->pci_bus_id()
|
||||
<< " DeviceLocality: " << dev_locality.DebugString();
|
||||
}
|
||||
return Status::OK();
|
||||
@ -1589,7 +1598,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
|
||||
|
||||
static int GetDefaultMinGPUMultiprocessorCount(
|
||||
se::Platform* gpu_manager,
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||
static const int kDefaultMinGPUMultiprocessorCount = 8;
|
||||
|
||||
// Find the highest multi-processor count across all visible GPUs.
|
||||
@ -1614,7 +1623,7 @@ static int GetDefaultMinGPUMultiprocessorCount(
|
||||
|
||||
static int GetMinGPUMultiprocessorCount(
|
||||
se::Platform* gpu_manager,
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||
const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
|
||||
|
||||
if (tf_min_gpu_core_count == nullptr ||
|
||||
@ -1704,14 +1713,14 @@ std::vector<int> GetSupportedAMDGPUISAVersions() {
|
||||
} // namespace
|
||||
|
||||
Status BaseGPUDeviceFactory::EnablePeerAccess(
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order) {
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order) {
|
||||
se::Platform* gpu_manager = GPUMachineManager();
|
||||
int possible_peer_count = 0;
|
||||
int enabled_peer_count = 0;
|
||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||
const PlatformGpuId platform_gpu_i = visible_gpu_order[i];
|
||||
const PlatformDeviceId platform_gpu_i = visible_gpu_order[i];
|
||||
for (int j = 0; j < visible_gpu_order.size(); ++j) {
|
||||
const PlatformGpuId platform_gpu_j = visible_gpu_order[j];
|
||||
const PlatformDeviceId platform_gpu_j = visible_gpu_order[j];
|
||||
// We have already validated that ExecutorForDevice() calls return OK.
|
||||
se::StreamExecutor* from =
|
||||
DeviceIdUtil::ExecutorForPlatformDeviceId(gpu_manager, platform_gpu_i)
|
||||
@ -1748,8 +1757,8 @@ Status BaseGPUDeviceFactory::EnablePeerAccess(
|
||||
}
|
||||
|
||||
Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
||||
std::vector<PlatformGpuId>* ids) {
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||
std::vector<PlatformDeviceId>* ids) {
|
||||
se::Platform* gpu_manager = GPUMachineManager();
|
||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||
int visible_gpu_id = visible_gpu_order[i].value();
|
||||
@ -1834,7 +1843,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
||||
|
||||
// Filter out devices that don't have the right capability or power.
|
||||
for (int i = 0; i < visible_gpu_order.size(); ++i) {
|
||||
const PlatformGpuId visible_gpu_id = visible_gpu_order[i];
|
||||
const PlatformDeviceId visible_gpu_id = visible_gpu_order[i];
|
||||
auto description_status =
|
||||
gpu_manager->DescriptionForDevice(visible_gpu_id.value());
|
||||
if (!description_status.ok()) {
|
||||
@ -1904,7 +1913,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
|
||||
if (!ids->empty()) {
|
||||
std::vector<int> raw_ids(ids->size());
|
||||
std::transform(ids->begin(), ids->end(), raw_ids.begin(),
|
||||
[](PlatformGpuId id) -> int { return id.value(); });
|
||||
[](PlatformDeviceId id) -> int { return id.value(); });
|
||||
LOG(INFO) << "Adding visible gpu devices: " << absl::StrJoin(raw_ids, ", ");
|
||||
}
|
||||
|
||||
|
@ -53,7 +53,8 @@ class BaseGPUDevice : public LocalDevice {
|
||||
public:
|
||||
BaseGPUDevice(const SessionOptions& options, const std::string& name,
|
||||
Bytes memory_limit, const DeviceLocality& locality,
|
||||
TfGpuId tf_gpu_id, const std::string& physical_device_desc,
|
||||
TfDeviceId tf_device_id,
|
||||
const std::string& physical_device_desc,
|
||||
Allocator* gpu_allocator, Allocator* cpu_allocator,
|
||||
bool sync_every_op);
|
||||
|
||||
@ -87,9 +88,10 @@ class BaseGPUDevice : public LocalDevice {
|
||||
// Returns the platform GPU id of this device within the native driver system;
|
||||
// e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
|
||||
int gpu_id() const {
|
||||
PlatformGpuId platform_gpu_id;
|
||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
|
||||
return platform_gpu_id.value();
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_CHECK_OK(
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id));
|
||||
return platform_device_id.value();
|
||||
}
|
||||
|
||||
// The executor that provides control for the device; e.g., for CUDA this
|
||||
@ -146,7 +148,7 @@ class BaseGPUDevice : public LocalDevice {
|
||||
GPUDeviceContext* device_context_;
|
||||
GpuDeviceInfo* gpu_device_info_ = nullptr;
|
||||
mutex trace_mu_;
|
||||
TfGpuId tf_gpu_id_;
|
||||
TfDeviceId tf_device_id_;
|
||||
const bool sync_every_op_ = false;
|
||||
EventMgr* em_ = nullptr;
|
||||
std::unique_ptr<thread::ThreadPool> thread_pool_;
|
||||
@ -325,53 +327,56 @@ class BaseGPUDeviceFactory : public DeviceFactory {
|
||||
int32 strength;
|
||||
static const int kSameDeviceStrength;
|
||||
static const int kStreamExecutorStrength;
|
||||
std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links;
|
||||
std::set<std::pair<PlatformDeviceId, PlatformDeviceId>> directed_links;
|
||||
};
|
||||
|
||||
protected:
|
||||
// Populates *maps with interconnect maps for all local direct access
|
||||
// pathways between GPUs.
|
||||
virtual Status GetInterconnectMaps(
|
||||
const std::vector<PlatformGpuId>& visible_gpu_order,
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||
se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
|
||||
|
||||
struct TfGpuIdHash {
|
||||
std::size_t operator()(const TfGpuId& id) const noexcept {
|
||||
struct TfDeviceIdHash {
|
||||
std::size_t operator()(const TfDeviceId& id) const noexcept {
|
||||
return std::hash<int>{}(id.value());
|
||||
}
|
||||
};
|
||||
typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
|
||||
typedef std::unordered_map<TfDeviceId, DeviceLocality, TfDeviceIdHash>
|
||||
LocalityMap;
|
||||
// Populates *localities with the DeviceLocality descriptor for
|
||||
// every TfGpuId.
|
||||
// every TfDeviceId.
|
||||
virtual Status GetDeviceLocalities(
|
||||
int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
|
||||
LocalityMap* localities);
|
||||
|
||||
private:
|
||||
// Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
|
||||
// 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
|
||||
// vector.
|
||||
// Creates a BaseGPUDevice associated with 'tf_device_id', allocates
|
||||
// (strictly) 'memory_limit' bytes of GPU memory to it, and adds it to the
|
||||
// 'devices' vector.
|
||||
Status CreateGPUDevice(const SessionOptions& options,
|
||||
const std::string& name_prefix, TfGpuId tf_gpu_id,
|
||||
int64 memory_limit, const DeviceLocality& dev_locality,
|
||||
size_t num_tf_gpus,
|
||||
const std::string& name_prefix,
|
||||
TfDeviceId tf_device_id, int64 memory_limit,
|
||||
const DeviceLocality& dev_locality, size_t num_tf_gpus,
|
||||
std::vector<std::unique_ptr<Device>>* devices);
|
||||
|
||||
virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
|
||||
const SessionOptions& options, const string& name, Bytes memory_limit,
|
||||
const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
|
||||
const DeviceLocality& dev_locality, TfDeviceId tf_device_id,
|
||||
const string& physical_device_desc, Allocator* gpu_allocator,
|
||||
Allocator* cpu_allocator) = 0;
|
||||
|
||||
Status EnablePeerAccess(const std::vector<PlatformGpuId>& visible_gpu_order);
|
||||
Status EnablePeerAccess(
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order);
|
||||
|
||||
// Returns into 'ids' the list of valid platform GPU ids, in the order that
|
||||
// they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
|
||||
// based upon 'visible_gpu_order' which was generated by parsing
|
||||
// GPUOptions::visible_device_list which is a comma-separated list of CUDA or
|
||||
// ROCm GPU ids.
|
||||
Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
|
||||
std::vector<PlatformGpuId>* ids);
|
||||
Status GetValidDeviceIds(
|
||||
const std::vector<PlatformDeviceId>& visible_gpu_order,
|
||||
std::vector<PlatformDeviceId>* ids);
|
||||
|
||||
// Cache the valid device IDs if not already cached. Cached IDs are stored in
|
||||
// field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to
|
||||
@ -379,14 +384,14 @@ class BaseGPUDeviceFactory : public DeviceFactory {
|
||||
// devices should be treated as visible, like ListPhysicalDevices.
|
||||
Status CacheDeviceIds();
|
||||
|
||||
// visible_gpu_initialized_[platform_gpu_id] is true if visible GPU
|
||||
// platform_gpu_id has been initialized by the process.
|
||||
// visible_gpu_initialized_[platform_device_id] is true if visible GPU
|
||||
// platform_device_id has been initialized by the process.
|
||||
std::unordered_map<int, bool> visible_gpu_initialized_;
|
||||
|
||||
// Cached device IDs, as returned by GetValidDeviceIds when every physical
|
||||
// device is visible. Cache should not be used if some devices are not
|
||||
// visible.
|
||||
std::vector<PlatformGpuId> cached_device_ids_;
|
||||
std::vector<PlatformDeviceId> cached_device_ids_;
|
||||
};
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -30,9 +30,9 @@ class GPUDevice : public BaseGPUDevice {
|
||||
public:
|
||||
GPUDevice(const SessionOptions& options, const string& name,
|
||||
Bytes memory_limit, const DeviceLocality& locality,
|
||||
TfGpuId tf_gpu_id, const string& physical_device_desc,
|
||||
TfDeviceId tf_device_id, const string& physical_device_desc,
|
||||
Allocator* gpu_allocator, Allocator* cpu_allocator)
|
||||
: BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
|
||||
: BaseGPUDevice(options, name, memory_limit, locality, tf_device_id,
|
||||
physical_device_desc, gpu_allocator, cpu_allocator,
|
||||
false /* sync every op */) {
|
||||
if (options.config.has_gpu_options()) {
|
||||
@ -63,11 +63,11 @@ class GPUDeviceFactory : public BaseGPUDeviceFactory {
|
||||
private:
|
||||
std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
|
||||
const SessionOptions& options, const string& name, Bytes memory_limit,
|
||||
const DeviceLocality& locality, TfGpuId tf_gpu_id,
|
||||
const DeviceLocality& locality, TfDeviceId tf_device_id,
|
||||
const string& physical_device_desc, Allocator* gpu_allocator,
|
||||
Allocator* cpu_allocator) override {
|
||||
return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
|
||||
tf_gpu_id, physical_device_desc,
|
||||
tf_device_id, physical_device_desc,
|
||||
gpu_allocator, cpu_allocator);
|
||||
}
|
||||
};
|
||||
|
@ -30,7 +30,7 @@ namespace tensorflow {
|
||||
namespace {
|
||||
const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
|
||||
|
||||
int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
|
||||
int64 GetTotalGPUMemory(PlatformDeviceId gpu_id) {
|
||||
se::StreamExecutor* se =
|
||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||
.ValueOrDie();
|
||||
@ -40,7 +40,7 @@ int64 GetTotalGPUMemory(PlatformGpuId gpu_id) {
|
||||
return total_memory;
|
||||
}
|
||||
|
||||
Status GetComputeCapability(PlatformGpuId gpu_id, int* cc_major,
|
||||
Status GetComputeCapability(PlatformDeviceId gpu_id, int* cc_major,
|
||||
int* cc_minor) {
|
||||
se::StreamExecutor* se =
|
||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||
@ -350,7 +350,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
|
||||
// error.
|
||||
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
|
||||
int cc_major, cc_minor;
|
||||
TF_ASSERT_OK(GetComputeCapability(PlatformGpuId(0), &cc_major, &cc_minor));
|
||||
TF_ASSERT_OK(GetComputeCapability(PlatformDeviceId(0), &cc_major, &cc_minor));
|
||||
// Exit early while running on Pascal or later GPUs.
|
||||
if (cc_major >= 6) {
|
||||
return;
|
||||
@ -371,10 +371,10 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
|
||||
// more memory than what is available on the device.
|
||||
TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
|
||||
static constexpr double kGpuMemoryFraction = 1.2;
|
||||
static constexpr PlatformGpuId kPlatformGpuId(0);
|
||||
static constexpr PlatformDeviceId kPlatformDeviceId(0);
|
||||
|
||||
int cc_major, cc_minor;
|
||||
TF_ASSERT_OK(GetComputeCapability(kPlatformGpuId, &cc_major, &cc_minor));
|
||||
TF_ASSERT_OK(GetComputeCapability(kPlatformDeviceId, &cc_major, &cc_minor));
|
||||
// Exit early if running on pre-Pascal GPUs.
|
||||
if (cc_major < 6) {
|
||||
LOG(INFO)
|
||||
@ -389,8 +389,9 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
|
||||
ASSERT_EQ(1, devices.size());
|
||||
|
||||
int64 memory_limit = devices[0]->attributes().memory_limit();
|
||||
ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kPlatformGpuId) *
|
||||
kGpuMemoryFraction));
|
||||
ASSERT_EQ(memory_limit,
|
||||
static_cast<int64>(GetTotalGPUMemory(kPlatformDeviceId) *
|
||||
kGpuMemoryFraction));
|
||||
|
||||
AllocatorAttributes allocator_attributes = AllocatorAttributes();
|
||||
allocator_attributes.set_gpu_compatible(true);
|
||||
|
@ -17,13 +17,6 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/core/common_runtime/device/device_id.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// TODO(annarev): remove these aliases after all references are updated
|
||||
// to use device ids.
|
||||
typedef TfDeviceId TfGpuId;
|
||||
typedef PlatformDeviceId PlatformGpuId;
|
||||
|
||||
} // namespace tensorflow
|
||||
// TODO(sanjoy): Delete the header and forward the references.
|
||||
|
||||
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
|
||||
|
@ -20,16 +20,16 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
Status GpuIdManager::InsertTfPlatformGpuIdPair(
|
||||
TfDeviceId tf_gpu_id, PlatformDeviceId platform_gpu_id) {
|
||||
return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_gpu_id,
|
||||
platform_gpu_id);
|
||||
Status GpuIdManager::InsertTfPlatformDeviceIdPair(
|
||||
TfDeviceId tf_device_id, PlatformDeviceId platform_device_id) {
|
||||
return DeviceIdManager::InsertTfPlatformDeviceIdPair(DEVICE_GPU, tf_device_id,
|
||||
platform_device_id);
|
||||
}
|
||||
|
||||
Status GpuIdManager::TfToPlatformGpuId(TfDeviceId tf_gpu_id,
|
||||
PlatformDeviceId* platform_gpu_id) {
|
||||
return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_gpu_id,
|
||||
platform_gpu_id);
|
||||
Status GpuIdManager::TfToPlatformDeviceId(
|
||||
TfDeviceId tf_device_id, PlatformDeviceId* platform_device_id) {
|
||||
return DeviceIdManager::TfToPlatformDeviceId(DEVICE_GPU, tf_device_id,
|
||||
platform_device_id);
|
||||
}
|
||||
|
||||
void GpuIdManager::TestOnlyReset() { DeviceIdManager::TestOnlyReset(); }
|
||||
|
@ -21,17 +21,18 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// Class that maintains a map from TfGpuId to PlatformGpuId, and manages the
|
||||
// translation between them.
|
||||
// Class that maintains a map from TfDeviceId to PlatformDeviceId, and manages
|
||||
// the translation between them.
|
||||
class GpuIdManager {
|
||||
public:
|
||||
// Adds a mapping from tf_gpu_id to platform_gpu_id.
|
||||
static Status InsertTfPlatformGpuIdPair(TfDeviceId tf_gpu_id,
|
||||
PlatformDeviceId platform_gpu_id);
|
||||
// Adds a mapping from tf_device_id to platform_device_id.
|
||||
static Status InsertTfPlatformDeviceIdPair(
|
||||
TfDeviceId tf_device_id, PlatformDeviceId platform_device_id);
|
||||
|
||||
// Gets the platform_gpu_id associated with tf_gpu_id. Returns OK if found.
|
||||
static Status TfToPlatformGpuId(TfDeviceId tf_gpu_id,
|
||||
PlatformDeviceId* platform_gpu_id);
|
||||
// Gets the platform_device_id associated with tf_device_id. Returns OK if
|
||||
// found.
|
||||
static Status TfToPlatformDeviceId(TfDeviceId tf_device_id,
|
||||
PlatformDeviceId* platform_device_id);
|
||||
|
||||
// Clears the map. Used in unit tests only.
|
||||
static void TestOnlyReset();
|
||||
|
@ -83,10 +83,10 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
|
||||
process_state_ = ProcessState::singleton();
|
||||
}
|
||||
|
||||
int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
|
||||
int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
|
||||
// Return the NUMA node associated with the GPU's StreamExecutor.
|
||||
se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
|
||||
DEVICE_GPU, GPUMachineManager(), tf_gpu_id)
|
||||
DEVICE_GPU, GPUMachineManager(), tf_device_id)
|
||||
.ValueOrDie();
|
||||
int numa_node = se->GetDeviceDescription().numa_node();
|
||||
// bus_id must be non-negative. If the numa_node is not known,
|
||||
@ -96,11 +96,11 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
|
||||
|
||||
// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
|
||||
static SubAllocator* CreateSubAllocator(
|
||||
const GPUOptions& options, PlatformGpuId platform_gpu_id,
|
||||
const GPUOptions& options, PlatformDeviceId platform_device_id,
|
||||
const std::vector<SubAllocator::Visitor>& alloc_visitors,
|
||||
size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
|
||||
size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
|
||||
auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
|
||||
platform_gpu_id)
|
||||
platform_device_id)
|
||||
.ValueOrDie();
|
||||
|
||||
// FIXME(imintz): Observed OOM issues when using the virtual memory
|
||||
@ -110,21 +110,21 @@ static SubAllocator* CreateSubAllocator(
|
||||
// TODO(imintz): Remove the cuMemAlloc capability of this allocator.
|
||||
if (options.per_process_gpu_memory_fraction() > 1.0 ||
|
||||
options.experimental().use_unified_memory()) {
|
||||
return new DeviceMemAllocator(executor, platform_gpu_id,
|
||||
return new DeviceMemAllocator(executor, platform_device_id,
|
||||
/*use_unified_memory=*/true, alloc_visitors,
|
||||
{});
|
||||
} else {
|
||||
auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
|
||||
executor->implementation()->GpuContextHack());
|
||||
|
||||
absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
|
||||
absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
|
||||
platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
|
||||
for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
|
||||
PlatformGpuId platform_gpu_id;
|
||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
||||
platform_peer_gpu_ids.insert(platform_gpu_id);
|
||||
for (const TfDeviceId tf_device_id : peer_gpu_ids) {
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||
platform_peer_gpu_ids.insert(platform_device_id);
|
||||
}
|
||||
std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
|
||||
std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
|
||||
platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
|
||||
|
||||
// Adjust virtual address space to be slightly larger than the physical
|
||||
@ -133,7 +133,7 @@ static SubAllocator* CreateSubAllocator(
|
||||
// TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
|
||||
// the va space.
|
||||
return GpuVirtualMemAllocator::Create(
|
||||
alloc_visitors, {}, *gpu_context, platform_gpu_id,
|
||||
alloc_visitors, {}, *gpu_context, platform_device_id,
|
||||
/*virtual_address_space_size=*/total_bytes * 2,
|
||||
platform_peer_gpu_ids_vec)
|
||||
.ValueOrDie()
|
||||
@ -141,7 +141,7 @@ static SubAllocator* CreateSubAllocator(
|
||||
}
|
||||
#else
|
||||
return new DeviceMemAllocator(
|
||||
executor, platform_gpu_id,
|
||||
executor, platform_device_id,
|
||||
(options.per_process_gpu_memory_fraction() > 1.0 ||
|
||||
options.experimental().use_unified_memory()),
|
||||
alloc_visitors, {});
|
||||
@ -149,21 +149,21 @@ static SubAllocator* CreateSubAllocator(
|
||||
}
|
||||
|
||||
Allocator* GPUProcessState::GetGPUAllocator(
|
||||
const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
|
||||
const std::vector<TfGpuId>& peer_gpu_ids) {
|
||||
const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
|
||||
const std::vector<TfDeviceId>& peer_gpu_ids) {
|
||||
CHECK(process_state_);
|
||||
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
|
||||
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
|
||||
const string& allocator_type = options.allocator_type();
|
||||
mutex_lock lock(mu_);
|
||||
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||
tf_gpu_id);
|
||||
tf_device_id);
|
||||
|
||||
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
||||
gpu_allocators_.resize(tf_gpu_id.value() + 1);
|
||||
if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
||||
gpu_allocators_.resize(tf_device_id.value() + 1);
|
||||
}
|
||||
|
||||
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
|
||||
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
|
||||
if (allocator_parts.allocator == nullptr) {
|
||||
// Validate allocator types.
|
||||
if (!allocator_type.empty() && allocator_type != "BFC") {
|
||||
@ -171,19 +171,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
PlatformGpuId platform_gpu_id;
|
||||
TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
|
||||
int bus_id = BusIdForGPU(tf_gpu_id);
|
||||
PlatformDeviceId platform_device_id;
|
||||
TF_CHECK_OK(
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
|
||||
int bus_id = BusIdForGPU(tf_device_id);
|
||||
DCHECK_GE(bus_id, 0);
|
||||
while (bus_id >= gpu_visitors_.size()) {
|
||||
gpu_visitors_.push_back({});
|
||||
}
|
||||
auto* sub_allocator =
|
||||
CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
|
||||
CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
|
||||
total_bytes, peer_gpu_ids);
|
||||
GPUBFCAllocator* gpu_bfc_allocator =
|
||||
new GPUBFCAllocator(sub_allocator, total_bytes, options,
|
||||
strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
|
||||
GPUBFCAllocator* gpu_bfc_allocator = new GPUBFCAllocator(
|
||||
sub_allocator, total_bytes, options,
|
||||
strings::StrCat("GPU_", tf_device_id.value(), "_bfc"));
|
||||
Allocator* gpu_allocator = gpu_bfc_allocator;
|
||||
SharedCounter* timing_counter = nullptr;
|
||||
if (options.experimental().timestamped_allocator()) {
|
||||
@ -195,29 +196,30 @@ Allocator* GPUProcessState::GetGPUAllocator(
|
||||
// distinctive patterns on both ends of allocated memory.
|
||||
if (UseCudaMemoryGuardAllocator()) {
|
||||
LOG(INFO) << "Using memory guard allocator for GPU.";
|
||||
gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
|
||||
gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
|
||||
gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_device_id);
|
||||
gpu_allocator =
|
||||
new GPUNanResetAllocator(gpu_allocator, platform_device_id);
|
||||
} else if (UseCudaMallocAllocator()) {
|
||||
LOG(INFO) << "Using CUDA malloc allocator for GPU.";
|
||||
// If true, passes all allocation requests through to cudaMalloc
|
||||
// useful for doing memory debugging with tools like cuda-memcheck
|
||||
// **WARNING** probably will not work in a multi-gpu scenario
|
||||
gpu_allocator =
|
||||
new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
|
||||
new GPUcudaMallocAllocator(gpu_allocator, platform_device_id);
|
||||
} else if (UseCudaMallocAsyncAllocator()) {
|
||||
LOG(INFO) << "Using CUDA malloc Async allocator for GPU.";
|
||||
// If true, passes all allocation requests through to cudaMallocAsync
|
||||
// TODO: useful for doing memory debugging with tools like cuda-memcheck
|
||||
// TODO: **WARNING** probably will not work in a multi-gpu scenario
|
||||
gpu_allocator =
|
||||
new GpuCudaMallocAsyncAllocator(platform_gpu_id, total_bytes);
|
||||
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
|
||||
}
|
||||
|
||||
Allocator* recording_allocator = nullptr;
|
||||
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
|
||||
ProcessState::MemDesc md;
|
||||
md.loc = ProcessState::MemDesc::GPU;
|
||||
md.dev_index = platform_gpu_id.value();
|
||||
md.dev_index = platform_device_id.value();
|
||||
md.gpu_registered = false;
|
||||
md.nic_registered = true;
|
||||
recording_allocator = new internal::RecordingAllocator(
|
||||
@ -240,20 +242,20 @@ Allocator* GPUProcessState::GetGPUAllocator(
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
|
||||
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
|
||||
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
|
||||
DCHECK(process_state_);
|
||||
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
|
||||
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
|
||||
DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||
tf_gpu_id);
|
||||
tf_device_id);
|
||||
mutex_lock l(mu_);
|
||||
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
||||
LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
|
||||
if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
|
||||
LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
|
||||
<< " but only have " << gpu_allocators_.size();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
|
||||
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
|
||||
if (allocator_parts.counter.get() == nullptr) {
|
||||
SharedCounter* timing_counter = new SharedCounter;
|
||||
allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
|
||||
@ -303,7 +305,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
|
||||
for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
|
||||
if (gpu_allocators_[i].allocator != nullptr) {
|
||||
se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
|
||||
TfGpuId(i))
|
||||
TfDeviceId(i))
|
||||
.ValueOrDie();
|
||||
break;
|
||||
}
|
||||
|
@ -72,18 +72,18 @@ class GPUProcessState {
|
||||
//
|
||||
// 'total_bytes' is the total number of bytes that should be made
|
||||
// available to the allocator. The first call to this function for
|
||||
// a given tf_gpu_id creates the allocator, so only the total_bytes
|
||||
// a given tf_device_id creates the allocator, so only the total_bytes
|
||||
// used on that first call is used.
|
||||
//
|
||||
// "Allocator type" describes the type of algorithm to use for the
|
||||
// underlying allocator. REQUIRES: Must be a valid type (see
|
||||
// config.proto for the list of supported strings.).
|
||||
//
|
||||
// REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
|
||||
// current system environment. Otherwise returns nullptr.
|
||||
virtual Allocator* GetGPUAllocator(const GPUOptions& options,
|
||||
TfGpuId tf_gpu_id, size_t total_bytes,
|
||||
const std::vector<TfGpuId>& peer_gpu_ids);
|
||||
// REQUIRES: tf_device_id must be a valid id for a BaseGPUDevice available in
|
||||
// the current system environment. Otherwise returns nullptr.
|
||||
virtual Allocator* GetGPUAllocator(
|
||||
const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
|
||||
const std::vector<TfDeviceId>& peer_gpu_ids);
|
||||
|
||||
int NumGPUAllocators() {
|
||||
mutex_lock l(mu_);
|
||||
@ -115,9 +115,9 @@ class GPUProcessState {
|
||||
const SubAllocator::Visitor& visitor);
|
||||
|
||||
// Returns bus_id for the given GPU id.
|
||||
virtual int BusIdForGPU(TfGpuId tf_gpu_id);
|
||||
virtual int BusIdForGPU(TfDeviceId tf_device_id);
|
||||
|
||||
SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
|
||||
SharedCounter* GPUAllocatorCounter(TfDeviceId tf_device_id);
|
||||
|
||||
protected:
|
||||
// GPUProcessState is a singleton that should not normally be deleted except
|
||||
|
@ -44,7 +44,7 @@ StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
|
||||
}
|
||||
|
||||
Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
|
||||
PlatformGpuId gpu_id) {
|
||||
PlatformDeviceId gpu_id) {
|
||||
TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
|
||||
SupportsVirtualAddressManagement(device));
|
||||
if (!supports_virtual_address_management) {
|
||||
@ -59,11 +59,11 @@ Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
|
||||
|
||||
/* static */ stream_executor::port::StatusOr<
|
||||
std::unique_ptr<GpuVirtualMemAllocator>>
|
||||
GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
|
||||
const std::vector<Visitor>& free_visitors,
|
||||
GpuContext& gpu_context, PlatformGpuId gpu_id,
|
||||
size_t virtual_address_space_size,
|
||||
const std::vector<PlatformGpuId>& peer_gpu_ids) {
|
||||
GpuVirtualMemAllocator::Create(
|
||||
const std::vector<Visitor>& alloc_visitors,
|
||||
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
|
||||
PlatformDeviceId gpu_id, size_t virtual_address_space_size,
|
||||
const std::vector<PlatformDeviceId>& peer_gpu_ids) {
|
||||
std::vector<GpuDeviceHandle> access_gpu_handles;
|
||||
access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
|
||||
|
||||
@ -111,7 +111,8 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
|
||||
GpuVirtualMemAllocator::GpuVirtualMemAllocator(
|
||||
const std::vector<Visitor>& alloc_visitors,
|
||||
const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
|
||||
PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
|
||||
PlatformDeviceId gpu_id,
|
||||
const std::vector<GpuDeviceHandle> access_gpu_handles,
|
||||
GpuDriver::VmemSpan vmem, size_t granularity)
|
||||
: SubAllocator(alloc_visitors, free_visitors),
|
||||
gpu_context_(gpu_context),
|
||||
|
@ -44,9 +44,9 @@ class GpuVirtualMemAllocator : public SubAllocator {
|
||||
std::unique_ptr<GpuVirtualMemAllocator>>
|
||||
Create(const std::vector<Visitor>& alloc_visitors,
|
||||
const std::vector<Visitor>& free_visitors,
|
||||
stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
|
||||
stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
|
||||
size_t virtual_address_space_size,
|
||||
const std::vector<PlatformGpuId>& peer_gpu_ids);
|
||||
const std::vector<PlatformDeviceId>& peer_gpu_ids);
|
||||
~GpuVirtualMemAllocator() override;
|
||||
|
||||
// Allocates memory at least as large as requested by num_bytes. Will be
|
||||
@ -74,12 +74,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
|
||||
GpuVirtualMemAllocator(
|
||||
const std::vector<Visitor>& alloc_visitors,
|
||||
const std::vector<Visitor>& free_visitors,
|
||||
stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
|
||||
stream_executor::gpu::GpuContext& gpu_context, PlatformDeviceId gpu_id,
|
||||
std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
|
||||
stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
|
||||
|
||||
stream_executor::gpu::GpuContext& gpu_context_;
|
||||
PlatformGpuId gpu_id_;
|
||||
PlatformDeviceId gpu_id_;
|
||||
|
||||
// Peer access is configured at mmap time so the allocator must be aware of
|
||||
// all gpus that may want to read the memory. This list also includes the
|
||||
|
@ -35,7 +35,7 @@ constexpr size_t k2MiB{2 << 20};
|
||||
|
||||
// Creates an allocator with 8 MiB of virtual address space.
|
||||
std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
|
||||
PlatformGpuId gpu_id(0);
|
||||
PlatformDeviceId gpu_id(0);
|
||||
auto executor =
|
||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||
.ValueOrDie();
|
||||
@ -48,7 +48,7 @@ std::unique_ptr<GpuVirtualMemAllocator> CreateAllocator() {
|
||||
}
|
||||
|
||||
TEST(GpuVirtualMemAllocatorTest, SimpleAlloc) {
|
||||
PlatformGpuId gpu_id(0);
|
||||
PlatformDeviceId gpu_id(0);
|
||||
auto executor =
|
||||
DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
|
||||
.ValueOrDie();
|
||||
|
@ -92,14 +92,15 @@ Status SingleMachine::Provision() {
|
||||
return errors::InvalidArgument(
|
||||
strings::StrCat("Not able to parse GPU device name: ", dev.name()));
|
||||
}
|
||||
TfGpuId tf_gpu_id(parsed.id);
|
||||
PlatformGpuId platform_gpu_id;
|
||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
||||
TfDeviceId tf_device_id(parsed.id);
|
||||
PlatformDeviceId platform_device_id;
|
||||
Status s =
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||
if (!s.ok()) {
|
||||
return errors::Unavailable("Unknown TF GPU device with id ",
|
||||
tf_gpu_id.value(), ": ", s.ToString());
|
||||
tf_device_id.value(), ": ", s.ToString());
|
||||
}
|
||||
attr = GetLocalGPUInfo(platform_gpu_id);
|
||||
attr = GetLocalGPUInfo(platform_device_id);
|
||||
} else if (dev.device_type().find("XLA") == string::npos) {
|
||||
// Filter out the fake XLA devices to avoid double counting the actual
|
||||
// hardware resources that are available.
|
||||
|
@ -74,14 +74,14 @@ DeviceProperties GetLocalCPUInfo() {
|
||||
return device;
|
||||
}
|
||||
|
||||
DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
|
||||
DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id) {
|
||||
DeviceProperties device;
|
||||
device.set_type("GPU");
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
cudaDeviceProp properties;
|
||||
cudaError_t error =
|
||||
cudaGetDeviceProperties(&properties, platform_gpu_id.value());
|
||||
cudaGetDeviceProperties(&properties, platform_device_id.value());
|
||||
if (error != cudaSuccess) {
|
||||
device.set_type("UNKNOWN");
|
||||
LOG(ERROR) << "Failed to get device properties, error code: " << error;
|
||||
@ -117,7 +117,7 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
hipDeviceProp_t properties;
|
||||
hipError_t error =
|
||||
hipGetDeviceProperties(&properties, platform_gpu_id.value());
|
||||
hipGetDeviceProperties(&properties, platform_device_id.value());
|
||||
if (error != hipSuccess) {
|
||||
device.set_type("UNKNOWN");
|
||||
LOG(ERROR) << "Failed to get device properties, error code: " << error;
|
||||
@ -156,16 +156,17 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
|
||||
return GetLocalCPUInfo();
|
||||
} else if (device.type == "GPU") {
|
||||
if (device.has_id) {
|
||||
TfGpuId tf_gpu_id(device.id);
|
||||
PlatformGpuId platform_gpu_id;
|
||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
||||
TfDeviceId tf_device_id(device.id);
|
||||
PlatformDeviceId platform_device_id;
|
||||
Status s =
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||
if (!s.ok()) {
|
||||
LOG(ERROR) << s;
|
||||
return unknown;
|
||||
}
|
||||
return GetLocalGPUInfo(platform_gpu_id);
|
||||
return GetLocalGPUInfo(platform_device_id);
|
||||
} else {
|
||||
return GetLocalGPUInfo(PlatformGpuId(0));
|
||||
return GetLocalGPUInfo(PlatformDeviceId(0));
|
||||
}
|
||||
}
|
||||
return unknown;
|
||||
|
@ -28,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
|
||||
|
||||
// Returns the DeviceProperties for the specified GPU attached to the server on
|
||||
// which grappler is running.
|
||||
DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id);
|
||||
DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id);
|
||||
|
||||
// Returns the DeviceProperties of the specified device
|
||||
DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
|
||||
|
@ -33,11 +33,11 @@ TEST(UtilsTest, GetLocalGPUInfo) {
|
||||
DeviceProperties properties;
|
||||
|
||||
// Invalid platform GPU ID.
|
||||
properties = GetLocalGPUInfo(PlatformGpuId(100));
|
||||
properties = GetLocalGPUInfo(PlatformDeviceId(100));
|
||||
EXPECT_EQ("UNKNOWN", properties.type());
|
||||
|
||||
// Succeed when a valid platform GPU id was inserted.
|
||||
properties = GetLocalGPUInfo(PlatformGpuId(0));
|
||||
properties = GetLocalGPUInfo(PlatformDeviceId(0));
|
||||
EXPECT_EQ("GPU", properties.type());
|
||||
EXPECT_EQ("NVIDIA", properties.vendor());
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
@ -45,21 +45,21 @@ TEST(UtilsTest, GetLocalGPUInfo) {
|
||||
DeviceProperties properties;
|
||||
|
||||
// Invalid platform GPU ID.
|
||||
properties = GetLocalGPUInfo(PlatformGpuId(100));
|
||||
properties = GetLocalGPUInfo(PlatformDeviceId(100));
|
||||
EXPECT_EQ("UNKNOWN", properties.type());
|
||||
|
||||
// Succeed when a valid platform GPU id was inserted.
|
||||
properties = GetLocalGPUInfo(PlatformGpuId(0));
|
||||
properties = GetLocalGPUInfo(PlatformDeviceId(0));
|
||||
EXPECT_EQ("GPU", properties.type());
|
||||
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
|
||||
#else
|
||||
LOG(INFO) << "CUDA is not enabled.";
|
||||
DeviceProperties properties;
|
||||
|
||||
properties = GetLocalGPUInfo(PlatformGpuId(0));
|
||||
properties = GetLocalGPUInfo(PlatformDeviceId(0));
|
||||
EXPECT_EQ("GPU", properties.type());
|
||||
|
||||
properties = GetLocalGPUInfo(PlatformGpuId(100));
|
||||
properties = GetLocalGPUInfo(PlatformDeviceId(100));
|
||||
EXPECT_EQ("GPU", properties.type());
|
||||
#endif
|
||||
}
|
||||
@ -97,14 +97,14 @@ TEST(UtilsTest, GetDeviceInfo) {
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
// Invalid platform GPU id.
|
||||
TF_ASSERT_OK(
|
||||
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
|
||||
TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(
|
||||
TfDeviceId(0), PlatformDeviceId(100)));
|
||||
properties = GetDeviceInfo(device);
|
||||
EXPECT_EQ("UNKNOWN", properties.type());
|
||||
|
||||
// Valid platform GPU id.
|
||||
TF_ASSERT_OK(
|
||||
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
|
||||
TF_ASSERT_OK(GpuIdManager::InsertTfPlatformDeviceIdPair(TfDeviceId(1),
|
||||
PlatformDeviceId(0)));
|
||||
device.id = 1;
|
||||
properties = GetDeviceInfo(device);
|
||||
EXPECT_EQ("GPU", properties.type());
|
||||
|
@ -241,14 +241,15 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
|
||||
DeviceNameUtils::ParsedName parsed;
|
||||
if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
|
||||
if (parsed.type == "GPU") {
|
||||
TfGpuId tf_gpu_id(parsed.id);
|
||||
PlatformGpuId platform_gpu_id;
|
||||
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
|
||||
TfDeviceId tf_device_id(parsed.id);
|
||||
PlatformDeviceId platform_device_id;
|
||||
Status s =
|
||||
GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
|
||||
if (!s.ok()) {
|
||||
// We are probably running simulation without linking cuda libraries.
|
||||
platform_gpu_id = PlatformGpuId(parsed.id);
|
||||
platform_device_id = PlatformDeviceId(parsed.id);
|
||||
}
|
||||
return GetLocalGPUInfo(platform_gpu_id);
|
||||
return GetLocalGPUInfo(platform_device_id);
|
||||
} else if (parsed.type == "CPU") {
|
||||
return GetLocalCPUInfo();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user