Fix EagerContext::HostCPU().

The EagerContext::HostCPU method was, as far as I can tell, returning
a random device (at least when the device manager was dynamic): the
variable is initialized with the first element not only of an
unordered map (where stored devices were in random order), but an
unordered map having pointers for keys, making it even more random. As
a result, that function could return GPU devices at will - and did do
that in a test for an unrelated change I am working on.

PiperOrigin-RevId: 292583954
Change-Id: I0851d8b1a911f4044171b061ba744ee951dbd6fd
This commit is contained in:
A. Unique TensorFlower 2020-01-31 11:32:26 -08:00 committed by TensorFlower Gardener
parent 2d1f922fa0
commit 20f9bb5ba4
7 changed files with 177 additions and 13 deletions

View File

@ -3134,6 +3134,7 @@ tf_cc_tests(
"common_runtime/buf_rendezvous_test.cc",
"common_runtime/collective_executor_mgr_test.cc",
"common_runtime/collective_rma_local_test.cc",
"common_runtime/device_mgr_test.cc",
"common_runtime/device_resolver_local_test.cc",
"common_runtime/device_set_test.cc",
"common_runtime/dynamic_device_mgr_test.cc",

View File

@ -17,6 +17,7 @@ limitations under the License.
#include <memory>
#include <vector>
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/lib/core/errors.h"
@ -28,7 +29,9 @@ namespace tensorflow {
DeviceMgr::~DeviceMgr() {}
StaticDeviceMgr::StaticDeviceMgr(std::vector<std::unique_ptr<Device>> devices)
: devices_(std::move(devices)), name_backing_store_(128) {
: devices_(std::move(devices)),
name_backing_store_(128),
cpu_device_(nullptr) {
for (auto& d : devices_) {
// Register under the (1) full name and (2) canonical name.
for (const string& name :
@ -40,7 +43,11 @@ StaticDeviceMgr::StaticDeviceMgr(std::vector<std::unique_ptr<Device>> devices)
DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
device_map_[CopyToBackingStore(name)] = d.get();
}
device_type_counts_[d->device_type()]++;
const auto& t = d->device_type();
device_type_counts_[t]++;
if (cpu_device_ == nullptr && t == "CPU") {
cpu_device_ = d.get();
}
}
}
@ -140,4 +147,6 @@ int StaticDeviceMgr::NumDeviceType(const string& type) const {
return 0;
}
Device* StaticDeviceMgr::HostCPU() const { return cpu_device_; }
} // namespace tensorflow

View File

@ -62,6 +62,10 @@ class DeviceMgr {
virtual int NumDeviceType(const string& type) const = 0;
// Returns an arbitrary CPU device if one is present, otherwise return
// nullptr.
virtual Device* HostCPU() const = 0;
TF_DISALLOW_COPY_AND_ASSIGN(DeviceMgr);
};
@ -84,6 +88,7 @@ class StaticDeviceMgr : public DeviceMgr {
Status LookupDevice(StringPiece name, Device** device) const override;
void ClearContainers(gtl::ArraySlice<string> containers) const override;
int NumDeviceType(const string& type) const override;
Device* HostCPU() const override;
private:
const std::vector<std::unique_ptr<Device>> devices_;
@ -93,6 +98,7 @@ class StaticDeviceMgr : public DeviceMgr {
std::unordered_map<StringPiece, Device*, StringPieceHasher> device_map_;
core::Arena name_backing_store_; // Storage for keys in device_map_
std::unordered_map<string, int> device_type_counts_;
Device* cpu_device_;
TF_DISALLOW_COPY_AND_ASSIGN(StaticDeviceMgr);
};
@ -101,7 +107,7 @@ class StaticDeviceMgr : public DeviceMgr {
class DynamicDeviceMgr : public DeviceMgr {
public:
// Constructs an empty DynamicDeviceMgr.
DynamicDeviceMgr() {}
DynamicDeviceMgr();
~DynamicDeviceMgr() override;
@ -113,15 +119,19 @@ class DynamicDeviceMgr : public DeviceMgr {
Status LookupDevice(StringPiece name, Device** device) const override;
void ClearContainers(gtl::ArraySlice<string> containers) const override;
int NumDeviceType(const string& type) const override;
Device* HostCPU() const override;
// Add devices to device manager. Returns error for repeated device names.
Status AddDevices(std::vector<std::unique_ptr<Device>> devices);
// Remove devices from device manager. Returns error for non-existing devices.
// Remove devices from device manager.
// Returns error for non-existing devices or if the HostCPU() device is in the
// input list. If an error is returned, the device list is not modified.
Status RemoveDevices(std::vector<Device*> devices);
// Remove devices from device manager by their names. Returns error for
// non-existing devices.
// non-existing devices or if the HostCPU() device is given in the input list.
// If an error is returned, the device list is not modified.
Status RemoveDevicesByName(const std::vector<string>& device_names);
private:
@ -134,6 +144,8 @@ class DynamicDeviceMgr : public DeviceMgr {
std::unordered_map<string, int> device_type_counts_ GUARDED_BY(devices_mu_);
mutable Device* cpu_device_ GUARDED_BY(devices_mu_);
TF_DISALLOW_COPY_AND_ASSIGN(DynamicDeviceMgr);
};
} // namespace tensorflow

View File

@ -0,0 +1,67 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/device_mgr.h"
#include <memory>
#include <vector>
#include "absl/memory/memory.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/util/ptr_util.h"
namespace tensorflow {
namespace {
// Return a fake device with the specified type and name.
static Device* CreateDevice(const char* type, const char* name) {
class FakeDevice : public Device {
public:
explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
Status Sync() override { return Status::OK(); }
Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; }
};
DeviceAttributes attr;
attr.set_name(name);
attr.set_device_type(type);
return new FakeDevice(attr);
}
TEST(StaticDeviceMgr, NoCPUDevice) {
std::unique_ptr<Device> d0(CreateDevice("GPU", "/device:GPU:0"));
std::unique_ptr<Device> d1(CreateDevice("GPU", "/device:GPU:1"));
std::vector<std::unique_ptr<Device>> devices;
devices.emplace_back(std::move(d0));
devices.emplace_back(std::move(d1));
StaticDeviceMgr lm(std::move(devices));
EXPECT_EQ(lm.HostCPU(), nullptr);
}
TEST(StaticDeviceMgr, SomeCPUDevice) {
std::unique_ptr<Device> d0(CreateDevice("GPU", "/device:GPU:0"));
std::unique_ptr<Device> d1(CreateDevice("GPU", "/device:GPU:1"));
std::unique_ptr<Device> d2(CreateDevice("CPU", "/device:CPU:0"));
Device* d2_ptr = d2.get();
std::vector<std::unique_ptr<Device>> devices;
devices.emplace_back(std::move(d0));
devices.emplace_back(std::move(d1));
devices.emplace_back(std::move(d2));
StaticDeviceMgr lm(std::move(devices));
EXPECT_EQ(lm.HostCPU(), d2_ptr);
}
} // namespace
} // namespace tensorflow

View File

@ -26,6 +26,8 @@ limitations under the License.
namespace tensorflow {
DynamicDeviceMgr::DynamicDeviceMgr() : cpu_device_(nullptr) {}
DynamicDeviceMgr::~DynamicDeviceMgr() {
// Release resources ahead of destroying the device manager as the resource
// destructors (e.g. ~IteratorResource) assume devices still exist.
@ -143,11 +145,20 @@ Status DynamicDeviceMgr::AddDevices(
Status DynamicDeviceMgr::RemoveDevices(std::vector<Device*> devices) {
mutex_lock l(devices_mu_);
for (const auto& d : devices) {
if (d == cpu_device_) {
TF_RETURN_IF_ERROR(
errors::InvalidArgument("Can not remove HostCPU device ", d->name()));
}
auto it = dynamic_devices_.find(d);
if (it == dynamic_devices_.end()) {
TF_RETURN_IF_ERROR(errors::InvalidArgument("Unknown device ", d->name()));
}
}
for (const auto& d : devices) {
auto it = dynamic_devices_.find(d);
// Clear registration of (1) full name and (2) canonical name
for (const string& name :
@ -176,4 +187,20 @@ Status DynamicDeviceMgr::RemoveDevicesByName(
return RemoveDevices(devices_to_remove);
}
Device* DynamicDeviceMgr::HostCPU() const {
mutex_lock l(devices_mu_);
if (dynamic_devices_.find(cpu_device_) != dynamic_devices_.end()) {
return cpu_device_;
}
cpu_device_ = nullptr;
for (const auto& pair : dynamic_devices_) {
std::cerr << "WOWZA: " << pair.first << std::endl;
if (pair.first->device_type() == DEVICE_CPU) {
cpu_device_ = pair.first;
break;
}
}
return cpu_device_;
}
} // namespace tensorflow

View File

@ -105,8 +105,9 @@ TEST(DynamicDeviceMgrTest, AddRepeatedDeviceToMgr) {
}
TEST(DynamicDeviceMgrTest, RemoveNonExistingDeviceFromMgr) {
std::unique_ptr<Device> d0(CreateDevice("CPU", "/device:CPU:0"));
std::unique_ptr<Device> d0(CreateDevice("GPU", "/device:GPU:0"));
std::unique_ptr<Device> d1(CreateDevice("CPU", "/device:CPU:1"));
Device* d0_ptr = d0.get();
Device* d1_ptr = d1.get();
auto dm = MakeUnique<DynamicDeviceMgr>();
@ -115,14 +116,16 @@ TEST(DynamicDeviceMgrTest, RemoveNonExistingDeviceFromMgr) {
TF_CHECK_OK(dm->AddDevices(std::move(devices)));
EXPECT_EQ(dm->ListDevices().size(), 1);
std::vector<Device*> removed_devices{d1_ptr};
std::vector<Device*> removed_devices{d0_ptr, d1_ptr};
Status s = dm->RemoveDevices(removed_devices);
EXPECT_TRUE(absl::StrContains(s.error_message(), "Unknown device"));
EXPECT_EQ(dm->ListDevices().size(), 1); // d0 *not* removed.
}
TEST(DynamicDeviceMgrTest, RemoveNonExistingDeviceByNameFromMgr) {
std::unique_ptr<Device> d0(CreateDevice("CPU", "/device:CPU:0"));
string d1_name = "/device:CPU:1";
std::unique_ptr<Device> d0(CreateDevice("GPU", "/device:GPU:0"));
string d0_name = "/device:GPU:0";
string d1_name = "/device:CPU:0";
auto dm = MakeUnique<DynamicDeviceMgr>();
std::vector<std::unique_ptr<Device>> devices;
@ -130,9 +133,54 @@ TEST(DynamicDeviceMgrTest, RemoveNonExistingDeviceByNameFromMgr) {
TF_CHECK_OK(dm->AddDevices(std::move(devices)));
EXPECT_EQ(dm->ListDevices().size(), 1);
std::vector<string> removed_devices{d1_name};
std::vector<string> removed_devices{d0_name, d1_name};
Status s = dm->RemoveDevicesByName(removed_devices);
EXPECT_TRUE(absl::StrContains(s.error_message(), "unknown device"));
EXPECT_EQ(dm->ListDevices().size(), 1); // d0 *not* removed
}
TEST(DynamicDeviceMgrTest, HostCPU) {
auto dm = MakeUnique<DynamicDeviceMgr>();
// If there are no CPU devices, HostCPU() should return nullptr.
std::unique_ptr<Device> gpu(CreateDevice("GPU", "/device:GPU:0"));
Device* gpu_ptr = gpu.get();
std::vector<std::unique_ptr<Device>> devices;
devices.emplace_back(std::move(gpu));
TF_CHECK_OK(dm->AddDevices(std::move(devices)));
EXPECT_EQ(dm->ListDevices().size(), 1);
EXPECT_EQ(dm->HostCPU(), nullptr);
// After adding a CPU device, it should return that device.
std::unique_ptr<Device> cpu0(CreateDevice("CPU", "/device:CPU:0"));
Device* cpu0_ptr = cpu0.get();
devices.clear();
devices.emplace_back(std::move(cpu0));
TF_CHECK_OK(dm->AddDevices(std::move(devices)));
EXPECT_EQ(dm->ListDevices().size(), 2);
EXPECT_EQ(dm->HostCPU(), cpu0_ptr);
// If we add another CPU device, HostCPU() should remain the same.
std::unique_ptr<Device> cpu1(CreateDevice("CPU", "/device:CPU:1"));
Device* cpu1_ptr = cpu1.get();
devices.clear();
devices.emplace_back(std::move(cpu1));
TF_CHECK_OK(dm->AddDevices(std::move(devices)));
EXPECT_EQ(dm->ListDevices().size(), 3);
EXPECT_EQ(dm->HostCPU(), cpu0_ptr);
// Once we have a HostCPU() device, we can't remove it ...
std::vector<Device*> removed{gpu_ptr, cpu0_ptr};
EXPECT_TRUE(absl::StrContains(dm->RemoveDevices(removed).error_message(),
"Can not remove HostCPU device"));
EXPECT_EQ(dm->ListDevices().size(), 3);
EXPECT_EQ(dm->HostCPU(), cpu0_ptr);
// ... but we should be able to remove another CPU device.
removed = std::vector<Device*>{cpu1_ptr};
TF_CHECK_OK(dm->RemoveDevices(removed));
EXPECT_EQ(dm->ListDevices().size(), 2);
EXPECT_EQ(dm->HostCPU(), cpu0_ptr);
}
} // namespace

View File

@ -80,7 +80,7 @@ EagerContext::EagerContext(
: default_device_placement_policy_(default_device_placement_policy),
default_mirroring_policy_(default_mirroring_policy),
local_device_manager_(device_mgr, device_mgr_owned),
host_cpu_device_(device_mgr->ListDevices()[0]),
host_cpu_device_(device_mgr->HostCPU()),
rendezvous_(rendezvous),
thread_pool_(NewThreadPoolFromSessionOptions(opts)),
custom_kernel_creator_(custom_kernel_creator),
@ -791,7 +791,7 @@ Status EagerContext::StoreCollectiveOpsServer(
collective_executor_mgr_.Reset(rpc_collective_executor_mgr);
local_device_manager_.Reset(device_mgr);
host_cpu_device_ = local_device_manager_.Get()->ListDevices()[0];
host_cpu_device_ = local_device_manager_.Get()->HostCPU();
InitPrioritizedDeviceTypeList();
ClearCachesAndThreadExecutors();
@ -1027,7 +1027,7 @@ Status EagerContext::SetMasterContextState(
ReadBoolFromEnvVar("TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC", true);
local_device_manager_.Reset(local_device_mgr);
host_cpu_device_ = local_device_manager_.Get()->ListDevices()[0];
host_cpu_device_ = local_device_manager_.Get()->HostCPU();
if (rendezvous_ != nullptr) rendezvous_->Unref();
rendezvous_ = r;