Make sure the rendezvous abort check is finished before triggering the callback.

PiperOrigin-RevId: 313204522 Change-Id: I88f38391d9ee2296fac9a6e86bb9f9d2c477f1c8
2020-05-26 09:21:15 -07:00 · 2020-05-26 09:21:15 -07:00 · 09af9319d9
commit 09af9319d9
parent 2e842db3cc
4 changed files with 125 additions and 12 deletions
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@ -462,6 +462,8 @@ tf_cuda_cc_tests(
        "//tensorflow/core:test_main",
        "//tensorflow/core:testlib",
        "//tensorflow/core/distributed_runtime:server_lib",
        "//tensorflow/core/distributed_runtime:test_utils",
        "//tensorflow/core/platform:blocking_counter",
    ],
 )
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/types.h"
 namespace tensorflow {
@ -136,7 +137,12 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
  // Start the main RecvTensor call, checking for an async abort.
  void StartRTCall(std::function<void()> recv_done) {
    resp_.InitAlloc(dst_device_, alloc_attrs_);
-    auto cb = [this, recv_done = std::move(recv_done)](const Status& s) {
+    auto abort_checked = std::make_shared<Notification>();
    auto cb = [this, abort_checked,
               recv_done = std::move(recv_done)](const Status& s) {
      // Make sure the Rendezvous abort checking is finished before running the
      // callback, which might destroy the current call object.
      abort_checked->WaitForNotification();
      if (!s.ok()) {
        mutex_lock l(mu_);
        status_.Update(s);
@ -158,6 +164,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
    if (!s.ok()) {
      opts_.StartCancel();
    }
    // Notify that the abort check has finished.
    abort_checked->Notify();
  }
  string src_worker_;
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
@ -16,13 +16,16 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/test_utils.h"
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/control_flow.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/blocking_counter.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/random.h"
 #include "tensorflow/core/platform/test.h"
 namespace tensorflow {
@ -48,13 +51,34 @@ Rendezvous::ParsedKey MakeKey(const string& s) {
 }
 namespace {
 // A dummy worker interface implementation that simply triggers the callback
 // with OK status for RecvTensor request.
 class DummyWorker : public TestWorkerInterface {
 public:
  void RecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request,
                       TensorResponse* response, StatusCallback done) override {
    SchedClosure([done = std::move(done)]() {
      // Simulate a random delay for RPC. This is needed to fill the entire
      // object buffer in `RpcRecvTensorFreeList` and trigger the destruction of
      // RPC call objects.
      const int64 t_us = random::New64() % 100 * 1000;
      Env::Default()->SleepForMicroseconds(t_us);
      done(Status::OK());
    });
  }
 };
 // Fake cache implementation for WorkerEnv.
 class DummyWorkerCache : public WorkerCacheInterface {
  void ListWorkers(std::vector<string>* workers) const override {}
  void ListWorkersInJob(const string& job_name,
                        std::vector<string>* workers) const override {}
  WorkerInterface* GetOrCreateWorker(const string& target) override {
-    return nullptr;
+    if (dummy_remote_worker_ == nullptr) {
      // Ownership transferred to WorkerFreeList
      dummy_remote_worker_ = new DummyWorker;
    }
    return dummy_remote_worker_;
  }
  Status GetEagerClientCache(
      std::unique_ptr<eager::EagerClientCache>* eager_client_cache) override {
@ -66,7 +90,31 @@ class DummyWorkerCache : public WorkerCacheInterface {
  }
  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
                              StatusCallback done) override {}
 private:
  DummyWorker* dummy_remote_worker_ = nullptr;
 };
 static Device* CreateDevice(const char* type, const char* name) {
  class FakeDevice : public Device {
   public:
    explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
    Status Sync() override { return Status::OK(); }
    Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; }
  };
  DeviceAttributes attr;
  attr.set_name(name);
  attr.set_device_type(type);
  return new FakeDevice(attr);
 }
 static DeviceMgr* CreateDeviceMgr() {
  std::unique_ptr<Device> d0(
      CreateDevice("CPU", "/job:mnist/replica:1/task:2/cpu:1"));
  std::vector<std::unique_ptr<Device>> devices;
  devices.emplace_back(std::move(d0));
  return new StaticDeviceMgr(std::move(devices));
 }
 }  // namespace
 class RpcRendezvousMgrTest : public ::testing::Test {
@ -75,7 +123,7 @@ class RpcRendezvousMgrTest : public ::testing::Test {
      : cache_(new DummyWorkerCache),
        worker_session_("rpc_session", "/job:mnist/replica:1/task:2",
                        std::unique_ptr<WorkerCacheInterface>(cache_),
-                        std::unique_ptr<DeviceMgr>(),
+                        std::unique_ptr<DeviceMgr>(CreateDeviceMgr()),
                        std::unique_ptr<GraphMgr>(), nullptr),
        rmgr_(&env) {
    env.env = Env::Default();
@ -193,6 +241,7 @@ TEST_F(RpcRendezvousMgrTest, CancelAfterReceived) {
  delete cm;
 }
 namespace {
 class DummyDeviceContext : public DeviceContext {
 public:
  explicit DummyDeviceContext(int stream_id) : stream_id_(stream_id) {}
@ -202,6 +251,7 @@ class DummyDeviceContext : public DeviceContext {
 private:
  const int stream_id_;
 };
 }  // namespace
 TEST_F(RpcRendezvousMgrTest, TransferDummyDeviceContext) {
  DummyDeviceContext* dc = new DummyDeviceContext(123);
@ -237,6 +287,59 @@ TEST_F(RpcRendezvousMgrTest, TransferDummyDeviceContext) {
  dc->Unref();
 }
-// NOTE: Remote Send/Recv is better tested in worker_test.cc
+TEST_F(RpcRendezvousMgrTest, RemoteRecvOne) {
  const int64 step_id = 123;
  const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey(
      "/job:worker/replica:1/task:2/cpu:0", 7890,
      "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
  {
    RemoteRendezvous* rendez = rmgr_.Find(step_id);
    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
    core::ScopedUnref unref(rendez);
    Rendezvous::Args args;
    Tensor val(DT_STRING);
    bool val_dead = false;
    TF_ASSERT_OK(rendez->Recv(key, args, &val, &val_dead));
  }
  rmgr_.Cleanup(step_id);
 }
 TEST_F(RpcRendezvousMgrTest, RemoteRecvAsyncMany) {
  const int64 step_id = 123;
  const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey(
      "/job:worker/replica:1/task:2/cpu:0", 7890,
      "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
  {
    RemoteRendezvous* rendez = rmgr_.Find(step_id);
    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
    core::ScopedUnref unref(rendez);
    Rendezvous::Args args;
    // Send a large number of async RPC requests to fill up the buffer in
    // `RpcRecvTensorFreeList`, in order to test deleting RPC call objects.
    int num_requests = 10000;
    Tensor val(DT_STRING);
    mutex mu_;
    Status status = Status::OK();
    BlockingCounter counter(num_requests);
    for (int i = 0; i < num_requests; i++) {
      rendez->RecvAsync(
          key, args,
          [&mu_, &status, &counter](const Status& s, const Rendezvous::Args&,
                                    const Rendezvous::Args&, const Tensor&,
                                    const bool) {
            mutex_lock l(mu_);
            status.Update(s);
            counter.DecrementCount();
          });
    }
    counter.Wait();
    TF_ASSERT_OK(status);
  }
  rmgr_.Cleanup(step_id);
 }
 }  // namespace tensorflow
--- a/tensorflow/core/distributed_runtime/test_utils.h
+++ b/tensorflow/core/distributed_runtime/test_utils.h
@ -70,28 +70,28 @@ class TestWorkerInterface : public WorkerInterface {
  void CleanupGraphAsync(const CleanupGraphRequest* request,
                         CleanupGraphResponse* response,
                         StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("CleanupGraphAsync"));
  }
  void CleanupAllAsync(const CleanupAllRequest* request,
                       CleanupAllResponse* response,
                       StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("CleanupAllAsync"));
  }
  void RecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request,
                       TensorResponse* response, StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("RecvTensorAsync"));
  }
  void LoggingAsync(const LoggingRequest* request, LoggingResponse* response,
                    StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("LoggingAsync"));
  }
  void TracingAsync(const TracingRequest* request, TracingResponse* response,
                    StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("TracingAsync"));
  }
  void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
@ -103,20 +103,20 @@ class TestWorkerInterface : public WorkerInterface {
                          const CompleteGroupRequest* request,
                          CompleteGroupResponse* response,
                          StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("CompleteGroupAsync"));
  }
  void CompleteInstanceAsync(CallOptions* ops,
                             const CompleteInstanceRequest* request,
                             CompleteInstanceResponse* response,
                             StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("CompleteInstanceAsync"));
  }
  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
                            GetStepSequenceResponse* response,
                            StatusCallback done) override {
-    done(errors::Unimplemented("RunGraphAsync"));
+    done(errors::Unimplemented("GetStepSequenceAsync"));
  }
 };