TpuDriver: Add more detail to connection errors and make timeout configurable.

PiperOrigin-RevId: 281108883
Change-Id: I6bbeb951815816b00d9591ba2fed27879edcd51c
This commit is contained in:
Russell Power 2019-11-18 11:05:55 -08:00 committed by TensorFlower Gardener
parent c455ab4555
commit c85d9b1369
2 changed files with 25 additions and 22 deletions

View File

@ -978,28 +978,30 @@ Status GrpcTpuDriver::Reset() {
return xla::Unimplemented("GRPC driver reset is not implemented yet."); return xla::Unimplemented("GRPC driver reset is not implemented yet.");
} }
// Factory for TPU drivers addressed with the "grpc://" prefix, handed to
// REGISTER_TPU_DRIVER (presumably consulted by the driver registry when a
// worker address starts with that prefix -- confirm against the macro).
//
// The factory creates a gRPC stub from `config`, issues an Open RPC bounded by
// `config.connection_timeout_secs`, and on success returns a GrpcTpuDriver
// bound to the client id reported in the response. On failure it returns a
// Status carrying the gRPC error code and a message that names the target
// address and includes the gRPC error text.
REGISTER_TPU_DRIVER(
    "grpc://",
    [](const TpuDriverConfig& config)
        -> xla::StatusOr<std::unique_ptr<TpuDriver>> {
      auto stub = GrpcTpuDriver::CreateTpuDriverStub(config);

      ::grpc::ClientContext ctx;
      // Non-fail-fast: per gRPC semantics the call waits for the channel to
      // become ready instead of failing immediately, so the deadline below is
      // what bounds the connection attempt.
      ctx.set_fail_fast(false);
      ctx.set_deadline(std::chrono::system_clock::now() +
                       std::chrono::seconds(config.connection_timeout_secs));

      OpenRequest req;
      OpenResponse resp;
      ::grpc::Status status = stub->Open(&ctx, req, &resp);
      if (!status.ok()) {
        // error_message() carries the human-readable reason; error_details()
        // is an optional payload that is frequently empty. Report the message
        // always and the details only when there are any, so the error is
        // never reduced to an empty string.
        std::string grpc_error = status.error_message();
        if (!status.error_details().empty()) {
          grpc_error += " (details: " + status.error_details() + ")";
        }
        LOG(ERROR) << "Failed to open the gRPC driver: " << status.error_code()
                   << ": " << grpc_error;
        return xla::Status(
            tensorflow::error::Code(status.error_code()),
            absl::StrCat("Failed to connect to remote server at address: ",
                         config.worker, ". Error from gRPC: ", grpc_error));
      }
      return std::unique_ptr<TpuDriver>(
          new GrpcTpuDriver(config, resp.client_id()));
    });
} // namespace } // namespace
} // namespace tpu_driver } // namespace tpu_driver

View File

@ -229,6 +229,7 @@ class TpuDriver {
// Settings passed to a TPU driver factory when opening a driver.
struct TpuDriverConfig {
  // Address of the remote server to connect to. It is reported verbatim in
  // connection-failure messages.
  std::string worker;

  // Number of seconds allowed for the initial connection (the gRPC driver uses
  // it as the deadline of its Open call). Defaults to 10 seconds.
  int64_t connection_timeout_secs = 10;
};
class TpuDriverRegistry { class TpuDriverRegistry {