TpuDriver: Add more detail to connection errors and make timeout configurable.
PiperOrigin-RevId: 281108883
Change-Id: I6bbeb951815816b00d9591ba2fed27879edcd51c
This commit is contained in:
parent
c455ab4555
commit
c85d9b1369
@ -978,28 +978,30 @@ Status GrpcTpuDriver::Reset() {
|
|||||||
return xla::Unimplemented("GRPC driver reset is not implemented yet.");
|
return xla::Unimplemented("GRPC driver reset is not implemented yet.");
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_TPU_DRIVER("grpc://",
|
REGISTER_TPU_DRIVER(
|
||||||
[](const TpuDriverConfig& config)
|
"grpc://",
|
||||||
-> xla::StatusOr<std::unique_ptr<TpuDriver>> {
|
[](const TpuDriverConfig& config)
|
||||||
auto stub = GrpcTpuDriver::CreateTpuDriverStub(config);
|
-> xla::StatusOr<std::unique_ptr<TpuDriver>> {
|
||||||
::grpc::ClientContext ctx;
|
auto stub = GrpcTpuDriver::CreateTpuDriverStub(config);
|
||||||
ctx.set_fail_fast(false);
|
::grpc::ClientContext ctx;
|
||||||
ctx.set_deadline(std::chrono::system_clock::now() +
|
ctx.set_fail_fast(false);
|
||||||
std::chrono::seconds(10));
|
ctx.set_deadline(std::chrono::system_clock::now() +
|
||||||
OpenRequest req;
|
std::chrono::seconds(config.connection_timeout_secs));
|
||||||
OpenResponse resp;
|
OpenRequest req;
|
||||||
::grpc::Status status = stub->Open(&ctx, req, &resp);
|
OpenResponse resp;
|
||||||
if (!status.ok()) {
|
::grpc::Status status = stub->Open(&ctx, req, &resp);
|
||||||
LOG(ERROR) << "Failed to open the gRPC driver: "
|
if (!status.ok()) {
|
||||||
<< status.error_code() << ": "
|
LOG(ERROR) << "Failed to open the gRPC driver: " << status.error_code()
|
||||||
<< status.error_details();
|
<< ": " << status.error_details();
|
||||||
return xla::Status(
|
return xla::Status(
|
||||||
tensorflow::error::Code(status.error_code()),
|
tensorflow::error::Code(status.error_code()),
|
||||||
status.error_message() + status.error_details());
|
absl::StrCat("Failed to connect to remote server at address: ",
|
||||||
}
|
config.worker,
|
||||||
return std::unique_ptr<TpuDriver>(
|
". Error from gRPC: ", status.error_details()));
|
||||||
new GrpcTpuDriver(config, resp.client_id()));
|
}
|
||||||
});
|
return std::unique_ptr<TpuDriver>(
|
||||||
|
new GrpcTpuDriver(config, resp.client_id()));
|
||||||
|
});
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace tpu_driver
|
} // namespace tpu_driver
|
||||||
|
@ -229,6 +229,7 @@ class TpuDriver {
|
|||||||
|
|
||||||
struct TpuDriverConfig {
|
struct TpuDriverConfig {
|
||||||
std::string worker;
|
std::string worker;
|
||||||
|
int64_t connection_timeout_secs = 10;
|
||||||
};
|
};
|
||||||
|
|
||||||
class TpuDriverRegistry {
|
class TpuDriverRegistry {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user