Create a lockfile when loading libtpu.so to prevent attempts at double loading and initialization

PiperOrigin-RevId: 357513705
Change-Id: Iafc6a83f0a3bdfa580c98b286a4113508852c3b8
Frank Chen 2021-02-14 23:00:32 -08:00 committed by TensorFlower Gardener
parent 5396b5ce65
commit d426f1b26f
7 changed files with 63 additions and 2 deletions
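
For context, the guard introduced here works by taking a non-blocking POSIX lock on a well-known lockfile before dlopen'ing libtpu.so: whichever process wins the lock loads and initializes the library, and every other process skips the load. Below is a minimal standalone sketch of that flow, assuming the same lockfile path and library name that appear in the diffs; it is illustrative only (build with -ldl) and simplifies the error handling.

// guarded_load_sketch.cc -- illustrative only, not part of the commit.
#include <dlfcn.h>
#include <fcntl.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>

int main() {
  // If TPU_HOST_BOUNDS is set, each chip is driven by a separate process,
  // so concurrent loads are expected and the lockfile check is skipped.
  if (std::getenv("TPU_HOST_BOUNDS") == nullptr) {
    int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);
    if (fd < 0 || lockf(fd, F_TLOCK, 0) != 0) {
      std::fprintf(stderr, "libtpu.so is in use by another process; skipping load\n");
      return 0;
    }
    // The lock (and fd) is deliberately left open until the process exits.
  }
  void* library = dlopen("libtpu.so", RTLD_NOW);
  if (library == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Library initialization (symbol lookup, etc.) would go here.
  return 0;
}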

View File

@@ -23,6 +23,7 @@ cc_library(
         "//tensorflow/core/profiler/utils:xplane_utils",
         "//tensorflow/core/tpu:tpu_api",
         "//tensorflow/core/tpu:tpu_api_dlsym_initializer",
+        "//tensorflow/core/tpu:tpu_initializer_helper",
         "//tensorflow/core/tpu:tpu_ops_c_api_hdrs",
         "//tensorflow/stream_executor/tpu:status_helper",
         "@com_google_absl//absl/strings",

View File

@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
 #include "tensorflow/core/profiler/utils/xplane_schema.h"
 #include "tensorflow/core/tpu/tpu_api.h"
+#include "tensorflow/core/tpu/tpu_initializer_helper.h"
 #include "tensorflow/core/tpu/tpu_ops_c_api.h"
 #include "tensorflow/stream_executor/tpu/status_helper.h"
@@ -134,7 +135,9 @@ std::unique_ptr<ProfilerInterface> CreateTpuTracer(
 }

 auto register_tpu_tracer_factory = [] {
-  RegisterProfilerFactory(&CreateTpuTracer);
+  if (tensorflow::tpu::TryAcquireTpuLock()) {
+    RegisterProfilerFactory(&CreateTpuTracer);
+  }
   return 0;
 }();
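
The registration above uses the usual immediately-invoked-lambda idiom: a namespace-scope variable is initialized by a lambda that runs once during static initialization, so the profiler factory is registered (now only after TryAcquireTpuLock() succeeds) when the translation unit is loaded. A self-contained sketch of the idiom, with placeholder names (RegisterThing, ShouldRegister) rather than the real profiler API:

#include <cstdio>

// Stand-ins for the real registration hook and gating condition.
void RegisterThing() { std::printf("registered\n"); }
bool ShouldRegister() { return true; }

// Runs once during static initialization of this translation unit; the
// variable exists only to force the lambda to be evaluated.
auto register_thing = [] {
  if (ShouldRegister()) {
    RegisterThing();
  }
  return 0;
}();

int main() { return 0; }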

View File

@@ -11,6 +11,7 @@ package(
         "//tensorflow/compiler/mlir/tensorflow:__subpackages__",
         "//tensorflow/compiler/tf2xla/kernels:__subpackages__",
         "//tensorflow/compiler/xrt:__subpackages__",
+        "//tensorflow/core/profiler/internal/tpu:__subpackages__",
         "//tensorflow/core/tpu:__subpackages__",
         "//tensorflow/stream_executor/tpu:__subpackages__",
     ],
@@ -105,7 +106,11 @@ cc_library(
     name = "tpu_initializer_helper",
     srcs = ["tpu_initializer_helper.cc"],
     hdrs = ["tpu_initializer_helper.h"],
-    deps = ["@com_google_absl//absl/strings"],
+    deps = [
+        "//tensorflow/core/platform:logging",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+    ],
 )

 cc_library(

View File

@@ -65,6 +65,8 @@ Status InitializeTpuLibrary(void* library_handle) {
 }

 bool FindAndLoadTpuLibrary() {
+  if (!TryAcquireTpuLock()) return false;
+
   void* library = dlopen("libtpu.so", RTLD_NOW);
   if (library) {
     InitializeTpuLibrary(library);
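
If the lock is not acquired, FindAndLoadTpuLibrary() returns early without touching the device; otherwise dlopen() either hands back a library handle or returns null, in which case dlerror() reports why. A short sketch of that open-then-resolve pattern, where SomeTpuEntryPoint is a made-up symbol name used purely for illustration:

#include <dlfcn.h>

#include <cstdio>

int main() {
  void* library = dlopen("libtpu.so", RTLD_NOW);
  if (library == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Resolve a symbol by name; "SomeTpuEntryPoint" is a placeholder, not a
  // real libtpu symbol.
  void* sym = dlsym(library, "SomeTpuEntryPoint");
  if (sym == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
  }
  return 0;
}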

View File

@@ -62,6 +62,8 @@ Status InitializeTpuLibrary(void* library_handle) {
 }

 bool FindAndLoadTpuLibrary() {
+  if (!TryAcquireTpuLock()) return false;
+
   void* library = dlopen("libtpu.so", RTLD_NOW);
   if (library) {
     InitializeTpuLibrary(library);

View File

@@ -15,13 +15,56 @@ limitations under the License.
 #include "tensorflow/core/tpu/tpu_initializer_helper.h"

+#if defined(LIBTPU_ON_GCE)
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#endif  // LIBTPU_ON_GCE
+
 #include "absl/strings/str_split.h"
+#include "absl/synchronization/mutex.h"
+#include "tensorflow/core/platform/logging.h"

 namespace tensorflow {
 namespace tpu {

+bool TryAcquireTpuLock() {
+#if defined(LIBTPU_ON_GCE)
+  static absl::Mutex* mu = new absl::Mutex();
+  absl::MutexLock l(mu);
+
+  static bool attempted_file_open = false;
+  static bool should_load_library = false;
+
+  if (!attempted_file_open) {
+    should_load_library = true;
+    attempted_file_open = true;
+    // If the TPU_HOST_BOUNDS env var is set, we are loading each chip in a
+    // different process, so multiple libtpu loads are OK.
+    if (getenv("TPU_HOST_BOUNDS") == nullptr) {
+      int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);
+      // This lock is intentionally held until the process exits; the
+      // underlying TPU device is held by this process until then.
+      if (lockf(fd, F_TLOCK, 0) != 0) {
+        LOG(WARNING) << "libtpu.so already in use by another process. Not "
+                        "attempting to load libtpu.so in this process.";
+        should_load_library = false;
+      } else {
+        should_load_library = true;
+      }
+    } else {
+      LOG(INFO) << "TPU_HOST_BOUNDS is set, allowing multiple libtpu.so loads.";
+      should_load_library = true;
+    }
+  }
+  return should_load_library;
+#else  // LIBTPU_ON_GCE
+  return false;
+#endif
+}
+
 std::pair<std::vector<std::string>, std::vector<const char*>>
 GetLibTpuInitArguments() {
   // We make copies of the arguments returned by getenv because the memory
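
The property TryAcquireTpuLock() relies on is that lockf(fd, F_TLOCK, 0) takes a non-blocking, exclusive, per-process lock: the first process to lock the file succeeds, any other process gets a non-zero return immediately, and the lock is released automatically when the holder exits. A tiny sketch that can be run in two terminals to observe this; the lockfile path mirrors the one above, but the program itself is not part of the commit.

// Run one instance, then a second: the second reports the lock as busy
// until the first exits.
#include <fcntl.h>
#include <unistd.h>

#include <cstdio>

int main() {
  int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);
  if (fd < 0) {
    std::perror("open");
    return 1;
  }
  if (lockf(fd, F_TLOCK, 0) != 0) {
    std::printf("lock is held by another process\n");
    return 0;
  }
  std::printf("lock acquired; holding for 30 seconds\n");
  sleep(30);  // The lock (and fd) is released when this process exits.
  return 0;
}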

View File

@@ -22,6 +22,11 @@ limitations under the License.
 namespace tensorflow {
 namespace tpu {

+// Acquires a system-wide lock on behalf of the whole process. This call and
+// any follow-up calls return true if the lock has been acquired, and false if
+// it could not be acquired (i.e. another process already holds it).
+bool TryAcquireTpuLock();
+
 // Returns arguments (e.g. flags) set in the LIBTPU_INIT_ARGS environment
 // variable. The first return value is the arguments, the second return value
 // is pointers to the arguments suitable for passing into the C API.
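
A hypothetical caller of this header, shown only to illustrate the intended calling pattern (InitTpuWithArgs is a placeholder, not a real API): TryAcquireTpuLock() gates the one-time load, and GetLibTpuInitArguments() supplies argv-style pointers whose backing strings are owned by the first element of the returned pair.

#include <utility>

#include "tensorflow/core/tpu/tpu_initializer_helper.h"

// Placeholder for whatever consumes the init arguments; not a real API.
static void InitTpuWithArgs(int argc, const char** argv) {}

bool MaybeInitTpu() {
  if (!tensorflow::tpu::TryAcquireTpuLock()) return false;
  // args.first owns the strings; args.second points into them, so `args`
  // must stay alive for as long as the pointers are used.
  auto args = tensorflow::tpu::GetLibTpuInitArguments();
  InitTpuWithArgs(static_cast<int>(args.second.size()), args.second.data());
  return true;
}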