Create a lockfile when loading libtpu.so to prevent attempts at double loading and initialization
PiperOrigin-RevId: 357513705 Change-Id: Iafc6a83f0a3bdfa580c98b286a4113508852c3b8
This commit is contained in:
parent
5396b5ce65
commit
d426f1b26f
@ -23,6 +23,7 @@ cc_library(
|
||||
"//tensorflow/core/profiler/utils:xplane_utils",
|
||||
"//tensorflow/core/tpu:tpu_api",
|
||||
"//tensorflow/core/tpu:tpu_api_dlsym_initializer",
|
||||
"//tensorflow/core/tpu:tpu_initializer_helper",
|
||||
"//tensorflow/core/tpu:tpu_ops_c_api_hdrs",
|
||||
"//tensorflow/stream_executor/tpu:status_helper",
|
||||
"@com_google_absl//absl/strings",
|
||||
|
@ -29,6 +29,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
|
||||
#include "tensorflow/core/profiler/utils/xplane_schema.h"
|
||||
#include "tensorflow/core/tpu/tpu_api.h"
|
||||
#include "tensorflow/core/tpu/tpu_initializer_helper.h"
|
||||
#include "tensorflow/core/tpu/tpu_ops_c_api.h"
|
||||
#include "tensorflow/stream_executor/tpu/status_helper.h"
|
||||
|
||||
@ -134,7 +135,9 @@ std::unique_ptr<ProfilerInterface> CreateTpuTracer(
|
||||
}
|
||||
|
||||
auto register_tpu_tracer_factory = [] {
|
||||
RegisterProfilerFactory(&CreateTpuTracer);
|
||||
if (tensorflow::tpu::TryAcquireTpuLock()) {
|
||||
RegisterProfilerFactory(&CreateTpuTracer);
|
||||
}
|
||||
return 0;
|
||||
}();
|
||||
|
||||
|
@ -11,6 +11,7 @@ package(
|
||||
"//tensorflow/compiler/mlir/tensorflow:__subpackages__",
|
||||
"//tensorflow/compiler/tf2xla/kernels:__subpackages__",
|
||||
"//tensorflow/compiler/xrt:__subpackages__",
|
||||
"//tensorflow/core/profiler/internal/tpu:__subpackages__",
|
||||
"//tensorflow/core/tpu:__subpackages__",
|
||||
"//tensorflow/stream_executor/tpu:__subpackages__",
|
||||
],
|
||||
@ -105,7 +106,11 @@ cc_library(
|
||||
name = "tpu_initializer_helper",
|
||||
srcs = ["tpu_initializer_helper.cc"],
|
||||
hdrs = ["tpu_initializer_helper.h"],
|
||||
deps = ["@com_google_absl//absl/strings"],
|
||||
deps = [
|
||||
"//tensorflow/core/platform:logging",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
|
@ -65,6 +65,8 @@ Status InitializeTpuLibrary(void* library_handle) {
|
||||
}
|
||||
|
||||
bool FindAndLoadTpuLibrary() {
|
||||
if (!TryAcquireTpuLock()) return false;
|
||||
|
||||
void* library = dlopen("libtpu.so", RTLD_NOW);
|
||||
if (library) {
|
||||
InitializeTpuLibrary(library);
|
||||
|
@ -62,6 +62,8 @@ Status InitializeTpuLibrary(void* library_handle) {
|
||||
}
|
||||
|
||||
bool FindAndLoadTpuLibrary() {
|
||||
if (!TryAcquireTpuLock()) return false;
|
||||
|
||||
void* library = dlopen("libtpu.so", RTLD_NOW);
|
||||
if (library) {
|
||||
InitializeTpuLibrary(library);
|
||||
|
@ -15,13 +15,56 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/core/tpu/tpu_initializer_helper.h"
|
||||
|
||||
#if defined(LIBTPU_ON_GCE)
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#endif // LIBTPU_ON_GCE
|
||||
|
||||
#include "absl/strings/str_split.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace tpu {
|
||||
|
||||
// Decides whether this process is allowed to load libtpu.so.
//
// On builds with LIBTPU_ON_GCE defined, the decision is made on the first call
// and cached (under a mutex) for every later call:
//   * If the TPU_HOST_BOUNDS environment variable is set, loading is allowed
//     and the lockfile is not touched.
//   * Otherwise a non-blocking lock on /tmp/libtpu_lockfile is attempted, and
//     loading is allowed only if that lock is obtained.
// On builds without LIBTPU_ON_GCE this always returns false.
//
// Returns true if libtpu.so may be loaded by this process, false otherwise.
bool TryAcquireTpuLock() {
#if defined(LIBTPU_ON_GCE)
  static absl::Mutex* mu = new absl::Mutex();
  absl::MutexLock l(mu);

  static bool attempted_file_open = false;
  static bool should_load_library = false;

  // Only the first call does any work; later calls reuse its verdict instead
  // of reopening the lockfile (which would leak one descriptor per call).
  if (attempted_file_open) {
    return should_load_library;
  }
  attempted_file_open = true;

  // If the TPU_HOST_BOUNDS env var is set, that means we are loading each
  // chip in a different process and thus multiple libtpu loads are OK.
  if (getenv("TPU_HOST_BOUNDS") != nullptr) {
    LOG(INFO) << "TPU_HOST_BOUNDS is set, allowing multiple libtpu.so loads.";
    should_load_library = true;
    return should_load_library;
  }

  // open() requires an explicit mode whenever O_CREAT is given; without it the
  // permission bits of a newly created lockfile are indeterminate.
  int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);
  if (fd < 0) {
    // Report the real cause rather than letting lockf() fail on a bad
    // descriptor and blame another process.
    LOG(WARNING) << "Unable to open /tmp/libtpu_lockfile. Not attempting to "
                    "load libtpu.so in this process.";
    should_load_library = false;
    return should_load_library;
  }

  // This lock is held until the process exits intentionally. The underlying
  // TPU device will be held on until it quits.
  if (lockf(fd, F_TLOCK, 0) != 0) {
    LOG(WARNING) << "libtpu.so already in used by another process. Not "
                    "attempting to load libtpu.so in this process.";
    // No lock was obtained through this descriptor, so it is safe to release.
    close(fd);
    should_load_library = false;
  } else {
    // Deliberately keep fd open: closing it would drop the lock.
    should_load_library = true;
  }

  return should_load_library;
#else  // LIBTPU_ON_GCE
  return false;
#endif
}
|
||||
|
||||
std::pair<std::vector<std::string>, std::vector<const char*>>
|
||||
GetLibTpuInitArguments() {
|
||||
// We make copies of the arguments returned by getenv because the memory
|
||||
|
@ -22,6 +22,11 @@ limitations under the License.
|
||||
namespace tensorflow {
|
||||
namespace tpu {
|
||||
|
||||
// This will acquire a system-wide lock on behalf of the whole process. Follow
|
||||
// up calls to this function will return true if the lock has been acquired and
|
||||
// false if we failed to acquire the lock.
|
||||
bool TryAcquireTpuLock();
|
||||
|
||||
// Returns arguments (e.g. flags) set in the LIBTPU_INIT_ARGS environment
|
||||
// variable. The first return value is the arguments, the second return value is
|
||||
// pointers to the arguments suitable for passing into the C API.
|
||||
|
Loading…
Reference in New Issue
Block a user