Create a lockfile when loading libtpu.so to prevent attempts at double loading and initialization
PiperOrigin-RevId: 357513705 Change-Id: Iafc6a83f0a3bdfa580c98b286a4113508852c3b8
This commit is contained in:
parent
5396b5ce65
commit
d426f1b26f
@ -23,6 +23,7 @@ cc_library(
|
|||||||
"//tensorflow/core/profiler/utils:xplane_utils",
|
"//tensorflow/core/profiler/utils:xplane_utils",
|
||||||
"//tensorflow/core/tpu:tpu_api",
|
"//tensorflow/core/tpu:tpu_api",
|
||||||
"//tensorflow/core/tpu:tpu_api_dlsym_initializer",
|
"//tensorflow/core/tpu:tpu_api_dlsym_initializer",
|
||||||
|
"//tensorflow/core/tpu:tpu_initializer_helper",
|
||||||
"//tensorflow/core/tpu:tpu_ops_c_api_hdrs",
|
"//tensorflow/core/tpu:tpu_ops_c_api_hdrs",
|
||||||
"//tensorflow/stream_executor/tpu:status_helper",
|
"//tensorflow/stream_executor/tpu:status_helper",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
|
@ -29,6 +29,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
|
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
|
||||||
#include "tensorflow/core/profiler/utils/xplane_schema.h"
|
#include "tensorflow/core/profiler/utils/xplane_schema.h"
|
||||||
#include "tensorflow/core/tpu/tpu_api.h"
|
#include "tensorflow/core/tpu/tpu_api.h"
|
||||||
|
#include "tensorflow/core/tpu/tpu_initializer_helper.h"
|
||||||
#include "tensorflow/core/tpu/tpu_ops_c_api.h"
|
#include "tensorflow/core/tpu/tpu_ops_c_api.h"
|
||||||
#include "tensorflow/stream_executor/tpu/status_helper.h"
|
#include "tensorflow/stream_executor/tpu/status_helper.h"
|
||||||
|
|
||||||
@ -134,7 +135,9 @@ std::unique_ptr<ProfilerInterface> CreateTpuTracer(
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto register_tpu_tracer_factory = [] {
|
auto register_tpu_tracer_factory = [] {
|
||||||
RegisterProfilerFactory(&CreateTpuTracer);
|
if (tensorflow::tpu::TryAcquireTpuLock()) {
|
||||||
|
RegisterProfilerFactory(&CreateTpuTracer);
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}();
|
}();
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ package(
|
|||||||
"//tensorflow/compiler/mlir/tensorflow:__subpackages__",
|
"//tensorflow/compiler/mlir/tensorflow:__subpackages__",
|
||||||
"//tensorflow/compiler/tf2xla/kernels:__subpackages__",
|
"//tensorflow/compiler/tf2xla/kernels:__subpackages__",
|
||||||
"//tensorflow/compiler/xrt:__subpackages__",
|
"//tensorflow/compiler/xrt:__subpackages__",
|
||||||
|
"//tensorflow/core/profiler/internal/tpu:__subpackages__",
|
||||||
"//tensorflow/core/tpu:__subpackages__",
|
"//tensorflow/core/tpu:__subpackages__",
|
||||||
"//tensorflow/stream_executor/tpu:__subpackages__",
|
"//tensorflow/stream_executor/tpu:__subpackages__",
|
||||||
],
|
],
|
||||||
@ -105,7 +106,11 @@ cc_library(
|
|||||||
name = "tpu_initializer_helper",
|
name = "tpu_initializer_helper",
|
||||||
srcs = ["tpu_initializer_helper.cc"],
|
srcs = ["tpu_initializer_helper.cc"],
|
||||||
hdrs = ["tpu_initializer_helper.h"],
|
hdrs = ["tpu_initializer_helper.h"],
|
||||||
deps = ["@com_google_absl//absl/strings"],
|
deps = [
|
||||||
|
"//tensorflow/core/platform:logging",
|
||||||
|
"@com_google_absl//absl/strings",
|
||||||
|
"@com_google_absl//absl/synchronization",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
|
@ -65,6 +65,8 @@ Status InitializeTpuLibrary(void* library_handle) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool FindAndLoadTpuLibrary() {
|
bool FindAndLoadTpuLibrary() {
|
||||||
|
if (!TryAcquireTpuLock()) return false;
|
||||||
|
|
||||||
void* library = dlopen("libtpu.so", RTLD_NOW);
|
void* library = dlopen("libtpu.so", RTLD_NOW);
|
||||||
if (library) {
|
if (library) {
|
||||||
InitializeTpuLibrary(library);
|
InitializeTpuLibrary(library);
|
||||||
|
@ -62,6 +62,8 @@ Status InitializeTpuLibrary(void* library_handle) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool FindAndLoadTpuLibrary() {
|
bool FindAndLoadTpuLibrary() {
|
||||||
|
if (!TryAcquireTpuLock()) return false;
|
||||||
|
|
||||||
void* library = dlopen("libtpu.so", RTLD_NOW);
|
void* library = dlopen("libtpu.so", RTLD_NOW);
|
||||||
if (library) {
|
if (library) {
|
||||||
InitializeTpuLibrary(library);
|
InitializeTpuLibrary(library);
|
||||||
|
@ -15,13 +15,56 @@ limitations under the License.
|
|||||||
|
|
||||||
#include "tensorflow/core/tpu/tpu_initializer_helper.h"
|
#include "tensorflow/core/tpu/tpu_initializer_helper.h"
|
||||||
|
|
||||||
|
#if defined(LIBTPU_ON_GCE)
|
||||||
|
#include <fcntl.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif // LIBTPU_ON_GCE
|
||||||
|
|
||||||
#include "absl/strings/str_split.h"
|
#include "absl/strings/str_split.h"
|
||||||
|
#include "absl/synchronization/mutex.h"
|
||||||
|
#include "tensorflow/core/platform/logging.h"
|
||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
namespace tpu {
|
namespace tpu {
|
||||||
|
|
||||||
|
// Decides, once per process, whether this process may load libtpu.so.
//
// When built with LIBTPU_ON_GCE the first call makes the decision and every
// later call returns the cached answer:
//   * If the TPU_HOST_BOUNDS environment variable is set, loading is allowed
//     without taking any lock.
//   * Otherwise a non-blocking lock on /tmp/libtpu_lockfile is attempted;
//     loading is allowed only if that lock is obtained.
// The cached state is guarded by a function-local mutex.
//
// When LIBTPU_ON_GCE is not defined this always returns false.
//
// Returns true if libtpu.so may be loaded by this process, false otherwise.
bool TryAcquireTpuLock() {
#if defined(LIBTPU_ON_GCE)
  static absl::Mutex* mu = new absl::Mutex();
  absl::MutexLock l(mu);

  static bool attempted_file_open = false;
  static bool should_load_library = false;

  if (!attempted_file_open) {
    // Record that the decision has been made so that follow-up calls return
    // the cached answer instead of opening (and leaking) a new descriptor on
    // every call.
    attempted_file_open = true;
    should_load_library = true;

    // if the TPU_HOST_BOUNDS env var is set, that means we are loading each
    // chip in a different process and thus multiple libtpu loads are OK.
    if (getenv("TPU_HOST_BOUNDS") == nullptr) {
      // open() requires an explicit mode argument whenever O_CREAT is given;
      // without it the permission bits of a newly created file are garbage.
      int fd = open("/tmp/libtpu_lockfile", O_CREAT | O_RDWR, 0644);

      if (fd < 0) {
        // The lock file could not be opened at all, so the lock cannot be
        // taken. Stay conservative and do not load the library.
        LOG(WARNING) << "Could not open /tmp/libtpu_lockfile. Not attempting "
                        "to load libtpu.so in this process.";
        should_load_library = false;
      } else if (lockf(fd, F_TLOCK, 0) != 0) {
        LOG(WARNING) << "libtpu.so already in used by another process. Not "
                        "attempting to load libtpu.so in this process.";
        should_load_library = false;
        // No lock is held through this descriptor, so release it.
        close(fd);
      } else {
        // This lock is held until the process exits intentionally: the
        // descriptor is deliberately never closed, because closing it would
        // release the lock. The underlying TPU device stays held until then.
        should_load_library = true;
      }
    } else {
      LOG(INFO) << "TPU_HOST_BOUNDS is set, allowing multiple libtpu.so loads.";
      should_load_library = true;
    }
  }

  return should_load_library;
#else  // LIBTPU_ON_GCE
  return false;
#endif
}
|
||||||
|
|
||||||
std::pair<std::vector<std::string>, std::vector<const char*>>
|
std::pair<std::vector<std::string>, std::vector<const char*>>
|
||||||
GetLibTpuInitArguments() {
|
GetLibTpuInitArguments() {
|
||||||
// We make copies of the arguments returned by getenv because the memory
|
// We make copies of the arguments returned by getenv because the memory
|
||||||
|
@ -22,6 +22,11 @@ limitations under the License.
|
|||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
namespace tpu {
|
namespace tpu {
|
||||||
|
|
||||||
|
// Tries to acquire a system-wide lock on behalf of the whole process. Follow-up
|
||||||
|
// calls to this function return true if the lock has been acquired and false
|
||||||
|
// if we failed to acquire the lock.
|
||||||
|
bool TryAcquireTpuLock();
|
||||||
|
|
||||||
// Returns arguments (e.g. flags) set in the LIBTPU_INIT_ARGS environment
|
// Returns arguments (e.g. flags) set in the LIBTPU_INIT_ARGS environment
|
||||||
// variable. The first return value is the arguments, the second return value is
|
// variable. The first return value is the arguments, the second return value is
|
||||||
// pointers to the arguments suitable for passing into the C API.
|
// pointers to the arguments suitable for passing into the C API.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user