From 874418a301c5b71604dc8aead8bbdbe08a9d5bf3 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 2 Sep 2019 20:52:29 +0300 Subject: [PATCH] mdbx: rework POSIX-lck and merge with Linux-lck. Change-Id: Id8fbc81b9a2ad3a3a7499ecf9a012314e1f8062a --- libmdbx.files | 1 - src/CMakeLists.txt | 2 - src/alloy.c | 4 +- src/elements/core.c | 146 +++++------ src/elements/lck-linux.c | 489 ------------------------------------- src/elements/lck-posix.c | 441 ++++++++++++++++++++------------- src/elements/lck-windows.c | 49 ++-- src/elements/osal.h | 15 +- 8 files changed, 384 insertions(+), 763 deletions(-) delete mode 100644 src/elements/lck-linux.c diff --git a/libmdbx.files b/libmdbx.files index 5b952694..f6b12598 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -12,7 +12,6 @@ src/alloy.c src/elements/data.c src/elements/internals.h src/elements/defs.h -src/elements/lck-linux.c src/elements/lck-posix.c src/elements/lck-windows.c src/elements/core.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 206963e3..722d6693 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -28,8 +28,6 @@ if(MDBX_ALLOY_MODE) else() if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LIBMDBX_OSAL windows) - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - set(LIBMDBX_OSAL linux) else() set(LIBMDBX_OSAL posix) endif() diff --git a/src/alloy.c b/src/alloy.c index dd99a922..20642f21 100644 --- a/src/alloy.c +++ b/src/alloy.c @@ -22,9 +22,7 @@ #include "elements/core.c" #include "elements/osal.c" -#if defined(__linux__) || defined(__gnu_linux__) -#include "elements/lck-linux.c" -#elif defined(_WIN32) || defined(_WIN64) +#if defined(_WIN32) || defined(_WIN64) #include "elements/lck-windows.c" #else #include "elements/lck-posix.c" diff --git a/src/elements/core.c b/src/elements/core.c index 5c797616..cf813013 100644 --- a/src/elements/core.c +++ b/src/elements/core.c @@ -674,93 +674,80 @@ static uint64_t rrxmrrxmsx_0(uint64_t v) { return v ^ v >> 28; } -static int uniq_poke(const 
mdbx_mmap_t *map, const uint64_t cadabra) { - int rc; - if (map->lck) { - map->lck->mti_bait_uniqueness = cadabra; - mdbx_flush_noncoherent_cpu_writeback(); - rc = MDBX_SUCCESS; - } else { - rc = mdbx_pwrite(map->fd, &cadabra, sizeof(map->lck->mti_bait_uniqueness), - offsetof(MDBX_lockinfo, mti_bait_uniqueness)); - } - mdbx_trace("uniq-poke: %s, cadabra 0x016%" PRIx64 ", rc %d", - map->lck ? "mem" : "file", cadabra, rc); - return rc; -} - -static int uniq_peek(const mdbx_mmap_t *map, const uint64_t cadabra) { +static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { int rc; uint64_t bait; - if (map->lck) { - mdbx_invalidate_mmap_noncoherent_cache(map->lck, sizeof(*map->lck)); - bait = map->lck->mti_bait_uniqueness; + if (pending->address) { + bait = pending->lck->mti_bait_uniqueness; rc = MDBX_SUCCESS; } else { - rc = mdbx_pread(map->fd, &bait, sizeof(map->lck->mti_bait_uniqueness), - offsetof(MDBX_lockinfo, mti_bait_uniqueness)); + bait = 0 /* hush MSVC warning */; + rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), true); + if (rc == MDBX_SUCCESS) + rc = + mdbx_pread(pending->fd, &bait, sizeof(scan->lck->mti_bait_uniqueness), + offsetof(MDBX_lockinfo, mti_bait_uniqueness)); } + if (likely(rc == MDBX_SUCCESS) && bait == scan->lck->mti_bait_uniqueness) + rc = MDBX_RESULT_TRUE; - if (unlikely(!MDBX_IS_ERROR(rc))) - rc = (bait == cadabra) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE; - - mdbx_trace("uniq-peek: %s, cadabra 0x%016" PRIx64 ", bait 0x%016" PRIx64 - ",%s rc %d", - map->lck ? "mem" : "file", cadabra, bait, + mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending->lck ? "mem" : "file", bait, (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? 
" FAILED," : ""), rc); return rc; } -__cold static int uniq_probe(const mdbx_mmap_t *map, const mdbx_pid_t pid, - MDBX_env **found) { - if (inprocess_lcklist_head == RTHC_ENVLIST_END) { - mdbx_info("<< uniq-probe: pid %u, env-list empty, skip probing, rc %d", - (unsigned)pid, MDBX_RESULT_TRUE); - return MDBX_RESULT_TRUE; +static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, + uint64_t *abra) { + if (*abra == 0) { + const mdbx_tid_t tid = mdbx_thread_self(); + size_t uit = 0; + memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); + *abra = + rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); } + const uint64_t cadabra = + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) + << 24 | + *abra >> 40; + scan->lck->mti_bait_uniqueness = cadabra; + mdbx_flush_noncoherent_cpu_writeback(); + *abra = *abra * UINT64_C(6364136223846793005) + 1; + return uniq_peek(pending, scan); +} - const mdbx_tid_t tid = mdbx_thread_self(); - size_t uit = 0; - memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); - uint64_t abra = - rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); - - for (unsigned bits = 4; bits; bits >>= 1) { - abra = abra * UINT64_C(6364136223846793005) + 1; - const uint64_t cadabra = - rrxmrrxmsx_0(abra + UINT64_C(7680760450171793) * pid) << 20 | - abra >> 44; - - int err = uniq_poke(map, cadabra); - *found = nullptr; - for (MDBX_env *env = inprocess_lcklist_head; - err == MDBX_SUCCESS && env != RTHC_ENVLIST_END; - env = env->me_lcklist_next) { - err = uniq_peek(&env->me_lck_mmap, cadabra); - if (err == MDBX_RESULT_TRUE) - *found = env; +__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { + *found = nullptr; + uint64_t salt = 0; + for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; + scan = scan->me_lcklist_next) { + int err = scan->me_lck_mmap.lck->mti_bait_uniqueness + ? 
uniq_peek(pending, &scan->me_lck_mmap) + : uniq_poke(pending, &scan->me_lck_mmap, &salt); + if (err == MDBX_RESULT_TRUE) + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + if (err == MDBX_RESULT_TRUE) { + (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), false); + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } - - if (unlikely(MDBX_IS_ERROR(err))) { - mdbx_verbose("<< uniq-probe: pid %u, uit %zu, failed rc %d", - (unsigned)pid, uit, err); - return err; - } - - bits += 8 & err; - if (bits == 15) { - mdbx_info("<< uniq-probe: pid %u, uit %zu, found %p", (unsigned)pid, uit, - *found); + if (err == MDBX_RESULT_TRUE) { + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + *found = scan; + mdbx_info("<< uniq-probe: found %p", *found); return MDBX_RESULT_FALSE; } + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_verbose("<< uniq-probe: failed rc %d", err); + return err; + } } - mdbx_info("<< uniq-probe: pid %u, uit %zu, unique", (unsigned)pid, uit); + mdbx_info("<< uniq-probe: unique"); return MDBX_RESULT_TRUE; } static int lcklist_detach_locked(MDBX_env *env) { - MDBX_env *dup = nullptr; + MDBX_env *inprocess_neighbor = nullptr; int rc = MDBX_SUCCESS; if (env->me_lcklist_next != nullptr) { mdbx_ensure(env, env->me_lcklist_next != nullptr); @@ -776,11 +763,11 @@ static int lcklist_detach_locked(MDBX_env *env) { mdbx_ensure(env, env->me_lcklist_next == nullptr); } - rc = uniq_probe(&env->me_lck_mmap, env->me_pid, &dup); - if (!dup && env->me_live_reader) + rc = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (!inprocess_neighbor && env->me_live_reader) (void)mdbx_rpid_clear(env); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_lck_destroy(env, dup); + rc = mdbx_lck_destroy(env, inprocess_neighbor); return rc; } @@ -6731,11 +6718,15 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, return err; } + MDBX_env *inprocess_neighbor = nullptr; if (err == MDBX_RESULT_TRUE) { - MDBX_env *unused_lckdup_found; - err = 
uniq_probe(&env->me_lck_mmap, env->me_pid, &unused_lckdup_found); + err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); if (MDBX_IS_ERROR(err)) goto bailout; + if (inprocess_neighbor && (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)) { + err = MDBX_BUSY; + goto bailout; + } } const int lck_seize_rc = err; @@ -6814,6 +6805,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (lck_seize_rc == MDBX_RESULT_TRUE) { /* LY: exlcusive mode, reset lck */ memset(env->me_lck, 0, (size_t)size); + mdbx_jitter4testing(false); env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC; env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT; } else { @@ -6966,15 +6958,9 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; if (lck_rc == MDBX_RESULT_TRUE) { env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); - if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - /* LY: downgrade lock only if exclusive access not requested. - * in case exclusive==1, just leave value as is. */ - rc = mdbx_lck_downgrade(env, true); - mdbx_debug("lck-downgrade-full: rc %i ", rc); - } else { - rc = mdbx_lck_downgrade(env, false); - mdbx_debug("lck-downgrade-partial: rc %i ", rc); - } + rc = mdbx_lck_downgrade(env); + mdbx_debug("lck-downgrade-%s: rc %i", + (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) goto bailout; } else { diff --git a/src/elements/lck-linux.c b/src/elements/lck-linux.c deleted file mode 100644 index 04d2a62a..00000000 --- a/src/elements/lck-linux.c +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright 2015-2019 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. 
- * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#if !(defined(__linux__) || defined(__gnu_linux__)) -#error "This implementation of locking only supports Linux,\ - where is no interaction between the types of lock placed\ - by flock() and fcntl()." -#endif - -#include "./internals.h" -#include - -/* Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDBX_USE_ROBUST=0. */ -#ifndef MDBX_USE_ROBUST -/* Howard Chu: Android currently lacks Robust Mutex support */ -#if defined(EOWNERDEAD) && \ - !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \ - with Robust Mutex too. */ \ - && (!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) || \ - _POSIX_C_SOURCE >= 200809L) -#define MDBX_USE_ROBUST 1 -#else -#define MDBX_USE_ROBUST 0 -#endif -#endif /* MDBX_USE_ROBUST */ - -/*----------------------------------------------------------------------------*/ -/* global constructor/destructor */ - -#ifndef MDBX_ALLOY -uint32_t mdbx_linux_kernel_version; -#endif /* MDBX_ALLOY */ - -static __cold __attribute__((__constructor__)) void -mdbx_global_constructor(void) { - struct utsname buffer; - if (uname(&buffer) == 0) { - int i = 0; - char *p = buffer.release; - while (*p && i < 4) { - if (*p >= '0' && *p <= '9') { - long number = strtol(p, &p, 10); - if (number > 0) { - if (number > 255) - number = 255; - mdbx_linux_kernel_version += number << (24 - i * 8); - } - ++i; - } else { - ++p; - } - } - } - - mdbx_rthc_global_init(); -} - -static __cold __attribute__((__destructor__)) void -mdbx_global_destructor(void) { - mdbx_rthc_global_dtor(); -} - -/*----------------------------------------------------------------------------*/ -/* lck */ - -/* Описание реализации блокировок для Linux: - * - * lck-файл отображается в память, в нём организуется таблица читателей и - * размещаются совместно используемые posix-мьютексы 
(futex). Посредством - * этих мьютексов (см struct MDBX_lockinfo) реализуются: - * - Блокировка таблицы читателей для регистрации, - * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). - * - Блокировка БД для пишущих транзакций, - * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). - * - * Остальной функционал реализуется отдельно посредством файловых блокировок: - * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). - * - Проверка присутствие процессов-читателей, - * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). - * - * Используется два вида файловых блокировок flock() и fcntl(F_SETLK), - * как для lck-файла, так и для основного файла БД: - * - Для контроля процессов-читателей используются однобайтовые - * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом - * в качестве позиции используется pid процесса-читателя. - * - Для первоначального захвата и shared/exclusive блокировок используется - * комбинация flock() и fcntl(F_SETLK) блокировки одного байта lck-файла - * в нулевой позиции (нулевая позиция не используется механизмом контроля - * процессов-читателей, так как pid пользовательского процесса в Linux - * всегда больше 0). - * - Кроме этого, flock() блокировка основного файла БД используется при работе - * в режимах без lck-файла, как в в read-only, так и в эксклюзивном. - * - Блокировки flock() и fcntl(F_SETLK) в Linux работают независимо. Поэтому - * их комбинирование позволяет предотвратить совместное использование БД - * через NFS, что позволяет fcntl(F_SETLK), одновременно защитившись - * от проблем не-аторманости flock() при переходе между эксклюзивным - * и атомарным режимами блокировок. 
- */ - -static int op_setlk, op_setlkw, op_getlk; -static void __cold choice_fcntl() { - assert(!op_setlk && !op_setlkw && !op_getlk); -#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) - if (mdbx_linux_kernel_version > - 0x030f0000 /* OFD locks are available since 3.15, but engages here - only for 3.16 and larer kernels (LTS) for reliability reasons */ - && (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0) { - op_setlk = F_OFD_SETLK; - op_setlkw = F_OFD_SETLKW; - op_getlk = F_OFD_GETLK; - return; - } -#endif /* OFD locks */ - op_setlk = F_SETLK; - op_setlkw = F_SETLKW; - op_getlk = F_GETLK; -} - -#ifndef OFF_T_MAX -#define OFF_T_MAX \ - ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff) -#endif -#define LCK_WHOLE OFF_T_MAX - -static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, - off_t len) { - for (;;) { - struct flock lock_op; - memset(&lock_op, 0, sizeof(lock_op)); - lock_op.l_type = lck; - lock_op.l_whence = SEEK_SET; - lock_op.l_start = offset; - lock_op.l_len = len; - if (fcntl(fd, cmd, &lock_op) == 0) { - if (cmd == op_getlk) { - /* Checks reader by pid. Returns: - * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */ - return (lock_op.l_type == F_UNLCK) ? 
MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; - } - return 0; - } - int rc = errno; - if (rc != EINTR || cmd == op_setlkw) - return rc; - } -} - -static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { - assert(lfd != INVALID_HANDLE_VALUE); - if (flock(lfd, LOCK_EX | LOCK_NB)) - return errno; - int rc = mdbx_lck_op(lfd, op_setlk, F_WRLCK, 0, 1); - if (rc != 0 && fallback2shared) { - while (flock(lfd, LOCK_SH)) { - int rc = errno; - if (rc != EINTR) - return rc; - } - } - return rc; -} - -static __inline int mdbx_lck_shared(int lfd) { - assert(lfd != INVALID_HANDLE_VALUE); - while (flock(lfd, LOCK_SH)) { - int rc = errno; - if (rc != EINTR) - return rc; - } - return mdbx_lck_op(lfd, op_setlkw, F_RDLCK, 0, 1); -} - -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env, bool complete) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; -} - -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0); - return mdbx_lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); -} - -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0); - return mdbx_lck_op(env->me_lfd, op_setlkw, F_UNLCK, env->me_pid, 1); -} - -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(pid > 0); - return mdbx_lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); -} - -/*---------------------------------------------------------------------------*/ - -static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, - const int rc); - -MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, - int global_uniqueness_flag) { - if (global_uniqueness_flag == MDBX_RESULT_FALSE) - return MDBX_SUCCESS; - - pthread_mutexattr_t ma; - int rc = pthread_mutexattr_init(&ma); - if (rc) - return rc; - - rc = 
pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); - if (rc) - goto bailout; - -#if MDBX_USE_ROBUST -#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ - !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L - rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); -#else - rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); -#endif - if (rc) - goto bailout; -#endif /* MDBX_USE_ROBUST */ - -#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); - if (rc == ENOTSUP) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); - if (rc) - goto bailout; -#endif /* PTHREAD_PRIO_INHERIT */ - - rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); - if (rc) - goto bailout; - - rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma); - if (rc) - goto bailout; - rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma); - -bailout: - pthread_mutexattr_destroy(&ma); - return rc; -} - -MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor) { - if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && - env->me_lck && - /* try get exclusive access */ mdbx_lck_exclusive(env->me_lfd, false) == - 0) { - mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); - int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); - if (rc == 0) - rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); - assert(rc == 0); - (void)rc; - msync(env->me_lck, env->me_os_psize, MS_ASYNC); - /* file locks would be released (by kernel) - * while the me_lfd will be closed */ - } - - if (op_setlk == F_SETLK) { - /* File locks would be released (by kernel) while the file-descriptors - * will be closed. But to avoid false-positive EDEADLK from the kernel, - * locks should be released here explicitly with properly order. */ - - /* POSIX's fcntl() locks should be restored after file was closed. 
- * FIXME: This code should be rethinked and retested, since it will - * executed in really rare cases. - * - * On the other hand, seems more reasonable to disallow multi-open feature - * by default, and describe it as "use at your own risk". Currently - * multi-open required only for libfpta's unit-tests. */ - - int rc = MDBX_SUCCESS; - /* close clk and restore locks */ - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)close(env->me_lfd); - env->me_lfd = INVALID_HANDLE_VALUE; - if (inprocess_neighbor) { - /* restore file-locks */ - if (rc == MDBX_SUCCESS) - rc = mdbx_lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); - if (rc == MDBX_SUCCESS) - rc = mdbx_rpid_set(inprocess_neighbor); - } - } - - /* close dxb and restore lock */ - if (env->me_fd != INVALID_HANDLE_VALUE) { - (void)close(env->me_fd); - env->me_fd = INVALID_HANDLE_VALUE; - if (inprocess_neighbor && rc == MDBX_SUCCESS) { - /* restore file-lock */ - rc = mdbx_lck_op( - inprocess_neighbor->me_fd, F_SETLKW, - (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, - (inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) - ? 0 - : inprocess_neighbor->me_pid, - (inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) ? OFF_T_MAX - : 1); - } - } - - if (inprocess_neighbor && rc != MDBX_SUCCESS) { - inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR; - return rc; - } - } - - return MDBX_SUCCESS; -} - -static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_lock(mutex); - if (unlikely(rc != 0)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_trylock(mutex); - if (unlikely(rc != 0 && rc != EBUSY)) - rc = mdbx_mutex_failed(env, mutex, rc); - return (rc != EBUSY) ? 
rc : MDBX_BUSY; -} - -static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_unlock(mutex); - if (unlikely(rc != 0)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex); - mdbx_trace("<< rc %d", rc); - return rc; -} - -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); - mdbx_trace("<< rc %d", rc); - if (unlikely(MDBX_IS_ERROR(rc))) - mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); -} - -int mdbx_txn_lock(MDBX_env *env, bool dontwait) { - mdbx_trace(">>"); - int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex) - : mdbx_robust_lock(env, env->me_wmutex); - mdbx_trace("<< rc %d", rc); - return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; -} - -void mdbx_txn_unlock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, env->me_wmutex); - mdbx_trace("<< rc %d", rc); - if (unlikely(MDBX_IS_ERROR(rc))) - mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); -} - -static int __cold internal_seize_lck(int lfd) { - assert(lfd != INVALID_HANDLE_VALUE); - - /* try exclusive access */ - int rc = mdbx_lck_exclusive(lfd, false); - if (rc == 0) - /* got exclusive */ - return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { - /* get shared access */ - rc = mdbx_lck_shared(lfd); - if (rc == 0) { - /* got shared, try exclusive again */ - rc = mdbx_lck_exclusive(lfd, true); - if (rc == 0) - /* now got exclusive */ - return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) - /* unable exclusive, but stay shared */ - return MDBX_RESULT_FALSE; - } - } - assert(MDBX_IS_ERROR(rc)); - return rc; -} - -MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { - assert(env->me_fd != INVALID_HANDLE_VALUE); - if 
(unlikely(op_setlk == 0)) - choice_fcntl(); - - if (env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - int rc = mdbx_lck_op(env->me_fd, op_setlk, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - LCK_WHOLE); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); - return rc; - } - return MDBX_RESULT_TRUE; - } - - if ((env->me_flags & MDBX_RDONLY) == 0) { - /* Check that another process don't operates in without-lck mode. */ - int rc = mdbx_lck_op(env->me_fd, op_setlk, F_WRLCK, env->me_pid, 1); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "lock-against-without-lck", rc); - return rc; - } - } - - return internal_seize_lck(env->me_lfd); -} - -static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, - const int err) { - int rc = err; -#if MDBX_USE_ROBUST - if (err == EOWNERDEAD) { - /* We own the mutex. Clean up after dead previous owner. */ - - int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex); - rc = MDBX_SUCCESS; - if (!rlocked) { - if (unlikely(env->me_txn)) { - /* env is hosed if the dead thread was ours */ - env->me_flags |= MDBX_FATAL_ERROR; - env->me_txn = NULL; - rc = MDBX_PANIC; - } - } - mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); - - int check_rc = mdbx_reader_check0(env, rlocked, NULL); - check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; - -#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ - !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L - int mreco_rc = pthread_mutex_consistent_np(mutex); -#else - int mreco_rc = pthread_mutex_consistent(mutex); -#endif - check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; - - if (unlikely(mreco_rc)) - mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); - - rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; - if (MDBX_IS_ERROR(rc)) - pthread_mutex_unlock(mutex); - return rc; - } -#else - (void)mutex; -#endif /* MDBX_USE_ROBUST */ - - mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); - if (rc != EDEADLK) - env->me_flags |= MDBX_FATAL_ERROR; - return rc; -} diff --git a/src/elements/lck-posix.c b/src/elements/lck-posix.c index a821aac5..d60b6a05 100644 --- a/src/elements/lck-posix.c +++ b/src/elements/lck-posix.c @@ -14,22 +14,63 @@ #include "./internals.h" -/* Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDBX_USE_ROBUST=0. */ +/* Some platforms define the EOWNERDEAD error code even though they + * don't support Robust Mutexes. Compile with -DMDBX_USE_ROBUST=0. */ #ifndef MDBX_USE_ROBUST -#if (defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L) && !defined(__APPLE__) +/* Howard Chu: Android currently lacks Robust Mutex support */ +#if defined(EOWNERDEAD) && !defined(__ANDROID__) && !defined(__APPLE__) && \ + (!defined(__GLIBC__) || \ + __GLIBC_PREREQ( \ + 2, \ + 10) /* LY: glibc before 2.10 has a troubles with Robust Mutex too. 
*/ \ + || _POSIX_C_SOURCE >= 200809L) #define MDBX_USE_ROBUST 1 #else #define MDBX_USE_ROBUST 0 #endif #endif /* MDBX_USE_ROBUST */ +#ifndef MDBX_USE_OFDLOCKS +#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) +#define MDBX_USE_OFDLOCKS 1 +#else +#define MDBX_USE_OFDLOCKS 0 +#endif +#endif /* MDBX_USE_OFDLOCKS */ + /*----------------------------------------------------------------------------*/ -/* rthc */ +/* global constructor/destructor */ + +#if defined(__linux__) || defined(__gnu_linux__) +#include +#ifndef MDBX_ALLOY +uint32_t mdbx_linux_kernel_version; +#endif /* MDBX_ALLOY */ +#endif /* Linux */ static __cold __attribute__((__constructor__)) void mdbx_global_constructor(void) { +#if defined(__linux__) || defined(__gnu_linux__) + struct utsname buffer; + if (uname(&buffer) == 0) { + int i = 0; + char *p = buffer.release; + while (*p && i < 4) { + if (*p >= '0' && *p <= '9') { + long number = strtol(p, &p, 10); + if (number > 0) { + if (number > 255) + number = 255; + mdbx_linux_kernel_version += number << (24 - i * 8); + } + ++i; + } else { + ++p; + } + } + } +#endif /* Linux */ + mdbx_rthc_global_init(); } @@ -41,7 +82,7 @@ mdbx_global_destructor(void) { /*----------------------------------------------------------------------------*/ /* lck */ -/* Описание реализации блокировок для POSIX: +/* Описание реализации блокировок для POSIX & Linux: * * lck-файл отображается в память, в нём организуется таблица читателей и * размещаются совместно используемые posix-мьютексы (futex). Посредством @@ -57,7 +98,7 @@ mdbx_global_destructor(void) { * - Проверка присутствие процессов-читателей, * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). * - * Для блокировки файлов Используется только fcntl(F_SETLK), так как: + * Для блокировки файлов используется fcntl(F_SETLK), так как: * - lockf() оперирует только эксклюзивной блокировкой и требует * открытия файла в RW-режиме. 
* - flock() не гарантирует атомарности при смене блокировок @@ -67,28 +108,68 @@ mdbx_global_destructor(void) { * в качестве позиции используется pid процесса-читателя. * - Для первоначального захвата и shared/exclusive выполняется блокировка * основного файла БД и при успехе lck-файла. + * + * ---------------------------------------------------------------------------- + * УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ + * + * Эксклюзивный режим без lck-файла: + * = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK, + * в зависимости от MDBX_RDONLY. + * + * Не-операционный режим на время пере-инициализации и разрушения lck-файла: + * = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её + * снятия при получении F_RDLCK через F_SETLKW. + * - блокировки dxb-файла могут меняться до снятия эксклюзивной блокировки + * lck-файла: + * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + * ОПЕРАЦИОННЫЙ режим с lck-файлом: + * = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут + * получить F_WRLCK и таким образом видят что БД используется. + * + F_WRLCK блокировка pid-байта в lck-файле после первой транзакции чтения. + * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. 
*/ +#if MDBX_USE_OFDLOCKS +static int op_setlk, op_setlkw, op_getlk; +static void __cold choice_fcntl() { + assert(!op_setlk && !op_setlkw && !op_getlk); + if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 +#if defined(__linux__) || defined(__gnu_linux__) + && mdbx_linux_kernel_version > + 0x030f0000 /* OFD locks are available since 3.15, but are engaged here + only for 3.16 and later kernels (LTS) for reliability reasons */ +#endif /* linux */ + ) { + op_setlk = F_OFD_SETLK; + op_setlkw = F_OFD_SETLKW; + op_getlk = F_OFD_GETLK; + return; + } + op_setlk = F_SETLK; + op_setlkw = F_SETLKW; + op_getlk = F_GETLK; +} +#else +#define op_setlk F_SETLK +#define op_setlkw F_SETLKW +#define op_getlk F_GETLK +#endif /* MDBX_USE_OFDLOCKS */ + #ifndef OFF_T_MAX #define OFF_T_MAX \ ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff) #endif -#ifndef PID_T_MAX -#define PID_T_MAX INT_MAX -#endif -#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) -#define OP_SETLK F_OFD_SETLK -#define OP_SETLKW F_OFD_SETLKW -#define OP_GETLK F_OFD_GETLK -#else -#define OP_SETLK F_SETLK -#define OP_SETLKW F_SETLKW -#define OP_GETLK F_GETLK -#endif /* OFD locks */ - -static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, - off_t len) { +static int lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, + off_t len) { + mdbx_jitter4testing(true); for (;;) { struct flock lock_op; memset(&lock_op, 0, sizeof(lock_op)); @@ -96,131 +177,212 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, lock_op.l_whence = SEEK_SET; lock_op.l_start = offset; lock_op.l_len = len; - if (fcntl(fd, cmd, &lock_op) == 0) { - if (cmd == OP_GETLK) { + int rc = fcntl(fd, cmd, &lock_op); + mdbx_jitter4testing(true); + if (rc != -1) { + if (cmd == op_getlk) { /* Checks reader by pid. Returns: * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE - if pid is dead (lock acquired). 
*/ return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; } - return 0; + return MDBX_SUCCESS; } - int rc = errno; - if (rc != EINTR || cmd == F_SETLKW) + rc = errno; + if (rc != EINTR || cmd == op_setlkw) { + mdbx_assert(nullptr, MDBX_IS_ERROR(rc)); return rc; + } } } MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); - return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1); + assert(env->me_pid > 0); + return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); } MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); - return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1); + assert(env->me_pid > 0); + return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); } MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(pid > 0 && pid <= PID_T_MAX); - assert(PID_T_MAX < OFF_T_MAX); - return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1); + assert(pid > 0); + return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); } +/*---------------------------------------------------------------------------*/ + MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ + int rc; if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - int rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX); - if (rc != 0) { + rc = + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, OFF_T_MAX); + if (rc != MDBX_SUCCESS) { mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } - return MDBX_RESULT_TRUE; + return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; } - /* try exclusive access */ - int rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX); - if (rc == 0) { - continue_exclusive: - /* got dxb-exclusive, continue lck-exclusive */ - rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_WRLCK, 0, OFF_T_MAX); - if (rc == 0) { - /* got both exclusive */ - return MDBX_RESULT_TRUE; + /* First, try to get exclusive locking. */ + rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS) { + continue_dxb_exclusive: + rc = + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); + if (rc == MDBX_SUCCESS) + return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; + + /* the cause may be a collision with POSIX's file-lock recovery. */ + if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || + rc == EDEADLK)) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-exclusive", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; } + + /* Fallback to lck-shared */ + rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "fallback-shared", + rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + /* Done: return with shared locking. */ + return MDBX_RESULT_FALSE; + } + + /* Wait for lck-shared now. */ + /* A wait may occur here during transient states, for instance until another + * competing process calls lck_downgrade(). 
*/ + rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "try-shared", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + + /* Lock against another process operating in without-lck or exclusive mode. */ + rc = + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1); + if (rc != MDBX_SUCCESS) { mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "lck-after-dxb-exclusive", rc); - assert(MDBX_IS_ERROR(rc)); - goto bailout; + "lock-against-without-lck", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; } - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { - rc = mdbx_lck_op(env->me_fd, OP_SETLKW, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, - env->me_pid, 1); - if (rc == 0) { - /* got dxb-shared, try again dxb-exclusive */ - rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX); - if (rc == 0) - goto continue_exclusive; + /* got shared, retry exclusive */ + rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS) + goto continue_dxb_exclusive; - /* continue lck-shared */ - rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); - if (rc == 0) { - /* got both dxb and lck shared lock */ - return MDBX_RESULT_FALSE; - } - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck-shared", rc); - } else { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-shared", rc); - } - assert(MDBX_IS_ERROR(rc)); - } + if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || + rc == EDEADLK) + return MDBX_RESULT_FALSE /* Done: exclusive is unavailable, + but shared locks are alive. 
*/ + ; -bailout: - (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - assert(MDBX_IS_ERROR(rc)); + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "try-exclusive", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } -int mdbx_lck_downgrade(MDBX_env *env, bool complete) { +MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 1, OFF_T_MAX - 1); - if (rc == 0) - rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); + int rc = MDBX_SUCCESS; + if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { + rc = lck_op(env->me_fd, op_setlk, F_UNLCK, 0, env->me_pid); + if (rc == MDBX_SUCCESS) + rc = lck_op(env->me_fd, op_setlk, F_UNLCK, env->me_pid + 1, + OFF_T_MAX - env->me_pid - 1); + } + if (rc == MDBX_SUCCESS) + rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); if (unlikely(rc != 0)) { mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc); - goto bailout; + assert(MDBX_IS_ERROR(rc)); } - if (complete) { - rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, - env->me_pid, 1); - if (unlikely(rc != 0)) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb", rc); - goto bailout; + return rc; +} + +MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor) { + int rc = MDBX_SUCCESS; + if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && + env->me_lck && + /* try to get exclusive access to both files */ + lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { + mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); + rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); + if (rc == 0) + rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); + mdbx_assert(env, rc == 0); + if (rc == 0) { + memset(env->me_lck, 0x81, sizeof(MDBX_lockinfo)); + msync(env->me_lck, env->me_os_psize, MS_ASYNC); + } + mdbx_jitter4testing(false); + } + + /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored + * after file was closed. + * + * 2) File locks would be released (by kernel) while the file-descriptors will + * be closed. But to avoid false-positive EACCES and EDEADLK from the kernel, + * locks should be released here explicitly in the proper order. */ + + /* close dxb and restore lock */ + if (env->me_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_fd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-lock */ + rc = lck_op( + inprocess_neighbor->me_fd, F_SETLKW, + (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) + ? 0 + : inprocess_neighbor->me_pid, + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? 
OFF_T_MAX : 1); } } - return MDBX_SUCCESS; -bailout: - (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - assert(MDBX_IS_ERROR(rc)); + /* close lck and restore locks */ + if (env->me_lfd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_lfd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-locks */ + rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); + if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) + rc = mdbx_rpid_set(inprocess_neighbor); + } + } + + if (inprocess_neighbor && rc != MDBX_SUCCESS) + inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR; return rc; } @@ -231,7 +393,7 @@ static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, int global_uniqueness_flag) { - if (global_uniqueness_flag == MDBX_RESULT_FALSE) + if (global_uniqueness_flag != MDBX_RESULT_TRUE) return MDBX_SUCCESS; pthread_mutexattr_t ma; @@ -244,7 +406,12 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, goto bailout; #if MDBX_USE_ROBUST +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ + !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#else rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#endif if (rc) goto bailout; #endif /* MDBX_USE_ROBUST */ @@ -271,75 +438,8 @@ bailout: return rc; } -MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor) { - /* File locks would be released (by kernel) while the file-descriptors - * will be closed. But to avoid false-positive EDEADLK from the kernel, - * locks should be released here explicitly with properly order. 
*/ - - if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && - env->me_lck && - /* try get exclusive access */ - mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX) == 0 && - mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) { - mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); - int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); - if (rc == 0) - rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); - assert(rc == 0); - (void)rc; - msync(env->me_lck, env->me_os_psize, MS_ASYNC); - } - - /* POSIX's fcntl() locks should be restored after file was closed. - * FIXME: This code should be rethinked and retested, since it will executed - * in really rare cases. For instance, this code could wait a lot, if other - * process get exclusive access immediately after the close(). - * - * On the other hand, seems more reasonable to disallow multi-open feature - * by default, and describe it as "use at your own risk". Currently - * multi-open required only for libfpta's unit-tests. */ - - int rc = MDBX_SUCCESS; - /* close clk and restore locks */ - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)close(env->me_lfd); - env->me_lfd = INVALID_HANDLE_VALUE; - if (inprocess_neighbor) { - /* restore file-locks */ - if (rc == MDBX_SUCCESS) - rc = mdbx_lck_op(inprocess_neighbor->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); - if (rc == MDBX_SUCCESS) - rc = mdbx_rpid_set(inprocess_neighbor); - } - } - - /* close dxb and restore lock */ - if (env->me_fd != INVALID_HANDLE_VALUE) { - (void)close(env->me_fd); - env->me_fd = INVALID_HANDLE_VALUE; - if (inprocess_neighbor && rc == MDBX_SUCCESS) { - /* restore file-lock */ - rc = mdbx_lck_op( - inprocess_neighbor->me_fd, OP_SETLKW, - (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, - (inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) - ? 0 - : inprocess_neighbor->me_pid, - (inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) ? 
OFF_T_MAX : 1); - } - } - - if (inprocess_neighbor && rc != MDBX_SUCCESS) { - inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR; - return rc; - } - return MDBX_SUCCESS; -} - static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { + mdbx_jitter4testing(true); int rc = pthread_mutex_lock(mutex); if (unlikely(rc != 0)) rc = mdbx_mutex_failed(env, mutex, rc); @@ -347,6 +447,7 @@ static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { } static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { + mdbx_jitter4testing(true); int rc = pthread_mutex_trylock(mutex); if (unlikely(rc != 0 && rc != EBUSY)) rc = mdbx_mutex_failed(env, mutex, rc); @@ -355,6 +456,7 @@ static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { int rc = pthread_mutex_unlock(mutex); + mdbx_jitter4testing(true); if (unlikely(rc != 0)) rc = mdbx_mutex_failed(env, mutex, rc); return rc; @@ -414,7 +516,12 @@ static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, int check_rc = mdbx_reader_check0(env, rlocked, NULL); check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ + !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L + int mreco_rc = pthread_mutex_consistent_np(mutex); +#else int mreco_rc = pthread_mutex_consistent(mutex); +#endif check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; if (unlikely(mreco_rc)) diff --git a/src/elements/lck-windows.c b/src/elements/lck-windows.c index 412bdb7e..d4d15459 100644 --- a/src/elements/lck-windows.c +++ b/src/elements/lck-windows.c @@ -341,17 +341,32 @@ mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { /* global `initial` lock for lockfile initialization, * exclusive/shared locking first cacheline */ -/* FIXME: locking schema/algo descritpion. - ?-? = free - S-? = used - E-? 
= exclusive-read - ?-S - ?-E = middle - S-S - S-E = locked - E-S - E-E = exclusive-write -*/ +/* Brief description of the locking schema/algorithm: + * - Windows does not support upgrading or downgrading for file locking. + * - Therefore upgrading/downgrading is emulated by shared and exclusive + * locking of upper and lower halves. + * - In other words, we have an FSM with 9 possible states, + * i.e. free/shared/exclusive x free/shared/exclusive == 9. + * Only 6 states of the FSM are used, 2 of which are transitive. + * + * The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked + * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, + * or to the "used" (and returns MDBX_RESULT_FALSE). + * + * The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write" + * state to the "used" (i.e. shared) state. + * + * States: + * ?-? = free, i.e. unlocked + * S-? = used, i.e. shared lock + * E-? = exclusive-read, i.e. operational exclusive + * ?-S + * ?-E = middle (transitive state) + * S-S + * S-E = locked (transitive state) + * E-S + * E-E = exclusive-write, i.e. exclusive due to (re)initialization + */ static void lck_unlock(MDBX_env *env) { int rc; @@ -414,8 +429,8 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, } /* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) - * or as 'used' (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error - */ + * or as 'used' (S-? and returns MDBX_RESULT_FALSE). + * Otherwise returns an error. */ static int internal_seize_lck(HANDLE lfd) { int rc; assert(lfd != INVALID_HANDLE_VALUE); @@ -511,23 +526,25 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env, bool complete) { +MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { /* Transite from exclusive state (E-?) to used (S-?) 
*/ assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); +#if 1 if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; - +#else /* 1) must be at E-E (exclusive-write) */ - if (!complete) { + if (env->me_flags & MDBX_EXCLUSIVE) { /* transite from E-E to E_? (exclusive-read) */ if (!funlock(env->me_lfd, LCK_UPPER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, "E-E(exclusive-write) >> E-?(exclusive-read)", GetLastError()); return MDBX_SUCCESS /* 2) now at E-? (exclusive-read), done */; } +#endif /* 3) now at E-E (exclusive-write), transite to ?_E (middle) */ if (!funlock(env->me_lfd, LCK_LOWER)) diff --git a/src/elements/osal.h b/src/elements/osal.h index e5770c3a..32324584 100644 --- a/src/elements/osal.h +++ b/src/elements/osal.h @@ -698,12 +698,17 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// \brief Снижает уровень первоначальной захваченной блокировки до -/// операционного уровня определяемого аргументом. -/// \param -/// complete = TRUE - понижение до разделяемой блокировки. -/// complete = FALSE - понижение до эксклюзивной операционной блокировки. +/// операционного уровня определяемого аргументом. Смысл функции в возврате +/// в операционный режим: +/// - разблокирование других процессов ожидающих доступа, т.е если +/// (env->me_flags & MDBX_EXCLUSIVE) != 0, то другие процессы должны узнать +/// о невозможности доступа, а не ждать его. +/// - снятия блокировок мешающих работе с файлом (актуально для Windows). +/// (env->me_flags & MDBX_EXCLUSIVE) == 0 - понижение до разделяемой +/// блокировки. (env->me_flags & MDBX_EXCLUSIVE) != 0 - понижение до +/// эксклюзивной операционной блокировки. /// \return Код ошибки или 0 в случае успеха. 
-MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env, bool complete); +MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); /// \brief Блокирует lck-файл и/или таблицу читателей для (де)регистрации. /// \return Код ошибки или 0 в случае успеха.