From 0dc544fefdcfe4f045979a263b668ae2f0c9e8ba Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 11 Nov 2019 12:48:31 +0300 Subject: [PATCH] mdbx: use SysV semaphores on systems without shared mutexes. Change-Id: Ib2ad9ed137ab76999a2a8e832f9f77ff1a0788ca --- src/elements/core.c | 16 ++++-- src/elements/internals.h | 43 +++++++++++----- src/elements/lck-posix.c | 103 +++++++++++++++++++++++++++++++++++---- src/elements/osal.h | 4 ++ 4 files changed, 140 insertions(+), 26 deletions(-) diff --git a/src/elements/core.c b/src/elements/core.c index c880c1f7..0cd72c3f 100644 --- a/src/elements/core.c +++ b/src/elements/core.c @@ -7494,7 +7494,7 @@ int __cold mdbx_env_create(MDBX_env **penv) { goto bailout; } -#if MDBX_LOCKING > 0 +#if MDBX_LOCKING > MDBX_LOCKING_SYSV rc = mdbx_ipclock_stub(&env->me_lckless_stub.wlock); #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { @@ -8338,7 +8338,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_maxreaders = UINT_MAX; #if MDBX_LOCKING > 0 env->me_wlock = &env->me_lckless_stub.wlock; -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > 0 */ mdbx_debug("lck-setup:%s%s%s", " lck-less", (env->me_flags & MDBX_RDONLY) ? " readonly" : "", (rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); @@ -8472,7 +8472,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_meta_sync_txnid = &lck->mti_meta_sync_txnid; #if MDBX_LOCKING > 0 env->me_wlock = &lck->mti_wlock; -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > 0 */ return lck_seize_rc; } @@ -8704,6 +8704,14 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, if (rc != MDBX_SUCCESS) goto bailout; +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + env->me_sysv_ipc.key = ftok(dxb_pathname, 42); + if (env->me_sysv_ipc.key == -1) { + rc = errno; + goto bailout; + } +#endif /* MDBX_LOCKING */ + const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; @@ -8930,7 +8938,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ -#if MDBX_LOCKING > 0 +#if MDBX_LOCKING > MDBX_LOCKING_SYSV mdbx_ensure(env, mdbx_ipclock_destroy(&env->me_lckless_stub.wlock) == 0); #endif /* MDBX_LOCKING */ diff --git a/src/elements/internals.h b/src/elements/internals.h index f68bdb46..5fdb37f1 100644 --- a/src/elements/internals.h +++ b/src/elements/internals.h @@ -204,9 +204,11 @@ #endif /* MDBX_64BIT_CAS */ #define MDBX_LOCKING_WIN32FILES -1 +#define MDBX_LOCKING_SYSV 5 /* SystemV IPC semaphores */ #define MDBX_LOCKING_POSIX1988 1988 /* POSIX-1 Shared anonymous semaphores */ #define MDBX_LOCKING_POSIX2001 2001 /* POSIX-2001 Shared Mutexes */ #define MDBX_LOCKING_POSIX2008 2008 /* POSIX-2008 Robust Mutexes */ +#define MDBX_LOCKING_BENAPHORE 1995 /* BeOS Benaphores, aka Futexes */ #if defined(_WIN32) || defined(_WIN64) #define MDBX_LOCKING MDBX_LOCKING_WIN32FILES @@ -227,8 +229,10 @@ #else #define MDBX_LOCKING MDBX_LOCKING_POSIX2001 #endif -#else +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) #define MDBX_LOCKING MDBX_LOCKING_POSIX1988 +#else +#define MDBX_LOCKING MDBX_LOCKING_SYSV #endif #define 
MDBX_LOCKING_CONFIG "AUTO=" STRINGIFY(MDBX_LOCKING) #else @@ -507,8 +511,18 @@ typedef struct MDBX_page { #pragma pack(pop) -#if MDBX_LOCKING > 0 -#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ +#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES +#define MDBX_CLOCK_SIGN UINT32_C(0xF10C) +typedef void mdbx_ipclock_t; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + +#define MDBX_CLOCK_SIGN UINT32_C(0xF18D) +typedef mdbx_pid_t mdbx_ipclock_t; +#ifndef EOWNERDEAD +#define EOWNERDEAD MDBX_RESULT_TRUE +#endif + +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) typedef pthread_mutex_t mdbx_ipclock_t; @@ -517,13 +531,11 @@ typedef pthread_mutex_t mdbx_ipclock_t; typedef sem_t mdbx_ipclock_t; #else #error "FIXME" -#endif +#endif /* MDBX_LOCKING */ +#if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); - -#else -#define MDBX_CLOCK_SIGN UINT32_C(0xF10C) #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -642,10 +654,10 @@ typedef struct MDBX_lockinfo { alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ - /* Write transation lok. */ + /* Write transaction lock. */ #if MDBX_LOCKING > 0 mdbx_ipclock_t mti_wlock; -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > 0 */ volatile txnid_t mti_oldest_reader; @@ -668,7 +680,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 mdbx_ipclock_t mti_rlock; -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. 
* This always records the maximum count, it is not decremented @@ -1020,9 +1032,16 @@ struct MDBX_env { MDBX_txn *me_txn0; /* prealloc'd write transaction */ /* write-txn lock */ +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + union { + key_t key; + int semid; + } me_sysv_ipc; +#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + #if MDBX_LOCKING > 0 mdbx_ipclock_t *me_wlock; -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > 0 */ MDBX_dbx *me_dbxs; /* array of static DB info */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ @@ -1050,7 +1069,7 @@ struct MDBX_env { struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > 0 */ txnid_t oldest; uint64_t sync_timestamp; uint64_t autosync_period; diff --git a/src/elements/lck-posix.c b/src/elements/lck-posix.c index 47356ddf..ce2c0c3f 100644 --- a/src/elements/lck-posix.c +++ b/src/elements/lck-posix.c @@ -13,6 +13,7 @@ */ #include "internals.h" +#include /*----------------------------------------------------------------------------*/ /* global constructor/destructor */ @@ -195,7 +196,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { /*---------------------------------------------------------------------------*/ -#if MDBX_LOCKING > 0 +#if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_init(ipc, false, 1) ? 
errno : 0; @@ -217,7 +218,7 @@ MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { #error "FIXME" #endif } -#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); @@ -365,7 +366,10 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, OFF_T_MAX) == 0) { mdbx_verbose("%s: got exclusive, drown locks", __func__); -#if MDBX_LOCKING > 0 +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + if (env->me_sysv_ipc.semid != -1) + rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; +#else rc = mdbx_ipclock_destroy(&env->me_lck->mti_rlock); if (rc == 0) rc = mdbx_ipclock_destroy(&env->me_lck->mti_wlock); @@ -431,8 +435,58 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, return MDBX_SUCCESS /* currently don't need any initialization if LCK already opened/used inside current process */ ; +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + int semid = -1; + if (global_uniqueness_flag) { + struct stat st; + if (fstat(env->me_fd, &st)) + return errno; + sysv_retry_create: + semid = semget(env->me_sysv_ipc.key, 2, + IPC_CREAT | IPC_EXCL | + (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))); + if (unlikely(semid == -1)) { + int err = errno; + if (err != EEXIST) + return err; -#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + /* remove and re-create semaphore set */ + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) { + err = errno; + if (err != ENOENT) + return err; + goto sysv_retry_create; + } + if (semctl(semid, 2, IPC_RMID)) { + err = errno; + if (err != EIDRM) + return err; + } + goto sysv_retry_create; + } + + unsigned short val_array[2] = {1, 1}; + if (semctl(semid, 2, SETALL, val_array)) + return errno; + } else { + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) + return errno; + + /* check read & write access */ + struct semid_ds data[2]; + if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, 
data)) + return errno; + } + + env->me_sysv_ipc.semid = semid; + + return MDBX_SUCCESS; + +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX +#warning "TODO" +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 /* don't initialize semaphores twice */ if (global_uniqueness_flag == MDBX_RESULT_TRUE) { @@ -519,7 +573,7 @@ bailout: static int __cold mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, const int err) { int rc = err; -#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV if (err == EOWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. */ @@ -533,12 +587,15 @@ static int __cold mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, rc = MDBX_PANIC; } } - mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + mdbx_notice("%clock owner died, %s", (rlocked ? 'r' : 'w'), (rc ? "this process' env is hosed" : "recovering")); int check_rc = mdbx_reader_check0(env, rlocked, NULL); check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; +#else #if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent) int mreco_rc = pthread_mutex_consistent(ipc); #elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np) @@ -551,17 +608,21 @@ static int __cold mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; if (unlikely(mreco_rc)) - mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); + mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc)); rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; if (MDBX_IS_ERROR(rc)) pthread_mutex_unlock(ipc); +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ return rc; } #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 (void)ipc; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX +#warning "TODO" + (void)ipc; #else #error "FIXME" #endif /* MDBX_LOCKING */ @@ -588,6 +649,19 @@ static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, } } else if (sem_wait(ipc)) rc = errno; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + struct sembuf op = {.sem_num = (ipc != env->me_wlock), + .sem_op = -1, + .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO}; + int rc; + if (semop(env->me_sysv_ipc.semid, &op, 1)) { + rc = errno; + if (dont_wait && rc == EAGAIN) + rc = MDBX_BUSY; + } else { + rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS; + *ipc = env->me_pid; + } #else #error "FIXME" #endif /* MDBX_LOCKING */ @@ -597,12 +671,21 @@ static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, return rc; } -static int mdbx_ipclock_unlock(mdbx_ipclock_t *ipc) { +static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 int rc = pthread_mutex_unlock(ipc); + (void)env; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 int rc = sem_post(ipc) ? errno : MDBX_SUCCESS; + (void)env; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + if (unlikely(*ipc != (pid_t)env->me_pid)) + return EPERM; + *ipc = 0; + struct sembuf op = { + .sem_num = (ipc != env->me_wlock), .sem_op = 1, .sem_flg = SEM_UNDO}; + int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? 
errno : MDBX_SUCCESS; #else #error "FIXME" #endif /* MDBX_LOCKING */ @@ -619,7 +702,7 @@ MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { mdbx_trace("%s", ">>"); - int rc = mdbx_ipclock_unlock(&env->me_lck->mti_rlock); + int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); mdbx_trace("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: errcode %d\n", __func__, rc); @@ -636,7 +719,7 @@ int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { void mdbx_txn_unlock(MDBX_env *env) { mdbx_trace("%s", ">>"); - int rc = mdbx_ipclock_unlock(env->me_wlock); + int rc = mdbx_ipclock_unlock(env, env->me_wlock); mdbx_trace("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: errcode %d\n", __func__, rc); diff --git a/src/elements/osal.h b/src/elements/osal.h index 8ec1ccf1..d49e4aa0 100644 --- a/src/elements/osal.h +++ b/src/elements/osal.h @@ -125,6 +125,9 @@ #if defined(__sun) || defined(__SVR4) || defined(__svr4__) #include +/* On Solaris, it's easier to add a missing prototype rather than find a + * combination of #defines that break nothing. */ +__extern_C key_t ftok(const char *, int); #endif /* SunOS/Solaris */ #if defined(_WIN32) || defined(_WIN64) @@ -194,6 +197,7 @@ static inline void *mdbx_realloc(void *ptr, size_t bytes) { #include #include #include +#include #include #include #include