From 3e0fad1cf61aa9038fab72e66ec9252b3f7df9ab Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 17 Apr 2021 00:13:51 +0300 Subject: [PATCH] mdbx: rework (NO)READAHEAD handling. Resolves https://github.com/erthink/libmdbx/issues/164 --- NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel, because MADV_WILLNEED with offset != 0 may cause SIGBUS on following access to the hinted region. 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 Change-Id: I11ebbf2bd35e3dba9d078be16cb5678aecf8329c --- .github/actions/spelling/expect.txt | 1 + src/core.c | 188 ++++++++++++++++++---------- src/internals.h | 7 +- 3 files changed, 130 insertions(+), 66 deletions(-) diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 81b41bb5..7d958eda 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -1896,6 +1896,7 @@ xkeep XLB xmerge xml +xnu XOPEN xp XSI diff --git a/src/core.c b/src/core.c index 14c45d25..f2845007 100644 --- a/src/core.c +++ b/src/core.c @@ -5434,69 +5434,130 @@ static __always_inline __maybe_unused int ignore_enosys(int err) { #if MDBX_ENABLE_MADVISE /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ -static __cold int mdbx_set_readahead(MDBX_env *env, const size_t offset, - const size_t length, const bool enable) { - assert(length > 0); +static __cold int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, + const bool enable, + const bool force_whole) { + mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO); + mdbx_assert(env, (enable & 1) == (enable != 0)); + const bool toggle = force_whole || + ((enable ^ *env->me_readahead_anchor) & 1) || + !*env->me_readahead_anchor; + const pgno_t prev_edge = *env->me_readahead_anchor >> 1; + const size_t limit = env->me_dxb_mmap.limit; + size_t offset = + toggle ? 0 + : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge); + offset = (offset < limit) ? offset : limit; + + size_t length = + pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge); + length = (length < limit) ? length : limit; + length -= offset; + + mdbx_assert(env, 0 <= (intptr_t)length); + if (length == 0) + return MDBX_SUCCESS; + + int err; mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset), bytes2pgno(env, offset + length)); #if defined(F_RDAHEAD) - if (unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) - return errno; + if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) { + err = errno; + goto bailout; + } #endif /* F_RDAHEAD */ if (enable) { -#if defined(F_RDADVISE) - struct radvisory hint; - hint.ra_offset = offset; - hint.ra_count = length; - (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( - env->me_lazy_fd, F_RDADVISE, &hint); -#endif /* F_RDADVISE */ -#if defined(MADV_WILLNEED) - int err = madvise(env->me_map + offset, length, MADV_WILLNEED) - ? ignore_enosys(errno) - : MDBX_SUCCESS; +#if defined(MADV_NORMAL) + err = madvise(env->me_map + offset, length, MADV_NORMAL) + ? ignore_enosys(errno) + : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_MADV_WILLNEED) - int err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + goto bailout; +#elif defined(POSIX_MADV_NORMAL) + err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL)); if (unlikely(MDBX_IS_ERROR(err))) - return err; + goto bailout; +#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) + err = ignore_enosys( + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL)); + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; #elif defined(_WIN32) || defined(_WIN64) - if (mdbx_PrefetchVirtualMemory) { - WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map + offset; - hint.NumberOfBytes = length; - (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); - } + /* no madvise on Windows */ +#else +#warning "FIXME" +#endif + if (toggle) { + /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel, + * because MADV_WILLNEED with offset != 0 may cause SIGBUS + * on following access to the hinted region. + * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; + * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */ +#if defined(F_RDADVISE) + struct radvisory hint; + hint.ra_offset = offset; + hint.ra_count = length; + (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( + env->me_lazy_fd, F_RDADVISE, &hint); +#elif defined(MADV_WILLNEED) + err = madvise(env->me_map + offset, length, MADV_WILLNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#elif defined(POSIX_MADV_WILLNEED) + err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#elif defined(_WIN32) || defined(_WIN64) + if (mdbx_PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->me_map + offset; + hint.NumberOfBytes = length; + (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); + } #elif defined(POSIX_FADV_WILLNEED) - int err = ignore_enosys( - posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#endif /* MADV_WILLNEED */ + err = ignore_enosys( + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#else +#warning "FIXME" +#endif + } } else { #if defined(MADV_RANDOM) - int err = madvise(env->me_map + offset, length, MADV_RANDOM) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = madvise(env->me_map + offset, length, MADV_RANDOM) + ? ignore_enosys(errno) + : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) - return err; + goto bailout; #elif defined(POSIX_MADV_RANDOM) - int err = ignore_enosys( + err = ignore_enosys( posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); if (unlikely(MDBX_IS_ERROR(err))) - return err; + goto bailout; #elif defined(POSIX_FADV_RANDOM) - int err = ignore_enosys( + err = ignore_enosys( posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM)); if (unlikely(MDBX_IS_ERROR(err))) - return err; + goto bailout; +#elif defined(_WIN32) || defined(_WIN64) + /* no madvise on Windows */ +#else +#warning "FIXME" #endif /* MADV_RANDOM */ } - return MDBX_SUCCESS; + + *env->me_readahead_anchor = (enable & 1) + (edge << 1); + err = MDBX_SUCCESS; +bailout: + return err; } #endif /* MDBX_ENABLE_MADVISE */ @@ -5636,28 +5697,18 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, mapping_can_be_moved); #if MDBX_ENABLE_MADVISE - if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) { - const int readahead = + if (rc == MDBX_SUCCESS) { + env->me_discarded_tail->weak = size_pgno; + const bool readahead = + !(env->me_flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); - if (readahead == MDBX_RESULT_FALSE) - rc = mdbx_set_readahead( - env, 0, (size_bytes > prev_size) ? size_bytes : prev_size, false); - else if (readahead == MDBX_RESULT_TRUE) { - const size_t readahead_pivot = - (limit_bytes != prev_limit || env->me_dxb_mmap.address != prev_addr + const bool force = limit_bytes != prev_limit || + env->me_dxb_mmap.address != prev_addr #if defined(_WIN32) || defined(_WIN64) - || prev_size > size_bytes + || prev_size > size_bytes #endif /* Windows */ - ) - ? 0 /* reassign readahead to the entire map - because it was remapped */ - : prev_size; - if (size_bytes > readahead_pivot) { - env->me_discarded_tail->weak = size_pgno; - rc = mdbx_set_readahead(env, readahead_pivot, - size_bytes - readahead_pivot, true); - } - } + ; + rc = mdbx_set_readahead(env, size_pgno, readahead, force); } #endif /* MDBX_ENABLE_MADVISE */ @@ -11141,7 +11192,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { #if MDBX_ENABLE_MADVISE /* calculate readahead hint before mmap with zero redundant pages */ const bool readahead = - (env->me_flags & MDBX_NORDAHEAD) == 0 && + !(env->me_flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; #endif /* MDBX_ENABLE_MADVISE */ @@ -11375,9 +11426,9 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { atomic_store32(env->me_discarded_tail, bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); #if MDBX_ENABLE_MADVISE - if (used_aligned2os_bytes < env->me_dxb_mmap.current) { + if (lck_rc && used_aligned2os_bytes < env->me_dxb_mmap.current) { #if defined(MADV_REMOVE) - if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && + if ((env->me_flags & MDBX_WRITEMAP) != 0 && /* not recovery mode */ env->me_stuck_meta < 0) { mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", env->me_discarded_tail->weak, @@ -11416,8 +11467,8 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { #endif /* MADV_DONTNEED */ } - err = mdbx_set_readahead(env, 0, used_bytes, readahead); - if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) + err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); + if (unlikely(err != MDBX_SUCCESS)) return err; #endif /* MDBX_ENABLE_MADVISE */ @@ -11470,6 +11521,7 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending; env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold; env->me_discarded_tail = &env->me_lckless_stub.discarded_tail; + env->me_readahead_anchor = &env->me_lckless_stub.readahead_anchor; env->me_meta_sync_txnid = &env->me_lckless_stub.meta_sync_txnid; env->me_maxreaders = UINT_MAX; #if MDBX_LOCKING > 0 @@ -11569,6 +11621,10 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) goto bailout; +#elif defined(POSIX_MADV_WILLNEED) + err = ignore_enosys(posix_madvise(env->me_lck, size, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; #endif /* MADV_WILLNEED */ #endif /* MDBX_ENABLE_MADVISE */ @@ -11623,6 +11679,7 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_unsynced_pages = &lck->mti_unsynced_pages; env->me_autosync_threshold = &lck->mti_autosync_threshold; env->me_discarded_tail = &lck->mti_discarded_tail; + env->me_readahead_anchor = &lck->mti_readahead_anchor; env->me_meta_sync_txnid = &lck->mti_meta_sync_txnid; #if MDBX_LOCKING > 0 env->me_wlock = &lck->mti_wlock; @@ -12216,6 +12273,7 @@ static __cold int mdbx_env_close0(MDBX_env *env) { env->me_unsynced_pages = nullptr; env->me_autosync_threshold = nullptr; env->me_discarded_tail = nullptr; + env->me_readahead_anchor = nullptr; env->me_meta_sync_txnid = nullptr; if (env->me_flags & MDBX_ENV_TXKEY) mdbx_rthc_remove(env->me_txkey); diff --git a/src/internals.h b/src/internals.h index 6c27a665..9439a4f6 100644 --- a/src/internals.h +++ b/src/internals.h @@ -536,7 +536,7 @@ typedef struct MDBX_lockinfo { * Zero means timed auto-sync is disabled. */ MDBX_atomic_uint64_t mti_autosync_period; - /* Marker to distinguish uniqueness of DB/CLK.*/ + /* Marker to distinguish uniqueness of DB/CLK. */ MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -562,6 +562,9 @@ typedef struct MDBX_lockinfo { /* Timestamp of the last readers check. */ MDBX_atomic_uint64_t mti_reader_check_timestamp; + /* Shared anchor for tracking readahead edge and enabled/disabled status. */ + pgno_t mti_readahead_anchor; + alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ /* Readeaders registration lock. */ @@ -975,6 +978,7 @@ struct MDBX_env { atomic_pgno_t *me_unsynced_pages; atomic_pgno_t *me_autosync_threshold; atomic_pgno_t *me_discarded_tail; + pgno_t *me_readahead_anchor; MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; @@ -998,6 +1002,7 @@ struct MDBX_env { atomic_pgno_t autosync_pending; atomic_pgno_t autosync_threshold; atomic_pgno_t discarded_tail; + pgno_t readahead_anchor; MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG