mdbx: rework API and Docs around Handle-Slow-Readers (no algorithmic changes).

Change-Id: I5b76a8400ce6f5f241f8e4a7f53d746fe39f8e1e
This commit is contained in:
Leonid Yuriev 2020-09-29 19:24:57 +03:00
parent 6294e1710a
commit c8a0951566
12 changed files with 124 additions and 90 deletions

View File

@ -601,6 +601,7 @@ hpp
hppa
hpux
hrows
hsr
htags
htm
html
@ -795,6 +796,7 @@ LOCKNAME
locktable
LOGFILE
loglevel
longlived
LONGLONG
lowerbound
lowerboundvalue
@ -1090,7 +1092,6 @@ onstack
onstask
oom
oomfunc
oomkick
openfile
openldap
openmp

View File

@ -30,11 +30,14 @@ Added features:
- Improved opening large DB (> 4Gb) from 32-bit code.
- Provided `pure-function` and `const-function` attributes to C API.
- Support for user-settable transaction context.
- Revised API and documentation related to Handle-Slow-Readers callback feature.
Deprecated functions and flags:
- For clarity and API simplification the `MDBX_MAPASYNC` flag is deprecated.
Just use `MDBX_SAFE_NOSYNC` or `MDBX_UTTERLY_NOSYNC` instead of it.
- `MDBX_oom_func`, `mdbx_env_set_oomfunc()` and `mdbx_env_get_oomfunc()`
replaced with `MDBX_hsr_func`, `mdbx_env_set_hsr()` and `mdbx_env_get_hsr()`.
Fixes:

View File

@ -241,7 +241,7 @@ Since version 0.9.1, the utility supports checking the database using any of the
10. Sequence generation and three persistent 64-bit markers.
11. Callback for lack-of-space condition of database that allows you to control and/or resolve such situations.
11. Handle-Slow-Readers callback to resolve database full/overflow issues due to long-lived read transaction(s).
12. Support for opening databases in the exclusive mode, including on a network share.

View File

@ -147,9 +147,9 @@ or debugging of a client application while retaining an active read
transaction. LMDB this results in `MDB_MAP_FULL` error and subsequent write
performance degradation.
MDBX mostly solve "long-lived" readers issue by the lack-of-space callback
which allow to aborts long readers, and by the `MDBX_LIFORECLAIM` mode which
addresses subsequent performance degradation.
MDBX mostly solves the "long-lived" readers issue by using the Handle-Slow-Readers
\ref MDBX_hsr_func callback, which allows aborting long-lived read transactions,
and by using the \ref MDBX_LIFORECLAIM mode, which addresses subsequent performance degradation.
The "next" version of libmdbx (MithrilDB) will completely solve this.
- Avoid suspending a process with active transactions. These would then be

View File

@ -236,6 +236,7 @@ The full \ref c_api documentation lists further details below, like how to:
- Estimate size of range query result: \ref c_rqest.
- Double performance by LIFO reclaiming on storages with write-back: \ref MDBX_LIFORECLAIM.
- Use sequences and canary markers: \ref mdbx_dbi_sequence(), \ref MDBX_canary.
- Use lack-of-space callback (aka OOM-KICK): \ref mdbx_env_set_oomfunc().
- Use Handle-Slow-Readers callback to resolve database full/overflow issues
due to long-lived read transactions: \ref mdbx_env_set_hsr().
- Use exclusive mode: \ref MDBX_EXCLUSIVE.
- Define custom sort orders (but this is recommended to be avoided).

71
mdbx.h
View File

@ -2747,9 +2747,9 @@ struct MDBX_txn_info {
uint64_t txn_space_retired;
/** For READ-ONLY transaction: the space available for writer(s) and that
must be exhausted for reason to call the OOM-killer for this read
transaction. For WRITE transaction: the space inside transaction that left
to `MDBX_TXN_FULL` error. */
must be exhausted for reason to call the Handle-Slow-Readers callback for
this read transaction. For WRITE transaction: the space inside transaction
that left to `MDBX_TXN_FULL` error. */
uint64_t txn_space_leftover;
/** For READ-ONLY transaction (provided if `scan_rlt=true`): The space that
@ -4196,14 +4196,25 @@ LIBMDBX_API int mdbx_thread_register(const MDBX_env *env);
* \ref MDBX_RESULT_TRUE if thread is not registered or already unregistered. */
LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env);
/** \brief A lack-of-space callback function to resolve issues with a laggard
* readers. \ingroup c_err
/** \brief A Handle-Slow-Readers callback function to resolve database
* full/overflow issue due to a reader(s) which prevents the old data from being
* recycled.
* \ingroup c_err
*
* Read transactions prevent reuse of pages freed by newer write transactions,
* thus the database can grow quickly. This callback will be called when there
* is not enough space in the database (ie. before increasing the database size
* is not enough space in the database (i.e. before increasing the database size
* or before \ref MDBX_MAP_FULL error) and thus can be used to resolve issues
* with a "long-lived" read transactions.
* \see long-lived-read
*
* Using this callback you can choose how to resolve the situation:
* - abort the write transaction with an error;
* - wait for the read transaction(s) to complete;
* - notify a thread performing a long-lived read transaction
* and wait for an effect;
* - kill the thread or whole process that performs the long-lived read
* transaction;
*
* Depending on the arguments and needs, your implementation may wait,
* terminate a process or thread that is performing a long read, or perform
@ -4211,9 +4222,11 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env);
* corresponds to the performed action.
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] txn The current write transaction which is internally at
* the \ref MDBX_MAP_FULL condition.
* \param [in] pid A pid of the reader process.
* \param [in] tid A thread_id of the reader thread.
* \param [in] txn A transaction number on which stalled.
* \param [in] laggard An oldest read transaction number on which stalled.
* \param [in] gap A lag from the last committed txn.
* \param [in] space The space that actually becomes available for reuse after
* this reader has finished. The callback function can take
@ -4221,9 +4234,9 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env);
* a long-running transaction has.
* \param [in] retry A retry number starting from 0.
* If callback has returned 0 at least once, then at end
* of current OOM-handler loop callback will be called
* additionally with negative value to notify about the
* end of loop. The callback function can use this value
* of current handling loop the callback function will be
* called additionally with negative value to notify about
* the end of loop. The callback function can use this value
* to implement timeout logic while waiting for readers.
*
* \returns The RETURN CODE determines the further actions libmdbx and must
@ -4252,36 +4265,42 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env);
* \retval 2 or greater The reader process was terminated or killed,
* and libmdbx should entirely reset reader registration.
*
* \see mdbx_env_set_oomfunc() \see mdbx_env_get_oomfunc()
* \see mdbx_env_set_hsr() \see mdbx_env_get_hsr()
*/
typedef int(MDBX_oom_func)(MDBX_env *env, mdbx_pid_t pid, mdbx_tid_t tid,
uint64_t txn, unsigned gap, size_t space,
typedef int(MDBX_hsr_func)(const MDBX_env *env, const MDBX_txn *txn,
mdbx_pid_t pid, mdbx_tid_t tid, uint64_t laggard,
unsigned gap, size_t space,
int retry) MDBX_CXX17_NOEXCEPT;
/** \brief Set the OOM callback.
/** \brief Sets a Handle-Slow-Readers callback to resolve database full/overflow
* issue due to a reader(s) which prevents the old data from being recycled.
* \ingroup c_err
*
* The callback will only be triggered on lack of space to resolve issues with
* lagging reader(s) (i.e. to kill it) for resume reuse pages from the garbage
* collector.
* \see mdbx_env_get_oomfunc()
* The callback will only be triggered when the database is full due to a
* reader(s) which prevents the old data from being recycled.
*
* \see mdbx_env_get_hsr()
* \see long-lived-read
*
* \param [in] env An environment handle returned
* by \ref mdbx_env_create().
* \param [in] oom_func A \ref MDBX_oom_func function or NULL to disable.
* \param [in] hsr_callback A \ref MDBX_hsr_func function
* or NULL to disable.
*
* \returns A non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oom_func);
LIBMDBX_API int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr_callback);
/** \brief Get the current oom_func callback.
* \ingroup c_settings
* \see mdbx_env_set_oomfunc()
/** \brief Gets current Handle-Slow-Readers callback used to resolve database
* full/overflow issue due to a reader(s) which prevents the old data from being
* recycled.
* \see mdbx_env_set_hsr()
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
*
* \returns A MDBX_oom_func function or NULL if disabled. */
MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_oom_func *
mdbx_env_get_oomfunc(const MDBX_env *env);
* \returns A MDBX_hsr_func function or NULL if disabled
* or something went wrong. */
MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_hsr_func *
mdbx_env_get_hsr(const MDBX_env *env);
/** \defgroup btree_traversal B-tree Traversal
* This is internal API for mdbx_chk tool. You should avoid to use it, except

View File

@ -2100,14 +2100,16 @@ public:
/// return number of cleared slots.
inline unsigned check_readers();
/// \brief Sets the out-of-space callback.
/// \brief Sets a Handle-Slow-Readers callback to resolve database
/// full/overflow issue due to a reader(s) which prevents the old data from
/// being recycled.
///
/// Such callback will be triggered in a case where there is not enough free
/// space in the database due to long read transaction(s) which impedes
/// reusing the pages of an old MVCC snapshot(s).
///
/// Using this callback you can choose how to get out of the situation:
/// - abort the record transaction with an error;
/// Using this callback you can choose how to resolve the situation:
/// - abort the write transaction with an error;
/// - wait for the read transaction(s) to complete;
/// - notify a thread performing a long-lived read transaction
/// and wait for an effect;
@ -2115,10 +2117,13 @@ public:
/// transaction;
///
/// \see long-lived-read
inline env &set_OutOfSpace_callback(MDBX_oom_func *);
/// \brief Returns the current out-of-space callback.
/// \see set_OutOfSpace_callback()
inline MDBX_oom_func *get_OutOfSpace_callback() const noexcept;
inline env &set_HandleSlowReaders(MDBX_hsr_func *);
/// \brief Returns the current Handle-Slow-Readers callback used to resolve
/// database full/overflow issue due to a reader(s) which prevents the old
/// data from being recycled.
/// \see set_HandleSlowReaders()
inline MDBX_hsr_func *get_HandleSlowReaders() const noexcept;
/// \brief Starts read (read-only) transaction.
inline txn_managed start_read() const;
@ -3499,13 +3504,13 @@ inline unsigned env::check_readers() {
return static_cast<unsigned>(dead_count);
}
inline env &env::set_OutOfSpace_callback(MDBX_oom_func *cb) {
error::success_or_throw(::mdbx_env_set_oomfunc(handle_, cb));
inline env &env::set_HandleSlowReaders(MDBX_hsr_func *cb) {
error::success_or_throw(::mdbx_env_set_hsr(handle_, cb));
return *this;
}
inline MDBX_oom_func *env::get_OutOfSpace_callback() const noexcept {
return ::mdbx_env_get_oomfunc(handle_);
inline MDBX_hsr_func *env::get_HandleSlowReaders() const noexcept {
return ::mdbx_env_get_hsr(handle_);
}
inline txn_managed env::start_read() const {

View File

@ -900,7 +900,7 @@ static __always_inline void safe64_reset(mdbx_safe64_t *ptr,
static __always_inline bool safe64_reset_compare(mdbx_safe64_t *ptr,
txnid_t compare) {
mdbx_compiler_barrier();
/* LY: This function is used to reset `mr_txnid` from OOM-kick in case
/* LY: This function is used to reset `mr_txnid` from hsr-handler in case
* the asynchronously cancellation of read transaction. Therefore,
* there may be a collision between the cleanup performed here and
* asynchronous termination and restarting of the read transaction
@ -3074,7 +3074,8 @@ static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp);
static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp);
static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
MDBX_page **const mp, int flags);
static txnid_t mdbx_oomkick(MDBX_env *env, const txnid_t laggard);
static txnid_t mdbx_kick_longlived_readers(MDBX_env *env,
const txnid_t laggard);
static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num,
MDBX_page **mp);
@ -5071,7 +5072,7 @@ skip_cache:
txnid_t oldest = 0, last = 0;
const unsigned wanna_range = num - 1;
while (true) { /* oom-kick retry loop */
while (true) { /* hsr-kick retry loop */
/* If our dirty list is already full, we can't do anything */
if (unlikely(txn->tw.dirtyroom == 0)) {
rc = MDBX_TXN_FULL;
@ -5374,7 +5375,7 @@ skip_cache:
/* it is reasonable check/kick lagging reader(s) here,
* since we made a new steady point or wipe the last. */
if (oldest < txn->mt_txnid - MDBX_TXNID_STEP &&
mdbx_oomkick(env, oldest) > oldest)
mdbx_kick_longlived_readers(env, oldest) > oldest)
continue;
} else if (unlikely(rc != MDBX_RESULT_TRUE))
goto fail;
@ -5386,7 +5387,7 @@ skip_cache:
if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno)
goto done;
if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - MDBX_TXNID_STEP &&
mdbx_oomkick(env, oldest) > oldest)
mdbx_kick_longlived_readers(env, oldest) > oldest)
continue;
rc = MDBX_NOTFOUND;
@ -6005,7 +6006,7 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) {
if (likely(slot < env->me_maxreaders))
break;
result.err = mdbx_reader_check0(env, true, NULL);
result.err = mdbx_cleanup_dead_readers(env, true, NULL);
if (result.err != MDBX_RESULT_TRUE) {
mdbx_rdt_unlock(env);
result.err =
@ -10613,7 +10614,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
if (rc != MDBX_SUCCESS)
goto bailout;
} else {
rc = mdbx_reader_check0(env, false, NULL);
rc = mdbx_cleanup_dead_readers(env, false, NULL);
if (MDBX_IS_ERROR(rc))
goto bailout;
}
@ -17612,14 +17613,15 @@ static bool __cold mdbx_pid_insert(uint32_t *ids, uint32_t pid) {
int __cold mdbx_reader_check(MDBX_env *env, int *dead) {
if (dead)
*dead = 0;
return mdbx_reader_check0(env, false, dead);
return mdbx_cleanup_dead_readers(env, false, dead);
}
/* Return:
* MDBX_RESULT_TRUE - done and mutex recovered
* MDBX_SUCCESS - done
* Otherwise errcode. */
MDBX_INTERNAL_FUNC int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked,
MDBX_INTERNAL_FUNC int __cold mdbx_cleanup_dead_readers(MDBX_env *env,
int rdt_locked,
int *dead) {
int rc = check_env(env);
if (unlikely(rc != MDBX_SUCCESS))
@ -17737,8 +17739,9 @@ int __cold mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) {
return rc;
}
static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
mdbx_debug("%s", "DB size maxed out");
static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env,
const txnid_t laggard) {
mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard);
int retry;
for (retry = 0; retry < INT_MAX; ++retry) {
@ -17746,10 +17749,10 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
mdbx_assert(env, oldest < env->me_txn0->mt_txnid);
mdbx_assert(env, oldest >= laggard);
mdbx_assert(env, oldest >= *env->me_oldest);
if (oldest == laggard || unlikely(env->me_lck == NULL /* exclusive mode */))
if (oldest == laggard || unlikely(!env->me_lck /* without-LCK mode */))
return oldest;
if (MDBX_IS_ERROR(mdbx_reader_check0(env, false, NULL)))
if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL)))
break;
MDBX_reader *asleep = nullptr;
@ -17778,20 +17781,20 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
}
if (laggard < oldest || !asleep) {
if (retry && env->me_oom_func) {
/* LY: notify end of oom-loop */
if (retry && env->me_hsr_callback) {
/* LY: notify end of hsr-loop */
const txnid_t gap = oldest - laggard;
env->me_oom_func(env, 0, 0, laggard,
env->me_hsr_callback(env, env->me_txn, 0, 0, laggard,
(gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, 0,
-retry);
}
mdbx_notice("oom-kick: update oldest %" PRIaTXN " -> %" PRIaTXN,
mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN,
*env->me_oldest, oldest);
mdbx_assert(env, *env->me_oldest <= oldest);
return *env->me_oldest = oldest;
}
if (!env->me_oom_func)
if (!env->me_hsr_callback)
break;
uint32_t pid = asleep->mr_pid;
@ -17807,9 +17810,9 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
(oldest_retired > head_retired)
? pgno2bytes(env, (pgno_t)(oldest_retired - head_retired))
: 0;
int rc = env->me_oom_func(env, pid, (mdbx_tid_t)tid, laggard,
(gap < UINT_MAX) ? (unsigned)gap : UINT_MAX,
space, retry);
int rc = env->me_hsr_callback(
env, env->me_txn, pid, (mdbx_tid_t)tid, laggard,
(gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
if (rc < 0)
break;
@ -17827,9 +17830,9 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
}
}
if (retry && env->me_oom_func) {
/* LY: notify end of oom-loop */
env->me_oom_func(env, 0, 0, laggard, 0, 0, -retry);
if (retry && env->me_hsr_callback) {
/* LY: notify end of hsr-loop */
env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry);
}
return mdbx_find_oldest(env->me_txn);
}
@ -17874,18 +17877,18 @@ int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) {
return MDBX_SUCCESS;
}
int __cold mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oomfunc) {
int __cold mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
int rc = check_env(env);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
env->me_oom_func = oomfunc;
env->me_hsr_callback = hsr;
return MDBX_SUCCESS;
}
MDBX_oom_func *__cold mdbx_env_get_oomfunc(const MDBX_env *env) {
MDBX_hsr_func *__cold mdbx_env_get_hsr(const MDBX_env *env) {
return likely(env && env->me_signature == MDBX_ME_SIGNATURE)
? env->me_oom_func
? env->me_hsr_callback
: NULL;
}

View File

@ -989,7 +989,7 @@ struct MDBX_env {
volatile pgno_t *me_autosync_threshold;
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
@ -1203,7 +1203,7 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) {
/*----------------------------------------------------------------------------*/
/* Internal prototypes */
MDBX_INTERNAL_FUNC int mdbx_reader_check0(MDBX_env *env, int rlocked,
MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked,
int *dead);
MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key,
MDBX_reader *begin, MDBX_reader *end);

View File

@ -702,7 +702,7 @@ static int __cold mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc,
mdbx_warning("%clock owner died, %s", (rlocked ? 'r' : 'w'),
(rc ? "this process' env is hosed" : "recovering"));
int check_rc = mdbx_reader_check0(env, rlocked, NULL);
int check_rc = mdbx_cleanup_dead_readers(env, rlocked, NULL);
check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
#if MDBX_LOCKING == MDBX_LOCKING_SYSV

View File

@ -78,16 +78,17 @@ const char *keygencase2str(const keygen_case keycase) {
//-----------------------------------------------------------------------------
int testcase::oom_callback(MDBX_env *env, mdbx_pid_t pid, mdbx_tid_t tid,
uint64_t txn, unsigned gap, size_t space,
int testcase::hsr_callback(const MDBX_env *env, const MDBX_txn *txn,
mdbx_pid_t pid, mdbx_tid_t tid, uint64_t laggard,
unsigned gap, size_t space,
int retry) MDBX_CXX17_NOEXCEPT {
(void)txn;
testcase *self = (testcase *)mdbx_env_get_userctx(env);
if (retry == 0)
log_notice("oom_callback: waitfor pid %lu, thread %" PRIuPTR
log_notice("hsr_callback: waitfor pid %lu, thread %" PRIuPTR
", txn #%" PRIu64 ", gap %d, scape %zu",
(long)pid, (size_t)tid, txn, gap, space);
(long)pid, (size_t)tid, laggard, gap, space);
if (self->should_continue(true)) {
osal_yield();
@ -123,9 +124,9 @@ void testcase::db_prepare() {
if (unlikely(rc != MDBX_SUCCESS))
failure_perror("mdbx_env_set_maxdbs()", rc);
rc = mdbx_env_set_oomfunc(env, testcase::oom_callback);
rc = mdbx_env_set_hsr(env, testcase::hsr_callback);
if (unlikely(rc != MDBX_SUCCESS))
failure_perror("mdbx_env_set_oomfunc()", rc);
failure_perror("mdbx_env_set_hsr()", rc);
rc = mdbx_env_set_geometry(
env, config.params.size_lower, config.params.size_now,

View File

@ -166,8 +166,9 @@ protected:
const keygen::buffer &old_value, MDBX_put_flags_t flags);
int remove(const keygen::buffer &akey, const keygen::buffer &adata);
static int oom_callback(MDBX_env *env, mdbx_pid_t pid, mdbx_tid_t tid,
uint64_t txn, unsigned gap, size_t space,
static int hsr_callback(const MDBX_env *env, const MDBX_txn *txn,
mdbx_pid_t pid, mdbx_tid_t tid, uint64_t laggard,
unsigned gap, size_t space,
int retry) MDBX_CXX17_NOEXCEPT;
MDBX_env_flags_t actual_env_mode{MDBX_ENV_DEFAULTS};