mirror of
https://gitflic.ru/project/erthink/libmdbx.git
synced 2025-01-27 18:39:24 +00:00
mdbx: поддержка асинхронного ввода-вывода для Windows и подготовка к io_ring
(объединённые коммиты и исправления).
This commit is contained in:
parent
9f64e2a10c
commit
474391c83c
2
TODO.md
2
TODO.md
@ -11,7 +11,6 @@ For the same reason ~~Github~~ is blacklisted forever.
|
||||
|
||||
So currently most of the links are broken due to noted malicious ~~Github~~ sabotage.
|
||||
|
||||
- [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224).
|
||||
- [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/204).
|
||||
- [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210).
|
||||
- [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200).
|
||||
@ -27,3 +26,4 @@ Done
|
||||
----
|
||||
|
||||
- [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223).
|
||||
- [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224).
|
||||
|
6
mdbx.h
6
mdbx.h
@ -2522,9 +2522,13 @@ struct MDBX_envinfo {
|
||||
uint64_t unspill; /**< Quantity of unspilled/reloaded pages */
|
||||
uint64_t wops; /**< Number of explicit write operations (not a pages)
|
||||
to a disk */
|
||||
uint64_t
|
||||
msync; /**< Number of explicit msync-to-disk operations (not a pages) */
|
||||
uint64_t
|
||||
fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */
|
||||
uint64_t
|
||||
gcrtime_seconds16dot16; /**< Time spent loading and searching inside
|
||||
GC (aka FreeDB) in 1/65536 of second. */
|
||||
GC (aka FreeDB) in 1/65536 of second */
|
||||
} mi_pgop_stat;
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
|
@ -63,7 +63,7 @@
|
||||
#define SSIZE_MAX INTPTR_MAX
|
||||
#endif
|
||||
|
||||
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
|
||||
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
|
||||
#define MDBX_WORDBITS 64
|
||||
#else
|
||||
#define MDBX_WORDBITS 32
|
||||
|
628
src/core.c
628
src/core.c
@ -3983,13 +3983,12 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno,
|
||||
while (--npages) {
|
||||
iov[n] = iov[0];
|
||||
if (++n == MDBX_COMMIT_PAGES) {
|
||||
osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off,
|
||||
pgno2bytes(env, MDBX_COMMIT_PAGES));
|
||||
osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off);
|
||||
iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES);
|
||||
n = 0;
|
||||
}
|
||||
}
|
||||
osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n));
|
||||
osal_pwritev(env->me_lazy_fd, iov, n, iov_off);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4318,139 +4317,155 @@ static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) {
|
||||
return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags);
|
||||
}
|
||||
|
||||
struct iov_ctx {
|
||||
unsigned iov_items;
|
||||
size_t iov_bytes;
|
||||
size_t iov_off;
|
||||
typedef struct iov_ctx {
|
||||
MDBX_env *env;
|
||||
osal_ioring_t *ior;
|
||||
int err;
|
||||
#ifndef MDBX_NEED_WRITTEN_RANGE
|
||||
#define MDBX_NEED_WRITTEN_RANGE 1
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
#if MDBX_NEED_WRITTEN_RANGE
|
||||
pgno_t flush_begin;
|
||||
pgno_t flush_end;
|
||||
struct iovec iov[MDBX_COMMIT_PAGES];
|
||||
};
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
uint64_t coherency_timestamp;
|
||||
} iov_ctx_t;
|
||||
|
||||
static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) {
|
||||
ctx->flush_begin = MAX_PAGENO;
|
||||
ctx->flush_end = MIN_PAGENO;
|
||||
ctx->iov_items = 0;
|
||||
ctx->iov_bytes = 0;
|
||||
ctx->iov_off = 0;
|
||||
(void)txn;
|
||||
__must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx,
|
||||
unsigned items, pgno_t npages) {
|
||||
ctx->env = txn->mt_env;
|
||||
ctx->ior = &txn->mt_env->me_ioring;
|
||||
ctx->err = osal_ioring_reserve(ctx->ior, items,
|
||||
pgno_align2os_bytes(txn->mt_env, npages));
|
||||
if (likely(ctx->err == MDBX_SUCCESS)) {
|
||||
#if MDBX_NEED_WRITTEN_RANGE
|
||||
ctx->flush_begin = MAX_PAGENO;
|
||||
ctx->flush_end = MIN_PAGENO;
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
osal_ioring_reset(ctx->ior);
|
||||
}
|
||||
return ctx->err;
|
||||
}
|
||||
|
||||
static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) {
|
||||
tASSERT(txn, ctx->iov_items == 0);
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
MDBX_env *const env = txn->mt_env;
|
||||
if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00)
|
||||
/* Linux kernels older than version 2.6.11 ignore the addr and nbytes
|
||||
* arguments, making this function fairly expensive. Therefore, the
|
||||
* whole cache is always flushed. */
|
||||
osal_flush_incoherent_mmap(
|
||||
env->me_map + pgno2bytes(env, ctx->flush_begin),
|
||||
pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize);
|
||||
#endif /* Linux */
|
||||
static inline bool iov_empty(const iov_ctx_t *ctx) {
|
||||
return osal_ioring_used(ctx->ior) == 0;
|
||||
}
|
||||
|
||||
static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) {
|
||||
tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
|
||||
tASSERT(txn, ctx->iov_items > 0);
|
||||
static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data,
|
||||
size_t bytes) {
|
||||
MDBX_env *const env = ctx->env;
|
||||
eASSERT(env, (env->me_flags & MDBX_WRITEMAP) == 0);
|
||||
|
||||
MDBX_env *const env = txn->mt_env;
|
||||
int rc;
|
||||
if (likely(ctx->iov_items == 1)) {
|
||||
eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len);
|
||||
rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len,
|
||||
ctx->iov_off);
|
||||
} else {
|
||||
rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off,
|
||||
ctx->iov_bytes);
|
||||
}
|
||||
MDBX_page *wp = (MDBX_page *)data;
|
||||
eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset));
|
||||
eASSERT(env, bytes2pgno(env, bytes) >= (IS_OVERFLOW(wp) ? wp->mp_pages : 1u));
|
||||
eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0);
|
||||
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
ERROR("Write error: %s", mdbx_strerror(rc));
|
||||
else {
|
||||
VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off,
|
||||
ctx->iov_bytes);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off,
|
||||
ctx->iov_bytes);
|
||||
}
|
||||
|
||||
unsigned iov_items = ctx->iov_items;
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
ctx->iov_items = 0;
|
||||
ctx->iov_bytes = 0;
|
||||
|
||||
uint64_t timestamp = 0;
|
||||
for (unsigned i = 0; i < iov_items; i++) {
|
||||
MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base;
|
||||
const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno);
|
||||
if (likely(ctx->err == MDBX_SUCCESS)) {
|
||||
VALGRIND_MAKE_MEM_DEFINED(env->me_map + offset, bytes);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map + offset, bytes);
|
||||
osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize);
|
||||
const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset);
|
||||
/* check with timeout as the workaround
|
||||
* for todo4recovery://erased_by_github/libmdbx/issues/269 */
|
||||
while (likely(rc == MDBX_SUCCESS) &&
|
||||
unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) {
|
||||
if (!timestamp) {
|
||||
iov_done(txn, ctx);
|
||||
WARNING(
|
||||
"catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno,
|
||||
"(workaround for incoherent flaw of unified page/buffer cache)");
|
||||
}
|
||||
if (coherency_timeout(×tamp, wp->mp_pgno) != MDBX_RESULT_TRUE)
|
||||
rc = MDBX_PROBLEM;
|
||||
if (unlikely(memcmp(wp, rp, bytes))) {
|
||||
ctx->coherency_timestamp = 0;
|
||||
WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno,
|
||||
"(workaround for incoherent flaw of unified page/buffer cache)");
|
||||
do
|
||||
if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno) !=
|
||||
MDBX_RESULT_TRUE) {
|
||||
ctx->err = MDBX_PROBLEM;
|
||||
break;
|
||||
}
|
||||
while (unlikely(memcmp(wp, rp, bytes)));
|
||||
}
|
||||
dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len));
|
||||
}
|
||||
return rc;
|
||||
|
||||
if (likely(bytes == env->me_psize))
|
||||
dpage_free(env, wp, 1);
|
||||
else {
|
||||
do {
|
||||
eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset));
|
||||
eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0);
|
||||
unsigned npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u;
|
||||
size_t chunk = pgno2bytes(env, npages);
|
||||
eASSERT(env, bytes >= chunk);
|
||||
dpage_free(env, wp, npages);
|
||||
wp = (MDBX_page *)((char *)wp + chunk);
|
||||
offset += chunk;
|
||||
bytes -= chunk;
|
||||
} while (bytes);
|
||||
}
|
||||
}
|
||||
|
||||
static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp,
|
||||
unsigned npages) {
|
||||
static void iov_complete(iov_ctx_t *ctx) {
|
||||
if ((ctx->env->me_flags & MDBX_WRITEMAP) == 0)
|
||||
osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages);
|
||||
osal_ioring_reset(ctx->ior);
|
||||
}
|
||||
|
||||
__must_check_result static int iov_write(iov_ctx_t *ctx) {
|
||||
eASSERT(ctx->env, !iov_empty(ctx));
|
||||
osal_ioring_write_result_t r = osal_ioring_write(ctx->ior);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
ctx->err = r.err;
|
||||
if (unlikely(ctx->err != MDBX_SUCCESS))
|
||||
ERROR("Write error: %s", mdbx_strerror(ctx->err));
|
||||
iov_complete(ctx);
|
||||
return ctx->err;
|
||||
}
|
||||
|
||||
__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx,
|
||||
MDBX_page *dp, unsigned npages) {
|
||||
MDBX_env *const env = txn->mt_env;
|
||||
tASSERT(txn, ctx->err == MDBX_SUCCESS);
|
||||
tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno);
|
||||
tASSERT(txn, IS_MODIFIABLE(txn, dp));
|
||||
tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)));
|
||||
|
||||
ctx->flush_begin =
|
||||
(ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno;
|
||||
ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages)
|
||||
? ctx->flush_end
|
||||
: dp->mp_pgno + npages;
|
||||
env->me_lck->mti_unsynced_pages.weak += npages;
|
||||
|
||||
if (IS_SHADOWED(txn, dp)) {
|
||||
tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
|
||||
dp->mp_txnid = txn->mt_txnid;
|
||||
tASSERT(txn, IS_SPILLED(txn, dp));
|
||||
const size_t size = pgno2bytes(env, npages);
|
||||
if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) ||
|
||||
ctx->iov_items == ARRAY_LENGTH(ctx->iov) ||
|
||||
ctx->iov_bytes + size > MAX_WRITE) {
|
||||
if (ctx->iov_items) {
|
||||
int err = iov_write(txn, ctx);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
if (linux_kernel_version >= 0x02060b00)
|
||||
/* Linux kernels older than version 2.6.11 ignore the addr and nbytes
|
||||
* arguments, making this function fairly expensive. Therefore, the
|
||||
* whole cache is always flushed. */
|
||||
#endif /* Linux */
|
||||
osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes,
|
||||
env->me_os_psize);
|
||||
int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp,
|
||||
pgno2bytes(env, npages));
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
ctx->err = err;
|
||||
if (unlikely(err != MDBX_RESULT_TRUE)) {
|
||||
iov_complete(ctx);
|
||||
return err;
|
||||
}
|
||||
ctx->iov_off = pgno2bytes(env, dp->mp_pgno);
|
||||
err = iov_write(ctx);
|
||||
tASSERT(txn, iov_empty(ctx));
|
||||
if (likely(err == MDBX_SUCCESS)) {
|
||||
err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp,
|
||||
pgno2bytes(env, npages));
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
iov_complete(ctx);
|
||||
return ctx->err = err;
|
||||
}
|
||||
}
|
||||
tASSERT(txn, ctx->err == MDBX_SUCCESS);
|
||||
}
|
||||
ctx->iov[ctx->iov_items].iov_base = (void *)dp;
|
||||
ctx->iov[ctx->iov_items].iov_len = size;
|
||||
ctx->iov_items += 1;
|
||||
ctx->iov_bytes += size;
|
||||
} else {
|
||||
tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP);
|
||||
}
|
||||
|
||||
#if MDBX_NEED_WRITTEN_RANGE
|
||||
ctx->flush_begin =
|
||||
(ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno;
|
||||
ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages)
|
||||
? ctx->flush_end
|
||||
: dp->mp_pgno + npages;
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
env->me_lck->mti_unsynced_pages.weak += npages;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp,
|
||||
static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp,
|
||||
unsigned npages) {
|
||||
tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
|
||||
pgno_t pgno = dp->mp_pgno;
|
||||
@ -4613,13 +4628,18 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
txn->tw.dirtyroom, need);
|
||||
tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill);
|
||||
|
||||
struct iov_ctx ctx;
|
||||
iov_init(txn, &ctx);
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (txn->mt_flags & MDBX_WRITEMAP) {
|
||||
MDBX_dpl *const dl = txn->tw.dirtylist;
|
||||
const unsigned span = dl->length - txn->tw.loose_count;
|
||||
txn->tw.dirtyroom += span;
|
||||
|
||||
iov_ctx_t ctx;
|
||||
rc = iov_init(txn, &ctx, wanna_spill,
|
||||
dl->pages_including_loose - txn->tw.loose_count);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
unsigned r, w;
|
||||
for (w = 0, r = 1; r <= dl->length; ++r) {
|
||||
MDBX_page *dp = dl->items[r].ptr;
|
||||
@ -4749,6 +4769,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
prio2spill, prio2adjacent, spillable, wanna_spill, amount);
|
||||
tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);
|
||||
|
||||
iov_ctx_t ctx;
|
||||
rc = iov_init(txn, &ctx, amount,
|
||||
txn->tw.dirtylist->pages_including_loose -
|
||||
txn->tw.loose_count);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
unsigned prev_prio = 256;
|
||||
unsigned r, w, prio;
|
||||
pgno_t spilled_entries = 0, spilled_npages = 0;
|
||||
@ -4814,12 +4841,10 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
txn->tw.dirtylist->pages_including_loose -= spilled_npages;
|
||||
tASSERT(txn, dirtylist_check(txn));
|
||||
|
||||
if (ctx.iov_items) {
|
||||
/* iov_page() frees dirty-pages and reset iov_items in case of failure. */
|
||||
if (!iov_empty(&ctx)) {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
rc = iov_write(txn, &ctx);
|
||||
rc = iov_write(&ctx);
|
||||
}
|
||||
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
@ -4827,9 +4852,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
txn->mt_flags |= MDBX_TXN_SPILLS;
|
||||
NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries,
|
||||
txn->tw.dirtyroom);
|
||||
iov_done(txn, &ctx);
|
||||
} else {
|
||||
tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
for (unsigned i = 1; i <= dl->length; ++i) {
|
||||
MDBX_page *dp = dl->items[i].ptr;
|
||||
NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
|
||||
@ -5610,7 +5634,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
|
||||
|
||||
if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
env->me_lck->mti_pgop_stat.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno),
|
||||
MDBX_SYNC_NONE);
|
||||
@ -5743,74 +5767,71 @@ __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno,
|
||||
true);
|
||||
}
|
||||
|
||||
static int meta_unsteady(MDBX_env *env, const txnid_t last_steady,
|
||||
MDBX_meta *const meta, mdbx_filehandle_t fd) {
|
||||
static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than,
|
||||
const pgno_t pgno) {
|
||||
MDBX_meta *const meta = METAPAGE(env, pgno);
|
||||
const txnid_t txnid = constmeta_txnid(meta);
|
||||
if (unlikely(err != MDBX_SUCCESS) || !META_IS_STEADY(meta) ||
|
||||
!(txnid < early_than))
|
||||
return err;
|
||||
|
||||
WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno);
|
||||
const uint64_t wipe = MDBX_DATASIGN_NONE;
|
||||
if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) {
|
||||
WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
|
||||
data_page(meta)->mp_pgno);
|
||||
if (env->me_flags & MDBX_WRITEMAP)
|
||||
unaligned_poke_u64(4, meta->mm_sign, wipe);
|
||||
else
|
||||
return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign),
|
||||
(uint8_t *)&meta->mm_sign - env->me_map);
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) {
|
||||
MDBX_env *const env = txn->mt_env;
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
|
||||
? env->me_dsync_fd
|
||||
: env->me_lazy_fd;
|
||||
int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
const void *ptr = &wipe;
|
||||
size_t bytes = sizeof(meta->mm_sign),
|
||||
offset = (uint8_t *)&meta->mm_sign - env->me_map;
|
||||
if (env->me_flags & MDBX_WRITEMAP) {
|
||||
unaligned_poke_u64(4, meta->mm_sign, wipe);
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
|
||||
MDBX_SYNC_DATA);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
} else {
|
||||
if (fd == env->me_lazy_fd) {
|
||||
#if MDBX_USE_SYNCFILERANGE
|
||||
static bool syncfilerange_unavailable;
|
||||
if (!syncfilerange_unavailable &&
|
||||
sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
|
||||
SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) {
|
||||
err = errno;
|
||||
if (ignore_enosys(err) == MDBX_RESULT_TRUE)
|
||||
syncfilerange_unavailable = true;
|
||||
}
|
||||
if (syncfilerange_unavailable)
|
||||
#endif /* MDBX_USE_SYNCFILERANGE */
|
||||
err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
}
|
||||
osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
|
||||
env->me_os_psize);
|
||||
ptr = data_page(meta);
|
||||
offset = (uint8_t *)ptr - env->me_map;
|
||||
bytes = env->me_psize;
|
||||
}
|
||||
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
err = osal_pwrite(env->me_fd4meta, ptr, bytes, offset);
|
||||
if (likely(err == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) {
|
||||
err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
__cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) {
|
||||
MDBX_env *const env = txn->mt_env;
|
||||
int err = MDBX_SUCCESS;
|
||||
|
||||
/* early than last_steady */
|
||||
err = meta_unsteady(err, env, last_steady, 0);
|
||||
err = meta_unsteady(err, env, last_steady, 1);
|
||||
err = meta_unsteady(err, env, last_steady, 2);
|
||||
|
||||
/* the last_steady */
|
||||
err = meta_unsteady(err, env, last_steady + 1, 0);
|
||||
err = meta_unsteady(err, env, last_steady + 1, 1);
|
||||
err = meta_unsteady(err, env, last_steady + 1, 2);
|
||||
|
||||
osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
|
||||
env->me_os_psize);
|
||||
|
||||
/* force oldest refresh */
|
||||
atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
|
||||
|
||||
tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
|
||||
txn->tw.troika = meta_tap(env);
|
||||
for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child)
|
||||
if (scan != txn)
|
||||
scan->tw.troika = txn->tw.troika;
|
||||
return MDBX_SUCCESS;
|
||||
return err;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -7052,6 +7073,40 @@ fail:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int meta_sync(const MDBX_env *env, const meta_ptr_t head) {
|
||||
eASSERT(env, atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
|
||||
(uint32_t)head.txnid);
|
||||
/* Функция может вызываться (в том числе) при (env->me_flags &
|
||||
* MDBX_NOMETASYNC) == 0 и env->me_fd4meta == env->me_dsync_fd, например если
|
||||
* предыдущая транзакция была выполненна с флагом MDBX_NOMETASYNC. */
|
||||
|
||||
int rc = MDBX_RESULT_TRUE;
|
||||
if (env->me_flags & MDBX_WRITEMAP) {
|
||||
#if MDBX_ENABLE_PGOP_ST
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
const MDBX_page *page = data_page(head.ptr_c);
|
||||
rc = osal_pwrite(env->me_fd4meta, page, env->me_psize,
|
||||
(uint8_t *)page - env->me_map);
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) {
|
||||
rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
} else {
|
||||
rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)head.txnid;
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold static int env_sync(MDBX_env *env, bool force, bool nonblock) {
|
||||
bool locked = false;
|
||||
int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;
|
||||
@ -7104,7 +7159,7 @@ retry:;
|
||||
|
||||
int err;
|
||||
/* pre-sync to avoid latency for writer */
|
||||
if (unsynced_pages > /* FIXME: define threshold */ 16 &&
|
||||
if (unsynced_pages > /* FIXME: define threshold */ 42 &&
|
||||
(flags & MDBX_SAFE_NOSYNC) == 0) {
|
||||
eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
|
||||
if (flags & MDBX_WRITEMAP) {
|
||||
@ -7173,19 +7228,8 @@ retry:;
|
||||
/* LY: sync meta-pages if MDBX_NOMETASYNC enabled
|
||||
* and someone was not synced above. */
|
||||
if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
|
||||
(uint32_t)head.txnid) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = (flags & MDBX_WRITEMAP)
|
||||
? osal_msync(&env->me_dxb_mmap, 0,
|
||||
pgno_align2os_bytes(env, NUM_METAS),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ)
|
||||
: osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid,
|
||||
mo_Relaxed);
|
||||
}
|
||||
(uint32_t)head.txnid)
|
||||
rc = meta_sync(env, head);
|
||||
|
||||
bailout:
|
||||
if (locked)
|
||||
@ -7628,7 +7672,8 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid,
|
||||
__cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) {
|
||||
if (likely(timestamp && *timestamp == 0))
|
||||
*timestamp = osal_monotime();
|
||||
else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) {
|
||||
else if (unlikely(!timestamp || osal_monotime() - *timestamp >
|
||||
osal_16dot16_to_monotime(65536 / 10))) {
|
||||
if (pgno)
|
||||
ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno,
|
||||
"(workaround for incoherent flaw of unified page/buffer cache)");
|
||||
@ -9902,7 +9947,7 @@ bailout:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) {
|
||||
static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
|
||||
MDBX_dpl *const dl =
|
||||
(txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn);
|
||||
int rc = MDBX_SUCCESS;
|
||||
@ -9919,10 +9964,9 @@ static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ctx->iov_items) {
|
||||
/* iov_page() frees dirty-pages and reset iov_items in case of failure. */
|
||||
if (!iov_empty(ctx)) {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
rc = iov_write(txn, ctx);
|
||||
rc = iov_write(ctx);
|
||||
}
|
||||
|
||||
while (r <= dl->length)
|
||||
@ -10568,42 +10612,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
struct iov_ctx write_ctx;
|
||||
iov_init(txn, &write_ctx);
|
||||
const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
|
||||
iov_ctx_t write_ctx;
|
||||
rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length,
|
||||
txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid,
|
||||
mo_Relaxed) != (uint32_t)head.txnid) {
|
||||
/* sync prev meta */
|
||||
rc = meta_sync(env, head);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
rc = txn_write(txn, &write_ctx);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
iov_done(txn, &write_ctx);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
/* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
|
||||
ts_3 = latency ? osal_monotime() : 0;
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
|
||||
MDBX_meta meta;
|
||||
memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8);
|
||||
meta.mm_extra_flags = head.ptr_c->mm_extra_flags;
|
||||
meta.mm_validator_id = head.ptr_c->mm_validator_id;
|
||||
meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr;
|
||||
unaligned_poke_u64(4, meta.mm_pages_retired,
|
||||
unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) +
|
||||
MDBX_PNL_SIZE(txn->tw.retired_pages));
|
||||
meta.mm_geo = txn->mt_geo;
|
||||
meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
|
||||
meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
|
||||
meta.mm_canary = txn->mt_canary;
|
||||
MDBX_meta meta;
|
||||
memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8);
|
||||
meta.mm_extra_flags = head.ptr_c->mm_extra_flags;
|
||||
meta.mm_validator_id = head.ptr_c->mm_validator_id;
|
||||
meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr;
|
||||
unaligned_poke_u64(4, meta.mm_pages_retired,
|
||||
unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) +
|
||||
MDBX_PNL_SIZE(txn->tw.retired_pages));
|
||||
meta.mm_geo = txn->mt_geo;
|
||||
meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
|
||||
meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
|
||||
meta.mm_canary = txn->mt_canary;
|
||||
|
||||
txnid_t commit_txnid = txn->mt_txnid;
|
||||
txnid_t commit_txnid = txn->mt_txnid;
|
||||
#if MDBX_ENABLE_BIGFOOT
|
||||
if (gcu_ctx.bigfoot > txn->mt_txnid) {
|
||||
commit_txnid = gcu_ctx.bigfoot;
|
||||
TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid,
|
||||
(unsigned)(commit_txnid - txn->mt_txnid));
|
||||
}
|
||||
#endif
|
||||
meta_set_txnid(env, &meta, commit_txnid);
|
||||
|
||||
rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED,
|
||||
&meta, &txn->tw.troika);
|
||||
if (gcu_ctx.bigfoot > txn->mt_txnid) {
|
||||
commit_txnid = gcu_ctx.bigfoot;
|
||||
TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid,
|
||||
(unsigned)(commit_txnid - txn->mt_txnid));
|
||||
}
|
||||
#endif
|
||||
meta_set_txnid(env, &meta, commit_txnid);
|
||||
|
||||
rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED,
|
||||
&meta, &txn->tw.troika);
|
||||
ts_4 = latency ? osal_monotime() : 0;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
env->me_flags |= MDBX_FATAL_ERROR;
|
||||
@ -10894,11 +10949,11 @@ static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta,
|
||||
__cold static int read_header(MDBX_env *env, MDBX_meta *dest,
|
||||
const int lck_exclusive,
|
||||
const mdbx_mode_t mode_bits) {
|
||||
memset(dest, 0, sizeof(MDBX_meta));
|
||||
int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
memset(dest, 0, sizeof(MDBX_meta));
|
||||
unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK);
|
||||
rc = MDBX_CORRUPTED;
|
||||
|
||||
@ -11200,7 +11255,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
||||
if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
|
||||
eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
|
||||
enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE;
|
||||
unsigned sync_op = 0;
|
||||
if ((flags & MDBX_SAFE_NOSYNC) == 0) {
|
||||
sync_op = 1;
|
||||
mode_bits = MDBX_SYNC_DATA;
|
||||
if (pending->mm_geo.next >
|
||||
meta_prefer_steady(env, troika).ptr_c->mm_geo.now)
|
||||
@ -11209,7 +11266,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
||||
mode_bits |= MDBX_SYNC_IODQ;
|
||||
}
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
env->me_lck->mti_pgop_stat.msync.weak += sync_op;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
if (flags & MDBX_WRITEMAP)
|
||||
rc =
|
||||
@ -11298,9 +11355,6 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
||||
eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
|
||||
ENSURE(env, target == head.ptr_c ||
|
||||
constmeta_txnid(target) < pending->unsafe_txnid);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
if (flags & MDBX_WRITEMAP) {
|
||||
jitter4testing(true);
|
||||
if (likely(target != head.ptr_c)) {
|
||||
@ -11338,34 +11392,37 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
jitter4testing(true);
|
||||
/* sync meta-pages */
|
||||
rc =
|
||||
osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
|
||||
(flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE
|
||||
: MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
|
||||
(flags & MDBX_NOMETASYNC)
|
||||
? MDBX_SYNC_NONE
|
||||
: MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
} else {
|
||||
const MDBX_meta undo_meta = *target;
|
||||
const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
|
||||
? env->me_dsync_fd
|
||||
: env->me_lazy_fd;
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_pwrite(fd, pending, sizeof(MDBX_meta),
|
||||
const MDBX_meta undo_meta = *target;
|
||||
rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta),
|
||||
(uint8_t *)target - env->me_map);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
undo:
|
||||
DEBUG("%s", "write failed, disk error?");
|
||||
/* On a failure, the pagecache still contains the new data.
|
||||
* Try write some old data back, to prevent it from being used. */
|
||||
osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta),
|
||||
osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta),
|
||||
(uint8_t *)target - env->me_map);
|
||||
goto fail;
|
||||
}
|
||||
osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize);
|
||||
/* sync meta-pages */
|
||||
if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) {
|
||||
if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
goto undo;
|
||||
@ -11382,7 +11439,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
||||
goto fail;
|
||||
}
|
||||
env->me_lck->mti_meta_sync_txnid.weak =
|
||||
(uint32_t)pending->unsafe_txnid -
|
||||
pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak -
|
||||
((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0);
|
||||
|
||||
*troika = meta_tap(env);
|
||||
@ -11528,9 +11585,11 @@ __cold int mdbx_env_create(MDBX_env **penv) {
|
||||
|
||||
env->me_maxreaders = DEFAULT_READERS;
|
||||
env->me_maxdbs = env->me_numdbs = CORE_DBS;
|
||||
env->me_lazy_fd = INVALID_HANDLE_VALUE;
|
||||
env->me_dsync_fd = INVALID_HANDLE_VALUE;
|
||||
env->me_lfd = INVALID_HANDLE_VALUE;
|
||||
env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data =
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
env->me_overlapped_fd =
|
||||
#endif /* Windows */
|
||||
env->me_lfd = INVALID_HANDLE_VALUE;
|
||||
env->me_pid = osal_getpid();
|
||||
env->me_stuck_meta = -1;
|
||||
|
||||
@ -12863,10 +12922,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env,
|
||||
if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
if (env->me_flags & MDBX_WRITEMAP) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_msync(&env->me_dxb_mmap, 0,
|
||||
pgno_align2os_bytes(env, model->mm_geo.next),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
@ -12877,18 +12936,26 @@ __cold static int __must_check_result override_meta(MDBX_env *env,
|
||||
* clearing consistency flag by mdbx_meta_update_begin() */
|
||||
memcpy(pgno2page(env, target), page, env->me_psize);
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
} else {
|
||||
const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
|
||||
? env->me_dsync_fd
|
||||
: env->me_lazy_fd;
|
||||
rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target));
|
||||
if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd)
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_pwrite(env->me_fd4meta, page, env->me_psize,
|
||||
pgno2bytes(env, target));
|
||||
if (rc == MDBX_SUCCESS && env->me_fd4meta == env->me_lazy_fd) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
}
|
||||
osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
|
||||
env->me_os_psize);
|
||||
}
|
||||
osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
|
||||
env->me_os_psize);
|
||||
eASSERT(env, !env->me_txn && !env->me_txn0);
|
||||
return rc;
|
||||
}
|
||||
@ -13254,14 +13321,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
if (rc != MDBX_SUCCESS)
|
||||
goto bailout;
|
||||
|
||||
eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
|
||||
if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
|
||||
rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
|
||||
&env->me_dsync_fd, 0);
|
||||
ENSURE(env,
|
||||
(rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE));
|
||||
}
|
||||
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42);
|
||||
if (env->me_sysv_ipc.key == -1) {
|
||||
@ -13270,7 +13329,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
}
|
||||
#endif /* MDBX_LOCKING */
|
||||
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
/* Set the position in files outside of the data to avoid corruption
|
||||
* due to erroneous use of file descriptors in the application code. */
|
||||
const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000);
|
||||
osal_fseek(env->me_lazy_fd, safe_parking_lot_offset);
|
||||
|
||||
env->me_fd4data = env->me_fd4meta = env->me_lazy_fd;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
uint8_t ior_flags = 0;
|
||||
if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) {
|
||||
ior_flags = IOR_OVERLAPPED;
|
||||
rc =
|
||||
osal_openfile(MDBX_OPEN_DXB_OVERLAPPED,
|
||||
env, env_pathname.dxb, &env->me_overlapped_fd, 0);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
goto bailout;
|
||||
env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr);
|
||||
if (!env->me_data_lock_event) {
|
||||
rc = (int)GetLastError();
|
||||
goto bailout;
|
||||
}
|
||||
env->me_fd4data = env->me_overlapped_fd;
|
||||
osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset);
|
||||
}
|
||||
#else
|
||||
if (mode == 0) {
|
||||
/* pickup mode for lck-file */
|
||||
struct stat st;
|
||||
@ -13291,13 +13373,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
rc = lck_rc;
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
/* Set the position in files outside of the data to avoid corruption
|
||||
* due to erroneous use of file descriptors in the application code. */
|
||||
osal_fseek(env->me_lfd, UINT64_C(1) << 63);
|
||||
osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63);
|
||||
if (env->me_dsync_fd != INVALID_HANDLE_VALUE)
|
||||
osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63);
|
||||
osal_fseek(env->me_lfd, safe_parking_lot_offset);
|
||||
|
||||
const MDBX_env_flags_t rigorous_flags =
|
||||
MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC;
|
||||
@ -13305,6 +13381,20 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
MDBX_LIFORECLAIM |
|
||||
MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD;
|
||||
|
||||
eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
|
||||
if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 &&
|
||||
(env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) {
|
||||
rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
|
||||
&env->me_dsync_fd, 0);
|
||||
if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
|
||||
if ((flags & MDBX_NOMETASYNC) == 0)
|
||||
env->me_fd4meta = env->me_dsync_fd;
|
||||
if (env->me_fd4data == env->me_lazy_fd)
|
||||
env->me_fd4data = env->me_dsync_fd;
|
||||
osal_fseek(env->me_dsync_fd, safe_parking_lot_offset);
|
||||
}
|
||||
}
|
||||
|
||||
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
|
||||
if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
|
||||
while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) {
|
||||
@ -13413,6 +13503,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
} else
|
||||
rc = MDBX_ENOMEM;
|
||||
}
|
||||
if (rc == MDBX_SUCCESS)
|
||||
rc = osal_ioring_create(&env->me_ioring,
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
ior_flags,
|
||||
#endif /* Windows */
|
||||
env->me_fd4data);
|
||||
}
|
||||
|
||||
#if MDBX_DEBUG
|
||||
@ -13469,6 +13565,8 @@ __cold static int env_close(MDBX_env *env) {
|
||||
const int rc = lcklist_detach_locked(env);
|
||||
lcklist_unlock();
|
||||
|
||||
osal_ioring_destroy(&env->me_ioring);
|
||||
|
||||
if (env->me_map) {
|
||||
osal_munmap(&env->me_dxb_mmap);
|
||||
#ifdef MDBX_USE_VALGRIND
|
||||
@ -13477,6 +13575,14 @@ __cold static int env_close(MDBX_env *env) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) {
|
||||
CloseHandle(env->me_data_lock_event);
|
||||
CloseHandle(env->me_overlapped_fd);
|
||||
env->me_overlapped_fd = INVALID_HANDLE_VALUE;
|
||||
}
|
||||
#endif /* Windows */
|
||||
|
||||
if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
|
||||
(void)osal_closefile(env->me_dsync_fd);
|
||||
env->me_dsync_fd = INVALID_HANDLE_VALUE;
|
||||
@ -13578,7 +13684,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
|
||||
? MDBX_SUCCESS
|
||||
: rc;
|
||||
}
|
||||
#endif
|
||||
#endif /* Windows */
|
||||
}
|
||||
|
||||
eASSERT(env, env->me_signature.weak == 0);
|
||||
@ -20528,6 +20634,10 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
|
||||
atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
|
||||
arg->mi_pgop_stat.wops =
|
||||
atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
|
||||
arg->mi_pgop_stat.msync =
|
||||
atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed);
|
||||
arg->mi_pgop_stat.fsync =
|
||||
atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed);
|
||||
arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16(
|
||||
atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed));
|
||||
#else
|
||||
|
@ -591,6 +591,10 @@ typedef struct {
|
||||
MDBX_atomic_uint64_t
|
||||
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
|
||||
unit/scale is platform-depended, see osal_monotime(). */
|
||||
MDBX_atomic_uint64_t
|
||||
msync; /* Number of explicit msync/flush-to-disk operations */
|
||||
MDBX_atomic_uint64_t
|
||||
fsync; /* Number of explicit fsync/flush-to-disk operations */
|
||||
} MDBX_pgop_stat_t;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
|
||||
@ -1143,7 +1147,11 @@ struct MDBX_env {
|
||||
osal_mmap_t me_dxb_mmap; /* The main data file */
|
||||
#define me_map me_dxb_mmap.dxb
|
||||
#define me_lazy_fd me_dxb_mmap.fd
|
||||
mdbx_filehandle_t me_dsync_fd;
|
||||
#define me_fd4data me_ioring.fd
|
||||
mdbx_filehandle_t me_dsync_fd, me_fd4meta;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
HANDLE me_overlapped_fd, me_data_lock_event;
|
||||
#endif /* Windows */
|
||||
osal_mmap_t me_lck_mmap; /* The lock file */
|
||||
#define me_lfd me_lck_mmap.fd
|
||||
struct MDBX_lockinfo *me_lck;
|
||||
@ -1222,6 +1230,7 @@ struct MDBX_env {
|
||||
unsigned me_dp_reserve_len;
|
||||
/* PNL of pages that became unused in a write txn */
|
||||
MDBX_PNL me_retired_pages;
|
||||
osal_ioring_t me_ioring;
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
osal_srwlock_t me_remap_guard;
|
||||
@ -1609,20 +1618,24 @@ ceil_powerof2(size_t value, size_t granularity) {
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
|
||||
log2n_powerof2(size_t value) {
|
||||
assert(value > 0 && value < INT32_MAX && is_powerof2(value));
|
||||
assert((value & -(int32_t)value) == value);
|
||||
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
|
||||
return __builtin_ctzl(value);
|
||||
log2n_powerof2(size_t value_uintptr) {
|
||||
assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
|
||||
is_powerof2(value_uintptr));
|
||||
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
|
||||
const uint32_t value_uint32 = (uint32_t)value_uintptr;
|
||||
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
|
||||
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
|
||||
return __builtin_ctz(value_uint32);
|
||||
#elif defined(_MSC_VER)
|
||||
unsigned long index;
|
||||
_BitScanForward(&index, (unsigned long)value);
|
||||
STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
|
||||
_BitScanForward(&index, value_uint32);
|
||||
return index;
|
||||
#else
|
||||
static const uint8_t debruijn_ctz32[32] = {
|
||||
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
|
||||
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
|
||||
return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
|
||||
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -112,32 +112,73 @@ static
|
||||
#define LCK_WAITFOR 0
|
||||
#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY
|
||||
|
||||
static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset,
|
||||
size_t bytes) {
|
||||
static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags,
|
||||
uint64_t offset, size_t bytes) {
|
||||
TRACE("lock>>: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64
|
||||
" >>",
|
||||
fd, event, flags, offset, bytes);
|
||||
OVERLAPPED ov;
|
||||
ov.hEvent = 0;
|
||||
ov.Internal = 0;
|
||||
ov.InternalHigh = 0;
|
||||
ov.hEvent = event;
|
||||
ov.Offset = (DWORD)offset;
|
||||
ov.OffsetHigh = HIGH_DWORD(offset);
|
||||
return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov);
|
||||
if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) {
|
||||
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64
|
||||
", bytes %" PRId64 " << %s",
|
||||
fd, event, flags, offset, bytes, "done");
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
DWORD rc = GetLastError();
|
||||
if (rc == ERROR_IO_PENDING) {
|
||||
if (event) {
|
||||
if (GetOverlappedResult(fd, &ov, &rc, true)) {
|
||||
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64
|
||||
", bytes %" PRId64 " << %s",
|
||||
fd, event, flags, offset, bytes, "overlapped-done");
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
rc = GetLastError();
|
||||
} else
|
||||
CancelIo(fd);
|
||||
}
|
||||
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64
|
||||
" << err %d",
|
||||
fd, event, flags, offset, bytes, rc);
|
||||
return (int)rc;
|
||||
}
|
||||
|
||||
static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset,
|
||||
size_t bytes) {
|
||||
static __inline int flock(HANDLE fd, DWORD flags, uint64_t offset,
|
||||
size_t bytes) {
|
||||
return flock_with_event(fd, 0, flags, offset, bytes);
|
||||
}
|
||||
|
||||
static __inline int flock_data(const MDBX_env *env, DWORD flags,
|
||||
uint64_t offset, size_t bytes) {
|
||||
return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags,
|
||||
offset, bytes);
|
||||
}
|
||||
|
||||
static int funlock(mdbx_filehandle_t fd, uint64_t offset, size_t bytes) {
|
||||
TRACE("unlock: fd %p, offset %" PRId64 ", bytes %" PRId64, fd, offset, bytes);
|
||||
return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes,
|
||||
HIGH_DWORD(bytes));
|
||||
HIGH_DWORD(bytes))
|
||||
? MDBX_SUCCESS
|
||||
: (int)GetLastError();
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* global `write` lock for write-txt processing,
|
||||
* exclusive locking both meta-pages) */
|
||||
|
||||
#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1))
|
||||
#define LCK_META_OFFSET 0
|
||||
#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS)
|
||||
#define LCK_BODY_OFFSET LCK_META_LEN
|
||||
#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET)
|
||||
#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN
|
||||
#define LCK_WHOLE 0, LCK_MAXLEN
|
||||
#ifdef _WIN64
|
||||
#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000)
|
||||
#else
|
||||
#define DXB_MAXLEN UINT32_C(0x7ff00000)
|
||||
#endif
|
||||
#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN
|
||||
#define DXB_WHOLE 0, DXB_MAXLEN
|
||||
|
||||
int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
|
||||
if (dontwait) {
|
||||
@ -155,24 +196,27 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
|
||||
}
|
||||
}
|
||||
|
||||
if ((env->me_flags & MDBX_EXCLUSIVE) ||
|
||||
flock(env->me_lazy_fd,
|
||||
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
|
||||
: (LCK_EXCLUSIVE | LCK_WAITFOR),
|
||||
LCK_BODY))
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_SUCCESS;
|
||||
int rc = (int)GetLastError();
|
||||
|
||||
int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event,
|
||||
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
|
||||
: (LCK_EXCLUSIVE | LCK_WAITFOR),
|
||||
DXB_BODY);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
return rc;
|
||||
|
||||
LeaveCriticalSection(&env->me_windowsbug_lock);
|
||||
return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
|
||||
}
|
||||
|
||||
void mdbx_txn_unlock(MDBX_env *env) {
|
||||
int rc = (env->me_flags & MDBX_EXCLUSIVE)
|
||||
? TRUE
|
||||
: funlock(env->me_lazy_fd, LCK_BODY);
|
||||
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
|
||||
int err = funlock(env->me_fd4data, DXB_BODY);
|
||||
if (err != MDBX_SUCCESS)
|
||||
mdbx_panic("%s failed: err %u", __func__, err);
|
||||
}
|
||||
LeaveCriticalSection(&env->me_windowsbug_lock);
|
||||
if (!rc)
|
||||
mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
@ -193,32 +237,32 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
|
||||
|
||||
/* transition from S-? (used) to S-E (locked),
|
||||
* e.g. exclusive lock upper-part */
|
||||
if ((env->me_flags & MDBX_EXCLUSIVE) ||
|
||||
flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER))
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
int rc = (int)GetLastError();
|
||||
osal_srwlock_ReleaseShared(&env->me_remap_guard);
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
|
||||
if (env->me_lfd != INVALID_HANDLE_VALUE) {
|
||||
if (env->me_lfd != INVALID_HANDLE_VALUE &&
|
||||
(env->me_flags & MDBX_EXCLUSIVE) == 0) {
|
||||
/* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
|
||||
if ((env->me_flags & MDBX_EXCLUSIVE) == 0 &&
|
||||
!funlock(env->me_lfd, LCK_UPPER))
|
||||
mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
|
||||
int err = funlock(env->me_lfd, LCK_UPPER);
|
||||
if (err != MDBX_SUCCESS)
|
||||
mdbx_panic("%s failed: err %u", __func__, err);
|
||||
}
|
||||
osal_srwlock_ReleaseShared(&env->me_remap_guard);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
|
||||
return flock(fd,
|
||||
wait ? LCK_EXCLUSIVE | LCK_WAITFOR
|
||||
: LCK_EXCLUSIVE | LCK_DONTWAIT,
|
||||
0, LCK_MAXLEN)
|
||||
? MDBX_SUCCESS
|
||||
: (int)GetLastError();
|
||||
return flock(
|
||||
fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0,
|
||||
DXB_MAXLEN);
|
||||
}
|
||||
|
||||
static int suspend_and_append(mdbx_handle_array_t **array,
|
||||
@ -386,40 +430,36 @@ static void lck_unlock(MDBX_env *env) {
|
||||
|
||||
if (env->me_lfd != INVALID_HANDLE_VALUE) {
|
||||
/* double `unlock` for robustly remove overlapped shared/exclusive locks */
|
||||
while (funlock(env->me_lfd, LCK_LOWER))
|
||||
;
|
||||
err = (int)GetLastError();
|
||||
do
|
||||
err = funlock(env->me_lfd, LCK_LOWER);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(void)err;
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
|
||||
while (funlock(env->me_lfd, LCK_UPPER))
|
||||
;
|
||||
err = (int)GetLastError();
|
||||
do
|
||||
err = funlock(env->me_lfd, LCK_UPPER);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(void)err;
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
}
|
||||
|
||||
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
|
||||
if (env->me_fd4data != INVALID_HANDLE_VALUE) {
|
||||
/* explicitly unlock to avoid latency for other processes (windows kernel
|
||||
* releases such locks via deferred queues) */
|
||||
while (funlock(env->me_lazy_fd, LCK_BODY))
|
||||
;
|
||||
err = (int)GetLastError();
|
||||
do
|
||||
err = funlock(env->me_fd4data, DXB_BODY);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(void)err;
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
|
||||
while (funlock(env->me_lazy_fd, LCK_WHOLE))
|
||||
;
|
||||
err = (int)GetLastError();
|
||||
do
|
||||
err = funlock(env->me_fd4data, DXB_WHOLE);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(void)err;
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
}
|
||||
}
|
||||
@ -428,56 +468,55 @@ static void lck_unlock(MDBX_env *env) {
|
||||
* or as 'used' (S-? and returns MDBX_RESULT_FALSE).
|
||||
* Otherwise returns an error. */
|
||||
static int internal_seize_lck(HANDLE lfd) {
|
||||
int rc;
|
||||
assert(lfd != INVALID_HANDLE_VALUE);
|
||||
|
||||
/* 1) now on ?-? (free), get ?-E (middle) */
|
||||
jitter4testing(false);
|
||||
if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) {
|
||||
rc = (int)GetLastError() /* 2) something went wrong, give up */;
|
||||
int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
/* 2) something went wrong, give up */;
|
||||
ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* 3) now on ?-E (middle), try E-E (exclusive-write) */
|
||||
jitter4testing(false);
|
||||
if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER))
|
||||
rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;
|
||||
|
||||
/* 5) still on ?-E (middle) */
|
||||
rc = (int)GetLastError();
|
||||
jitter4testing(false);
|
||||
if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
|
||||
/* 6) something went wrong, give up */
|
||||
if (!funlock(lfd, LCK_UPPER))
|
||||
rc = funlock(lfd, LCK_UPPER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)",
|
||||
(int)GetLastError());
|
||||
rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* 7) still on ?-E (middle), try S-E (locked) */
|
||||
jitter4testing(false);
|
||||
rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE
|
||||
: (int)GetLastError();
|
||||
rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
|
||||
|
||||
jitter4testing(false);
|
||||
if (rc != MDBX_RESULT_FALSE)
|
||||
if (rc != MDBX_SUCCESS)
|
||||
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
|
||||
|
||||
/* 8) now on S-E (locked) or still on ?-E (middle),
|
||||
* transition to S-? (used) or ?-? (free) */
|
||||
if (!funlock(lfd, LCK_UPPER))
|
||||
int err = funlock(lfd, LCK_UPPER);
|
||||
if (err != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__,
|
||||
"X-E(locked/middle) >> X-?(used/free)", (int)GetLastError());
|
||||
"X-E(locked/middle) >> X-?(used/free)", err);
|
||||
|
||||
/* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
int rc;
|
||||
|
||||
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
|
||||
assert(env->me_fd4data != INVALID_HANDLE_VALUE);
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_RESULT_TRUE /* nope since files were must be opened
|
||||
non-shareable */
|
||||
@ -486,15 +525,13 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
if (env->me_lfd == INVALID_HANDLE_VALUE) {
|
||||
/* LY: without-lck mode (e.g. on read-only filesystem) */
|
||||
jitter4testing(false);
|
||||
if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
|
||||
rc = (int)GetLastError();
|
||||
int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
ERROR("%s, err %u", "without-lck", rc);
|
||||
return rc;
|
||||
}
|
||||
return MDBX_RESULT_FALSE;
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = internal_seize_lck(env->me_lfd);
|
||||
int rc = internal_seize_lck(env->me_lfd);
|
||||
jitter4testing(false);
|
||||
if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
|
||||
/* Check that another process don't operates in without-lck mode.
|
||||
@ -503,17 +540,18 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
* - we need an exclusive lock for do so;
|
||||
* - we can't lock meta-pages, otherwise other process could get an error
|
||||
* while opening db in valid (non-conflict) mode. */
|
||||
if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
|
||||
rc = (int)GetLastError();
|
||||
ERROR("%s, err %u", "lock-against-without-lck", rc);
|
||||
int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE);
|
||||
if (err != MDBX_SUCCESS) {
|
||||
ERROR("%s, err %u", "lock-against-without-lck", err);
|
||||
jitter4testing(false);
|
||||
lck_unlock(env);
|
||||
} else {
|
||||
jitter4testing(false);
|
||||
if (!funlock(env->me_lazy_fd, LCK_BODY))
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__,
|
||||
"unlock-against-without-lck", (int)GetLastError());
|
||||
return err;
|
||||
}
|
||||
jitter4testing(false);
|
||||
err = funlock(env->me_fd4data, DXB_WHOLE);
|
||||
if (err != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__,
|
||||
"unlock-against-without-lck", err);
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -521,28 +559,31 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
|
||||
/* Transite from exclusive-write state (E-E) to used (S-?) */
|
||||
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
|
||||
assert(env->me_fd4data != INVALID_HANDLE_VALUE);
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
|
||||
;
|
||||
/* 1) now at E-E (exclusive-write), transition to ?_E (middle) */
|
||||
if (!funlock(env->me_lfd, LCK_LOWER))
|
||||
int rc = funlock(env->me_lfd, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__,
|
||||
"E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError());
|
||||
"E-E(exclusive-write) >> ?-E(middle)", rc);
|
||||
|
||||
/* 2) now at ?-E (middle), transition to S-E (locked) */
|
||||
if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) {
|
||||
int rc = (int)GetLastError() /* 3) something went wrong, give up */;
|
||||
rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
/* 3) something went wrong, give up */;
|
||||
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* 4) got S-E (locked), continue transition to S-? (used) */
|
||||
if (!funlock(env->me_lfd, LCK_UPPER))
|
||||
rc = funlock(env->me_lfd, LCK_UPPER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
|
||||
(int)GetLastError());
|
||||
rc);
|
||||
|
||||
return MDBX_SUCCESS /* 5) now at S-? (used), done */;
|
||||
}
|
||||
@ -555,24 +596,26 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) {
|
||||
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
|
||||
;
|
||||
|
||||
int rc;
|
||||
/* 1) now on S-? (used), try S-E (locked) */
|
||||
jitter4testing(false);
|
||||
if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) {
|
||||
rc = (int)GetLastError() /* 2) something went wrong, give up */;
|
||||
int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
/* 2) something went wrong, give up */;
|
||||
VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* 3) now on S-E (locked), transition to ?-E (middle) */
|
||||
if (!funlock(env->me_lfd, LCK_LOWER))
|
||||
rc = funlock(env->me_lfd, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
|
||||
(int)GetLastError());
|
||||
rc);
|
||||
|
||||
/* 4) now on ?-E (middle), try E-E (exclusive-write) */
|
||||
jitter4testing(false);
|
||||
if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) {
|
||||
rc = (int)GetLastError() /* 5) something went wrong, give up */;
|
||||
rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
/* 5) something went wrong, give up */;
|
||||
VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc);
|
||||
return rc;
|
||||
}
|
||||
@ -586,6 +629,23 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
(void)env;
|
||||
(void)inprocess_neighbor;
|
||||
(void)global_uniqueness_flag;
|
||||
if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) {
|
||||
HANDLE token = INVALID_HANDLE_VALUE;
|
||||
TOKEN_PRIVILEGES privileges;
|
||||
privileges.PrivilegeCount = 1;
|
||||
privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
|
||||
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES,
|
||||
&token) ||
|
||||
!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
|
||||
&privileges.Privileges[0].Luid) ||
|
||||
!AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges),
|
||||
nullptr, nullptr) ||
|
||||
GetLastError() != ERROR_SUCCESS)
|
||||
mdbx_SetFileIoOverlappedRange = NULL;
|
||||
|
||||
if (token != INVALID_HANDLE_VALUE)
|
||||
CloseHandle(token);
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
@ -752,6 +812,7 @@ MDBX_NtFsControlFile mdbx_NtFsControlFile;
|
||||
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
|
||||
MDBX_GetTickCount64 mdbx_GetTickCount64;
|
||||
MDBX_RegGetValueA mdbx_RegGetValueA;
|
||||
MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
|
||||
#endif /* xMDBX_ALLOY */
|
||||
|
||||
#if __GNUC_PREREQ(8, 0)
|
||||
@ -783,6 +844,7 @@ static void mdbx_winnt_import(void) {
|
||||
GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
|
||||
GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
|
||||
GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
|
||||
GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange);
|
||||
}
|
||||
|
||||
const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
|
||||
|
624
src/osal.c
624
src/osal.c
@ -1,4 +1,4 @@
|
||||
/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
|
||||
/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
|
||||
|
||||
/*
|
||||
* Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
|
||||
@ -537,6 +537,596 @@ MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ior_alignment_mask (ior->pagesize - 1)
|
||||
#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element))
|
||||
|
||||
static void ior_put_event(osal_ioring_t *ior, HANDLE event) {
|
||||
assert(event && event != INVALID_HANDLE_VALUE && event != ior);
|
||||
assert(ior->event_stack < ior->allocated);
|
||||
ior->event_pool[ior->event_stack] = event;
|
||||
ior->event_stack += 1;
|
||||
}
|
||||
|
||||
static HANDLE ior_get_event(osal_ioring_t *ior) {
|
||||
assert(ior->event_stack <= ior->allocated);
|
||||
if (ior->event_stack > 0) {
|
||||
ior->event_stack -= 1;
|
||||
assert(ior->event_pool[ior->event_stack] != 0);
|
||||
return ior->event_pool[ior->event_stack];
|
||||
}
|
||||
return CreateEventW(nullptr, true, false, nullptr);
|
||||
}
|
||||
|
||||
static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) {
|
||||
osal_ioring_t *ior = ov->hEvent;
|
||||
ov->Internal = err;
|
||||
ov->InternalHigh = bytes;
|
||||
if (++ior->async_completed >= ior->async_waiting)
|
||||
SetEvent(ior->async_done);
|
||||
}
|
||||
|
||||
#elif MDBX_HAVE_PWRITEV
|
||||
#if defined(_SC_IOV_MAX)
|
||||
static size_t osal_iov_max;
|
||||
#define OSAL_IOV_MAX osal_iov_max
|
||||
#else
|
||||
#define OSAL_IOV_MAX IOV_MAX
|
||||
#endif
|
||||
#else
|
||||
#undef OSAL_IOV_MAX
|
||||
#endif /* OSAL_IOV_MAX */
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior,
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
unsigned flags,
|
||||
#endif /* Windows */
|
||||
mdbx_filehandle_t fd) {
|
||||
memset(ior, 0, sizeof(osal_ioring_t));
|
||||
ior->fd = fd;
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
ior->flags = flags;
|
||||
const unsigned pagesize = (unsigned)osal_syspagesize();
|
||||
ior->pagesize = pagesize;
|
||||
ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize);
|
||||
ior->async_done = ior_get_event(ior);
|
||||
if (!ior->async_done)
|
||||
return GetLastError();
|
||||
#endif /* !Windows */
|
||||
|
||||
#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX)
|
||||
if (!osal_iov_max)
|
||||
osal_iov_max = sysconf(_SC_IOV_MAX);
|
||||
#endif
|
||||
|
||||
ior->boundary = (char *)(ior->pool + ior->allocated);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static __inline size_t ior_offset(const ior_item_t *item) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset))
|
||||
? (uint64_t)item->ov.OffsetHigh << 32
|
||||
: 0);
|
||||
#else
|
||||
return item->offset;
|
||||
#endif /* !Windows */
|
||||
}
|
||||
|
||||
static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) {
|
||||
#if defined(ior_sgv_element)
|
||||
assert(sgvcnt > 0);
|
||||
return (ior_item_t *)((char *)item + sizeof(ior_item_t) -
|
||||
sizeof(ior_sgv_element) +
|
||||
sizeof(ior_sgv_element) * sgvcnt);
|
||||
#else
|
||||
assert(sgvcnt == 1);
|
||||
(void)sgvcnt;
|
||||
return item + 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
|
||||
void *data, const size_t bytes) {
|
||||
|
||||
assert(bytes && data);
|
||||
assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE);
|
||||
assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE);
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2);
|
||||
const bool use_gather =
|
||||
(ior->flags & IOR_UNBUFFERED) && ior->slots_left >= segments;
|
||||
#endif /* Windows */
|
||||
|
||||
ior_item_t *item = ior->pool;
|
||||
if (likely(ior->last)) {
|
||||
item = ior->last;
|
||||
if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) &&
|
||||
likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (use_gather &&
|
||||
((bytes | (uintptr_t)data | ior->last_bytes |
|
||||
(uintptr_t)(uint64_t)item->sgv[0].Buffer) &
|
||||
ior_alignment_mask) == 0 &&
|
||||
ior->last_sgvcnt + segments < OSAL_IOV_MAX) {
|
||||
assert((item->single.iov_len & 1) == 0);
|
||||
assert(item->sgv[ior->last_sgvcnt].Buffer == 0);
|
||||
ior->last_bytes += bytes;
|
||||
size_t i = 0;
|
||||
do {
|
||||
item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data);
|
||||
data = (char *)data + ior->pagesize;
|
||||
} while (++i < segments);
|
||||
ior->slots_left -= segments;
|
||||
item->sgv[ior->last_sgvcnt += segments].Buffer = 0;
|
||||
assert((item->single.iov_len & 1) == 0);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
const void *end =
|
||||
(char *)(item->single.iov_base) + item->single.iov_len - 1;
|
||||
if (unlikely(end == data)) {
|
||||
assert((item->single.iov_len & 1) != 0);
|
||||
item->single.iov_len += bytes;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
#elif MDBX_HAVE_PWRITEV
|
||||
assert((int)item->sgvcnt > 0);
|
||||
const void *end = (char *)(item->sgv[item->sgvcnt - 1].iov_base) +
|
||||
item->sgv[item->sgvcnt - 1].iov_len;
|
||||
if (unlikely(end == data)) {
|
||||
item->sgv[item->sgvcnt - 1].iov_len += bytes;
|
||||
ior->last_bytes += bytes;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
if (likely(item->sgvcnt < OSAL_IOV_MAX)) {
|
||||
if (unlikely(ior->slots_left < 1))
|
||||
return MDBX_RESULT_TRUE;
|
||||
item->sgv[item->sgvcnt].iov_base = data;
|
||||
item->sgv[item->sgvcnt].iov_len = bytes;
|
||||
ior->last_bytes += bytes;
|
||||
item->sgvcnt += 1;
|
||||
ior->slots_left -= 1;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
#else
|
||||
const void *end = (char *)(item->single.iov_base) + item->single.iov_len;
|
||||
if (unlikely(end == data)) {
|
||||
item->single.iov_len += bytes;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
item = ior_next(item, ior_last_sgvcnt(ior, item));
|
||||
}
|
||||
|
||||
if (unlikely(ior->slots_left < 1))
|
||||
return MDBX_RESULT_TRUE;
|
||||
|
||||
unsigned slots_used = 1;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
item->ov.Internal = item->ov.InternalHigh = 0;
|
||||
item->ov.Offset = (DWORD)offset;
|
||||
item->ov.OffsetHigh = HIGH_DWORD(offset);
|
||||
item->ov.hEvent = 0;
|
||||
if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 ||
|
||||
segments > OSAL_IOV_MAX) {
|
||||
/* WriteFile() */
|
||||
item->single.iov_base = data;
|
||||
item->single.iov_len = bytes + 1;
|
||||
assert((item->single.iov_len & 1) != 0);
|
||||
} else {
|
||||
/* WriteFileGather() */
|
||||
item->sgv[0].Buffer = PtrToPtr64(data);
|
||||
for (size_t i = 1; i < segments; ++i) {
|
||||
data = (char *)data + ior->pagesize;
|
||||
item->sgv[slots_used].Buffer = PtrToPtr64(data);
|
||||
}
|
||||
item->sgv[slots_used].Buffer = 0;
|
||||
assert((item->single.iov_len & 1) == 0);
|
||||
slots_used = segments;
|
||||
}
|
||||
ior->last_bytes = bytes;
|
||||
ior_last_sgvcnt(ior, item) = slots_used;
|
||||
#elif MDBX_HAVE_PWRITEV
|
||||
item->offset = offset;
|
||||
item->sgv[0].iov_base = data;
|
||||
item->sgv[0].iov_len = bytes;
|
||||
ior->last_bytes = bytes;
|
||||
ior_last_sgvcnt(ior, item) = slots_used;
|
||||
#else
|
||||
item->offset = offset;
|
||||
item->single.iov_base = data;
|
||||
item->single.iov_len = bytes;
|
||||
#endif /* !Windows */
|
||||
ior->slots_left -= slots_used;
|
||||
ior->last = item;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_walk(
|
||||
osal_ioring_t *ior, iov_ctx_t *ctx,
|
||||
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)) {
|
||||
for (ior_item_t *item = ior->pool; item <= ior->last;) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
size_t offset = ior_offset(item);
|
||||
char *data = item->single.iov_base;
|
||||
size_t bytes = item->single.iov_len - 1;
|
||||
size_t i = 1;
|
||||
if (bytes & 1) {
|
||||
data = Ptr64ToPtr(item->sgv[0].Buffer);
|
||||
bytes = ior->pagesize;
|
||||
while (item->sgv[i].Buffer) {
|
||||
if (data + ior->pagesize != item->sgv[i].Buffer) {
|
||||
callback(ctx, offset, data, bytes);
|
||||
offset += bytes;
|
||||
data = Ptr64ToPtr(item->sgv[i].Buffer);
|
||||
bytes = 0;
|
||||
}
|
||||
bytes += ior->pagesize;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
assert(bytes < MAX_WRITE);
|
||||
callback(ctx, offset, data, bytes);
|
||||
#elif MDBX_HAVE_PWRITEV
|
||||
assert(item->sgvcnt > 0);
|
||||
size_t offset = item->offset;
|
||||
size_t i = 0;
|
||||
do {
|
||||
callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len);
|
||||
offset += item->sgv[i].iov_len;
|
||||
} while (++i != item->sgvcnt);
|
||||
#else
|
||||
const size_t i = 1;
|
||||
callback(ctx, item->offset, item->single.iov_base, item->single.iov_len);
|
||||
#endif
|
||||
item = ior_next(item, i);
|
||||
}
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
|
||||
osal_ioring_write(osal_ioring_t *ior) {
|
||||
osal_ioring_write_result_t r = {MDBX_SUCCESS, 0};
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
HANDLE *const end_wait_for =
|
||||
ior->event_pool + ior->allocated +
|
||||
/* был выделен один дополнительный элемент для async_done */ 1;
|
||||
HANDLE *wait_for = end_wait_for;
|
||||
LONG async_started = 0;
|
||||
for (ior_item_t *item = ior->pool; item <= ior->last;) {
|
||||
item->ov.Internal = STATUS_PENDING;
|
||||
size_t i = 1, bytes = item->single.iov_len - 1;
|
||||
r.wops += 1;
|
||||
if (bytes & 1) {
|
||||
bytes = ior->pagesize;
|
||||
while (item->sgv[i].Buffer) {
|
||||
bytes += ior->pagesize;
|
||||
++i;
|
||||
}
|
||||
assert(bytes < MAX_WRITE);
|
||||
item->ov.hEvent = ior_get_event(ior);
|
||||
if (unlikely(!item->ov.hEvent)) {
|
||||
bailout_geterr:
|
||||
r.err = GetLastError();
|
||||
bailout_rc:
|
||||
assert(r.err != MDBX_SUCCESS);
|
||||
CancelIo(ior->fd);
|
||||
return r;
|
||||
}
|
||||
if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr,
|
||||
&item->ov)) {
|
||||
assert(item->ov.Internal == 0 &&
|
||||
WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0);
|
||||
ior_put_event(ior, item->ov.hEvent);
|
||||
item->ov.hEvent = 0;
|
||||
} else {
|
||||
r.err = (int)GetLastError();
|
||||
if (unlikely(r.err != ERROR_IO_PENDING)) {
|
||||
ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
|
||||
", err %d",
|
||||
"WriteFileGather", ior->fd, item, item - ior->pool,
|
||||
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
|
||||
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
|
||||
goto bailout_rc;
|
||||
}
|
||||
assert(wait_for > ior->event_pool + ior->event_stack);
|
||||
*--wait_for = item->ov.hEvent;
|
||||
}
|
||||
} else if (ior->flags & IOR_OVERLAPPED) {
|
||||
assert(bytes < MAX_WRITE);
|
||||
retry:
|
||||
item->ov.hEvent = ior;
|
||||
if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov,
|
||||
ior_wocr)) {
|
||||
async_started += 1;
|
||||
} else {
|
||||
r.err = (int)GetLastError();
|
||||
switch (r.err) {
|
||||
default:
|
||||
ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
|
||||
", err %d",
|
||||
"WriteFileEx", ior->fd, item, item - ior->pool,
|
||||
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
|
||||
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
|
||||
goto bailout_rc;
|
||||
case ERROR_NOT_FOUND:
|
||||
case ERROR_USER_MAPPED_FILE:
|
||||
case ERROR_LOCK_VIOLATION:
|
||||
WARNING(
|
||||
"%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
|
||||
", err %d",
|
||||
"WriteFileEx", ior->fd, item, item - ior->pool,
|
||||
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
|
||||
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
|
||||
SleepEx(0, true);
|
||||
goto retry;
|
||||
case ERROR_INVALID_USER_BUFFER:
|
||||
case ERROR_NOT_ENOUGH_MEMORY:
|
||||
if (SleepEx(0, true) == WAIT_IO_COMPLETION)
|
||||
goto retry;
|
||||
goto bailout_rc;
|
||||
case ERROR_IO_PENDING:
|
||||
async_started += 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(bytes < MAX_WRITE);
|
||||
DWORD written = 0;
|
||||
if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written,
|
||||
&item->ov)) {
|
||||
r.err = (int)GetLastError();
|
||||
ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
|
||||
", err %d",
|
||||
"WriteFile", ior->fd, item, item - ior->pool,
|
||||
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
|
||||
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
|
||||
goto bailout_rc;
|
||||
} else if (unlikely(written != bytes)) {
|
||||
r.err = ERROR_WRITE_FAULT;
|
||||
goto bailout_rc;
|
||||
}
|
||||
}
|
||||
item = ior_next(item, i);
|
||||
}
|
||||
|
||||
assert(ior->async_waiting > ior->async_completed &&
|
||||
ior->async_waiting == INT_MAX);
|
||||
ior->async_waiting = async_started;
|
||||
if (async_started > ior->async_completed && end_wait_for == wait_for) {
|
||||
assert(wait_for > ior->event_pool + ior->event_stack);
|
||||
*--wait_for = ior->async_done;
|
||||
}
|
||||
|
||||
const size_t pending_count = end_wait_for - wait_for;
|
||||
if (pending_count) {
|
||||
/* Ждем до MAXIMUM_WAIT_OBJECTS (64) последних хендлов, а после избирательно
|
||||
* ждем посредством GetOverlappedResult(), если какие-то более ранние
|
||||
* элементы еще не завершены. В целом, так получается меньше системных
|
||||
* вызовов, т.е. меньше накладных расходов. Однако, не факт что эта экономия
|
||||
* не будет перекрыта неэффективностью реализации
|
||||
* WaitForMultipleObjectsEx(), но тогда это проблемы на стороне M$. */
|
||||
DWORD madness;
|
||||
do
|
||||
madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS)
|
||||
? (DWORD)pending_count
|
||||
: MAXIMUM_WAIT_OBJECTS,
|
||||
wait_for, true,
|
||||
/* сутки */ 86400000ul, true);
|
||||
while (madness == WAIT_IO_COMPLETION);
|
||||
STATIC_ASSERT(WAIT_OBJECT_0 == 0);
|
||||
if (/* madness >= WAIT_OBJECT_0 && */
|
||||
madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS)
|
||||
r.err = MDBX_SUCCESS;
|
||||
else if (madness >= WAIT_ABANDONED_0 &&
|
||||
madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) {
|
||||
r.err = ERROR_ABANDONED_WAIT_0;
|
||||
goto bailout_rc;
|
||||
} else if (madness == WAIT_TIMEOUT) {
|
||||
r.err = WAIT_TIMEOUT;
|
||||
goto bailout_rc;
|
||||
} else {
|
||||
r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM;
|
||||
goto bailout_rc;
|
||||
}
|
||||
|
||||
assert(ior->async_waiting == ior->async_completed);
|
||||
for (ior_item_t *item = ior->pool; item <= ior->last;) {
|
||||
size_t i = 1, bytes = item->single.iov_len - 1;
|
||||
if (bytes & 1) {
|
||||
bytes = ior->pagesize;
|
||||
while (item->sgv[i].Buffer) {
|
||||
bytes += ior->pagesize;
|
||||
++i;
|
||||
}
|
||||
if (!HasOverlappedIoCompleted(&item->ov)) {
|
||||
DWORD written = 0;
|
||||
if (unlikely(
|
||||
!GetOverlappedResult(ior->fd, &item->ov, &written, true))) {
|
||||
ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
|
||||
", err %d",
|
||||
"GetOverlappedResult", item, item - ior->pool,
|
||||
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
|
||||
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
|
||||
GetLastError());
|
||||
goto bailout_geterr;
|
||||
}
|
||||
assert(MDBX_SUCCESS == item->ov.Internal);
|
||||
assert(written == item->ov.InternalHigh);
|
||||
}
|
||||
} else {
|
||||
assert(HasOverlappedIoCompleted(&item->ov));
|
||||
}
|
||||
assert(item->ov.Internal != ERROR_IO_PENDING);
|
||||
if (unlikely(item->ov.Internal != MDBX_SUCCESS)) {
|
||||
DWORD written = 0;
|
||||
r.err = (int)item->ov.Internal;
|
||||
if ((r.err & 0x80000000) &&
|
||||
GetOverlappedResult(NULL, &item->ov, &written, true))
|
||||
r.err = (int)GetLastError();
|
||||
ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
|
||||
", err %d",
|
||||
"Result", item, item - ior->pool,
|
||||
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
|
||||
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
|
||||
GetLastError());
|
||||
goto bailout_rc;
|
||||
}
|
||||
if (unlikely(item->ov.InternalHigh != bytes)) {
|
||||
r.err = ERROR_WRITE_FAULT;
|
||||
goto bailout_rc;
|
||||
}
|
||||
item = ior_next(item, i);
|
||||
}
|
||||
assert(ior->async_waiting == ior->async_completed);
|
||||
} else {
|
||||
assert(r.err == MDBX_SUCCESS);
|
||||
}
|
||||
assert(ior->async_waiting == ior->async_completed);
|
||||
|
||||
#else
|
||||
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
|
||||
"libmdbx requires 64-bit file I/O on 64-bit systems");
|
||||
for (ior_item_t *item = ior->pool; item <= ior->last;) {
|
||||
#if MDBX_HAVE_PWRITEV
|
||||
assert(item->sgvcnt > 0);
|
||||
if (item->sgvcnt == 1)
|
||||
r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len,
|
||||
item->offset);
|
||||
else
|
||||
r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset);
|
||||
|
||||
// TODO: io_uring_prep_write(sqe, fd, ...);
|
||||
|
||||
item = ior_next(item, item->sgvcnt);
|
||||
#else
|
||||
r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len,
|
||||
item->offset);
|
||||
item = ior_next(item, 1);
|
||||
#endif
|
||||
r.wops += 1;
|
||||
if (unlikely(r.err != MDBX_SUCCESS))
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: io_uring_submit(&ring)
|
||||
// TODO: err = io_uring_wait_cqe(&ring, &cqe);
|
||||
// TODO: io_uring_cqe_seen(&ring, cqe);
|
||||
|
||||
#endif /* !Windows */
|
||||
return r;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (ior->last) {
|
||||
for (ior_item_t *item = ior->pool; item <= ior->last;) {
|
||||
if (!HasOverlappedIoCompleted(&item->ov))
|
||||
CancelIoEx(ior->fd, &item->ov);
|
||||
if (item->ov.hEvent && item->ov.hEvent != ior)
|
||||
ior_put_event(ior, item->ov.hEvent);
|
||||
size_t i = 1;
|
||||
if ((item->single.iov_len & 1) == 0)
|
||||
while (item->sgv[i].Buffer)
|
||||
++i;
|
||||
item = ior_next(item, i);
|
||||
}
|
||||
}
|
||||
ior->async_waiting = INT_MAX;
|
||||
ior->async_completed = 0;
|
||||
ResetEvent(ior->async_done);
|
||||
#endif /* !Windows */
|
||||
ior->slots_left = ior->allocated;
|
||||
ior->last = nullptr;
|
||||
}
|
||||
|
||||
static void ior_cleanup(osal_ioring_t *ior, const size_t since) {
|
||||
osal_ioring_reset(ior);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
for (size_t i = since; i < ior->event_stack; ++i)
|
||||
CloseHandle(ior->event_pool[i]);
|
||||
ior->event_stack = 0;
|
||||
#else
|
||||
(void)since;
|
||||
#endif /* Windows */
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) {
|
||||
assert(items > 0 && items < INT_MAX / sizeof(ior_item_t));
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (ior->state & IOR_STATE_LOCKED)
|
||||
return MDBX_SUCCESS;
|
||||
const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) &&
|
||||
mdbx_SetFileIoOverlappedRange &&
|
||||
items > 7;
|
||||
const size_t ceiling =
|
||||
useSetFileIoOverlappedRange
|
||||
? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4)
|
||||
: 4096;
|
||||
const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling);
|
||||
items = bytes / sizeof(ior_item_t);
|
||||
#endif /* Windows */
|
||||
|
||||
if (items != ior->allocated) {
|
||||
assert(items >= osal_ioring_used(ior));
|
||||
if (items < ior->allocated)
|
||||
ior_cleanup(ior, items);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
void *ptr = osal_realloc(
|
||||
ior->event_pool,
|
||||
(items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE));
|
||||
if (unlikely(!ptr))
|
||||
return MDBX_ENOMEM;
|
||||
ior->event_pool = ptr;
|
||||
|
||||
int err = osal_memalign_alloc(ceiling, bytes, &ptr);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
if (ior->pool) {
|
||||
memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t));
|
||||
osal_memalign_free(ior->pool);
|
||||
}
|
||||
#else
|
||||
void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items);
|
||||
if (unlikely(!ptr))
|
||||
return MDBX_ENOMEM;
|
||||
#endif
|
||||
ior->pool = ptr;
|
||||
|
||||
if (items > ior->allocated)
|
||||
memset(ior->pool + ior->allocated, 0,
|
||||
sizeof(ior_item_t) * (items - ior->allocated));
|
||||
ior->allocated = (unsigned)items;
|
||||
ior->boundary = (char *)(ior->pool + ior->allocated);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (useSetFileIoOverlappedRange) {
|
||||
if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes))
|
||||
ior->state += IOR_STATE_LOCKED;
|
||||
else
|
||||
return GetLastError();
|
||||
}
|
||||
#endif /* Windows */
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) {
|
||||
if (ior->allocated)
|
||||
ior_cleanup(ior, 0);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
osal_memalign_free(ior->pool);
|
||||
osal_free(ior->event_pool);
|
||||
CloseHandle(ior->async_done);
|
||||
#else
|
||||
osal_free(ior->pool);
|
||||
#endif
|
||||
memset(ior, -1, sizeof(osal_ioring_t));
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
|
||||
@ -589,17 +1179,21 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
|
||||
case MDBX_OPEN_DXB_LAZY:
|
||||
DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
|
||||
break;
|
||||
case MDBX_OPEN_DXB_OVERLAPPED:
|
||||
FlagsAndAttributes |= FILE_FLAG_OVERLAPPED;
|
||||
/* fall through */
|
||||
__fallthrough;
|
||||
case MDBX_OPEN_DXB_DSYNC:
|
||||
CreationDisposition = OPEN_EXISTING;
|
||||
DesiredAccess |= GENERIC_WRITE;
|
||||
DesiredAccess |= GENERIC_WRITE | GENERIC_READ;
|
||||
FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH;
|
||||
break;
|
||||
case MDBX_OPEN_COPY:
|
||||
CreationDisposition = CREATE_NEW;
|
||||
ShareMode = 0;
|
||||
DesiredAccess |= GENERIC_WRITE;
|
||||
FlagsAndAttributes |=
|
||||
(env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING;
|
||||
if (env->me_psize >= env->me_os_psize)
|
||||
FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING;
|
||||
break;
|
||||
case MDBX_OPEN_DELETE:
|
||||
CreationDisposition = OPEN_EXISTING;
|
||||
@ -878,28 +1472,30 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
|
||||
}
|
||||
}
|
||||
|
||||
int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
|
||||
uint64_t offset, size_t expected_written) {
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \
|
||||
(defined(__ANDROID_API__) && __ANDROID_API__ < 24)
|
||||
int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int sgvcnt,
|
||||
uint64_t offset) {
|
||||
size_t expected = 0;
|
||||
for (int i = 0; i < sgvcnt; ++i)
|
||||
expected += iov[i].iov_len;
|
||||
#if !MDBX_HAVE_PWRITEV
|
||||
size_t written = 0;
|
||||
for (int i = 0; i < iovcnt; ++i) {
|
||||
for (int i = 0; i < sgvcnt; ++i) {
|
||||
int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
written += iov[i].iov_len;
|
||||
offset += iov[i].iov_len;
|
||||
}
|
||||
return (expected_written == written) ? MDBX_SUCCESS
|
||||
: MDBX_EIO /* ERROR_WRITE_FAULT */;
|
||||
return (expected == written) ? MDBX_SUCCESS
|
||||
: MDBX_EIO /* ERROR_WRITE_FAULT */;
|
||||
#else
|
||||
int rc;
|
||||
intptr_t written;
|
||||
do {
|
||||
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
|
||||
"libmdbx requires 64-bit file I/O on 64-bit systems");
|
||||
written = pwritev(fd, iov, iovcnt, offset);
|
||||
if (likely(expected_written == (size_t)written))
|
||||
written = pwritev(fd, iov, sgvcnt, offset);
|
||||
if (likely(expected == (size_t)written))
|
||||
return MDBX_SUCCESS;
|
||||
rc = errno;
|
||||
} while (rc == EINTR);
|
||||
@ -1066,7 +1662,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) {
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
|
||||
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
|
||||
size_t length,
|
||||
enum osal_syncmode_bits mode_bits) {
|
||||
uint8_t *ptr = (uint8_t *)map->address + offset;
|
||||
|
163
src/osal.h
163
src/osal.h
@ -263,8 +263,138 @@ typedef union osal_srwlock {
|
||||
} osal_srwlock_t;
|
||||
#endif /* Windows */
|
||||
|
||||
#ifndef MDBX_HAVE_PWRITEV
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
#define MDBX_HAVE_PWRITEV 0
|
||||
|
||||
#elif defined(__ANDROID_API__)
|
||||
|
||||
#if __ANDROID_API__ < 24
|
||||
#define MDBX_HAVE_PWRITEV 0
|
||||
#else
|
||||
#define MDBX_HAVE_PWRITEV 1
|
||||
#endif
|
||||
|
||||
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
|
||||
|
||||
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
|
||||
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
|
||||
/* FIXME: add checks for IOS versions, etc */
|
||||
#define MDBX_HAVE_PWRITEV 1
|
||||
#else
|
||||
#define MDBX_HAVE_PWRITEV 0
|
||||
#endif
|
||||
|
||||
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
|
||||
#define MDBX_HAVE_PWRITEV 1
|
||||
#else
|
||||
#define MDBX_HAVE_PWRITEV 0
|
||||
#endif
|
||||
#endif /* MDBX_HAVE_PWRITEV */
|
||||
|
||||
typedef struct ior_item {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
OVERLAPPED ov;
|
||||
#define ior_svg_gap4terminator 1
|
||||
#define ior_sgv_element FILE_SEGMENT_ELEMENT
|
||||
#else
|
||||
size_t offset;
|
||||
#if MDBX_HAVE_PWRITEV
|
||||
size_t sgvcnt;
|
||||
#define ior_svg_gap4terminator 0
|
||||
#define ior_sgv_element struct iovec
|
||||
#endif /* MDBX_HAVE_PWRITEV */
|
||||
#endif /* !Windows */
|
||||
union {
|
||||
MDBX_val single;
|
||||
#if defined(ior_sgv_element)
|
||||
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
|
||||
#endif /* ior_sgv_element */
|
||||
};
|
||||
} ior_item_t;
|
||||
|
||||
typedef struct osal_ioring {
|
||||
unsigned slots_left;
|
||||
unsigned allocated;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define IOR_UNBUFFERED 1
|
||||
#define IOR_OVERLAPPED 2
|
||||
#define IOR_STATE_LOCKED 1
|
||||
unsigned pagesize;
|
||||
unsigned last_sgvcnt;
|
||||
size_t last_bytes;
|
||||
uint8_t flags, state, pagesize_ln2;
|
||||
unsigned event_stack;
|
||||
HANDLE *event_pool;
|
||||
volatile LONG async_waiting;
|
||||
volatile LONG async_completed;
|
||||
HANDLE async_done;
|
||||
|
||||
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
|
||||
#define ior_last_bytes(ior, item) (ior)->last_bytes
|
||||
#elif MDBX_HAVE_PWRITEV
|
||||
unsigned last_bytes;
|
||||
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
|
||||
#define ior_last_bytes(ior, item) (ior)->last_bytes
|
||||
#else
|
||||
#define ior_last_sgvcnt(ior, item) (1)
|
||||
#define ior_last_bytes(ior, item) (item)->single.iov_len
|
||||
#endif /* !Windows */
|
||||
mdbx_filehandle_t fd;
|
||||
ior_item_t *last;
|
||||
ior_item_t *pool;
|
||||
char *boundary;
|
||||
} osal_ioring_t;
|
||||
|
||||
#ifndef __cplusplus
|
||||
|
||||
/* Actually this is not ioring for now, but on the way. */
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
unsigned flags,
|
||||
#endif /* Windows */
|
||||
mdbx_filehandle_t fd);
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
|
||||
void *data, const size_t bytes);
|
||||
typedef struct osal_ioring_write_result {
|
||||
int err;
|
||||
unsigned wops;
|
||||
} osal_ioring_write_result_t;
|
||||
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
|
||||
osal_ioring_write(osal_ioring_t *ior);
|
||||
|
||||
typedef struct iov_ctx iov_ctx_t;
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_walk(
|
||||
osal_ioring_t *ior, iov_ctx_t *ctx,
|
||||
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
|
||||
|
||||
static inline unsigned osal_ioring_left(const osal_ioring_t *ior) {
|
||||
return ior->slots_left;
|
||||
}
|
||||
|
||||
static inline unsigned osal_ioring_used(const osal_ioring_t *ior) {
|
||||
return ior->allocated - ior->slots_left;
|
||||
}
|
||||
|
||||
static inline int osal_ioring_reserve(osal_ioring_t *ior, unsigned items,
|
||||
size_t bytes) {
|
||||
items = (items > 32) ? items : 32;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
const unsigned npages = (unsigned)(bytes >> ior->pagesize_ln2);
|
||||
items = (items > npages) ? items : npages;
|
||||
#else
|
||||
(void)bytes;
|
||||
#endif
|
||||
items = (items < 65536) ? items : 65536;
|
||||
if (likely(ior->allocated >= items))
|
||||
return MDBX_SUCCESS;
|
||||
return osal_ioring_resize(ior, items);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* libc compatibility stuff */
|
||||
|
||||
@ -290,10 +420,12 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
|
||||
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
|
||||
|
||||
/* max bytes to write in one call */
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define MAX_WRITE UINT32_C(0x01000000)
|
||||
#if defined(_WIN64)
|
||||
#define MAX_WRITE UINT32_C(0x10000000)
|
||||
#elif defined(_WIN32)
|
||||
#define MAX_WRITE UINT32_C(0x04000000)
|
||||
#else
|
||||
#define MAX_WRITE UINT32_C(0x3fff0000)
|
||||
#define MAX_WRITE UINT32_C(0x3f000000)
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
@ -336,8 +468,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
|
||||
int iovcnt, uint64_t offset,
|
||||
size_t expected_written);
|
||||
int sgvcnt, uint64_t offset);
|
||||
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
|
||||
uint64_t offset);
|
||||
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
|
||||
@ -365,12 +496,15 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
|
||||
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
|
||||
|
||||
enum osal_openfile_purpose {
|
||||
MDBX_OPEN_DXB_READ = 0,
|
||||
MDBX_OPEN_DXB_LAZY = 1,
|
||||
MDBX_OPEN_DXB_DSYNC = 2,
|
||||
MDBX_OPEN_LCK = 3,
|
||||
MDBX_OPEN_COPY = 4,
|
||||
MDBX_OPEN_DELETE = 5
|
||||
MDBX_OPEN_DXB_READ,
|
||||
MDBX_OPEN_DXB_LAZY,
|
||||
MDBX_OPEN_DXB_DSYNC,
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
MDBX_OPEN_DXB_OVERLAPPED,
|
||||
#endif /* Windows */
|
||||
MDBX_OPEN_LCK,
|
||||
MDBX_OPEN_COPY,
|
||||
MDBX_OPEN_DELETE
|
||||
};
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
|
||||
@ -404,7 +538,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
|
||||
MDBX_INTERNAL_FUNC int
|
||||
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
|
||||
#endif /* Windows */
|
||||
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
|
||||
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
|
||||
size_t length,
|
||||
enum osal_syncmode_bits mode_bits);
|
||||
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
|
||||
@ -692,6 +826,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
|
||||
|
||||
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
|
||||
|
||||
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
|
||||
PUCHAR OverlappedRangeStart,
|
||||
ULONG Length);
|
||||
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
|
||||
|
||||
#endif /* Windows */
|
||||
|
||||
#endif /* !__cplusplus */
|
||||
|
@ -71,7 +71,7 @@ void osal_setup(const std::vector<actor_config> &actors) {
|
||||
events.reserve(n);
|
||||
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
HANDLE hEvent = CreateEventW(NULL, TRUE, FALSE, NULL);
|
||||
if (!hEvent)
|
||||
failure_perror("CreateEvent()", GetLastError());
|
||||
hEvent = make_inheritable(hEvent);
|
||||
@ -79,22 +79,22 @@ void osal_setup(const std::vector<actor_config> &actors) {
|
||||
events[i] = hEvent;
|
||||
}
|
||||
|
||||
hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL);
|
||||
hBarrierSemaphore = CreateSemaphoreW(NULL, 0, (LONG)actors.size(), NULL);
|
||||
if (!hBarrierSemaphore)
|
||||
failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError());
|
||||
hBarrierSemaphore = make_inheritable(hBarrierSemaphore);
|
||||
|
||||
hBarrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
hBarrierEvent = CreateEventW(NULL, TRUE, FALSE, NULL);
|
||||
if (!hBarrierEvent)
|
||||
failure_perror("CreateEvent(BarrierEvent)", GetLastError());
|
||||
hBarrierEvent = make_inheritable(hBarrierEvent);
|
||||
|
||||
hProgressActiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
|
||||
hProgressActiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL);
|
||||
if (!hProgressActiveEvent)
|
||||
failure_perror("CreateEvent(ProgressActiveEvent)", GetLastError());
|
||||
hProgressActiveEvent = make_inheritable(hProgressActiveEvent);
|
||||
|
||||
hProgressPassiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
|
||||
hProgressPassiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL);
|
||||
if (!hProgressPassiveEvent)
|
||||
failure_perror("CreateEvent(ProgressPassiveEvent)", GetLastError());
|
||||
hProgressPassiveEvent = make_inheritable(hProgressPassiveEvent);
|
||||
|
Loading…
Reference in New Issue
Block a user