mdbx: asynchronous I/O support for Windows and groundwork for io_ring (squashed commits and fixes).

Леонид Юрьев (Leonid Yuriev) 2022-09-25 12:47:31 +03:00
parent 9f64e2a10c
commit 474391c83c
9 changed files with 1323 additions and 399 deletions


@@ -11,7 +11,6 @@ For the same reason ~~Github~~ is blacklisted forever.
 So currently most of the links are broken due to noted malicious ~~Github~~ sabotage.
-- [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224).
 - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/204).
 - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210).
 - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200).
@@ -27,3 +26,4 @@ Done
 ----
 - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223).
+- [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224).

mdbx.h

@@ -2522,9 +2522,13 @@ struct MDBX_envinfo {
     uint64_t unspill; /**< Quantity of unspilled/reloaded pages */
     uint64_t wops;    /**< Number of explicit write operations (not a pages)
                            to a disk */
+    uint64_t
+        msync; /**< Number of explicit msync-to-disk operations (not a pages) */
+    uint64_t
+        fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */
     uint64_t
         gcrtime_seconds16dot16; /**< Time spent loading and searching inside
-                                     GC (aka FreeDB) in 1/65536 of second. */
+                                     GC (aka FreeDB) in 1/65536 of second */
   } mi_pgop_stat;
 };
 #ifndef __cplusplus
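Note: the two new counters are visible to applications through `mdbx_env_info_ex()`. A minimal reading sketch (my illustration, not part of this commit; error handling trimmed):

    #include "mdbx.h"
    #include <inttypes.h>
    #include <stdio.h>

    /* Print the new sync counters; `env` is an already-opened MDBX_env. */
    static void print_sync_stat(MDBX_env *env) {
      MDBX_envinfo info;
      int rc = mdbx_env_info_ex(env, /*txn=*/NULL, &info, sizeof(info));
      if (rc == MDBX_SUCCESS)
        printf("msync: %" PRIu64 ", fsync: %" PRIu64 "\n",
               info.mi_pgop_stat.msync, info.mi_pgop_stat.fsync);
    }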


@@ -63,7 +63,7 @@
 #define SSIZE_MAX INTPTR_MAX
 #endif
-#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
+#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
 #define MDBX_WORDBITS 64
 #else
 #define MDBX_WORDBITS 32
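Note: the extra `defined(_WIN64)` term pins the word size to 64 on Windows builds where the integer-limit checks alone may be inconclusive for the preprocessor. A quick compile-time cross-check one could add (my sketch, using C11 `_Static_assert` rather than the project's own STATIC_ASSERT macro):

    #include <stdint.h>
    /* MDBX_WORDBITS should agree with the pointer width on all supported
     * targets, including LLP64 Windows. */
    _Static_assert(MDBX_WORDBITS == (int)sizeof(void *) * 8,
                   "MDBX_WORDBITS must match the pointer width");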


@@ -3983,13 +3983,12 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno,
     while (--npages) {
       iov[n] = iov[0];
       if (++n == MDBX_COMMIT_PAGES) {
-        osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off,
-                     pgno2bytes(env, MDBX_COMMIT_PAGES));
+        osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off);
         iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES);
         n = 0;
       }
     }
-    osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n));
+    osal_pwritev(env->me_lazy_fd, iov, n, iov_off);
   }
 }
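Note: `osal_pwritev()` drops its redundant total-bytes argument here; the real implementation lives in the project's osal layer. For reference, a hypothetical POSIX-flavoured sketch of such a wrapper (my illustration only):

    #include <errno.h>
    #include <sys/uio.h>

    /* Write `iovcnt` buffers at `offset`, deriving the total from the iovec
     * array itself; retries on EINTR and partial writes. Returns 0 or errno. */
    static int pwritev_all(int fd, struct iovec *iov, int iovcnt, off_t offset) {
      size_t left = 0;
      for (int i = 0; i < iovcnt; ++i)
        left += iov[i].iov_len;
      while (left != 0) {
        ssize_t written = pwritev(fd, iov, iovcnt, offset);
        if (written < 0) {
          if (errno == EINTR)
            continue;
          return errno;
        }
        left -= (size_t)written;
        offset += written;
        /* skip buffers that were written completely */
        while (iovcnt > 0 && (size_t)written >= iov->iov_len) {
          written -= (ssize_t)iov->iov_len;
          ++iov;
          --iovcnt;
        }
        /* trim the first partially-written buffer */
        if (iovcnt > 0 && written > 0) {
          iov->iov_base = (char *)iov->iov_base + written;
          iov->iov_len -= (size_t)written;
        }
      }
      return 0;
    }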
@@ -4318,139 +4317,155 @@ static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) {
   return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags);
 }

-struct iov_ctx {
-  unsigned iov_items;
-  size_t iov_bytes;
-  size_t iov_off;
-  pgno_t flush_begin;
-  pgno_t flush_end;
-  struct iovec iov[MDBX_COMMIT_PAGES];
-};
-
-static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) {
-  ctx->flush_begin = MAX_PAGENO;
-  ctx->flush_end = MIN_PAGENO;
-  ctx->iov_items = 0;
-  ctx->iov_bytes = 0;
-  ctx->iov_off = 0;
-  (void)txn;
-}
-
-static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) {
-  tASSERT(txn, ctx->iov_items == 0);
-#if defined(__linux__) || defined(__gnu_linux__)
-  MDBX_env *const env = txn->mt_env;
-  if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00)
-    /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
-     * arguments, making this function fairly expensive. Therefore, the
-     * whole cache is always flushed. */
-    osal_flush_incoherent_mmap(
-        env->me_map + pgno2bytes(env, ctx->flush_begin),
-        pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize);
-#endif /* Linux */
-}
-
-static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) {
-  tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
-  tASSERT(txn, ctx->iov_items > 0);
-
-  MDBX_env *const env = txn->mt_env;
-  int rc;
-  if (likely(ctx->iov_items == 1)) {
-    eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len);
-    rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len,
-                     ctx->iov_off);
-  } else {
-    rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off,
-                      ctx->iov_bytes);
-  }
-
-  if (unlikely(rc != MDBX_SUCCESS))
-    ERROR("Write error: %s", mdbx_strerror(rc));
-  else {
-    VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off,
-                              ctx->iov_bytes);
-    MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off,
-                                     ctx->iov_bytes);
-  }
-
-  unsigned iov_items = ctx->iov_items;
-#if MDBX_ENABLE_PGOP_STAT
-  txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items;
-#endif /* MDBX_ENABLE_PGOP_STAT */
-  ctx->iov_items = 0;
-  ctx->iov_bytes = 0;
-
-  uint64_t timestamp = 0;
-  for (unsigned i = 0; i < iov_items; i++) {
-    MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base;
-    const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno);
-    /* check with timeout as the workaround
-     * for todo4recovery://erased_by_github/libmdbx/issues/269 */
-    while (likely(rc == MDBX_SUCCESS) &&
-           unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) {
-      if (!timestamp) {
-        iov_done(txn, ctx);
-        WARNING(
-            "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno,
-            "(workaround for incoherent flaw of unified page/buffer cache)");
-      }
-      if (coherency_timeout(&timestamp, wp->mp_pgno) != MDBX_RESULT_TRUE)
-        rc = MDBX_PROBLEM;
-    }
-    dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len));
-  }
-  return rc;
-}
+typedef struct iov_ctx {
+  MDBX_env *env;
+  osal_ioring_t *ior;
+  int err;
+#ifndef MDBX_NEED_WRITTEN_RANGE
+#define MDBX_NEED_WRITTEN_RANGE 1
+#endif /* MDBX_NEED_WRITTEN_RANGE */
+#if MDBX_NEED_WRITTEN_RANGE
+  pgno_t flush_begin;
+  pgno_t flush_end;
+#endif /* MDBX_NEED_WRITTEN_RANGE */
+  uint64_t coherency_timestamp;
+} iov_ctx_t;
+
+__must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx,
+                                        unsigned items, pgno_t npages) {
+  ctx->env = txn->mt_env;
+  ctx->ior = &txn->mt_env->me_ioring;
+  ctx->err = osal_ioring_reserve(ctx->ior, items,
+                                 pgno_align2os_bytes(txn->mt_env, npages));
+  if (likely(ctx->err == MDBX_SUCCESS)) {
+#if MDBX_NEED_WRITTEN_RANGE
+    ctx->flush_begin = MAX_PAGENO;
+    ctx->flush_end = MIN_PAGENO;
+#endif /* MDBX_NEED_WRITTEN_RANGE */
+    osal_ioring_reset(ctx->ior);
+  }
+  return ctx->err;
+}
+
+static inline bool iov_empty(const iov_ctx_t *ctx) {
+  return osal_ioring_used(ctx->ior) == 0;
+}
+
+static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data,
+                                    size_t bytes) {
+  MDBX_env *const env = ctx->env;
+  eASSERT(env, (env->me_flags & MDBX_WRITEMAP) == 0);
+
+  MDBX_page *wp = (MDBX_page *)data;
+  eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset));
+  eASSERT(env, bytes2pgno(env, bytes) >= (IS_OVERFLOW(wp) ? wp->mp_pages : 1u));
+  eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0);
+
+  if (likely(ctx->err == MDBX_SUCCESS)) {
+    VALGRIND_MAKE_MEM_DEFINED(env->me_map + offset, bytes);
+    MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map + offset, bytes);
+    osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize);
+    const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset);
+    /* check with timeout as the workaround
+     * for todo4recovery://erased_by_github/libmdbx/issues/269 */
+    if (unlikely(memcmp(wp, rp, bytes))) {
+      ctx->coherency_timestamp = 0;
+      WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno,
+              "(workaround for incoherent flaw of unified page/buffer cache)");
+      do
+        if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno) !=
+            MDBX_RESULT_TRUE) {
+          ctx->err = MDBX_PROBLEM;
+          break;
+        }
+      while (unlikely(memcmp(wp, rp, bytes)));
+    }
+  }
+
+  if (likely(bytes == env->me_psize))
+    dpage_free(env, wp, 1);
+  else {
+    do {
+      eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset));
+      eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0);
+      unsigned npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u;
+      size_t chunk = pgno2bytes(env, npages);
+      eASSERT(env, bytes >= chunk);
+      dpage_free(env, wp, npages);
+      wp = (MDBX_page *)((char *)wp + chunk);
+      offset += chunk;
+      bytes -= chunk;
+    } while (bytes);
+  }
+}
+
+static void iov_complete(iov_ctx_t *ctx) {
+  if ((ctx->env->me_flags & MDBX_WRITEMAP) == 0)
+    osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages);
+  osal_ioring_reset(ctx->ior);
+}
+
+__must_check_result static int iov_write(iov_ctx_t *ctx) {
+  eASSERT(ctx->env, !iov_empty(ctx));
+  osal_ioring_write_result_t r = osal_ioring_write(ctx->ior);
+#if MDBX_ENABLE_PGOP_STAT
+  ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+  ctx->err = r.err;
+  if (unlikely(ctx->err != MDBX_SUCCESS))
+    ERROR("Write error: %s", mdbx_strerror(ctx->err));
+  iov_complete(ctx);
+  return ctx->err;
+}

-static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp,
-                    unsigned npages) {
+__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx,
+                                        MDBX_page *dp, unsigned npages) {
   MDBX_env *const env = txn->mt_env;
+  tASSERT(txn, ctx->err == MDBX_SUCCESS);
   tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno);
   tASSERT(txn, IS_MODIFIABLE(txn, dp));
   tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)));

-  ctx->flush_begin =
-      (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno;
-  ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages)
-                       ? ctx->flush_end
-                       : dp->mp_pgno + npages;
-  env->me_lck->mti_unsynced_pages.weak += npages;
-
   if (IS_SHADOWED(txn, dp)) {
     tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
     dp->mp_txnid = txn->mt_txnid;
     tASSERT(txn, IS_SPILLED(txn, dp));
-    const size_t size = pgno2bytes(env, npages);
-    if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) ||
-        ctx->iov_items == ARRAY_LENGTH(ctx->iov) ||
-        ctx->iov_bytes + size > MAX_WRITE) {
-      if (ctx->iov_items) {
-        int err = iov_write(txn, ctx);
-        if (unlikely(err != MDBX_SUCCESS))
-          return err;
-#if defined(__linux__) || defined(__gnu_linux__)
-        if (linux_kernel_version >= 0x02060b00)
-          /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
-           * arguments, making this function fairly expensive. Therefore, the
-           * whole cache is always flushed. */
-#endif /* Linux */
-          osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes,
-                                     env->me_os_psize);
-      }
-      ctx->iov_off = pgno2bytes(env, dp->mp_pgno);
-    }
-    ctx->iov[ctx->iov_items].iov_base = (void *)dp;
-    ctx->iov[ctx->iov_items].iov_len = size;
-    ctx->iov_items += 1;
-    ctx->iov_bytes += size;
+    int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp,
+                              pgno2bytes(env, npages));
+    if (unlikely(err != MDBX_SUCCESS)) {
+      ctx->err = err;
+      if (unlikely(err != MDBX_RESULT_TRUE)) {
+        iov_complete(ctx);
+        return err;
+      }
+      err = iov_write(ctx);
+      tASSERT(txn, iov_empty(ctx));
+      if (likely(err == MDBX_SUCCESS)) {
+        err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp,
+                              pgno2bytes(env, npages));
+        if (unlikely(err != MDBX_SUCCESS)) {
+          iov_complete(ctx);
+          return ctx->err = err;
+        }
+      }
+      tASSERT(txn, ctx->err == MDBX_SUCCESS);
+    }
   } else {
     tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP);
   }
+
+#if MDBX_NEED_WRITTEN_RANGE
+  ctx->flush_begin =
+      (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno;
+  ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages)
+                       ? ctx->flush_end
+                       : dp->mp_pgno + npages;
+#endif /* MDBX_NEED_WRITTEN_RANGE */
+  env->me_lck->mti_unsynced_pages.weak += npages;
   return MDBX_SUCCESS;
 }

-static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp,
+static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp,
                       unsigned npages) {
   tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
   pgno_t pgno = dp->mp_pgno;
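Note: taken together, the new write path is reserve ring capacity, add dirty pages, flush with one write, then walk the ring to free shadow pages. A condensed sketch of that call sequence using the entry points above (control flow simplified; `dpl_npages` is the dirty-list helper from mdbx.c, and this is not a drop-in replacement for the real callers):

    /* Condensed sketch of the ioring write cycle (cf. iov_init / iov_page /
     * iov_write above). */
    static int flush_dirty_pages(MDBX_txn *txn, unsigned items, pgno_t npages) {
      iov_ctx_t ctx;
      int rc = iov_init(txn, &ctx, items, npages); /* reserve + reset the ring */
      if (rc != MDBX_SUCCESS)
        return rc;
      MDBX_dpl *const dl = txn->tw.dirtylist;
      for (unsigned i = 1; rc == MDBX_SUCCESS && i <= dl->length; ++i)
        rc = iov_page(txn, &ctx, dl->items[i].ptr, dpl_npages(dl, i));
      if (rc == MDBX_SUCCESS && !iov_empty(&ctx))
        rc = iov_write(&ctx); /* single write, then walk + dpage_free */
      return rc;
    }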
@@ -4613,13 +4628,18 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
           txn->tw.dirtyroom, need);
   tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill);

-  struct iov_ctx ctx;
-  iov_init(txn, &ctx);
-
   int rc = MDBX_SUCCESS;
   if (txn->mt_flags & MDBX_WRITEMAP) {
     MDBX_dpl *const dl = txn->tw.dirtylist;
     const unsigned span = dl->length - txn->tw.loose_count;
     txn->tw.dirtyroom += span;
+    iov_ctx_t ctx;
+    rc = iov_init(txn, &ctx, wanna_spill,
+                  dl->pages_including_loose - txn->tw.loose_count);
+    if (unlikely(rc != MDBX_SUCCESS))
+      goto bailout;
     unsigned r, w;
     for (w = 0, r = 1; r <= dl->length; ++r) {
       MDBX_page *dp = dl->items[r].ptr;
@@ -4749,6 +4769,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
           prio2spill, prio2adjacent, spillable, wanna_spill, amount);
   tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);

+  iov_ctx_t ctx;
+  rc = iov_init(txn, &ctx, amount,
+                txn->tw.dirtylist->pages_including_loose -
+                    txn->tw.loose_count);
+  if (unlikely(rc != MDBX_SUCCESS))
+    goto bailout;
+
   unsigned prev_prio = 256;
   unsigned r, w, prio;
   pgno_t spilled_entries = 0, spilled_npages = 0;
@@ -4814,12 +4841,10 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
     txn->tw.dirtylist->pages_including_loose -= spilled_npages;
     tASSERT(txn, dirtylist_check(txn));

-    if (ctx.iov_items) {
-      /* iov_page() frees dirty-pages and reset iov_items in case of failure. */
+    if (!iov_empty(&ctx)) {
       tASSERT(txn, rc == MDBX_SUCCESS);
-      rc = iov_write(txn, &ctx);
+      rc = iov_write(&ctx);
     }

     if (unlikely(rc != MDBX_SUCCESS))
       goto bailout;
@@ -4827,9 +4852,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
     txn->mt_flags |= MDBX_TXN_SPILLS;
     NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries,
            txn->tw.dirtyroom);
-    iov_done(txn, &ctx);
   } else {
-    tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
+    tASSERT(txn, rc == MDBX_SUCCESS);
     for (unsigned i = 1; i <= dl->length; ++i) {
       MDBX_page *dp = dl->items[i].ptr;
       NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
@@ -5610,7 +5634,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
   if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) {
 #if MDBX_ENABLE_PGOP_STAT
-    env->me_lck->mti_pgop_stat.wops.weak += 1;
+    env->me_lck->mti_pgop_stat.msync.weak += 1;
 #endif /* MDBX_ENABLE_PGOP_STAT */
     rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno),
                     MDBX_SYNC_NONE);
@@ -5743,74 +5767,71 @@ __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno,
                     true);
 }

-static int meta_unsteady(MDBX_env *env, const txnid_t last_steady,
-                         MDBX_meta *const meta, mdbx_filehandle_t fd) {
-  const uint64_t wipe = MDBX_DATASIGN_NONE;
-  if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) {
-    WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
-            data_page(meta)->mp_pgno);
-    if (env->me_flags & MDBX_WRITEMAP)
-      unaligned_poke_u64(4, meta->mm_sign, wipe);
-    else
-      return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign),
-                         (uint8_t *)&meta->mm_sign - env->me_map);
-  }
-  return MDBX_SUCCESS;
-}
-
-__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) {
-  MDBX_env *const env = txn->mt_env;
-#if MDBX_ENABLE_PGOP_STAT
-  env->me_lck->mti_pgop_stat.wops.weak += 1;
-#endif /* MDBX_ENABLE_PGOP_STAT */
-  const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
-                                   ? env->me_dsync_fd
-                                   : env->me_lazy_fd;
-  int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd);
-  if (unlikely(err != MDBX_SUCCESS))
-    return err;
-  err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd);
-  if (unlikely(err != MDBX_SUCCESS))
-    return err;
-  err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd);
-  if (unlikely(err != MDBX_SUCCESS))
-    return err;
-  if (env->me_flags & MDBX_WRITEMAP) {
-    osal_flush_incoherent_cpu_writeback();
-    err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
-                     MDBX_SYNC_DATA);
-    if (unlikely(err != MDBX_SUCCESS))
-      return err;
-  } else {
-    if (fd == env->me_lazy_fd) {
-#if MDBX_USE_SYNCFILERANGE
-      static bool syncfilerange_unavailable;
-      if (!syncfilerange_unavailable &&
-          sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
-                          SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) {
-        err = errno;
-        if (ignore_enosys(err) == MDBX_RESULT_TRUE)
-          syncfilerange_unavailable = true;
-      }
-      if (syncfilerange_unavailable)
-#endif /* MDBX_USE_SYNCFILERANGE */
-        err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
-      if (unlikely(err != MDBX_SUCCESS))
-        return err;
-    }
-    osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
-                               env->me_os_psize);
-  }
+static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than,
+                         const pgno_t pgno) {
+  MDBX_meta *const meta = METAPAGE(env, pgno);
+  const txnid_t txnid = constmeta_txnid(meta);
+  if (unlikely(err != MDBX_SUCCESS) || !META_IS_STEADY(meta) ||
+      !(txnid < early_than))
+    return err;
+
+  WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno);
+  const uint64_t wipe = MDBX_DATASIGN_NONE;
+  const void *ptr = &wipe;
+  size_t bytes = sizeof(meta->mm_sign),
+         offset = (uint8_t *)&meta->mm_sign - env->me_map;
+  if (env->me_flags & MDBX_WRITEMAP) {
+    unaligned_poke_u64(4, meta->mm_sign, wipe);
+    osal_flush_incoherent_cpu_writeback();
+    err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
+                     MDBX_SYNC_DATA);
+    if (unlikely(err != MDBX_SUCCESS))
+      return err;
+    ptr = data_page(meta);
+    offset = (uint8_t *)ptr - env->me_map;
+    bytes = env->me_psize;
+  }
+
+#if MDBX_ENABLE_PGOP_STAT
+  env->me_lck->mti_pgop_stat.wops.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+  err = osal_pwrite(env->me_fd4meta, ptr, bytes, offset);
+  if (likely(err == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) {
+    err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.fsync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+  }
+  return err;
+}
+
+__cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) {
+  MDBX_env *const env = txn->mt_env;
+  int err = MDBX_SUCCESS;
+
+  /* earlier than last_steady */
+  err = meta_unsteady(err, env, last_steady, 0);
+  err = meta_unsteady(err, env, last_steady, 1);
+  err = meta_unsteady(err, env, last_steady, 2);
+
+  /* the last_steady */
+  err = meta_unsteady(err, env, last_steady + 1, 0);
+  err = meta_unsteady(err, env, last_steady + 1, 1);
+  err = meta_unsteady(err, env, last_steady + 1, 2);
+
+  osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
+                             env->me_os_psize);

   /* force oldest refresh */
   atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
+
   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
   txn->tw.troika = meta_tap(env);
   for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child)
     if (scan != txn)
       scan->tw.troika = txn->tw.troika;
-  return MDBX_SUCCESS;
+  return err;
 }

 //------------------------------------------------------------------------------
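Note: `meta_unsteady()` now threads the error code through its first parameter, so `wipe_steady()` can chain six calls and let the first failure short-circuit the rest. The idiom in isolation (a generic sketch, not mdbx code; `do_work` is a hypothetical stand-in):

    /* Each step becomes a no-op once err is non-zero, preserving the
     * earliest failure. */
    static int do_work(int input) { return input >= 0 ? 0 : -1; }

    static int step(int err, int input) {
      if (err != 0)
        return err; /* short-circuit: keep the first error */
      return do_work(input);
    }

    static int run(void) {
      int err = 0;
      err = step(err, 0);
      err = step(err, 1);
      err = step(err, 2);
      return err; /* first failure, or 0 on success */
    }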
@@ -7052,6 +7073,40 @@ fail:
   return rc;
 }

+static int meta_sync(const MDBX_env *env, const meta_ptr_t head) {
+  eASSERT(env, atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
+                   (uint32_t)head.txnid);
+  /* Note: this function may be called even when (env->me_flags &
+   * MDBX_NOMETASYNC) == 0 and env->me_fd4meta == env->me_dsync_fd, e.g. if
+   * the previous transaction was committed with the MDBX_NOMETASYNC flag. */
+  int rc = MDBX_RESULT_TRUE;
+  if (env->me_flags & MDBX_WRITEMAP) {
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.wops.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+    const MDBX_page *page = data_page(head.ptr_c);
+    rc = osal_pwrite(env->me_fd4meta, page, env->me_psize,
+                     (uint8_t *)page - env->me_map);
+
+    if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) {
+      rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+#if MDBX_ENABLE_PGOP_STAT
+      env->me_lck->mti_pgop_stat.fsync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+    }
+  } else {
+    rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.fsync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+  }
+
+  if (likely(rc == MDBX_SUCCESS))
+    env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)head.txnid;
+  return rc;
+}
+
 __cold static int env_sync(MDBX_env *env, bool force, bool nonblock) {
   bool locked = false;
   int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;
@@ -7104,7 +7159,7 @@ retry:;
     int err;
     /* pre-sync to avoid latency for writer */
-    if (unsynced_pages > /* FIXME: define threshold */ 16 &&
+    if (unsynced_pages > /* FIXME: define threshold */ 42 &&
         (flags & MDBX_SAFE_NOSYNC) == 0) {
       eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
       if (flags & MDBX_WRITEMAP) {
@@ -7173,19 +7228,8 @@ retry:;
   /* LY: sync meta-pages if MDBX_NOMETASYNC enabled
    * and someone was not synced above. */
   if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
-      (uint32_t)head.txnid) {
-#if MDBX_ENABLE_PGOP_STAT
-    env->me_lck->mti_pgop_stat.wops.weak += 1;
-#endif /* MDBX_ENABLE_PGOP_STAT */
-    rc = (flags & MDBX_WRITEMAP)
-             ? osal_msync(&env->me_dxb_mmap, 0,
-                          pgno_align2os_bytes(env, NUM_METAS),
-                          MDBX_SYNC_DATA | MDBX_SYNC_IODQ)
-             : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
-    if (likely(rc == MDBX_SUCCESS))
-      atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid,
-                     mo_Relaxed);
-  }
+      (uint32_t)head.txnid)
+    rc = meta_sync(env, head);

 bailout:
   if (locked)
@@ -7628,7 +7672,8 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid,
 __cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) {
   if (likely(timestamp && *timestamp == 0))
     *timestamp = osal_monotime();
-  else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) {
+  else if (unlikely(!timestamp || osal_monotime() - *timestamp >
+                                      osal_16dot16_to_monotime(65536 / 10))) {
     if (pgno)
       ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno,
             "(workaround for incoherent flaw of unified page/buffer cache)");
@@ -9902,7 +9947,7 @@ bailout:
   return rc;
 }

-static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) {
+static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
   MDBX_dpl *const dl =
       (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn);
   int rc = MDBX_SUCCESS;
@@ -9919,10 +9964,9 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
       break;
   }

-  if (ctx->iov_items) {
-    /* iov_page() frees dirty-pages and reset iov_items in case of failure. */
+  if (!iov_empty(ctx)) {
     tASSERT(txn, rc == MDBX_SUCCESS);
-    rc = iov_write(txn, ctx);
+    rc = iov_write(ctx);
   }

   while (r <= dl->length)
@@ -10568,42 +10612,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
     goto fail;
   }

-  struct iov_ctx write_ctx;
-  iov_init(txn, &write_ctx);
+  const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
+  iov_ctx_t write_ctx;
+  rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length,
+                txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count);
+  if (unlikely(rc != MDBX_SUCCESS))
+    goto fail;
+
+  if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid,
+                                      mo_Relaxed) != (uint32_t)head.txnid) {
+    /* sync prev meta */
+    rc = meta_sync(env, head);
+    if (unlikely(rc != MDBX_SUCCESS))
+      goto fail;
+  }
+
   rc = txn_write(txn, &write_ctx);
-  if (likely(rc == MDBX_SUCCESS))
-    iov_done(txn, &write_ctx);
+  if (unlikely(rc != MDBX_SUCCESS))
+    goto fail;
   /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
   ts_3 = latency ? osal_monotime() : 0;

-  if (likely(rc == MDBX_SUCCESS)) {
-    const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
-    MDBX_meta meta;
-    memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8);
-    meta.mm_extra_flags = head.ptr_c->mm_extra_flags;
-    meta.mm_validator_id = head.ptr_c->mm_validator_id;
-    meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr;
-    unaligned_poke_u64(4, meta.mm_pages_retired,
-                       unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) +
-                           MDBX_PNL_SIZE(txn->tw.retired_pages));
-    meta.mm_geo = txn->mt_geo;
-    meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
-    meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
-    meta.mm_canary = txn->mt_canary;
-
-    txnid_t commit_txnid = txn->mt_txnid;
-#if MDBX_ENABLE_BIGFOOT
-    if (gcu_ctx.bigfoot > txn->mt_txnid) {
-      commit_txnid = gcu_ctx.bigfoot;
-      TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid,
-            (unsigned)(commit_txnid - txn->mt_txnid));
-    }
-#endif
-    meta_set_txnid(env, &meta, commit_txnid);
-    rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED,
-                     &meta, &txn->tw.troika);
-  }
+  MDBX_meta meta;
+  memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8);
+  meta.mm_extra_flags = head.ptr_c->mm_extra_flags;
+  meta.mm_validator_id = head.ptr_c->mm_validator_id;
+  meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr;
+  unaligned_poke_u64(4, meta.mm_pages_retired,
+                     unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) +
+                         MDBX_PNL_SIZE(txn->tw.retired_pages));
+  meta.mm_geo = txn->mt_geo;
+  meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
+  meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
+  meta.mm_canary = txn->mt_canary;
+
+  txnid_t commit_txnid = txn->mt_txnid;
+#if MDBX_ENABLE_BIGFOOT
+  if (gcu_ctx.bigfoot > txn->mt_txnid) {
+    commit_txnid = gcu_ctx.bigfoot;
+    TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid,
+          (unsigned)(commit_txnid - txn->mt_txnid));
+  }
+#endif
+  meta_set_txnid(env, &meta, commit_txnid);
+  rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED,
+                   &meta, &txn->tw.troika);

   ts_4 = latency ? osal_monotime() : 0;
   if (unlikely(rc != MDBX_SUCCESS)) {
     env->me_flags |= MDBX_FATAL_ERROR;
@@ -10894,11 +10949,11 @@ static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta,
 __cold static int read_header(MDBX_env *env, MDBX_meta *dest,
                               const int lck_exclusive,
                               const mdbx_mode_t mode_bits) {
+  memset(dest, 0, sizeof(MDBX_meta));
   int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize);
   if (unlikely(rc != MDBX_SUCCESS))
     return rc;

-  memset(dest, 0, sizeof(MDBX_meta));
   unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK);
   rc = MDBX_CORRUPTED;
@@ -11200,7 +11255,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
   if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
     eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
     enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE;
+    unsigned sync_op = 0;
     if ((flags & MDBX_SAFE_NOSYNC) == 0) {
+      sync_op = 1;
       mode_bits = MDBX_SYNC_DATA;
       if (pending->mm_geo.next >
           meta_prefer_steady(env, troika).ptr_c->mm_geo.now)
@@ -11209,7 +11266,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
       mode_bits |= MDBX_SYNC_IODQ;
     }
 #if MDBX_ENABLE_PGOP_STAT
-    env->me_lck->mti_pgop_stat.wops.weak += 1;
+    env->me_lck->mti_pgop_stat.msync.weak += sync_op;
 #endif /* MDBX_ENABLE_PGOP_STAT */
     if (flags & MDBX_WRITEMAP)
       rc =
@@ -11298,9 +11355,6 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
   eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
   ENSURE(env, target == head.ptr_c ||
                   constmeta_txnid(target) < pending->unsafe_txnid);
-#if MDBX_ENABLE_PGOP_STAT
-  env->me_lck->mti_pgop_stat.wops.weak += 1;
-#endif /* MDBX_ENABLE_PGOP_STAT */
   if (flags & MDBX_WRITEMAP) {
     jitter4testing(true);
     if (likely(target != head.ptr_c)) {
@@ -11338,34 +11392,37 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
     osal_flush_incoherent_cpu_writeback();
     jitter4testing(true);
     /* sync meta-pages */
-    rc =
-        osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
-                   (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE
-                                             : MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.msync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+    rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
+                    (flags & MDBX_NOMETASYNC)
+                        ? MDBX_SYNC_NONE
+                        : MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
     if (unlikely(rc != MDBX_SUCCESS))
       goto fail;
   } else {
-    const MDBX_meta undo_meta = *target;
-    const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
-                                     ? env->me_dsync_fd
-                                     : env->me_lazy_fd;
 #if MDBX_ENABLE_PGOP_STAT
     env->me_lck->mti_pgop_stat.wops.weak += 1;
 #endif /* MDBX_ENABLE_PGOP_STAT */
-    rc = osal_pwrite(fd, pending, sizeof(MDBX_meta),
+    const MDBX_meta undo_meta = *target;
+    rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta),
                      (uint8_t *)target - env->me_map);
     if (unlikely(rc != MDBX_SUCCESS)) {
     undo:
       DEBUG("%s", "write failed, disk error?");
       /* On a failure, the pagecache still contains the new data.
        * Try write some old data back, to prevent it from being used. */
-      osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta),
+      osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta),
                   (uint8_t *)target - env->me_map);
       goto fail;
     }
     osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize);
     /* sync meta-pages */
-    if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) {
+    if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd) {
+#if MDBX_ENABLE_PGOP_STAT
+      env->me_lck->mti_pgop_stat.fsync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
       rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
       if (rc != MDBX_SUCCESS)
         goto undo;
@@ -11382,7 +11439,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
       goto fail;
   }
   env->me_lck->mti_meta_sync_txnid.weak =
-      (uint32_t)pending->unsafe_txnid -
+      pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak -
       ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0);

   *troika = meta_tap(env);
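Note: under MDBX_NOMETASYNC the recorded "meta already synced" marker is deliberately skewed by UINT32_MAX / 3, so it can never equal the live txnid and the deferred `meta_sync()` above still fires on the next durable sync. A toy check of the non-collision (hypothetical value):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint32_t txnid32 = 12345; /* hypothetical low 32 bits of a txnid */
      const uint32_t skewed = txnid32 - UINT32_MAX / 3;
      assert(skewed != txnid32); /* offset by a non-zero constant: never equal */
      return 0;
    }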
@@ -11528,9 +11585,11 @@ __cold int mdbx_env_create(MDBX_env **penv) {
   env->me_maxreaders = DEFAULT_READERS;
   env->me_maxdbs = env->me_numdbs = CORE_DBS;
-  env->me_lazy_fd = INVALID_HANDLE_VALUE;
-  env->me_dsync_fd = INVALID_HANDLE_VALUE;
-  env->me_lfd = INVALID_HANDLE_VALUE;
+  env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data =
+#if defined(_WIN32) || defined(_WIN64)
+      env->me_overlapped_fd =
+#endif /* Windows */
+          env->me_lfd = INVALID_HANDLE_VALUE;
   env->me_pid = osal_getpid();
   env->me_stuck_meta = -1;
@@ -12863,10 +12922,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env,
   if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0)
     return MDBX_SUCCESS;

-#if MDBX_ENABLE_PGOP_STAT
-  env->me_lck->mti_pgop_stat.wops.weak += 1;
-#endif /* MDBX_ENABLE_PGOP_STAT */
   if (env->me_flags & MDBX_WRITEMAP) {
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.msync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
     rc = osal_msync(&env->me_dxb_mmap, 0,
                     pgno_align2os_bytes(env, model->mm_geo.next),
                     MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
@@ -12877,18 +12936,26 @@ __cold static int __must_check_result override_meta(MDBX_env *env,
      * clearing consistency flag by mdbx_meta_update_begin() */
     memcpy(pgno2page(env, target), page, env->me_psize);
     osal_flush_incoherent_cpu_writeback();
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.msync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
     rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
                     MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
   } else {
-    const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
-                                     ? env->me_dsync_fd
-                                     : env->me_lazy_fd;
-    rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target));
-    if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd)
+#if MDBX_ENABLE_PGOP_STAT
+    env->me_lck->mti_pgop_stat.wops.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+    rc = osal_pwrite(env->me_fd4meta, page, env->me_psize,
+                     pgno2bytes(env, target));
+    if (rc == MDBX_SUCCESS && env->me_fd4meta == env->me_lazy_fd) {
+#if MDBX_ENABLE_PGOP_STAT
+      env->me_lck->mti_pgop_stat.fsync.weak += 1;
+#endif /* MDBX_ENABLE_PGOP_STAT */
       rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+    }
+    osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
+                               env->me_os_psize);
   }
-  osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
-                             env->me_os_psize);
   eASSERT(env, !env->me_txn && !env->me_txn0);
   return rc;
 }
@@ -13254,14 +13321,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
   if (rc != MDBX_SUCCESS)
     goto bailout;

-  eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
-  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
-    rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
-                       &env->me_dsync_fd, 0);
-    ENSURE(env,
-           (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE));
-  }
-
 #if MDBX_LOCKING == MDBX_LOCKING_SYSV
   env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42);
   if (env->me_sysv_ipc.key == -1) {
@@ -13270,7 +13329,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
   }
 #endif /* MDBX_LOCKING */

-#if !(defined(_WIN32) || defined(_WIN64))
+  /* Set the position in files outside of the data to avoid corruption
+   * due to erroneous use of file descriptors in the application code. */
+  const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000);
+  osal_fseek(env->me_lazy_fd, safe_parking_lot_offset);
+
+  env->me_fd4data = env->me_fd4meta = env->me_lazy_fd;
+#if defined(_WIN32) || defined(_WIN64)
+  uint8_t ior_flags = 0;
+  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) {
+    ior_flags = IOR_OVERLAPPED;
+    rc = osal_openfile(MDBX_OPEN_DXB_OVERLAPPED, env, env_pathname.dxb,
+                       &env->me_overlapped_fd, 0);
+    if (rc != MDBX_SUCCESS)
+      goto bailout;
+    env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr);
+    if (!env->me_data_lock_event) {
+      rc = (int)GetLastError();
+      goto bailout;
+    }
+    env->me_fd4data = env->me_overlapped_fd;
+    osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset);
+  }
+#else
   if (mode == 0) {
     /* pickup mode for lck-file */
     struct stat st;
@@ -13291,13 +13373,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
     rc = lck_rc;
     goto bailout;
   }
-
-  /* Set the position in files outside of the data to avoid corruption
-   * due to erroneous use of file descriptors in the application code. */
-  osal_fseek(env->me_lfd, UINT64_C(1) << 63);
-  osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63);
-  if (env->me_dsync_fd != INVALID_HANDLE_VALUE)
-    osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63);
+  osal_fseek(env->me_lfd, safe_parking_lot_offset);

   const MDBX_env_flags_t rigorous_flags =
       MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC;
@@ -13305,6 +13381,20 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
                                         MDBX_LIFORECLAIM |
                                         MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD;

+  eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
+  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 &&
+      (env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) {
+    rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
+                       &env->me_dsync_fd, 0);
+    if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
+      if ((flags & MDBX_NOMETASYNC) == 0)
+        env->me_fd4meta = env->me_dsync_fd;
+      if (env->me_fd4data == env->me_lazy_fd)
+        env->me_fd4data = env->me_dsync_fd;
+      osal_fseek(env->me_dsync_fd, safe_parking_lot_offset);
+    }
+  }
+
   MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
   if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
     while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) {
@@ -13413,6 +13503,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
     } else
       rc = MDBX_ENOMEM;
   }
+
+  if (rc == MDBX_SUCCESS)
+    rc = osal_ioring_create(&env->me_ioring,
+#if defined(_WIN32) || defined(_WIN64)
+                            ior_flags,
+#endif /* Windows */
+                            env->me_fd4data);
 }
#if MDBX_DEBUG #if MDBX_DEBUG
@@ -13469,6 +13565,8 @@ __cold static int env_close(MDBX_env *env) {
   const int rc = lcklist_detach_locked(env);
   lcklist_unlock();

+  osal_ioring_destroy(&env->me_ioring);
+
   if (env->me_map) {
     osal_munmap(&env->me_dxb_mmap);
 #ifdef MDBX_USE_VALGRIND
@@ -13477,6 +13575,14 @@ __cold static int env_close(MDBX_env *env) {
 #endif
   }

+#if defined(_WIN32) || defined(_WIN64)
+  if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) {
+    CloseHandle(env->me_data_lock_event);
+    CloseHandle(env->me_overlapped_fd);
+    env->me_overlapped_fd = INVALID_HANDLE_VALUE;
+  }
+#endif /* Windows */
+
   if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
     (void)osal_closefile(env->me_dsync_fd);
     env->me_dsync_fd = INVALID_HANDLE_VALUE;
@@ -13578,7 +13684,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
                ? MDBX_SUCCESS
                : rc;
   }
-#endif
+#endif /* Windows */
 }

 eASSERT(env, env->me_signature.weak == 0);
@@ -20528,6 +20634,10 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
         atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
     arg->mi_pgop_stat.wops =
         atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
+    arg->mi_pgop_stat.msync =
+        atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed);
+    arg->mi_pgop_stat.fsync =
+        atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed);
     arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16(
         atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed));
 #else


@@ -591,6 +591,10 @@ typedef struct {
   MDBX_atomic_uint64_t
       gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
                   unit/scale is platform-depended, see osal_monotime(). */
+  MDBX_atomic_uint64_t
+      msync; /* Number of explicit msync/flush-to-disk operations */
+  MDBX_atomic_uint64_t
+      fsync; /* Number of explicit fsync/flush-to-disk operations */
 } MDBX_pgop_stat_t;
 #endif /* MDBX_ENABLE_PGOP_STAT */
@@ -1143,7 +1147,11 @@ struct MDBX_env {
   osal_mmap_t me_dxb_mmap; /* The main data file */
 #define me_map me_dxb_mmap.dxb
 #define me_lazy_fd me_dxb_mmap.fd
-  mdbx_filehandle_t me_dsync_fd;
+#define me_fd4data me_ioring.fd
+  mdbx_filehandle_t me_dsync_fd, me_fd4meta;
+#if defined(_WIN32) || defined(_WIN64)
+  HANDLE me_overlapped_fd, me_data_lock_event;
+#endif /* Windows */
   osal_mmap_t me_lck_mmap; /* The lock file */
 #define me_lfd me_lck_mmap.fd
   struct MDBX_lockinfo *me_lck;
@@ -1222,6 +1230,7 @@ struct MDBX_env {
   unsigned me_dp_reserve_len;
   /* PNL of pages that became unused in a write txn */
   MDBX_PNL me_retired_pages;
+  osal_ioring_t me_ioring;

 #if defined(_WIN32) || defined(_WIN64)
   osal_srwlock_t me_remap_guard;
@@ -1609,20 +1618,24 @@ ceil_powerof2(size_t value, size_t granularity) {
 }

 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
-log2n_powerof2(size_t value) {
-  assert(value > 0 && value < INT32_MAX && is_powerof2(value));
-  assert((value & -(int32_t)value) == value);
-#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
-  return __builtin_ctzl(value);
+log2n_powerof2(size_t value_uintptr) {
+  assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
+         is_powerof2(value_uintptr));
+  assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
+  const uint32_t value_uint32 = (uint32_t)value_uintptr;
+#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
+  STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
+  return __builtin_ctz(value_uint32);
 #elif defined(_MSC_VER)
   unsigned long index;
-  _BitScanForward(&index, (unsigned long)value);
+  STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
+  _BitScanForward(&index, value_uint32);
   return index;
 #else
   static const uint8_t debruijn_ctz32[32] = {
       0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
       31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
-  return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
+  return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
 #endif
 }
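Note: the `#else` fallback counts trailing zeros of a power of two with a De Bruijn multiply. A standalone self-check of that table (my own test harness, not part of this commit):

    #include <assert.h>
    #include <stdint.h>

    /* De Bruijn count-trailing-zeros for a power of two, as in the fallback
     * branch above. */
    static unsigned ctz32_powerof2(uint32_t value) {
      static const uint8_t debruijn_ctz32[32] = {
          0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
          31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
      return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
    }

    int main(void) {
      for (unsigned shift = 0; shift < 32; ++shift)
        assert(ctz32_powerof2(UINT32_C(1) << shift) == shift);
      return 0;
    }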


@@ -112,32 +112,73 @@ static
 #define LCK_WAITFOR 0
 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY

-static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset,
-                           size_t bytes) {
+static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags,
+                            uint64_t offset, size_t bytes) {
+  TRACE("lock>>: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64
+        " >>",
+        fd, event, flags, offset, bytes);
   OVERLAPPED ov;
-  ov.hEvent = 0;
+  ov.Internal = 0;
+  ov.InternalHigh = 0;
+  ov.hEvent = event;
   ov.Offset = (DWORD)offset;
   ov.OffsetHigh = HIGH_DWORD(offset);
-  return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov);
+  if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) {
+    TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64
+          ", bytes %" PRId64 " << %s",
+          fd, event, flags, offset, bytes, "done");
+    return MDBX_SUCCESS;
+  }
+
+  DWORD rc = GetLastError();
+  if (rc == ERROR_IO_PENDING) {
+    if (event) {
+      if (GetOverlappedResult(fd, &ov, &rc, true)) {
+        TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64
+              ", bytes %" PRId64 " << %s",
+              fd, event, flags, offset, bytes, "overlapped-done");
+        return MDBX_SUCCESS;
+      }
+      rc = GetLastError();
+    } else
+      CancelIo(fd);
+  }
+  TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64
+        " << err %d",
+        fd, event, flags, offset, bytes, rc);
+  return (int)rc;
 }

+static __inline int flock(HANDLE fd, DWORD flags, uint64_t offset,
+                          size_t bytes) {
+  return flock_with_event(fd, 0, flags, offset, bytes);
+}
+
+static __inline int flock_data(const MDBX_env *env, DWORD flags,
+                               uint64_t offset, size_t bytes) {
+  return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags,
+                          offset, bytes);
+}
+
-static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset,
-                             size_t bytes) {
+static int funlock(mdbx_filehandle_t fd, uint64_t offset, size_t bytes) {
+  TRACE("unlock: fd %p, offset %" PRId64 ", bytes %" PRId64, fd, offset, bytes);
   return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes,
-                    HIGH_DWORD(bytes));
+                    HIGH_DWORD(bytes))
+             ? MDBX_SUCCESS
+             : (int)GetLastError();
 }

 /*----------------------------------------------------------------------------*/
 /* global `write` lock for write-txn processing,
  * exclusive locking both meta-pages) */

-#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1))
-#define LCK_META_OFFSET 0
-#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS)
-#define LCK_BODY_OFFSET LCK_META_LEN
-#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET)
-#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN
-#define LCK_WHOLE 0, LCK_MAXLEN
+#ifdef _WIN64
+#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000)
+#else
+#define DXB_MAXLEN UINT32_C(0x7ff00000)
+#endif
+#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN
+#define DXB_WHOLE 0, DXB_MAXLEN
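Note: the new lock geometry keeps the meta pages outside DXB_BODY: the body range begins right after the NUM_METAS meta pages and spans DXB_MAXLEN bytes, so the metas remain lockable independently of the bulk of the data file. A quick arithmetic sketch (assuming a 4 KiB page and NUM_METAS == 3):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint64_t psize = 4096, num_metas = 3; /* assumed values */
      const uint64_t body_offset = psize * num_metas; /* 12288 */
      printf("body lock range: [%llu, %llu + DXB_MAXLEN)\n",
             (unsigned long long)body_offset, (unsigned long long)body_offset);
      return 0;
    }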
 int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
   if (dontwait) {
@@ -155,24 +196,27 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
     }
   }

-  if ((env->me_flags & MDBX_EXCLUSIVE) ||
-      flock(env->me_lazy_fd,
-            dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
-                     : (LCK_EXCLUSIVE | LCK_WAITFOR),
-            LCK_BODY))
+  if (env->me_flags & MDBX_EXCLUSIVE)
     return MDBX_SUCCESS;
-  int rc = (int)GetLastError();
+
+  int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event,
+                            dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
+                                     : (LCK_EXCLUSIVE | LCK_WAITFOR),
+                            DXB_BODY);
+  if (rc == MDBX_SUCCESS)
+    return rc;
+
   LeaveCriticalSection(&env->me_windowsbug_lock);
   return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
 }

 void mdbx_txn_unlock(MDBX_env *env) {
-  int rc = (env->me_flags & MDBX_EXCLUSIVE)
-               ? TRUE
-               : funlock(env->me_lazy_fd, LCK_BODY);
+  if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
+    int err = funlock(env->me_fd4data, DXB_BODY);
+    if (err != MDBX_SUCCESS)
+      mdbx_panic("%s failed: err %u", __func__, err);
+  }
   LeaveCriticalSection(&env->me_windowsbug_lock);
-  if (!rc)
-    mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
 }

 /*----------------------------------------------------------------------------*/
@@ -193,32 +237,32 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
   /* transition from S-? (used) to S-E (locked),
    * e.g. exclusive lock upper-part */
-  if ((env->me_flags & MDBX_EXCLUSIVE) ||
-      flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER))
+  if (env->me_flags & MDBX_EXCLUSIVE)
+    return MDBX_SUCCESS;
+
+  int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
+  if (rc == MDBX_SUCCESS)
     return MDBX_SUCCESS;
-  int rc = (int)GetLastError();

   osal_srwlock_ReleaseShared(&env->me_remap_guard);
   return rc;
 }

 MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
-  if (env->me_lfd != INVALID_HANDLE_VALUE) {
+  if (env->me_lfd != INVALID_HANDLE_VALUE &&
+      (env->me_flags & MDBX_EXCLUSIVE) == 0) {
     /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
-    if ((env->me_flags & MDBX_EXCLUSIVE) == 0 &&
-        !funlock(env->me_lfd, LCK_UPPER))
-      mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
+    int err = funlock(env->me_lfd, LCK_UPPER);
+    if (err != MDBX_SUCCESS)
+      mdbx_panic("%s failed: err %u", __func__, err);
   }
   osal_srwlock_ReleaseShared(&env->me_remap_guard);
 }

 MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
-  return flock(fd,
-               wait ? LCK_EXCLUSIVE | LCK_WAITFOR
-                    : LCK_EXCLUSIVE | LCK_DONTWAIT,
-               0, LCK_MAXLEN)
-             ? MDBX_SUCCESS
-             : (int)GetLastError();
+  return flock(
+      fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0,
+      DXB_MAXLEN);
 }

 static int suspend_and_append(mdbx_handle_array_t **array,
@@ -386,40 +430,36 @@ static void lck_unlock(MDBX_env *env) {
   if (env->me_lfd != INVALID_HANDLE_VALUE) {
     /* double `unlock` for robustly remove overlapped shared/exclusive locks */
-    while (funlock(env->me_lfd, LCK_LOWER))
-      ;
-    err = (int)GetLastError();
+    do
+      err = funlock(env->me_lfd, LCK_LOWER);
+    while (err == MDBX_SUCCESS);
     assert(err == ERROR_NOT_LOCKED ||
            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
     SetLastError(ERROR_SUCCESS);

-    while (funlock(env->me_lfd, LCK_UPPER))
-      ;
-    err = (int)GetLastError();
+    do
+      err = funlock(env->me_lfd, LCK_UPPER);
+    while (err == MDBX_SUCCESS);
     assert(err == ERROR_NOT_LOCKED ||
            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
     SetLastError(ERROR_SUCCESS);
   }

-  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
+  if (env->me_fd4data != INVALID_HANDLE_VALUE) {
     /* explicitly unlock to avoid latency for other processes (windows kernel
      * releases such locks via deferred queues) */
-    while (funlock(env->me_lazy_fd, LCK_BODY))
-      ;
-    err = (int)GetLastError();
+    do
+      err = funlock(env->me_fd4data, DXB_BODY);
+    while (err == MDBX_SUCCESS);
     assert(err == ERROR_NOT_LOCKED ||
            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
     SetLastError(ERROR_SUCCESS);

-    while (funlock(env->me_lazy_fd, LCK_WHOLE))
-      ;
-    err = (int)GetLastError();
+    do
+      err = funlock(env->me_fd4data, DXB_WHOLE);
+    while (err == MDBX_SUCCESS);
     assert(err == ERROR_NOT_LOCKED ||
            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
     SetLastError(ERROR_SUCCESS);
   }
 }
@@ -428,56 +468,55 @@ static void lck_unlock(MDBX_env *env) {
  * or as 'used' (S-? and returns MDBX_RESULT_FALSE).
  * Otherwise returns an error. */
 static int internal_seize_lck(HANDLE lfd) {
-  int rc;
   assert(lfd != INVALID_HANDLE_VALUE);

   /* 1) now on ?-? (free), get ?-E (middle) */
   jitter4testing(false);
-  if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) {
-    rc = (int)GetLastError() /* 2) something went wrong, give up */;
+  int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
+  if (rc != MDBX_SUCCESS) {
+    /* 2) something went wrong, give up */;
     ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
     return rc;
   }

   /* 3) now on ?-E (middle), try E-E (exclusive-write) */
   jitter4testing(false);
-  if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER))
+  rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
+  if (rc == MDBX_SUCCESS)
     return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;

   /* 5) still on ?-E (middle) */
-  rc = (int)GetLastError();
   jitter4testing(false);
   if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
     /* 6) something went wrong, give up */
-    if (!funlock(lfd, LCK_UPPER))
+    rc = funlock(lfd, LCK_UPPER);
+    if (rc != MDBX_SUCCESS)
       mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)",
-                 (int)GetLastError());
+                 rc);
     return rc;
   }

   /* 7) still on ?-E (middle), try S-E (locked) */
   jitter4testing(false);
-  rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE
-                                                        : (int)GetLastError();
+  rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
   jitter4testing(false);
-  if (rc != MDBX_RESULT_FALSE)
+  if (rc != MDBX_SUCCESS)
     ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);

   /* 8) now on S-E (locked) or still on ?-E (middle),
    * transition to S-? (used) or ?-? (free) */
-  if (!funlock(lfd, LCK_UPPER))
+  int err = funlock(lfd, LCK_UPPER);
+  if (err != MDBX_SUCCESS)
     mdbx_panic("%s(%s) failed: err %u", __func__,
-               "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError());
+               "X-E(locked/middle) >> X-?(used/free)", err);

   /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */
   return rc;
 }
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
  assert(env->me_fd4data != INVALID_HANDLE_VALUE);
  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_RESULT_TRUE /* nope since files must be opened non-shareable */;
@@ -486,15 +525,13 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
  if (env->me_lfd == INVALID_HANDLE_VALUE) {
    /* LY: without-lck mode (e.g. on read-only filesystem) */
    jitter4testing(false);
    int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
    if (rc != MDBX_SUCCESS)
      ERROR("%s, err %u", "without-lck", rc);
    return rc;
  }

  int rc = internal_seize_lck(env->me_lfd);
  jitter4testing(false);
  if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
    /* Check that another process doesn't operate in without-lck mode.
@@ -503,17 +540,18 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
     * - we need an exclusive lock to do so;
     * - we can't lock meta-pages, otherwise another process could get an error
     *   while opening db in valid (non-conflict) mode. */
    int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE);
    if (err != MDBX_SUCCESS) {
      ERROR("%s, err %u", "lock-against-without-lck", err);
      jitter4testing(false);
      lck_unlock(env);
      return err;
    }
    jitter4testing(false);
    err = funlock(env->me_fd4data, DXB_WHOLE);
    if (err != MDBX_SUCCESS)
      mdbx_panic("%s(%s) failed: err %u", __func__,
                 "unlock-against-without-lck", err);
  }
  return rc;
@@ -521,28 +559,31 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
  /* Transition from exclusive-write state (E-E) to used (S-?) */
  assert(env->me_fd4data != INVALID_HANDLE_VALUE);
  assert(env->me_lfd != INVALID_HANDLE_VALUE);

  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_SUCCESS /* nope since files must be opened non-shareable */;

  /* 1) now at E-E (exclusive-write), transition to ?-E (middle) */
  int rc = funlock(env->me_lfd, LCK_LOWER);
  if (rc != MDBX_SUCCESS)
    mdbx_panic("%s(%s) failed: err %u", __func__,
               "E-E(exclusive-write) >> ?-E(middle)", rc);

  /* 2) now at ?-E (middle), transition to S-E (locked) */
  rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
  if (rc != MDBX_SUCCESS) {
    /* 3) something went wrong, give up */
    ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
    return rc;
  }

  /* 4) got S-E (locked), continue transition to S-? (used) */
  rc = funlock(env->me_lfd, LCK_UPPER);
  if (rc != MDBX_SUCCESS)
    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
               rc);

  return MDBX_SUCCESS /* 5) now at S-? (used), done */;
}
@@ -555,24 +596,26 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) {
    return MDBX_SUCCESS /* nope since files must be opened non-shareable */;

  /* 1) now on S-? (used), try S-E (locked) */
  jitter4testing(false);
  int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER);
  if (rc != MDBX_SUCCESS) {
    /* 2) something went wrong, give up */
    VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc);
    return rc;
  }

  /* 3) now on S-E (locked), transition to ?-E (middle) */
  rc = funlock(env->me_lfd, LCK_LOWER);
  if (rc != MDBX_SUCCESS)
    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
               rc);

  /* 4) now on ?-E (middle), try E-E (exclusive-write) */
  jitter4testing(false);
  rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
  if (rc != MDBX_SUCCESS) {
    /* 5) something went wrong, give up */
    VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc);
    return rc;
  }
@@ -586,6 +629,23 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
  (void)env;
  (void)inprocess_neighbor;
  (void)global_uniqueness_flag;
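  /* SetFileIoOverlappedRange() is documented to require SeLockMemoryPrivilege,
   * so the privilege is enabled here and the imported function pointer is
   * dropped when enabling it fails. */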
if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) {
HANDLE token = INVALID_HANDLE_VALUE;
TOKEN_PRIVILEGES privileges;
privileges.PrivilegeCount = 1;
privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES,
&token) ||
!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
&privileges.Privileges[0].Luid) ||
!AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges),
nullptr, nullptr) ||
GetLastError() != ERROR_SUCCESS)
mdbx_SetFileIoOverlappedRange = NULL;
if (token != INVALID_HANDLE_VALUE)
CloseHandle(token);
}
  return MDBX_SUCCESS;
}
@@ -752,6 +812,7 @@ MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_GetTickCount64 mdbx_GetTickCount64;
MDBX_RegGetValueA mdbx_RegGetValueA;
MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* xMDBX_ALLOY */

#if __GNUC_PREREQ(8, 0)
@@ -783,6 +844,7 @@ static void mdbx_winnt_import(void) {
    GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
    GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
    GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
    GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange);
  }
  const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");


@@ -1,4 +1,4 @@
/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */

/*
 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
@@ -537,6 +537,596 @@ MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
/*----------------------------------------------------------------------------*/
#if defined(_WIN32) || defined(_WIN64)
#define ior_alignment_mask (ior->pagesize - 1)
#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element))
static void ior_put_event(osal_ioring_t *ior, HANDLE event) {
assert(event && event != INVALID_HANDLE_VALUE && event != ior);
assert(ior->event_stack < ior->allocated);
ior->event_pool[ior->event_stack] = event;
ior->event_stack += 1;
}
static HANDLE ior_get_event(osal_ioring_t *ior) {
assert(ior->event_stack <= ior->allocated);
if (ior->event_stack > 0) {
ior->event_stack -= 1;
assert(ior->event_pool[ior->event_stack] != 0);
return ior->event_pool[ior->event_stack];
}
return CreateEventW(nullptr, true, false, nullptr);
}
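/* ior_wocr() below is a WriteFileEx() completion routine (a
 * FileIOCompletionRoutine), delivered via alertable waits such as SleepEx()
 * and WaitForMultipleObjectsEx(..., bAlertable = TRUE). Since the kernel does
 * not use OVERLAPPED.hEvent for WriteFileEx(), that field safely smuggles the
 * osal_ioring_t pointer into the callback. */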
static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) {
osal_ioring_t *ior = ov->hEvent;
ov->Internal = err;
ov->InternalHigh = bytes;
if (++ior->async_completed >= ior->async_waiting)
SetEvent(ior->async_done);
}
#elif MDBX_HAVE_PWRITEV
#if defined(_SC_IOV_MAX)
static size_t osal_iov_max;
#define OSAL_IOV_MAX osal_iov_max
#else
#define OSAL_IOV_MAX IOV_MAX
#endif
#else
#undef OSAL_IOV_MAX
#endif /* OSAL_IOV_MAX */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior,
#if defined(_WIN32) || defined(_WIN64)
unsigned flags,
#endif /* Windows */
mdbx_filehandle_t fd) {
memset(ior, 0, sizeof(osal_ioring_t));
ior->fd = fd;
#if defined(_WIN32) || defined(_WIN64)
ior->flags = flags;
const unsigned pagesize = (unsigned)osal_syspagesize();
ior->pagesize = pagesize;
ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize);
ior->async_done = ior_get_event(ior);
if (!ior->async_done)
return GetLastError();
#endif /* Windows */
#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX)
if (!osal_iov_max)
osal_iov_max = sysconf(_SC_IOV_MAX);
#endif
ior->boundary = (char *)(ior->pool + ior->allocated);
return MDBX_SUCCESS;
}
static __inline size_t ior_offset(const ior_item_t *item) {
#if defined(_WIN32) || defined(_WIN64)
return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset))
? (uint64_t)item->ov.OffsetHigh << 32
: 0);
#else
return item->offset;
#endif /* !Windows */
}
static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) {
#if defined(ior_sgv_element)
assert(sgvcnt > 0);
return (ior_item_t *)((char *)item + sizeof(ior_item_t) -
sizeof(ior_sgv_element) +
sizeof(ior_sgv_element) * sgvcnt);
#else
assert(sgvcnt == 1);
(void)sgvcnt;
return item + 1;
#endif
}
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
void *data, const size_t bytes) {
assert(bytes && data);
assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE);
assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE);
#if defined(_WIN32) || defined(_WIN64)
const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2);
const bool use_gather =
(ior->flags & IOR_UNBUFFERED) && ior->slots_left >= segments;
#endif /* Windows */
ior_item_t *item = ior->pool;
if (likely(ior->last)) {
item = ior->last;
if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) &&
likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) {
#if defined(_WIN32) || defined(_WIN64)
if (use_gather &&
((bytes | (uintptr_t)data | ior->last_bytes |
(uintptr_t)(uint64_t)item->sgv[0].Buffer) &
ior_alignment_mask) == 0 &&
ior->last_sgvcnt + segments < OSAL_IOV_MAX) {
assert((item->single.iov_len & 1) == 0);
assert(item->sgv[ior->last_sgvcnt].Buffer == 0);
ior->last_bytes += bytes;
size_t i = 0;
do {
item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data);
data = (char *)data + ior->pagesize;
} while (++i < segments);
ior->slots_left -= segments;
item->sgv[ior->last_sgvcnt += segments].Buffer = 0;
assert((item->single.iov_len & 1) == 0);
return MDBX_SUCCESS;
}
const void *end =
(char *)(item->single.iov_base) + item->single.iov_len - 1;
if (unlikely(end == data)) {
assert((item->single.iov_len & 1) != 0);
item->single.iov_len += bytes;
return MDBX_SUCCESS;
}
#elif MDBX_HAVE_PWRITEV
assert((int)item->sgvcnt > 0);
const void *end = (char *)(item->sgv[item->sgvcnt - 1].iov_base) +
item->sgv[item->sgvcnt - 1].iov_len;
if (unlikely(end == data)) {
item->sgv[item->sgvcnt - 1].iov_len += bytes;
ior->last_bytes += bytes;
return MDBX_SUCCESS;
}
if (likely(item->sgvcnt < OSAL_IOV_MAX)) {
if (unlikely(ior->slots_left < 1))
return MDBX_RESULT_TRUE;
item->sgv[item->sgvcnt].iov_base = data;
item->sgv[item->sgvcnt].iov_len = bytes;
ior->last_bytes += bytes;
item->sgvcnt += 1;
ior->slots_left -= 1;
return MDBX_SUCCESS;
}
#else
const void *end = (char *)(item->single.iov_base) + item->single.iov_len;
if (unlikely(end == data)) {
item->single.iov_len += bytes;
return MDBX_SUCCESS;
}
#endif
}
item = ior_next(item, ior_last_sgvcnt(ior, item));
}
if (unlikely(ior->slots_left < 1))
return MDBX_RESULT_TRUE;
unsigned slots_used = 1;
#if defined(_WIN32) || defined(_WIN64)
item->ov.Internal = item->ov.InternalHigh = 0;
item->ov.Offset = (DWORD)offset;
item->ov.OffsetHigh = HIGH_DWORD(offset);
item->ov.hEvent = 0;
if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 ||
segments > OSAL_IOV_MAX) {
/* WriteFile() */
item->single.iov_base = data;
item->single.iov_len = bytes + 1;
assert((item->single.iov_len & 1) != 0);
} else {
    /* WriteFileGather() */
    item->sgv[0].Buffer = PtrToPtr64(data);
    for (size_t i = 1; i < segments; ++i) {
      data = (char *)data + ior->pagesize;
      item->sgv[i].Buffer = PtrToPtr64(data);
    }
    item->sgv[segments].Buffer = 0;
    assert((item->single.iov_len & 1) == 0);
    slots_used = segments;
}
ior->last_bytes = bytes;
ior_last_sgvcnt(ior, item) = slots_used;
#elif MDBX_HAVE_PWRITEV
item->offset = offset;
item->sgv[0].iov_base = data;
item->sgv[0].iov_len = bytes;
ior->last_bytes = bytes;
ior_last_sgvcnt(ior, item) = slots_used;
#else
item->offset = offset;
item->single.iov_base = data;
item->single.iov_len = bytes;
#endif /* !Windows */
ior->slots_left -= slots_used;
ior->last = item;
return MDBX_SUCCESS;
}
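/* On Windows the two item kinds above are told apart by parity: plain
 * WriteFile() items store iov_len = bytes + 1 (an odd value), while
 * WriteFileGather() items keep an even, page-aligned segment pointer in the
 * same union slot. This is what the (iov_len & 1) asserts enforce and what
 * the "- 1" decoding in osal_ioring_walk()/osal_ioring_write() relies on. */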
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)) {
for (ior_item_t *item = ior->pool; item <= ior->last;) {
#if defined(_WIN32) || defined(_WIN64)
size_t offset = ior_offset(item);
char *data = item->single.iov_base;
size_t bytes = item->single.iov_len - 1;
size_t i = 1;
if (bytes & 1) {
data = Ptr64ToPtr(item->sgv[0].Buffer);
bytes = ior->pagesize;
while (item->sgv[i].Buffer) {
if (data + ior->pagesize != item->sgv[i].Buffer) {
callback(ctx, offset, data, bytes);
offset += bytes;
data = Ptr64ToPtr(item->sgv[i].Buffer);
bytes = 0;
}
bytes += ior->pagesize;
++i;
}
}
assert(bytes < MAX_WRITE);
callback(ctx, offset, data, bytes);
#elif MDBX_HAVE_PWRITEV
assert(item->sgvcnt > 0);
size_t offset = item->offset;
size_t i = 0;
do {
callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len);
offset += item->sgv[i].iov_len;
} while (++i != item->sgvcnt);
#else
const size_t i = 1;
callback(ctx, item->offset, item->single.iov_base, item->single.iov_len);
#endif
item = ior_next(item, i);
}
}
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior) {
osal_ioring_write_result_t r = {MDBX_SUCCESS, 0};
#if defined(_WIN32) || defined(_WIN64)
HANDLE *const end_wait_for =
ior->event_pool + ior->allocated +
      /* one extra element was allocated for async_done */ 1;
HANDLE *wait_for = end_wait_for;
LONG async_started = 0;
for (ior_item_t *item = ior->pool; item <= ior->last;) {
item->ov.Internal = STATUS_PENDING;
size_t i = 1, bytes = item->single.iov_len - 1;
r.wops += 1;
if (bytes & 1) {
bytes = ior->pagesize;
while (item->sgv[i].Buffer) {
bytes += ior->pagesize;
++i;
}
assert(bytes < MAX_WRITE);
item->ov.hEvent = ior_get_event(ior);
if (unlikely(!item->ov.hEvent)) {
bailout_geterr:
r.err = GetLastError();
bailout_rc:
assert(r.err != MDBX_SUCCESS);
CancelIo(ior->fd);
return r;
}
if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr,
&item->ov)) {
assert(item->ov.Internal == 0 &&
WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0);
ior_put_event(ior, item->ov.hEvent);
item->ov.hEvent = 0;
} else {
r.err = (int)GetLastError();
if (unlikely(r.err != ERROR_IO_PENDING)) {
ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
", err %d",
"WriteFileGather", ior->fd, item, item - ior->pool,
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
goto bailout_rc;
}
assert(wait_for > ior->event_pool + ior->event_stack);
*--wait_for = item->ov.hEvent;
}
} else if (ior->flags & IOR_OVERLAPPED) {
assert(bytes < MAX_WRITE);
retry:
item->ov.hEvent = ior;
if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov,
ior_wocr)) {
async_started += 1;
} else {
r.err = (int)GetLastError();
switch (r.err) {
default:
ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
", err %d",
"WriteFileEx", ior->fd, item, item - ior->pool,
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
goto bailout_rc;
case ERROR_NOT_FOUND:
case ERROR_USER_MAPPED_FILE:
case ERROR_LOCK_VIOLATION:
WARNING(
"%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
", err %d",
"WriteFileEx", ior->fd, item, item - ior->pool,
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
SleepEx(0, true);
goto retry;
case ERROR_INVALID_USER_BUFFER:
case ERROR_NOT_ENOUGH_MEMORY:
if (SleepEx(0, true) == WAIT_IO_COMPLETION)
goto retry;
goto bailout_rc;
case ERROR_IO_PENDING:
async_started += 1;
}
}
} else {
assert(bytes < MAX_WRITE);
DWORD written = 0;
if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written,
&item->ov)) {
r.err = (int)GetLastError();
ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
", err %d",
"WriteFile", ior->fd, item, item - ior->pool,
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
goto bailout_rc;
} else if (unlikely(written != bytes)) {
r.err = ERROR_WRITE_FAULT;
goto bailout_rc;
}
}
item = ior_next(item, i);
}
assert(ior->async_waiting > ior->async_completed &&
ior->async_waiting == INT_MAX);
ior->async_waiting = async_started;
if (async_started > ior->async_completed && end_wait_for == wait_for) {
assert(wait_for > ior->event_pool + ior->event_stack);
*--wait_for = ior->async_done;
}
const size_t pending_count = end_wait_for - wait_for;
if (pending_count) {
    /* Wait for up to MAXIMUM_WAIT_OBJECTS (64) of the most recent handles,
     * then wait selectively via GetOverlappedResult() if some earlier items
     * have not completed yet. Overall this takes fewer system calls, i.e.
     * less overhead. However, it is not certain that this saving will not be
     * outweighed by an inefficient implementation of
     * WaitForMultipleObjectsEx(), but in that case it is a problem on the M$
     * side. */
DWORD madness;
do
madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS)
? (DWORD)pending_count
: MAXIMUM_WAIT_OBJECTS,
wait_for, true,
                                         /* 24 hours */ 86400000ul, true);
while (madness == WAIT_IO_COMPLETION);
STATIC_ASSERT(WAIT_OBJECT_0 == 0);
if (/* madness >= WAIT_OBJECT_0 && */
madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS)
r.err = MDBX_SUCCESS;
else if (madness >= WAIT_ABANDONED_0 &&
madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) {
r.err = ERROR_ABANDONED_WAIT_0;
goto bailout_rc;
} else if (madness == WAIT_TIMEOUT) {
r.err = WAIT_TIMEOUT;
goto bailout_rc;
} else {
r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM;
goto bailout_rc;
}
assert(ior->async_waiting == ior->async_completed);
for (ior_item_t *item = ior->pool; item <= ior->last;) {
size_t i = 1, bytes = item->single.iov_len - 1;
if (bytes & 1) {
bytes = ior->pagesize;
while (item->sgv[i].Buffer) {
bytes += ior->pagesize;
++i;
}
if (!HasOverlappedIoCompleted(&item->ov)) {
DWORD written = 0;
if (unlikely(
!GetOverlappedResult(ior->fd, &item->ov, &written, true))) {
ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
", err %d",
"GetOverlappedResult", item, item - ior->pool,
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
GetLastError());
goto bailout_geterr;
}
assert(MDBX_SUCCESS == item->ov.Internal);
assert(written == item->ov.InternalHigh);
}
} else {
assert(HasOverlappedIoCompleted(&item->ov));
}
assert(item->ov.Internal != ERROR_IO_PENDING);
if (unlikely(item->ov.Internal != MDBX_SUCCESS)) {
DWORD written = 0;
r.err = (int)item->ov.Internal;
if ((r.err & 0x80000000) &&
GetOverlappedResult(NULL, &item->ov, &written, true))
r.err = (int)GetLastError();
ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
", err %d",
"Result", item, item - ior->pool,
((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
GetLastError());
goto bailout_rc;
}
if (unlikely(item->ov.InternalHigh != bytes)) {
r.err = ERROR_WRITE_FAULT;
goto bailout_rc;
}
item = ior_next(item, i);
}
assert(ior->async_waiting == ior->async_completed);
} else {
assert(r.err == MDBX_SUCCESS);
}
assert(ior->async_waiting == ior->async_completed);
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
for (ior_item_t *item = ior->pool; item <= ior->last;) {
#if MDBX_HAVE_PWRITEV
assert(item->sgvcnt > 0);
if (item->sgvcnt == 1)
r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len,
item->offset);
else
r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset);
// TODO: io_uring_prep_write(sqe, fd, ...);
item = ior_next(item, item->sgvcnt);
#else
r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len,
item->offset);
item = ior_next(item, 1);
#endif
r.wops += 1;
if (unlikely(r.err != MDBX_SUCCESS))
break;
}
// TODO: io_uring_submit(&ring)
// TODO: err = io_uring_wait_cqe(&ring, &cqe);
// TODO: io_uring_cqe_seen(&ring, cqe);
#endif /* !Windows */
return r;
}
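/* The TODOs above point at io_uring; below is a hedged illustration using
 * liburing (not part of libmdbx — everything except the liburing API itself
 * is hypothetical). It performs one vectored write and reaps its completion,
 * mirroring the prep/submit/wait/seen steps named in the TODO comments. */
#include <liburing.h>

static int ioring_uring_sketch(int fd, struct iovec *sgv, unsigned sgvcnt,
                               uint64_t offset) {
  struct io_uring ring;
  int err = io_uring_queue_init(8, &ring, 0); /* tiny queue, enough here */
  if (err < 0)
    return -err;
  struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
  io_uring_prep_writev(sqe, fd, sgv, sgvcnt, offset);
  io_uring_submit(&ring);

  struct io_uring_cqe *cqe;
  err = io_uring_wait_cqe(&ring, &cqe);
  int rc = (err < 0) ? -err : (cqe->res < 0) ? -cqe->res : 0;
  if (err == 0)
    io_uring_cqe_seen(&ring, cqe);
  io_uring_queue_exit(&ring);
  return rc; /* 0 on success, errno-style code otherwise */
}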
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) {
#if defined(_WIN32) || defined(_WIN64)
if (ior->last) {
for (ior_item_t *item = ior->pool; item <= ior->last;) {
if (!HasOverlappedIoCompleted(&item->ov))
CancelIoEx(ior->fd, &item->ov);
if (item->ov.hEvent && item->ov.hEvent != ior)
ior_put_event(ior, item->ov.hEvent);
size_t i = 1;
if ((item->single.iov_len & 1) == 0)
while (item->sgv[i].Buffer)
++i;
item = ior_next(item, i);
}
}
ior->async_waiting = INT_MAX;
ior->async_completed = 0;
ResetEvent(ior->async_done);
#endif /* Windows */
ior->slots_left = ior->allocated;
ior->last = nullptr;
}
static void ior_cleanup(osal_ioring_t *ior, const size_t since) {
osal_ioring_reset(ior);
#if defined(_WIN32) || defined(_WIN64)
for (size_t i = since; i < ior->event_stack; ++i)
CloseHandle(ior->event_pool[i]);
ior->event_stack = 0;
#else
(void)since;
#endif /* Windows */
}
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) {
assert(items > 0 && items < INT_MAX / sizeof(ior_item_t));
#if defined(_WIN32) || defined(_WIN64)
if (ior->state & IOR_STATE_LOCKED)
return MDBX_SUCCESS;
const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) &&
mdbx_SetFileIoOverlappedRange &&
items > 7;
const size_t ceiling =
useSetFileIoOverlappedRange
? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4)
: 4096;
const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling);
items = bytes / sizeof(ior_item_t);
#endif /* Windows */
if (items != ior->allocated) {
assert(items >= osal_ioring_used(ior));
if (items < ior->allocated)
ior_cleanup(ior, items);
#if defined(_WIN32) || defined(_WIN64)
void *ptr = osal_realloc(
ior->event_pool,
(items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE));
if (unlikely(!ptr))
return MDBX_ENOMEM;
ior->event_pool = ptr;
int err = osal_memalign_alloc(ceiling, bytes, &ptr);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (ior->pool) {
memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t));
osal_memalign_free(ior->pool);
}
#else
void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items);
if (unlikely(!ptr))
return MDBX_ENOMEM;
#endif
ior->pool = ptr;
if (items > ior->allocated)
memset(ior->pool + ior->allocated, 0,
sizeof(ior_item_t) * (items - ior->allocated));
ior->allocated = (unsigned)items;
ior->boundary = (char *)(ior->pool + ior->allocated);
#if defined(_WIN32) || defined(_WIN64)
if (useSetFileIoOverlappedRange) {
if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes))
ior->state += IOR_STATE_LOCKED;
else
return GetLastError();
}
#endif /* Windows */
}
return MDBX_SUCCESS;
}
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) {
if (ior->allocated)
ior_cleanup(ior, 0);
#if defined(_WIN32) || defined(_WIN64)
osal_memalign_free(ior->pool);
osal_free(ior->event_pool);
CloseHandle(ior->async_done);
#else
osal_free(ior->pool);
#endif
memset(ior, -1, sizeof(osal_ioring_t));
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) {
#if defined(_WIN32) || defined(_WIN64)
  return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
@@ -589,17 +1179,21 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
  case MDBX_OPEN_DXB_LAZY:
    DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
    break;
  case MDBX_OPEN_DXB_OVERLAPPED:
    FlagsAndAttributes |= FILE_FLAG_OVERLAPPED;
    /* fall through */
    __fallthrough;
  case MDBX_OPEN_DXB_DSYNC:
    CreationDisposition = OPEN_EXISTING;
    DesiredAccess |= GENERIC_WRITE | GENERIC_READ;
    FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH;
    break;
  case MDBX_OPEN_COPY:
    CreationDisposition = CREATE_NEW;
    ShareMode = 0;
    DesiredAccess |= GENERIC_WRITE;
    if (env->me_psize >= env->me_os_psize)
      FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING;
    break;
  case MDBX_OPEN_DELETE:
    CreationDisposition = OPEN_EXISTING;
@@ -878,28 +1472,30 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
  }
}

int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int sgvcnt,
                 uint64_t offset) {
  size_t expected = 0;
  for (int i = 0; i < sgvcnt; ++i)
    expected += iov[i].iov_len;
#if !MDBX_HAVE_PWRITEV
  size_t written = 0;
  for (int i = 0; i < sgvcnt; ++i) {
    int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    written += iov[i].iov_len;
    offset += iov[i].iov_len;
  }
  return (expected == written) ? MDBX_SUCCESS
                               : MDBX_EIO /* ERROR_WRITE_FAULT */;
#else
  int rc;
  intptr_t written;
  do {
    STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                      "libmdbx requires 64-bit file I/O on 64-bit systems");
    written = pwritev(fd, iov, sgvcnt, offset);
    if (likely(expected == (size_t)written))
      return MDBX_SUCCESS;
    rc = errno;
  } while (rc == EINTR);
@@ -1066,7 +1662,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) {
/*----------------------------------------------------------------------------*/

MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum osal_syncmode_bits mode_bits) {
  uint8_t *ptr = (uint8_t *)map->address + offset;


@@ -263,8 +263,138 @@ typedef union osal_srwlock {
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_HAVE_PWRITEV 0
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
#define MDBX_HAVE_PWRITEV 0
#else
#define MDBX_HAVE_PWRITEV 1
#endif
#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
/* FIXME: add checks for IOS versions, etc */
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
#define MDBX_HAVE_PWRITEV 1
#else
#define MDBX_HAVE_PWRITEV 0
#endif
#endif /* MDBX_HAVE_PWRITEV */
typedef struct ior_item {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
#define ior_svg_gap4terminator 1
#define ior_sgv_element FILE_SEGMENT_ELEMENT
#else
size_t offset;
#if MDBX_HAVE_PWRITEV
size_t sgvcnt;
#define ior_svg_gap4terminator 0
#define ior_sgv_element struct iovec
#endif /* MDBX_HAVE_PWRITEV */
#endif /* !Windows */
union {
MDBX_val single;
#if defined(ior_sgv_element)
ior_sgv_element sgv[1 + ior_svg_gap4terminator];
#endif /* ior_sgv_element */
};
} ior_item_t;
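/* Items are laid out back-to-back in the ring's pool: each occupies
 * sizeof(ior_item_t) plus additional ior_sgv_element slots for scatter/gather
 * entries beyond the first (plus a terminator slot on Windows), which is
 * exactly the stride that ior_next() computes in osal.c. */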
typedef struct osal_ioring {
unsigned slots_left;
unsigned allocated;
#if defined(_WIN32) || defined(_WIN64)
#define IOR_UNBUFFERED 1
#define IOR_OVERLAPPED 2
#define IOR_STATE_LOCKED 1
unsigned pagesize;
unsigned last_sgvcnt;
size_t last_bytes;
uint8_t flags, state, pagesize_ln2;
unsigned event_stack;
HANDLE *event_pool;
volatile LONG async_waiting;
volatile LONG async_completed;
HANDLE async_done;
#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#elif MDBX_HAVE_PWRITEV
unsigned last_bytes;
#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
#define ior_last_bytes(ior, item) (ior)->last_bytes
#else
#define ior_last_sgvcnt(ior, item) (1)
#define ior_last_bytes(ior, item) (item)->single.iov_len
#endif /* !Windows */
mdbx_filehandle_t fd;
ior_item_t *last;
ior_item_t *pool;
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not an ioring yet, but it is on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
#if defined(_WIN32) || defined(_WIN64)
unsigned flags,
#endif /* Windows */
mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
static inline unsigned osal_ioring_left(const osal_ioring_t *ior) {
return ior->slots_left;
}
static inline unsigned osal_ioring_used(const osal_ioring_t *ior) {
return ior->allocated - ior->slots_left;
}
static inline int osal_ioring_reserve(osal_ioring_t *ior, unsigned items,
size_t bytes) {
items = (items > 32) ? items : 32;
#if defined(_WIN32) || defined(_WIN64)
const unsigned npages = (unsigned)(bytes >> ior->pagesize_ln2);
items = (items > npages) ? items : npages;
#else
(void)bytes;
#endif
items = (items < 65536) ? items : 65536;
if (likely(ior->allocated >= items))
return MDBX_SUCCESS;
return osal_ioring_resize(ior, items);
}
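/* Illustrative only: a sketch of the intended life-cycle of an osal_ioring_t,
 * based on the declarations above. Error handling is simplified and the
 * page/offset scheme is hypothetical; osal_ioring_add() returning
 * MDBX_RESULT_TRUE (ring full) is treated as an error here. */
static int ioring_usage_sketch(mdbx_filehandle_t fd, char *pages[],
                               const size_t offsets[], size_t npages,
                               size_t pagesize) {
  osal_ioring_t ior;
  int err = osal_ioring_create(&ior,
#if defined(_WIN32) || defined(_WIN64)
                               IOR_OVERLAPPED,
#endif /* Windows */
                               fd);
  if (err != MDBX_SUCCESS)
    return err;
  err = osal_ioring_reserve(&ior, (unsigned)npages, npages * pagesize);
  for (size_t i = 0; err == MDBX_SUCCESS && i < npages; ++i)
    err = osal_ioring_add(&ior, offsets[i], pages[i], pagesize);
  if (err == MDBX_SUCCESS)
    err = osal_ioring_write(&ior).err;
  osal_ioring_reset(&ior);
  osal_ioring_destroy(&ior);
  return err;
}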
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
@@ -290,10 +420,12 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);

/* max bytes to write in one call */
#if defined(_WIN64)
#define MAX_WRITE UINT32_C(0x10000000)
#elif defined(_WIN32)
#define MAX_WRITE UINT32_C(0x04000000)
#else
#define MAX_WRITE UINT32_C(0x3f000000)
#endif

#if defined(__linux__) || defined(__gnu_linux__)
@@ -336,8 +468,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);

MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
                                    int sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
                                  uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
@@ -365,12 +496,15 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);

enum osal_openfile_purpose {
  MDBX_OPEN_DXB_READ,
  MDBX_OPEN_DXB_LAZY,
  MDBX_OPEN_DXB_DSYNC,
#if defined(_WIN32) || defined(_WIN64)
  MDBX_OPEN_DXB_OVERLAPPED,
#endif /* Windows */
  MDBX_OPEN_LCK,
  MDBX_OPEN_COPY,
  MDBX_OPEN_DELETE
};

MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
@@ -404,7 +538,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
@@ -692,6 +826,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* Windows */
#endif /* !__cplusplus */


@@ -71,7 +71,7 @@ void osal_setup(const std::vector<actor_config> &actors) {
  events.reserve(n);
  for (unsigned i = 0; i < n; ++i) {
    HANDLE hEvent = CreateEventW(NULL, TRUE, FALSE, NULL);
    if (!hEvent)
      failure_perror("CreateEvent()", GetLastError());
    hEvent = make_inheritable(hEvent);
@@ -79,22 +79,22 @@ void osal_setup(const std::vector<actor_config> &actors) {
    events[i] = hEvent;
  }

  hBarrierSemaphore = CreateSemaphoreW(NULL, 0, (LONG)actors.size(), NULL);
  if (!hBarrierSemaphore)
    failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError());
  hBarrierSemaphore = make_inheritable(hBarrierSemaphore);

  hBarrierEvent = CreateEventW(NULL, TRUE, FALSE, NULL);
  if (!hBarrierEvent)
    failure_perror("CreateEvent(BarrierEvent)", GetLastError());
  hBarrierEvent = make_inheritable(hBarrierEvent);

  hProgressActiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL);
  if (!hProgressActiveEvent)
    failure_perror("CreateEvent(ProgressActiveEvent)", GetLastError());
  hProgressActiveEvent = make_inheritable(hProgressActiveEvent);

  hProgressPassiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL);
  if (!hProgressPassiveEvent)
    failure_perror("CreateEvent(ProgressPassiveEvent)", GetLastError());
  hProgressPassiveEvent = make_inheritable(hProgressPassiveEvent);