diff --git a/TODO.md b/TODO.md index db853653..984b97ea 100644 --- a/TODO.md +++ b/TODO.md @@ -11,7 +11,6 @@ For the same reason ~~Github~~ is blacklisted forever. So currently most of the links are broken due to noted malicious ~~Github~~ sabotage. - - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/204). - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210). - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200). @@ -27,3 +26,4 @@ Done ---- - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). + - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). diff --git a/mdbx.h b/mdbx.h index 88550dd8..b10ab212 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2522,9 +2522,13 @@ struct MDBX_envinfo { uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ uint64_t wops; /**< Number of explicit write operations (not a pages) to a disk */ + uint64_t + msync; /**< Number of explicit msync-to-disk operations (not a pages) */ + uint64_t + fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */ uint64_t gcrtime_seconds16dot16; /**< Time spent loading and searching inside - GC (aka FreeDB) in 1/65536 of second. */ + GC (aka FreeDB) in 1/65536 of second */ } mi_pgop_stat; }; #ifndef __cplusplus diff --git a/src/base.h b/src/base.h index a927f805..1596d26a 100644 --- a/src/base.h +++ b/src/base.h @@ -63,7 +63,7 @@ #define SSIZE_MAX INTPTR_MAX #endif -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64) #define MDBX_WORDBITS 64 #else #define MDBX_WORDBITS 32 diff --git a/src/core.c b/src/core.c index 65e7ae40..d307b60b 100644 --- a/src/core.c +++ b/src/core.c @@ -3983,13 +3983,12 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, while (--npages) { iov[n] = iov[0]; if (++n == MDBX_COMMIT_PAGES) { - osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, - pgno2bytes(env, MDBX_COMMIT_PAGES)); + osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off); iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); n = 0; } } - osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); + osal_pwritev(env->me_lazy_fd, iov, n, iov_off); } } @@ -4318,139 +4317,155 @@ static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); } -struct iov_ctx { - unsigned iov_items; - size_t iov_bytes; - size_t iov_off; +typedef struct iov_ctx { + MDBX_env *env; + osal_ioring_t *ior; + int err; +#ifndef MDBX_NEED_WRITTEN_RANGE +#define MDBX_NEED_WRITTEN_RANGE 1 +#endif /* MDBX_NEED_WRITTEN_RANGE */ +#if MDBX_NEED_WRITTEN_RANGE pgno_t flush_begin; pgno_t flush_end; - struct iovec iov[MDBX_COMMIT_PAGES]; -}; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + uint64_t coherency_timestamp; +} iov_ctx_t; -static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) { - ctx->flush_begin = MAX_PAGENO; - ctx->flush_end = MIN_PAGENO; - ctx->iov_items = 0; - 
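
[Editorial aside, not part of the patch: the mdbx.h hunk above adds per-operation msync/fsync counters next to wops in MDBX_envinfo.mi_pgop_stat. A minimal sketch of reading them through the public mdbx_env_info_ex() API follows; it assumes the library was built with MDBX_ENABLE_PGOP_STAT, otherwise the counters stay zero.]

#include "mdbx.h"
#include <inttypes.h>
#include <stdio.h>

static void dump_sync_counters(const MDBX_env *env) {
  MDBX_envinfo info;
  /* txn == NULL: take a snapshot of the recent environment-wide state */
  if (mdbx_env_info_ex(env, NULL, &info, sizeof(info)) == MDBX_SUCCESS)
    printf("wops %" PRIu64 " msync %" PRIu64 " fsync %" PRIu64 "\n",
           info.mi_pgop_stat.wops, info.mi_pgop_stat.msync,
           info.mi_pgop_stat.fsync);
}
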
ctx->iov_bytes = 0; - ctx->iov_off = 0; - (void)txn; +__must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, + unsigned items, pgno_t npages) { + ctx->env = txn->mt_env; + ctx->ior = &txn->mt_env->me_ioring; + ctx->err = osal_ioring_reserve(ctx->ior, items, + pgno_align2os_bytes(txn->mt_env, npages)); + if (likely(ctx->err == MDBX_SUCCESS)) { +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = MAX_PAGENO; + ctx->flush_end = MIN_PAGENO; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + osal_ioring_reset(ctx->ior); + } + return ctx->err; } -static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) { - tASSERT(txn, ctx->iov_items == 0); -#if defined(__linux__) || defined(__gnu_linux__) - MDBX_env *const env = txn->mt_env; - if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. */ - osal_flush_incoherent_mmap( - env->me_map + pgno2bytes(env, ctx->flush_begin), - pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); -#endif /* Linux */ +static inline bool iov_empty(const iov_ctx_t *ctx) { + return osal_ioring_used(ctx->ior) == 0; } -static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) { - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - tASSERT(txn, ctx->iov_items > 0); +static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, + size_t bytes) { + MDBX_env *const env = ctx->env; + eASSERT(env, (env->me_flags & MDBX_WRITEMAP) == 0); - MDBX_env *const env = txn->mt_env; - int rc; - if (likely(ctx->iov_items == 1)) { - eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); - rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, - ctx->iov_off); - } else { - rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, - ctx->iov_bytes); - } + MDBX_page *wp = (MDBX_page *)data; + eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); + eASSERT(env, bytes2pgno(env, bytes) >= (IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u)); + eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); - if (unlikely(rc != MDBX_SUCCESS)) - ERROR("Write error: %s", mdbx_strerror(rc)); - else { - VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, - ctx->iov_bytes); - MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off, - ctx->iov_bytes); - } - - unsigned iov_items = ctx->iov_items; -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items; -#endif /* MDBX_ENABLE_PGOP_STAT */ - ctx->iov_items = 0; - ctx->iov_bytes = 0; - - uint64_t timestamp = 0; - for (unsigned i = 0; i < iov_items; i++) { - MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base; - const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno); + if (likely(ctx->err == MDBX_SUCCESS)) { + VALGRIND_MAKE_MEM_DEFINED(env->me_map + offset, bytes); + MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map + offset, bytes); + osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize); + const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset); /* check with timeout as the workaround * for todo4recovery://erased_by_github/libmdbx/issues/269 */ - while (likely(rc == MDBX_SUCCESS) && - unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { - if (!timestamp) { - iov_done(txn, ctx); - WARNING( - "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - } - if (coherency_timeout(×tamp, wp->mp_pgno) != MDBX_RESULT_TRUE) - rc = MDBX_PROBLEM; + if (unlikely(memcmp(wp, rp, bytes))) { + ctx->coherency_timestamp = 0; + WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + do + if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno) != + MDBX_RESULT_TRUE) { + ctx->err = MDBX_PROBLEM; + break; + } + while (unlikely(memcmp(wp, rp, bytes))); } - dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); } - return rc; + + if (likely(bytes == env->me_psize)) + dpage_free(env, wp, 1); + else { + do { + eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); + eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); + unsigned npages = IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u; + size_t chunk = pgno2bytes(env, npages); + eASSERT(env, bytes >= chunk); + dpage_free(env, wp, npages); + wp = (MDBX_page *)((char *)wp + chunk); + offset += chunk; + bytes -= chunk; + } while (bytes); + } } -static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, - unsigned npages) { +static void iov_complete(iov_ctx_t *ctx) { + if ((ctx->env->me_flags & MDBX_WRITEMAP) == 0) + osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages); + osal_ioring_reset(ctx->ior); +} + +__must_check_result static int iov_write(iov_ctx_t *ctx) { + eASSERT(ctx->env, !iov_empty(ctx)); + osal_ioring_write_result_t r = osal_ioring_write(ctx->ior); +#if MDBX_ENABLE_PGOP_STAT + ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; +#endif /* MDBX_ENABLE_PGOP_STAT */ + ctx->err = r.err; + if (unlikely(ctx->err != MDBX_SUCCESS)) + ERROR("Write error: %s", mdbx_strerror(ctx->err)); + iov_complete(ctx); + return ctx->err; +} + +__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, + MDBX_page *dp, unsigned npages) { MDBX_env *const env = txn->mt_env; + tASSERT(txn, ctx->err == MDBX_SUCCESS); tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); tASSERT(txn, IS_MODIFIABLE(txn, dp)); tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); - ctx->flush_begin = - (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; - ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) - ? ctx->flush_end - : dp->mp_pgno + npages; - env->me_lck->mti_unsynced_pages.weak += npages; - if (IS_SHADOWED(txn, dp)) { tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; tASSERT(txn, IS_SPILLED(txn, dp)); - const size_t size = pgno2bytes(env, npages); - if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || - ctx->iov_items == ARRAY_LENGTH(ctx->iov) || - ctx->iov_bytes + size > MAX_WRITE) { - if (ctx->iov_items) { - int err = iov_write(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#if defined(__linux__) || defined(__gnu_linux__) - if (linux_kernel_version >= 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. */ -#endif /* Linux */ - osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, - env->me_os_psize); + int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + ctx->err = err; + if (unlikely(err != MDBX_RESULT_TRUE)) { + iov_complete(ctx); + return err; } - ctx->iov_off = pgno2bytes(env, dp->mp_pgno); + err = iov_write(ctx); + tASSERT(txn, iov_empty(ctx)); + if (likely(err == MDBX_SUCCESS)) { + err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + iov_complete(ctx); + return ctx->err = err; + } + } + tASSERT(txn, ctx->err == MDBX_SUCCESS); } - ctx->iov[ctx->iov_items].iov_base = (void *)dp; - ctx->iov[ctx->iov_items].iov_len = size; - ctx->iov_items += 1; - ctx->iov_bytes += size; } else { tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); } + +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = + (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; + ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) + ? 
ctx->flush_end + : dp->mp_pgno + npages; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + env->me_lck->mti_unsynced_pages.weak += npages; return MDBX_SUCCESS; } -static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, +static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, unsigned npages) { tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); pgno_t pgno = dp->mp_pgno; @@ -4613,13 +4628,18 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, txn->tw.dirtyroom, need); tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); - struct iov_ctx ctx; - iov_init(txn, &ctx); int rc = MDBX_SUCCESS; if (txn->mt_flags & MDBX_WRITEMAP) { MDBX_dpl *const dl = txn->tw.dirtylist; const unsigned span = dl->length - txn->tw.loose_count; txn->tw.dirtyroom += span; + + iov_ctx_t ctx; + rc = iov_init(txn, &ctx, wanna_spill, + dl->pages_including_loose - txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + unsigned r, w; for (w = 0, r = 1; r <= dl->length; ++r) { MDBX_page *dp = dl->items[r].ptr; @@ -4749,6 +4769,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, prio2spill, prio2adjacent, spillable, wanna_spill, amount); tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); + iov_ctx_t ctx; + rc = iov_init(txn, &ctx, amount, + txn->tw.dirtylist->pages_including_loose - + txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + unsigned prev_prio = 256; unsigned r, w, prio; pgno_t spilled_entries = 0, spilled_npages = 0; @@ -4814,12 +4841,10 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, txn->tw.dirtylist->pages_including_loose -= spilled_npages; tASSERT(txn, dirtylist_check(txn)); - if (ctx.iov_items) { - /* iov_page() frees dirty-pages and reset iov_items in case of failure. 
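
[Editorial sketch, not part of the patch: the retry contract of the new ioring as iov_page() uses it above. osal_ioring_add() returns MDBX_RESULT_TRUE when the ring has no free slot, which asks the caller to submit the accumulated writes and re-add the page; the hypothetical write_dirty_page() below stands in for iov_page() with the assertions stripped.]

static int write_dirty_page(iov_ctx_t *ctx, MDBX_env *env, MDBX_page *dp,
                            unsigned npages) {
  int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp,
                            pgno2bytes(env, npages));
  if (err == MDBX_RESULT_TRUE) { /* ring is full */
    err = iov_write(ctx);        /* submit, run completion callbacks, reset */
    if (err == MDBX_SUCCESS)
      err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp,
                            pgno2bytes(env, npages));
  }
  return err;
}
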
*/ + if (!iov_empty(&ctx)) { tASSERT(txn, rc == MDBX_SUCCESS); - rc = iov_write(txn, &ctx); + rc = iov_write(&ctx); } - if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4827,9 +4852,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, txn->mt_flags |= MDBX_TXN_SPILLS; NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries, txn->tw.dirtyroom); - iov_done(txn, &ctx); } else { - tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); for (unsigned i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", @@ -5610,7 +5634,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), MDBX_SYNC_NONE); @@ -5743,74 +5767,71 @@ __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, true); } -static int meta_unsteady(MDBX_env *env, const txnid_t last_steady, - MDBX_meta *const meta, mdbx_filehandle_t fd) { +static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, + const pgno_t pgno) { + MDBX_meta *const meta = METAPAGE(env, pgno); + const txnid_t txnid = constmeta_txnid(meta); + if (unlikely(err != MDBX_SUCCESS) || !META_IS_STEADY(meta) || + !(txnid < early_than)) + return err; + + WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno); const uint64_t wipe = MDBX_DATASIGN_NONE; - if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) { - WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, - data_page(meta)->mp_pgno); - if (env->me_flags & MDBX_WRITEMAP) - unaligned_poke_u64(4, meta->mm_sign, wipe); - else - return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign), - (uint8_t *)&meta->mm_sign - env->me_map); - } - return MDBX_SUCCESS; -} - -__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) { - MDBX_env *const env = txn->mt_env; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? 
env->me_dsync_fd - : env->me_lazy_fd; - int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - + const void *ptr = &wipe; + size_t bytes = sizeof(meta->mm_sign), + offset = (uint8_t *)&meta->mm_sign - env->me_map; if (env->me_flags & MDBX_WRITEMAP) { + unaligned_poke_u64(4, meta->mm_sign, wipe); osal_flush_incoherent_cpu_writeback(); err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; - } else { - if (fd == env->me_lazy_fd) { -#if MDBX_USE_SYNCFILERANGE - static bool syncfilerange_unavailable; - if (!syncfilerange_unavailable && - sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS), - SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) { - err = errno; - if (ignore_enosys(err) == MDBX_RESULT_TRUE) - syncfilerange_unavailable = true; - } - if (syncfilerange_unavailable) -#endif /* MDBX_USE_SYNCFILERANGE */ - err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) - return err; } - osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); + ptr = data_page(meta); + offset = (uint8_t *)ptr - env->me_map; + bytes = env->me_psize; } +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + err = osal_pwrite(env->me_fd4meta, ptr, bytes, offset); + if (likely(err == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + return err; +} + +__cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) { + MDBX_env *const env = txn->mt_env; + int err = MDBX_SUCCESS; + + /* wipe steady metas strictly earlier than last_steady */ + err = meta_unsteady(err, env, last_steady, 0); + err = meta_unsteady(err, env, last_steady, 1); + err = meta_unsteady(err, env, last_steady, 2); + + /* then the last_steady meta itself */ + err = meta_unsteady(err, env, last_steady + 1, 0); + err = meta_unsteady(err, env, last_steady + 1, 1); + err = meta_unsteady(err, env, last_steady + 1, 2); + + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); + /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); txn->tw.troika = meta_tap(env); for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child) if (scan != txn) scan->tw.troika = txn->tw.troika; - return MDBX_SUCCESS; + return err; } //------------------------------------------------------------------------------ @@ -7052,6 +7073,40 @@ fail: return rc; } +static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { + eASSERT(env, atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != + (uint32_t)head.txnid); + /* This function may be called (among other cases) even when (env->me_flags & + * MDBX_NOMETASYNC) == 0 and env->me_fd4meta == env->me_dsync_fd, e.g. when + * the previous transaction was committed with the MDBX_NOMETASYNC flag. 
*/ + + int rc = MDBX_RESULT_TRUE; + if (env->me_flags & MDBX_WRITEMAP) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(head.ptr_c); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + (uint8_t *)page - env->me_map); + + if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + } else { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + + if (likely(rc == MDBX_SUCCESS)) + env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)head.txnid; + return rc; +} + __cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { bool locked = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; @@ -7104,7 +7159,7 @@ retry:; int err; /* pre-sync to avoid latency for writer */ - if (unsynced_pages > /* FIXME: define threshold */ 16 && + if (unsynced_pages > /* FIXME: define threshold */ 42 && (flags & MDBX_SAFE_NOSYNC) == 0) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { @@ -7173,19 +7228,8 @@ retry:; /* LY: sync meta-pages if MDBX_NOMETASYNC enabled * and someone was not synced above. */ if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head.txnid) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = (flags & MDBX_WRITEMAP) - ? osal_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ) - : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (likely(rc == MDBX_SUCCESS)) - atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid, - mo_Relaxed); - } + (uint32_t)head.txnid) + rc = meta_sync(env, head); bailout: if (locked) @@ -7628,7 +7672,8 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, __cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { if (likely(timestamp && *timestamp == 0)) *timestamp = osal_monotime(); - else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) { + else if (unlikely(!timestamp || osal_monotime() - *timestamp > + osal_16dot16_to_monotime(65536 / 10))) { if (pgno) ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); @@ -9902,7 +9947,7 @@ bailout: return rc; } -static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { +static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { MDBX_dpl *const dl = (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); int rc = MDBX_SUCCESS; @@ -9919,10 +9964,9 @@ static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { break; } - if (ctx->iov_items) { - /* iov_page() frees dirty-pages and reset iov_items in case of failure. 
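
[Editorial sketch, not part of the patch: how the meta_sync() helper above is reached from the public API. Under MDBX_NOMETASYNC a commit leaves the meta-page unsynced and mti_meta_sync_txnid stale, so either a later commit or an explicit sync catches it up. Error handling is minimal; path and mode are placeholders.]

#include "mdbx.h"

static int open_and_make_durable(void) {
  MDBX_env *env = NULL;
  int rc = mdbx_env_create(&env);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_env_open(env, "./testdb", MDBX_NOMETASYNC, 0640);
  /* ... committed write-transactions now skip syncing the meta-page ... */
  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_sync(env); /* forces the meta_sync() path when stale */
  mdbx_env_close(env);
  return rc;
}
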
*/ + if (!iov_empty(ctx)) { tASSERT(txn, rc == MDBX_SUCCESS); - rc = iov_write(txn, ctx); + rc = iov_write(ctx); } while (r <= dl->length) @@ -10568,42 +10612,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - struct iov_ctx write_ctx; - iov_init(txn, &write_ctx); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + iov_ctx_t write_ctx; + rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, + mo_Relaxed) != (uint32_t)head.txnid) { + /* sync prev meta */ + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + rc = txn_write(txn, &write_ctx); - if (likely(rc == MDBX_SUCCESS)) - iov_done(txn, &write_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ ts_3 = latency ? osal_monotime() : 0; - if (likely(rc == MDBX_SUCCESS)) { - const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - MDBX_meta meta; - memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); - meta.mm_extra_flags = head.ptr_c->mm_extra_flags; - meta.mm_validator_id = head.ptr_c->mm_validator_id; - meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; - unaligned_poke_u64(4, meta.mm_pages_retired, - unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + - MDBX_PNL_SIZE(txn->tw.retired_pages)); - meta.mm_geo = txn->mt_geo; - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_canary = txn->mt_canary; + MDBX_meta meta; + memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); + meta.mm_extra_flags = head.ptr_c->mm_extra_flags; + meta.mm_validator_id = head.ptr_c->mm_validator_id; + meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; + unaligned_poke_u64(4, meta.mm_pages_retired, + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + + MDBX_PNL_SIZE(txn->tw.retired_pages)); + meta.mm_geo = txn->mt_geo; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_canary = txn->mt_canary; - txnid_t commit_txnid = txn->mt_txnid; + txnid_t commit_txnid = txn->mt_txnid; #if MDBX_ENABLE_BIGFOOT - if (gcu_ctx.bigfoot > txn->mt_txnid) { - commit_txnid = gcu_ctx.bigfoot; - TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, - (unsigned)(commit_txnid - txn->mt_txnid)); - } -#endif - meta_set_txnid(env, &meta, commit_txnid); - - rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, - &meta, &txn->tw.troika); + if (gcu_ctx.bigfoot > txn->mt_txnid) { + commit_txnid = gcu_ctx.bigfoot; + TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, + (unsigned)(commit_txnid - txn->mt_txnid)); } +#endif + meta_set_txnid(env, &meta, commit_txnid); + + rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, + &meta, &txn->tw.troika); ts_4 = latency ? 
osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; @@ -10894,11 +10949,11 @@ static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, __cold static int read_header(MDBX_env *env, MDBX_meta *dest, const int lck_exclusive, const mdbx_mode_t mode_bits) { + memset(dest, 0, sizeof(MDBX_meta)); int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(dest, 0, sizeof(MDBX_meta)); unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); rc = MDBX_CORRUPTED; @@ -11200,7 +11255,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; + unsigned sync_op = 0; if ((flags & MDBX_SAFE_NOSYNC) == 0) { + sync_op = 1; mode_bits = MDBX_SYNC_DATA; if (pending->mm_geo.next > meta_prefer_steady(env, troika).ptr_c->mm_geo.now) @@ -11209,7 +11266,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, mode_bits |= MDBX_SYNC_IODQ; } #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) rc = @@ -11298,9 +11355,6 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); ENSURE(env, target == head.ptr_c || constmeta_txnid(target) < pending->unsafe_txnid); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) { jitter4testing(true); if (likely(target != head.ptr_c)) { @@ -11338,34 +11392,37 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, osal_flush_incoherent_cpu_writeback(); jitter4testing(true); /* sync meta-pages */ - rc = - osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) + ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { - const MDBX_meta undo_meta = *target; - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? env->me_dsync_fd - : env->me_lazy_fd; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_pwrite(fd, pending, sizeof(MDBX_meta), + const MDBX_meta undo_meta = *target; + rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. 
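
[Editorial sketch, not part of the patch: the failure-undo shape used here, factored out. Only osal_pwrite() from the patch is assumed; the helper name is hypothetical.]

static int pwrite_with_undo(mdbx_filehandle_t fd, const void *new_data,
                            const void *old_data, size_t bytes,
                            uint64_t offset) {
  int rc = osal_pwrite(fd, new_data, bytes, offset);
  if (rc != MDBX_SUCCESS)
    /* the pagecache may already hold the (possibly torn) new data, so
     * push the previous image back to keep it from being used */
    (void)osal_pwrite(fd, old_data, bytes, offset);
  return rc;
}
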
*/ - osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta), + osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); goto fail; } osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ - if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { + if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (rc != MDBX_SUCCESS) goto undo; @@ -11382,7 +11439,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, goto fail; } env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)pending->unsafe_txnid - + pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); *troika = meta_tap(env); @@ -11528,9 +11585,11 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_maxreaders = DEFAULT_READERS; env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_lazy_fd = INVALID_HANDLE_VALUE; - env->me_dsync_fd = INVALID_HANDLE_VALUE; - env->me_lfd = INVALID_HANDLE_VALUE; + env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data = +#if defined(_WIN32) || defined(_WIN64) + env->me_overlapped_fd = +#endif /* Windows */ + env->me_lfd = INVALID_HANDLE_VALUE; env->me_pid = osal_getpid(); env->me_stuck_meta = -1; @@ -12863,10 +12922,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env, if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0) return MDBX_SUCCESS; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, model->mm_geo.next), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -12877,18 +12936,26 @@ __cold static int __must_check_result override_meta(MDBX_env *env, * clearing consistency flag by mdbx_meta_update_begin() */ memcpy(pgno2page(env, target), page, env->me_psize); osal_flush_incoherent_cpu_writeback(); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? 
env->me_dsync_fd - : env->me_lazy_fd; - rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); - if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + pgno2bytes(env, target)); + if (rc == MDBX_SUCCESS && env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); } - osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); eASSERT(env, !env->me_txn && !env->me_txn0); return rc; } @@ -13254,14 +13321,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (rc != MDBX_SUCCESS) goto bailout; - eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, - &env->me_dsync_fd, 0); - ENSURE(env, - (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); - } - #if MDBX_LOCKING == MDBX_LOCKING_SYSV env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); if (env->me_sysv_ipc.key == -1) { @@ -13270,7 +13329,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } #endif /* MDBX_LOCKING */ -#if !(defined(_WIN32) || defined(_WIN64)) + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. */ + const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); + osal_fseek(env->me_lazy_fd, safe_parking_lot_offset); + + env->me_fd4data = env->me_fd4meta = env->me_lazy_fd; +#if defined(_WIN32) || defined(_WIN64) + uint8_t ior_flags = 0; + if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { + ior_flags = IOR_OVERLAPPED; + rc = + osal_openfile(MDBX_OPEN_DXB_OVERLAPPED, + env, env_pathname.dxb, &env->me_overlapped_fd, 0); + if (rc != MDBX_SUCCESS) + goto bailout; + env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); + if (!env->me_data_lock_event) { + rc = (int)GetLastError(); + goto bailout; + } + env->me_fd4data = env->me_overlapped_fd; + osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); + } +#else if (mode == 0) { /* pickup mode for lck-file */ struct stat st; @@ -13291,13 +13373,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = lck_rc; goto bailout; } - - /* Set the position in files outside of the data to avoid corruption - * due to erroneous use of file descriptors in the application code. 
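
[Editorial aside, not part of the patch: the "parking" trick above in plain POSIX terms. A stray write() on a leaked descriptor goes to the current file position, so parking it at 0x7fffFFFF80000000 (still below 2^63, hence a valid signed off_t) makes such a write land far past the database pages instead of over page 0. A hedged, hypothetical equivalent of what osal_fseek() does here:]

#include <stdint.h>
#include <unistd.h>

static void park_file_position(int fd) {
  (void)lseek(fd, (off_t)UINT64_C(0x7fffFFFF80000000), SEEK_SET);
}
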
*/ - osal_fseek(env->me_lfd, UINT64_C(1) << 63); - osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63); - if (env->me_dsync_fd != INVALID_HANDLE_VALUE) - osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + osal_fseek(env->me_lfd, safe_parking_lot_offset); const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; @@ -13305,6 +13381,20 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_LIFORECLAIM | MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; + eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 && + (env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) { + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + &env->me_dsync_fd, 0); + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + if ((flags & MDBX_NOMETASYNC) == 0) + env->me_fd4meta = env->me_dsync_fd; + if (env->me_fd4data == env->me_lazy_fd) + env->me_fd4data = env->me_dsync_fd; + osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); + } + } + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { @@ -13413,6 +13503,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } else rc = MDBX_ENOMEM; } + if (rc == MDBX_SUCCESS) + rc = osal_ioring_create(&env->me_ioring, +#if defined(_WIN32) || defined(_WIN64) + ior_flags, +#endif /* Windows */ + env->me_fd4data); } #if MDBX_DEBUG @@ -13469,6 +13565,8 @@ __cold static int env_close(MDBX_env *env) { const int rc = lcklist_detach_locked(env); lcklist_unlock(); + osal_ioring_destroy(&env->me_ioring); + if (env->me_map) { osal_munmap(&env->me_dxb_mmap); #ifdef MDBX_USE_VALGRIND @@ -13477,6 +13575,14 @@ __cold static int env_close(MDBX_env *env) { #endif } +#if defined(_WIN32) || defined(_WIN64) + if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) { + CloseHandle(env->me_data_lock_event); + CloseHandle(env->me_overlapped_fd); + env->me_overlapped_fd = INVALID_HANDLE_VALUE; + } +#endif /* Windows */ + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { (void)osal_closefile(env->me_dsync_fd); env->me_dsync_fd = INVALID_HANDLE_VALUE; @@ -13578,7 +13684,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { ? MDBX_SUCCESS : rc; } -#endif +#endif /* Windows */ } eASSERT(env, env->me_signature.weak == 0); @@ -20528,6 +20634,10 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.msync = + atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); + arg->mi_pgop_stat.fsync = + atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16( atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); #else diff --git a/src/internals.h b/src/internals.h index 74137815..05f7393f 100644 --- a/src/internals.h +++ b/src/internals.h @@ -591,6 +591,10 @@ typedef struct { MDBX_atomic_uint64_t gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The unit/scale is platform-depended, see osal_monotime(). 
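
[Editorial sketch, not part of the patch: the assignment of the descriptor roles declared just below (me_fd4meta, me_fd4data), condensed from the mdbx_env_openW() hunks above. me_fd4data is an alias for me_ioring.fd; names and control flow follow the patch, but this standalone digest is an assumption-laden summary, not library code.]

static void pick_descriptor_roles(MDBX_env *env, MDBX_env_flags_t flags) {
  env->me_fd4data = env->me_fd4meta = env->me_lazy_fd;
#if defined(_WIN32) || defined(_WIN64)
  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0)
    env->me_fd4data = env->me_overlapped_fd; /* durable mode: overlapped I/O */
#endif /* Windows */
  if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
    if ((flags & MDBX_NOMETASYNC) == 0)
      env->me_fd4meta = env->me_dsync_fd; /* meta writes carry O_DSYNC */
    if (env->me_fd4data == env->me_lazy_fd)
      env->me_fd4data = env->me_dsync_fd;
  }
}
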
*/ + MDBX_atomic_uint64_t + msync; /* Number of explicit msync/flush-to-disk operations */ + MDBX_atomic_uint64_t + fsync; /* Number of explicit fsync/flush-to-disk operations */ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -1143,7 +1147,11 @@ struct MDBX_env { osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd - mdbx_filehandle_t me_dsync_fd; +#define me_fd4data me_ioring.fd + mdbx_filehandle_t me_dsync_fd, me_fd4meta; +#if defined(_WIN32) || defined(_WIN64) + HANDLE me_overlapped_fd, me_data_lock_event; +#endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -1222,6 +1230,7 @@ struct MDBX_env { unsigned me_dp_reserve_len; /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; + osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) osal_srwlock_t me_remap_guard; @@ -1609,20 +1618,24 @@ ceil_powerof2(size_t value, size_t granularity) { } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned -log2n_powerof2(size_t value) { - assert(value > 0 && value < INT32_MAX && is_powerof2(value)); - assert((value & -(int32_t)value) == value); -#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) - return __builtin_ctzl(value); +log2n_powerof2(size_t value_uintptr) { + assert(value_uintptr > 0 && value_uintptr < INT32_MAX && + is_powerof2(value_uintptr)); + assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr); + const uint32_t value_uint32 = (uint32_t)value_uintptr; +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz) + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned)); + return __builtin_ctz(value_uint32); #elif defined(_MSC_VER) unsigned long index; - _BitScanForward(&index, (unsigned long)value); + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long)); + _BitScanForward(&index, value_uint32); return index; #else static const uint8_t debruijn_ctz32[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27]; + return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27]; #endif } diff --git a/src/lck-windows.c b/src/lck-windows.c index 7b833773..7038854e 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -112,32 +112,73 @@ static #define LCK_WAITFOR 0 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY -static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset, - size_t bytes) { +static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags, + uint64_t offset, size_t bytes) { + TRACE("lock>>: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64 + " >>", + fd, event, flags, offset, bytes); OVERLAPPED ov; - ov.hEvent = 0; + ov.Internal = 0; + ov.InternalHigh = 0; + ov.hEvent = event; ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); - return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); + if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 + ", bytes %" PRId64 " << %s", + fd, event, flags, offset, bytes, "done"); + return MDBX_SUCCESS; + } + + DWORD rc = GetLastError(); + if (rc == ERROR_IO_PENDING) { + if (event) { + if (GetOverlappedResult(fd, &ov, &rc, true)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 + ", bytes %" PRId64 " << %s", + fd, event, flags, offset, bytes, 
"overlapped-done"); + return MDBX_SUCCESS; + } + rc = GetLastError(); + } else + CancelIo(fd); + } + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64 + " << err %d", + fd, event, flags, offset, bytes, rc); + return (int)rc; } -static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset, - size_t bytes) { +static __inline int flock(HANDLE fd, DWORD flags, uint64_t offset, + size_t bytes) { + return flock_with_event(fd, 0, flags, offset, bytes); +} + +static __inline int flock_data(const MDBX_env *env, DWORD flags, + uint64_t offset, size_t bytes) { + return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags, + offset, bytes); +} + +static int funlock(mdbx_filehandle_t fd, uint64_t offset, size_t bytes) { + TRACE("unlock: fd %p, offset %" PRId64 ", bytes %" PRId64, fd, offset, bytes); return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, - HIGH_DWORD(bytes)); + HIGH_DWORD(bytes)) + ? MDBX_SUCCESS + : (int)GetLastError(); } /*----------------------------------------------------------------------------*/ /* global `write` lock for write-txt processing, * exclusive locking both meta-pages) */ -#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1)) -#define LCK_META_OFFSET 0 -#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS) -#define LCK_BODY_OFFSET LCK_META_LEN -#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET) -#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN -#define LCK_WHOLE 0, LCK_MAXLEN +#ifdef _WIN64 +#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000) +#else +#define DXB_MAXLEN UINT32_C(0x7ff00000) +#endif +#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN +#define DXB_WHOLE 0, DXB_MAXLEN int mdbx_txn_lock(MDBX_env *env, bool dontwait) { if (dontwait) { @@ -155,24 +196,27 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } - if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_lazy_fd, - dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) - : (LCK_EXCLUSIVE | LCK_WAITFOR), - LCK_BODY)) + if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS; - int rc = (int)GetLastError(); + + int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) + : (LCK_EXCLUSIVE | LCK_WAITFOR), + DXB_BODY); + if (rc == MDBX_SUCCESS) + return rc; + LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; } void mdbx_txn_unlock(MDBX_env *env) { - int rc = (env->me_flags & MDBX_EXCLUSIVE) - ? TRUE - : funlock(env->me_lazy_fd, LCK_BODY); + if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { + int err = funlock(env->me_fd4data, DXB_BODY); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); + } LeaveCriticalSection(&env->me_windowsbug_lock); - if (!rc) - mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); } /*----------------------------------------------------------------------------*/ @@ -193,32 +237,32 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { /* transition from S-? (used) to S-E (locked), * e.g. 
exclusive lock upper-part */ - if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) + if (env->me_flags & MDBX_EXCLUSIVE) + return MDBX_SUCCESS; + + int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc == MDBX_SUCCESS) return MDBX_SUCCESS; - int rc = (int)GetLastError(); osal_srwlock_ReleaseShared(&env->me_remap_guard); return rc; } MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { - if (env->me_lfd != INVALID_HANDLE_VALUE) { + if (env->me_lfd != INVALID_HANDLE_VALUE && + (env->me_flags & MDBX_EXCLUSIVE) == 0) { /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ - if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && - !funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); + int err = funlock(env->me_lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); } osal_srwlock_ReleaseShared(&env->me_remap_guard); } MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { - return flock(fd, - wait ? LCK_EXCLUSIVE | LCK_WAITFOR - : LCK_EXCLUSIVE | LCK_DONTWAIT, - 0, LCK_MAXLEN) - ? MDBX_SUCCESS - : (int)GetLastError(); + return flock( + fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, + DXB_MAXLEN); } static int suspend_and_append(mdbx_handle_array_t **array, @@ -386,40 +430,36 @@ static void lck_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ - while (funlock(env->me_lfd, LCK_LOWER)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_lfd, LCK_LOWER); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_lfd, LCK_UPPER)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_lfd, LCK_UPPER); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); } - if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + if (env->me_fd4data != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel * releases such locks via deferred queues) */ - while (funlock(env->me_lazy_fd, LCK_BODY)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_fd4data, DXB_BODY); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_lazy_fd, LCK_WHOLE)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_fd4data, DXB_WHOLE); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); } } @@ -428,56 +468,55 @@ static void lck_unlock(MDBX_env *env) { * or as 'used' (S-? and returns MDBX_RESULT_FALSE). * Otherwise returns an error. */ static int internal_seize_lck(HANDLE lfd) { - int rc; assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? 
(free), get ?-E (middle) */ jitter4testing(false); - if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { - rc = (int)GetLastError() /* 2) something went wrong, give up */; + int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; } /* 3) now on ?-E (middle), try E-E (exclusive-write) */ jitter4testing(false); - if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) + rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc == MDBX_SUCCESS) return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; /* 5) still on ?-E (middle) */ - rc = (int)GetLastError(); jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ - if (!funlock(lfd, LCK_UPPER)) + rc = funlock(lfd, LCK_UPPER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", - (int)GetLastError()); + rc); return rc; } /* 7) still on ?-E (middle), try S-E (locked) */ jitter4testing(false); - rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE - : (int)GetLastError(); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); jitter4testing(false); - if (rc != MDBX_RESULT_FALSE) + if (rc != MDBX_SUCCESS) ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transition to S-? (used) or ?-? (free) */ - if (!funlock(lfd, LCK_UPPER)) + int err = funlock(lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, - "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError()); + "X-E(locked/middle) >> X-?(used/free)", err); /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ return rc; } MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { - int rc; - - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); + assert(env->me_fd4data != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_RESULT_TRUE /* nope since files were must be opened non-shareable */ @@ -486,15 +525,13 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. on read-only filesystem) */ jitter4testing(false); - if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { - rc = (int)GetLastError(); + int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE); + if (rc != MDBX_SUCCESS) ERROR("%s, err %u", "without-lck", rc); - return rc; - } - return MDBX_RESULT_FALSE; + return rc; } - rc = internal_seize_lck(env->me_lfd); + int rc = internal_seize_lck(env->me_lfd); jitter4testing(false); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. @@ -503,17 +540,18 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { * - we need an exclusive lock for do so; * - we can't lock meta-pages, otherwise other process could get an error * while opening db in valid (non-conflict) mode. 
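
[Editorial sketch, not part of the patch: the bare Win32 pattern that flock_with_event() above wraps. With an event in the OVERLAPPED, LockFileEx() may return ERROR_IO_PENDING and is then awaited via GetOverlappedResult(); without an event the pending request is cancelled, as in the patch. The helper name is hypothetical.]

#include <windows.h>

static DWORD lock_range_sync(HANDLE fd, HANDLE event, DWORD flags,
                             uint64_t offset, uint64_t bytes) {
  OVERLAPPED ov = {0};
  ov.hEvent = event;
  ov.Offset = (DWORD)offset;
  ov.OffsetHigh = (DWORD)(offset >> 32);
  if (LockFileEx(fd, flags, 0, (DWORD)bytes, (DWORD)(bytes >> 32), &ov))
    return ERROR_SUCCESS;
  DWORD err = GetLastError(), transferred;
  if (err != ERROR_IO_PENDING)
    return err;
  if (!event) {
    CancelIo(fd); /* no event to wait on: abandon the pending lock request */
    return err;
  }
  return GetOverlappedResult(fd, &ov, &transferred, TRUE /* wait */)
             ? ERROR_SUCCESS
             : GetLastError();
}
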
*/ - if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { - rc = (int)GetLastError(); - ERROR("%s, err %u", "lock-against-without-lck", rc); + int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE); + if (err != MDBX_SUCCESS) { + ERROR("%s, err %u", "lock-against-without-lck", err); jitter4testing(false); lck_unlock(env); - } else { - jitter4testing(false); - if (!funlock(env->me_lazy_fd, LCK_BODY)) - mdbx_panic("%s(%s) failed: err %u", __func__, - "unlock-against-without-lck", (int)GetLastError()); + return err; } + jitter4testing(false); + err = funlock(env->me_fd4data, DXB_WHOLE); + if (err != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, + "unlock-against-without-lck", err); } return rc; @@ -521,28 +559,31 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { /* Transite from exclusive-write state (E-E) to used (S-?) */ - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); + assert(env->me_fd4data != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) + int rc = funlock(env->me_lfd, LCK_LOWER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, - "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError()); + "E-E(exclusive-write) >> ?-E(middle)", rc); /* 2) now at ?-E (middle), transition to S-E (locked) */ - if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - int rc = (int)GetLastError() /* 3) something went wrong, give up */; + rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 3) something went wrong, give up */; ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; } /* 4) got S-E (locked), continue transition to S-? (used) */ - if (!funlock(env->me_lfd, LCK_UPPER)) + rc = funlock(env->me_lfd, LCK_UPPER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", - (int)GetLastError()); + rc); return MDBX_SUCCESS /* 5) now at S-? (used), done */; } @@ -555,24 +596,26 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; - int rc; /* 1) now on S-? 
(used), try S-E (locked) */ jitter4testing(false); - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { - rc = (int)GetLastError() /* 2) something went wrong, give up */; + int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); return rc; } /* 3) now on S-E (locked), transition to ?-E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) + rc = funlock(env->me_lfd, LCK_LOWER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", - (int)GetLastError()); + rc); /* 4) now on ?-E (middle), try E-E (exclusive-write) */ jitter4testing(false); - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { - rc = (int)GetLastError() /* 5) something went wrong, give up */; + rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 5) something went wrong, give up */; VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); return rc; } @@ -586,6 +629,23 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, (void)env; (void)inprocess_neighbor; (void)global_uniqueness_flag; + if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) { + HANDLE token = INVALID_HANDLE_VALUE; + TOKEN_PRIVILEGES privileges; + privileges.PrivilegeCount = 1; + privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, + &token) || + !LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, + &privileges.Privileges[0].Luid) || + !AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges), + nullptr, nullptr) || + GetLastError() != ERROR_SUCCESS) + mdbx_SetFileIoOverlappedRange = NULL; + + if (token != INVALID_HANDLE_VALUE) + CloseHandle(token); + } return MDBX_SUCCESS; } @@ -752,6 +812,7 @@ MDBX_NtFsControlFile mdbx_NtFsControlFile; MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; MDBX_GetTickCount64 mdbx_GetTickCount64; MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; #endif /* xMDBX_ALLOY */ #if __GNUC_PREREQ(8, 0) @@ -783,6 +844,7 @@ static void mdbx_winnt_import(void) { GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); + GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); } const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); diff --git a/src/osal.c b/src/osal.c index b8b8cf54..77b6adfc 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2022 Leonid Yuriev @@ -537,6 +537,596 @@ MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, /*----------------------------------------------------------------------------*/ +#if defined(_WIN32) || defined(_WIN64) +#define ior_alignment_mask (ior->pagesize - 1) +#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) + +static void ior_put_event(osal_ioring_t *ior, HANDLE event) { + assert(event && event != INVALID_HANDLE_VALUE && event != ior); + assert(ior->event_stack < ior->allocated); + ior->event_pool[ior->event_stack] = event; + ior->event_stack += 1; +} + +static HANDLE ior_get_event(osal_ioring_t *ior) { + assert(ior->event_stack <= ior->allocated); + if 
(ior->event_stack > 0) {
+    ior->event_stack -= 1;
+    assert(ior->event_pool[ior->event_stack] != 0);
+    return ior->event_pool[ior->event_stack];
+  }
+  return CreateEventW(nullptr, true, false, nullptr);
+}
+
+/* Completion routine for WriteFileEx(): the kernel ignores ov->hEvent for
+ * APC-style I/O, so it carries the ioring pointer instead. */
+static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) {
+  osal_ioring_t *ior = ov->hEvent;
+  ov->Internal = err;
+  ov->InternalHigh = bytes;
+  if (++ior->async_completed >= ior->async_waiting)
+    SetEvent(ior->async_done);
+}
+
+#elif MDBX_HAVE_PWRITEV
+#if defined(_SC_IOV_MAX)
+static size_t osal_iov_max;
+#define OSAL_IOV_MAX osal_iov_max
+#else
+#define OSAL_IOV_MAX IOV_MAX
+#endif
+#else
+#undef OSAL_IOV_MAX
+#endif /* OSAL_IOV_MAX */
+
+MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior,
+#if defined(_WIN32) || defined(_WIN64)
+                                          unsigned flags,
+#endif /* Windows */
+                                          mdbx_filehandle_t fd) {
+  memset(ior, 0, sizeof(osal_ioring_t));
+  ior->fd = fd;
+
+#if defined(_WIN32) || defined(_WIN64)
+  ior->flags = flags;
+  const unsigned pagesize = (unsigned)osal_syspagesize();
+  ior->pagesize = pagesize;
+  ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize);
+  ior->async_done = ior_get_event(ior);
+  if (!ior->async_done)
+    return GetLastError();
+#endif /* Windows */
+
+#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX)
+  if (!osal_iov_max)
+    osal_iov_max = sysconf(_SC_IOV_MAX);
+#endif
+
+  ior->boundary = (char *)(ior->pool + ior->allocated);
+  return MDBX_SUCCESS;
+}
+
+static __inline size_t ior_offset(const ior_item_t *item) {
+#if defined(_WIN32) || defined(_WIN64)
+  return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset))
+                                        ? (uint64_t)item->ov.OffsetHigh << 32
+                                        : 0);
+#else
+  return item->offset;
+#endif /* !Windows */
+}
+
+static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) {
+#if defined(ior_sgv_element)
+  assert(sgvcnt > 0);
+  return (ior_item_t *)((char *)item + sizeof(ior_item_t) -
+                        sizeof(ior_sgv_element) +
+                        sizeof(ior_sgv_element) * sgvcnt);
+#else
+  assert(sgvcnt == 1);
+  (void)sgvcnt;
+  return item + 1;
+#endif
+}
+
+MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
+                                       void *data, const size_t bytes) {
+
+  assert(bytes && data);
+  assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE);
+  assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE);
+
+#if defined(_WIN32) || defined(_WIN64)
+  const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2);
+  const bool use_gather =
+      (ior->flags & IOR_UNBUFFERED) && ior->slots_left >= segments;
+#endif /* Windows */
+
+  ior_item_t *item = ior->pool;
+  if (likely(ior->last)) {
+    item = ior->last;
+    if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) &&
+        likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) {
+#if defined(_WIN32) || defined(_WIN64)
+      if (use_gather &&
+          ((bytes | (uintptr_t)data | ior->last_bytes |
+            (uintptr_t)(uint64_t)item->sgv[0].Buffer) &
+           ior_alignment_mask) == 0 &&
+          ior->last_sgvcnt + segments < OSAL_IOV_MAX) {
+        assert((item->single.iov_len & 1) == 0);
+        assert(item->sgv[ior->last_sgvcnt].Buffer == 0);
+        ior->last_bytes += bytes;
+        size_t i = 0;
+        do {
+          item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data);
+          data = (char *)data + ior->pagesize;
+        } while (++i < segments);
+        ior->slots_left -= segments;
+        item->sgv[ior->last_sgvcnt += segments].Buffer = 0;
+        assert((item->single.iov_len & 1) == 0);
+        return MDBX_SUCCESS;
+      }
+      const void *end =
+          (char *)(item->single.iov_base) + item->single.iov_len - 1;
+      if (unlikely(end == data)) {
+        assert((item->single.iov_len & 1) != 0);
+        item->single.iov_len += bytes;
+        return MDBX_SUCCESS;
+      }
+#elif MDBX_HAVE_PWRITEV
+      assert((int)item->sgvcnt > 0);
+      const void *end = (char *)(item->sgv[item->sgvcnt - 1].iov_base) +
+                        item->sgv[item->sgvcnt - 1].iov_len;
+      if (unlikely(end == data)) {
+        item->sgv[item->sgvcnt - 1].iov_len += bytes;
+        ior->last_bytes += bytes;
+        return MDBX_SUCCESS;
+      }
+      if (likely(item->sgvcnt < OSAL_IOV_MAX)) {
+        if (unlikely(ior->slots_left < 1))
+          return MDBX_RESULT_TRUE;
+        item->sgv[item->sgvcnt].iov_base = data;
+        item->sgv[item->sgvcnt].iov_len = bytes;
+        ior->last_bytes += bytes;
+        item->sgvcnt += 1;
+        ior->slots_left -= 1;
+        return MDBX_SUCCESS;
+      }
+#else
+      const void *end = (char *)(item->single.iov_base) + item->single.iov_len;
+      if (unlikely(end == data)) {
+        item->single.iov_len += bytes;
+        return MDBX_SUCCESS;
+      }
+#endif
+    }
+    item = ior_next(item, ior_last_sgvcnt(ior, item));
+  }
+
+  if (unlikely(ior->slots_left < 1))
+    return MDBX_RESULT_TRUE;
+
+  unsigned slots_used = 1;
+#if defined(_WIN32) || defined(_WIN64)
+  item->ov.Internal = item->ov.InternalHigh = 0;
+  item->ov.Offset = (DWORD)offset;
+  item->ov.OffsetHigh = HIGH_DWORD(offset);
+  item->ov.hEvent = 0;
+  if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 ||
+      segments > OSAL_IOV_MAX) {
+    /* WriteFile() */
+    item->single.iov_base = data;
+    /* the odd length tags this as a plain WriteFile() item,
+     * see osal_ioring_walk() */
+    item->single.iov_len = bytes + 1;
+    assert((item->single.iov_len & 1) != 0);
+  } else {
+    /* WriteFileGather(): one FILE_SEGMENT_ELEMENT per page,
+     * zero-terminated */
+    item->sgv[0].Buffer = PtrToPtr64(data);
+    for (size_t i = 1; i < segments; ++i) {
+      data = (char *)data + ior->pagesize;
+      item->sgv[i].Buffer = PtrToPtr64(data);
+    }
+    item->sgv[segments].Buffer = 0;
+    assert((item->single.iov_len & 1) == 0);
+    slots_used = segments;
+  }
+  ior->last_bytes = bytes;
+  ior_last_sgvcnt(ior, item) = slots_used;
+#elif MDBX_HAVE_PWRITEV
+  item->offset = offset;
+  item->sgv[0].iov_base = data;
+  item->sgv[0].iov_len = bytes;
+  ior->last_bytes = bytes;
+  ior_last_sgvcnt(ior, item) = slots_used;
+#else
+  item->offset = offset;
+  item->single.iov_base = data;
+  item->single.iov_len = bytes;
+#endif /* !Windows */
+  ior->slots_left -= slots_used;
+  ior->last = item;
+  return MDBX_SUCCESS;
+}
+
+MDBX_INTERNAL_FUNC void osal_ioring_walk(
+    osal_ioring_t *ior, iov_ctx_t *ctx,
+    void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)) {
+  for (ior_item_t *item = ior->pool; item <= ior->last;) {
+#if defined(_WIN32) || defined(_WIN64)
+    size_t offset = ior_offset(item);
+    char *data = item->single.iov_base;
+    size_t bytes = item->single.iov_len - 1;
+    size_t i = 1;
+    /* Parity trick: WriteFile() items store bytes + 1 (odd) in
+     * single.iov_len, while for WriteFileGather() items iov_len aliases
+     * sgv[1] and is even (an aligned pointer or the zero terminator),
+     * so an odd `bytes` here identifies a gather item. */
+    if (bytes & 1) {
+      data = Ptr64ToPtr(item->sgv[0].Buffer);
+      bytes = ior->pagesize;
+      while (item->sgv[i].Buffer) {
+        if (data + bytes != item->sgv[i].Buffer) {
+          callback(ctx, offset, data, bytes);
+          offset += bytes;
+          data = Ptr64ToPtr(item->sgv[i].Buffer);
+          bytes = 0;
+        }
+        bytes += ior->pagesize;
+        ++i;
+      }
+    }
+    assert(bytes < MAX_WRITE);
+    callback(ctx, offset, data, bytes);
+#elif MDBX_HAVE_PWRITEV
+    assert(item->sgvcnt > 0);
+    size_t offset = item->offset;
+    size_t i = 0;
+    do {
+      callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len);
+      offset += item->sgv[i].iov_len;
+    } while (++i != item->sgvcnt);
+#else
+    const size_t i = 1;
+    callback(ctx, item->offset, item->single.iov_base, item->single.iov_len);
+#endif
+    item = ior_next(item, i);
+  }
+}
+
+MDBX_INTERNAL_FUNC osal_ioring_write_result_t
+osal_ioring_write(osal_ioring_t *ior) {
+  osal_ioring_write_result_t r = {MDBX_SUCCESS, 0};
+
+#if defined(_WIN32) || defined(_WIN64)
+  HANDLE *const end_wait_for =
+      ior->event_pool + ior->allocated +
+      /* one extra element was allocated for async_done */ 1;
+  HANDLE *wait_for = end_wait_for;
+  LONG async_started = 0;
+  for (ior_item_t *item = ior->pool; item <= ior->last;) {
+    item->ov.Internal = STATUS_PENDING;
+    size_t i = 1, bytes = item->single.iov_len - 1;
+    r.wops += 1;
+    if (bytes & 1) {
+      /* a gather item, see the parity note in osal_ioring_walk() */
+      bytes = ior->pagesize;
+      while (item->sgv[i].Buffer) {
+        bytes += ior->pagesize;
+        ++i;
+      }
+      assert(bytes < MAX_WRITE);
+      item->ov.hEvent = ior_get_event(ior);
+      if (unlikely(!item->ov.hEvent)) {
+      bailout_geterr:
+        r.err = GetLastError();
+      bailout_rc:
+        assert(r.err != MDBX_SUCCESS);
+        CancelIo(ior->fd);
+        return r;
+      }
+      if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr,
+                          &item->ov)) {
+        assert(item->ov.Internal == 0 &&
+               WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0);
+        ior_put_event(ior, item->ov.hEvent);
+        item->ov.hEvent = 0;
+      } else {
+        r.err = (int)GetLastError();
+        if (unlikely(r.err != ERROR_IO_PENDING)) {
+          ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
+                ", err %d",
+                "WriteFileGather", ior->fd, item, item - ior->pool,
+                ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
+                item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
+          goto bailout_rc;
+        }
+        assert(wait_for > ior->event_pool + ior->event_stack);
+        *--wait_for = item->ov.hEvent;
+      }
+    } else if (ior->flags & IOR_OVERLAPPED) {
+      assert(bytes < MAX_WRITE);
+    retry:
+      /* WriteFileEx() ignores ov.hEvent, so pass the ioring to ior_wocr() */
+      item->ov.hEvent = ior;
+      if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov,
+                      ior_wocr)) {
+        async_started += 1;
+      } else {
+        r.err = (int)GetLastError();
+        switch (r.err) {
+        default:
+          ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
+                ", err %d",
+                "WriteFileEx", ior->fd, item, item - ior->pool,
+                ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
+                item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
+          goto bailout_rc;
+        case ERROR_NOT_FOUND:
+        case ERROR_USER_MAPPED_FILE:
+        case ERROR_LOCK_VIOLATION:
+          WARNING(
+              "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
+              ", err %d",
+              "WriteFileEx", ior->fd, item, item - ior->pool,
+              ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
+              item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
+          SleepEx(0, true);
+          goto retry;
+        case ERROR_INVALID_USER_BUFFER:
+        case ERROR_NOT_ENOUGH_MEMORY:
+          if (SleepEx(0, true) == WAIT_IO_COMPLETION)
+            goto retry;
+          goto bailout_rc;
+        case ERROR_IO_PENDING:
+          async_started += 1;
+        }
+      }
+    } else {
+      assert(bytes < MAX_WRITE);
+      DWORD written = 0;
+      if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written,
+                     &item->ov)) {
+        r.err = (int)GetLastError();
+        ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
+              ", err %d",
+              "WriteFile", ior->fd, item, item - ior->pool,
+              ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
+              item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
+        goto bailout_rc;
+      } else if (unlikely(written != bytes)) {
+        r.err = ERROR_WRITE_FAULT;
+        goto bailout_rc;
+      }
+    }
+    item = ior_next(item, i);
+  }
+
+  assert(ior->async_waiting > ior->async_completed &&
+         ior->async_waiting == INT_MAX);
+  ior->async_waiting = async_started;
+  if (async_started > ior->async_completed && end_wait_for == wait_for) {
+    assert(wait_for > ior->event_pool + ior->event_stack);
+    *--wait_for = ior->async_done;
+  }
+
+  const size_t pending_count = end_wait_for - wait_for;
+  if (pending_count) {
+    /* Wait for up to MAXIMUM_WAIT_OBJECTS (64) of the most recent handles,
+     * then wait selectively via GetOverlappedResult() if some earlier items
+     * have not completed yet. Overall this takes fewer system calls, i.e.
+     * less overhead. However, it is not certain that this saving will not be
+     * outweighed by an inefficient implementation of
+     * WaitForMultipleObjectsEx(), but in that case it is a problem on
+     * Microsoft's side. */
+    DWORD madness;
+    do
+      madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS)
+                                             ? (DWORD)pending_count
+                                             : MAXIMUM_WAIT_OBJECTS,
+                                         wait_for, true,
+                                         /* 24 hours */ 86400000ul, true);
+    while (madness == WAIT_IO_COMPLETION);
+    STATIC_ASSERT(WAIT_OBJECT_0 == 0);
+    if (/* madness >= WAIT_OBJECT_0 && */
+        madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS)
+      r.err = MDBX_SUCCESS;
+    else if (madness >= WAIT_ABANDONED_0 &&
+             madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) {
+      r.err = ERROR_ABANDONED_WAIT_0;
+      goto bailout_rc;
+    } else if (madness == WAIT_TIMEOUT) {
+      r.err = WAIT_TIMEOUT;
+      goto bailout_rc;
+    } else {
+      r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM;
+      goto bailout_rc;
+    }
+
+    assert(ior->async_waiting == ior->async_completed);
+    for (ior_item_t *item = ior->pool; item <= ior->last;) {
+      size_t i = 1, bytes = item->single.iov_len - 1;
+      if (bytes & 1) {
+        bytes = ior->pagesize;
+        while (item->sgv[i].Buffer) {
+          bytes += ior->pagesize;
+          ++i;
+        }
+        if (!HasOverlappedIoCompleted(&item->ov)) {
+          DWORD written = 0;
+          if (unlikely(
+                  !GetOverlappedResult(ior->fd, &item->ov, &written, true))) {
+            ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
+                  ", err %d",
+                  "GetOverlappedResult", item, item - ior->pool,
+                  ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
+                  item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
+                  GetLastError());
+            goto bailout_geterr;
+          }
+          assert(MDBX_SUCCESS == item->ov.Internal);
+          assert(written == item->ov.InternalHigh);
+        }
+      } else {
+        assert(HasOverlappedIoCompleted(&item->ov));
+      }
+      assert(item->ov.Internal != ERROR_IO_PENDING);
+      if (unlikely(item->ov.Internal != MDBX_SUCCESS)) {
+        DWORD written = 0;
+        r.err = (int)item->ov.Internal;
+        if ((r.err & 0x80000000) &&
+            GetOverlappedResult(NULL, &item->ov, &written, true))
+          r.err = (int)GetLastError();
+        ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
+              ", err %d",
+              "Result", item, item - ior->pool,
+              ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes,
+              item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
+              GetLastError());
+        goto bailout_rc;
+      }
+      if (unlikely(item->ov.InternalHigh != bytes)) {
+        r.err = ERROR_WRITE_FAULT;
+        goto bailout_rc;
+      }
+      item = ior_next(item, i);
+    }
+    assert(ior->async_waiting == ior->async_completed);
+  } else {
+    assert(r.err == MDBX_SUCCESS);
+  }
+  assert(ior->async_waiting == ior->async_completed);
+
+#else
+  STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
+                    "libmdbx requires 64-bit file I/O on 64-bit systems");
+  for (ior_item_t *item = ior->pool; item <= ior->last;) {
+#if MDBX_HAVE_PWRITEV
+    assert(item->sgvcnt > 0);
+    if (item->sgvcnt == 1)
+      r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len,
+                          item->offset);
+    else
+      r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset);
+
+    // TODO: io_uring_prep_write(sqe, fd, ...);
+
+    item = ior_next(item, item->sgvcnt);
+#else
+    r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len,
+                        item->offset);
+    item = ior_next(item, 1);
+#endif
+    r.wops += 1;
+    if (unlikely(r.err != MDBX_SUCCESS))
+      break;
+  }
+
+  // TODO: io_uring_submit(&ring)
+  // TODO: err = io_uring_wait_cqe(&ring, &cqe);
+  // TODO: io_uring_cqe_seen(&ring, cqe);
+
+#endif /* !Windows */
+  return r;
+}
+
+MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) {
+#if defined(_WIN32) || defined(_WIN64)
+  if (ior->last) {
+    for (ior_item_t *item = ior->pool; item <= ior->last;) {
+      if (!HasOverlappedIoCompleted(&item->ov))
+        CancelIoEx(ior->fd, &item->ov);
+      if (item->ov.hEvent && item->ov.hEvent != ior)
+        ior_put_event(ior, item->ov.hEvent);
+      size_t i = 1;
+      if ((item->single.iov_len & 1) == 0)
+        while (item->sgv[i].Buffer)
+          ++i;
+      item = ior_next(item, i);
+    }
+  }
+  ior->async_waiting = INT_MAX;
+  ior->async_completed = 0;
+  ResetEvent(ior->async_done);
+#endif /* Windows */
+  ior->slots_left = ior->allocated;
+  ior->last = nullptr;
+}
+
+static void ior_cleanup(osal_ioring_t *ior, const size_t since) {
+  osal_ioring_reset(ior);
+#if defined(_WIN32) || defined(_WIN64)
+  for (size_t i = since; i < ior->event_stack; ++i)
+    CloseHandle(ior->event_pool[i]);
+  ior->event_stack = 0;
+#else
+  (void)since;
+#endif /* Windows */
+}
+
+MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) {
+  assert(items > 0 && items < INT_MAX / sizeof(ior_item_t));
+#if defined(_WIN32) || defined(_WIN64)
+  if (ior->state & IOR_STATE_LOCKED)
+    return MDBX_SUCCESS;
+  const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) &&
+                                           mdbx_SetFileIoOverlappedRange &&
+                                           items > 7;
+  const size_t ceiling =
+      useSetFileIoOverlappedRange
+          ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4)
+          : 4096;
+  const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling);
+  items = bytes / sizeof(ior_item_t);
+#endif /* Windows */
+
+  if (items != ior->allocated) {
+    assert(items >= osal_ioring_used(ior));
+    if (items < ior->allocated)
+      ior_cleanup(ior, items);
+#if defined(_WIN32) || defined(_WIN64)
+    void *ptr = osal_realloc(
+        ior->event_pool,
+        (items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE));
+    if (unlikely(!ptr))
+      return MDBX_ENOMEM;
+    ior->event_pool = ptr;
+
+    int err = osal_memalign_alloc(ceiling, bytes, &ptr);
+    if (unlikely(err != MDBX_SUCCESS))
+      return err;
+    if (ior->pool) {
+      memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t));
+      osal_memalign_free(ior->pool);
+    }
+#else
+    void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items);
+    if (unlikely(!ptr))
+      return MDBX_ENOMEM;
+#endif
+    ior->pool = ptr;
+
+    if (items > ior->allocated)
+      memset(ior->pool + ior->allocated, 0,
+             sizeof(ior_item_t) * (items - ior->allocated));
+    ior->allocated = (unsigned)items;
+    ior->boundary = (char *)(ior->pool + ior->allocated);
+#if defined(_WIN32) || defined(_WIN64)
+    if (useSetFileIoOverlappedRange) {
+      if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes))
+        ior->state |= IOR_STATE_LOCKED;
+      else
+        return GetLastError();
+    }
+#endif /* Windows */
+  }
+  return MDBX_SUCCESS;
+}
+
+MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) {
+  if (ior->allocated)
+    ior_cleanup(ior, 0);
+#if defined(_WIN32) || defined(_WIN64)
+  osal_memalign_free(ior->pool);
+  osal_free(ior->event_pool);
+  CloseHandle(ior->async_done);
+#else
+  osal_free(ior->pool);
+#endif
+  memset(ior, -1, sizeof(osal_ioring_t));
+}
+
+/*----------------------------------------------------------------------------*/
+
 MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) {
 #if defined(_WIN32) || defined(_WIN64)
   return DeleteFileW(pathname) ?
MDBX_SUCCESS : (int)GetLastError(); @@ -589,17 +1179,21 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, case MDBX_OPEN_DXB_LAZY: DesiredAccess |= GENERIC_READ | GENERIC_WRITE; break; + case MDBX_OPEN_DXB_OVERLAPPED: + FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; + /* fall through */ + __fallthrough; case MDBX_OPEN_DXB_DSYNC: CreationDisposition = OPEN_EXISTING; - DesiredAccess |= GENERIC_WRITE; + DesiredAccess |= GENERIC_WRITE | GENERIC_READ; FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; break; case MDBX_OPEN_COPY: CreationDisposition = CREATE_NEW; ShareMode = 0; DesiredAccess |= GENERIC_WRITE; - FlagsAndAttributes |= - (env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING; + if (env->me_psize >= env->me_os_psize) + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; break; case MDBX_OPEN_DELETE: CreationDisposition = OPEN_EXISTING; @@ -878,28 +1472,30 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, } } -int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, - uint64_t offset, size_t expected_written) { -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \ - (defined(__ANDROID_API__) && __ANDROID_API__ < 24) +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int sgvcnt, + uint64_t offset) { + size_t expected = 0; + for (int i = 0; i < sgvcnt; ++i) + expected += iov[i].iov_len; +#if !MDBX_HAVE_PWRITEV size_t written = 0; - for (int i = 0; i < iovcnt; ++i) { + for (int i = 0; i < sgvcnt; ++i) { int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDBX_SUCCESS)) return rc; written += iov[i].iov_len; offset += iov[i].iov_len; } - return (expected_written == written) ? MDBX_SUCCESS - : MDBX_EIO /* ERROR_WRITE_FAULT */; + return (expected == written) ? 
MDBX_SUCCESS
+                               : MDBX_EIO /* ERROR_WRITE_FAULT */;
 #else
   int rc;
   intptr_t written;
   do {
     STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                       "libmdbx requires 64-bit file I/O on 64-bit systems");
-    written = pwritev(fd, iov, iovcnt, offset);
-    if (likely(expected_written == (size_t)written))
+    written = pwritev(fd, iov, sgvcnt, offset);
+    if (likely(expected == (size_t)written))
       return MDBX_SUCCESS;
     rc = errno;
   } while (rc == EINTR);
@@ -1066,7 +1662,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) {
 
 /*----------------------------------------------------------------------------*/
 
-MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
+MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
                                   size_t length,
                                   enum osal_syncmode_bits mode_bits) {
   uint8_t *ptr = (uint8_t *)map->address + offset;
diff --git a/src/osal.h b/src/osal.h
index cec91dca..11ef24f8 100644
--- a/src/osal.h
+++ b/src/osal.h
@@ -263,8 +263,138 @@ typedef union osal_srwlock {
 } osal_srwlock_t;
 #endif /* Windows */
 
+#ifndef MDBX_HAVE_PWRITEV
+#if defined(_WIN32) || defined(_WIN64)
+
+#define MDBX_HAVE_PWRITEV 0
+
+#elif defined(__ANDROID_API__)
+
+#if __ANDROID_API__ < 24
+#define MDBX_HAVE_PWRITEV 0
+#else
+#define MDBX_HAVE_PWRITEV 1
+#endif
+
+#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
+
+#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \
+    MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0
+/* FIXME: add checks for IOS versions, etc */
+#define MDBX_HAVE_PWRITEV 1
+#else
+#define MDBX_HAVE_PWRITEV 0
+#endif
+
+#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1)
+#define MDBX_HAVE_PWRITEV 1
+#else
+#define MDBX_HAVE_PWRITEV 0
+#endif
+#endif /* MDBX_HAVE_PWRITEV */
+
+typedef struct ior_item {
+#if defined(_WIN32) || defined(_WIN64)
+  OVERLAPPED ov;
+#define ior_sgv_gap4terminator 1
+#define ior_sgv_element FILE_SEGMENT_ELEMENT
+#else
+  size_t offset;
+#if MDBX_HAVE_PWRITEV
+  size_t sgvcnt;
+#define ior_sgv_gap4terminator 0
+#define ior_sgv_element struct iovec
+#endif /* MDBX_HAVE_PWRITEV */
+#endif /* !Windows */
+  union {
+    MDBX_val single;
+#if defined(ior_sgv_element)
+    ior_sgv_element sgv[1 + ior_sgv_gap4terminator];
+#endif /* ior_sgv_element */
+  };
+} ior_item_t;
+
+typedef struct osal_ioring {
+  unsigned slots_left;
+  unsigned allocated;
+#if defined(_WIN32) || defined(_WIN64)
+#define IOR_UNBUFFERED 1
+#define IOR_OVERLAPPED 2
+#define IOR_STATE_LOCKED 1
+  unsigned pagesize;
+  unsigned last_sgvcnt;
+  size_t last_bytes;
+  uint8_t flags, state, pagesize_ln2;
+  unsigned event_stack;
+  HANDLE *event_pool;
+  volatile LONG async_waiting;
+  volatile LONG async_completed;
+  HANDLE async_done;
+
+#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt
+#define ior_last_bytes(ior, item) (ior)->last_bytes
+#elif MDBX_HAVE_PWRITEV
+  unsigned last_bytes;
+#define ior_last_sgvcnt(ior, item) (item)->sgvcnt
+#define ior_last_bytes(ior, item) (ior)->last_bytes
+#else
+#define ior_last_sgvcnt(ior, item) (1)
+#define ior_last_bytes(ior, item) (item)->single.iov_len
+#endif /* !Windows */
+  mdbx_filehandle_t fd;
+  ior_item_t *last;
+  ior_item_t *pool;
+  char *boundary;
+} osal_ioring_t;
+
 #ifndef __cplusplus
+/* Not an actual io-ring yet, but on the way to one.
*/ +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +#if defined(_WIN32) || defined(_WIN64) + unsigned flags, +#endif /* Windows */ + mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); +MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); +MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); +MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset, + void *data, const size_t bytes); +typedef struct osal_ioring_write_result { + int err; + unsigned wops; +} osal_ioring_write_result_t; +MDBX_INTERNAL_FUNC osal_ioring_write_result_t +osal_ioring_write(osal_ioring_t *ior); + +typedef struct iov_ctx iov_ctx_t; +MDBX_INTERNAL_FUNC void osal_ioring_walk( + osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)); + +static inline unsigned osal_ioring_left(const osal_ioring_t *ior) { + return ior->slots_left; +} + +static inline unsigned osal_ioring_used(const osal_ioring_t *ior) { + return ior->allocated - ior->slots_left; +} + +static inline int osal_ioring_reserve(osal_ioring_t *ior, unsigned items, + size_t bytes) { + items = (items > 32) ? items : 32; +#if defined(_WIN32) || defined(_WIN64) + const unsigned npages = (unsigned)(bytes >> ior->pagesize_ln2); + items = (items > npages) ? items : npages; +#else + (void)bytes; +#endif + items = (items < 65536) ? items : 65536; + if (likely(ior->allocated >= items)) + return MDBX_SUCCESS; + return osal_ioring_resize(ior, items); +} + /*----------------------------------------------------------------------------*/ /* libc compatibility stuff */ @@ -290,10 +420,12 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ -#if defined(_WIN32) || defined(_WIN64) -#define MAX_WRITE UINT32_C(0x01000000) +#if defined(_WIN64) +#define MAX_WRITE UINT32_C(0x10000000) +#elif defined(_WIN32) +#define MAX_WRITE UINT32_C(0x04000000) #else -#define MAX_WRITE UINT32_C(0x3fff0000) +#define MAX_WRITE UINT32_C(0x3f000000) #endif #if defined(__linux__) || defined(__gnu_linux__) @@ -336,8 +468,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, - int iovcnt, uint64_t offset, - size_t expected_written); + int sgvcnt, uint64_t offset); MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, @@ -365,12 +496,15 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); enum osal_openfile_purpose { - MDBX_OPEN_DXB_READ = 0, - MDBX_OPEN_DXB_LAZY = 1, - MDBX_OPEN_DXB_DSYNC = 2, - MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4, - MDBX_OPEN_DELETE = 5 + MDBX_OPEN_DXB_READ, + MDBX_OPEN_DXB_LAZY, + MDBX_OPEN_DXB_DSYNC, +#if defined(_WIN32) || defined(_WIN64) + MDBX_OPEN_DXB_OVERLAPPED, +#endif /* Windows */ + MDBX_OPEN_LCK, + MDBX_OPEN_COPY, + MDBX_OPEN_DELETE }; MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, @@ -404,7 +538,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ 
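For orientation, here is a minimal sketch (not part of the patch) of how the osal_ioring_* API declared above is meant to be driven; the helper `flush_pages_sketch` and its parameters are hypothetical, the ring is assumed to have been set up with osal_ioring_create() for the same file handle, and error handling is reduced to early returns:

```c
/* Hypothetical helper: queue `npages` dirty pages starting at file position
 * `file_offset` and submit them as a batch, illustrating the MDBX_RESULT_TRUE
 * "ring is full" convention of osal_ioring_add(). */
static int flush_pages_sketch(osal_ioring_t *ior, char *pages, size_t pagesize,
                              size_t npages, size_t file_offset) {
  /* pre-allocate enough slots for the whole batch */
  int err = osal_ioring_reserve(ior, (unsigned)npages, npages * pagesize);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  for (size_t n = 0; n < npages; ++n) {
    void *const data = pages + n * pagesize;
    const size_t offset = file_offset + n * pagesize;
    err = osal_ioring_add(ior, offset, data, pagesize);
    if (err == MDBX_RESULT_TRUE) {
      /* no slots left: submit what is queued, recycle the ring, retry */
      osal_ioring_write_result_t r = osal_ioring_write(ior);
      osal_ioring_reset(ior);
      if (unlikely(r.err != MDBX_SUCCESS))
        return r.err;
      err = osal_ioring_add(ior, offset, data, pagesize);
    }
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }

  /* submit the tail of the batch, if any */
  int rc = MDBX_SUCCESS;
  if (osal_ioring_used(ior))
    rc = osal_ioring_write(ior).err;
  osal_ioring_reset(ior);
  return rc;
}
```

Since osal_ioring_add() coalesces a write that is contiguous with the previous item, a run of adjacent pages consumes far fewer slots than pages, and MDBX_RESULT_TRUE signals a full ring rather than a failure.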
-MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits mode_bits); MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, @@ -692,6 +826,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); +typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, + PUCHAR OverlappedRangeStart, + ULONG Length); +MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; + #endif /* Windows */ #endif /* !__cplusplus */ diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 29ac5cb1..c90e4c05 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -71,7 +71,7 @@ void osal_setup(const std::vector &actors) { events.reserve(n); for (unsigned i = 0; i < n; ++i) { - HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + HANDLE hEvent = CreateEventW(NULL, TRUE, FALSE, NULL); if (!hEvent) failure_perror("CreateEvent()", GetLastError()); hEvent = make_inheritable(hEvent); @@ -79,22 +79,22 @@ void osal_setup(const std::vector &actors) { events[i] = hEvent; } - hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); + hBarrierSemaphore = CreateSemaphoreW(NULL, 0, (LONG)actors.size(), NULL); if (!hBarrierSemaphore) failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError()); hBarrierSemaphore = make_inheritable(hBarrierSemaphore); - hBarrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + hBarrierEvent = CreateEventW(NULL, TRUE, FALSE, NULL); if (!hBarrierEvent) failure_perror("CreateEvent(BarrierEvent)", GetLastError()); hBarrierEvent = make_inheritable(hBarrierEvent); - hProgressActiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + hProgressActiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL); if (!hProgressActiveEvent) failure_perror("CreateEvent(ProgressActiveEvent)", GetLastError()); hProgressActiveEvent = make_inheritable(hProgressActiveEvent); - hProgressPassiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + hProgressPassiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL); if (!hProgressPassiveEvent) failure_perror("CreateEvent(ProgressPassiveEvent)", GetLastError()); hProgressPassiveEvent = make_inheritable(hProgressPassiveEvent);
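One piece not visible in this hunk is how the mdbx_SetFileIoOverlappedRange pointer declared above gets populated. A hedged sketch of the usual pattern (the helper name is hypothetical and libmdbx performs its Windows imports elsewhere; the real entry point lives in kernel32.dll since Windows Vista and only takes effect for processes holding SeLockMemoryPrivilege):

```c
#include <windows.h>

/* Hypothetical resolver: look up the optional SetFileIoOverlappedRange()
 * entry point at runtime, leaving the pointer NULL on systems that lack it
 * so that osal_ioring_resize() simply skips the locked-pages fast path. */
static void mdbx_winnt_import_sketch(void) {
  const HMODULE hKernel32 = GetModuleHandleA("kernel32.dll");
  if (hKernel32)
    mdbx_SetFileIoOverlappedRange =
        (MDBX_SetFileIoOverlappedRange)GetProcAddress(
            hKernel32, "SetFileIoOverlappedRange");
}
```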