mdbx: implements remapping of the database file when it is possible.

Change-Id: Ida15ba1f396a33b2c6063e680dff612f39a9608f
This commit is contained in:
Leonid Yuriev 2020-07-06 16:23:52 +03:00
parent 2d0a5c42a9
commit 3351c1f869
4 changed files with 103 additions and 22 deletions

View File

@ -13,6 +13,8 @@ v0.8.2 2020-07-??:
  - Refined mode bits while auto-creating LCK-file.   - Refined mode bits while auto-creating LCK-file.
- Avoids unnecessary database file re-mapping in case geometry changed by another process(es). - Avoids unnecessary database file re-mapping in case geometry changed by another process(es).
From the user's point of view, the MDBX_UNABLE_EXTEND_MAPSIZE error will now be returned less frequently and only when using the DB in the current process really requires it to be reopened. From the user's point of view, the MDBX_UNABLE_EXTEND_MAPSIZE error will now be returned less frequently and only when using the DB in the current process really requires it to be reopened.
- On-the-fly remapping of the database file was implemented.
Now remapping with a change of address is performed automatically if there are no dependent readers in the current process.
v0.8.1 2020-06-12: v0.8.1 2020-06-12:
  - Minor change versioning. The last number in the version now means the number of commits since last release/tag.   - Minor change versioning. The last number in the version now means the number of commits since last release/tag.

View File

@ -4658,7 +4658,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset,
static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
const pgno_t size_pgno, const pgno_t size_pgno,
const pgno_t limit_pgno) { const pgno_t limit_pgno, const bool implicit) {
if ((env->me_flags & MDBX_WRITEMAP) && *env->me_unsynced_pages) { if ((env->me_flags & MDBX_WRITEMAP) && *env->me_unsynced_pages) {
int err = mdbx_msync(&env->me_dxb_mmap, 0, int err = mdbx_msync(&env->me_dxb_mmap, 0,
pgno_align2os_bytes(env, used_pgno), true); pgno_align2os_bytes(env, used_pgno), true);
@ -4711,16 +4711,40 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
mdbx_error("failed suspend-for-remap: errcode %d", rc); mdbx_error("failed suspend-for-remap: errcode %d", rc);
goto bailout; goto bailout;
} }
#else const bool mapping_can_be_moved = !implicit;
#else /* Windows */
/* Acquire guard to avoid collision between read and write txns /* Acquire guard to avoid collision between read and write txns
* around env->me_dbgeo */ * around env->me_dbgeo */
bool mapping_can_be_moved = false;
int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); int rc = mdbx_fastmutex_acquire(&env->me_remap_guard);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
if (limit_bytes == env->me_dxb_mmap.limit && if (limit_bytes == env->me_dxb_mmap.limit &&
size_bytes == env->me_dxb_mmap.current) size_bytes == env->me_dxb_mmap.current)
goto bailout; goto bailout;
#endif /* Windows */
if (limit_bytes != env->me_dxb_mmap.limit && env->me_lck && !implicit) {
rc = mdbx_rdt_lock(env) /* lock readers table until remap done */;
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
/* looking for readers from this process */
MDBX_lockinfo *const lck = env->me_lck;
const unsigned snap_nreaders = lck->mti_numreaders;
mapping_can_be_moved = true;
for (unsigned i = 0; i < snap_nreaders; ++i) {
if (lck->mti_readers[i].mr_pid == env->me_pid &&
lck->mti_readers[i].mr_tid != mdbx_thread_self()) {
/* the base address of the mapping can't be changed since
* the other reader thread from this process exists. */
mdbx_rdt_unlock(env);
mapping_can_be_moved = false;
break;
}
}
}
#endif /* ! Windows */
const size_t prev_size = env->me_dxb_mmap.current; const size_t prev_size = env->me_dxb_mmap.current;
if (size_bytes < prev_size) { if (size_bytes < prev_size) {
@ -4758,7 +4782,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
*env->me_discarded_tail = size_pgno; *env->me_discarded_tail = size_pgno;
} }
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes,
mapping_can_be_moved);
if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) { if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) {
const int readahead = mdbx_is_readahead_reasonable(size_bytes, 0); const int readahead = mdbx_is_readahead_reasonable(size_bytes, 0);
if (readahead == MDBX_RESULT_FALSE) if (readahead == MDBX_RESULT_FALSE)
@ -4829,6 +4854,8 @@ bailout:
mdbx_free(suspended); mdbx_free(suspended);
} }
#else #else
if (env->me_lck && mapping_can_be_moved)
mdbx_rdt_unlock(env);
int err = mdbx_fastmutex_release(&env->me_remap_guard); int err = mdbx_fastmutex_release(&env->me_remap_guard);
#endif /* Windows */ #endif /* Windows */
if (err != MDBX_SUCCESS) { if (err != MDBX_SUCCESS) {
@ -4849,7 +4876,8 @@ static __cold int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno,
? limit_pgno ? limit_pgno
: /* The actual mapsize may be less since the geo.upper may be changed : /* The actual mapsize may be less since the geo.upper may be changed
by other process. So, avoids remapping until it necessary. */ by other process. So, avoids remapping until it necessary. */
mapped_pgno); mapped_pgno,
true);
} }
static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady,
@ -6115,8 +6143,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
rc = MDBX_UNABLE_EXTEND_MAPSIZE; rc = MDBX_UNABLE_EXTEND_MAPSIZE;
goto bailout; goto bailout;
} }
rc = mdbx_mapresize_implicit(env, txn->mt_next_pgno, txn->mt_end_pgno, rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno,
txn->mt_geo.upper); txn->mt_geo.upper,
(txn->mt_flags & MDBX_RDONLY) ? true : false);
if (rc != MDBX_SUCCESS) if (rc != MDBX_SUCCESS)
goto bailout; goto bailout;
} }
@ -9192,7 +9221,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
if (new_geo.now != current_geo->now || if (new_geo.now != current_geo->now ||
new_geo.upper != current_geo->upper) { new_geo.upper != current_geo->upper) {
rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper); rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper,
false);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
mdbx_assert(env, (head == nullptr) == inside_txn); mdbx_assert(env, (head == nullptr) == inside_txn);

View File

@ -1403,7 +1403,7 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) {
} }
MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size,
size_t limit) { size_t limit, const bool may_move) {
assert(size <= limit); assert(size <= limit);
#if defined(_WIN32) || defined(_WIN64) #if defined(_WIN32) || defined(_WIN64)
assert(size != map->current || limit != map->limit || size < map->filesize); assert(size != map->current || limit != map->limit || size < map->filesize);
@ -1482,9 +1482,9 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size,
if (status != /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) if (status != /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
goto bailout_ntstatus /* no way to recovery */; goto bailout_ntstatus /* no way to recovery */;
/* assume we can change base address if mapping size changed or prev address if (may_move)
* couldn't be used */ /* the base address could be changed */
map->address = NULL; map->address = NULL;
} }
retry_file_and_section: retry_file_and_section:
@ -1541,7 +1541,7 @@ retry_mapview:;
if (!NT_SUCCESS(status)) { if (!NT_SUCCESS(status)) {
if (status == /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && if (status == /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 &&
map->address) { map->address && may_move) {
/* try remap at another base address */ /* try remap at another base address */
map->address = NULL; map->address = NULL;
goto retry_mapview; goto retry_mapview;
@ -1565,6 +1565,7 @@ retry_mapview:;
map->current = (size_t)SectionSize.QuadPart; map->current = (size_t)SectionSize.QuadPart;
map->limit = ViewSize; map->limit = ViewSize;
#else #else
uint64_t filesize = 0; uint64_t filesize = 0;
@ -1585,7 +1586,8 @@ retry_mapview:;
if (limit != map->limit) { if (limit != map->limit) {
#if defined(MREMAP_MAYMOVE) #if defined(MREMAP_MAYMOVE)
void *ptr = mremap(map->address, map->limit, limit, 0); void *ptr =
mremap(map->address, map->limit, limit, may_move ? MREMAP_MAYMOVE : 0);
if (ptr == MAP_FAILED) { if (ptr == MAP_FAILED) {
rc = errno; rc = errno;
switch (rc) { switch (rc) {
@ -1596,7 +1598,59 @@ retry_mapview:;
} }
return rc; return rc;
} }
map->address = ptr; #else
if (!may_move)
/* TODO: Perhaps here it is worth to implement suspend/resume threads
* and perform unmap/map as like for Windows. */
return MDBX_UNABLE_EXTEND_MAPSIZE;
if (unlikely(munmap(map->address, map->limit)))
return errno;
unsigned mmap_flags =
MAP_CONCEAL | MAP_SHARED | MAP_FILE |
(F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0);
#ifdef MAP_FIXED
if (!may_move)
mmap_flags |= MAP_FIXED;
#endif
void *ptr =
mmap(map->address, limit,
(flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ,
mmap_flags, map->fd, 0);
if (unlikely(ptr == MAP_FAILED)) {
ptr = mmap(map->address, map->limit,
(flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ,
mmap_flags, map->fd, 0);
if (unlikely(ptr == MAP_FAILED)) {
VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic
* when this memory will re-used by malloc or another mmaping.
* See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203
*/
ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit);
map->limit = 0;
map->current = 0;
map->address = nullptr;
return errno;
}
return MDBX_UNABLE_EXTEND_MAPSIZE;
}
#endif /* !MREMAP_MAYMOVE */
if (map->address != ptr) {
VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic
* when this memory will re-used by malloc or another mmaping.
* See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203
*/
ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit);
VALGRIND_MAKE_MEM_DEFINED(ptr, map->current);
ASAN_UNPOISON_MEMORY_REGION(ptr, map->current);
map->address = ptr;
}
map->limit = limit; map->limit = limit;
#ifdef MADV_DONTFORK #ifdef MADV_DONTFORK
@ -1607,14 +1661,9 @@ retry_mapview:;
#ifdef MADV_NOHUGEPAGE #ifdef MADV_NOHUGEPAGE
(void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */ #endif /* MADV_NOHUGEPAGE */
#else /* MREMAP_MAYMOVE */
/* TODO: Perhaps here it is worth to implement suspend/resume threads
* and perform unmap/map as like for Windows. */
rc = MDBX_UNABLE_EXTEND_MAPSIZE;
#endif /* !MREMAP_MAYMOVE */
} }
#endif #endif
return rc; return rc;
} }

View File

@ -623,7 +623,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map,
const unsigned options); const unsigned options);
MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map);
MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current,
size_t wanna); size_t wanna, const bool may_move);
#if defined(_WIN32) || defined(_WIN64) #if defined(_WIN32) || defined(_WIN64)
typedef struct { typedef struct {
unsigned limit, count; unsigned limit, count;