Compare commits


No commits in common. "master" and "v0.13.6" have entirely different histories.

84 changed files with 4328 additions and 7023 deletions


@@ -132,8 +132,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/preface.h"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/proto.h"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/refund.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/rkl.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/rkl.h"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/sort.h"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.h"
@@ -151,9 +149,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree-ops.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.h"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn-basal.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn-nested.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn-ro.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn.c"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/unaligned.h"
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.c"
@@ -834,8 +829,6 @@ else()
"${MDBX_SOURCE_DIR}/preface.h"
"${MDBX_SOURCE_DIR}/proto.h"
"${MDBX_SOURCE_DIR}/refund.c"
"${MDBX_SOURCE_DIR}/rkl.c"
"${MDBX_SOURCE_DIR}/rkl.h"
"${MDBX_SOURCE_DIR}/sort.h"
"${MDBX_SOURCE_DIR}/spill.c"
"${MDBX_SOURCE_DIR}/spill.h"
@@ -845,9 +838,6 @@ else()
"${MDBX_SOURCE_DIR}/tree-ops.c"
"${MDBX_SOURCE_DIR}/txl.c"
"${MDBX_SOURCE_DIR}/txl.h"
"${MDBX_SOURCE_DIR}/txn-basal.c"
"${MDBX_SOURCE_DIR}/txn-nested.c"
"${MDBX_SOURCE_DIR}/txn-ro.c"
"${MDBX_SOURCE_DIR}/txn.c"
"${MDBX_SOURCE_DIR}/unaligned.h"
"${MDBX_SOURCE_DIR}/utils.c"

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -634,12 +634,11 @@ docs/usage.md: docs/__usage.md docs/_starting.md docs/__bindings.md
@echo ' MAKE $@'
$(QUIET)echo -e "\\page usage Usage\n\\section getting Building & Embedding" | cat - $^ | $(SED) 's/^Bindings$$/Bindings {#bindings}/' >$@
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md COPYRIGHT LICENSE NOTICE docs/favicon.ico docs/manifest.webmanifest docs/ld+json $(lastword $(MAKEFILE_LIST))
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md COPYRIGHT LICENSE NOTICE $(lastword $(MAKEFILE_LIST))
@echo ' RUNNING doxygen...'
$(QUIET)rm -rf docs/html && \
cat mdbx.h | tr '\n' '\r' | $(SED) -e 's/LIBMDBX_INLINE_API\s*(\s*\([^,]\+\),\s*\([^,]\+\),\s*(\s*\([^)]\+\)\s*)\s*)\s*{/inline \1 \2(\3) {/g' | tr '\r' '\n' >docs/mdbx.h && \
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp COPYRIGHT LICENSE NOTICE docs/favicon.ico docs/manifest.webmanifest docs/html/ && \
$(SED) -i docs/html/index.html -e '/\/MathJax.js"><\/script>/r docs/ld+json' -e 's/<title>libmdbx: Overall<\/title>//;T;r docs/title'
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp COPYRIGHT LICENSE NOTICE docs/html/
mdbx++-dylib.o: src/config.h src/mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
@echo ' CC $@'
@@ -722,7 +721,6 @@ $(DIST_DIR)/@tmp-internals.inc: $(DIST_DIR)/@tmp-essentials.inc src/version.c $(
-e '/#include "essentials.h"/d' \
-e '/#include "atomics-ops.h"/r src/atomics-ops.h' \
-e '/#include "proto.h"/r src/proto.h' \
-e '/#include "rkl.h"/r src/rkl.h' \
-e '/#include "txl.h"/r src/txl.h' \
-e '/#include "unaligned.h"/r src/unaligned.h' \
-e '/#include "cogs.h"/r src/cogs.h' \


@@ -1,5 +1,18 @@
<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
> Questions, feedback and suggestions are welcome to the [Telegram group](https://t.me/libmdbx) (archive [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
> [2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
> [5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html)).
> See the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md) for `NEWS` and latest updates.
> Donations are welcome to the Ethereum/ERC-20 `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
> Everything will be fine!
libmdbx
========
@@ -26,44 +39,32 @@ tree](https://en.wikipedia.org/wiki/B%2B_tree).
[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging), but that might
be a caveat for write-intensive workloads with durability requirements.
4. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for
4. **Compact and friendly for fully embedding**. Only ≈25KLOC of `C11`,
≈64K x86 binary code of core, no internal threads nor server process(es),
but implements a simplified variant of the [Berkeley
DB](https://en.wikipedia.org/wiki/Berkeley_DB) and
[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API.
5. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for
writers just by single
[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords
[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom)
for parallel readers without atomic/interlocked operations, while
**writing and reading transactions do not block each other** (see the sketch below).
5. **Guarantees data integrity** after crash unless this was explicitly
6. **Guarantees data integrity** after crash unless this was explicitly
neglected in favour of write performance.
6. Supports Linux, Windows, MacOS, Android, iOS, FreeBSD, DragonFly, Solaris,
7. Supports Linux, Windows, MacOS, Android, iOS, FreeBSD, DragonFly, Solaris,
OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with
**POSIX.1-2008**.
7. **Compact and friendly for fully embedding**. Only ≈25KLOC of `C11`,
≈64K x86 binary code of core, no internal threads nor server process(es),
but implements a simplified variant of the [Berkeley
DB](https://en.wikipedia.org/wiki/Berkeley_DB) and
[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API.
<!-- section-end -->
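A minimal sketch of the non-blocking reader/writer model from item 5 above (assuming an already-opened `MDBX_env *env` and a table handle `MDBX_dbi dbi`; error handling elided, and in real code the reader and writer would typically run on different threads):

```c
MDBX_txn *ro_txn, *rw_txn;
/* The read transaction pins an MVCC snapshot and never takes the writer's mutex. */
mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &ro_txn);
/* The writer is serialized against other writers only, not against readers. */
mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &rw_txn);

MDBX_val key = {"hello", 5}, val = {"world", 5};
mdbx_put(rw_txn, dbi, &key, &val, MDBX_UPSERT);
mdbx_txn_commit(rw_txn); /* publishes a new snapshot */

MDBX_val out;
mdbx_get(ro_txn, dbi, &key, &out); /* still reads the old, pre-commit snapshot */
mdbx_txn_abort(ro_txn);
```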
Historically, _libmdbx_ is a deeply revised and extended descendant of the legendary
Historically, _libmdbx_ is a deeply revised and extended descendant of the amazing
[Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
_libmdbx_ inherits all benefits from _LMDB_, but resolves some issues and adds [a set of improvements](#improvements-beyond-lmdb).
[![Telegram: Support | Discussions | News](https://img.shields.io/endpoint?color=scarlet&logo=telegram&label=Support%20%7C%20Discussions%20%7C%20News&url=https%3A%2F%2Ftg.sumanjay.workers.dev%2Flibmdbx)](https://t.me/libmdbx)
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
> Donations are welcome to the Ethereum/ERC-20 `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
> Everything will be fine!
Telegram Group archive: [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
[2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
[5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html).
## Github
### In Russian (my native language)
@@ -125,7 +126,8 @@ of the database. All fundamental architectural problems of libmdbx/LMDB
have been solved there, but now the active development has been
suspended for the top three reasons:
1. For now _libmdbx_ is mostly enough and I'm busy with scalability.
1. For now _libmdbx_ is «mostly» enough for all [our products](https://www.ptsecurity.com/ww-en/products/),
and I'm busy with the development of replication for scalability.
2. Waiting for fresh [Elbrus CPU](https://wiki.elbrus.ru/) of [e2k architecture](https://en.wikipedia.org/wiki/Elbrus_2000),
especially with hardware acceleration of [Streebog](https://en.wikipedia.org/wiki/Streebog) and
[Kuznyechik](https://en.wikipedia.org/wiki/Kuznyechik), which are required for Merkle tree, etc.
@@ -554,9 +556,9 @@ Of course, in addition to this, your toolchain must ensure the reproducibility o
For more information please refer to [reproducible-builds.org](https://reproducible-builds.org/).
#### Containers
There are no special traits nor quirks if you use _libmdbx_ ONLY inside
the single container. But in cross-container or host-container(s)
interoperability cases the three major things MUST be guaranteed:
There are no special traits nor quirks if you use libmdbx ONLY inside the single container.
But in cross-container cases, or with a host-container(s) mix, the two major things MUST be
guaranteed:
1. Coherence of memory mapping content and unified page cache inside OS
kernel for host and all container(s) operated with a DB. Basically this
@@ -572,12 +574,6 @@ in the system memory.
including `ERROR_ACCESS_DENIED`,
but not the `ERROR_INVALID_PARAMETER` as for an invalid/non-existent PID.
3. The versions/builds of _libmdbx_ and `libc`/`pthreads` (`glibc`, `musl`, etc) must be compatible.
- Basically, the `options:` string in the output of `mdbx_chk -V` must be the same for host and container(s).
See `MDBX_LOCKING`, `MDBX_USE_OFDLOCKS` and other build options for details.
- Avoid using different versions of `libc`, especially mixing different implementations, i.e. `glibc` with `musl`, etc.
Prefer to use the same LTS version, or switch to full virtualization/isolation if in doubt.
#### DSO/DLL unloading and destructors of Thread-Local-Storage objects
When building _libmdbx_ as a shared library or using static _libmdbx_ as a
part of another dynamic library, it is advisable to make sure that your

TODO.md

@@ -1,16 +1,16 @@
TODO
----
- [SWIG](https://www.swig.org/).
- Parallel LTO build with warnings eliminated.
- Integration with DTrace and its analogs.
- A new error-handling style that records a "trace" and the causes.
- Producing debug information by means of gdb.
- WASM support.
- Early/non-deferred GC cleanup.
- Explicit and automatic compaction/defragmentation.
- Non-linear GC processing.
- Switch cursors to a doubly-linked list instead of a singly-linked one.
Unfortunately, on 2022-04-15 the Github administration, without any
warning or explanation, deleted _libmdbx_ along with a lot of other
projects, simultaneously blocking access for many developers. Therefore
on 2022-04-21 we have migrated to a reliable trusted infrastructure.
The origin for now is at [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
with backup at [ABF by ROSA Лаб](https://abf.rosalinux.ru/erthink/libmdbx).
For the same reason ~~Github~~ is blacklisted forever.
So currently most of the links are broken due to noted malicious ~~Github~~ sabotage.
- Inside `txn_renew()`, move the mmap coherence check to after the resize.
- [Migration guide from LMDB to MDBX](https://libmdbx.dqdkfa.ru/dead-github/issues/199).
- [Support for RAW devices](https://libmdbx.dqdkfa.ru/dead-github/issues/124).
@@ -20,7 +20,6 @@ TODO
Done
----
- Refactoring of gc-get/gc-put with a transition to "interval" lists.
- [Engage new terminology](https://libmdbx.dqdkfa.ru/dead-github/issues/137).
- [More flexible support of asynchronous runtime/framework(s)](https://libmdbx.dqdkfa.ru/dead-github/issues/200).
- [Move most of `mdbx_chk` functional to the library API](https://libmdbx.dqdkfa.ru/dead-github/issues/204).

File diff suppressed because it is too large


@@ -54,7 +54,7 @@ cleans readers, as a process aborting (especially with core dump) can
take a long time, and checking readers cannot be performed too often due
to performance degradation.
This issue will be addressed in MithrilDB and one of libmdbx releases,
This issue will be addressed in MithrlDB and one of libmdbx releases,
presumably in 2025. To do this, nonlinear GC recycling will be
implemented, without stopping garbage recycling on the old MVCC snapshot
used by a long read transaction.
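Until that lands, a deployment can already police stuck readers via the Handle-Slow-Readers callback. A hedged sketch (the callback signature follows the `MDBX_hsr_func` declaration in `mdbx.h`; the kick policy itself is hypothetical, and the exact return-code contract should be checked against `mdbx.h`):

```c
#include "mdbx.h"

/* Invoked by libmdbx when GC recycling is blocked by a laggard reader. */
static int my_hsr(const MDBX_env *env, const MDBX_txn *txn, mdbx_pid_t pid,
                  mdbx_tid_t tid, uint64_t laggard, unsigned gap, size_t space,
                  int retry) {
  (void)env; (void)txn; (void)pid; (void)tid; (void)laggard; (void)space;
  /* After a few retries and a large txn gap, report the laggard as handled
   * (positive result); otherwise ask libmdbx to keep waiting. */
  return (retry > 3 && gap > 1000) ? 1 : -1;
}

/* Registration: mdbx_env_set_hsr(env, my_hsr); */
```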
@@ -92,7 +92,7 @@ free consecutive/adjacent pages through GC has been significantly
sped up, including acceleration using NEON/SSE2/AVX2/AVX512
instructions.
This issue will be addressed in MithrilDB and refined within one of
This issue will be addressed in MithrlDB and refined within one of
0.15.x libmdbx releases, presumably at the end of 2025.


@@ -2,10 +2,7 @@ The source code is available on [Gitflic](https://gitflic.ru/project/erthink/libm
Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
Everything will be fine!
> Questions, feedback and suggestions are welcome to the [Telegram group](https://t.me/libmdbx) (archive [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
> [2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
> [5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html)).
> See the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md) for `NEWS` and latest updates.
> Questions, feedback and suggestions are welcome to the [Telegram group](https://t.me/libmdbx).
\section toc Table of Contents

Binary file not shown.



@@ -1,17 +1,10 @@
<!DOCTYPE html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="$langISO">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=11"/>
<meta name="generator" content="Doxygen $doxygenversion"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<link rel="icon" href="favicon.ico">
<link rel="icon" href="img/bear.png" type="image/png">
<link rel="apple-touch-icon" href="img/bear.png">
<meta property="og:type" content="article"/>
<meta property="og:url" content="https://libmdbx.dqdkfa.ru/"/>
<meta name="twitter:title" content="One of the fastest embeddable key-value engine"/>
<meta name="twitter:description" content="MDBX surpasses the legendary LMDB in terms of reliability, features and performance. For now libmdbx is chosen by all modern Ethereum frontiers as a storage engine."/>
<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
<!--BEGIN PROJECT_ICON-->


@@ -1,27 +0,0 @@
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "ItemList",
"itemListElement": [{
"@type": "ListItem",
"position": 1,
"name": "Группа в Telegram",
"url": "https://t.me/libmdbx"
},{
"@type": "ListItem",
"position": 2,
"name": "Исходный код",
"url": "https://gitflic.ru/project/erthink/libmdbx"
},{
"@type": "ListItem",
"position": 3,
"name": "C++ API",
"url": "https://libmdbx.dqdkfa.ru/group__cxx__api.html"
},{
"@type": "ListItem",
"position": 4,
"name": "Mirror on Github",
"url": "https://github.com/erthink/libmdbx"
}]
}
</script>


@@ -1,6 +0,0 @@
{
"icons": [
{ "src": "favicon.ico", "type": "image/ico", "sizes": "32x32" },
{ "src": "img/bear.png", "type": "image/png", "sizes": "256x256" }
]
}


@@ -1,2 +0,0 @@
<title>libmdbx: One of the fastest embeddable key-value engine</title>
<meta name="description" content="libmdbx surpasses the legendary LMDB in terms of reliability, features and performance. For now libmdbx is chosen by all modern Ethereum frontiers as a storage engine.">

mdbx.h

@@ -581,10 +581,9 @@ typedef mode_t mdbx_mode_t;
extern "C" {
#endif
/* MDBX version 0.14.x, but it is unstable/under-development yet. */
#define MDBX_VERSION_UNSTABLE
/* MDBX version 0.13.x */
#define MDBX_VERSION_MAJOR 0
#define MDBX_VERSION_MINOR 14
#define MDBX_VERSION_MINOR 13
#ifndef LIBMDBX_API
#if defined(LIBMDBX_EXPORTS) || defined(DOXYGEN)
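For embedders, the version macros shown above allow a compile-time pin; a tiny sketch:

```c
#include "mdbx.h"
/* Refuse to build against an unexpected libmdbx branch. */
#if MDBX_VERSION_MAJOR != 0 || MDBX_VERSION_MINOR != 13
#error "this code expects the libmdbx 0.13.x branch"
#endif
```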
@@ -2775,10 +2774,10 @@ typedef struct MDBX_stat MDBX_stat;
* Legacy mdbx_env_stat() corresponds to calling \ref mdbx_env_stat_ex() with the
* null `txn` argument.
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] txn A transaction handle returned by \ref mdbx_txn_begin().
* \param [in] env An environment handle returned by \ref mdbx_env_create()
* \param [in] txn A transaction handle returned by \ref mdbx_txn_begin()
* \param [out] stat The address of an \ref MDBX_stat structure where
* the statistics will be copied.
* the statistics will be copied
* \param [in] bytes The size of \ref MDBX_stat.
*
* \returns A non-zero error value on failure and 0 on success. */
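A short usage sketch for the call documented above (a sketch only; assumes an opened `env` and a read transaction `txn`):

```c
#include <inttypes.h>
#include <stdio.h>
#include "mdbx.h"

/* Print a snapshot's statistics; pass txn = NULL for the legacy
 * mdbx_env_stat() behaviour. sizeof(st) lets the library detect
 * ABI mismatches via the `bytes` argument. */
static void print_stat(MDBX_env *env, MDBX_txn *txn) {
  MDBX_stat st;
  if (mdbx_env_stat_ex(env, txn, &st, sizeof(st)) == MDBX_SUCCESS)
    printf("psize %u, depth %u, entries %" PRIu64 "\n",
           st.ms_psize, st.ms_depth, st.ms_entries);
}
```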
@@ -4197,10 +4196,7 @@ LIBMDBX_API int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency);
* \returns A non-zero error value on failure and 0 on success,
* some possible errors are:
* \retval MDBX_RESULT_TRUE Transaction was aborted since it should
* be aborted due to previous errors,
* or no changes were made during the transaction,
* and the build-time option
* \ref MDBX_NOSUCCESS_PURE_COMMIT was enabled.
* be aborted due to previous errors.
* \retval MDBX_PANIC A fatal error occurred earlier
* and the environment must be shut down.
* \retval MDBX_BAD_TXN Transaction is already finished or never began.
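Given these retval semantics, a caller should treat `MDBX_RESULT_TRUE` as a benign outcome rather than a failure; a sketch (assuming `txn` is a finished-work write transaction):

```c
MDBX_commit_latency latency;
int rc = mdbx_txn_commit_ex(txn, &latency);
if (rc == MDBX_RESULT_TRUE) {
  /* Not a failure: the txn was auto-aborted due to previous errors, or it
   * was a pure (empty) commit under MDBX_NOSUCCESS_PURE_COMMIT. */
} else if (rc != MDBX_SUCCESS) {
  /* Hard error: the txn is finished regardless, just report rc. */
}
```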
@@ -6542,8 +6538,6 @@ typedef struct MDBX_chk_table {
struct MDBX_chk_histogram key_len;
/// Values length histogram
struct MDBX_chk_histogram val_len;
/// Number of multi-values (aka duplicates) histogram
struct MDBX_chk_histogram multival;
} histogram;
} MDBX_chk_table_t;


@@ -1,7 +1,7 @@
From 349c08cf21b66ecea851340133a1b845c25675f7 Mon Sep 17 00:00:00 2001
From 49256dcd050fd0ee67860b7bc544dabe088d08e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?=
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= <leo@yuriev.ru>
Date: Tue, 22 Apr 2025 14:38:49 +0300
Date: Fri, 14 Feb 2025 21:34:25 +0300
Subject: [PATCH] package/libmdbx: new package (library/database).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
@@ -15,7 +15,7 @@ This patch adds libmdbx:
in terms of reliability, features and performance.
- more information at https://libmdbx.dqdkfa.ru
The 0.13.6 "Бузина" (Elderberry) is a stable release of the _libmdbx_ branch with new superior features.
The 0.13.4 "Sigma Boy" is a stable release of the _libmdbx_ branch with new superior features.
The complete ChangeLog: https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md
@@ -110,19 +110,19 @@ index 0000000000..a9a4ac45c5
+ !BR2_TOOLCHAIN_GCC_AT_LEAST_4_4
diff --git a/package/libmdbx/libmdbx.hash b/package/libmdbx/libmdbx.hash
new file mode 100644
index 0000000000..ae5266716b
index 0000000000..202937e7be
--- /dev/null
+++ b/package/libmdbx/libmdbx.hash
@@ -0,0 +1,6 @@
+# Hashes from: https://libmdbx.dqdkfa.ru/release/SHA256SUMS
+sha256 57db987de6f7ccc66a66ae28a7bda9f9fbb48ac5fb9279bcca92fd5de13075d1 libmdbx-amalgamated-0.13.6.tar.xz
+sha256 86df30ca2231c9b3ad71424bb829dca9041947f5539d4295030c653d4982c1be libmdbx-amalgamated-0.13.4.tar.xz
+
+# Locally calculated
+sha256 0d542e0c8804e39aa7f37eb00da5a762149dc682d7829451287e11b938e94594 LICENSE
+sha256 651f71b46c6bb0046d2122df7f9def9cb24f4dc28c5b11cef059f66565cda30f NOTICE
+sha256 699a62986b6c8d31124646dffd4b15872c7d3bc5eecea5994edb1f5195df49d1 NOTICE
diff --git a/package/libmdbx/libmdbx.mk b/package/libmdbx/libmdbx.mk
new file mode 100644
index 0000000000..571757262e
index 0000000000..a8a6f3dbdf
--- /dev/null
+++ b/package/libmdbx/libmdbx.mk
@@ -0,0 +1,42 @@
@@ -132,7 +132,7 @@ index 0000000000..571757262e
+#
+################################################################################
+
+LIBMDBX_VERSION = 0.13.6
+LIBMDBX_VERSION = 0.13.4
+LIBMDBX_SOURCE = libmdbx-amalgamated-$(LIBMDBX_VERSION).tar.xz
+LIBMDBX_SITE = https://libmdbx.dqdkfa.ru/release
+LIBMDBX_SUPPORTS_IN_SOURCE_BUILD = NO
@@ -169,5 +169,5 @@ index 0000000000..571757262e
+
+$(eval $(cmake-package))
--
2.49.0
2.48.1


@@ -41,16 +41,12 @@
#include "page-ops.c"
#include "pnl.c"
#include "refund.c"
#include "rkl.c"
#include "spill.c"
#include "table.c"
#include "tls.c"
#include "tree-ops.c"
#include "tree-search.c"
#include "txl.c"
#include "txn-basal.c"
#include "txn-nested.c"
#include "txn-ro.c"
#include "txn.c"
#include "utils.c"
#include "version.c"


@@ -73,7 +73,6 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
mc->next = txn->cursors[dbi];
txn->cursors[dbi] = mc;
txn->flags |= txn_may_have_cursors;
return MDBX_SUCCESS;
}


@@ -488,12 +488,39 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_env_flags
}
if ((flags & MDBX_RDONLY) == 0) {
env->basal_txn = txn_basal_create(env->max_dbi);
if (unlikely(!env->basal_txn)) {
MDBX_txn *txn = nullptr;
const intptr_t bitmap_bytes =
#if MDBX_ENABLE_DBI_SPARSE
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT;
#else
0;
#endif /* MDBX_ENABLE_DBI_SPARSE */
const size_t base = sizeof(MDBX_txn) + sizeof(cursor_couple_t);
const size_t size = base + bitmap_bytes +
env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_seqs[0]) +
sizeof(txn->dbi_state[0]));
txn = osal_calloc(1, size);
if (unlikely(!txn)) {
rc = MDBX_ENOMEM;
goto bailout;
}
txn->dbs = ptr_disp(txn, base);
txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0]));
txn->dbi_seqs = ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0]));
txn->dbi_state = ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0]));
#if MDBX_ENABLE_DBI_SPARSE
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
#endif /* MDBX_ENABLE_DBI_SPARSE */
txn->env = env;
txn->flags = MDBX_TXN_FINISHED;
env->basal_txn = txn;
txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL);
txn->tw.repnl = pnl_alloc(MDBX_PNL_INITIAL);
if (unlikely(!txn->tw.retired_pages || !txn->tw.repnl)) {
rc = MDBX_ENOMEM;
goto bailout;
}
env->basal_txn->env = env;
env_options_adjust_defaults(env);
}
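The block above lays the transaction struct, its per-DBI arrays and the DBI bitmap out inside one allocation, addressed by byte offsets via `ptr_disp()`. The same single-allocation technique in a generic, self-contained form (a sketch; the names are illustrative, not libmdbx API):

```c
#include <stdint.h>
#include <stdlib.h>

typedef struct {
  void **cursors; /* max_dbi pointers */
  uint32_t *seqs; /* max_dbi counters */
  uint8_t *state; /* max_dbi bytes    */
} layout_t;

/* One calloc, several arrays: size everything up front, then hand out
 * sub-ranges ordered by decreasing alignment so each array stays
 * naturally aligned. A single free() releases the whole bundle. */
static layout_t *layout_create(size_t max_dbi) {
  const size_t base = sizeof(layout_t);
  const size_t size = base + max_dbi * (sizeof(void *) + sizeof(uint32_t) + 1);
  layout_t *l = calloc(1, size);
  if (!l)
    return NULL;
  l->cursors = (void **)((char *)l + base);
  l->seqs = (uint32_t *)((char *)l->cursors + max_dbi * sizeof(void *));
  l->state = (uint8_t *)((char *)l->seqs + max_dbi * sizeof(uint32_t));
  return l;
}
```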
@@ -689,7 +716,7 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
#endif
}
*troika = (txn && !(txn->flags & MDBX_TXN_RDONLY)) ? txn->wr.troika : meta_tap(env);
*troika = (txn && !(txn->flags & MDBX_TXN_RDONLY)) ? txn->tw.troika : meta_tap(env);
const meta_ptr_t head = meta_recent(env, troika);
const meta_t *const meta0 = METAPAGE(env, 0);
const meta_t *const meta1 = METAPAGE(env, 1);
@@ -952,16 +979,16 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
should_unlock = true;
env->basal_txn->wr.troika = meta_tap(env);
env->basal_txn->tw.troika = meta_tap(env);
eASSERT(env, !env->txn && !env->basal_txn->nested);
env->basal_txn->txnid = env->basal_txn->wr.troika.txnid[env->basal_txn->wr.troika.recent];
txn_gc_detent(env->basal_txn);
env->basal_txn->txnid = env->basal_txn->tw.troika.txnid[env->basal_txn->tw.troika.recent];
txn_snapshot_oldest(env->basal_txn);
}
/* get untouched params from current TXN or DB */
if (pagesize <= 0 || pagesize >= INT_MAX)
pagesize = env->ps;
const geo_t *const geo = env->txn ? &env->txn->geo : &meta_recent(env, &env->basal_txn->wr.troika).ptr_c->geometry;
const geo_t *const geo = env->txn ? &env->txn->geo : &meta_recent(env, &env->basal_txn->tw.troika).ptr_c->geometry;
if (size_lower < 0)
size_lower = pgno2bytes(env, geo->lower);
if (size_now < 0)
@@ -1176,7 +1203,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
meta_t meta;
memset(&meta, 0, sizeof(meta));
if (!env->txn) {
const meta_ptr_t head = meta_recent(env, &env->basal_txn->wr.troika);
const meta_ptr_t head = meta_recent(env, &env->basal_txn->tw.troika);
uint64_t timestamp = 0;
while ("workaround for "
@@ -1270,7 +1297,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
env->txn->flags |= MDBX_TXN_DIRTY;
} else {
meta.geometry = new_geo;
rc = dxb_sync_locked(env, env->flags, &meta, &env->basal_txn->wr.troika);
rc = dxb_sync_locked(env, env->flags, &meta, &env->basal_txn->tw.troika);
if (likely(rc == MDBX_SUCCESS)) {
env->geo_in_bytes.now = pgno2bytes(env, new_geo.now = meta.geometry.now);
env->geo_in_bytes.upper = pgno2bytes(env, new_geo.upper = meta.geometry.upper);


@@ -147,9 +147,6 @@ void env_options_adjust_dp_limit(MDBX_env *env) {
if (env->options.dp_limit < CURSOR_STACK_SIZE * 4)
env->options.dp_limit = CURSOR_STACK_SIZE * 4;
}
#ifdef MDBX_DEBUG_DPL_LIMIT
env->options.dp_limit = MDBX_DEBUG_DPL_LIMIT;
#endif /* MDBX_DEBUG_DPL_LIMIT */
if (env->options.dp_initial > env->options.dp_limit && env->options.dp_initial > default_dp_initial(env))
env->options.dp_initial = env->options.dp_limit;
env->options.need_dp_limit_adjust = false;
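For context, the dirty-page limit being clamped here is also reachable through the public option API; a sketch (assuming an opened `env`):

```c
uint64_t dp_limit = 0;
/* Request a cap on in-RAM dirty pages per write txn, then read back the
 * effective value, which may have been adjusted as in the code above. */
mdbx_env_set_option(env, MDBX_opt_txn_dp_limit, 65536);
mdbx_env_get_option(env, MDBX_opt_txn_dp_limit, &dp_limit);
```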


@@ -411,7 +411,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *
}
if (is_modifable(txn, page)) {
if (new_data && eq_fast(&present_data, new_data)) {
if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
/* if the data matches, there is nothing to do */
*old_data = *new_data;
goto bailout;
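This short-circuit backs the public replace operation; a hedged usage sketch of `mdbx_replace()` (assuming `txn`, `dbi`, and that the old value fits the buffer):

```c
MDBX_val key = {"k", 1}, new_data = {"v2", 2};
char backup[64];
MDBX_val old_data = {backup, sizeof(backup)}; /* receives the prior value */
int rc = mdbx_replace(txn, dbi, &key, &new_data, &old_data, MDBX_UPSERT);
/* When new_data byte-wise equals the stored value, the branch above
 * returns early and the page is left untouched. */
```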


@@ -10,11 +10,10 @@ __attribute__((__no_sanitize_thread__, __noinline__))
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
{
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
if (likely(rc == MDBX_SUCCESS))
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR((rc > 0) ? -rc : rc);
MDBX_env *env = txn->env;
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) {
if (percent)
*percent = (int)((txn->geo.first_unallocated * UINT64_C(100) + txn->geo.end_pgno / 2) / txn->geo.end_pgno);
@@ -22,15 +21,15 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
}
txnid_t lag;
troika_t troika = meta_tap(txn->env);
troika_t troika = meta_tap(env);
do {
const meta_ptr_t head = meta_recent(txn->env, &troika);
const meta_ptr_t head = meta_recent(env, &troika);
if (percent) {
const pgno_t maxpg = head.ptr_v->geometry.now;
*percent = (int)((head.ptr_v->geometry.first_unallocated * UINT64_C(100) + maxpg / 2) / maxpg);
}
lag = (head.txnid - txn->txnid) / xMDBX_TXNID_STEP;
} while (unlikely(meta_should_retry(txn->env, &troika)));
} while (unlikely(meta_should_retry(env, &troika)));
return (lag > INT_MAX) ? INT_MAX : (int)lag;
}
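A usage sketch for `mdbx_txn_straggler()` as shown above (assuming a read-only `txn`; note that errors come back negated):

```c
int percent = 0;
int lag = mdbx_txn_straggler(txn, &percent);
if (lag >= 0)
  /* lag = how many transactions this reader is behind the head;
   * percent = space used by the DB at this snapshot. */
  fprintf(stderr, "reader lags %d txn(s), db %d%% full\n", lag, percent);
```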
@@ -56,8 +55,8 @@ MDBX_txn_flags_t mdbx_txn_flags(const MDBX_txn *txn) {
assert(0 == (int)(txn->flags & MDBX_TXN_INVALID));
MDBX_txn_flags_t flags = txn->flags;
if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->ro.slot &&
safe64_read(&txn->ro.slot->tid) == MDBX_TID_TXN_OUSTED)
if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->to.reader &&
safe64_read(&txn->to.reader->tid) == MDBX_TID_TXN_OUSTED)
flags |= MDBX_TXN_OUSTED;
return flags;
}
@@ -67,10 +66,6 @@ int mdbx_txn_reset(MDBX_txn *txn) {
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
/* This call is only valid for read-only txns */
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
return LOG_IFERR(MDBX_EINVAL);
@@ -90,6 +85,8 @@ int mdbx_txn_break(MDBX_txn *txn) {
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
txn->flags |= MDBX_TXN_ERROR;
if (txn->flags & MDBX_TXN_RDONLY)
break;
txn = txn->nested;
} while (txn);
return MDBX_SUCCESS;
@@ -120,11 +117,6 @@ int mdbx_txn_park(MDBX_txn *txn, bool autounpark) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
return LOG_IFERR(MDBX_TXN_INVALID);
@@ -133,7 +125,7 @@ return LOG_IFERR(rc ? rc : MDBX_OUSTED);
return LOG_IFERR(rc ? rc : MDBX_OUSTED);
}
return LOG_IFERR(txn_ro_park(txn, autounpark));
return LOG_IFERR(txn_park(txn, autounpark));
}
int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
@@ -141,15 +133,10 @@ int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED - MDBX_TXN_ERROR);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!F_ISSET(txn->flags, MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
return MDBX_SUCCESS;
rc = txn_ro_unpark(txn);
rc = txn_unpark(txn);
if (likely(rc != MDBX_OUSTED) || !restart_if_ousted)
return LOG_IFERR(rc);
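A sketch of the park/unpark pairing for read transactions (both entry points appear above; check mdbx.h for the exact `autounpark`/`restart_if_ousted` semantics):

```c
/* Park an idle reader so it stops retaining old MVCC snapshots from GC. */
int rc = mdbx_txn_park(txn, /* autounpark = */ true);

/* ... later ... */

/* Revive it; with restart_if_ousted an ousted snapshot is transparently
 * restarted on the current head instead of failing with MDBX_OUSTED. */
rc = mdbx_txn_unpark(txn, /* restart_if_ousted = */ true);
```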
@@ -159,24 +146,22 @@ }
}
int mdbx_txn_renew(MDBX_txn *txn) {
int rc = check_txn(txn, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!txn))
return LOG_IFERR(MDBX_EINVAL);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(txn->signature != txn_signature))
return LOG_IFERR(MDBX_EBADSIGN);
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) {
rc = mdbx_txn_reset(txn);
int rc = mdbx_txn_reset(txn);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
rc = txn_renew(txn, MDBX_TXN_RDONLY);
int rc = txn_renew(txn, MDBX_TXN_RDONLY);
if (rc == MDBX_SUCCESS) {
tASSERT(txn, txn->owner == (txn->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self());
DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
@@ -187,7 +172,7 @@ int mdbx_txn_renew(MDBX_txn *txn) {
}
int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) {
int rc = check_txn(txn, 0);
int rc = check_txn(txn, MDBX_TXN_FINISHED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
@@ -212,8 +197,6 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, M
if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */
return LOG_IFERR(MDBX_EACCESS);
/* Reuse preallocated write txn. However, do not touch it until
* txn_renew() succeeds, since it currently may be active. */
MDBX_txn *txn = nullptr;
if (parent) {
/* Nested transactions: Max 1 child, write txns only, no writemap */
@@ -229,126 +212,202 @@ }
}
return LOG_IFERR(rc);
}
if (unlikely(parent->env != env))
return LOG_IFERR(MDBX_BAD_TXN);
if (env->options.spill_parent4child_denominator) {
/* Spill dirty-pages of parent to provide dirtyroom for child txn */
rc = txn_spill(parent, nullptr, parent->tw.dirtylist->length / env->options.spill_parent4child_denominator);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
tASSERT(parent, audit_ex(parent, 0, false) == 0);
flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS | MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP);
rc = txn_nested_create(parent, flags);
txn = parent->nested;
if (unlikely(rc != MDBX_SUCCESS)) {
int err = txn_end(txn, TXN_END_FAIL_BEGIN_NESTED);
return err ? err : rc;
} else if ((flags & MDBX_TXN_RDONLY) == 0) {
/* Reuse preallocated write txn. However, do not touch it until
* txn_renew() succeeds, since it currently may be active. */
txn = env->basal_txn;
goto renew;
}
const intptr_t bitmap_bytes =
#if MDBX_ENABLE_DBI_SPARSE
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT;
#else
0;
#endif /* MDBX_ENABLE_DBI_SPARSE */
STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to));
const size_t base =
(flags & MDBX_TXN_RDONLY) ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) : sizeof(MDBX_txn);
const size_t size = base +
((flags & MDBX_TXN_RDONLY) ? (size_t)bitmap_bytes + env->max_dbi * sizeof(txn->dbi_seqs[0]) : 0) +
env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_state[0]));
txn = osal_malloc(size);
if (unlikely(txn == nullptr))
return LOG_IFERR(MDBX_ENOMEM);
#if MDBX_DEBUG
memset(txn, 0xCD, size);
VALGRIND_MAKE_MEM_UNDEFINED(txn, size);
#endif /* MDBX_DEBUG */
MDBX_ANALYSIS_ASSUME(size > base);
memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base);
txn->dbs = ptr_disp(txn, base);
txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0]));
#if MDBX_DEBUG
txn->cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */
#endif
txn->dbi_state = ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0]));
txn->flags = flags;
txn->env = env;
if (parent) {
tASSERT(parent, dpl_check(parent));
#if MDBX_ENABLE_DBI_SPARSE
txn->dbi_sparse = parent->dbi_sparse;
#endif /* MDBX_ENABLE_DBI_SPARSE */
txn->dbi_seqs = parent->dbi_seqs;
txn->geo = parent->geo;
rc = dpl_alloc(txn);
if (likely(rc == MDBX_SUCCESS)) {
const size_t len = MDBX_PNL_GETSIZE(parent->tw.repnl) + parent->tw.loose_count;
txn->tw.repnl = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
if (unlikely(!txn->tw.repnl))
rc = MDBX_ENOMEM;
}
if (unlikely(rc != MDBX_SUCCESS)) {
nested_failed:
pnl_free(txn->tw.repnl);
dpl_free(txn);
osal_free(txn);
return LOG_IFERR(rc);
}
/* Move loose pages to reclaimed list */
if (parent->tw.loose_count) {
do {
page_t *lp = parent->tw.loose_pages;
tASSERT(parent, lp->flags == P_LOOSE);
rc = pnl_insert_span(&parent->tw.repnl, lp->pgno, 1);
if (unlikely(rc != MDBX_SUCCESS))
goto nested_failed;
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
parent->tw.loose_pages = page_next(lp);
/* Remove from dirty list */
page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1);
} while (parent->tw.loose_pages);
parent->tw.loose_count = 0;
#if MDBX_ENABLE_REFUND
parent->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
tASSERT(parent, dpl_check(parent));
}
txn->tw.dirtyroom = parent->tw.dirtyroom;
txn->tw.dirtylru = parent->tw.dirtylru;
dpl_sort(parent);
if (parent->tw.spilled.list)
spill_purge(parent);
tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.repnl) >= MDBX_PNL_GETSIZE(parent->tw.repnl));
memcpy(txn->tw.repnl, parent->tw.repnl, MDBX_PNL_SIZEOF(parent->tw.repnl));
eASSERT(env, pnl_check_allocated(txn->tw.repnl, (txn->geo.first_unallocated /* LY: intentional assignment
here, only for assertion */
= parent->geo.first_unallocated) -
MDBX_ENABLE_REFUND));
txn->tw.gc.time_acc = parent->tw.gc.time_acc;
txn->tw.gc.last_reclaimed = parent->tw.gc.last_reclaimed;
if (parent->tw.gc.retxl) {
txn->tw.gc.retxl = parent->tw.gc.retxl;
parent->tw.gc.retxl = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.gc.retxl);
}
txn->tw.retired_pages = parent->tw.retired_pages;
parent->tw.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages);
txn->txnid = parent->txnid;
txn->front_txnid = parent->front_txnid + 1;
#if MDBX_ENABLE_REFUND
txn->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
txn->canary = parent->canary;
parent->flags |= MDBX_TXN_HAS_CHILD;
parent->nested = txn;
txn->parent = parent;
txn->owner = parent->owner;
txn->tw.troika = parent->tw.troika;
txn->cursors[FREE_DBI] = nullptr;
txn->cursors[MAIN_DBI] = nullptr;
txn->dbi_state[FREE_DBI] = parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
txn->dbi_state[MAIN_DBI] = parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
memset(txn->dbi_state + CORE_DBS, 0, (txn->n_dbi = parent->n_dbi) - CORE_DBS);
memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS);
tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length ==
(parent->parent ? parent->parent->tw.dirtyroom : parent->env->options.dp_limit));
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
env->txn = txn;
tASSERT(parent, parent->cursors[FREE_DBI] == nullptr);
rc = parent->cursors[MAIN_DBI] ? cursor_shadow(parent->cursors[MAIN_DBI], txn, MAIN_DBI) : MDBX_SUCCESS;
if (AUDIT_ENABLED() && ASSERT_ENABLED()) {
txn->signature = txn_signature;
tASSERT(txn, audit_ex(txn, 0, false) == 0);
}
} else {
txn = env->basal_txn;
if (flags & MDBX_TXN_RDONLY) {
txn = txn_alloc(flags, env);
if (unlikely(!txn))
return LOG_IFERR(MDBX_ENOMEM);
}
if (unlikely(rc != MDBX_SUCCESS))
txn_end(txn, TXN_END_FAIL_BEGINCHILD);
} else { /* MDBX_TXN_RDONLY */
txn->dbi_seqs = ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0]));
#if MDBX_ENABLE_DBI_SPARSE
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
#endif /* MDBX_ENABLE_DBI_SPARSE */
renew:
rc = txn_renew(txn, flags);
if (unlikely(rc != MDBX_SUCCESS)) {
if (txn != env->basal_txn)
osal_free(txn);
return LOG_IFERR(rc);
}
if (unlikely(rc != MDBX_SUCCESS)) {
if (txn != env->basal_txn)
osal_free(txn);
} else {
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
else if (flags & MDBX_TXN_RDONLY)
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
/* Win32: SRWL flag */ txn_shrink_allowed)) == 0);
else {
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | MDBX_NOMETASYNC |
MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed);
}
txn->signature = txn_signature;
txn->userctx = context;
*ret = txn;
DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
(flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root,
txn->dbs[FREE_DBI].root);
}
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
else if (flags & MDBX_TXN_RDONLY)
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
/* Win32: SRWL flag */ txn_shrink_allowed)) == 0);
else {
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | txn_may_have_cursors |
MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
assert(!txn->wr.spilled.list && !txn->wr.spilled.least_removed);
}
txn->signature = txn_signature;
txn->userctx = context;
*ret = txn;
DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
(flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root,
txn->dbs[FREE_DBI].root);
return MDBX_SUCCESS;
}
static void latency_gcprof(MDBX_commit_latency *latency, const MDBX_txn *txn) {
MDBX_env *const env = txn->env;
if (latency && likely(env->lck) && MDBX_ENABLE_PROFGC) {
pgop_stat_t *const ptr = &env->lck->pgops;
latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter;
latency->gc_prof.work_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic);
latency->gc_prof.work_xtime_cpu = osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu);
latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps;
latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages;
latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt;
latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter;
latency->gc_prof.self_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic);
latency->gc_prof.self_xtime_cpu = osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu);
latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps;
latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages;
latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt;
latency->gc_prof.wloops = ptr->gc_prof.wloops;
latency->gc_prof.coalescences = ptr->gc_prof.coalescences;
latency->gc_prof.wipes = ptr->gc_prof.wipes;
latency->gc_prof.flushes = ptr->gc_prof.flushes;
latency->gc_prof.kicks = ptr->gc_prof.kicks;
latency->gc_prof.pnl_merge_work.time = osal_monotime_to_16dot16(ptr->gc_prof.work.pnl_merge.time);
latency->gc_prof.pnl_merge_work.calls = ptr->gc_prof.work.pnl_merge.calls;
latency->gc_prof.pnl_merge_work.volume = ptr->gc_prof.work.pnl_merge.volume;
latency->gc_prof.pnl_merge_self.time = osal_monotime_to_16dot16(ptr->gc_prof.self.pnl_merge.time);
latency->gc_prof.pnl_merge_self.calls = ptr->gc_prof.self.pnl_merge.calls;
latency->gc_prof.pnl_merge_self.volume = ptr->gc_prof.self.pnl_merge.volume;
if (txn == env->basal_txn)
memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof));
}
}
static void latency_init(MDBX_commit_latency *latency, struct commit_timestamp *ts) {
ts->start = 0;
ts->gc_cpu = 0;
if (latency) {
ts->start = osal_monotime();
memset(latency, 0, sizeof(*latency));
}
ts->prep = ts->gc = ts->audit = ts->write = ts->sync = ts->start;
}
static void latency_done(MDBX_commit_latency *latency, struct commit_timestamp *ts) {
if (latency) {
latency->preparation = (ts->prep > ts->start) ? osal_monotime_to_16dot16(ts->prep - ts->start) : 0;
latency->gc_wallclock = (ts->gc > ts->prep) ? osal_monotime_to_16dot16(ts->gc - ts->prep) : 0;
latency->gc_cputime = ts->gc_cpu ? osal_monotime_to_16dot16(ts->gc_cpu) : 0;
latency->audit = (ts->audit > ts->gc) ? osal_monotime_to_16dot16(ts->audit - ts->gc) : 0;
latency->write = (ts->write > ts->audit) ? osal_monotime_to_16dot16(ts->write - ts->audit) : 0;
latency->sync = (ts->sync > ts->write) ? osal_monotime_to_16dot16(ts->sync - ts->write) : 0;
const uint64_t ts_end = osal_monotime();
latency->ending = (ts_end > ts->sync) ? osal_monotime_to_16dot16(ts_end - ts->sync) : 0;
latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_end - ts->start);
}
return LOG_IFERR(rc);
}
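The `osal_monotime_to_16dot16()` calls above encode each duration as 16.16 fixed point, i.e. units of 1/65536 second; a one-liner sketch for consumers of `MDBX_commit_latency` (assuming only the 16.16 convention):

```c
/* Convert a 16.16 fixed-point duration back to floating seconds. */
static double fixed_16dot16_to_seconds(uint32_t v) { return v / 65536.0; }
```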
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR - MDBX_TXN_PARKED);
const uint64_t ts_0 = latency ? osal_monotime() : 0;
uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0;
struct commit_timestamp ts;
latency_init(latency, &ts);
/* txn_end() mode for a commit which writes nothing */
unsigned end_mode = TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE;
int rc = check_txn(txn, MDBX_TXN_FINISHED);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_BAD_TXN && F_ISSET(txn->flags, MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) {
if (rc == MDBX_BAD_TXN && (txn->flags & MDBX_TXN_RDONLY)) {
rc = MDBX_RESULT_TRUE;
goto fail;
}
bailout:
if (latency)
memset(latency, 0, sizeof(*latency));
return LOG_IFERR(rc);
}
@@ -356,17 +415,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
if (MDBX_ENV_CHECKPID && unlikely(env->pid != osal_getpid())) {
env->flags |= ENV_FATAL_ERROR;
rc = MDBX_PANIC;
return LOG_IFERR(rc);
goto bailout;
}
if (txn->flags & MDBX_TXN_RDONLY) {
if (unlikely(txn->parent || (txn->flags & MDBX_TXN_HAS_CHILD) || txn == env->txn || txn == env->basal_txn)) {
ERROR("attempt to commit %s txn %p", "strange read-only", (void *)txn);
return MDBX_PROBLEM;
if (unlikely(txn->flags & MDBX_TXN_RDONLY)) {
if (txn->flags & MDBX_TXN_ERROR) {
rc = MDBX_RESULT_TRUE;
goto fail;
}
latency_gcprof(latency, txn);
rc = (txn->flags & MDBX_TXN_ERROR) ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
txn_end(txn, TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE);
goto done;
}
@@ -380,12 +436,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
if (unlikely(txn->flags & MDBX_TXN_ERROR)) {
rc = MDBX_RESULT_TRUE;
fail:
latency_gcprof(latency, txn);
int err = txn_abort(txn);
if (unlikely(err != MDBX_SUCCESS))
rc = err;
goto done;
goto fail;
}
if (txn->nested) {
@@ -396,38 +447,370 @@ }
}
if (unlikely(txn != env->txn)) {
ERROR("attempt to commit %s txn %p", "unknown", (void *)txn);
return MDBX_EINVAL;
DEBUG("%s", "attempt to commit unknown transaction");
rc = MDBX_EINVAL;
goto fail;
}
if (txn->parent) {
if (unlikely(txn->parent->nested != txn || txn->parent->env != env)) {
ERROR("attempt to commit %s txn %p", "strange nested", (void *)txn);
return MDBX_PROBLEM;
tASSERT(txn, audit_ex(txn, 0, false) == 0);
eASSERT(env, txn != env->basal_txn);
MDBX_txn *const parent = txn->parent;
eASSERT(env, parent->signature == txn_signature);
eASSERT(env, parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0);
eASSERT(env, dpl_check(txn));
if (txn->tw.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && parent->n_dbi == txn->n_dbi) {
/* fast completion of pure nested transaction */
VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid);
tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0);
tASSERT(txn, memcmp(&parent->canary, &txn->canary, sizeof(parent->canary)) == 0);
tASSERT(txn, !txn->tw.spilled.list || MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0);
tASSERT(txn, txn->tw.loose_count == 0);
/* Update parent's DBs array */
eASSERT(env, parent->n_dbi == txn->n_dbi);
TXN_FOREACH_DBI_ALL(txn, dbi) {
tASSERT(txn, (txn->dbi_state[dbi] & (DBI_CREAT | DBI_DIRTY)) == 0);
if (txn->dbi_state[dbi] & DBI_FRESH) {
parent->dbs[dbi] = txn->dbs[dbi];
/* preserve parent's status */
const uint8_t state = txn->dbi_state[dbi] | DBI_FRESH;
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
parent->dbi_state[dbi], state);
parent->dbi_state[dbi] = state;
}
}
txn_done_cursors(txn, true);
end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE | TXN_END_EOTDONE;
goto done;
}
latency_gcprof(latency, txn);
rc = txn_nested_join(txn, latency ? &ts : nullptr);
/* Preserve space for spill list to avoid parent's state corruption
* if allocation fails. */
const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages;
tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages));
const size_t retired_delta = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len;
if (retired_delta) {
rc = pnl_need(&txn->tw.repnl, retired_delta);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
if (txn->tw.spilled.list) {
if (parent->tw.spilled.list) {
rc = pnl_need(&parent->tw.spilled.list, MDBX_PNL_GETSIZE(txn->tw.spilled.list));
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
spill_purge(txn);
}
if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent &&
!dpl_reserve(parent, txn->tw.dirtylist->length + parent->tw.dirtylist->length))) {
rc = MDBX_ENOMEM;
goto fail;
}
//-------------------------------------------------------------------------
parent->tw.gc.retxl = txn->tw.gc.retxl;
txn->tw.gc.retxl = nullptr;
parent->tw.retired_pages = txn->tw.retired_pages;
txn->tw.retired_pages = nullptr;
pnl_free(parent->tw.repnl);
parent->tw.repnl = txn->tw.repnl;
txn->tw.repnl = nullptr;
parent->tw.gc.time_acc = txn->tw.gc.time_acc;
parent->tw.gc.last_reclaimed = txn->tw.gc.last_reclaimed;
parent->geo = txn->geo;
parent->canary = txn->canary;
parent->flags |= txn->flags & MDBX_TXN_DIRTY;
/* Move loose pages to parent */
#if MDBX_ENABLE_REFUND
parent->tw.loose_refund_wl = txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
parent->tw.loose_count = txn->tw.loose_count;
parent->tw.loose_pages = txn->tw.loose_pages;
/* Merge our cursors into parent's and close them */
txn_done_cursors(txn, true);
end_mode |= TXN_END_EOTDONE;
/* Update parent's DBs array */
eASSERT(env, parent->n_dbi == txn->n_dbi);
TXN_FOREACH_DBI_ALL(txn, dbi) {
if (txn->dbi_state[dbi] != (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))) {
eASSERT(env, (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) != 0 ||
(txn->dbi_state[dbi] | DBI_STALE) ==
(parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY)));
parent->dbs[dbi] = txn->dbs[dbi];
/* preserve parent's status */
const uint8_t state = txn->dbi_state[dbi] | (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
parent->dbi_state[dbi], state);
parent->dbi_state[dbi] = state;
}
}
if (latency) {
ts_1 = osal_monotime();
ts_2 = /* no gc-update */ ts_1;
ts_3 = /* no audit */ ts_2;
ts_4 = /* no write */ ts_3;
ts_5 = /* no sync */ ts_4;
}
txn_merge(parent, txn, parent_retired_len);
env->txn = parent;
parent->nested = nullptr;
tASSERT(parent, dpl_check(parent));
#if MDBX_ENABLE_REFUND
txn_refund(parent);
if (ASSERT_ENABLED()) {
/* Check parent's loose pages not suitable for refund */
for (page_t *lp = parent->tw.loose_pages; lp; lp = page_next(lp)) {
tASSERT(parent, lp->pgno < parent->tw.loose_refund_wl && lp->pgno + 1 < parent->geo.first_unallocated);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
}
/* Check parent's reclaimed pages not suitable for refund */
if (MDBX_PNL_GETSIZE(parent->tw.repnl))
tASSERT(parent, MDBX_PNL_MOST(parent->tw.repnl) + 1 < parent->geo.first_unallocated);
}
#endif /* MDBX_ENABLE_REFUND */
txn->signature = 0;
osal_free(txn);
tASSERT(parent, audit_ex(parent, 0, false) == 0);
rc = MDBX_SUCCESS;
goto provide_latency;
}
if (!txn->tw.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
(txn->parent ? txn->parent->tw.dirtyroom : env->options.dp_limit));
}
txn_done_cursors(txn, false);
end_mode |= TXN_END_EOTDONE;
if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) &&
(txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) {
TXN_FOREACH_DBI_ALL(txn, i) { tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); }
#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT
rc = txn_end(txn, end_mode);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
rc = MDBX_RESULT_TRUE;
goto provide_latency;
#else
goto done;
#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */
}
rc = txn_basal_commit(txn, latency ? &ts : nullptr);
latency_gcprof(latency, txn);
int end = TXN_END_COMMITTED | TXN_END_UPDATE;
if (unlikely(rc != MDBX_SUCCESS)) {
end = TXN_END_ABORT;
if (rc == MDBX_RESULT_TRUE) {
end = TXN_END_PURE_COMMIT | TXN_END_UPDATE;
rc = MDBX_NOSUCCESS_PURE_COMMIT ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, (void *)txn,
(void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
if (txn->n_dbi > CORE_DBS) {
/* Update table root pointers */
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
TXN_FOREACH_DBI_USER(txn, i) {
if ((txn->dbi_state[i] & DBI_DIRTY) == 0)
continue;
tree_t *const db = &txn->dbs[i];
DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->mod_txnid, txn->txnid);
/* mod_txnid may be greater than front after committing nested transactions */
db->mod_txnid = txn->txnid;
MDBX_val data = {db, sizeof(tree_t)};
rc = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE);
if (unlikely(rc != MDBX_SUCCESS)) {
txn->cursors[MAIN_DBI] = cx.outer.next;
goto fail;
}
}
txn->cursors[MAIN_DBI] = cx.outer.next;
}
ts_1 = latency ? osal_monotime() : 0;
gcu_t gcu_ctx;
gc_cputime = latency ? osal_cputime(nullptr) : 0;
rc = gc_update_init(txn, &gcu_ctx);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
rc = gc_update(txn, &gcu_ctx);
gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0;
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
tASSERT(txn, txn->tw.loose_count == 0);
txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[FREE_DBI].mod_txnid;
txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[MAIN_DBI].mod_txnid;
ts_2 = latency ? osal_monotime() : 0;
ts_3 = ts_2;
if (AUDIT_ENABLED()) {
rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true);
ts_3 = osal_monotime();
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
bool need_flush_for_nometasync = false;
const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
const uint32_t meta_sync_txnid = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
/* sync prev meta */
if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) {
/* A fix for a shortcoming inherited from LMDB:
*
* Everything is fine if none of the processes working with the DB use
* WRITEMAP. Then the meta-page (updated but not yet flushed to disk) will
* be persisted by the fdatasync() performed when writing this
* transaction's data.
*
* Everything is fine if all processes working with the DB use WRITEMAP
* without MDBX_AVOID_MSYNC. Then the meta-page (updated but not yet
* flushed to disk) will be persisted by the msync() performed when
* writing this transaction's data.
*
* But if the processes working with the DB use both methods, i.e. both
* sync() in MDBX_WRITEMAP mode and writes through a file descriptor, then
* it becomes impossible to ensure that the previous transaction's
* meta-page and the current transaction's data are fixed on disk by the
* single sync operation performed after writing the current transaction's
* data. Accordingly, the meta-page must be updated explicitly, which
* completely destroys the benefit of NOMETASYNC. */
const uint32_t txnid_dist = ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) ? MDBX_NOMETASYNC_LAZY_FD
: MDBX_NOMETASYNC_LAZY_WRITEMAP;
/* The point of the "magic" is to avoid a separate fdatasync() or msync()
* call for guaranteed on-disk fixation of the meta-page that was "lazily"
* submitted for writing in the previous transaction, but not flushed to
* disk because of the active MDBX_NOMETASYNC mode. */
if (
#if defined(_WIN32) || defined(_WIN64)
!env->ioring.overlapped_fd &&
#endif
meta_sync_txnid == (uint32_t)head.txnid - txnid_dist)
need_flush_for_nometasync = true;
else {
rc = meta_sync(env, head);
if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("txn-%s: error %d", "presync-meta", rc);
goto fail;
}
}
}
int err = txn_end(txn, end);
if (unlikely(err != MDBX_SUCCESS))
rc = err;
if (txn->tw.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
tASSERT(txn, txn->tw.loose_count == 0);
mdbx_filehandle_t fd =
#if defined(_WIN32) || defined(_WIN64)
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
(void)need_flush_for_nometasync;
#else
(need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE ||
txn->tw.dirtylist->length > env->options.writethrough_threshold ||
atomic_load64(&env->lck->unsynced_pages, mo_Relaxed))
? env->lazy_fd
: env->dsync_fd;
#endif /* Windows */
iov_ctx_t write_ctx;
rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, txn->tw.dirtylist->pages_including_loose, fd, false);
if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("txn-%s: error %d", "iov-init", rc);
goto fail;
}
rc = txn_write(txn, &write_ctx);
if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("txn-%s: error %d", "write", rc);
goto fail;
}
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
if (!env->lck->eoos_timestamp.weak)
env->lck->eoos_timestamp.weak = osal_monotime();
}
/* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
ts_4 = latency ? osal_monotime() : 0;
meta_t meta;
memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8);
meta.reserve16 = head.ptr_c->reserve16;
meta.validator_id = head.ptr_c->validator_id;
meta.extra_pagehdr = head.ptr_c->extra_pagehdr;
unaligned_poke_u64(4, meta.pages_retired,
unaligned_peek_u64(4, head.ptr_c->pages_retired) + MDBX_PNL_GETSIZE(txn->tw.retired_pages));
meta.geometry = txn->geo;
meta.trees.gc = txn->dbs[FREE_DBI];
meta.trees.main = txn->dbs[MAIN_DBI];
meta.canary = txn->canary;
memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid));
txnid_t commit_txnid = txn->txnid;
#if MDBX_ENABLE_BIGFOOT
if (gcu_ctx.bigfoot > txn->txnid) {
commit_txnid = gcu_ctx.bigfoot;
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid));
}
#endif
meta.unsafe_sign = DATASIGN_NONE;
meta_set_txnid(env, &meta, commit_txnid);
rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, &txn->tw.troika);
ts_5 = latency ? osal_monotime() : 0;
if (unlikely(rc != MDBX_SUCCESS)) {
env->flags |= ENV_FATAL_ERROR;
ERROR("txn-%s: error %d", "sync", rc);
goto fail;
}
end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE;
done:
latency_done(latency, &ts);
if (latency)
txn_take_gcprof(txn, latency);
rc = txn_end(txn, end_mode);
provide_latency:
if (latency) {
latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0;
latency->gc_wallclock = (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0;
latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0;
latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0;
latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0;
latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0;
const uint64_t ts_6 = osal_monotime();
latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0;
latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0);
}
return LOG_IFERR(rc);
fail:
txn->flags |= MDBX_TXN_ERROR;
if (latency)
txn_take_gcprof(txn, latency);
txn_abort(txn);
goto provide_latency;
}
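The ts_0…ts_6 bookkeeping above feeds the public MDBX_commit_latency counters in 16.16 fixed point. A short usage sketch (editorial, not part of the diff) for reading them; the field names follow the latency->… accesses visible above.
#include "mdbx.h"
#include <stdio.h>
static double from_16dot16(uint32_t v) { return v / 65536.0; }
static int commit_with_profile(MDBX_txn *txn) {
  MDBX_commit_latency latency;
  int rc = mdbx_txn_commit_ex(txn, &latency);
  if (rc == MDBX_SUCCESS)
    printf("commit: gc %.3fs, write %.3fs, sync %.3fs, whole %.3fs\n",
           from_16dot16(latency.gc_wallclock), from_16dot16(latency.write),
           from_16dot16(latency.sync), from_16dot16(latency.whole));
  return rc;
}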
int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
@ -465,10 +848,10 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
info->txn_reader_lag = head.txnid - info->txn_id;
info->txn_space_dirty = info->txn_space_retired = 0;
uint64_t reader_snapshot_pages_retired = 0;
if (txn->ro.slot &&
((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->ro.slot->tid) != MDBX_TID_TXN_OUSTED) &&
if (txn->to.reader &&
((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->to.reader->tid) != MDBX_TID_TXN_OUSTED) &&
head_retired >
(reader_snapshot_pages_retired = atomic_load64(&txn->ro.slot->snapshot_pages_retired, mo_Relaxed))) {
(reader_snapshot_pages_retired = atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed))) {
info->txn_space_dirty = info->txn_space_retired =
pgno2bytes(env, (pgno_t)(head_retired - reader_snapshot_pages_retired));
@ -495,7 +878,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
if (snap_txnid < next_reader && snap_tid >= MDBX_TID_TXN_OUSTED) {
next_reader = snap_txnid;
retired_next_reader = pgno2bytes(
env, (pgno_t)(snap_retired - atomic_load64(&txn->ro.slot->snapshot_pages_retired, mo_Relaxed)));
env, (pgno_t)(snap_retired - atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed)));
}
}
}
@ -506,33 +889,31 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now);
info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper);
info->txn_space_retired =
pgno2bytes(env, txn->nested ? (size_t)txn->wr.retired_pages : MDBX_PNL_GETSIZE(txn->wr.retired_pages));
info->txn_space_leftover = pgno2bytes(env, txn->wr.dirtyroom);
pgno2bytes(env, txn->nested ? (size_t)txn->tw.retired_pages : MDBX_PNL_GETSIZE(txn->tw.retired_pages));
info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom);
info->txn_space_dirty =
pgno2bytes(env, txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose
: (txn->wr.writemap_dirty_npages + txn->wr.writemap_spilled_npages));
pgno2bytes(env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
: (txn->tw.writemap_dirty_npages + txn->tw.writemap_spilled_npages));
info->txn_reader_lag = INT64_MAX;
lck_t *const lck = env->lck_mmap.lck;
if (scan_rlt && lck) {
txnid_t oldest_reading = txn->txnid;
txnid_t oldest_snapshot = txn->txnid;
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
if (snap_nreaders) {
txn_gc_detent(txn);
oldest_reading = txn->env->gc.detent;
if (oldest_reading == txn->wr.troika.txnid[txn->wr.troika.recent]) {
        /* If the oldest snapshot in use is the previous one, i.e. the one immediately preceding the current
         * transaction, scan the reader table to find out whether that snapshot is actually used by
         * any reader. */
oldest_reading = txn->txnid;
oldest_snapshot = txn_snapshot_oldest(txn);
if (oldest_snapshot == txn->txnid - 1) {
/* check if there is at least one reader */
bool exists = false;
for (size_t i = 0; i < snap_nreaders; ++i) {
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->env->gc.detent == safe64_read(&lck->rdt[i].txnid)) {
oldest_reading = txn->env->gc.detent;
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->txnid > safe64_read(&lck->rdt[i].txnid)) {
exists = true;
break;
}
}
oldest_snapshot += !exists;
}
}
info->txn_reader_lag = txn->txnid - oldest_reading;
info->txn_reader_lag = txn->txnid - oldest_snapshot;
}
}
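A short usage sketch (editorial, not part of the diff) for the function whose reader-lag logic the hunk above changes; passing scan_rlt=true enables the reader-table scan discussed there. The field names are those used in the code above.
#include "mdbx.h"
#include <inttypes.h>
#include <stdio.h>
static void report_txn(const MDBX_txn *txn) {
  MDBX_txn_info info;
  if (mdbx_txn_info(txn, &info, /*scan_rlt=*/true) == MDBX_SUCCESS)
    printf("txn #%" PRIu64 ": reader-lag %" PRIu64 ", dirty %" PRIu64
           " bytes, retired %" PRIu64 " bytes\n",
           info.txn_id, info.txn_reader_lag, info.txn_space_dirty,
           info.txn_space_retired);
}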

View File

@ -24,11 +24,12 @@ static size_t audit_db_used(const tree_t *db) {
return db ? (size_t)db->branch_pages + (size_t)db->leaf_pages + (size_t)db->large_pages : 0;
}
__cold static int audit_ex_locked(MDBX_txn *txn, const size_t retired_stored, const bool dont_filter_gc) {
__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) {
const MDBX_env *const env = txn->env;
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const size_t pending = txn->wr.loose_count + MDBX_PNL_GETSIZE(txn->wr.repnl) +
(MDBX_PNL_GETSIZE(txn->wr.retired_pages) - retired_stored);
size_t pending = 0;
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.repnl) +
(MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored);
cursor_couple_t cx;
int rc = cursor_init(&cx.outer, txn, FREE_DBI);
@ -39,16 +40,17 @@ __cold static int audit_ex_locked(MDBX_txn *txn, const size_t retired_stored, co
MDBX_val key, data;
rc = outer_first(&cx.outer, &key, &data);
while (rc == MDBX_SUCCESS) {
if (unlikely(key.iov_len != sizeof(txnid_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
return MDBX_CORRUPTED;
if (!dont_filter_gc) {
if (unlikely(key.iov_len != sizeof(txnid_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
return MDBX_CORRUPTED;
}
txnid_t id = unaligned_peek_u64(4, key.iov_base);
if (txn->tw.gc.retxl ? txl_contain(txn->tw.gc.retxl, id) : (id <= txn->tw.gc.last_reclaimed))
goto skip;
}
const txnid_t id = unaligned_peek_u64(4, key.iov_base);
const size_t len = *(pgno_t *)data.iov_base;
const bool acc = dont_filter_gc || !gc_is_reclaimed(txn, id);
TRACE("%s id %" PRIaTXN " len %zu", acc ? "acc" : "skip", id, len);
if (acc)
gc += len;
gc += *(pgno_t *)data.iov_base;
skip:
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT);
}
tASSERT(txn, rc == MDBX_NOTFOUND);
@ -87,8 +89,8 @@ __cold static int audit_ex_locked(MDBX_txn *txn, const size_t retired_stored, co
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + "
"%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)",
txn->txnid, pending, txn->wr.loose_count, MDBX_PNL_GETSIZE(txn->wr.repnl),
txn->wr.retired_pages ? MDBX_PNL_GETSIZE(txn->wr.retired_pages) : 0, retired_stored);
txn->txnid, pending, txn->tw.loose_count, MDBX_PNL_GETSIZE(txn->tw.repnl),
txn->tw.retired_pages ? MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, retired_stored);
ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu"
"(gc) + %zu(count) = %zu(total) <> %zu"
"(allocated)",

View File

@ -8,7 +8,7 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD
5 |0000 0020| |TXN_PARKED |INTEGERDUP|NODUPDATA | | |P_DUPFIX | |
6 |0000 0040| |TXN_AUTOUNPARK|REVERSEDUP|CURRENT |DBI_OLDEN | |P_SUBP | |
7 |0000 0080| |TXN_DRAINED_GC|DB_VALID |ALLDUPS |DBI_LINDO | | | |
8 |0000 0100| _MAY_MOVE |TXN_CURSORS | | | | | | <= |
8 |0000 0100| _MAY_MOVE | | | | | | | <= |
9 |0000 0200| _MAY_UNMAP| | | | | | | <= |
10|0000 0400| | | | | | | | |
11|0000 0800| | | | | | | | |

View File

@ -159,19 +159,6 @@ __cold static MDBX_chk_line_t *MDBX_PRINTF_ARGS(2, 3) chk_print(MDBX_chk_line_t
return line;
}
MDBX_MAYBE_UNUSED __cold static void chk_println_va(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity,
const char *fmt, va_list args) {
chk_line_end(chk_print_va(chk_line_begin(scope, severity), fmt, args));
}
MDBX_MAYBE_UNUSED __cold static void chk_println(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity,
const char *fmt, ...) {
va_list args;
va_start(args, fmt);
chk_println_va(scope, severity, fmt, args);
va_end(args);
}
__cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, const char *prefix, const uint64_t value,
const char *suffix) {
static const char sf[] = "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */
@ -468,8 +455,9 @@ __cold static void chk_dispose(MDBX_chk_internal_t *chk) {
chk->cb->table_dispose(chk->usr, tbl);
tbl->cookie = nullptr;
}
if (tbl != &chk->table_gc && tbl != &chk->table_main)
if (tbl != &chk->table_gc && tbl != &chk->table_main) {
osal_free(tbl);
}
}
}
osal_free(chk->v2a_buf.iov_base);
@ -1139,7 +1127,6 @@ __cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, MDBX_chk_t
const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, tbl->flags);
MDBX_val prev_key = {nullptr, 0}, prev_data = {nullptr, 0};
MDBX_val key, data;
size_t dups_count = 0;
err = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST);
while (err == MDBX_SUCCESS) {
err = chk_check_break(scope);
@ -1163,12 +1150,6 @@ __cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, MDBX_chk_t
}
if (prev_key.iov_base) {
if (key.iov_base == prev_key.iov_base)
dups_count += 1;
else {
histogram_acc(dups_count, &tbl->histogram.multival);
dups_count = 0;
}
if (prev_data.iov_base && !bad_data && (tbl->flags & MDBX_DUPFIXED) && prev_data.iov_len != data.iov_len) {
chk_object_issue(scope, "entry", record_count, "different data length", "%" PRIuPTR " != %" PRIuPTR,
prev_data.iov_len, data.iov_len);
@ -1255,27 +1236,17 @@ __cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, MDBX_chk_t
err = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT);
}
if (prev_key.iov_base)
histogram_acc(dups_count, &tbl->histogram.multival);
err = (err != MDBX_NOTFOUND) ? chk_error_rc(scope, err, "mdbx_cursor_get") : MDBX_SUCCESS;
if (err == MDBX_SUCCESS && record_count != db->items)
chk_scope_issue(scope, "different number of entries %" PRIuSIZE " != %" PRIu64, record_count, db->items);
bailout:
if (cursor) {
if (handler) {
if (record_count) {
if (tbl->histogram.key_len.count) {
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info);
line = histogram_dist(line, &tbl->histogram.key_len, "key length density", "0/1", false);
chk_line_feed(line);
line = histogram_dist(line, &tbl->histogram.val_len, "value length density", "0/1", false);
if (tbl->histogram.multival.amount) {
chk_line_feed(line);
line = histogram_dist(line, &tbl->histogram.multival, "number of multi-values density", "single", false);
chk_line_feed(line);
line = chk_print(line, "number of keys %" PRIuSIZE ", average values per key %.1f",
tbl->histogram.multival.count, record_count / (double)tbl->histogram.multival.count);
}
chk_line_end(line);
}
if (scope->stage == MDBX_chk_maindb)
@ -1330,9 +1301,9 @@ __cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, MDBX_chk_table_t
(number + 1) * sizeof(pgno_t), data->iov_len);
number = data->iov_len / sizeof(pgno_t) - 1;
} else if (data->iov_len - (number + 1) * sizeof(pgno_t) >=
/* LY: allow a gap of up to two pages. it is ok
/* LY: allow a gap of up to one page. it is ok
* and better than shrink-and-retry inside gc_update() */
usr->env->ps * 2)
usr->env->ps)
chk_object_issue(scope, "entry", txnid, "extra idl space",
"%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", (number + 1) * sizeof(pgno_t),
data->iov_len);

View File

@ -250,15 +250,9 @@ MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *data_page(const void *dat
MDBX_NOTHROW_PURE_FUNCTION static inline meta_t *page_meta(page_t *mp) { return (meta_t *)page_data(mp); }
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) {
assert(mp->lower <= mp->upper);
return mp->lower >> 1;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) { return mp->lower >> 1; }
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) {
assert(mp->lower <= mp->upper);
return mp->upper - mp->lower;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) { return mp->upper - mp->lower; }
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_space(const MDBX_env *env) {
STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
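Editorial sketch (not part of the diff): the invariant behind the two accessors above, assuming — as the code implies — that lower counts the bytes of the page's 2-byte entry-slot array while upper marks where node bodies begin; toy_page is a made-up stand-in for page_t.
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
struct toy_page {
  uint16_t lower; /* bytes used by the slot array, 2 bytes per entry */
  uint16_t upper; /* offset where node data begins */
};
int main(void) {
  struct toy_page p = {.lower = 3 * 2, .upper = 4000};
  assert(p.lower <= p.upper); /* the assert the master side keeps */
  printf("numkeys=%u room=%u\n", (unsigned)(p.lower >> 1),
         (unsigned)(p.upper - p.lower));
  return 0; /* prints: numkeys=3 room=3994 */
}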

View File

@ -6,12 +6,12 @@
#include "internals.h"
__cold int cursor_validate(const MDBX_cursor *mc) {
if (!mc->txn->wr.dirtylist) {
if (!mc->txn->tw.dirtylist) {
cASSERT(mc, (mc->txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
} else {
cASSERT(mc, (mc->txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
cASSERT(mc, mc->txn->wr.dirtyroom + mc->txn->wr.dirtylist->length ==
(mc->txn->parent ? mc->txn->parent->wr.dirtyroom : mc->txn->env->options.dp_limit));
cASSERT(mc, mc->txn->tw.dirtyroom + mc->txn->tw.dirtylist->length ==
(mc->txn->parent ? mc->txn->parent->tw.dirtyroom : mc->txn->env->options.dp_limit));
}
cASSERT(mc, (mc->checking & z_updating) ? mc->top + 1 <= mc->tree->height : mc->top + 1 == mc->tree->height);
@ -184,74 +184,79 @@ __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, const MDBX_va
/*----------------------------------------------------------------------------*/
int cursor_shadow(MDBX_cursor *cursor, MDBX_txn *nested, const size_t dbi) {
tASSERT(nested, cursor->signature == cur_signature_live);
tASSERT(nested, cursor->txn != nested);
cASSERT(cursor, cursor->txn->flags & txn_may_have_cursors);
cASSERT(cursor, dbi == cursor_dbi(cursor));
int cursor_shadow(MDBX_cursor *mc, MDBX_txn *nested, const size_t dbi) {
tASSERT(nested, dbi > FREE_DBI && dbi < nested->n_dbi);
const size_t size = cursor->subcur ? sizeof(MDBX_cursor) + sizeof(subcur_t) : sizeof(MDBX_cursor);
MDBX_cursor *const shadow = osal_malloc(size);
if (unlikely(!shadow))
return MDBX_ENOMEM;
const size_t size = mc->subcur ? sizeof(MDBX_cursor) + sizeof(subcur_t) : sizeof(MDBX_cursor);
for (MDBX_cursor *bk; mc; mc = bk->next) {
cASSERT(mc, mc != mc->next);
if (mc->signature != cur_signature_live) {
ENSURE(nested->env, mc->signature == cur_signature_wait4eot);
bk = mc;
continue;
}
bk = osal_malloc(size);
if (unlikely(!bk))
return MDBX_ENOMEM;
#if MDBX_DEBUG
memset(shadow, 0xCD, size);
VALGRIND_MAKE_MEM_UNDEFINED(shadow, size);
memset(bk, 0xCD, size);
VALGRIND_MAKE_MEM_UNDEFINED(bk, size);
#endif /* MDBX_DEBUG */
*shadow = *cursor;
cursor->backup = shadow;
cursor->txn = nested;
cursor->tree = &nested->dbs[dbi];
cursor->dbi_state = &nested->dbi_state[dbi];
subcur_t *subcur = cursor->subcur;
if (subcur) {
*(subcur_t *)(shadow + 1) = *subcur;
subcur->cursor.txn = nested;
subcur->cursor.dbi_state = &nested->dbi_state[dbi];
*bk = *mc;
mc->backup = bk;
mc->txn = nested;
mc->tree = &nested->dbs[dbi];
mc->dbi_state = &nested->dbi_state[dbi];
subcur_t *mx = mc->subcur;
if (mx) {
*(subcur_t *)(bk + 1) = *mx;
mx->cursor.txn = nested;
mx->cursor.dbi_state = &nested->dbi_state[dbi];
}
mc->next = nested->cursors[dbi];
nested->cursors[dbi] = mc;
}
return MDBX_SUCCESS;
}
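Editorial sketch (not part of the diff): the save/restore idea behind cursor_shadow()/cursor_eot(), reduced to a toy state struct; all names here are illustrative only.
#include <stdlib.h>
struct toy_cursor {
  long position; /* stands in for the page/ki stack */
  struct toy_cursor *backup;
};
/* on nested-txn begin: snapshot the state into a heap "backup" */
static int shadow(struct toy_cursor *c) {
  struct toy_cursor *bk = malloc(sizeof(*bk));
  if (!bk)
    return -1; /* MDBX_ENOMEM in the real code */
  *bk = *c;
  c->backup = bk; /* the cursor keeps working inside the nested txn */
  return 0;
}
/* on nested-txn end: either merge into the parent or roll back */
static void end_of_txn(struct toy_cursor *c, int merge) {
  struct toy_cursor *bk = c->backup;
  if (merge)
    c->backup = bk->backup; /* keep nested state, unlink the snapshot */
  else
    *c = *bk; /* abort: restore the parent-txn state wholesale */
  free(bk);
}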
MDBX_cursor *cursor_eot(MDBX_cursor *cursor, MDBX_txn *txn) {
MDBX_cursor *const next = cursor->next;
const unsigned stage = cursor->signature;
MDBX_cursor *const shadow = cursor->backup;
ENSURE(txn->env, stage == cur_signature_live || (stage == cur_signature_wait4eot && shadow));
tASSERT(txn, cursor->txn == txn);
if (shadow) {
subcur_t *subcur = cursor->subcur;
tASSERT(txn, txn->parent != nullptr && shadow->txn == txn->parent);
/* Zap: Using uninitialized memory '*subcur->backup'. */
MDBX_cursor *cursor_eot(MDBX_cursor *mc, MDBX_txn *txn, const bool merge) {
MDBX_cursor *const next = mc->next;
const unsigned stage = mc->signature;
MDBX_cursor *const bk = mc->backup;
ENSURE(txn->env, stage == cur_signature_live || (stage == cur_signature_wait4eot && bk));
tASSERT(txn, mc->txn == txn);
if (bk) {
subcur_t *mx = mc->subcur;
tASSERT(txn, mc->txn->parent != nullptr);
tASSERT(txn, bk->txn == txn->parent);
/* Zap: Using uninitialized memory '*mc->backup'. */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001);
ENSURE(txn->env, shadow->signature == cur_signature_live);
tASSERT(txn, subcur == shadow->subcur);
if ((txn->flags & MDBX_TXN_ERROR) == 0) {
ENSURE(txn->env, bk->signature == cur_signature_live);
tASSERT(txn, mx == bk->subcur);
if (merge) {
/* Update pointers to parent txn */
cursor->next = shadow->next;
cursor->backup = shadow->backup;
cursor->txn = shadow->txn;
cursor->tree = shadow->tree;
cursor->dbi_state = shadow->dbi_state;
if (subcur) {
subcur->cursor.txn = shadow->txn;
subcur->cursor.dbi_state = shadow->dbi_state;
mc->next = bk->next;
mc->backup = bk->backup;
mc->txn = bk->txn;
mc->tree = bk->tree;
mc->dbi_state = bk->dbi_state;
if (mx) {
mx->cursor.txn = bk->txn;
mx->cursor.dbi_state = bk->dbi_state;
}
} else {
/* Restore from backup, i.e. rollback/abort nested txn */
*cursor = *shadow;
cursor->signature = stage /* Promote (cur_signature_wait4eot) state to parent txn */;
if (subcur)
*subcur = *(subcur_t *)(shadow + 1);
*mc = *bk;
mc->signature = stage /* Promote (cur_signature_wait4eot) state to parent txn */;
if (mx)
*mx = *(subcur_t *)(bk + 1);
}
shadow->signature = 0;
osal_free(shadow);
bk->signature = 0;
osal_free(bk);
} else {
ENSURE(cursor->txn->env, stage == cur_signature_live);
cursor->signature = cur_signature_ready4dispose /* Cursor may be reused */;
cursor->next = cursor;
cursor_drown((cursor_couple_t *)cursor);
ENSURE(mc->txn->env, stage == cur_signature_live);
mc->signature = cur_signature_ready4dispose /* Cursor may be reused */;
mc->next = mc;
cursor_drown((cursor_couple_t *)mc);
}
return next;
}
@ -638,7 +643,7 @@ static __always_inline int cursor_step(const bool inner, const bool forward, MDB
inner_gone(mc);
} else {
if (mc->flags & z_hollow) {
cASSERT(mc, !inner_pointed(mc) || inner_hollow(mc));
cASSERT(mc, !inner_pointed(mc));
return MDBX_ENODATA;
}
@ -766,7 +771,7 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
goto skip_check_samedata;
}
}
if (!(flags & MDBX_RESERVE) && unlikely(eq_fast(&current_data, data)))
if (!(flags & MDBX_RESERVE) && unlikely(cmp_lenfast(&current_data, data) == 0))
return MDBX_SUCCESS /* the same data, nothing to update */;
skip_check_samedata:;
}
@ -778,9 +783,8 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
rc = MDBX_NO_ROOT;
} else if ((flags & MDBX_CURRENT) == 0) {
bool exact = false;
MDBX_val old_data;
MDBX_val last_key, old_data;
if ((flags & MDBX_APPEND) && mc->tree->items > 0) {
MDBX_val last_key;
old_data.iov_base = nullptr;
old_data.iov_len = 0;
rc = (mc->flags & z_inner) ? inner_last(mc, &last_key) : outer_last(mc, &last_key, &old_data);
@ -798,53 +802,51 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
}
}
} else {
csr_t csr = cursor_seek(mc, (MDBX_val *)key, &old_data, MDBX_SET);
csr_t csr =
/* olddata may not be updated in the case of a DUPFIX-page of a dupfix-table */
cursor_seek(mc, (MDBX_val *)key, &old_data, MDBX_SET);
rc = csr.err;
exact = csr.exact;
}
if (exact) {
cASSERT(mc, rc == MDBX_SUCCESS);
if (unlikely(flags & MDBX_NOOVERWRITE)) {
DEBUG("duplicate key [%s]", DKEY_DEBUG(key));
*data = old_data;
return MDBX_KEYEXIST;
}
if (unlikely(mc->flags & z_inner)) {
/* nested subtree of DUPSORT-database with the same key, nothing to update */
cASSERT(mc, !"Should not happen since");
return (flags & MDBX_NODUPDATA) ? MDBX_KEYEXIST : MDBX_SUCCESS;
}
if (inner_pointed(mc)) {
if (unlikely(flags & MDBX_ALLDUPS)) {
rc = cursor_del(mc, MDBX_ALLDUPS);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (likely(rc == MDBX_SUCCESS)) {
if (exact) {
if (unlikely(flags & MDBX_NOOVERWRITE)) {
DEBUG("duplicate key [%s]", DKEY_DEBUG(key));
*data = old_data;
return MDBX_KEYEXIST;
}
if (unlikely(mc->flags & z_inner)) {
/* nested subtree of DUPSORT-database with the same key,
* nothing to update */
eASSERT(env, data->iov_len == 0 && (old_data.iov_len == 0 ||
/* olddata may not be updated in the case of a
DUPFIX-page of a dupfix-table */
(mc->tree->flags & MDBX_DUPFIXED)));
return MDBX_SUCCESS;
}
if (unlikely(flags & MDBX_ALLDUPS) && inner_pointed(mc)) {
err = cursor_del(mc, MDBX_ALLDUPS);
if (unlikely(err != MDBX_SUCCESS))
return err;
flags -= MDBX_ALLDUPS;
cASSERT(mc, mc->top + 1 == mc->tree->height);
rc = (mc->top >= 0) ? MDBX_NOTFOUND : MDBX_NO_ROOT;
} else if ((flags & (MDBX_RESERVE | MDBX_MULTIPLE)) == 0) {
old_data = *data;
csr_t csr = cursor_seek(&mc->subcur->cursor, &old_data, nullptr, MDBX_SET_RANGE);
if (unlikely(csr.exact)) {
cASSERT(mc, csr.err == MDBX_SUCCESS);
if (flags & MDBX_NODUPDATA)
return MDBX_KEYEXIST;
if (flags & MDBX_APPENDDUP)
return MDBX_EKEYMISMATCH;
exact = false;
} else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) {
/* checking for early exit without dirtying pages */
if (unlikely(eq_fast(data, &old_data))) {
cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0);
if (mc->subcur) {
if (flags & MDBX_NODUPDATA)
return MDBX_KEYEXIST;
if (flags & MDBX_APPENDDUP)
return MDBX_EKEYMISMATCH;
}
/* the same data, nothing to update */
return MDBX_SUCCESS;
} else if (csr.err != MDBX_SUCCESS && unlikely(csr.err != MDBX_NOTFOUND)) {
be_poor(mc);
return csr.err;
}
cASSERT(mc, mc->clc->v.cmp(data, &old_data) != 0);
}
} else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) {
if (unlikely(eq_fast(data, &old_data))) {
cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0);
/* the same data, nothing to update */
return (mc->subcur && (flags & MDBX_NODUPDATA)) ? MDBX_KEYEXIST : MDBX_SUCCESS;
}
cASSERT(mc, mc->clc->v.cmp(data, &old_data) != 0);
}
} else if (unlikely(rc != MDBX_NOTFOUND))
return rc;
@ -1050,7 +1052,6 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
return MDBX_EKEYMISMATCH;
} else if (eq_fast(data, &old_data)) {
cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0);
cASSERT(mc, !"Should not happen since" || batch_dupfix_done);
if (flags & MDBX_NODUPDATA)
return MDBX_KEYEXIST;
/* data is match exactly byte-to-byte, nothing to update */
@ -1726,7 +1727,6 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
csr_t ret;
ret.exact = false;
/* coverity[logical_vs_bitwise] */
if (unlikely(key->iov_len < mc->clc->k.lmin ||
(key->iov_len > mc->clc->k.lmax &&
(mc->clc->k.lmin == mc->clc->k.lmax || MDBX_DEBUG || MDBX_FORCE_ASSERTIONS)))) {
@ -1781,7 +1781,8 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
}
int cmp = mc->clc->k.cmp(&aligned_key, &nodekey);
if (unlikely(cmp == 0)) {
/* Probably happens rarely, but first node on the page was the one we wanted. */
/* Probably happens rarely, but first node on the page
* was the one we wanted. */
mc->ki[mc->top] = 0;
ret.exact = true;
goto got_node;
@ -1844,9 +1845,8 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
   * Therefore we switch the cursor into an unset state, but without resetting
   * top, which keeps the fastpath working for a subsequent search down the
   * page tree. */
mc->flags |= z_hollow;
if (inner_pointed(mc))
mc->subcur->cursor.flags |= z_hollow;
mc->flags = z_hollow | (mc->flags & z_clear_mask);
inner_gone(mc);
ret.err = MDBX_NOTFOUND;
return ret;
}

View File

@ -151,7 +151,7 @@ MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_hollow(const
cASSERT(mc, mc->top >= 0);
cASSERT(mc, (mc->flags & z_eof_hard) || mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
} else if (mc->subcur)
cASSERT(mc, is_poor(&mc->subcur->cursor) || (is_pointed(mc) && mc->subcur->cursor.flags < 0));
cASSERT(mc, is_poor(&mc->subcur->cursor));
return r;
}
@ -307,8 +307,8 @@ static inline int cursor_check_rw(const MDBX_cursor *mc) {
return cursor_check(mc, (MDBX_TXN_BLOCKED - MDBX_TXN_PARKED) | MDBX_TXN_RDONLY);
}
MDBX_INTERNAL MDBX_cursor *cursor_eot(MDBX_cursor *cursor, MDBX_txn *txn);
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *cursor, MDBX_txn *nested, const size_t dbi);
MDBX_INTERNAL MDBX_cursor *cursor_eot(MDBX_cursor *mc, MDBX_txn *txn, const bool merge);
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *mc, MDBX_txn *nested, const size_t dbi);
MDBX_INTERNAL MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc, MDBX_cursor *cdst);

View File

@ -87,12 +87,19 @@ __noinline int dbi_import(MDBX_txn *txn, const size_t dbi) {
if (parent) {
/* nested write transaction */
int rc = dbi_check(parent, dbi);
/* copy the dbi-handle state, clearing the new-flags. */
/* copy the table state, clearing the new-flags. */
eASSERT(env, txn->dbi_seqs == parent->dbi_seqs);
txn->dbi_state[dbi] = parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
if (likely(rc == MDBX_SUCCESS)) {
txn->dbs[dbi] = parent->dbs[dbi];
rc = txn_shadow_cursors(parent, dbi);
if (parent->cursors[dbi]) {
rc = cursor_shadow(parent->cursors[dbi], txn, dbi);
if (unlikely(rc != MDBX_SUCCESS)) {
/* failed to back up the cursors */
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE;
txn->flags |= MDBX_TXN_ERROR;
}
}
}
return rc;
}
@ -176,7 +183,7 @@ int dbi_defer_release(MDBX_env *const env, defer_free_item_t *const chain) {
}
/* Export or close DBI handles opened in this txn. */
int dbi_update(MDBX_txn *txn, bool keep) {
int dbi_update(MDBX_txn *txn, int keep) {
MDBX_env *const env = txn->env;
tASSERT(txn, !txn->parent && txn == env->basal_txn);
bool locked = false;
@ -216,7 +223,6 @@ int dbi_update(MDBX_txn *txn, bool keep) {
if (locked) {
size_t i = env->n_dbi;
eASSERT(env, env->n_dbi >= CORE_DBS);
while ((env->dbs_flags[i - 1] & DB_VALID) == 0) {
--i;
eASSERT(env, i >= CORE_DBS);

View File

@ -43,35 +43,30 @@ static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) {
return dbi_bitmap_ctz_fallback(txn, bmi);
}
static inline bool dbi_foreach_step(const MDBX_txn *const txn, size_t *bitmap_item, size_t *dbi) {
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
if (*bitmap_item & 1) {
*bitmap_item >>= 1;
return txn->dbi_state[*dbi] != 0;
}
if (*bitmap_item) {
size_t bitmap_skip = dbi_bitmap_ctz(txn, *bitmap_item);
*bitmap_item >>= bitmap_skip;
*dbi += bitmap_skip - 1;
} else {
*dbi = (*dbi - 1) | (bitmap_chunk - 1);
*bitmap_item = txn->dbi_sparse[(1 + *dbi) / bitmap_chunk];
if (*bitmap_item == 0)
*dbi += bitmap_chunk;
}
return false;
}
/* LY: the macro is deliberately written with a single loop to preserve the
 * ability to use the break statement */
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
for (size_t bitmap_item = TXN->dbi_sparse[0] >> FROM, I = FROM; I < TXN->n_dbi; ++I) \
if (dbi_foreach_step(TXN, &bitmap_item, &I))
for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->dbi_sparse[0]), bitmap_item = TXN->dbi_sparse[0] >> FROM, \
I = FROM; \
I < TXN->n_dbi; ++I) \
if (bitmap_item == 0) { \
I = (I - 1) | (bitmap_chunk - 1); \
bitmap_item = TXN->dbi_sparse[(1 + I) / bitmap_chunk]; \
if (!bitmap_item) \
/* coverity[const_overflow] */ \
I += bitmap_chunk; \
continue; \
} else if ((bitmap_item & 1) == 0) { \
size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \
bitmap_item >>= bitmap_skip; \
I += bitmap_skip - 1; \
continue; \
} else if (bitmap_item >>= 1, TXN->dbi_state[I])
#else
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
for (size_t I = FROM; I < TXN->n_dbi; ++I) \
#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \
for (size_t I = SKIP; I < TXN->n_dbi; ++I) \
if (TXN->dbi_state[I])
#endif /* MDBX_ENABLE_DBI_SPARSE */
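Editorial sketch (not part of the diff): the bit-skipping walk that TXN_FOREACH_DBI_FROM performs over the dbi_sparse bitmap, written as a plain loop; it assumes the GCC/Clang __builtin_ctzl builtin as the ctz primitive.
#include <stdio.h>
static void foreach_set_bit(const unsigned long *bitmap, size_t nwords) {
  const size_t word_bits = 8 * sizeof(unsigned long);
  for (size_t w = 0; w < nwords; ++w) {
    unsigned long item = bitmap[w];
    while (item) {
      unsigned bit = (unsigned)__builtin_ctzl(item); /* skip the zero run */
      printf("dbi %zu\n", w * word_bits + bit);
      item &= item - 1; /* clear the bit just visited */
    }
  }
}
int main(void) {
  const unsigned long bm[2] = {0x52ul /* bits 1,4,6 */, 0x1ul /* bit 0 */};
  foreach_set_bit(bm, 2);
  return 0; /* prints dbi 1, 4, 6, then 64 (with 64-bit long) */
}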
@ -87,7 +82,7 @@ struct dbi_snap_result {
};
MDBX_INTERNAL struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi);
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, bool keep);
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, int keep);
static inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) {
STATIC_ASSERT((int)DBI_DIRTY == MDBX_DBI_DIRTY && (int)DBI_STALE == MDBX_DBI_STALE &&

View File

@ -28,9 +28,9 @@ static inline size_t dpl_bytes2size(const ptrdiff_t bytes) {
}
void dpl_free(MDBX_txn *txn) {
if (likely(txn->wr.dirtylist)) {
osal_free(txn->wr.dirtylist);
txn->wr.dirtylist = nullptr;
if (likely(txn->tw.dirtylist)) {
osal_free(txn->tw.dirtylist);
txn->tw.dirtylist = nullptr;
}
}
@ -39,14 +39,14 @@ dpl_t *dpl_reserve(MDBX_txn *txn, size_t size) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
size_t bytes = dpl_size2bytes((size < PAGELIST_LIMIT) ? size : PAGELIST_LIMIT);
dpl_t *const dl = osal_realloc(txn->wr.dirtylist, bytes);
dpl_t *const dl = osal_realloc(txn->tw.dirtylist, bytes);
if (likely(dl)) {
#ifdef osal_malloc_usable_size
bytes = osal_malloc_usable_size(dl);
#endif /* osal_malloc_usable_size */
dl->detent = dpl_bytes2size(bytes);
tASSERT(txn, txn->wr.dirtylist == nullptr || dl->length <= dl->detent);
txn->wr.dirtylist = dl;
tASSERT(txn, txn->tw.dirtylist == nullptr || dl->length <= dl->detent);
txn->tw.dirtylist = dl;
}
return dl;
}
@ -57,17 +57,15 @@ int dpl_alloc(MDBX_txn *txn) {
const size_t wanna = (txn->env->options.dp_initial < txn->geo.upper) ? txn->env->options.dp_initial : txn->geo.upper;
#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG
if (txn->wr.dirtylist)
if (txn->tw.dirtylist)
/* zero these so the assert inside dpl_reserve() doesn't fire */
txn->wr.dirtylist->sorted = txn->wr.dirtylist->length = 0;
txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0;
#endif /* assertions enabled */
if (unlikely(!txn->wr.dirtylist || txn->wr.dirtylist->detent < wanna || txn->wr.dirtylist->detent > wanna + wanna) &&
if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || txn->tw.dirtylist->detent > wanna + wanna) &&
unlikely(!dpl_reserve(txn, wanna)))
return MDBX_ENOMEM;
/* LY: wr.dirtylist cannot be nullptr, since it is either already allocated or will be allocated in dpl_reserve(). */
/* coverity[var_deref_model] */
dpl_clear(txn->wr.dirtylist);
dpl_clear(txn->tw.dirtylist);
return MDBX_SUCCESS;
}
@ -81,7 +79,7 @@ __hot __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
const size_t unsorted = dl->length - dl->sorted;
if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || unlikely(!dp_radixsort(dl->items + 1, dl->length))) {
@ -135,7 +133,7 @@ __hot __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
if (AUDIT_ENABLED()) {
for (const dp_t *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
@ -177,7 +175,7 @@ __hot __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) {
const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const dpl_t *dl = txn->wr.dirtylist;
const dpl_t *dl = txn->tw.dirtylist;
if (dl) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
@ -200,7 +198,7 @@ void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
assert((intptr_t)i > 0 && i <= dl->length);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
dl->pages_including_loose -= npages;
@ -216,10 +214,10 @@ int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, siz
const dp_t dp = {page, pgno, (pgno_t)npages};
if ((txn->flags & MDBX_WRITEMAP) == 0) {
size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->wr.dirtylru;
*ptr = txn->tw.dirtylru;
}
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
tASSERT(txn, dl->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
if (AUDIT_ENABLED()) {
@ -315,7 +313,7 @@ int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, siz
__cold bool dpl_check(MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const dpl_t *const dl = txn->wr.dirtylist;
const dpl_t *const dl = txn->tw.dirtylist;
if (!dl) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
return true;
@ -324,7 +322,7 @@ __cold bool dpl_check(MDBX_txn *txn) {
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
tASSERT(txn,
txn->wr.dirtyroom + dl->length == (txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
txn->tw.dirtyroom + dl->length == (txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
if (!AUDIT_ENABLED())
return true;
@ -364,28 +362,28 @@ __cold bool dpl_check(MDBX_txn *txn) {
return false;
}
const size_t rpa = pnl_search(txn->wr.repnl, dp->pgno, txn->geo.first_unallocated);
tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->wr.repnl) || txn->wr.repnl[rpa] != dp->pgno);
if (rpa <= MDBX_PNL_GETSIZE(txn->wr.repnl) && unlikely(txn->wr.repnl[rpa] == dp->pgno))
const size_t rpa = pnl_search(txn->tw.repnl, dp->pgno, txn->geo.first_unallocated);
tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.repnl) || txn->tw.repnl[rpa] != dp->pgno);
if (rpa <= MDBX_PNL_GETSIZE(txn->tw.repnl) && unlikely(txn->tw.repnl[rpa] == dp->pgno))
return false;
if (num > 1) {
const size_t rpb = pnl_search(txn->wr.repnl, dp->pgno + num - 1, txn->geo.first_unallocated);
const size_t rpb = pnl_search(txn->tw.repnl, dp->pgno + num - 1, txn->geo.first_unallocated);
tASSERT(txn, rpa == rpb);
if (unlikely(rpa != rpb))
return false;
}
}
tASSERT(txn, loose == txn->wr.loose_count);
if (unlikely(loose != txn->wr.loose_count))
tASSERT(txn, loose == txn->tw.loose_count);
if (unlikely(loose != txn->tw.loose_count))
return false;
tASSERT(txn, pages == dl->pages_including_loose);
if (unlikely(pages != dl->pages_including_loose))
return false;
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->wr.retired_pages); ++i) {
const page_t *const dp = debug_dpl_find(txn, txn->wr.retired_pages[i]);
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) {
const page_t *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]);
tASSERT(txn, !dp);
if (unlikely(dp))
return false;
@ -397,11 +395,11 @@ __cold bool dpl_check(MDBX_txn *txn) {
/*----------------------------------------------------------------------------*/
__noinline void dpl_lru_reduce(MDBX_txn *txn) {
VERBOSE("lru-reduce %u -> %u", txn->wr.dirtylru, txn->wr.dirtylru >> 1);
NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1);
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
do {
txn->wr.dirtylru >>= 1;
dpl_t *dl = txn->wr.dirtylist;
txn->tw.dirtylru >>= 1;
dpl_t *dl = txn->tw.dirtylist;
for (size_t i = 1; i <= dl->length; ++i) {
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr >>= 1;
@ -413,7 +411,7 @@ __noinline void dpl_lru_reduce(MDBX_txn *txn) {
void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
if (MDBX_PNL_GETSIZE(pl) && txn->wr.dirtylist->length) {
if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) {
tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->geo.first_unallocated << spilled));
dpl_t *dl = dpl_sort(txn);
@ -468,9 +466,9 @@ void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
}
}
dl->sorted = dpl_setlen(dl, w - 1);
txn->wr.dirtyroom += r - w;
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
txn->tw.dirtyroom += r - w;
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
return;
}
}
@ -479,7 +477,7 @@ void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
void dpl_release_shadows(MDBX_txn *txn) {
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
MDBX_env *env = txn->env;
dpl_t *const dl = txn->wr.dirtylist;
dpl_t *const dl = txn->tw.dirtylist;
for (size_t i = 1; i <= dl->length; i++)
page_shadow_release(env, dl->items[i].ptr, dpl_npages(dl, i));

View File

@ -46,14 +46,14 @@ static inline dpl_t *dpl_sort(const MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
tASSERT(txn, dl->length <= PAGELIST_LIMIT);
tASSERT(txn, dl->sorted <= dl->length);
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn);
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno);
@ -68,11 +68,11 @@ MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl, siz
return dpl_npages(dl, i) + dl->items[i].pgno;
}
MDBX_NOTHROW_PURE_FUNCTION static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
tASSERT(txn, dl->sorted == dl->length);
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
size_t const n = dpl_search(txn, pgno);
@ -96,7 +96,7 @@ MDBX_NOTHROW_PURE_FUNCTION static inline bool dpl_intersect(const MDBX_txn *txn,
MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn, pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
dpl_t *dl = txn->tw.dirtylist;
size_t i = dpl_search(txn, pgno);
tASSERT(txn, (int)i > 0);
return (dl->items[i].pgno == pgno) ? i : 0;
@ -105,7 +105,7 @@ MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn, p
MDBX_INTERNAL void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages);
static inline void dpl_remove(const MDBX_txn *txn, size_t i) {
dpl_remove_ex(txn, i, dpl_npages(txn->wr.dirtylist, i));
dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i));
}
MDBX_INTERNAL int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, size_t npages);
@ -114,19 +114,19 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL bool dpl_check(MDBX_txn *txn);
MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) {
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
const dpl_t *dl = txn->wr.dirtylist;
const dpl_t *dl = txn->tw.dirtylist;
assert((intptr_t)i > 0 && i <= dl->length);
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
return txn->wr.dirtylru - (uint32_t)*ptr;
return txn->tw.dirtylru - (uint32_t)*ptr;
}
MDBX_INTERNAL void dpl_lru_reduce(MDBX_txn *txn);
static inline uint32_t dpl_lru_turn(MDBX_txn *txn) {
txn->wr.dirtylru += 1;
if (unlikely(txn->wr.dirtylru > UINT32_MAX / 3) && (txn->flags & MDBX_WRITEMAP) == 0)
txn->tw.dirtylru += 1;
if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && (txn->flags & MDBX_WRITEMAP) == 0)
dpl_lru_reduce(txn);
return txn->wr.dirtylru;
return txn->tw.dirtylru;
}
MDBX_INTERNAL void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled);
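Editorial sketch (not part of the diff): the aging scheme behind dpl_lru_turn()/dpl_age()/dpl_lru_reduce() above. Each dirty page stores the LRU counter value at its last touch; age is the distance to the current counter, and halving the counter together with all stamps preserves relative ages when the counter nears the UINT32_MAX/3 threshold. All names here are illustrative.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
static uint32_t lru; /* stands in for txn->wr.dirtylru */
static void touch(uint32_t *stamp) { *stamp = ++lru; }
static uint32_t age(uint32_t stamp) { return lru - stamp; }
static void reduce(uint32_t *stamps, size_t n) {
  lru >>= 1; /* dpl_lru_reduce() halves the counter... */
  for (size_t i = 0; i < n; ++i)
    stamps[i] >>= 1; /* ...and every per-page stamp alike */
}
int main(void) {
  uint32_t stamps[2];
  touch(&stamps[0]);
  touch(&stamps[1]);
  printf("ages: %u %u\n", age(stamps[0]), age(stamps[1])); /* 1 0 */
  reduce(stamps, 2);
  printf("ages: %u %u\n", age(stamps[0]), age(stamps[1])); /* still 1 0 */
  return 0;
}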

View File

@ -370,7 +370,7 @@ void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) {
return;
} else if (env_owned_wrtxn(env)) {
/* inside write-txn */
last = meta_recent(env, &env->basal_txn->wr.troika).ptr_v->geometry.first_unallocated;
last = meta_recent(env, &env->basal_txn->tw.troika).ptr_v->geometry.first_unallocated;
} else if (env->flags & MDBX_RDONLY) {
/* read-only mode, no write-txn, no wlock mutex */
last = NUM_METAS;
@ -1061,17 +1061,16 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
#endif /* MADV_DONTNEED || POSIX_MADV_DONTNEED */
/* LY: check conditions to shrink datafile */
const pgno_t stockpile_gap = 3 + pending->trees.gc.height * 3;
const pgno_t backlog_gap = 3 + pending->trees.gc.height * 3;
pgno_t shrink_step = 0;
if (pending->geometry.shrink_pv && pending->geometry.now - pending->geometry.first_unallocated >
(shrink_step = pv2pages(pending->geometry.shrink_pv)) + stockpile_gap) {
if (pending->geometry.now > largest_pgno &&
pending->geometry.now - largest_pgno > shrink_step + stockpile_gap) {
(shrink_step = pv2pages(pending->geometry.shrink_pv)) + backlog_gap) {
if (pending->geometry.now > largest_pgno && pending->geometry.now - largest_pgno > shrink_step + backlog_gap) {
const pgno_t aligner =
pending->geometry.grow_pv ? /* grow_step */ pv2pages(pending->geometry.grow_pv) : shrink_step;
const pgno_t with_stockpile_gap = largest_pgno + stockpile_gap;
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
const pgno_t aligned =
pgno_align2os_pgno(env, (size_t)with_stockpile_gap + aligner - with_stockpile_gap % aligner);
pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - with_backlog_gap % aligner);
const pgno_t bottom = (aligned > pending->geometry.lower) ? aligned : pending->geometry.lower;
if (pending->geometry.now > bottom) {
if (TROIKA_HAVE_STEADY(troika))
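Editorial sketch (not part of the diff): the shrink-target arithmetic from the hunk above, with illustrative numbers; the pgno_align2os_pgno() step is omitted and all values are made up.
#include <stdio.h>
int main(void) {
  const unsigned largest_pgno = 1000; /* highest page still in use */
  const unsigned gap = 3 + 2 * 3;     /* 3 + gc-tree-height(2) * 3 = 9 */
  const unsigned aligner = 128;       /* grow/shrink step in pages */
  const unsigned with_gap = largest_pgno + gap; /* 1009 */
  /* round up to the next multiple of the aligner, as the code above does */
  const unsigned aligned = with_gap + aligner - with_gap % aligner;
  printf("shrink datafile down to %u pages\n", aligned); /* 1024 */
  return 0;
}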
@ -1291,7 +1290,6 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
}
uint64_t timestamp = 0;
/* coverity[array_null] */
while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") {
rc = coherency_check_written(env, pending->unsafe_txnid, target,
bytes2pgno(env, ptr_dist(target, env->dxb_mmap.base)), &timestamp);
@ -1308,8 +1306,8 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
*troika = meta_tap(env);
for (MDBX_txn *txn = env->basal_txn; txn; txn = txn->nested)
if (troika != &txn->wr.troika)
txn->wr.troika = *troika;
if (troika != &txn->tw.troika)
txn->tw.troika = *troika;
/* LY: shrink datafile if needed */
if (unlikely(shrink)) {

View File

@ -76,7 +76,7 @@ retry:;
goto bailout;
}
const troika_t troika = (txn_owned || should_unlock) ? env->basal_txn->wr.troika : meta_tap(env);
const troika_t troika = (txn_owned || should_unlock) ? env->basal_txn->tw.troika : meta_tap(env);
const meta_ptr_t head = meta_recent(env, &troika);
const uint64_t unsynced_pages = atomic_load64(&env->lck->unsynced_pages, mo_Relaxed);
if (unsynced_pages == 0) {
@ -158,13 +158,13 @@ retry:;
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.wops.weak += wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
env->basal_txn->wr.troika = meta_tap(env);
env->basal_txn->tw.troika = meta_tap(env);
eASSERT(env, !env->txn && !env->basal_txn->nested);
goto retry;
}
eASSERT(env, head.txnid == recent_committed_txnid(env));
env->basal_txn->txnid = head.txnid;
txn_gc_detent(env->basal_txn);
txn_snapshot_oldest(env->basal_txn);
flags |= txn_shrink_allowed;
}
@ -182,7 +182,7 @@ retry:;
DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, data_page(head.ptr_c)->pgno,
durable_caption(head.ptr_c), unsynced_pages);
meta_t meta = *head.ptr_c;
rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->wr.troika);
rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->tw.troika);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
@ -524,7 +524,7 @@ __cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
env->defer_free = nullptr;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
if ((env->flags & MDBX_RDONLY) == 0)
if (!(env->flags & MDBX_RDONLY))
osal_ioring_destroy(&env->ioring);
env->lck = nullptr;
@ -593,7 +593,12 @@ __cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
env->pathname.buffer = nullptr;
}
if (env->basal_txn) {
txn_basal_destroy(env->basal_txn);
dpl_free(env->basal_txn);
txl_free(env->basal_txn->tw.gc.retxl);
pnl_free(env->basal_txn->tw.retired_pages);
pnl_free(env->basal_txn->tw.spilled.list);
pnl_free(env->basal_txn->tw.repnl);
osal_free(env->basal_txn);
env->basal_txn = nullptr;
}
}

View File

@ -30,10 +30,8 @@ typedef struct iov_ctx iov_ctx_t;
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#define MDBX_WORDBITS_LN2 6
#else
#define MDBX_WORDBITS 32
#define MDBX_WORDBITS_LN2 5
#endif /* MDBX_WORDBITS */
#include "options.h"

View File

@ -570,11 +570,14 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t s
/*----------------------------------------------------------------------------*/
static inline bool is_reclaimable(MDBX_txn *txn, const MDBX_cursor *mc, const uint8_t flags) {
#define ALLOC_COALESCE 4 /* internal state */
#define ALLOC_SHOULD_SCAN 8 /* internal state */
#define ALLOC_LIFO 16 /* internal state */
static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, const uint8_t flags) {
/* If txn is updating the GC, then the retired-list cannot play catch-up with
* itself by growing while trying to save it. */
STATIC_ASSERT(ALLOC_RESERVE == z_gcu_preparation);
if (mc->tree == &txn->dbs[FREE_DBI] && !((flags | mc->flags) & z_gcu_preparation))
if (mc->tree == &txn->dbs[FREE_DBI] && !(flags & ALLOC_RESERVE) && !(mc->flags & z_gcu_preparation))
return false;
/* avoid search inside empty tree and while tree is updating,
@ -587,10 +590,12 @@ static inline bool is_reclaimable(MDBX_txn *txn, const MDBX_cursor *mc, const ui
return true;
}
static inline bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { return txl_contain(txn->tw.gc.retxl, id); }
__hot static pgno_t repnl_get_single(MDBX_txn *txn) {
const size_t len = MDBX_PNL_GETSIZE(txn->wr.repnl);
const size_t len = MDBX_PNL_GETSIZE(txn->tw.repnl);
assert(len > 0);
pgno_t *target = MDBX_PNL_EDGE(txn->wr.repnl);
pgno_t *target = MDBX_PNL_EDGE(txn->tw.repnl);
const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 1 : -1;
/* There are THREE potentially winning, but mutually opposed tactics:
@ -658,7 +663,7 @@ __hot static pgno_t repnl_get_single(MDBX_txn *txn) {
#else
/* cut out the element, shifting the tail */
const pgno_t pgno = *scan;
MDBX_PNL_SETSIZE(txn->wr.repnl, len - 1);
MDBX_PNL_SETSIZE(txn->tw.repnl, len - 1);
while (++scan <= target)
scan[-1] = *scan;
return pgno;
@ -671,44 +676,44 @@ __hot static pgno_t repnl_get_single(MDBX_txn *txn) {
const pgno_t pgno = *target;
#if MDBX_PNL_ASCENDING
/* cut out the element, shifting the tail */
MDBX_PNL_SETSIZE(txn->wr.repnl, len - 1);
for (const pgno_t *const end = txn->wr.repnl + len - 1; target <= end; ++target)
MDBX_PNL_SETSIZE(txn->tw.repnl, len - 1);
for (const pgno_t *const end = txn->tw.repnl + len - 1; target <= end; ++target)
*target = target[1];
#else
/* no need to move the tail, just truncate the list */
MDBX_PNL_SETSIZE(txn->wr.repnl, len - 1);
MDBX_PNL_SETSIZE(txn->tw.repnl, len - 1);
#endif
return pgno;
}
__hot static pgno_t repnl_get_sequence(MDBX_txn *txn, const size_t num, uint8_t flags) {
const size_t len = MDBX_PNL_GETSIZE(txn->wr.repnl);
pgno_t *edge = MDBX_PNL_EDGE(txn->wr.repnl);
const size_t len = MDBX_PNL_GETSIZE(txn->tw.repnl);
pgno_t *edge = MDBX_PNL_EDGE(txn->tw.repnl);
assert(len >= num && num > 1);
const size_t seq = num - 1;
#if !MDBX_PNL_ASCENDING
if (edge[-(ptrdiff_t)seq] - *edge == seq) {
if (unlikely(flags & ALLOC_RESERVE))
return P_INVALID;
assert(edge == scan4range_checker(txn->wr.repnl, seq));
assert(edge == scan4range_checker(txn->tw.repnl, seq));
/* no need to move the tail, just truncate the list */
MDBX_PNL_SETSIZE(txn->wr.repnl, len - num);
MDBX_PNL_SETSIZE(txn->tw.repnl, len - num);
return *edge;
}
#endif
pgno_t *target = scan4seq_impl(edge, len, seq);
assert(target == scan4range_checker(txn->wr.repnl, seq));
assert(target == scan4range_checker(txn->tw.repnl, seq));
if (target) {
if (unlikely(flags & ALLOC_RESERVE))
return P_INVALID;
const pgno_t pgno = *target;
/* cut out the found sequence, shifting the tail */
MDBX_PNL_SETSIZE(txn->wr.repnl, len - num);
MDBX_PNL_SETSIZE(txn->tw.repnl, len - num);
#if MDBX_PNL_ASCENDING
for (const pgno_t *const end = txn->wr.repnl + len - num; target <= end; ++target)
for (const pgno_t *const end = txn->tw.repnl + len - num; target <= end; ++target)
*target = target[num];
#else
for (const pgno_t *const end = txn->wr.repnl + len; ++target <= end;)
for (const pgno_t *const end = txn->tw.repnl + len; ++target <= end;)
target[-(ptrdiff_t)num] = *target;
#endif
return pgno;
@ -716,10 +721,6 @@ __hot static pgno_t repnl_get_sequence(MDBX_txn *txn, const size_t num, uint8_t
return 0;
}
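Editorial sketch (not part of the diff): cutting a run of num consecutive page numbers out of a sorted list by shifting the tail, as repnl_get_sequence() does above. This toy uses a descending list, matching the !MDBX_PNL_ASCENDING branch; which end of the run is reported is illustrative only.
#include <stdio.h>
static unsigned cut_run(unsigned *list, size_t *len, size_t at, size_t num) {
  const unsigned pgno = list[at + num - 1]; /* lowest page of the run */
  for (size_t i = at + num; i < *len; ++i)
    list[i - num] = list[i]; /* shift the tail over the cut */
  *len -= num;
  return pgno;
}
int main(void) {
  unsigned repnl[] = {9, 8, 7, 5, 4, 3, 1};
  size_t len = 7;
  /* pages 5,4,3 form a run of three at indices 3..5 */
  const unsigned pgno = cut_run(repnl, &len, 3, 3);
  printf("run starts at pgno %u, %zu left:", pgno, len);
  for (size_t i = 0; i < len; ++i)
    printf(" %u", repnl[i]);
  printf("\n"); /* run starts at pgno 3, 4 left: 9 8 7 1 */
  return 0;
}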
bool gc_repnl_has_span(const MDBX_txn *txn, const size_t num) {
return (num > 1) ? repnl_get_sequence((MDBX_txn *)txn, num, ALLOC_RESERVE) != 0 : !MDBX_PNL_IS_EMPTY(txn->wr.repnl);
}
static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn, const MDBX_cursor *const mc,
const pgno_t pgno, const size_t num) {
#if MDBX_ENABLE_PROFGC
@ -761,7 +762,7 @@ static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn
   * the PTE would be updated, followed by a page-fault and a read of the data
   * from the dirty I/O queue. Because of this, the penalty for the extra write
   * can be comparable to the unneeded read being avoided. */
if (txn->wr.prefault_write_activated) {
if (txn->tw.prefault_write_activated) {
void *const pattern = ptr_disp(env->page_auxbuf, need_clean ? env->ps : env->ps * 2);
size_t file_offset = pgno2bytes(env, pgno);
if (likely(num == 1)) {
@ -822,7 +823,7 @@ static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn
ret.err = page_dirty(txn, ret.page, (pgno_t)num);
bailout:
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
#if MDBX_ENABLE_PROFGC
size_t majflt_after;
prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before;
@ -841,15 +842,8 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
prof->spe_counter += 1;
#endif /* MDBX_ENABLE_PROFGC */
/* If the ALLOC_RESERVE flag is raised, then only the corresponding reserve in txn->wr.repnl
 * and/or txn->wr.gc.reclaimed is required, without allocating and returning a page. Three call variants are possible:
 * 1. num == 0: a slot is needed for returning to the GC the leftovers of previously recycled/extracted pages;
 * recycling long records makes no sense here, since it would not reduce the deficit of free ids/slots;
 * 2. num == 1: the reserve needs to be increased before updating the GC;
 * 3. num > 1: a sequence of pages is needed for storing retired pages
 * when MDBX_ENABLE_BIGFOOT is disabled. */
eASSERT(env, num > 0 || (flags & ALLOC_RESERVE));
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
size_t newnext;
const uint64_t monotime_begin = (MDBX_ENABLE_PROFGC || (num > 1 && env->options.gc_time_limit)) ? osal_monotime() : 0;
@ -864,20 +858,21 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
#if MDBX_ENABLE_PROFGC
prof->xpages += 1;
#endif /* MDBX_ENABLE_PROFGC */
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
if (MDBX_PNL_GETSIZE(txn->tw.repnl) >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->tw.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->tw.repnl) < txn->geo.first_unallocated);
pgno = repnl_get_sequence(txn, num, flags);
if (likely(pgno))
goto done;
}
} else {
eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || (flags & ALLOC_RESERVE));
eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.repnl) == 0);
eASSERT(env, !(flags & ALLOC_RESERVE) || num == 0);
}
//---------------------------------------------------------------------------
if (unlikely(!is_reclaimable(txn, mc, flags))) {
if (unlikely(!is_gc_usable(txn, mc, flags))) {
eASSERT(env, (txn->flags & txn_gc_drained) || num > 1);
goto no_gc;
}
@ -885,19 +880,22 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
eASSERT(env, (flags & (ALLOC_COALESCE | ALLOC_LIFO | ALLOC_SHOULD_SCAN)) == 0);
flags += (env->flags & MDBX_LIFORECLAIM) ? ALLOC_LIFO : 0;
/* Do not coalesce records when a slot is requested for returning pages to the GC. Otherwise an attempt to grow
 * the reserve may require an even larger reserve due to the growth of the recycled-pages list. */
if (num > 0 && txn->dbs[FREE_DBI].branch_pages && MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2)
flags += ALLOC_COALESCE;
if (/* Do not coalesce records while preparing the reserve for the GC update.
     * Otherwise an attempt to grow the reserve may require an even
     * larger reserve due to the growth of the recycled-pages list. */
(flags & ALLOC_RESERVE) == 0) {
if (txn->dbs[FREE_DBI].branch_pages && MDBX_PNL_GETSIZE(txn->tw.repnl) < env->maxgc_large1page / 2)
flags += ALLOC_COALESCE;
}
MDBX_cursor *const gc = txn_gc_cursor(txn);
MDBX_cursor *const gc = ptr_disp(env->basal_txn, sizeof(MDBX_txn));
eASSERT(env, mc != gc && gc->next == gc);
gc->txn = txn;
gc->dbi_state = txn->dbi_state;
gc->top_and_flags = z_fresh_mark;
txn->wr.prefault_write_activated = !env->incore && env->options.prefault_write;
if (txn->wr.prefault_write_activated) {
txn->tw.prefault_write_activated = env->options.prefault_write;
if (txn->tw.prefault_write_activated) {
/* Checking via mincore() substantially reduces the cost, but in the
 * simplest cases (a trivial benchmark) the overall performance drops by
 * half. And on platforms without mincore() and with problematic
@ -910,41 +908,48 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
(txn->dbs[FREE_DBI].branch_pages == 0 && txn->geo.now < 1234) ||
/* Don't bother if the page is within the enabled readahead window */
(readahead_enabled && pgno + num < readahead_edge))
txn->wr.prefault_write_activated = false;
txn->tw.prefault_write_activated = false;
}
retry_gc_refresh_detent:
txn_gc_detent(txn);
retry_gc_have_detent:
if (unlikely(txn->env->gc.detent >= txn->txnid)) {
FATAL("unexpected/invalid gc-detent %" PRIaTXN " for current-txnid %" PRIaTXN, txn->env->gc.detent, txn->txnid);
retry_gc_refresh_oldest:;
txnid_t oldest = txn_snapshot_oldest(txn);
retry_gc_have_oldest:
if (unlikely(oldest >= txn->txnid)) {
ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN " for current-txnid %" PRIaTXN, oldest, txn->txnid);
ret.err = MDBX_PROBLEM;
goto fail;
}
const txnid_t detent = oldest + 1;
txnid_t id = 0;
MDBX_cursor_op op = MDBX_FIRST;
if (flags & ALLOC_LIFO) {
/* Begin lookup backward from oldest reader */
id = txn->env->gc.detent;
op = MDBX_SET_RANGE;
} else {
/* Continue lookup forward from last-reclaimed */
id = rkl_highest(&txn->wr.gc.reclaimed);
if (id) {
id += 1;
op = MDBX_SET_RANGE;
if (id >= txn->env->gc.detent)
goto depleted_gc;
if (!txn->tw.gc.retxl) {
txn->tw.gc.retxl = txl_alloc();
if (unlikely(!txn->tw.gc.retxl)) {
ret.err = MDBX_ENOMEM;
goto fail;
}
}
/* Begin lookup backward from oldest reader */
id = detent - 1;
op = MDBX_SET_RANGE;
} else if (txn->tw.gc.last_reclaimed) {
/* Continue lookup forward from last-reclaimed */
id = txn->tw.gc.last_reclaimed + 1;
if (id >= detent)
goto depleted_gc;
op = MDBX_SET_RANGE;
}
next_gc:
next_gc:;
MDBX_val key;
key.iov_base = &id;
key.iov_len = sizeof(id);
#if MDBX_ENABLE_PROFGC
prof->rsteps += 1
prof->rsteps += 1;
#endif /* MDBX_ENABLE_PROFGC */
;
MDBX_val key = {.iov_base = &id, .iov_len = sizeof(id)};
/* Seek first/next GC record */
ret.err = cursor_ops(gc, &key, nullptr, op);
@ -962,18 +967,15 @@ next_gc:
ret.err = MDBX_CORRUPTED;
goto fail;
}
id = unaligned_peek_u64(4, key.iov_base);
if (flags & ALLOC_LIFO) {
op = MDBX_PREV;
if (id >= txn->env->gc.detent || gc_is_reclaimed(txn, id))
if (id >= detent || is_already_reclaimed(txn, id))
goto next_gc;
} else {
if (unlikely(id >= txn->env->gc.detent))
goto depleted_gc;
op = MDBX_NEXT;
if (gc_is_reclaimed(txn, id))
goto next_gc;
if (unlikely(id >= detent))
goto depleted_gc;
}
txn->flags &= ~txn_gc_drained;
@ -992,75 +994,59 @@ next_gc:
}
const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl);
TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl));
TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl));
if (unlikely(!num)) {
/* TODO: check the criteria of item 2 as formulated in gc_provide_slots().
 * For now the check here is greatly simplified and not entirely correct, since information about the number of
 * available slots and their deficit for returning wr.repl is not yet available. */
if (gc_len > env->maxgc_large1page / 4 * 3
/* if the record is long enough, then recycling the slot will not noticeably increase the room for returning wr.repl, etc. */
&& MDBX_PNL_GETSIZE(txn->wr.repnl) + gc_len > env->maxgc_large1page /* does not fit into the tail */) {
DEBUG("avoid reclaiming %" PRIaTXN " slot, since it is too long (%zu)", id, gc_len);
ret.err = MDBX_NOTFOUND;
goto reserve_done;
}
}
if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) /* Don't try to coalesce too much. */ >=
env->maxgc_large1page)) {
if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl) >= env->maxgc_large1page)) {
/* Don't try to coalesce too much. */
if (flags & ALLOC_SHOULD_SCAN) {
eASSERT(env, (flags & ALLOC_COALESCE) /* && !(flags & ALLOC_RESERVE) */ && num > 0);
eASSERT(env, flags & ALLOC_COALESCE);
eASSERT(env, !(flags & ALLOC_RESERVE));
eASSERT(env, num > 0);
#if MDBX_ENABLE_PROFGC
env->lck->pgops.gc_prof.coalescences += 1;
#endif /* MDBX_ENABLE_PROFGC */
TRACE("clear %s %s", "ALLOC_COALESCE", "since got threshold");
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
if (MDBX_PNL_GETSIZE(txn->tw.repnl) >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->tw.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->tw.repnl) < txn->geo.first_unallocated);
if (likely(num == 1)) {
pgno = (flags & ALLOC_RESERVE) ? P_INVALID : repnl_get_single(txn);
pgno = repnl_get_single(txn);
goto done;
}
pgno = repnl_get_sequence(txn, num, flags);
if (likely(pgno))
goto done;
}
flags -= ALLOC_COALESCE | ALLOC_SHOULD_SCAN;
}
flags &= ~(ALLOC_COALESCE | ALLOC_SHOULD_SCAN);
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(txn->wr.repnl) >= env->options.rp_augment_limit) &&
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(txn->tw.repnl) >= env->options.rp_augment_limit) &&
((/* not a slot-request from gc-update */ num &&
/* have enough unallocated space */ txn->geo.upper >= txn->geo.first_unallocated + num &&
monotime_since_cached(monotime_begin, &now_cache) + txn->wr.gc.spent >= env->options.gc_time_limit) ||
gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) >= PAGELIST_LIMIT)) {
monotime_since_cached(monotime_begin, &now_cache) + txn->tw.gc.time_acc >= env->options.gc_time_limit) ||
gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl) >= PAGELIST_LIMIT)) {
/* Stop reclaiming to avoid growing/overflowing the page list. This is a rare
* case while searching for a contiguous multi-page region in a large database,
* see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */
NOTICE("stop reclaiming %s: %zu (current) + %zu "
"(chunk) >= %zu, rp_augment_limit %u",
likely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) < PAGELIST_LIMIT) ? "since rp_augment_limit was reached"
"(chunk) -> %zu, rp_augment_limit %u",
likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl) < PAGELIST_LIMIT) ? "since rp_augment_limit was reached"
: "to avoid PNL overflow",
MDBX_PNL_GETSIZE(txn->wr.repnl), gc_len, gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl),
MDBX_PNL_GETSIZE(txn->tw.repnl), gc_len, gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl),
env->options.rp_augment_limit);
goto depleted_gc;
}
}
/* Remember the ID of the GC record just read */
ret.err = rkl_push(&txn->wr.gc.reclaimed, id,
false /* Instead of false, one could pass/use (flags & ALLOC_LIFO) == 0 here; then
* holes/gaps in the GC identifiers would form contiguous intervals in wr.gc.reclaimed,
* which would provide more free identifiers/slots for returning pages. However, this
* would also lead to empty attempts to delete missing records in gc_clear_reclaimed(),
* and then to shuffling those contiguous intervals element-by-element into ready4reuse.
* So there is decidedly no point in it. Contiguous intervals should either be formed
* while gc_clear_reclaimed() runs, especially in FIFO mode, or be looked for only in gc_provide_ids() */);
TRACE("%" PRIaTXN " len %zu pushed to txn-rkl, err %d", id, gc_len, ret.err);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
txn->tw.gc.last_reclaimed = id;
if (flags & ALLOC_LIFO) {
ret.err = txl_append(&txn->tw.gc.retxl, id);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
}
/* Append PNL from GC record to wr.repnl */
ret.err = pnl_need(&txn->wr.repnl, gc_len);
/* Append PNL from GC record to tw.repnl */
ret.err = pnl_need(&txn->tw.repnl, gc_len);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
@ -1075,56 +1061,53 @@ next_gc:
#if MDBX_ENABLE_PROFGC
const uint64_t merge_begin = osal_monotime();
#endif /* MDBX_ENABLE_PROFGC */
pnl_merge(txn->wr.repnl, gc_pnl);
pnl_merge(txn->tw.repnl, gc_pnl);
#if MDBX_ENABLE_PROFGC
prof->pnl_merge.calls += 1;
prof->pnl_merge.volume += MDBX_PNL_GETSIZE(txn->wr.repnl);
prof->pnl_merge.volume += MDBX_PNL_GETSIZE(txn->tw.repnl);
prof->pnl_merge.time += osal_monotime() - merge_begin;
#endif /* MDBX_ENABLE_PROFGC */
flags |= ALLOC_SHOULD_SCAN;
if (AUDIT_ENABLED()) {
if (unlikely(!pnl_check(txn->wr.repnl, txn->geo.first_unallocated))) {
if (unlikely(!pnl_check(txn->tw.repnl, txn->geo.first_unallocated))) {
ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid txn retired-list");
ret.err = MDBX_CORRUPTED;
goto fail;
}
} else {
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated));
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated));
}
eASSERT(env, dpl_check(txn));
eASSERT(env, MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || MDBX_PNL_MOST(txn->wr.repnl) < txn->geo.first_unallocated);
if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->wr.repnl) &&
unlikely(MDBX_PNL_MOST(txn->wr.repnl) == txn->geo.first_unallocated - 1)) {
eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.repnl) == 0 || MDBX_PNL_MOST(txn->tw.repnl) < txn->geo.first_unallocated);
if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.repnl) &&
unlikely(MDBX_PNL_MOST(txn->tw.repnl) == txn->geo.first_unallocated - 1)) {
/* Refund suitable pages into "unallocated" space */
txn_refund(txn);
}
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
/* TODO: deletion of the records loaded from GC */
/* Done for a kick-reclaim mode, actually no page needed */
if (unlikely(num == 0)) {
eASSERT(env, ret.err == MDBX_SUCCESS);
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
goto early_exit;
}
/* TODO: delete reclaimed records */
eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT);
if (flags & ALLOC_COALESCE) {
if (MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2) {
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
goto next_gc;
}
flags -= ALLOC_COALESCE;
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
goto next_gc;
}
scan:
if ((flags & ALLOC_RESERVE) && num < 2) {
/* If only a slot/id was needed for gc_reclaim_slot() or gc_reserve4stockpile() */
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "reserve-done", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
ret.err = MDBX_SUCCESS;
goto reserve_done;
}
eASSERT(env, flags & ALLOC_SHOULD_SCAN);
eASSERT(env, num > 0);
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
if (MDBX_PNL_GETSIZE(txn->tw.repnl) >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->tw.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->tw.repnl) < txn->geo.first_unallocated);
if (likely(num == 1)) {
eASSERT(env, !(flags & ALLOC_RESERVE));
pgno = repnl_get_single(txn);
@ -1135,16 +1118,17 @@ scan:
goto done;
}
flags -= ALLOC_SHOULD_SCAN;
if ((txn->flags & txn_gc_drained) == 0) {
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
if (ret.err == MDBX_SUCCESS) {
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
goto next_gc;
}
depleted_gc:
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
txn->flags |= txn_gc_drained;
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
ret.err = MDBX_NOTFOUND;
if (flags & ALLOC_SHOULD_SCAN)
goto scan;
txn->flags |= txn_gc_drained;
//-------------------------------------------------------------------------
@ -1159,11 +1143,11 @@ depleted_gc:
newnext = txn->geo.first_unallocated + num;
/* Did reclaiming stop at the last steady point? */
const meta_ptr_t recent = meta_recent(env, &txn->wr.troika);
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->wr.troika);
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && txn->env->gc.detent == prefer_steady.txnid) {
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s", recent.txnid, durable_caption(recent.ptr_c),
prefer_steady.txnid, durable_caption(prefer_steady.ptr_c));
const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && detent == prefer_steady.txnid + 1) {
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s, detent %" PRIaTXN, recent.txnid,
durable_caption(recent.ptr_c), prefer_steady.txnid, durable_caption(prefer_steady.ptr_c), detent);
const pgno_t autosync_threshold = atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
const uint64_t autosync_period = atomic_load64(&env->lck->autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
@ -1182,12 +1166,12 @@ depleted_gc:
#if MDBX_ENABLE_PROFGC
env->lck->pgops.gc_prof.wipes += 1;
#endif /* MDBX_ENABLE_PROFGC */
ret.err = meta_wipe_steady(env, txn->env->gc.detent);
ret.err = meta_wipe_steady(env, detent);
DEBUG("gc-wipe-steady, rc %d", ret.err);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->wr.troika).ptr_c);
goto retry_gc_refresh_detent;
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->tw.troika).ptr_c);
goto retry_gc_refresh_oldest;
}
if ((autosync_threshold && atomic_load64(&env->lck->unsynced_pages, mo_Relaxed) >= autosync_threshold) ||
(autosync_period && (eoos_timestamp = atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
@ -1199,18 +1183,21 @@ depleted_gc:
env->lck->pgops.gc_prof.flushes += 1;
#endif /* MDBX_ENABLE_PROFGC */
meta_t meta = *recent.ptr_c;
ret.err = dxb_sync_locked(env, env->flags & MDBX_WRITEMAP, &meta, &txn->wr.troika);
ret.err = dxb_sync_locked(env, env->flags & MDBX_WRITEMAP, &meta, &txn->tw.troika);
DEBUG("gc-make-steady, rc %d", ret.err);
eASSERT(env, ret.err != MDBX_RESULT_TRUE);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->wr.troika).ptr_c);
goto retry_gc_refresh_detent;
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->tw.troika).ptr_c);
goto retry_gc_refresh_oldest;
}
}
if (unlikely(true == atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease)) && txn_gc_detent(txn))
goto retry_gc_have_detent;
if (unlikely(true == atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease))) {
oldest = txn_snapshot_oldest(txn);
if (oldest >= detent)
goto retry_gc_have_oldest;
}
/* Avoid kicking lagging reader(s) if there is enough unallocated space
* at the end of the database file. */
@ -1219,8 +1206,11 @@ depleted_gc:
goto done;
}
if (txn->txnid - txn->env->gc.detent > xMDBX_TXNID_STEP && mvcc_kick_laggards(env, txn->env->gc.detent))
goto retry_gc_refresh_detent;
if (oldest < txn->txnid - xMDBX_TXNID_STEP) {
oldest = mvcc_kick_laggards(env, oldest);
if (oldest >= detent)
goto retry_gc_have_oldest;
}
//---------------------------------------------------------------------------
@ -1273,7 +1263,7 @@ done:
if (likely((flags & ALLOC_RESERVE) == 0)) {
if (pgno) {
eASSERT(env, pgno + num <= txn->geo.first_unallocated && pgno >= NUM_METAS);
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
} else {
pgno = txn->geo.first_unallocated;
txn->geo.first_unallocated += (pgno_t)num;
@ -1285,42 +1275,32 @@ done:
if (unlikely(ret.err != MDBX_SUCCESS)) {
fail:
eASSERT(env, ret.err != MDBX_SUCCESS);
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
int level;
if (flags & ALLOC_UNIMPORTANT)
level = MDBX_LOG_DEBUG;
else if (flags & ALLOC_RESERVE)
level = MDBX_LOG_NOTICE;
else {
const char *what;
if (flags & ALLOC_RESERVE) {
level = (flags & ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
what = num ? "reserve-pages" : "fetch-slot";
} else {
txn->flags |= MDBX_TXN_ERROR;
level = MDBX_LOG_ERROR;
what = "pages";
}
if (LOG_ENABLED(level)) {
if (num)
debug_log(level, __func__, __LINE__,
"unable %s %zu, alloc-flags 0x%x, err %d, txn-flags "
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
"branch %zu, leaf %zu, large %zu, entries %zu\n",
(flags & ALLOC_RESERVE) ? "reserve" : "alloc", num, flags, ret.err, txn->flags,
MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count, txn->dbs[FREE_DBI].height,
(size_t)txn->dbs[FREE_DBI].branch_pages, (size_t)txn->dbs[FREE_DBI].leaf_pages,
(size_t)txn->dbs[FREE_DBI].large_pages, (size_t)txn->dbs[FREE_DBI].items);
else
debug_log(level, __func__, __LINE__,
"unable fetch-slot, alloc-flags 0x%x, err %d, txn-flags "
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
"branch %zu, leaf %zu, large %zu, entries %zu\n",
flags, ret.err, txn->flags, MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count,
txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages,
(size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages,
(size_t)txn->dbs[FREE_DBI].items);
}
if (LOG_ENABLED(level))
debug_log(level, __func__, __LINE__,
"unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags "
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
"branch %zu, leaf %zu, large %zu, entries %zu\n",
num, what, flags, ret.err, txn->flags, MDBX_PNL_GETSIZE(txn->tw.repnl), txn->tw.loose_count,
txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages,
(size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages,
(size_t)txn->dbs[FREE_DBI].items);
ret.page = nullptr;
}
if (num > 1)
txn->wr.gc.spent += monotime_since_cached(monotime_begin, &now_cache);
txn->tw.gc.time_acc += monotime_since_cached(monotime_begin, &now_cache);
} else {
reserve_done:
early_exit:
DEBUG("return nullptr for %zu pages for ALLOC_%s, rc %d", num, num ? "RESERVE" : "SLOT", ret.err);
ret.page = nullptr;
}
@ -1337,20 +1317,20 @@ __hot pgr_t gc_alloc_single(const MDBX_cursor *const mc) {
tASSERT(txn, F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY));
/* If there are any loose pages, just use them */
while (likely(txn->wr.loose_pages)) {
while (likely(txn->tw.loose_pages)) {
#if MDBX_ENABLE_REFUND
if (unlikely(txn->wr.loose_refund_wl > txn->geo.first_unallocated)) {
if (unlikely(txn->tw.loose_refund_wl > txn->geo.first_unallocated)) {
txn_refund(txn);
if (!txn->wr.loose_pages)
if (!txn->tw.loose_pages)
break;
}
#endif /* MDBX_ENABLE_REFUND */
page_t *lp = txn->wr.loose_pages;
page_t *lp = txn->tw.loose_pages;
MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->env->ps);
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
txn->wr.loose_pages = page_next(lp);
txn->wr.loose_count--;
txn->tw.loose_pages = page_next(lp);
txn->tw.loose_count--;
DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, cursor_dbi_dbg(mc), lp->pgno);
tASSERT(txn, lp->pgno < txn->geo.first_unallocated);
tASSERT(txn, lp->pgno >= NUM_METAS);
@ -1360,7 +1340,7 @@ __hot pgr_t gc_alloc_single(const MDBX_cursor *const mc) {
return ret;
}
if (likely(MDBX_PNL_GETSIZE(txn->wr.repnl) > 0))
if (likely(MDBX_PNL_GETSIZE(txn->tw.repnl) > 0))
return page_alloc_finalize(txn->env, txn, mc, repnl_get_single(txn), 1);
return gc_alloc_ex(mc, 1, ALLOC_DEFAULT);
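
/* A minimal self-contained sketch, not the actual libmdbx code, of the
 * loose-page reuse pattern above: unused dirty pages are kept on an intrusive
 * singly-linked list and popped before falling back to repnl/GC. The types
 * and names below are simplified assumptions. */
#include <stddef.h>
#include <stdio.h>

typedef struct demo_page {
  unsigned pgno;
  struct demo_page *next; /* stands in for page_next(), stored inside the page image in libmdbx */
} demo_page_t;

static demo_page_t *loose_pop(demo_page_t **head, size_t *count) {
  demo_page_t *lp = *head;
  if (lp) {
    *head = lp->next; /* unlink the first loose page */
    *count -= 1;
  }
  return lp; /* NULL means the caller must fall back to the reclaimed list or GC */
}

int main(void) {
  demo_page_t p2 = {7, NULL}, p1 = {5, &p2};
  demo_page_t *head = &p1;
  size_t count = 2;
  for (demo_page_t *lp; (lp = loose_pop(&head, &count)) != NULL;)
    printf("reuse loose page %u, %zu left\n", lp->pgno, count);
  return 0;
}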

File diff suppressed because it is too large


@ -5,37 +5,14 @@
#include "essentials.h"
/* Histogram for deciding how to slice fragments when identifiers/slots are in short supply. */
typedef struct gc_dense_histogram {
/* The size of the array simultaneously sets the maximum length of the
* sequences for which the distribution problem is solved.
*
* Using long sequences is counterproductive, since such sequences will
* create/reproduce/repeat similar difficulties during subsequent recycling.
* However, in rare situations this may be the only way out. */
unsigned end;
pgno_t array[31];
} gc_dense_histogram_t;
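
/* An illustrative, hypothetical way to fill such a histogram from a sorted
 * ascending page list: count runs of consecutive page numbers and bucket them
 * by length, clamping at the bounded array size (mirroring array[31] above).
 * This is a sketch, not the libmdbx implementation. */
#include <stdio.h>

#define DEMO_HIST_LEN 31

static void fill_histogram(const unsigned *pages, unsigned n, unsigned hist[DEMO_HIST_LEN]) {
  for (unsigned i = 0; i < DEMO_HIST_LEN; ++i)
    hist[i] = 0;
  for (unsigned i = 0; i < n;) {
    unsigned run = 1;
    while (i + run < n && pages[i + run] == pages[i] + run)
      run += 1; /* extend the contiguous run */
    hist[(run < DEMO_HIST_LEN) ? run - 1 : DEMO_HIST_LEN - 1] += 1; /* clamp long runs */
    i += run;
  }
}

int main(void) {
  const unsigned pages[] = {3, 4, 5, 9, 12, 13};
  unsigned hist[DEMO_HIST_LEN];
  fill_histogram(pages, 6, hist);
  printf("runs: len1=%u len2=%u len3=%u\n", hist[0], hist[1], hist[2]); /* 1, 1, 1 */
  return 0;
}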
typedef struct gc_update_context {
unsigned loop;
unsigned goodchunk;
bool dense;
pgno_t prev_first_unallocated;
bool dense;
size_t reserve_adj;
size_t retired_stored;
size_t return_reserved_lo, return_reserved_hi;
txnid_t gc_first;
intptr_t return_left;
#ifndef MDBX_DEBUG_GCU
#define MDBX_DEBUG_GCU 0
#endif
#if MDBX_DEBUG_GCU
struct {
txnid_t prev;
unsigned n;
} dbg;
#endif /* MDBX_DEBUG_GCU */
rkl_t ready4reuse, sequel;
size_t amount, reserved, cleaned_slot, reused_slot, fill_idx;
txnid_t cleaned_id, rid;
#if MDBX_ENABLE_BIGFOOT
txnid_t bigfoot;
#endif /* MDBX_ENABLE_BIGFOOT */
@ -43,38 +20,21 @@ typedef struct gc_update_context {
MDBX_cursor cursor;
cursor_couple_t couple;
};
gc_dense_histogram_t dense_histogram;
} gcu_t;
MDBX_INTERNAL int gc_put_init(MDBX_txn *txn, gcu_t *ctx);
MDBX_INTERNAL void gc_put_destroy(gcu_t *ctx);
#define ALLOC_DEFAULT 0 /* regular/normal page allocation */
#define ALLOC_UNIMPORTANT 1 /* the request is unimportant; failure to allocate won't fail the transaction */
#define ALLOC_RESERVE 2 /* preparing a reserve for the GC update, without allocation */
#define ALLOC_COALESCE 4 /* internal state/flag */
#define ALLOC_SHOULD_SCAN 8 /* internal state/flag */
#define ALLOC_LIFO 16 /* internal state/flag */
static inline int gc_update_init(MDBX_txn *txn, gcu_t *ctx) {
memset(ctx, 0, offsetof(gcu_t, cursor));
ctx->dense = txn->txnid <= MIN_TXNID;
#if MDBX_ENABLE_BIGFOOT
ctx->bigfoot = txn->txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
return cursor_init(&ctx->cursor, txn, FREE_DBI);
}
#define ALLOC_DEFAULT 0
#define ALLOC_RESERVE 1
#define ALLOC_UNIMPORTANT 2
MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags);
MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc);
MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx);
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_stockpile(const MDBX_txn *txn) {
return MDBX_PNL_GETSIZE(txn->wr.repnl) + txn->wr.loose_count;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_chunk_bytes(const size_t chunk) {
return (chunk + 1) * sizeof(pgno_t);
}
MDBX_INTERNAL bool gc_repnl_has_span(const MDBX_txn *txn, const size_t num);
static inline bool gc_is_reclaimed(const MDBX_txn *txn, const txnid_t id) {
return rkl_contain(&txn->wr.gc.reclaimed, id) || rkl_contain(&txn->wr.gc.comeback, id);
}
static inline txnid_t txnid_min(txnid_t a, txnid_t b) { return (a < b) ? a : b; }
static inline txnid_t txnid_max(txnid_t a, txnid_t b) { return (a > b) ? a : b; }


@ -41,12 +41,11 @@ typedef struct node_search_result {
typedef struct bind_reader_slot_result {
int err;
reader_slot_t *slot;
reader_slot_t *rslot;
} bsr_t;
#include "atomics-ops.h"
#include "proto.h"
#include "rkl.h"
#include "txl.h"
#include "unaligned.h"
#if defined(_WIN32) || defined(_WIN64)
@ -156,8 +155,7 @@ enum txn_flags {
txn_rw_begin_flags = MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY,
txn_shrink_allowed = UINT32_C(0x40000000),
txn_parked = MDBX_TXN_PARKED,
txn_gc_drained = 0x80 /* GC was depleted up to oldest reader */,
txn_may_have_cursors = 0x100,
txn_gc_drained = 0x40 /* GC was depleted up to oldest reader */,
txn_state_flags = MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD |
MDBX_TXN_INVALID | txn_gc_drained
};
@ -207,16 +205,17 @@ struct MDBX_txn {
union {
struct {
/* For read txns: This thread/txn's slot table slot, or nullptr. */
reader_slot_t *slot;
} ro;
/* For read txns: This thread/txn's reader table slot, or nullptr. */
reader_slot_t *reader;
} to;
struct {
troika_t troika;
pnl_t __restrict repnl; /* Reclaimed GC pages */
struct {
rkl_t reclaimed; /* The list of reclaimed txn-ids from GC */
uint64_t spent; /* Time spent reading and searching GC */
rkl_t comeback; /* The list of ids of records returned into GC during commit, etc */
/* The list of reclaimed txn-ids from GC */
txl_t __restrict retxl;
txnid_t last_reclaimed; /* ID of last used record */
uint64_t time_acc;
} gc;
bool prefault_write_activated;
#if MDBX_ENABLE_REFUND
@ -236,7 +235,7 @@ struct MDBX_txn {
/* The list of loose pages that became unused and may be reused
* in this transaction, linked through `page_next()`. */
page_t *__restrict loose_pages;
/* Number of loose pages (wr.loose_pages) */
/* Number of loose pages (tw.loose_pages) */
size_t loose_count;
union {
struct {
@ -250,7 +249,7 @@ struct MDBX_txn {
size_t writemap_spilled_npages;
};
/* In write txns, next is located the array of cursors for each DB */
} wr;
} tw;
};
};
@ -286,14 +285,13 @@ struct MDBX_cursor {
};
/* checking flags, including bits for checking the type of leaf pages. */
uint8_t checking;
uint8_t pad;
/* Points into txn->dbi_state[] for the DBI of this cursor.
* The __restrict qualifier is useful and safe here as currently understood,
* since the only possible overlap is with the transaction's dbi_state,
* and it happens as a read before any subsequent modification/write. */
uint8_t *__restrict dbi_state;
/* Link in the transaction's cursor-tracking list. */
MDBX_txn *txn;
/* Points into tree->dbs[] for the DBI of this cursor. */
tree_t *tree;
@ -362,14 +360,15 @@ struct MDBX_env {
uint16_t subpage_reserve_prereq;
uint16_t subpage_reserve_limit;
atomic_pgno_t mlocked_pgno;
uint8_t ps2ln; /* log2 of DB page size */
int8_t stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t merge_threshold; /* pages emptier than this are candidates for merging */
unsigned max_readers; /* size of the reader table */
MDBX_dbi max_dbi; /* size of the DB table */
uint32_t pid; /* process ID of this env */
osal_thread_key_t me_txkey; /* thread-key for readers */
struct { /* path to the DB files */
uint8_t ps2ln; /* log2 of DB page size */
int8_t stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t merge_threshold, merge_threshold_gc; /* pages emptier than this are
candidates for merging */
unsigned max_readers; /* size of the reader table */
MDBX_dbi max_dbi; /* size of the DB table */
uint32_t pid; /* process ID of this env */
osal_thread_key_t me_txkey; /* thread-key for readers */
struct { /* path to the DB files */
pathchar_t *lck, *dxb, *specified;
void *buffer;
} pathname;
@ -466,9 +465,6 @@ struct MDBX_env {
/* --------------------------------------------------- mostly volatile part */
MDBX_txn *txn; /* current write transaction */
struct {
txnid_t detent;
} gc;
osal_fastmutex_t dbi_lock;
unsigned n_dbi; /* number of DBs opened */
@ -540,9 +536,7 @@ MDBX_MAYBE_UNUSED static void static_checks(void) {
STATIC_ASSERT(offsetof(lck_t, cached_oldest) % MDBX_CACHELINE_SIZE == 0);
STATIC_ASSERT(offsetof(lck_t, rdt_length) % MDBX_CACHELINE_SIZE == 0);
#endif /* MDBX_LOCKING */
#if FLEXIBLE_ARRAY_MEMBERS
STATIC_ASSERT(offsetof(lck_t, rdt) % MDBX_CACHELINE_SIZE == 0);
#endif /* FLEXIBLE_ARRAY_MEMBERS */
#if FLEXIBLE_ARRAY_MEMBERS
STATIC_ASSERT(NODESIZE == offsetof(node_t, payload));
@ -551,7 +545,11 @@ MDBX_MAYBE_UNUSED static void static_checks(void) {
STATIC_ASSERT(sizeof(clc_t) == 3 * sizeof(void *));
STATIC_ASSERT(sizeof(kvx_t) == 8 * sizeof(void *));
#define KVX_SIZE_LN2 MDBX_WORDBITS_LN2
#if MDBX_WORDBITS == 64
#define KVX_SIZE_LN2 6
#else
#define KVX_SIZE_LN2 5
#endif
STATIC_ASSERT(sizeof(kvx_t) == (1u << KVX_SIZE_LN2));
}
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */


@ -186,7 +186,7 @@ typedef struct reader_slot {
/* The header for the reader table (a memory-mapped lock file). */
typedef struct shared_lck {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
* It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */
uint64_t magic_and_version;
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */


@ -49,7 +49,7 @@
* = F_WRLCK, a lock on the first byte of the lck-file; other processes wait
* for it to be released when acquiring F_RDLCK via F_SETLKW.
* - locks on the dxb-file may change until the exclusive lock on the
* lck-file is released:
* + for the NON-exclusive mode, the pid-byte in the dxb-file is locked
* via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
* + for the EXCLUSIVE mode, the whole dxb-file is locked
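
/* A hedged sketch of the first-byte locking idiom described above, using plain
 * POSIX fcntl(); the path and error handling are illustrative and do not
 * reproduce the actual libmdbx lck-file protocol. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int lock_first_byte(int fd, short type /* F_RDLCK or F_WRLCK */) {
  struct flock lk;
  lk.l_type = type;
  lk.l_whence = SEEK_SET;
  lk.l_start = 0; /* the first byte of the lck-file */
  lk.l_len = 1;
  lk.l_pid = 0;
  return fcntl(fd, F_SETLKW, &lk); /* F_SETLKW blocks until the lock is granted */
}

int main(void) {
  int fd = open("/tmp/example.lck", O_RDWR | O_CREAT, 0644);
  if (fd < 0 || lock_first_byte(fd, F_WRLCK) != 0) {
    perror("lock");
    return 1;
  }
  puts("exclusive byte-lock held");
  close(fd); /* closing the descriptor releases POSIX record locks */
  return 0;
}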


@ -1,6 +1,6 @@
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_CHK 1 "2025-01-14" "MDBX 0.14"
.TH MDBX_CHK 1 "2024-08-29" "MDBX 0.13"
.SH NAME
mdbx_chk \- MDBX checking tool
.SH SYNOPSIS


@ -2,7 +2,7 @@
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_COPY 1 "2025-01-14" "MDBX 0.14"
.TH MDBX_COPY 1 "2024-08-29" "MDBX 0.13"
.SH NAME
mdbx_copy \- MDBX environment copy tool
.SH SYNOPSIS


@ -1,7 +1,7 @@
.\" Copyright 2021-2025 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_DROP 1 "2025-01-14" "MDBX 0.14"
.TH MDBX_DROP 1 "2024-08-29" "MDBX 0.13"
.SH NAME
mdbx_drop \- MDBX database delete tool
.SH SYNOPSIS


@ -2,7 +2,7 @@
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_DUMP 1 "2025-01-14" "MDBX 0.14"
.TH MDBX_DUMP 1 "2024-08-29" "MDBX 0.13"
.SH NAME
mdbx_dump \- MDBX environment export tool
.SH SYNOPSIS
@ -12,8 +12,6 @@ mdbx_dump \- MDBX environment export tool
[\c
.BR \-q ]
[\c
.BR \-c ]
[\c
.BI \-f \ file\fR]
[\c
.BR \-l ]
@ -43,9 +41,6 @@ Write the library version number to the standard output, and exit.
.BR \-q
Be quiet.
.TP
.BR \-c
Concise mode without repeating keys in a dump, but incompatible with Berkeley DB and LMDB.
.TP
.BR \-f \ file
Write to the specified file instead of to the standard output.
.TP


@ -2,7 +2,7 @@
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_LOAD 1 "2025-01-14" "MDBX 0.14"
.TH MDBX_LOAD 1 "2024-08-29" "MDBX 0.13"
.SH NAME
mdbx_load \- MDBX environment import tool
.SH SYNOPSIS


@ -2,7 +2,7 @@
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_STAT 1 "2025-01-14" "MDBX 0.14"
.TH MDBX_STAT 1 "2024-08-29" "MDBX 0.13"
.SH NAME
mdbx_stat \- MDBX environment status tool
.SH SYNOPSIS


@ -252,9 +252,9 @@ __cold int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto) {
/* force oldest refresh */
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
env->basal_txn->wr.troika = meta_tap(env);
env->basal_txn->tw.troika = meta_tap(env);
for (MDBX_txn *scan = env->basal_txn->nested; scan; scan = scan->nested)
scan->wr.troika = env->basal_txn->wr.troika;
scan->tw.troika = env->basal_txn->tw.troika;
return err;
}


@ -50,23 +50,23 @@ bsr_t mvcc_bind_slot(MDBX_env *env) {
}
}
result.slot = &env->lck->rdt[slot];
result.rslot = &env->lck->rdt[slot];
/* Claim the reader slot, carefully, since other code
* uses the reader table un-mutexed: first reset the
* slot, then publish it in lck->rdt_length. After
* that, it is safe for mdbx_env_close() to touch it.
* Once it has been closed, we can finally claim it. */
atomic_store32(&result.slot->pid, 0, mo_AcquireRelease);
safe64_reset(&result.slot->txnid, true);
atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
safe64_reset(&result.rslot->txnid, true);
if (slot == nreaders)
env->lck->rdt_length.weak = (uint32_t)++nreaders;
result.slot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
atomic_store32(&result.slot->pid, env->pid, mo_AcquireRelease);
result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
lck_rdt_unlock(env);
if (likely(env->flags & ENV_TXKEY)) {
eASSERT(env, env->registered_reader_pid == env->pid);
thread_rthc_set(env->me_txkey, result.slot);
thread_rthc_set(env->me_txkey, result.rslot);
}
return result;
}
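
/* Not the libmdbx code: a compact C11 illustration of the claim order used in
 * mvcc_bind_slot() above, i.e. reset the slot first, publish the table length
 * next, and store the pid last, so concurrent un-mutexed scanners never see a
 * half-initialized slot. Names and types are simplified assumptions. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  atomic_uint pid;
  atomic_uint_least64_t txnid;
} demo_slot_t;

static void claim_slot(demo_slot_t *slot, atomic_uint *table_len, unsigned idx, unsigned my_pid) {
  atomic_store_explicit(&slot->pid, 0, memory_order_release);           /* 1: reset */
  atomic_store_explicit(&slot->txnid, UINT64_MAX, memory_order_release);
  if (idx >= atomic_load_explicit(table_len, memory_order_relaxed))
    atomic_store_explicit(table_len, idx + 1, memory_order_release);    /* 2: publish */
  atomic_store_explicit(&slot->pid, my_pid, memory_order_release);      /* 3: claim */
}

int main(void) {
  demo_slot_t slot = {0};
  atomic_uint len = 0;
  claim_slot(&slot, &len, 0, 42);
  printf("slot pid=%u, table length=%u\n", atomic_load(&slot.pid), atomic_load(&len));
  return 0;
}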
@ -300,7 +300,7 @@ __cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, int *d
return rc;
}
__cold bool mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
osal_memory_fence(mo_AcquireRelease, false);
MDBX_hsr_func *const callback = env->hsr_callback;
@ -308,7 +308,7 @@ __cold bool mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
bool notify_eof_of_loop = false;
int retry = 0;
do {
const txnid_t steady = env->txn->wr.troika.txnid[env->txn->wr.troika.prefer_steady];
const txnid_t steady = env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
oldest = mvcc_shapshot_oldest(env, steady);
eASSERT(env, oldest < env->basal_txn->txnid);
@ -374,7 +374,7 @@ __cold bool mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
if (safe64_read(&stucked->txnid) != straggler || !pid)
continue;
const meta_ptr_t head = meta_recent(env, &env->txn->wr.troika);
const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
const uint64_t head_retired = unaligned_peek_u64(4, head.ptr_c->pages_retired);
const size_t space = (head_retired > hold_retired) ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0;
@ -410,5 +410,5 @@ __cold bool mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, straggler, oldest, turn);
callback(env, env->txn, 0, 0, straggler, (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
}
return oldest > straggler;
return oldest;
}


@ -50,9 +50,14 @@ int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, const MDBX
is_subpage(mp) ? "sub-" : "", mp->pgno, indx, pgno, key ? key->iov_len : 0, DKEY_DEBUG(key));
cASSERT(mc, page_type(mp) == P_BRANCH);
cASSERT(mc, mp->txnid >= mc->txn->front_txnid);
STATIC_ASSERT(NODESIZE % 2 == 0);
/* Move higher pointers up one slot. */
const size_t nkeys = page_numkeys(mp);
cASSERT(mc, nkeys >= indx);
for (size_t i = nkeys; i > indx; --i)
mp->entries[i] = mp->entries[i - 1];
/* Adjust free space offsets. */
const size_t branch_bytes = branch_size(mc->txn->env, key);
const intptr_t lower = mp->lower + sizeof(indx_t);
@ -61,13 +66,6 @@ int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, const MDBX
mc->txn->flags |= MDBX_TXN_ERROR;
return MDBX_PAGE_FULL;
}
/* Move higher pointers up one slot. */
const size_t nkeys = page_numkeys(mp);
cASSERT(mc, nkeys >= indx);
for (size_t i = nkeys; i > indx; --i)
mp->entries[i] = mp->entries[i - 1];
mp->lower = (indx_t)lower;
mp->entries[indx] = mp->upper = (indx_t)upper;


@ -257,14 +257,6 @@
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
/** if enabled then treats the commit of pure (nothing changed) transactions as special
* cases and returns \ref MDBX_RESULT_TRUE instead of \ref MDBX_SUCCESS. */
#ifndef MDBX_NOSUCCESS_PURE_COMMIT
#define MDBX_NOSUCCESS_PURE_COMMIT 0
#elif !(MDBX_NOSUCCESS_PURE_COMMIT == 0 || MDBX_NOSUCCESS_PURE_COMMIT == 1)
#error MDBX_NOSUCCESS_PURE_COMMIT must be defined as 0 or 1
#endif /* MDBX_NOSUCCESS_PURE_COMMIT */
/** if enabled then instead of the returned error `MDBX_REMOTE`, only a warning is issued, when
* the database being opened in non-read-only mode is located in a file system exported via NFS. */
#ifndef MDBX_ENABLE_NON_READONLY_EXPORT


@ -248,7 +248,7 @@ __cold void mdbx_panic(const char *fmt, ...) {
unlikely(num < 1 || !message) ? "<troubles with panic-message preparation>" : message;
if (globals.logger.ptr)
debug_log(MDBX_LOG_FATAL, "mdbx-panic", 0, "%s", const_message);
debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message);
while (1) {
#if defined(_WIN32) || defined(_WIN64)
@ -262,7 +262,7 @@ __cold void mdbx_panic(const char *fmt, ...) {
#endif
FatalExit(ERROR_UNHANDLED_ERROR);
#else
__assert_fail(const_message, "mdbx-panic", 0, const_message);
__assert_fail(const_message, "mdbx", 0, "panic");
abort();
#endif
}
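
/* A simplified take on the panic path above, assuming only stdio is available:
 * format the message once, report it, and never return. This is a sketch, not
 * the mdbx_panic() implementation. */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

__attribute__((noreturn)) static void demo_panic(const char *fmt, ...) {
  char message[256];
  va_list ap;
  va_start(ap, fmt);
  const int num = vsnprintf(message, sizeof(message), fmt, ap);
  va_end(ap);
  fprintf(stderr, "panic: %s\n", (num < 1) ? "<troubles with panic-message preparation>" : message);
  abort(); /* never returns, matching the FatalExit/__assert_fail endings above */
}

int main(void) { demo_panic("unrecoverable state at %s:%d", __FILE__, __LINE__); }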


@ -443,8 +443,8 @@ static __always_inline pgr_t page_get_inline(const uint16_t ILL, const MDBX_curs
const size_t i = dpl_search(spiller, pgno);
tASSERT(txn, (intptr_t)i > 0);
if (spiller->wr.dirtylist->items[i].pgno == pgno) {
r.page = spiller->wr.dirtylist->items[i].ptr;
if (spiller->tw.dirtylist->items[i].pgno == pgno) {
r.page = spiller->tw.dirtylist->items[i].ptr;
break;
}
@ -457,8 +457,6 @@ static __always_inline pgr_t page_get_inline(const uint16_t ILL, const MDBX_curs
goto bailout;
}
TRACE("dbi %zu, mc %p, page %u, %p", cursor_dbi(mc), __Wpedantic_format_voidptr(mc), pgno,
__Wpedantic_format_voidptr(r.page));
if (unlikely(mc->checking & z_pagecheck))
return check_page_complete(ILL, r.page, mc, front);


@ -144,14 +144,14 @@ __cold pgr_t __must_check_result page_unspill(MDBX_txn *const txn, const page_t
}
__hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
tASSERT(txn, is_modifable(txn, mp) && txn->wr.dirtylist);
tASSERT(txn, is_modifable(txn, mp) && txn->tw.dirtylist);
tASSERT(txn, !is_largepage(mp) && !is_subpage(mp));
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const size_t n = dpl_search(txn, mp->pgno);
if (MDBX_AVOID_MSYNC && unlikely(txn->wr.dirtylist->items[n].pgno != mp->pgno)) {
if (MDBX_AVOID_MSYNC && unlikely(txn->tw.dirtylist->items[n].pgno != mp->pgno)) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP));
tASSERT(txn, n > 0 && n <= txn->wr.dirtylist->length + 1);
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1);
VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
#if MDBX_ENABLE_PGOP_STAT
txn->env->lck->pgops.unspill.weak += 1;
@ -159,11 +159,11 @@ __hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
return page_dirty(txn, (page_t *)mp, 1);
}
tASSERT(txn, n > 0 && n <= txn->wr.dirtylist->length);
tASSERT(txn, txn->wr.dirtylist->items[n].pgno == mp->pgno && txn->wr.dirtylist->items[n].ptr == mp);
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length);
tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->pgno && txn->tw.dirtylist->items[n].ptr == mp);
if (!MDBX_AVOID_MSYNC || (txn->flags & MDBX_WRITEMAP) == 0) {
size_t *const ptr = ptr_disp(txn->wr.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->wr.dirtylru;
size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->tw.dirtylru;
}
return MDBX_SUCCESS;
}
@ -179,7 +179,7 @@ __hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *c
page_t *np;
if (is_frozen(txn, mp)) {
/* CoW the page */
rc = pnl_need(&txn->wr.retired_pages, 1);
rc = pnl_need(&txn->tw.retired_pages, 1);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
const pgr_t par = gc_alloc_single(mc);
@ -191,7 +191,7 @@ __hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *c
const pgno_t pgno = np->pgno;
DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno, pgno);
tASSERT(txn, mp->pgno != pgno);
pnl_append_prereserved(txn->wr.retired_pages, mp->pgno);
pnl_append_prereserved(txn->tw.retired_pages, mp->pgno);
/* Update the parent page, if any, to point to the new page */
if (likely(mc->top)) {
page_t *parent = mc->pg[mc->top - 1];
@ -227,7 +227,7 @@ __hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *c
}
DEBUG("clone db %d page %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno);
tASSERT(txn, txn->wr.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
tASSERT(txn, txn->tw.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
/* No - copy it */
np = page_shadow_alloc(txn, 1);
if (unlikely(!np)) {
@ -369,7 +369,7 @@ static inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) {
* the page is not adjacent to any of those already in reclaimed.
* 2) it is worth considering, for a large loose list, dropping
* half of it into reclaimed. */
return txn->wr.loose_count < txn->env->options.dp_loose_limit &&
return txn->tw.loose_count < txn->env->options.dp_loose_limit &&
(!MDBX_ENABLE_REFUND ||
/* skip pages near to the end in favor of compactification */
txn->geo.first_unallocated > pgno + txn->env->options.dp_loose_limit ||
@ -417,14 +417,14 @@ int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, page_t *mp /* maybe null
status = frozen;
if (ASSERT_ENABLED()) {
for (MDBX_txn *scan = txn; scan; scan = scan->parent) {
tASSERT(txn, !txn->wr.spilled.list || !spill_search(scan, pgno));
tASSERT(txn, !scan->wr.dirtylist || !debug_dpl_find(scan, pgno));
tASSERT(txn, !txn->tw.spilled.list || !spill_search(scan, pgno));
tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
}
}
goto status_done;
} else if (pageflags && txn->wr.dirtylist) {
} else if (pageflags && txn->tw.dirtylist) {
if ((di = dpl_exist(txn, pgno)) != 0) {
mp = txn->wr.dirtylist->items[di].ptr;
mp = txn->tw.dirtylist->items[di].ptr;
tASSERT(txn, is_modifable(txn, mp));
status = modifable;
goto status_done;
@ -461,16 +461,16 @@ int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, page_t *mp /* maybe null
tASSERT(txn, !is_spilled(txn, mp));
tASSERT(txn, !is_shadowed(txn, mp));
tASSERT(txn, !debug_dpl_find(txn, pgno));
tASSERT(txn, !txn->wr.spilled.list || !spill_search(txn, pgno));
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
} else if (is_modifable(txn, mp)) {
status = modifable;
if (txn->wr.dirtylist)
if (txn->tw.dirtylist)
di = dpl_exist(txn, pgno);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) || !is_spilled(txn, mp));
tASSERT(txn, !txn->wr.spilled.list || !spill_search(txn, pgno));
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
} else if (is_shadowed(txn, mp)) {
status = shadowed;
tASSERT(txn, !txn->wr.spilled.list || !spill_search(txn, pgno));
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
tASSERT(txn, !debug_dpl_find(txn, pgno));
} else {
tASSERT(txn, is_spilled(txn, mp));
@ -504,7 +504,7 @@ status_done:
if (status == frozen) {
retire:
DEBUG("retire %zu page %" PRIaPGNO, npages, pgno);
rc = pnl_append_span(&txn->wr.retired_pages, pgno, npages);
rc = pnl_append_span(&txn->tw.retired_pages, pgno, npages);
tASSERT(txn, dpl_check(txn));
return rc;
}
@ -560,17 +560,17 @@ status_done:
if (status == modifable) {
/* Dirty page from this transaction */
/* If suitable we can reuse it through loose list */
if (likely(npages == 1 && suitable4loose(txn, pgno)) && (di || !txn->wr.dirtylist)) {
if (likely(npages == 1 && suitable4loose(txn, pgno)) && (di || !txn->tw.dirtylist)) {
DEBUG("loosen dirty page %" PRIaPGNO, pgno);
if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
memset(page_data(mp), -1, txn->env->ps - PAGEHDRSZ);
mp->txnid = INVALID_TXNID;
mp->flags = P_LOOSE;
page_next(mp) = txn->wr.loose_pages;
txn->wr.loose_pages = mp;
txn->wr.loose_count++;
page_next(mp) = txn->tw.loose_pages;
txn->tw.loose_pages = mp;
txn->tw.loose_count++;
#if MDBX_ENABLE_REFUND
txn->wr.loose_refund_wl = (pgno + 2 > txn->wr.loose_refund_wl) ? pgno + 2 : txn->wr.loose_refund_wl;
txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) ? pgno + 2 : txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->env->ps - PAGEHDRSZ);
MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), txn->env->ps - PAGEHDRSZ);
@ -608,8 +608,8 @@ status_done:
reclaim:
DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno);
rc = pnl_insert_span(&txn->wr.repnl, pgno, npages);
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
rc = pnl_insert_span(&txn->tw.repnl, pgno, npages);
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
tASSERT(txn, dpl_check(txn));
return rc;
}
@ -660,10 +660,10 @@ status_done:
__hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
mp->txnid = txn->front_txnid;
if (!txn->wr.dirtylist) {
if (!txn->tw.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
txn->wr.writemap_dirty_npages += npages;
tASSERT(txn, txn->wr.spilled.list == nullptr);
txn->tw.writemap_dirty_npages += npages;
tASSERT(txn, txn->tw.spilled.list == nullptr);
return MDBX_SUCCESS;
}
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
@ -671,29 +671,29 @@ __hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npage
#if xMDBX_DEBUG_SPILLING == 2
txn->env->debug_dirtied_act += 1;
ENSURE(txn->env, txn->env->debug_dirtied_act < txn->env->debug_dirtied_est);
ENSURE(txn->env, txn->wr.dirtyroom + txn->wr.loose_count > 0);
ENSURE(txn->env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
#endif /* xMDBX_DEBUG_SPILLING == 2 */
int rc;
if (unlikely(txn->wr.dirtyroom == 0)) {
if (txn->wr.loose_count) {
page_t *lp = txn->wr.loose_pages;
if (unlikely(txn->tw.dirtyroom == 0)) {
if (txn->tw.loose_count) {
page_t *lp = txn->tw.loose_pages;
DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->pgno);
rc = pnl_insert_span(&txn->wr.repnl, lp->pgno, 1);
rc = pnl_insert_span(&txn->tw.repnl, lp->pgno, 1);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
size_t di = dpl_search(txn, lp->pgno);
tASSERT(txn, txn->wr.dirtylist->items[di].ptr == lp);
tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp);
dpl_remove(txn, di);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
txn->wr.loose_pages = page_next(lp);
txn->wr.loose_count--;
txn->wr.dirtyroom++;
txn->tw.loose_pages = page_next(lp);
txn->tw.loose_count--;
txn->tw.dirtyroom++;
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
page_shadow_release(txn->env, lp, 1);
} else {
ERROR("Dirtyroom is depleted, DPL length %zu", txn->wr.dirtylist->length);
ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length);
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
page_shadow_release(txn->env, mp, npages);
return MDBX_TXN_FULL;
@ -706,7 +706,7 @@ __hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npage
txn->flags |= MDBX_TXN_ERROR;
return rc;
}
txn->wr.dirtyroom--;
txn->tw.dirtyroom--;
tASSERT(txn, dpl_check(txn));
return MDBX_SUCCESS;
}


@ -88,7 +88,7 @@ static inline int page_touch(MDBX_cursor *mc) {
}
if (is_modifable(txn, mp)) {
if (!txn->wr.dirtylist) {
if (!txn->tw.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC);
return MDBX_SUCCESS;
}
@ -114,14 +114,14 @@ static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp, const s
mp->txnid = INVALID_TXNID;
mp->flags = P_BAD;
if (txn->wr.dirtylist) {
if (txn->tw.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
tASSERT(txn, MDBX_AVOID_MSYNC || (di && txn->wr.dirtylist->items[di].ptr == mp));
tASSERT(txn, MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp));
if (!MDBX_AVOID_MSYNC || di) {
dpl_remove_ex(txn, di, npages);
txn->wr.dirtyroom++;
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
txn->tw.dirtyroom++;
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) {
page_shadow_release(txn->env, mp, npages);
return;
@ -129,7 +129,7 @@ static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp, const s
}
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di);
txn->wr.writemap_dirty_npages -= (txn->wr.writemap_dirty_npages > npages) ? npages : txn->wr.writemap_dirty_npages;
txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) ? npages : txn->tw.writemap_dirty_npages;
}
VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), pgno2bytes(txn->env, npages) - PAGEHDRSZ);


@ -23,13 +23,6 @@ void pnl_free(pnl_t pnl) {
osal_free(pnl - 1);
}
pnl_t pnl_clone(const pnl_t src) {
pnl_t pl = pnl_alloc(MDBX_PNL_ALLOCLEN(src));
if (likely(pl))
memcpy(pl, src, MDBX_PNL_SIZEOF(src));
return pl;
}
void pnl_shrink(pnl_t __restrict *__restrict ppnl) {
assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2);
@ -241,18 +234,3 @@ __hot __noinline size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno) {
assert(!MDBX_PNL_ORDERED(it[0], pgno));
return it - begin + 1;
}
size_t pnl_maxspan(const pnl_t pnl) {
size_t len = MDBX_PNL_GETSIZE(pnl);
if (len > 1) {
size_t span = 1, left = len - span;
const pgno_t *scan = MDBX_PNL_BEGIN(pnl);
do {
const bool contiguous = MDBX_PNL_CONTIGUOUS(*scan, scan[span], span);
span += contiguous;
scan += 1 - contiguous;
} while (--left);
len = span;
}
return len;
}
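
/* A standalone rendering of the branchless span scan in pnl_maxspan() above:
 * on a contiguous neighbor the window grows, otherwise it slides forward, so
 * the loop body avoids branches. Types are simplified to plain unsigned and
 * the list is assumed ascending. */
#include <stdio.h>

static size_t demo_maxspan(const unsigned *list, size_t len) {
  if (len < 2)
    return len;
  size_t span = 1, left = len - span;
  const unsigned *scan = list;
  do {
    /* contiguous == 1 when the element `span` ahead continues the run */
    const size_t contiguous = (scan[span] - scan[0]) == span;
    span += contiguous;     /* grow the window... */
    scan += 1 - contiguous; /* ...or slide it forward */
  } while (--left);
  return span;
}

int main(void) {
  const unsigned list[] = {2, 3, 4, 8, 9, 10, 11};
  printf("longest contiguous span: %zu\n", demo_maxspan(list, 7)); /* prints 4 */
  return 0;
}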


@ -45,18 +45,16 @@ typedef const pgno_t *const_pnl_t;
#define MDBX_PNL_EDGE(pl) ((pl) + 1)
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
#define MDBX_PNL_CONTIGUOUS(prev, next, span) (((next) - (prev)) == (span))
#else
#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl))
#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#define MDBX_PNL_CONTIGUOUS(prev, next, span) (((prev) - (next)) == (span))
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
assert(size > 0 && size <= PAGELIST_LIMIT);
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
@ -71,7 +69,7 @@ MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes
return bytes;
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
size_t size = bytes / sizeof(pgno_t);
assert(size > 3 && size <= PAGELIST_LIMIT + /* alignment gap */ 65536);
size -= 3;
@ -85,8 +83,6 @@ MDBX_INTERNAL pnl_t pnl_alloc(size_t size);
MDBX_INTERNAL void pnl_free(pnl_t pnl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL pnl_t pnl_clone(const pnl_t src);
MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl, const size_t wanna);
MDBX_MAYBE_UNUSED static inline int __must_check_result pnl_need(pnl_t __restrict *__restrict ppnl, size_t num) {
@ -114,7 +110,7 @@ MDBX_INTERNAL int __must_check_result pnl_append_span(__restrict pnl_t *ppnl, pg
MDBX_INTERNAL int __must_check_result pnl_insert_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl);
@ -130,8 +126,7 @@ MDBX_MAYBE_UNUSED static inline void pnl_sort(pnl_t pnl, size_t limit4check) {
(void)limit4check;
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno,
size_t limit) {
MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno, size_t limit) {
assert(pnl_check_allocated(pnl, limit));
if (MDBX_HAVE_CMOV) {
/* cmov-accelerated binary search may read (but not use) one
@ -149,5 +144,3 @@ MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_search(con
}
MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t pnl_maxspan(const pnl_t pnl);
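
/* An illustrative branchless lower-bound search in the spirit of the
 * MDBX_HAVE_CMOV note above: the ternary half-selection typically compiles to
 * a conditional move rather than a branch. Unlike the hinted cmov variant,
 * this sketch never reads past the end of the data. */
#include <stdio.h>

static size_t demo_lower_bound(const unsigned *a, size_t n, unsigned key) {
  const unsigned *base = a;
  while (n > 1) {
    const size_t half = n / 2;
    base = (base[half - 1] < key) ? base + half : base; /* usually a cmov */
    n -= half;
  }
  return (size_t)(base - a) + (n && base[0] < key);
}

int main(void) {
  const unsigned a[] = {1, 3, 5, 7};
  printf("%zu %zu %zu\n", demo_lower_bound(a, 4, 0), demo_lower_bound(a, 4, 5),
         demo_lower_bound(a, 4, 8)); /* prints: 0 2 4 */
  return 0;
}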


@ -15,8 +15,9 @@ MDBX_INTERNAL bsr_t mvcc_bind_slot(MDBX_env *env);
MDBX_MAYBE_UNUSED MDBX_INTERNAL pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest);
MDBX_INTERNAL txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady);
MDBX_INTERNAL pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page);
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler);
MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rlocked, int *dead);
MDBX_INTERNAL bool mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
/* dxb.c */
MDBX_INTERNAL int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bits);
@ -38,54 +39,37 @@ static inline void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) {
#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */
/* txn.c */
MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
MDBX_INTERNAL txnid_t txn_snapshot_oldest(const MDBX_txn *const txn);
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);
MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags);
MDBX_INTERNAL int txn_park(MDBX_txn *txn, bool autounpark);
MDBX_INTERNAL int txn_unpark(MDBX_txn *txn);
MDBX_INTERNAL int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits);
MDBX_INTERNAL void txn_done_cursors(MDBX_txn *txn, const bool merge);
#define TXN_END_NAMES \
{"committed", "pure-commit", "abort", "reset", "fail-begin", "fail-begin-nested", "ousted", nullptr}
{"committed", "empty-commit", "abort", "reset", "fail-begin", "fail-beginchild", "ousted", nullptr}
enum {
/* txn_end operation number, for logging */
TXN_END_COMMITTED /* 0 */,
TXN_END_PURE_COMMIT /* 1 */,
TXN_END_ABORT /* 2 */,
TXN_END_RESET /* 3 */,
TXN_END_FAIL_BEGIN /* 4 */,
TXN_END_FAIL_BEGIN_NESTED /* 5 */,
TXN_END_OUSTED /* 6 */,
TXN_END_COMMITTED,
TXN_END_PURE_COMMIT,
TXN_END_ABORT,
TXN_END_RESET,
TXN_END_FAIL_BEGIN,
TXN_END_FAIL_BEGINCHILD,
TXN_END_OUSTED,
TXN_END_OPMASK = 0x07 /* mask for txn_end() operation number */,
TXN_END_UPDATE = 0x10 /* update env state (DBIs) */,
TXN_END_FREE = 0x20 /* free txn unless it is env.basal_txn */,
TXN_END_SLOT = 0x40 /* release any reader slot if NOSTICKYTHREADS */
TXN_END_EOTDONE = 0x40 /* txn's cursors already closed */,
TXN_END_SLOT = 0x80 /* release any reader slot if NOSTICKYTHREADS */
};
struct commit_timestamp {
uint64_t start, prep, gc, audit, write, sync, gc_cpu;
};
MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
MDBX_INTERNAL bool txn_gc_detent(const MDBX_txn *const txn);
MDBX_INTERNAL int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits);
MDBX_INTERNAL void txn_done_cursors(MDBX_txn *txn);
MDBX_INTERNAL int txn_shadow_cursors(const MDBX_txn *parent, const size_t dbi);
MDBX_INTERNAL MDBX_cursor *txn_gc_cursor(MDBX_txn *txn);
MDBX_INTERNAL MDBX_txn *txn_alloc(const MDBX_txn_flags_t flags, MDBX_env *env);
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);
MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags);
MDBX_INTERNAL int txn_end(MDBX_txn *txn, unsigned mode);
MDBX_INTERNAL int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags);
MDBX_INTERNAL void txn_nested_abort(MDBX_txn *nested);
MDBX_INTERNAL int txn_nested_join(MDBX_txn *txn, struct commit_timestamp *ts);
MDBX_INTERNAL MDBX_txn *txn_basal_create(const size_t max_dbi);
MDBX_INTERNAL void txn_basal_destroy(MDBX_txn *txn);
MDBX_INTERNAL int txn_basal_start(MDBX_txn *txn, unsigned flags);
MDBX_INTERNAL int txn_basal_commit(MDBX_txn *txn, struct commit_timestamp *ts);
MDBX_INTERNAL int txn_basal_end(MDBX_txn *txn, unsigned mode);
MDBX_INTERNAL int txn_ro_park(MDBX_txn *txn, bool autounpark);
MDBX_INTERNAL int txn_ro_unpark(MDBX_txn *txn);
MDBX_INTERNAL int txn_ro_start(MDBX_txn *txn, unsigned flags);
MDBX_INTERNAL int txn_ro_end(MDBX_txn *txn, unsigned mode);
MDBX_INTERNAL int txn_write(MDBX_txn *txn, iov_ctx_t *ctx);
MDBX_INTERNAL void txn_take_gcprof(const MDBX_txn *txn, MDBX_commit_latency *latency);
MDBX_INTERNAL void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len);
/* env.c */
MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode);


@ -7,7 +7,7 @@
static void refund_reclaimed(MDBX_txn *txn) {
/* Scanning in descending order */
pgno_t first_unallocated = txn->geo.first_unallocated;
const pnl_t pnl = txn->wr.repnl;
const pnl_t pnl = txn->tw.repnl;
tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == first_unallocated - 1);
#if MDBX_PNL_ASCENDING
size_t i = MDBX_PNL_GETSIZE(pnl);
@ -28,16 +28,16 @@ static void refund_reclaimed(MDBX_txn *txn) {
VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, txn->geo.first_unallocated - first_unallocated,
txn->geo.first_unallocated, first_unallocated);
txn->geo.first_unallocated = first_unallocated;
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - 1));
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - 1));
}
static void refund_loose(MDBX_txn *txn) {
tASSERT(txn, txn->wr.loose_pages != nullptr);
tASSERT(txn, txn->wr.loose_count > 0);
tASSERT(txn, txn->tw.loose_pages != nullptr);
tASSERT(txn, txn->tw.loose_count > 0);
dpl_t *const dl = txn->wr.dirtylist;
dpl_t *const dl = txn->tw.dirtylist;
if (dl) {
tASSERT(txn, dl->length >= txn->wr.loose_count);
tASSERT(txn, dl->length >= txn->tw.loose_count);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
@ -46,22 +46,22 @@ static void refund_loose(MDBX_txn *txn) {
pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
pnl_t suitable = onstack;
if (!dl || dl->length - dl->sorted > txn->wr.loose_count) {
if (!dl || dl->length - dl->sorted > txn->tw.loose_count) {
/* Dirty list is useless since unsorted. */
if (pnl_bytes2size(sizeof(onstack)) < txn->wr.loose_count) {
suitable = pnl_alloc(txn->wr.loose_count);
if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) {
suitable = pnl_alloc(txn->tw.loose_count);
if (unlikely(!suitable))
return /* this is not a reason for transaction fail */;
}
/* Collect loose-pages which may be refunded. */
tASSERT(txn, txn->geo.first_unallocated >= MIN_PAGENO + txn->wr.loose_count);
tASSERT(txn, txn->geo.first_unallocated >= MIN_PAGENO + txn->tw.loose_count);
pgno_t most = MIN_PAGENO;
size_t w = 0;
for (const page_t *lp = txn->wr.loose_pages; lp; lp = page_next(lp)) {
for (const page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) {
tASSERT(txn, lp->flags == P_LOOSE);
tASSERT(txn, txn->geo.first_unallocated > lp->pgno);
if (likely(txn->geo.first_unallocated - txn->wr.loose_count <= lp->pgno)) {
if (likely(txn->geo.first_unallocated - txn->tw.loose_count <= lp->pgno)) {
tASSERT(txn, w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) : MDBX_PNL_ALLOCLEN(suitable)));
suitable[++w] = lp->pgno;
most = (lp->pgno > most) ? lp->pgno : most;
@ -90,11 +90,11 @@ static void refund_loose(MDBX_txn *txn) {
const size_t refunded = txn->geo.first_unallocated - most;
DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, most, txn->geo.first_unallocated);
txn->geo.first_unallocated = most;
txn->wr.loose_count -= refunded;
txn->tw.loose_count -= refunded;
if (dl) {
txn->wr.dirtyroom += refunded;
txn->tw.dirtyroom += refunded;
dl->pages_including_loose -= refunded;
assert(txn->wr.dirtyroom <= txn->env->options.dp_limit);
assert(txn->tw.dirtyroom <= txn->env->options.dp_limit);
/* Filter-out dirty list */
size_t r = 0;
@ -115,8 +115,8 @@ static void refund_loose(MDBX_txn *txn) {
}
}
dpl_setlen(dl, w);
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
}
goto unlink_loose;
}
@@ -141,15 +141,15 @@ static void refund_loose(MDBX_txn *txn) {
if (dl->sorted != dl->length) {
const size_t refunded = dl->sorted - dl->length;
dl->sorted = dl->length;
txn->wr.loose_count -= refunded;
txn->wr.dirtyroom += refunded;
txn->tw.loose_count -= refunded;
txn->tw.dirtyroom += refunded;
dl->pages_including_loose -= refunded;
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
/* Filter-out loose chain & dispose refunded pages. */
unlink_loose:
for (page_t *__restrict *__restrict link = &txn->wr.loose_pages; *link;) {
for (page_t *__restrict *__restrict link = &txn->tw.loose_pages; *link;) {
page_t *dp = *link;
tASSERT(txn, dp->flags == P_LOOSE);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
@@ -168,21 +168,21 @@ static void refund_loose(MDBX_txn *txn) {
tASSERT(txn, dpl_check(txn));
if (suitable != onstack)
pnl_free(suitable);
txn->wr.loose_refund_wl = txn->geo.first_unallocated;
txn->tw.loose_refund_wl = txn->geo.first_unallocated;
}
bool txn_refund(MDBX_txn *txn) {
const pgno_t before = txn->geo.first_unallocated;
if (txn->wr.loose_pages && txn->wr.loose_refund_wl > txn->geo.first_unallocated)
if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->geo.first_unallocated)
refund_loose(txn);
while (true) {
if (MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || MDBX_PNL_MOST(txn->wr.repnl) != txn->geo.first_unallocated - 1)
if (MDBX_PNL_GETSIZE(txn->tw.repnl) == 0 || MDBX_PNL_MOST(txn->tw.repnl) != txn->geo.first_unallocated - 1)
break;
refund_reclaimed(txn);
if (!txn->wr.loose_pages || txn->wr.loose_refund_wl <= txn->geo.first_unallocated)
if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->geo.first_unallocated)
break;
const pgno_t memo = txn->geo.first_unallocated;
@@ -194,7 +194,7 @@ bool txn_refund(MDBX_txn *txn) {
if (before == txn->geo.first_unallocated)
return false;
if (txn->wr.spilled.list)
if (txn->tw.spilled.list)
/* Squash deleted pagenums if we refunded any */
spill_purge(txn);
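
The refund machinery above is dense, but its core condition is simple: pages can only be refunded (the end of the used space moved backwards) while the highest reclaimed or loose page number is exactly first_unallocated - 1. Below is a minimal standalone model of the refund_reclaimed loop, using a plain sorted array instead of the internal PNL type; all names and numbers are illustrative rather than libmdbx API.

#include <stdio.h>

/* Toy model: `repnl` is an ascending list of reclaimed page numbers and
 * `first_unallocated` is the end of the used area. While the highest
 * reclaimed page is exactly first_unallocated-1, the used area can shrink
 * instead of keeping that page around for reuse. */
static unsigned refund_model(unsigned *repnl, unsigned *len, unsigned first_unallocated) {
  while (*len && repnl[*len - 1] == first_unallocated - 1) {
    --*len;              /* drop the page from the reclaimed list... */
    --first_unallocated; /* ...and shrink the used area instead */
  }
  return first_unallocated;
}

int main(void) {
  unsigned repnl[] = {7, 40, 41, 42}; /* sorted ascending */
  unsigned len = 4;
  /* pages 40..42 adjoin the end, page 7 does not */
  unsigned end = refund_model(repnl, &len, 43);
  printf("first_unallocated: 43 -> %u, %u page(s) still reclaimed\n", end, len);
  return 0; /* prints: first_unallocated: 43 -> 40, 1 page(s) still reclaimed */
}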

src/rkl.c (639 lines)
View File

@@ -1,639 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2025
#include "internals.h"
static inline size_t rkl_size2bytes(const size_t size) {
assert(size > 0 && size <= txl_max * 2);
size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * size, txl_granulate * sizeof(txnid_t)) -
MDBX_ASSUME_MALLOC_OVERHEAD;
return bytes;
}
static inline size_t rkl_bytes2size(const size_t bytes) {
size_t size = bytes / sizeof(txnid_t);
assert(size > 0 && size <= txl_max * 2);
return size;
}
void rkl_init(rkl_t *rkl) {
rkl->list_limit = ARRAY_LENGTH(rkl->inplace);
rkl->list = rkl->inplace;
rkl_clear(rkl);
}
void rkl_clear(rkl_t *rkl) {
rkl->solid_begin = UINT64_MAX;
rkl->solid_end = 0;
rkl->list_length = 0;
}
void rkl_destroy(rkl_t *rkl) {
void *ptr = rkl->list;
rkl->list = nullptr;
if (ptr != rkl->inplace)
osal_free(ptr);
}
static inline bool solid_empty(const rkl_t *rkl) { return !(rkl->solid_begin < rkl->solid_end); }
#define RKL_ORDERED(first, last) ((first) < (last))
SEARCH_IMPL(rkl_bsearch, txnid_t, txnid_t, RKL_ORDERED)
void rkl_destructive_move(rkl_t *src, rkl_t *dst) {
assert(rkl_check(src));
dst->solid_begin = src->solid_begin;
dst->solid_end = src->solid_end;
dst->list_length = src->list_length;
if (dst->list != dst->inplace)
osal_free(dst->list);
if (src->list != src->inplace) {
dst->list = src->list;
dst->list_limit = src->list_limit;
} else {
dst->list = dst->inplace;
dst->list_limit = ARRAY_LENGTH(src->inplace);
memcpy(dst->inplace, src->list, sizeof(dst->inplace));
}
rkl_init(src);
}
static int rkl_resize(rkl_t *rkl, size_t wanna_size) {
assert(wanna_size > rkl->list_length);
assert(rkl_check(rkl));
STATIC_ASSERT(txl_max < INT_MAX / sizeof(txnid_t));
if (unlikely(wanna_size > txl_max)) {
ERROR("rkl too long (%zu >= %zu)", wanna_size, (size_t)txl_max);
return MDBX_TXN_FULL;
}
if (unlikely(wanna_size < rkl->list_length)) {
ERROR("unable shrink rkl to %zu since length is %u", wanna_size, rkl->list_length);
return MDBX_PROBLEM;
}
if (unlikely(wanna_size <= ARRAY_LENGTH(rkl->inplace))) {
if (rkl->list != rkl->inplace) {
assert(rkl->list_limit > ARRAY_LENGTH(rkl->inplace) && rkl->list_length <= ARRAY_LENGTH(rkl->inplace));
memcpy(rkl->inplace, rkl->list, sizeof(rkl->inplace));
rkl->list_limit = ARRAY_LENGTH(rkl->inplace);
osal_free(rkl->list);
rkl->list = rkl->inplace;
} else {
assert(rkl->list_limit == ARRAY_LENGTH(rkl->inplace));
}
return MDBX_SUCCESS;
}
if (wanna_size != rkl->list_limit) {
size_t bytes = rkl_size2bytes(wanna_size);
void *ptr = (rkl->list == rkl->inplace) ? osal_malloc(bytes) : osal_realloc(rkl->list, bytes);
if (unlikely(!ptr))
return MDBX_ENOMEM;
#ifdef osal_malloc_usable_size
bytes = osal_malloc_usable_size(ptr);
#endif /* osal_malloc_usable_size */
rkl->list_limit = rkl_bytes2size(bytes);
if (rkl->list == rkl->inplace)
memcpy(ptr, rkl->inplace, sizeof(rkl->inplace));
rkl->list = ptr;
}
return MDBX_SUCCESS;
}
int rkl_copy(const rkl_t *src, rkl_t *dst) {
assert(rkl_check(src));
rkl_init(dst);
if (!rkl_empty(src)) {
if (dst->list_limit < src->list_length) {
int err = rkl_resize(dst, src->list_limit);
if (unlikely(err != MDBX_SUCCESS))
return err;
}
memcpy(dst->list, src->list, sizeof(txnid_t) * src->list_length);
dst->list_length = src->list_length;
dst->solid_begin = src->solid_begin;
dst->solid_end = src->solid_end;
}
return MDBX_SUCCESS;
}
size_t rkl_len(const rkl_t *rkl) { return rkl_empty(rkl) ? 0 : rkl->solid_end - rkl->solid_begin + rkl->list_length; }
__hot bool rkl_contain(const rkl_t *rkl, txnid_t id) {
assert(rkl_check(rkl));
if (id >= rkl->solid_begin && id < rkl->solid_end)
return true;
if (rkl->list_length) {
const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, id);
const txnid_t *const end = rkl->list + rkl->list_length;
assert(it >= rkl->list && it <= end);
if (it != rkl->list)
assert(RKL_ORDERED(it[-1], id));
if (it != end) {
assert(!RKL_ORDERED(it[0], id));
return *it == id;
}
}
return false;
}
__hot bool rkl_find(const rkl_t *rkl, txnid_t id, rkl_iter_t *iter) {
assert(rkl_check(rkl));
*iter = rkl_iterator(rkl, false);
if (id >= rkl->solid_begin) {
if (id < rkl->solid_end) {
iter->pos = iter->solid_offset + (unsigned)(id - rkl->solid_begin);
return true;
}
iter->pos = (unsigned)(rkl->solid_end - rkl->solid_begin);
}
if (rkl->list_length) {
const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, id);
const txnid_t *const end = rkl->list + rkl->list_length;
assert(it >= rkl->list && it <= end);
if (it != rkl->list)
assert(RKL_ORDERED(it[-1], id));
iter->pos += (unsigned)(it - rkl->list);
if (it != end) {
assert(!RKL_ORDERED(it[0], id));
return *it == id;
}
}
return false;
}
static inline txnid_t list_remove_first(rkl_t *rkl) {
assert(rkl->list_length > 0);
const txnid_t first = rkl->list[0];
if (--rkl->list_length) {
/* TODO: To avoid the memmove() we could add some headroom, or replace the length and the list pointer
 * with three fields: list_begin, list_end and list_buffer. */
size_t i = 0;
do
rkl->list[i] = rkl->list[i + 1];
while (++i <= rkl->list_length);
}
return first;
}
static inline txnid_t after_cut(rkl_t *rkl, const txnid_t out) {
if (rkl->list_length == 0 && rkl->solid_begin == rkl->solid_end) {
rkl->solid_end = 0;
rkl->solid_begin = UINT64_MAX;
}
return out;
}
static int extend_solid(rkl_t *rkl, txnid_t solid_begin, txnid_t solid_end, const txnid_t id) {
if (rkl->list_length) {
const txnid_t *i = rkl_bsearch(rkl->list, rkl->list_length, id);
const txnid_t *const end = rkl->list + rkl->list_length;
/* if the head or the tail of the list adjoins the contiguous interval,
 * move those elements from the list into the contiguous interval */
txnid_t *f = (txnid_t *)i;
while (f > rkl->list && f[-1] >= solid_begin - 1) {
f -= 1;
solid_begin -= 1;
if (unlikely(*f != solid_begin))
return MDBX_RESULT_TRUE;
}
txnid_t *t = (txnid_t *)i;
while (t < end && *t <= solid_end) {
if (unlikely(*t != solid_end))
return MDBX_RESULT_TRUE;
solid_end += 1;
t += 1;
}
if (f < t) {
rkl->list_length -= t - f;
while (t < end)
*f++ = *t++;
}
}
rkl->solid_begin = solid_begin;
rkl->solid_end = solid_end;
assert(rkl_check(rkl));
return MDBX_SUCCESS;
}
int rkl_push(rkl_t *rkl, const txnid_t id, const bool known_continuous) {
assert(id >= MIN_TXNID && id < INVALID_TXNID);
assert(rkl_check(rkl));
if (rkl->solid_begin >= rkl->solid_end) {
/* the contiguous interval is empty */
return extend_solid(rkl, id, id + 1, id);
} else if (id < rkl->solid_begin) {
if (known_continuous || id + 1 == rkl->solid_begin)
/* id adjoins solid_begin */
return extend_solid(rkl, id, rkl->solid_end, id);
} else if (id >= rkl->solid_end) {
if (known_continuous || id == rkl->solid_end)
/* id adjoins solid_end */
return extend_solid(rkl, rkl->solid_begin, id + 1, id);
} else {
/* id falls inside the interval between solid_begin and solid_end, i.e. a duplicate was given */
return MDBX_RESULT_TRUE;
}
if (rkl->list_length == 1 && rkl->solid_end == rkl->solid_begin + 1 &&
(rkl->list[0] == id + 1 || rkl->list[0] == id - 1)) {
/* The list holds a single element which the added id adjoins, and the contiguous interval also holds a single
 * element. It is better to swap the list element with the contiguous interval. */
const txnid_t couple = (rkl->list[0] == id - 1) ? id - 1 : id;
rkl->list[0] = rkl->solid_begin;
rkl->solid_begin = couple;
rkl->solid_end = couple + 2;
assert(rkl_check(rkl));
return MDBX_SUCCESS;
}
if (unlikely(rkl->list_length == rkl->list_limit)) {
/* double the buffer size when it runs out of space */
size_t x2 = (rkl->list_limit + 1) << 1;
x2 = (x2 > 62) ? x2 : 62;
x2 = (x2 < txl_max) ? x2 : txl_max;
x2 = (x2 > rkl->list_length) ? x2 : rkl->list_length + 42;
int err = rkl_resize(rkl, x2);
if (unlikely(err != MDBX_SUCCESS))
return err;
assert(rkl->list_limit > rkl->list_length);
}
size_t i = rkl->list_length;
/* look for the insertion point moving from the tail of the list towards its head, shifting elements aside as we go */
while (i > 0) {
if (RKL_ORDERED(id, rkl->list[i - 1])) {
rkl->list[i] = rkl->list[i - 1];
i -= 1;
continue;
}
if (unlikely(id == rkl->list[i - 1])) {
while (++i < rkl->list_length)
rkl->list[i - 1] = rkl->list[i];
return MDBX_RESULT_TRUE;
}
break;
}
rkl->list[i] = id;
rkl->list_length++;
assert(rkl_check(rkl));
/* After adding the id, a long run may have formed inside the list,
 * which is (possibly) worth swapping with the contiguous interval. */
if (rkl->list_length > (MDBX_DEBUG ? 2 : 16) &&
((i > 0 && rkl->list[i - 1] == id - 1) || (i + 1 < rkl->list_length && rkl->list[i + 1] == id + 1))) {
txnid_t new_solid_begin = id;
size_t from = i;
while (from > 0 && rkl->list[from - 1] == new_solid_begin - 1) {
from -= 1;
new_solid_begin -= 1;
}
txnid_t new_solid_end = id + 1;
size_t to = i + 1;
while (to < rkl->list_length && rkl->list[to] == new_solid_end) {
to += 1;
new_solid_end += 1;
}
const size_t new_solid_len = to - from;
if (new_solid_len > 3) {
const size_t old_solid_len = rkl->solid_end - rkl->solid_begin;
if (new_solid_len > old_solid_len) {
/* The new contiguous run is longer than the current one.
 * The swap is considered profitable if it is cheaper than the scenario of inserting the next element into the list. */
const size_t old_solid_pos = rkl_bsearch(rkl->list, rkl->list_length, rkl->solid_begin) - rkl->list;
const size_t swap_cost =
/* the number of list elements after the run being extracted from the list
 * that have to be moved */
rkl->list_length - to +
/* the number of list elements after the insertion position of the run being
 * added into the list that have to be moved */
((from > old_solid_pos) ? from - old_solid_pos : 0)
/* the number of elements of the added run that have to be stored into the list */
+ old_solid_len;
/* the number of list elements that have to be moved to insert one more / the next element */
const size_t new_insert_cost = rkl->list_length - i;
/* coverity[logical_vs_bitwise] */
if (unlikely(swap_cost < new_insert_cost) || MDBX_DEBUG) {
/* The run being extracted is longer than the one being inserted, therefore:
 * - the list becomes shorter;
 * - the tail always has to be moved towards the head;
 * - if the leading elements need to be shifted apart,
 *   there is enough room, and the remaining trailing elements will not be overwritten. */
size_t moved = 0;
if (from > old_solid_pos) {
/* the inserted run is closer to the head; shift the leading elements apart to make room for it. */
moved = from - old_solid_pos;
do {
from -= 1;
rkl->list[from + old_solid_len] = rkl->list[from];
} while (from > old_solid_pos);
} else if (from + new_solid_len < old_solid_pos) {
/* the inserted run is farther from the head,
 * move over some of the tail elements that follow the extracted run */
do
rkl->list[from++] = rkl->list[to++];
while (from < old_solid_pos - new_solid_len);
}
/* insert the run */
i = 0;
do
rkl->list[from++] = rkl->solid_begin + i++;
while (i != old_solid_len);
/* shift the remaining tail */
while (to < rkl->list_length)
rkl->list[moved + from++] = rkl->list[to++];
rkl->list_length = rkl->list_length - new_solid_len + old_solid_len;
rkl->solid_begin = new_solid_begin;
rkl->solid_end = new_solid_end;
assert(rkl_check(rkl));
}
}
}
}
return MDBX_SUCCESS;
}
txnid_t rkl_pop(rkl_t *rkl, const bool highest_not_lowest) {
assert(rkl_check(rkl));
if (rkl->list_length) {
assert(rkl->solid_begin <= rkl->solid_end);
if (highest_not_lowest && (solid_empty(rkl) || rkl->solid_end < rkl->list[rkl->list_length - 1]))
return after_cut(rkl, rkl->list[rkl->list_length -= 1]);
if (!highest_not_lowest && (solid_empty(rkl) || rkl->solid_begin > rkl->list[0]))
return after_cut(rkl, list_remove_first(rkl));
}
if (!solid_empty(rkl))
return after_cut(rkl, highest_not_lowest ? --rkl->solid_end : rkl->solid_begin++);
assert(rkl_empty(rkl));
return 0;
}
txnid_t rkl_lowest(const rkl_t *rkl) {
if (rkl->list_length)
return (solid_empty(rkl) || rkl->list[0] < rkl->solid_begin) ? rkl->list[0] : rkl->solid_begin;
return !solid_empty(rkl) ? rkl->solid_begin : INVALID_TXNID;
}
txnid_t rkl_highest(const rkl_t *rkl) {
if (rkl->list_length)
return (solid_empty(rkl) || rkl->list[rkl->list_length - 1] >= rkl->solid_end) ? rkl->list[rkl->list_length - 1]
: rkl->solid_end - 1;
return !solid_empty(rkl) ? rkl->solid_end - 1 : 0;
}
int rkl_merge(rkl_t *dst, const rkl_t *src, bool ignore_duplicates) {
if (src->list_length) {
size_t i = src->list_length;
do {
int err = rkl_push(dst, src->list[i - 1], false);
if (unlikely(err != MDBX_SUCCESS) && (!ignore_duplicates || err != MDBX_RESULT_TRUE))
return err;
} while (--i);
}
txnid_t id = src->solid_begin;
while (id < src->solid_end) {
int err = rkl_push(dst, id, false);
if (unlikely(err != MDBX_SUCCESS) && (!ignore_duplicates || err != MDBX_RESULT_TRUE))
return err;
++id;
}
return MDBX_SUCCESS;
}
rkl_iter_t rkl_iterator(const rkl_t *rkl, const bool reverse) {
rkl_iter_t iter = {.rkl = rkl, .pos = reverse ? rkl_len(rkl) : 0, .solid_offset = 0};
if (!solid_empty(rkl) && rkl->list_length) {
const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, rkl->solid_begin);
const txnid_t *const end = rkl->list + rkl->list_length;
assert(it >= rkl->list && it <= end && (it == end || *it > rkl->solid_begin));
iter.solid_offset = it - rkl->list;
}
return iter;
}
txnid_t rkl_turn(rkl_iter_t *iter, const bool reverse) {
assert((unsigned)reverse == (unsigned)!!reverse);
size_t pos = iter->pos - reverse;
if (unlikely(pos >= rkl_len(iter->rkl)))
return 0;
iter->pos = pos + !reverse;
assert(iter->pos <= rkl_len(iter->rkl));
const size_t solid_len = iter->rkl->solid_end - iter->rkl->solid_begin;
if (iter->rkl->list_length) {
if (pos < iter->solid_offset)
return iter->rkl->list[pos];
else if (pos < iter->solid_offset + solid_len)
return iter->rkl->solid_begin + pos - iter->solid_offset;
else
return iter->rkl->list[pos - solid_len];
}
assert(pos < solid_len);
return iter->rkl->solid_begin + pos;
}
size_t rkl_left(rkl_iter_t *iter, const bool reverse) {
assert(iter->pos <= rkl_len(iter->rkl));
return reverse ? iter->pos : rkl_len(iter->rkl) - iter->pos;
}
#if 1
#define DEBUG_HOLE(hole) \
do { \
} while (0)
#else
#define DEBUG_HOLE(hole) \
do { \
printf(" return-%sward: %d, ", reverse ? "back" : "for", __LINE__); \
if (hole.begin == hole.end) \
printf("empty-hole\n"); \
else if (hole.end - hole.begin == 1) \
printf("hole %" PRIaTXN "\n", hole.begin); \
else \
printf("hole %" PRIaTXN "-%" PRIaTXN "\n", hole.begin, hole.end - 1); \
fflush(nullptr); \
} while (0)
#endif
rkl_hole_t rkl_hole(rkl_iter_t *iter, const bool reverse) {
assert((unsigned)reverse == (unsigned)!!reverse);
rkl_hole_t hole;
const size_t len = rkl_len(iter->rkl);
size_t pos = iter->pos;
if (unlikely(pos >= len)) {
if (len == 0) {
hole.begin = 1;
hole.end = MAX_TXNID;
iter->pos = 0;
DEBUG_HOLE(hole);
return hole;
} else if (pos == len && reverse) {
/* a step back from the position at the very end of the rkl */
} else if (reverse) {
hole.begin = 1;
hole.end = 1 /* rkl_lowest(iter->rkl); */;
iter->pos = 0;
DEBUG_HOLE(hole);
return hole;
} else {
hole.begin = MAX_TXNID /* rkl_highest(iter->rkl) + 1 */;
hole.end = MAX_TXNID;
iter->pos = len;
DEBUG_HOLE(hole);
return hole;
}
}
const size_t solid_len = iter->rkl->solid_end - iter->rkl->solid_begin;
if (iter->rkl->list_length) {
/* the element list is not empty */
txnid_t here, there;
for (size_t next;; pos = next) {
next = reverse ? pos - 1 : pos + 1;
if (pos < iter->solid_offset) {
/* the current position is before the contiguous interval */
here = iter->rkl->list[pos];
if (next == iter->solid_offset) {
/* the contiguous interval starts at the next position (when searching forward) */
assert(!reverse);
hole.begin = here + 1;
hole.end = iter->rkl->solid_begin;
next += solid_len;
assert(hole.begin < hole.end /* there must be a gap, otherwise it is a failure-to-merge bug */);
/* the gap between the list element preceding the solid interval and the start of that interval */
iter->pos = next - 1;
DEBUG_HOLE(hole);
return hole;
}
if (next >= len)
/* ran into the end or the beginning of the rkl */
break;
/* the next position is also before the contiguous interval */
there = iter->rkl->list[next];
} else if (pos >= iter->solid_offset + solid_len) {
/* the current position is after the contiguous interval */
here = (pos < len) ? iter->rkl->list[pos - solid_len] : MAX_TXNID;
if (next >= len)
/* ran into the end or the beginning of the rkl */
break;
if (next == iter->solid_offset + solid_len - 1) {
/* the contiguous interval ends at the next position (when searching backward) */
assert(reverse);
hole.begin = iter->rkl->solid_end;
hole.end = here;
pos = iter->solid_offset;
assert(hole.begin < hole.end /* there must be a gap, otherwise it is a failure-to-merge bug */);
/* the gap between the list element following the solid interval and the end of that interval */
iter->pos = pos;
DEBUG_HOLE(hole);
return hole;
}
/* the next position is also after the contiguous interval */
there = iter->rkl->list[next - solid_len];
} else if (reverse) {
/* the current position is inside the contiguous interval, searching backward */
next = iter->solid_offset - 1;
here = iter->rkl->solid_begin;
if (next >= len)
/* there are no list elements before the contiguous interval */
break;
/* the previous position is before the contiguous interval */
there = iter->rkl->list[next];
} else {
/* the current position is inside the contiguous interval, searching forward */
next = iter->solid_offset + solid_len;
here = iter->rkl->solid_end - 1;
if (next >= len)
/* there are no list elements after the contiguous interval */
break;
/* the next position is after the contiguous interval */
there = iter->rkl->list[next - solid_len];
}
hole.begin = (reverse ? there : here) + 1;
hole.end = reverse ? here : there;
if (hole.begin < hole.end) {
/* there is a gap between the current and the next position */
iter->pos = next;
DEBUG_HOLE(hole);
return hole;
}
}
if (reverse) {
/* ran into the beginning of the rkl; return the gap before it */
hole.begin = 1;
hole.end = here;
iter->pos = 0;
DEBUG_HOLE(hole);
} else {
/* ran into the end of the rkl; return the gap after it */
hole.begin = here + 1;
hole.end = MAX_TXNID;
iter->pos = len;
DEBUG_HOLE(hole);
}
return hole;
}
/* the element list is empty, but there is a contiguous interval */
iter->pos = reverse ? 0 : len;
if (reverse && pos < len) {
/* return the gap before the contiguous interval */
hole.begin = 1;
hole.end = iter->rkl->solid_begin;
DEBUG_HOLE(hole);
} else {
/* return the gap after the contiguous interval */
hole.begin = iter->rkl->solid_end;
hole.end = MAX_TXNID;
DEBUG_HOLE(hole);
}
return hole;
}
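
A concrete walk-through of the hole enumeration above (values picked arbitrarily): for an rkl holding the solid interval [10, 13) plus the list {5, 20}, a forward pass should report the gaps [1, 5), [6, 10), [13, 20) and [21, MAX_TXNID). The standalone sketch below enumerates the same holes over a flattened sorted array — which is exactly what rkl_hole computes without ever materializing such an array.

#include <stdint.h>
#include <stdio.h>

#define MODEL_MAX_TXNID 0xffffu /* a small stand-in for MAX_TXNID */

int main(void) {
  /* occupied txnids: list {5} + solid [10,13) + list {20}, flattened */
  const uint64_t used[] = {5, 10, 11, 12, 20};
  const size_t n = sizeof(used) / sizeof(used[0]);
  uint64_t prev = 0; /* txnids start at 1, so the first hole begins at 1 */
  for (size_t i = 0; i <= n; ++i) {
    const uint64_t next = (i < n) ? used[i] : MODEL_MAX_TXNID;
    if (prev + 1 < next)
      printf("hole [%llu, %llu)\n", (unsigned long long)(prev + 1), (unsigned long long)next);
    prev = next;
  }
  return 0; /* prints: [1, 5), [6, 10), [13, 20), [21, 65535) */
}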
bool rkl_check(const rkl_t *rkl) {
if (!rkl)
return false;
if (rkl->list == rkl->inplace && unlikely(rkl->list_limit != ARRAY_LENGTH(rkl->inplace)))
return false;
if (unlikely(rkl->list_limit < ARRAY_LENGTH(rkl->inplace)))
return false;
if (rkl_empty(rkl))
return rkl->list_length == 0 && solid_empty(rkl);
if (rkl->list_length) {
for (size_t i = 1; i < rkl->list_length; ++i)
if (unlikely(!RKL_ORDERED(rkl->list[i - 1], rkl->list[i])))
return false;
if (!solid_empty(rkl) && rkl->solid_begin - 1 <= rkl->list[rkl->list_length - 1] &&
rkl->solid_end >= rkl->list[0]) {
/* the contiguous interval "floats" inside the list, i.e. it sits between some pair of adjacent values */
const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, rkl->solid_begin);
const txnid_t *const end = rkl->list + rkl->list_length;
if (it < rkl->list || it > end)
return false;
if (it > rkl->list && it[-1] >= rkl->solid_begin)
return false;
if (it < end && it[0] <= rkl->solid_end)
return false;
}
}
return true;
}

src/rkl.h
View File

@@ -1,76 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2025
#pragma once
#include "essentials.h"
/* A sorted set of txnid values, internally combining a contiguous interval with a list.
 * It stores the ids of records during recycling, cleanup and update of the GC, including the return of the leftovers
 * of recycled pages.
 *
 * During GC recycling the records are mostly picked sequentially, but this is not guaranteed. In LIFO mode, recycling
 * and adding records to the rkl happen mostly in reverse order, but completion of reading transactions may cause
 * "jumps" in the forward direction. In FIFO mode, GC records are recycled in forward order and linearly, though not
 * necessarily strictly sequentially; at the same time it is guaranteed that the GC holds no records between the ids
 * being added to the rkl, i.e. there are no GC records between the first (the minimal id) and the last (the maximal
 * id), so the whole interval can be used for returning page leftovers into the GC.
 *
 * Thus the combination of a linear interval and a list (sorted in ascending order of its elements) is a rational
 * solution, close to the theoretically optimal bound.
 *
 * The rkl implementation is fairly simple/transparent, apart from the non-obvious "magic" of swapping the contiguous
 * interval with the runs that form inside the list. However, it is precisely this swap, performed automatically and
 * without extra operations, that justifies all the overhead. */
typedef struct MDBX_rkl {
txnid_t solid_begin, solid_end; /* the begin and the end of the contiguous run solid_begin ... solid_end-1. */
unsigned list_length; /* the current length of the list. */
unsigned list_limit; /* the size of the buffer allocated for the list; equals ARRAY_LENGTH(inplace) when list == inplace. */
txnid_t *list; /* the list of standalone elements in ascending order (the smallest one first). */
txnid_t inplace[4 + 8]; /* a static array for short lists, to avoid allocating/freeing memory
                         * in most cases. */
} rkl_t;
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_init(rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_clear(rkl_t *rkl);
static inline void rkl_clear_and_shrink(rkl_t *rkl) { rkl_clear(rkl); /* TODO */ }
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_destroy(rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_destructive_move(rkl_t *dst, rkl_t *src);
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_copy(const rkl_t *src, rkl_t *dst);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool rkl_empty(const rkl_t *rkl) {
return rkl->solid_begin > rkl->solid_end;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL bool rkl_check(const rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t rkl_len(const rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL txnid_t rkl_lowest(const rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL txnid_t rkl_highest(const rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline txnid_t rkl_edge(const rkl_t *rkl,
const bool highest_not_lowest) {
return highest_not_lowest ? rkl_highest(rkl) : rkl_lowest(rkl);
}
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_push(rkl_t *rkl, const txnid_t id,
const bool known_continuous);
MDBX_MAYBE_UNUSED MDBX_INTERNAL txnid_t rkl_pop(rkl_t *rkl, const bool highest_not_lowest);
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_merge(rkl_t *dst, const rkl_t *src, bool ignore_duplicates);
/* An iterator for rkl.
 * It isolates the rkl internals from the rest of the code, which simplifies the latter considerably.
 * In fact, it is the use of rkl with iterators that eliminates the "puzzle" that had historically built up in gc-update. */
typedef struct MDBX_rkl_iter {
const rkl_t *rkl;
unsigned pos;
unsigned solid_offset;
} rkl_iter_t;
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result rkl_iter_t rkl_iterator(const rkl_t *rkl, const bool reverse);
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result txnid_t rkl_turn(rkl_iter_t *iter, const bool reverse);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t rkl_left(rkl_iter_t *iter, const bool reverse);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool rkl_find(const rkl_t *rkl, const txnid_t id, rkl_iter_t *iter);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION __must_check_result MDBX_INTERNAL bool rkl_contain(const rkl_t *rkl,
txnid_t id);
typedef struct MDBX_rkl_hole {
txnid_t begin;
txnid_t end;
} rkl_hole_t;
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result rkl_hole_t rkl_hole(rkl_iter_t *iter, const bool reverse);
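
The header comment above names the central idea: a contiguous interval plus a sorted overflow list, with long runs migrating between the two. The following self-contained model shows the interval/list split in miniature; it is a sketch only (fixed capacity, no inplace/heap switching, no run-swapping heuristics, no duplicate handling in the list) and not the internal API.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint64_t solid_begin, solid_end; /* contiguous run [solid_begin, solid_end) */
  uint64_t list[16];               /* sorted standalone ids */
  unsigned list_length;
} mini_rkl;

static void mini_init(mini_rkl *r) {
  r->solid_begin = UINT64_MAX; /* begin > end denotes an empty interval */
  r->solid_end = 0;
  r->list_length = 0;
}

static bool mini_push(mini_rkl *r, uint64_t id) {
  if (r->solid_begin > r->solid_end) { /* empty interval: seed it */
    r->solid_begin = id;
    r->solid_end = id + 1;
    return true;
  }
  if (id + 1 == r->solid_begin) { r->solid_begin = id; return true; }   /* adjoins from below */
  if (id == r->solid_end)       { r->solid_end = id + 1; return true; } /* adjoins from above */
  if (id >= r->solid_begin && id < r->solid_end)
    return false; /* duplicate */
  assert(r->list_length < 16);
  unsigned i = r->list_length++;
  while (i > 0 && r->list[i - 1] > id) { /* one insertion-sort step */
    r->list[i] = r->list[i - 1];
    --i;
  }
  r->list[i] = id;
  return true;
}

int main(void) {
  mini_rkl r;
  mini_init(&r);
  /* mostly-sequential ids feed the interval, stragglers go to the list */
  const uint64_t ids[] = {100, 101, 102, 7, 103, 55};
  for (size_t i = 0; i < sizeof(ids) / sizeof(ids[0]); ++i)
    mini_push(&r, ids[i]);
  printf("solid [%llu, %llu), list:", (unsigned long long)r.solid_begin, (unsigned long long)r.solid_end);
  for (unsigned i = 0; i < r.list_length; ++i)
    printf(" %llu", (unsigned long long)r.list[i]);
  printf("\n"); /* prints: solid [100, 104), list: 7 55 */
  return 0;
}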

src/spill.c
View File

@@ -4,42 +4,42 @@
#include "internals.h"
void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) {
tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->wr.spilled.list) && txn->wr.spilled.least_removed > 0);
txn->wr.spilled.least_removed = (idx < txn->wr.spilled.least_removed) ? idx : txn->wr.spilled.least_removed;
txn->wr.spilled.list[idx] |= 1;
MDBX_PNL_SETSIZE(txn->wr.spilled.list,
MDBX_PNL_GETSIZE(txn->wr.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->wr.spilled.list)));
tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && txn->tw.spilled.least_removed > 0);
txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) ? idx : txn->tw.spilled.least_removed;
txn->tw.spilled.list[idx] |= 1;
MDBX_PNL_SETSIZE(txn->tw.spilled.list,
MDBX_PNL_GETSIZE(txn->tw.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
while (unlikely(npages > 1)) {
const pgno_t pgno = (txn->wr.spilled.list[idx] >> 1) + 1;
const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1;
if (MDBX_PNL_ASCENDING) {
if (++idx > MDBX_PNL_GETSIZE(txn->wr.spilled.list) || (txn->wr.spilled.list[idx] >> 1) != pgno)
if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || (txn->tw.spilled.list[idx] >> 1) != pgno)
return;
} else {
if (--idx < 1 || (txn->wr.spilled.list[idx] >> 1) != pgno)
if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno)
return;
txn->wr.spilled.least_removed = (idx < txn->wr.spilled.least_removed) ? idx : txn->wr.spilled.least_removed;
txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) ? idx : txn->tw.spilled.least_removed;
}
txn->wr.spilled.list[idx] |= 1;
MDBX_PNL_SETSIZE(txn->wr.spilled.list,
MDBX_PNL_GETSIZE(txn->wr.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->wr.spilled.list)));
txn->tw.spilled.list[idx] |= 1;
MDBX_PNL_SETSIZE(txn->tw.spilled.list,
MDBX_PNL_GETSIZE(txn->tw.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
--npages;
}
}
pnl_t spill_purge(MDBX_txn *txn) {
tASSERT(txn, txn->wr.spilled.least_removed > 0);
const pnl_t sl = txn->wr.spilled.list;
if (txn->wr.spilled.least_removed != INT_MAX) {
tASSERT(txn, txn->tw.spilled.least_removed > 0);
const pnl_t sl = txn->tw.spilled.list;
if (txn->tw.spilled.least_removed != INT_MAX) {
size_t len = MDBX_PNL_GETSIZE(sl), r, w;
for (w = r = txn->wr.spilled.least_removed; r <= len; ++r) {
for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) {
sl[w] = sl[r];
w += 1 - (sl[r] & 1);
}
for (size_t i = 1; i < w; ++i)
tASSERT(txn, (sl[i] & 1) == 0);
MDBX_PNL_SETSIZE(sl, w - 1);
txn->wr.spilled.least_removed = INT_MAX;
txn->tw.spilled.least_removed = INT_MAX;
} else {
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i)
tASSERT(txn, (sl[i] & 1) == 0);
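
Both functions rely on a single encoding convention: each entry of the spilled list stores pgno << 1, leaving the low bit free to act as a tombstone — spill_remove sets it, and spill_purge later compacts the list by dropping the tagged slots (note the branch-free `w += 1 - (sl[r] & 1)`). A standalone sketch of that encoding, with plain arrays and illustrative names:

#include <stdio.h>

/* Entries hold (pgno << 1); the low bit marks a lazily-deleted slot. */
static unsigned encode(unsigned pgno) { return pgno << 1; }

static void purge(unsigned *list, unsigned *len) {
  unsigned w = 0;
  for (unsigned r = 0; r < *len; ++r)
    if ((list[r] & 1) == 0) /* keep only the live entries */
      list[w++] = list[r];
  *len = w;
}

int main(void) {
  unsigned spilled[] = {encode(3), encode(4), encode(9)};
  unsigned len = 3;
  spilled[1] |= 1; /* "remove" page 4: tag it now, compact later */
  purge(spilled, &len);
  for (unsigned i = 0; i < len; ++i)
    printf("live pgno %u\n", spilled[i] >> 1); /* prints 3 and 9 */
  return 0;
}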
@@ -57,7 +57,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, const size_t np
const pgno_t pgno = dp->pgno;
int err = iov_page(txn, ctx, dp, npages);
if (likely(err == MDBX_SUCCESS))
err = spill_append_span(&txn->wr.spilled.list, pgno, npages);
err = spill_append_span(&txn->tw.spilled.list, pgno, npages);
return err;
}
@@ -72,29 +72,25 @@ static size_t spill_cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc
intptr_t i = 0;
do {
mp = mc->pg[i];
TRACE("dbi %zu, mc-%p[%zu], page %u %p", cursor_dbi(mc), __Wpedantic_format_voidptr(mc), i, mp->pgno,
__Wpedantic_format_voidptr(mp));
tASSERT(txn, !is_subpage(mp));
if (is_modifable(txn, mp)) {
size_t const n = dpl_search(txn, mp->pgno);
if (txn->wr.dirtylist->items[n].pgno == mp->pgno &&
if (txn->tw.dirtylist->items[n].pgno == mp->pgno &&
/* не считаем дважды */ dpl_age(txn, n)) {
size_t *const ptr = ptr_disp(txn->wr.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->wr.dirtylru;
size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->tw.dirtylru;
tASSERT(txn, dpl_age(txn, n) == 0);
++keep;
DEBUG("keep page %" PRIaPGNO " (%p), dbi %zu, %scursor %p[%zu]", mp->pgno, __Wpedantic_format_voidptr(mp),
cursor_dbi(mc), is_inner(mc) ? "sub-" : "", __Wpedantic_format_voidptr(mc), i);
}
}
} while (++i <= mc->top);
tASSERT(txn, is_leaf(mp));
if (!inner_pointed(mc))
if (!mc->subcur || mc->ki[mc->top] >= page_numkeys(mp))
break;
if (!(node_flags(page_node(mp, mc->ki[mc->top])) & N_TREE))
break;
mc = &mc->subcur->cursor;
if (is_subpage(mc->pg[0]))
break;
}
return keep;
}
@@ -119,7 +115,7 @@ static size_t spill_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
* ...
* > 255 = must not be spilled. */
MDBX_NOTHROW_PURE_FUNCTION static unsigned spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) {
dpl_t *const dl = txn->wr.dirtylist;
dpl_t *const dl = txn->tw.dirtylist;
const uint32_t age = dpl_age(txn, i);
const size_t npages = dpl_npages(dl, i);
const pgno_t pgno = dl->items[i].pgno;
@@ -182,14 +178,14 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
int rc = MDBX_SUCCESS;
if (unlikely(txn->wr.loose_count >=
(txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose : txn->wr.writemap_dirty_npages)))
if (unlikely(txn->tw.loose_count >=
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages)))
goto done;
const size_t dirty_entries = txn->wr.dirtylist ? (txn->wr.dirtylist->length - txn->wr.loose_count) : 1;
const size_t dirty_entries = txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1;
const size_t dirty_npages =
(txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose : txn->wr.writemap_dirty_npages) -
txn->wr.loose_count;
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages) -
txn->tw.loose_count;
const size_t need_spill_entries = spill_gate(txn->env, wanna_spill_entries, dirty_entries);
const size_t need_spill_npages = spill_gate(txn->env, wanna_spill_npages, dirty_npages);
@@ -200,17 +196,17 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
if (txn->flags & MDBX_WRITEMAP) {
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", dirty_entries, dirty_npages);
const MDBX_env *env = txn->env;
tASSERT(txn, txn->wr.spilled.list == nullptr);
tASSERT(txn, txn->tw.spilled.list == nullptr);
rc = osal_msync(&txn->env->dxb_mmap, 0, pgno_align2os_bytes(env, txn->geo.first_unallocated), MDBX_SYNC_KICK);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
#if MDBX_AVOID_MSYNC
MDBX_ANALYSIS_ASSUME(txn->wr.dirtylist != nullptr);
MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
tASSERT(txn, dpl_check(txn));
env->lck->unsynced_pages.weak += txn->wr.dirtylist->pages_including_loose - txn->wr.loose_count;
dpl_clear(txn->wr.dirtylist);
txn->wr.dirtyroom = env->options.dp_limit - txn->wr.loose_count;
for (page_t *lp = txn->wr.loose_pages; lp != nullptr; lp = page_next(lp)) {
env->lck->unsynced_pages.weak += txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
dpl_clear(txn->tw.dirtylist);
txn->tw.dirtyroom = env->options.dp_limit - txn->tw.loose_count;
for (page_t *lp = txn->tw.loose_pages; lp != nullptr; lp = page_next(lp)) {
tASSERT(txn, lp->flags == P_LOOSE);
rc = dpl_append(txn, lp->pgno, lp, 1);
if (unlikely(rc != MDBX_SUCCESS))
@@ -220,22 +216,22 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
}
tASSERT(txn, dpl_check(txn));
#else
tASSERT(txn, txn->wr.dirtylist == nullptr);
env->lck->unsynced_pages.weak += txn->wr.writemap_dirty_npages;
txn->wr.writemap_spilled_npages += txn->wr.writemap_dirty_npages;
txn->wr.writemap_dirty_npages = 0;
tASSERT(txn, txn->tw.dirtylist == nullptr);
env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages;
txn->tw.writemap_dirty_npages = 0;
#endif /* MDBX_AVOID_MSYNC */
goto done;
}
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", need_spill_entries, need_spill_npages);
MDBX_ANALYSIS_ASSUME(txn->wr.dirtylist != nullptr);
tASSERT(txn, txn->wr.dirtylist->length - txn->wr.loose_count >= 1);
tASSERT(txn, txn->wr.dirtylist->pages_including_loose - txn->wr.loose_count >= need_spill_npages);
if (!txn->wr.spilled.list) {
txn->wr.spilled.least_removed = INT_MAX;
txn->wr.spilled.list = pnl_alloc(need_spill);
if (unlikely(!txn->wr.spilled.list)) {
MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1);
tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages);
if (!txn->tw.spilled.list) {
txn->tw.spilled.least_removed = INT_MAX;
txn->tw.spilled.list = pnl_alloc(need_spill);
if (unlikely(!txn->tw.spilled.list)) {
rc = MDBX_ENOMEM;
bailout:
txn->flags |= MDBX_TXN_ERROR;
@@ -244,7 +240,7 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
} else {
/* purge deleted slots */
spill_purge(txn);
rc = pnl_reserve(&txn->wr.spilled.list, need_spill);
rc = pnl_reserve(&txn->tw.spilled.list, need_spill);
(void)rc /* ignore since the resulting list may be shorter
and pnl_append() will increase pnl on demand */
;
@@ -255,9 +251,9 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
/* Preserve pages which may soon be dirtied again */
const size_t unspillable = spill_txn_keep(txn, m0);
if (unspillable + txn->wr.loose_count >= dl->length) {
if (unspillable + txn->tw.loose_count >= dl->length) {
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
if (likely(txn->wr.dirtyroom + txn->wr.loose_count >= need))
if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
return MDBX_SUCCESS;
#endif /* xMDBX_DEBUG_SPILLING */
ERROR("all %zu dirty pages are unspillable since referenced "
@@ -297,7 +293,7 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
age_max = (age_max >= age) ? age_max : age;
}
VERBOSE("lru-head %u, age-max %u", txn->wr.dirtylru, age_max);
VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
/* half of 8-bit radix-sort */
pgno_t radix_entries[256], radix_npages[256];
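
The "half of 8-bit radix-sort" is a single counting-sort pass: spill priorities fit into 0..255, so one 256-bucket histogram is enough to pick the cheapest pages without a comparison sort. A standalone sketch of such a pass (the real code additionally tracks per-bucket page counts in radix_npages):

#include <stdio.h>

int main(void) {
  /* one histogram pass over 8-bit priorities (0 = spill first) */
  const unsigned char prio[] = {200, 3, 255, 3, 42};
  const unsigned n = sizeof(prio);
  unsigned hist[256] = {0};
  for (unsigned i = 0; i < n; ++i)
    hist[prio[i]]++;
  /* walking the histogram from 0 upwards enumerates the entries in
   * spill order; stop once enough pages have been gathered */
  unsigned gathered = 0;
  const unsigned want = 3;
  for (unsigned p = 0; p < 256 && gathered < want; ++p)
    while (hist[p] && gathered < want) {
      printf("spill one page with prio %u\n", p);
      hist[p]--;
      gathered++;
    }
  return 0; /* prints prio 3 twice, then prio 42 */
}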
@@ -392,8 +388,8 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS);
dl->sorted = dpl_setlen(dl, w);
txn->wr.dirtyroom += spilled_entries;
txn->wr.dirtylist->pages_including_loose -= spilled_npages;
txn->tw.dirtyroom += spilled_entries;
txn->tw.dirtylist->pages_including_loose -= spilled_npages;
tASSERT(txn, dpl_check(txn));
if (!iov_empty(&ctx)) {
@@ -404,10 +400,10 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
goto bailout;
txn->env->lck->unsynced_pages.weak += spilled_npages;
pnl_sort(txn->wr.spilled.list, (size_t)txn->geo.first_unallocated << 1);
pnl_sort(txn->tw.spilled.list, (size_t)txn->geo.first_unallocated << 1);
txn->flags |= MDBX_TXN_SPILLS;
NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", spilled_entries, spilled_npages,
txn->wr.dirtyroom);
txn->tw.dirtyroom);
} else {
tASSERT(txn, rc == MDBX_SUCCESS);
for (size_t i = 1; i <= dl->length; ++i) {
@@ -418,18 +414,18 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
}
#if xMDBX_DEBUG_SPILLING == 2
if (txn->wr.loose_count + txn->wr.dirtyroom <= need / 2 + 1)
if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; "
"needed %zu, spillable %zu; "
"spilled %u dirty-entries, now have %zu dirty-room",
dl->length + spilled_entries, dl->length,
(txn->parent && txn->parent->wr.dirtylist) ? (intptr_t)txn->parent->wr.dirtylist->length : -1,
txn->wr.loose_count, need, spillable_entries, spilled_entries, txn->wr.dirtyroom);
ENSURE(txn->env, txn->wr.loose_count + txn->wr.dirtyroom > need / 2);
(txn->parent && txn->parent->tw.dirtylist) ? (intptr_t)txn->parent->tw.dirtylist->length : -1,
txn->tw.loose_count, need, spillable_entries, spilled_entries, txn->tw.dirtyroom);
ENSURE(txn->env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
#endif /* xMDBX_DEBUG_SPILLING */
done:
return likely(txn->wr.dirtyroom + txn->wr.loose_count > ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
return likely(txn->tw.dirtyroom + txn->tw.loose_count > ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
? MDBX_SUCCESS
: MDBX_TXN_FULL;
}

src/spill.h
View File

@@ -13,7 +13,7 @@ MDBX_INTERNAL int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, con
static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const pnl_t pnl = txn->wr.spilled.list;
const pnl_t pnl = txn->tw.spilled.list;
if (likely(!pnl))
return 0;
pgno <<= 1;
@@ -22,7 +22,7 @@ static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) {
}
static inline bool spill_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
const pnl_t pnl = txn->wr.spilled.list;
const pnl_t pnl = txn->tw.spilled.list;
if (likely(!pnl))
return false;
const size_t len = MDBX_PNL_GETSIZE(pnl);
@@ -56,10 +56,10 @@ static inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, const si
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, !m0 || cursor_is_tracked(m0));
const intptr_t wanna_spill_entries = txn->wr.dirtylist ? (need - txn->wr.dirtyroom - txn->wr.loose_count) : 0;
const intptr_t wanna_spill_entries = txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0;
const intptr_t wanna_spill_npages =
need + (txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose : txn->wr.writemap_dirty_npages) -
txn->wr.loose_count - txn->env->options.dp_limit;
need + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages) -
txn->tw.loose_count - txn->env->options.dp_limit;
/* production mode */
if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1)

mdbx_dump.c
View File

@@ -20,7 +20,6 @@
#define PRINT 1
#define GLOBAL 2
#define CONCISE 4
static int mode = GLOBAL;
typedef struct flagbit {
@@ -56,23 +55,42 @@ static void signal_handler(int sig) {
#endif /* !WINDOWS */
static void dumpval(const MDBX_val *v) {
static const char digits[] = "0123456789abcdef";
static const char hexc[] = "0123456789abcdef";
static void dumpbyte(unsigned char c) {
putchar(hexc[c >> 4]);
putchar(hexc[c & 15]);
}
static void text(MDBX_val *v) {
unsigned char *c, *end;
putchar(' ');
for (const unsigned char *c = v->iov_base, *end = c + v->iov_len; c < end; ++c) {
if (mode & PRINT) {
if (isprint(*c) && *c != '\\') {
putchar(*c);
continue;
} else
putchar('\\');
c = v->iov_base;
end = c + v->iov_len;
while (c < end) {
if (isprint(*c) && *c != '\\') {
putchar(*c);
} else {
putchar('\\');
dumpbyte(*c);
}
putchar(digits[*c >> 4]);
putchar(digits[*c & 15]);
c++;
}
putchar('\n');
}
static void dumpval(MDBX_val *v) {
unsigned char *c, *end;
putchar(' ');
c = v->iov_base;
end = c + v->iov_len;
while (c < end)
dumpbyte(*c++);
putchar('\n');
}
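
These helpers implement the tool's two output encodings: with -p, text() prints printable bytes verbatim and renders everything else (including the backslash itself) as a backslash followed by two hex digits, while the default dumpval() hex-encodes every byte. A standalone sketch showing both encodings on one sample buffer (hypothetical data, not tied to the tool):

#include <ctype.h>
#include <stdio.h>

static const char hexc[] = "0123456789abcdef";

static void dumpbyte(unsigned char c) {
  putchar(hexc[c >> 4]);
  putchar(hexc[c & 15]);
}

int main(void) {
  const unsigned char sample[] = {'K', 'e', 'y', '\\', 0x07};
  /* printable mode, as in text(): prints "Key\5c\07" */
  for (unsigned i = 0; i < sizeof(sample); ++i)
    if (isprint(sample[i]) && sample[i] != '\\')
      putchar(sample[i]);
    else {
      putchar('\\');
      dumpbyte(sample[i]);
    }
  putchar('\n');
  /* hex mode, as in dumpval(): prints "4b65795c07" */
  for (unsigned i = 0; i < sizeof(sample); ++i)
    dumpbyte(sample[i]);
  putchar('\n');
  return 0;
}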
bool quiet = false, rescue = false;
const char *prog;
static void error(const char *func, int rc) {
@@ -167,19 +185,12 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
rc = MDBX_EINTR;
break;
}
dumpval(&key);
dumpval(&data);
if ((flags & MDBX_DUPSORT) && (mode & CONCISE)) {
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT_DUP)) == MDBX_SUCCESS) {
if (user_break) {
rc = MDBX_EINTR;
break;
}
putchar(' ');
dumpval(&data);
}
if (rc != MDBX_NOTFOUND)
break;
if (mode & PRINT) {
text(&key);
text(&data);
} else {
dumpval(&key);
dumpval(&data);
}
}
printf("DATA=END\n");
@@ -195,12 +206,10 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
static void usage(void) {
fprintf(stderr,
"usage: %s "
"[-V] [-q] [-c] [-f file] [-l] [-p] [-r] [-a|-s table] [-u|U] "
"[-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s table] [-u|U] "
"dbpath\n"
" -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n"
" -c\t\tconcise mode without repeating keys,\n"
" \t\tbut incompatible with Berkeley DB and LMDB\n"
" -f\t\twrite to file instead of stdout\n"
" -l\t\tlist tables and exit\n"
" -p\t\tuse printable characters\n"
@@ -259,7 +268,6 @@ int main(int argc, char *argv[]) {
"s:"
"V"
"r"
"c"
"q")) != EOF) {
switch (i) {
case 'V':
@@ -290,9 +298,6 @@ int main(int argc, char *argv[]) {
break;
case 'n':
break;
case 'c':
mode |= CONCISE;
break;
case 'p':
mode |= PRINT;
break;

mdbx_load.c
View File

@@ -380,16 +380,7 @@ __hot static int readline(MDBX_val *out, MDBX_val *buf) {
return badend();
}
}
/* modern concise mode, where a space in the second position means the same (previous) value */
c = fgetc(stdin);
if (c == EOF)
return errno ? errno : EOF;
if (c == ' ')
return (ungetc(c, stdin) == c) ? MDBX_SUCCESS : (errno ? errno : EOF);
*(char *)buf->iov_base = c;
if (fgets((char *)buf->iov_base + 1, (int)buf->iov_len - 1, stdin) == nullptr)
if (fgets(buf->iov_base, (int)buf->iov_len, stdin) == nullptr)
return errno ? errno : EOF;
lineno++;
@@ -730,8 +721,8 @@ int main(int argc, char *argv[]) {
}
int batch = 0;
MDBX_val key = {.iov_base = nullptr, .iov_len = 0}, data = {.iov_base = nullptr, .iov_len = 0};
while (err == MDBX_SUCCESS) {
MDBX_val key, data;
err = readline(&key, &kbuf);
if (err == EOF)
break;

src/tree-ops.c
View File

@@ -38,10 +38,11 @@ static MDBX_cursor *cursor_clone(const MDBX_cursor *csrc, cursor_couple_t *coupl
/*----------------------------------------------------------------------------*/
void recalculate_merge_thresholds(MDBX_env *env) {
const size_t whole_page_space = page_space(env);
env->merge_threshold =
(uint16_t)(whole_page_space - (whole_page_space * env->options.merge_threshold_16dot16_percent >> 16));
eASSERT(env, env->merge_threshold >= whole_page_space / 2 && env->merge_threshold <= whole_page_space / 64 * 63);
const size_t bytes = page_space(env);
env->merge_threshold = (uint16_t)(bytes - (bytes * env->options.merge_threshold_16dot16_percent >> 16));
env->merge_threshold_gc =
(uint16_t)(bytes - ((env->options.merge_threshold_16dot16_percent > 19005) ? bytes / 3 /* 33 % */
: bytes / 4 /* 25 % */));
}
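
merge_threshold_16dot16_percent is a 16.16 fixed-point fraction of the page payload, so `bytes * percent >> 16` yields the number of payload bytes that fraction represents. A worked example under assumed numbers (4080 payload bytes per page and a 25 % setting, i.e. 16384 in 16.16 form; the 19005 constant used by the v0.13.6 branch corresponds to roughly 29 %):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const size_t page_space = 4080;         /* assumed payload bytes per page */
  const uint32_t percent_16dot16 = 16384; /* 25% in 16.16 fixed point */

  /* regular trees: pages with more free room than this become merge candidates */
  const size_t merge_threshold = page_space - (page_space * percent_16dot16 >> 16);

  /* GC tree (v0.13.6 side): a flat 25% cut, or 33% for settings above ~29% */
  const size_t merge_threshold_gc =
      page_space - ((percent_16dot16 > 19005) ? page_space / 3 : page_space / 4);

  printf("merge_threshold    = %zu\n", merge_threshold);    /* 4080 - 1020 = 3060 */
  printf("merge_threshold_gc = %zu\n", merge_threshold_gc); /* 4080 - 1020 = 3060 */
  return 0;
}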
int tree_drop(MDBX_cursor *mc, const bool may_have_tables) {
@@ -55,7 +56,7 @@ int tree_drop(MDBX_cursor *mc, const bool may_have_tables) {
if (!(may_have_tables | mc->tree->large_pages))
cursor_pop(mc);
rc = pnl_need(&txn->wr.retired_pages,
rc = pnl_need(&txn->tw.retired_pages,
(size_t)mc->tree->branch_pages + (size_t)mc->tree->leaf_pages + (size_t)mc->tree->large_pages);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
@@ -445,8 +446,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
cASSERT(cdst, cdst->top > 0);
cASSERT(cdst, cdst->top + 1 < cdst->tree->height || is_leaf(cdst->pg[cdst->tree->height - 1]));
cASSERT(csrc, csrc->top + 1 < csrc->tree->height || is_leaf(csrc->pg[csrc->tree->height - 1]));
cASSERT(cdst, cursor_dbi(csrc) == FREE_DBI || csrc->txn->env->options.prefer_waf_insteadof_balance ||
page_room(pdst) >= page_used(cdst->txn->env, psrc));
cASSERT(cdst,
csrc->txn->env->options.prefer_waf_insteadof_balance || page_room(pdst) >= page_used(cdst->txn->env, psrc));
const int pagetype = page_type(psrc);
/* Move all nodes from src to dst */
@@ -679,18 +680,8 @@ int tree_rebalance(MDBX_cursor *mc) {
const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1;
/* Pages emptier than this are candidates for merging. */
size_t room_threshold = mc->txn->env->merge_threshold;
bool minimize_waf = mc->txn->env->options.prefer_waf_insteadof_balance;
if (unlikely(mc->tree == &mc->txn->dbs[FREE_DBI])) {
/* For the GC we always minimize WAF, and merge under-filled pages only when gc_stockpile() has some reserve.
 * This reduces WAF and gets rid of extra work/cycles both when recycling the GC
 * and when returning unused pages. The b-tree balance hardly degrades from this,
 * since records are almost always added/deleted/updated only at the edges. */
minimize_waf = true;
room_threshold = page_space(mc->txn->env);
if (gc_stockpile(mc->txn) > mc->tree->height + mc->tree->height)
room_threshold >>= 1;
}
size_t room_threshold =
likely(mc->tree != &mc->txn->dbs[FREE_DBI]) ? mc->txn->env->merge_threshold : mc->txn->env->merge_threshold_gc;
const size_t numkeys = page_numkeys(tp);
const size_t room = page_room(tp);
@@ -811,26 +802,10 @@ int tree_rebalance(MDBX_cursor *mc) {
const size_t right_room = right ? page_room(right) : 0;
const size_t left_nkeys = left ? page_numkeys(left) : 0;
const size_t right_nkeys = right ? page_numkeys(right) : 0;
/* We must choose between the right and the left page, either to merge the current page into one of them,
 * or to move a node into the current page. That is, one of four options has to be picked according to the criteria.
 *
 * If minimize_waf is enabled, we try not to involve clean pages,
 * sacrificing ideal balance for a smaller WAF.
 *
 * Some of the options may be unavailable, or may "not work out", because:
 * - some branch page may lack room due to the propagation/update of the first keys,
 *   which are stored in the parent pages;
 * - with minimize_waf enabled, the propagation/update of the first keys may require
 *   splitting some page, which increases WAF and therefore devalues any further
 *   adherence to minimize_waf. */
bool involve = !(left && right);
retry:
cASSERT(mc, mc->top > 0);
const bool consider_left = left && (involve || is_modifable(mc->txn, left));
const bool consider_right = right && (involve || is_modifable(mc->txn, right));
if (consider_left && left_room > room_threshold && left_room >= right_room) {
if (left_room > room_threshold && left_room >= right_room && (is_modifable(mc->txn, left) || involve)) {
/* try merge with left */
cASSERT(mc, left_nkeys >= minkeys);
mn->pg[mn->top] = left;
@@ -850,7 +825,7 @@ retry:
return rc;
}
}
if (consider_right && right_room > room_threshold) {
if (right_room > room_threshold && (is_modifable(mc->txn, right) || involve)) {
/* try merge with right */
cASSERT(mc, right_nkeys >= minkeys);
mn->pg[mn->top] = right;
@@ -868,7 +843,8 @@ retry:
}
}
if (consider_left && left_nkeys > minkeys && (right_nkeys <= left_nkeys || right_room >= left_room)) {
if (left_nkeys > minkeys && (right_nkeys <= left_nkeys || right_room >= left_room) &&
(is_modifable(mc->txn, left) || involve)) {
/* try move from left */
mn->pg[mn->top] = left;
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1);
@@ -884,7 +860,7 @@ retry:
return rc;
}
}
if (consider_right && right_nkeys > minkeys) {
if (right_nkeys > minkeys && (is_modifable(mc->txn, right) || involve)) {
/* try move from right */
mn->pg[mn->top] = right;
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1);
@@ -908,20 +884,17 @@ retry:
return MDBX_SUCCESS;
}
if (minimize_waf && room_threshold > 0) {
/* If minimize_waf is enabled, move on to merge attempts with heavily
 * filled pages before involving clean pages (not modified in this transaction) */
if (mc->txn->env->options.prefer_waf_insteadof_balance && likely(room_threshold > 0)) {
room_threshold = 0;
goto retry;
}
if (!involve) {
/* Now allow involving clean pages (not modified in this transaction),
 * which improves the tree balance but increases WAF. */
if (likely(!involve) &&
(likely(mc->tree != &mc->txn->dbs[FREE_DBI]) || mc->txn->tw.loose_pages || MDBX_PNL_GETSIZE(mc->txn->tw.repnl) ||
(mc->flags & z_gcu_preparation) || (mc->txn->flags & txn_gc_drained) || room_threshold)) {
involve = true;
goto retry;
}
if (room_threshold > 0) {
/* If no suitable neighbour was found, allow merging with heavily filled pages */
if (likely(room_threshold > 0)) {
room_threshold = 0;
goto retry;
}
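
Collapsed into one place, the retry ladder above tries the four options in order and, when none applies, relaxes one constraint per pass before looping back. The schematic below models only this control flow — the four attempts are stubs that always fail, and the real code additionally handles the GC tree, first-key propagation and cursor bookkeeping:

#include <stdio.h>

static int try_merge_left(int involve, int room_threshold)      { (void)involve; (void)room_threshold; return 0; }
static int try_merge_right(int involve, int room_threshold)     { (void)involve; (void)room_threshold; return 0; }
static int try_move_from_left(int involve, int room_threshold)  { (void)involve; (void)room_threshold; return 0; }
static int try_move_from_right(int involve, int room_threshold) { (void)involve; (void)room_threshold; return 0; }

static int rebalance_ladder(int minimize_waf) {
  int involve = 0;        /* may clean (unmodified) pages be involved? */
  int room_threshold = 1; /* non-zero: only merge with emptier pages */
  for (;;) {
    if (try_merge_left(involve, room_threshold) || try_merge_right(involve, room_threshold) ||
        try_move_from_left(involve, room_threshold) || try_move_from_right(involve, room_threshold))
      return 0; /* balanced */
    if (minimize_waf && room_threshold) {
      room_threshold = 0; /* prefer fuller pages over touching clean ones */
      printf("relax: drop the room threshold (WAF mode)\n");
      continue;
    }
    if (!involve) {
      involve = 1; /* better balance, larger WAF */
      printf("relax: involve clean pages\n");
      continue;
    }
    if (room_threshold) {
      room_threshold = 0; /* last resort: merge with fuller pages */
      printf("relax: drop the room threshold\n");
      continue;
    }
    return -1; /* give up: nothing suitable */
  }
}

int main(void) { return rebalance_ladder(0) == -1 ? 0 : 1; }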
@@ -1255,7 +1228,6 @@ int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, MDBX_val *const ne
/* root split? */
prev_top += mc->top - top;
cASSERT(mn, prev_top <= mn->top && prev_top <= mc->top);
/* Right page might now have changed parent.
* Check if left page also changed parent. */

src/txl.c
View File

@@ -63,14 +63,14 @@ static int txl_reserve(txl_t __restrict *__restrict ptxl, const size_t wanna) {
return MDBX_ENOMEM;
}
static inline int __must_check_result txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
static __always_inline int __must_check_result txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max && MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
assert(num <= PAGELIST_LIMIT);
const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptxl) + num;
return likely(MDBX_PNL_ALLOCLEN(*ptxl) >= wanna) ? MDBX_SUCCESS : txl_reserve(ptxl, wanna);
}
static inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
static __always_inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
assert(MDBX_PNL_GETSIZE(txl) < MDBX_PNL_ALLOCLEN(txl));
txl[0] += 1;
MDBX_PNL_LAST(txl) = id;
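
As with the page-number lists, a txl_t is a length-prefixed array: slot 0 holds the element count, so appending is just `txl[0] += 1` followed by a store into the new last slot, once txl_need() has reserved the capacity. A minimal model of that layout:

#include <stdint.h>
#include <stdio.h>

/* model: slot 0 is the length, the payload lives in slots 1..len */
static void xappend(uint64_t *list, uint64_t id) {
  list[0] += 1;
  list[list[0]] = id;
}

int main(void) {
  uint64_t txl[8] = {0}; /* room for 7 ids */
  xappend(txl, 42);
  xappend(txl, 7);
  for (uint64_t i = 1; i <= txl[0]; ++i)
    printf("txl[%llu] = %llu\n", (unsigned long long)i, (unsigned long long)txl[i]);
  return 0;
}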

src/txl.h
View File

@@ -15,12 +15,12 @@ enum txl_rules {
txl_max = (1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)
};
MDBX_MAYBE_UNUSED MDBX_INTERNAL txl_t txl_alloc(void);
MDBX_INTERNAL txl_t txl_alloc(void);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void txl_free(txl_t txl);
MDBX_INTERNAL void txl_free(txl_t txl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, txnid_t id);
MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, txnid_t id);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void txl_sort(txl_t txl);
MDBX_INTERNAL void txl_sort(txl_t txl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool txl_contain(const txl_t txl, txnid_t id);
MDBX_INTERNAL bool txl_contain(const txl_t txl, txnid_t id);

src/txn-basal.c
View File

@@ -1,366 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *const dl = dpl_sort(txn);
int rc = MDBX_SUCCESS;
size_t r, w, total_npages = 0;
for (w = 0, r = 1; r <= dl->length; ++r) {
page_t *dp = dl->items[r].ptr;
if (dp->flags & P_LOOSE) {
dl->items[++w] = dl->items[r];
continue;
}
unsigned npages = dpl_npages(dl, r);
total_npages += npages;
rc = iov_page(txn, ctx, dp, npages);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
if (!iov_empty(ctx)) {
tASSERT(txn, rc == MDBX_SUCCESS);
rc = iov_write(ctx);
}
if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->env->lazy_fd) {
txn->env->lck->unsynced_pages.weak += total_npages;
if (!txn->env->lck->eoos_timestamp.weak)
txn->env->lck->eoos_timestamp.weak = osal_monotime();
}
txn->wr.dirtylist->pages_including_loose -= total_npages;
while (r <= dl->length)
dl->items[++w] = dl->items[r++];
dl->sorted = dpl_setlen(dl, w);
txn->wr.dirtyroom += r - 1 - w;
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
tASSERT(txn, txn->wr.dirtylist->length == txn->wr.loose_count);
tASSERT(txn, txn->wr.dirtylist->pages_including_loose == txn->wr.loose_count);
return rc;
}
__cold MDBX_txn *txn_basal_create(const size_t max_dbi) {
MDBX_txn *txn = nullptr;
const intptr_t bitmap_bytes =
#if MDBX_ENABLE_DBI_SPARSE
ceil_powerof2(max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT;
#else
0;
#endif /* MDBX_ENABLE_DBI_SPARSE */
const size_t base = sizeof(MDBX_txn) + /* GC cursor */ sizeof(cursor_couple_t);
const size_t size =
base + bitmap_bytes +
max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_seqs[0]) + sizeof(txn->dbi_state[0]));
txn = osal_calloc(1, size);
if (unlikely(!txn))
return txn;
rkl_init(&txn->wr.gc.reclaimed);
rkl_init(&txn->wr.gc.comeback);
txn->dbs = ptr_disp(txn, base);
txn->cursors = ptr_disp(txn->dbs, max_dbi * sizeof(txn->dbs[0]));
txn->dbi_seqs = ptr_disp(txn->cursors, max_dbi * sizeof(txn->cursors[0]));
txn->dbi_state = ptr_disp(txn, size - max_dbi * sizeof(txn->dbi_state[0]));
#if MDBX_ENABLE_DBI_SPARSE
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
#endif /* MDBX_ENABLE_DBI_SPARSE */
txn->flags = MDBX_TXN_FINISHED;
txn->wr.retired_pages = pnl_alloc(MDBX_PNL_INITIAL);
txn->wr.repnl = pnl_alloc(MDBX_PNL_INITIAL);
if (unlikely(!txn->wr.retired_pages || !txn->wr.repnl)) {
txn_basal_destroy(txn);
txn = nullptr;
}
return txn;
}
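
txn_basal_create() places the transaction object and all of its per-DBI arrays into a single osal_calloc() block, carving them out via ptr_disp() offsets; the per-DBI state bytes are parked at the very end of the block, with the optional sparse bitmap directly in front of them. A self-contained sketch of the same single-allocation trick (simplified fields, illustrative types):

#include <stdio.h>
#include <stdlib.h>

typedef struct {
  unsigned n_dbi;
  unsigned *dbs;        /* n_dbi entries, right after the header */
  unsigned *seqs;       /* n_dbi entries, after dbs */
  unsigned char *state; /* n_dbi bytes, parked at the block's end */
} mini_txn;

static mini_txn *mini_create(unsigned max_dbi) {
  const size_t base = sizeof(mini_txn);
  const size_t size = base + max_dbi * (2 * sizeof(unsigned) + 1);
  mini_txn *t = calloc(1, size);
  if (!t)
    return NULL;
  t->n_dbi = max_dbi;
  t->dbs = (unsigned *)((char *)t + base);
  t->seqs = t->dbs + max_dbi;
  t->state = (unsigned char *)t + size - max_dbi;
  return t; /* a single free(t) releases everything */
}

int main(void) {
  mini_txn *t = mini_create(4);
  if (!t)
    return 1;
  t->dbs[3] = 42;
  t->state[3] = 1;
  printf("dbs[3]=%u state[3]=%u\n", t->dbs[3], t->state[3]);
  free(t);
  return 0;
}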
__cold void txn_basal_destroy(MDBX_txn *txn) {
dpl_free(txn);
rkl_destroy(&txn->wr.gc.reclaimed);
rkl_destroy(&txn->wr.gc.comeback);
pnl_free(txn->wr.retired_pages);
pnl_free(txn->wr.spilled.list);
pnl_free(txn->wr.repnl);
osal_free(txn);
}
int txn_basal_start(MDBX_txn *txn, unsigned flags) {
MDBX_env *const env = txn->env;
txn->wr.troika = meta_tap(env);
const meta_ptr_t head = meta_recent(env, &txn->wr.troika);
uint64_t timestamp = 0;
/* coverity[array_null] */
while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") {
int err = coherency_fetch_head(txn, head, &timestamp);
if (likely(err == MDBX_SUCCESS))
break;
if (unlikely(err != MDBX_RESULT_TRUE))
return err;
}
eASSERT(env, meta_txnid(head.ptr_v) == txn->txnid);
txn->txnid = safe64_txnid_next(txn->txnid);
if (unlikely(txn->txnid > MAX_TXNID)) {
ERROR("txnid overflow, raise %d", MDBX_TXN_FULL);
return MDBX_TXN_FULL;
}
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
txn->flags = flags;
txn->nested = nullptr;
txn->wr.loose_pages = nullptr;
txn->wr.loose_count = 0;
#if MDBX_ENABLE_REFUND
txn->wr.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
MDBX_PNL_SETSIZE(txn->wr.retired_pages, 0);
txn->wr.spilled.list = nullptr;
txn->wr.spilled.least_removed = 0;
txn->wr.gc.spent = 0;
tASSERT(txn, rkl_empty(&txn->wr.gc.reclaimed));
txn->env->gc.detent = 0;
env->txn = txn;
return MDBX_SUCCESS;
}
int txn_basal_end(MDBX_txn *txn, unsigned mode) {
MDBX_env *const env = txn->env;
tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | txn_may_have_cursors)) == 0 && txn->owner);
ENSURE(env, txn->txnid >= /* paranoia is appropriate here */ env->lck->cached_oldest.weak);
dxb_sanitize_tail(env, nullptr);
txn->flags = MDBX_TXN_FINISHED;
env->txn = nullptr;
pnl_free(txn->wr.spilled.list);
txn->wr.spilled.list = nullptr;
rkl_clear_and_shrink(&txn->wr.gc.reclaimed);
rkl_clear_and_shrink(&txn->wr.gc.comeback);
eASSERT(env, txn->parent == nullptr);
pnl_shrink(&txn->wr.retired_pages);
pnl_shrink(&txn->wr.repnl);
if (!(env->flags & MDBX_WRITEMAP))
dpl_release_shadows(txn);
/* Export or close DBI handles created in this txn */
int err = dbi_update(txn, (mode & TXN_END_UPDATE) != 0);
if (unlikely(err != MDBX_SUCCESS)) {
ERROR("unexpected error %d during export the state of dbi-handles to env", err);
err = MDBX_PROBLEM;
}
/* The writer mutex was locked in mdbx_txn_begin. */
lck_txn_unlock(env);
return err;
}
int txn_basal_commit(MDBX_txn *txn, struct commit_timestamp *ts) {
MDBX_env *const env = txn->env;
tASSERT(txn, txn == env->basal_txn && !txn->parent && !txn->nested);
if (!txn->wr.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length == env->options.dp_limit);
}
if (txn->flags & txn_may_have_cursors)
txn_done_cursors(txn);
bool need_flush_for_nometasync = false;
const meta_ptr_t head = meta_recent(env, &txn->wr.troika);
const uint32_t meta_sync_txnid = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
/* sync prev meta */
if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) {
    /* Fixing a shortcoming inherited from LMDB:
     *
     * All is fine if none of the processes working with the DB use WRITEMAP.
     * Then the meta-page (updated, but not yet flushed to disk) will be
     * persisted by the fdatasync() performed when writing this transaction's
     * data.
     *
     * All is fine if all processes working with the DB use WRITEMAP without
     * MDBX_AVOID_MSYNC. Then the meta-page (updated, but not yet flushed to
     * disk) will be persisted by the msync() performed when writing this
     * transaction's data.
     *
     * But if the processes working with the DB mix both methods, i.e. both
     * sync() in MDBX_WRITEMAP mode and writes through a file descriptor, it
     * becomes impossible to get the previous transaction's meta-page and the
     * current transaction's data onto the disk with the single sync operation
     * performed after writing the current transaction's data. Consequently,
     * the meta-page has to be updated explicitly, which completely destroys
     * the benefit of NOMETASYNC. */
const uint32_t txnid_dist = ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) ? MDBX_NOMETASYNC_LAZY_FD
: MDBX_NOMETASYNC_LAZY_WRITEMAP;
    /* The point of the "magic" is to avoid a separate fdatasync() or msync()
     * call to guarantee on-disk fixation of a meta-page that was "lazily"
     * submitted for writing by the previous transaction, but not flushed to
     * disk because the MDBX_NOMETASYNC mode was active. */
if (
#if defined(_WIN32) || defined(_WIN64)
!env->ioring.overlapped_fd &&
#endif
meta_sync_txnid == (uint32_t)head.txnid - txnid_dist)
need_flush_for_nometasync = true;
else {
int err = meta_sync(env, head);
if (unlikely(err != MDBX_SUCCESS)) {
ERROR("txn-%s: error %d", "presync-meta", err);
return err;
}
}
}
if ((!txn->wr.dirtylist || txn->wr.dirtylist->length == 0) &&
(txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_NOSYNC | MDBX_TXN_NOMETASYNC)) == 0 &&
!need_flush_for_nometasync && !head.is_steady && !AUDIT_ENABLED()) {
TXN_FOREACH_DBI_ALL(txn, i) { tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); }
/* fast completion of pure transaction */
return MDBX_NOSUCCESS_PURE_COMMIT ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
}
DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, (void *)txn,
(void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
if (txn->n_dbi > CORE_DBS) {
/* Update table root pointers */
cursor_couple_t cx;
int err = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(err != MDBX_SUCCESS))
return err;
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
TXN_FOREACH_DBI_USER(txn, i) {
if ((txn->dbi_state[i] & DBI_DIRTY) == 0)
continue;
tree_t *const db = &txn->dbs[i];
DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->mod_txnid, txn->txnid);
      /* mod_txnid may be > front after committing nested transactions */
db->mod_txnid = txn->txnid;
MDBX_val data = {db, sizeof(tree_t)};
err = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE);
if (unlikely(err != MDBX_SUCCESS)) {
txn->cursors[MAIN_DBI] = cx.outer.next;
return err;
}
}
txn->cursors[MAIN_DBI] = cx.outer.next;
}
if (ts) {
ts->prep = osal_monotime();
ts->gc_cpu = osal_cputime(nullptr);
}
gcu_t gcu_ctx;
int rc = gc_put_init(txn, &gcu_ctx);
if (likely(rc == MDBX_SUCCESS))
rc = gc_update(txn, &gcu_ctx);
#if MDBX_ENABLE_BIGFOOT
const txnid_t commit_txnid = gcu_ctx.bigfoot;
if (commit_txnid > txn->txnid)
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid));
#else
const txnid_t commit_txnid = txn->txnid;
#endif
gc_put_destroy(&gcu_ctx);
if (ts)
ts->gc_cpu = osal_cputime(nullptr) - ts->gc_cpu;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
tASSERT(txn, txn->wr.loose_count == 0);
txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[FREE_DBI].mod_txnid;
txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[MAIN_DBI].mod_txnid;
if (ts) {
ts->gc = osal_monotime();
ts->audit = ts->gc;
}
if (AUDIT_ENABLED()) {
rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->wr.retired_pages), true);
if (ts)
ts->audit = osal_monotime();
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
if (txn->wr.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
tASSERT(txn, txn->wr.loose_count == 0);
mdbx_filehandle_t fd =
#if defined(_WIN32) || defined(_WIN64)
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
(void)need_flush_for_nometasync;
#else
(need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE ||
txn->wr.dirtylist->length > env->options.writethrough_threshold ||
atomic_load64(&env->lck->unsynced_pages, mo_Relaxed))
? env->lazy_fd
: env->dsync_fd;
#endif /* Windows */
iov_ctx_t write_ctx;
rc = iov_init(txn, &write_ctx, txn->wr.dirtylist->length, txn->wr.dirtylist->pages_including_loose, fd, false);
if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("txn-%s: error %d", "iov-init", rc);
return rc;
}
rc = txn_write(txn, &write_ctx);
if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("txn-%s: error %d", "write", rc);
return rc;
}
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
env->lck->unsynced_pages.weak += txn->wr.writemap_dirty_npages;
if (!env->lck->eoos_timestamp.weak)
env->lck->eoos_timestamp.weak = osal_monotime();
}
/* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
if (ts)
ts->write = osal_monotime();
meta_t meta;
memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8);
meta.reserve16 = head.ptr_c->reserve16;
meta.validator_id = head.ptr_c->validator_id;
meta.extra_pagehdr = head.ptr_c->extra_pagehdr;
unaligned_poke_u64(4, meta.pages_retired,
unaligned_peek_u64(4, head.ptr_c->pages_retired) + MDBX_PNL_GETSIZE(txn->wr.retired_pages));
meta.geometry = txn->geo;
meta.trees.gc = txn->dbs[FREE_DBI];
meta.trees.main = txn->dbs[MAIN_DBI];
meta.canary = txn->canary;
memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid));
meta.unsafe_sign = DATASIGN_NONE;
meta_set_txnid(env, &meta, commit_txnid);
rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, &txn->wr.troika);
if (ts)
ts->sync = osal_monotime();
if (unlikely(rc != MDBX_SUCCESS)) {
env->flags |= ENV_FATAL_ERROR;
ERROR("txn-%s: error %d", "sync", rc);
return rc;
}
return MDBX_SUCCESS;
}
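/* Timing note: when a commit_timestamp is supplied, the fields prep, gc,
 * audit, write and sync filled above bracket the successive phases, so
 * per-phase costs are simple differences of osal_monotime() values, e.g.
 * (a sketch):
 *   const uint64_t gc_cost = ts->gc - ts->prep;        // GC update
 *   const uint64_t write_cost = ts->write - ts->audit; // page writes */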

View File

@ -1,595 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
/* Merge pageset of the nested txn into parent */
static void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0);
dpl_t *const src = dpl_sort(txn);
/* Remove refunded pages from parent's dirty list */
dpl_t *const dst = dpl_sort(parent);
if (MDBX_ENABLE_REFUND) {
size_t n = dst->length;
while (n && dst->items[n].pgno >= parent->geo.first_unallocated) {
const unsigned npages = dpl_npages(dst, n);
page_shadow_release(txn->env, dst->items[n].ptr, npages);
--n;
}
parent->wr.dirtyroom += dst->sorted - n;
dst->sorted = dpl_setlen(dst, n);
tASSERT(parent, parent->wr.dirtyroom + parent->wr.dirtylist->length ==
(parent->parent ? parent->parent->wr.dirtyroom : parent->env->options.dp_limit));
}
/* Remove reclaimed pages from parent's dirty list */
const pnl_t reclaimed_list = parent->wr.repnl;
dpl_sift(parent, reclaimed_list, false);
/* Move retired pages from parent's dirty & spilled list to reclaimed */
size_t r, w, d, s, l;
for (r = w = parent_retired_len; ++r <= MDBX_PNL_GETSIZE(parent->wr.retired_pages);) {
const pgno_t pgno = parent->wr.retired_pages[r];
const size_t di = dpl_exist(parent, pgno);
const size_t si = !di ? spill_search(parent, pgno) : 0;
unsigned npages;
const char *kind;
if (di) {
page_t *dp = dst->items[di].ptr;
tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_SPILLED)) == 0);
npages = dpl_npages(dst, di);
page_wash(parent, di, dp, npages);
kind = "dirty";
l = 1;
if (unlikely(npages > l)) {
        /* An OVERFLOW page could have been reused piecewise. In that case the
         * retired list may hold only the beginning of the sequence, while the
         * rest is scattered across the dirty, spilled and reclaimed lists. So
         * we move it to reclaimed while checking for a break in the sequence.
         * Either way every fragment gets accounted for and filtered out, i.e.
         * if the page was split into parts, what matters is removing the
         * dirty entry; all the fragments are accounted for separately. */
        /* The retired-pages list is unsorted, but to speed up its sorting it
         * is appended in accordance with MDBX_PNL_ASCENDING. */
#if MDBX_PNL_ASCENDING
const size_t len = MDBX_PNL_GETSIZE(parent->wr.retired_pages);
while (r < len && parent->wr.retired_pages[r + 1] == pgno + l) {
++r;
if (++l == npages)
break;
}
#else
while (w > parent_retired_len && parent->wr.retired_pages[w - 1] == pgno + l) {
--w;
if (++l == npages)
break;
}
#endif
}
} else if (unlikely(si)) {
l = npages = 1;
spill_remove(parent, si, 1);
kind = "spilled";
} else {
parent->wr.retired_pages[++w] = pgno;
continue;
}
DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, kind, pgno);
int err = pnl_insert_span(&parent->wr.repnl, pgno, l);
ENSURE(txn->env, err == MDBX_SUCCESS);
}
MDBX_PNL_SETSIZE(parent->wr.retired_pages, w);
/* Filter-out parent spill list */
if (parent->wr.spilled.list && MDBX_PNL_GETSIZE(parent->wr.spilled.list) > 0) {
const pnl_t sl = spill_purge(parent);
size_t len = MDBX_PNL_GETSIZE(sl);
if (len) {
/* Remove refunded pages from parent's spill list */
if (MDBX_ENABLE_REFUND && MDBX_PNL_MOST(sl) >= (parent->geo.first_unallocated << 1)) {
#if MDBX_PNL_ASCENDING
size_t i = MDBX_PNL_GETSIZE(sl);
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl));
do {
if ((sl[i] & 1) == 0)
DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
i -= 1;
} while (i && sl[i] >= (parent->geo.first_unallocated << 1));
MDBX_PNL_SETSIZE(sl, i);
#else
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl));
size_t i = 0;
do {
++i;
if ((sl[i] & 1) == 0)
DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
} while (i < len && sl[i + 1] >= (parent->geo.first_unallocated << 1));
MDBX_PNL_SETSIZE(sl, len -= i);
memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0]));
#endif
}
tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->geo.first_unallocated << 1));
/* Remove reclaimed pages from parent's spill list */
s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list);
/* Scanning from end to begin */
while (s && r) {
if (sl[s] & 1) {
--s;
continue;
}
const pgno_t spilled_pgno = sl[s] >> 1;
const pgno_t reclaimed_pgno = reclaimed_list[r];
if (reclaimed_pgno != spilled_pgno) {
const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno);
s -= !cmp;
r -= cmp;
} else {
DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, reclaimed_pgno);
spill_remove(parent, s, 1);
--s;
--r;
}
}
/* Remove anything in our dirty list from parent's spill list */
/* Scanning spill list in descend order */
const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1;
s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1;
d = src->length;
while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_GETSIZE(sl))) {
if (sl[s] & 1) {
s += step;
continue;
}
const pgno_t spilled_pgno = sl[s] >> 1;
const pgno_t dirty_pgno_form = src->items[d].pgno;
const unsigned npages = dpl_npages(src, d);
const pgno_t dirty_pgno_to = dirty_pgno_form + npages;
if (dirty_pgno_form > spilled_pgno) {
--d;
continue;
}
if (dirty_pgno_to <= spilled_pgno) {
s += step;
continue;
}
DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, dirty_pgno_form);
spill_remove(parent, s, 1);
s += step;
}
/* Squash deleted pagenums if we deleted any */
spill_purge(parent);
}
}
/* Remove anything in our spill list from parent's dirty list */
if (txn->wr.spilled.list) {
tASSERT(txn, pnl_check_allocated(txn->wr.spilled.list, (size_t)parent->geo.first_unallocated << 1));
dpl_sift(parent, txn->wr.spilled.list, true);
tASSERT(parent, parent->wr.dirtyroom + parent->wr.dirtylist->length ==
(parent->parent ? parent->parent->wr.dirtyroom : parent->env->options.dp_limit));
}
/* Find length of merging our dirty list with parent's and release
* filter-out pages */
for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) {
page_t *sp = src->items[s].ptr;
tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_LOOSE | P_SPILLED)) == 0);
const unsigned s_npages = dpl_npages(src, s);
const pgno_t s_pgno = src->items[s].pgno;
page_t *dp = dst->items[d].ptr;
tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_SPILLED)) == 0);
const unsigned d_npages = dpl_npages(dst, d);
const pgno_t d_pgno = dst->items[d].pgno;
if (d_pgno >= s_pgno + s_npages) {
--d;
++l;
} else if (d_pgno + d_npages <= s_pgno) {
if (sp->flags != P_LOOSE) {
sp->txnid = parent->front_txnid;
sp->flags &= ~P_SPILLED;
}
--s;
++l;
} else {
dst->items[d--].ptr = nullptr;
page_shadow_release(txn->env, dp, d_npages);
}
}
assert(dst->sorted == dst->length);
tASSERT(parent, dst->detent >= l + d + s);
dst->sorted = l + d + s; /* the merged length */
while (s > 0) {
page_t *sp = src->items[s].ptr;
tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_LOOSE | P_SPILLED)) == 0);
if (sp->flags != P_LOOSE) {
sp->txnid = parent->front_txnid;
sp->flags &= ~P_SPILLED;
}
--s;
}
/* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */
if (dst->sorted >= dst->length) {
/* from end to begin with dst extending */
for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) {
if (unlikely(l <= d)) {
/* squash to get a gap of free space for merge */
for (r = w = 1; r <= d; ++r)
if (dst->items[r].ptr) {
if (w != r) {
dst->items[w] = dst->items[r];
dst->items[r].ptr = nullptr;
}
++w;
}
VERBOSE("squash to begin for extending-merge %zu -> %zu", d, w - 1);
d = w - 1;
continue;
}
assert(l > d);
if (dst->items[d].ptr) {
dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) ? dst->items[d--] : src->items[s--];
} else
--d;
}
if (s > 0) {
assert(l == s);
while (d > 0) {
assert(dst->items[d].ptr == nullptr);
--d;
}
do {
assert(l > 0);
dst->items[l--] = src->items[s--];
} while (s > 0);
} else {
assert(l == d);
while (l > 0) {
assert(dst->items[l].ptr != nullptr);
--l;
}
}
} else {
/* from begin to end with shrinking (a lot of new large/overflow pages) */
for (l = s = d = 1; s <= src->length && d <= dst->length;) {
if (unlikely(l >= d)) {
/* squash to get a gap of free space for merge */
for (r = w = dst->length; r >= d; --r)
if (dst->items[r].ptr) {
if (w != r) {
dst->items[w] = dst->items[r];
dst->items[r].ptr = nullptr;
}
--w;
}
VERBOSE("squash to end for shrinking-merge %zu -> %zu", d, w + 1);
d = w + 1;
continue;
}
assert(l < d);
if (dst->items[d].ptr) {
dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) ? dst->items[d++] : src->items[s++];
} else
++d;
}
if (s <= src->length) {
assert(dst->sorted - l == src->length - s);
while (d <= dst->length) {
assert(dst->items[d].ptr == nullptr);
--d;
}
do {
assert(l <= dst->sorted);
dst->items[l++] = src->items[s++];
} while (s <= src->length);
} else {
assert(dst->sorted - l == dst->length - d);
while (l <= dst->sorted) {
assert(l <= d && d <= dst->length && dst->items[d].ptr);
dst->items[l++] = dst->items[d++];
}
}
}
parent->wr.dirtyroom -= dst->sorted - dst->length;
assert(parent->wr.dirtyroom <= parent->env->options.dp_limit);
dpl_setlen(dst, dst->sorted);
parent->wr.dirtylru = txn->wr.dirtylru;
  /* As currently understood, it is cheaper to recount the number of pages
   * here than to mix extra branches and arithmetic into the loops above. */
dst->pages_including_loose = 0;
for (r = 1; r <= dst->length; ++r)
dst->pages_including_loose += dpl_npages(dst, r);
tASSERT(parent, dpl_check(parent));
dpl_free(txn);
if (txn->wr.spilled.list) {
if (parent->wr.spilled.list) {
/* Must not fail since space was preserved above. */
pnl_merge(parent->wr.spilled.list, txn->wr.spilled.list);
pnl_free(txn->wr.spilled.list);
} else {
parent->wr.spilled.list = txn->wr.spilled.list;
parent->wr.spilled.least_removed = txn->wr.spilled.least_removed;
}
tASSERT(parent, dpl_check(parent));
}
if (parent->wr.spilled.list) {
assert(pnl_check_allocated(parent->wr.spilled.list, (size_t)parent->geo.first_unallocated << 1));
if (MDBX_PNL_GETSIZE(parent->wr.spilled.list))
parent->flags |= MDBX_TXN_SPILLS;
}
}
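/* Starting a nested txn: optionally spill part of the parent's dirty pages
 * to free dirtyroom, then hand the child snapshots of the parent's
 * write-state: a copy of the reclaimed list (repnl) with the parent's
 * loose pages folded in, a copy of the GC rkl, and the shared dbi_seqs /
 * dbi_sparse tables. */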
int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags) {
if (parent->env->options.spill_parent4child_denominator) {
/* Spill dirty-pages of parent to provide dirtyroom for child txn */
int err =
txn_spill(parent, nullptr, parent->wr.dirtylist->length / parent->env->options.spill_parent4child_denominator);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
}
tASSERT(parent, audit_ex(parent, 0, false) == 0);
MDBX_txn *const txn = txn_alloc(flags, parent->env);
if (unlikely(!txn))
return LOG_IFERR(MDBX_ENOMEM);
tASSERT(parent, dpl_check(parent));
txn->txnid = parent->txnid;
txn->front_txnid = parent->front_txnid + 1;
txn->canary = parent->canary;
parent->flags |= MDBX_TXN_HAS_CHILD;
parent->nested = txn;
txn->parent = parent;
txn->env->txn = txn;
txn->owner = parent->owner;
txn->wr.troika = parent->wr.troika;
rkl_init(&txn->wr.gc.reclaimed);
#if MDBX_ENABLE_DBI_SPARSE
txn->dbi_sparse = parent->dbi_sparse;
#endif /* MDBX_ENABLE_DBI_SPARSE */
txn->dbi_seqs = parent->dbi_seqs;
txn->geo = parent->geo;
int err = dpl_alloc(txn);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
const size_t len = MDBX_PNL_GETSIZE(parent->wr.repnl) + parent->wr.loose_count;
txn->wr.repnl = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
if (unlikely(!txn->wr.repnl))
return LOG_IFERR(MDBX_ENOMEM);
/* Move loose pages to reclaimed list */
if (parent->wr.loose_count) {
do {
page_t *lp = parent->wr.loose_pages;
tASSERT(parent, lp->flags == P_LOOSE);
err = pnl_insert_span(&parent->wr.repnl, lp->pgno, 1);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
parent->wr.loose_pages = page_next(lp);
/* Remove from dirty list */
page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1);
} while (parent->wr.loose_pages);
parent->wr.loose_count = 0;
#if MDBX_ENABLE_REFUND
parent->wr.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
tASSERT(parent, dpl_check(parent));
}
#if MDBX_ENABLE_REFUND
txn->wr.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
txn->wr.dirtyroom = parent->wr.dirtyroom;
txn->wr.dirtylru = parent->wr.dirtylru;
dpl_sort(parent);
if (parent->wr.spilled.list)
spill_purge(parent);
tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->wr.repnl) >= MDBX_PNL_GETSIZE(parent->wr.repnl));
memcpy(txn->wr.repnl, parent->wr.repnl, MDBX_PNL_SIZEOF(parent->wr.repnl));
/* coverity[assignment_where_comparison_intended] */
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, (txn->geo.first_unallocated /* LY: intentional assignment
here, only for assertion */
= parent->geo.first_unallocated) -
MDBX_ENABLE_REFUND));
txn->wr.gc.spent = parent->wr.gc.spent;
rkl_init(&txn->wr.gc.comeback);
err = rkl_copy(&parent->wr.gc.reclaimed, &txn->wr.gc.reclaimed);
if (unlikely(err != MDBX_SUCCESS))
return err;
txn->wr.retired_pages = parent->wr.retired_pages;
parent->wr.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->wr.retired_pages);
txn->cursors[FREE_DBI] = nullptr;
txn->cursors[MAIN_DBI] = nullptr;
txn->dbi_state[FREE_DBI] = parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
txn->dbi_state[MAIN_DBI] = parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
memset(txn->dbi_state + CORE_DBS, 0, (txn->n_dbi = parent->n_dbi) - CORE_DBS);
memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS);
tASSERT(parent, parent->wr.dirtyroom + parent->wr.dirtylist->length ==
(parent->parent ? parent->parent->wr.dirtyroom : parent->env->options.dp_limit));
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
tASSERT(parent, parent->cursors[FREE_DBI] == nullptr);
// TODO: shadow GC' cursor
return txn_shadow_cursors(parent, MAIN_DBI);
}
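/* Note the hand-off trick above: the child takes ownership of the parent's
 * retired_pages list, while parent->wr.retired_pages temporarily smuggles
 * the list's former length as an integer cast through the pointer value;
 * txn_nested_abort()/txn_nested_join() decode that length to restore or
 * merge the parent's state. */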
void txn_nested_abort(MDBX_txn *nested) {
MDBX_txn *const parent = nested->parent;
tASSERT(nested, !(nested->flags & txn_may_have_cursors));
nested->signature = 0;
nested->owner = 0;
rkl_destroy(&nested->wr.gc.reclaimed);
if (nested->wr.retired_pages) {
tASSERT(parent, MDBX_PNL_GETSIZE(nested->wr.retired_pages) >= (uintptr_t)parent->wr.retired_pages);
MDBX_PNL_SETSIZE(nested->wr.retired_pages, (uintptr_t)parent->wr.retired_pages);
parent->wr.retired_pages = nested->wr.retired_pages;
}
tASSERT(parent, dpl_check(parent));
tASSERT(parent, audit_ex(parent, 0, false) == 0);
dpl_release_shadows(nested);
dpl_free(nested);
pnl_free(nested->wr.repnl);
osal_free(nested);
}
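/* Joining a nested txn into its parent: a pure child (no dirty pages, no
 * new DBIs) is completed in-place; otherwise all the list space the merge
 * may need is reserved up-front, so a failed allocation cannot corrupt the
 * parent, and then the retired/repnl/GC state is moved destructively and
 * the page sets are merged by txn_merge(). */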
int txn_nested_join(MDBX_txn *txn, struct commit_timestamp *ts) {
MDBX_env *const env = txn->env;
MDBX_txn *const parent = txn->parent;
tASSERT(txn, audit_ex(txn, 0, false) == 0);
eASSERT(env, txn != env->basal_txn);
eASSERT(env, parent->signature == txn_signature);
eASSERT(env, parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0);
eASSERT(env, dpl_check(txn));
if (txn->wr.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && parent->n_dbi == txn->n_dbi) {
VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid);
tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0);
tASSERT(txn, memcmp(&parent->canary, &txn->canary, sizeof(parent->canary)) == 0);
tASSERT(txn, !txn->wr.spilled.list || MDBX_PNL_GETSIZE(txn->wr.spilled.list) == 0);
tASSERT(txn, txn->wr.loose_count == 0);
/* Update parent's DBs array */
eASSERT(env, parent->n_dbi == txn->n_dbi);
TXN_FOREACH_DBI_ALL(txn, dbi) {
tASSERT(txn, (txn->dbi_state[dbi] & (DBI_CREAT | DBI_DIRTY)) == 0);
if (txn->dbi_state[dbi] & DBI_FRESH) {
parent->dbs[dbi] = txn->dbs[dbi];
/* preserve parent's status */
const uint8_t state = txn->dbi_state[dbi] | DBI_FRESH;
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
parent->dbi_state[dbi], state);
parent->dbi_state[dbi] = state;
}
}
return txn_end(txn, TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE);
}
/* Preserve space for spill list to avoid parent's state corruption
* if allocation fails. */
const size_t parent_retired_len = (uintptr_t)parent->wr.retired_pages;
tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->wr.retired_pages));
const size_t retired_delta = MDBX_PNL_GETSIZE(txn->wr.retired_pages) - parent_retired_len;
if (retired_delta) {
int err = pnl_need(&txn->wr.repnl, retired_delta);
if (unlikely(err != MDBX_SUCCESS))
return err;
}
if (txn->wr.spilled.list) {
if (parent->wr.spilled.list) {
int err = pnl_need(&parent->wr.spilled.list, MDBX_PNL_GETSIZE(txn->wr.spilled.list));
if (unlikely(err != MDBX_SUCCESS))
return err;
}
spill_purge(txn);
}
if (unlikely(txn->wr.dirtylist->length + parent->wr.dirtylist->length > parent->wr.dirtylist->detent &&
!dpl_reserve(parent, txn->wr.dirtylist->length + parent->wr.dirtylist->length))) {
return MDBX_ENOMEM;
}
//-------------------------------------------------------------------------
parent->wr.retired_pages = txn->wr.retired_pages;
txn->wr.retired_pages = nullptr;
pnl_free(parent->wr.repnl);
parent->wr.repnl = txn->wr.repnl;
txn->wr.repnl = nullptr;
parent->wr.gc.spent = txn->wr.gc.spent;
rkl_destructive_move(&txn->wr.gc.reclaimed, &parent->wr.gc.reclaimed);
parent->geo = txn->geo;
parent->canary = txn->canary;
parent->flags |= txn->flags & MDBX_TXN_DIRTY;
/* Move loose pages to parent */
#if MDBX_ENABLE_REFUND
parent->wr.loose_refund_wl = txn->wr.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
parent->wr.loose_count = txn->wr.loose_count;
parent->wr.loose_pages = txn->wr.loose_pages;
if (txn->flags & txn_may_have_cursors)
/* Merge our cursors into parent's and close them */
txn_done_cursors(txn);
/* Update parent's DBs array */
eASSERT(env, parent->n_dbi == txn->n_dbi);
TXN_FOREACH_DBI_ALL(txn, dbi) {
if (txn->dbi_state[dbi] != (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))) {
eASSERT(env,
(txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) != 0 ||
(txn->dbi_state[dbi] | DBI_STALE) == (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY)));
parent->dbs[dbi] = txn->dbs[dbi];
/* preserve parent's status */
const uint8_t state = txn->dbi_state[dbi] | (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
parent->dbi_state[dbi], state);
parent->dbi_state[dbi] = state;
}
}
if (ts) {
ts->prep = osal_monotime();
ts->gc = /* no gc-update */ ts->prep;
ts->audit = /* no audit */ ts->gc;
ts->write = /* no write */ ts->audit;
ts->sync = /* no sync */ ts->write;
}
txn_merge(parent, txn, parent_retired_len);
tASSERT(parent, parent->flags & MDBX_TXN_HAS_CHILD);
parent->flags -= MDBX_TXN_HAS_CHILD;
env->txn = parent;
parent->nested = nullptr;
tASSERT(parent, dpl_check(parent));
#if MDBX_ENABLE_REFUND
txn_refund(parent);
if (ASSERT_ENABLED()) {
/* Check parent's loose pages not suitable for refund */
for (page_t *lp = parent->wr.loose_pages; lp; lp = page_next(lp)) {
tASSERT(parent, lp->pgno < parent->wr.loose_refund_wl && lp->pgno + 1 < parent->geo.first_unallocated);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
}
/* Check parent's reclaimed pages not suitable for refund */
if (MDBX_PNL_GETSIZE(parent->wr.repnl))
tASSERT(parent, MDBX_PNL_MOST(parent->wr.repnl) + 1 < parent->geo.first_unallocated);
}
#endif /* MDBX_ENABLE_REFUND */
txn->signature = 0;
osal_free(txn);
tASSERT(parent, audit_ex(parent, 0, false) == 0);
return MDBX_SUCCESS;
}

View File

@ -1,289 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
static inline int txn_ro_rslot(MDBX_txn *txn) {
reader_slot_t *slot = txn->ro.slot;
STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(slot->tid));
if (likely(slot)) {
if (likely(slot->pid.weak == txn->env->pid && slot->txnid.weak >= SAFE64_INVALID_THRESHOLD)) {
tASSERT(txn, slot->pid.weak == osal_getpid());
tASSERT(txn, slot->tid.weak == ((txn->env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
return MDBX_SUCCESS;
}
return MDBX_BAD_RSLOT;
}
if (unlikely(!txn->env->lck_mmap.lck))
return MDBX_SUCCESS;
MDBX_env *const env = txn->env;
if (env->flags & ENV_TXKEY) {
eASSERT(env, !(env->flags & MDBX_NOSTICKYTHREADS));
slot = thread_rthc_get(env->me_txkey);
if (likely(slot)) {
if (likely(slot->pid.weak == env->pid && slot->txnid.weak >= SAFE64_INVALID_THRESHOLD)) {
tASSERT(txn, slot->pid.weak == osal_getpid());
tASSERT(txn, slot->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
txn->ro.slot = slot;
return MDBX_SUCCESS;
}
if (unlikely(slot->pid.weak) || !(globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN))
return MDBX_BAD_RSLOT;
thread_rthc_set(env->me_txkey, nullptr);
}
} else {
eASSERT(env, (env->flags & MDBX_NOSTICKYTHREADS));
}
bsr_t brs = mvcc_bind_slot(env);
if (likely(brs.err == MDBX_SUCCESS)) {
tASSERT(txn, brs.slot->pid.weak == osal_getpid());
tASSERT(txn, brs.slot->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
}
txn->ro.slot = brs.slot;
return brs.err;
}
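/* Snapshot acquisition: tap the meta-troika, publish the candidate txnid
 * into the reader slot (so a writer cannot recycle the snapshot's pages),
 * then re-validate coherency and the cached-oldest bound, retrying while
 * the meta-pages keep changing underneath (bounded at 42 attempts). */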
static inline int txn_ro_seize(MDBX_txn *txn) {
/* Seek & fetch the last meta */
troika_t troika = meta_tap(txn->env);
uint64_t timestamp = 0;
size_t loop = 0;
do {
MDBX_env *const env = txn->env;
const meta_ptr_t head = likely(env->stuck_meta < 0) ? /* regular */ meta_recent(env, &troika)
: /* recovery mode */ meta_ptr(env, env->stuck_meta);
reader_slot_t *const r = txn->ro.slot;
if (likely(r != nullptr)) {
safe64_reset(&r->txnid, true);
atomic_store32(&r->snapshot_pages_used, head.ptr_v->geometry.first_unallocated, mo_Relaxed);
atomic_store64(&r->snapshot_pages_retired, unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired), mo_Relaxed);
safe64_write(&r->txnid, head.txnid);
eASSERT(env, r->pid.weak == osal_getpid());
eASSERT(env, r->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
eASSERT(env, r->txnid.weak == head.txnid ||
(r->txnid.weak >= SAFE64_INVALID_THRESHOLD && head.txnid < env->lck->cached_oldest.weak));
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
} else {
/* exclusive mode without lck */
eASSERT(env, !env->lck_mmap.lck && env->lck == lckless_stub(env));
}
jitter4testing(true);
if (unlikely(meta_should_retry(env, &troika))) {
timestamp = 0;
continue;
}
/* Snap the state from current meta-head */
int err = coherency_fetch_head(txn, head, &timestamp);
jitter4testing(false);
if (unlikely(err != MDBX_SUCCESS)) {
if (err != MDBX_RESULT_TRUE)
return err;
continue;
}
const uint64_t snap_oldest = atomic_load64(&env->lck->cached_oldest, mo_AcquireRelease);
if (unlikely(txn->txnid < snap_oldest)) {
if (env->stuck_meta >= 0) {
ERROR("target meta-page %i is referenced to an obsolete MVCC-snapshot "
"%" PRIaTXN " < cached-oldest %" PRIaTXN,
env->stuck_meta, txn->txnid, snap_oldest);
return MDBX_MVCC_RETARDED;
}
continue;
}
if (!r || likely(txn->txnid == atomic_load64(&r->txnid, mo_Relaxed)))
return MDBX_SUCCESS;
} while (likely(++loop < 42));
ERROR("bailout waiting for valid snapshot (%s)", "meta-pages are too volatile");
return MDBX_PROBLEM;
}
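/* Read-only txn start: bind a reader slot, take the MDBX_TXN_RDONLY_PREPARE
 * fast path (slot reserved, but no snapshot seized yet), reject a read-txn
 * overlapping a write-txn on the same thread, then seize a coherent MVCC
 * snapshot. */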
int txn_ro_start(MDBX_txn *txn, unsigned flags) {
MDBX_env *const env = txn->env;
eASSERT(env, flags & MDBX_TXN_RDONLY);
eASSERT(env, (flags & ~(txn_ro_begin_flags | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0);
txn->flags = flags;
int err = txn_ro_rslot(txn);
if (unlikely(err != MDBX_SUCCESS))
goto bailout;
STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY);
reader_slot_t *r = txn->ro.slot;
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) {
eASSERT(env, txn->txnid == 0);
eASSERT(env, txn->owner == 0);
eASSERT(env, txn->n_dbi == 0);
if (likely(r)) {
eASSERT(env, r->snapshot_pages_used.weak == 0);
eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
atomic_store32(&r->snapshot_pages_used, 0, mo_Relaxed);
}
txn->flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
return MDBX_SUCCESS;
}
txn->owner = likely(r) ? (uintptr_t)r->tid.weak : ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self());
if ((env->flags & MDBX_NOSTICKYTHREADS) == 0 && env->txn && unlikely(env->basal_txn->owner == txn->owner) &&
(globals.runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) {
err = MDBX_TXN_OVERLAPPING;
goto bailout;
}
err = txn_ro_seize(txn);
if (unlikely(err != MDBX_SUCCESS))
goto bailout;
if (unlikely(txn->txnid < MIN_TXNID || txn->txnid > MAX_TXNID)) {
ERROR("%s", "environment corrupted by died writer, must shutdown!");
err = MDBX_CORRUPTED;
goto bailout;
}
return MDBX_SUCCESS;
bailout:
tASSERT(txn, err != MDBX_SUCCESS);
txn->txnid = INVALID_TXNID;
if (likely(txn->ro.slot))
safe64_reset(&txn->ro.slot->txnid, true);
return err;
}
int txn_ro_end(MDBX_txn *txn, unsigned mode) {
MDBX_env *const env = txn->env;
tASSERT(txn, (txn->flags & txn_may_have_cursors) == 0);
txn->n_dbi = 0; /* prevent further DBI activity */
if (txn->ro.slot) {
reader_slot_t *slot = txn->ro.slot;
if (unlikely(!env->lck))
txn->ro.slot = nullptr;
else {
eASSERT(env, slot->pid.weak == env->pid);
if (likely((txn->flags & MDBX_TXN_FINISHED) == 0)) {
if (likely((txn->flags & MDBX_TXN_PARKED) == 0)) {
ENSURE(env, txn->txnid >=
/* paranoia is appropriate here */ env->lck->cached_oldest.weak);
eASSERT(env, txn->txnid == slot->txnid.weak && slot->txnid.weak >= env->lck->cached_oldest.weak);
} else {
if ((mode & TXN_END_OPMASK) != TXN_END_OUSTED && safe64_read(&slot->tid) == MDBX_TID_TXN_OUSTED)
mode = (mode & ~TXN_END_OPMASK) | TXN_END_OUSTED;
do {
safe64_reset(&slot->txnid, false);
atomic_store64(&slot->tid, txn->owner, mo_AcquireRelease);
atomic_yield();
} while (
unlikely(safe64_read(&slot->txnid) < SAFE64_INVALID_THRESHOLD || safe64_read(&slot->tid) != txn->owner));
}
dxb_sanitize_tail(env, nullptr);
atomic_store32(&slot->snapshot_pages_used, 0, mo_Relaxed);
safe64_reset(&slot->txnid, true);
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
} else {
eASSERT(env, slot->pid.weak == env->pid);
eASSERT(env, slot->txnid.weak >= SAFE64_INVALID_THRESHOLD);
}
if (mode & TXN_END_SLOT) {
if ((env->flags & ENV_TXKEY) == 0)
atomic_store32(&slot->pid, 0, mo_Relaxed);
txn->ro.slot = nullptr;
}
}
}
#if defined(_WIN32) || defined(_WIN64)
if (txn->flags & txn_shrink_allowed)
imports.srwl_ReleaseShared(&env->remap_guard);
#endif
txn->flags = ((mode & TXN_END_OPMASK) != TXN_END_OUSTED) ? MDBX_TXN_RDONLY | MDBX_TXN_FINISHED
: MDBX_TXN_RDONLY | MDBX_TXN_FINISHED | MDBX_TXN_OUSTED;
txn->owner = 0;
if (mode & TXN_END_FREE) {
txn->signature = 0;
osal_free(txn);
}
return MDBX_SUCCESS;
}
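/* Parking protocol: parking stores MDBX_TID_TXN_PARKED into the slot's tid,
 * which allows a writer to oust the reader; unparking CASes the tid back to
 * the owning thread and re-validates the published txnid, falling back to
 * TXN_END_OUSTED (and returning MDBX_OUSTED) if the snapshot was revoked. */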
int txn_ro_park(MDBX_txn *txn, bool autounpark) {
reader_slot_t *const rslot = txn->ro.slot;
tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) == MDBX_TXN_RDONLY);
tASSERT(txn, txn->ro.slot->tid.weak < MDBX_TID_TXN_OUSTED);
if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != MDBX_TXN_RDONLY))
return MDBX_BAD_TXN;
const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
const uint64_t tid = atomic_load64(&rslot->tid, mo_Relaxed);
const uint64_t txnid = atomic_load64(&rslot->txnid, mo_Relaxed);
if (unlikely(pid != txn->env->pid)) {
ERROR("unexpected pid %u%s%u", pid, " != must ", txn->env->pid);
return MDBX_PROBLEM;
}
if (unlikely(tid != txn->owner || txnid != txn->txnid)) {
ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%0zx"
" and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
return MDBX_BAD_RSLOT;
}
atomic_store64(&rslot->tid, MDBX_TID_TXN_PARKED, mo_AcquireRelease);
atomic_store32(&txn->env->lck->rdt_refresh_flag, true, mo_Relaxed);
txn->flags += autounpark ? MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK : MDBX_TXN_PARKED;
return MDBX_SUCCESS;
}
int txn_ro_unpark(MDBX_txn *txn) {
if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) !=
(MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
return MDBX_BAD_TXN;
for (reader_slot_t *const rslot = txn->ro.slot; rslot; atomic_yield()) {
const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
uint64_t tid = safe64_read(&rslot->tid);
uint64_t txnid = safe64_read(&rslot->txnid);
if (unlikely(pid != txn->env->pid)) {
ERROR("unexpected pid %u%s%u", pid, " != expected ", txn->env->pid);
return MDBX_PROBLEM;
}
if (unlikely(tid == MDBX_TID_TXN_OUSTED || txnid >= SAFE64_INVALID_THRESHOLD))
break;
if (unlikely(tid != MDBX_TID_TXN_PARKED || txnid != txn->txnid)) {
ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%" PRIx64 " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, tid, " != must ",
MDBX_TID_TXN_OUSTED, txnid, " != must ", txn->txnid);
break;
}
if (unlikely((txn->flags & MDBX_TXN_ERROR)))
break;
#if MDBX_64BIT_CAS
if (unlikely(!atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, txn->owner)))
continue;
#else
atomic_store32(&rslot->tid.high, (uint32_t)((uint64_t)txn->owner >> 32), mo_Relaxed);
if (unlikely(!atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)txn->owner))) {
atomic_store32(&rslot->tid.high, (uint32_t)(MDBX_TID_TXN_PARKED >> 32), mo_AcquireRelease);
continue;
}
#endif
txnid = safe64_read(&rslot->txnid);
tid = safe64_read(&rslot->tid);
if (unlikely(txnid != txn->txnid || tid != txn->owner)) {
ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%zx"
" and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
break;
}
txn->flags &= ~(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK);
return MDBX_SUCCESS;
}
int err = txn_end(txn, TXN_END_OUSTED | TXN_END_RESET | TXN_END_UPDATE);
return err ? err : MDBX_OUSTED;
}

944
src/txn.c

File diff suppressed because it is too large Load Diff

View File

@ -3,17 +3,6 @@
#include "internals.h"
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned ceil_log2n(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX);
value_uintptr -= 1;
value_uintptr |= value_uintptr >> 1;
value_uintptr |= value_uintptr >> 2;
value_uintptr |= value_uintptr >> 4;
value_uintptr |= value_uintptr >> 8;
value_uintptr |= value_uintptr >> 16;
return log2n_powerof2(value_uintptr + 1);
}
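/* Example: for value_uintptr == 37 the bit-smearing above turns 36
 * (0b100100) into 63 (0b111111), then log2n_powerof2(63 + 1) == 6,
 * i.e. ceil(log2(37)) == 6 since 2^5 < 37 <= 2^6. */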
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX && is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);

View File

@ -58,8 +58,6 @@ MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED static inline size_t ceil_powerof2
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned log2n_powerof2(size_t value_uintptr);
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned ceil_log2n(size_t value_uintptr);
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v);
struct monotime_cache {

View File

@ -3,16 +3,11 @@
#include "internals.h"
#if !defined(MDBX_VERSION_UNSTABLE) && \
(MDBX_VERSION_MAJOR != ${MDBX_VERSION_MAJOR} || MDBX_VERSION_MINOR != ${MDBX_VERSION_MINOR})
#if MDBX_VERSION_MAJOR != ${MDBX_VERSION_MAJOR} || MDBX_VERSION_MINOR != ${MDBX_VERSION_MINOR}
#error "API version mismatch! Had `git fetch --tags` done?"
#endif
static const char sourcery[] =
#ifdef MDBX_VERSION_UNSTABLE
"UNSTABLE@"
#endif
MDBX_STRINGIFY(MDBX_BUILD_SOURCERY);
static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY);
__dll_export
#ifdef __attribute_used__

View File

@ -298,7 +298,6 @@ else()
add_extra_test(upsert_alldups SOURCE extra/upsert_alldups.c)
add_extra_test(dupfix_addodd SOURCE extra/dupfix_addodd.c)
endif()
add_extra_test(details_rkl SOURCE extra/details_rkl.c)
if(MDBX_BUILD_CXX)
if(NOT WIN32 OR NOT MDBX_CXX_STANDARD LESS 17)
add_extra_test(cursor_closing TIMEOUT 10800)

View File

@ -72,7 +72,6 @@ void configure_actor(unsigned &last_space_id, const actor_testcase testcase, con
log_trace("configure_actor: space %lu for %s", space_id, testcase2str(testcase));
global::actors.emplace_back(actor_config(testcase, params, unsigned(space_id), wait4id));
global::databases.insert(params.pathname_db);
params.prng_seed += bleach64(space_id);
}
void testcase_setup(const char *casename, const actor_params &params, unsigned &last_space_id) {

View File

@ -23,13 +23,7 @@
#define RELIEF_FACTOR 1
#endif
static const auto NN = 1000u / RELIEF_FACTOR;
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
static const auto N = std::min(17u, std::thread::hardware_concurrency());
#else
static const auto N = 3u;
#endif
#define NN (1000 / RELIEF_FACTOR)
static void logger_nofmt(MDBX_log_level_t loglevel, const char *function, int line, const char *msg,
unsigned length) noexcept {
@ -113,7 +107,6 @@ bool case0(mdbx::env env) {
* 4. Wait for the background threads to finish.
* 5. Close the remaining cursors and close the DB. */
size_t global_seed = size_t(std::chrono::high_resolution_clock::now().time_since_epoch().count());
thread_local size_t salt;
static size_t prng() {
@ -269,7 +262,7 @@ void case1_write_cycle(mdbx::txn_managed txn, std::deque<mdbx::map_handle> &dbi,
pre.unbind();
if (!pre.txn())
pre.bind(txn, dbi[prng(dbi.size())]);
for (auto i = 0u; i < NN; ++i) {
for (auto i = 0; i < NN; ++i) {
auto k = mdbx::default_buffer::wrap(prng(NN));
auto v = mdbx::default_buffer::wrap(prng(NN));
if (pre.find_multivalue(k, v, false))
@ -291,16 +284,7 @@ void case1_write_cycle(mdbx::txn_managed txn, std::deque<mdbx::map_handle> &dbi,
}
bool case1_thread(mdbx::env env, std::deque<mdbx::map_handle> dbi, mdbx::cursor pre) {
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
mdbx::error::success_or_throw(mdbx_txn_lock(env, false));
std::hash<std::thread::id> hasher;
salt = global_seed ^ hasher(std::this_thread::get_id());
std::cout << "thread " << std::this_thread::get_id() << ", salt " << salt << std::endl << std::flush;
mdbx_txn_unlock(env);
#else
salt = global_seed;
#endif
salt = size_t(std::chrono::high_resolution_clock::now().time_since_epoch().count());
std::vector<MDBX_cursor *> pool;
for (auto loop = 0; loop < 333 / RELIEF_FACTOR; ++loop) {
for (auto read = 0; read < 333 / RELIEF_FACTOR; ++read) {
@ -327,7 +311,12 @@ bool case1(mdbx::env env) {
bool ok = true;
std::deque<mdbx::map_handle> dbi;
std::vector<mdbx::cursor_managed> cursors;
for (auto t = 0u; t < N; ++t) {
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
static const auto N = 10;
#else
static const auto N = 3;
#endif
for (auto t = 0; t < N; ++t) {
auto txn = env.start_write();
auto table = txn.create_map(std::to_string(t), mdbx::key_mode::ordinal, mdbx::value_mode::multi_samelength);
auto cursor = txn.open_cursor(table);
@ -342,7 +331,7 @@ bool case1(mdbx::env env) {
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
std::latch s(1);
std::vector<std::thread> threads;
for (auto t = 1u; t < cursors.size(); ++t) {
for (auto t = 1; t < N; ++t) {
case1_cycle_dbi(dbi);
threads.push_back(std::thread([&, t]() {
s.wait();
@ -393,7 +382,7 @@ int doit() {
mdbx::env::remove(db_filename);
mdbx::env_managed env(db_filename, mdbx::env_managed::create_parameters(),
mdbx::env::operate_parameters(N + 2, 0, mdbx::env::nested_transactions));
mdbx::env::operate_parameters(42, 0, mdbx::env::nested_transactions));
bool ok = case0(env);
ok = case1(env) && ok;

View File

@ -1,488 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2025
#define debug_log debug_log_sub
#include "../../src/rkl.c"
#include "../../src/txl.c"
MDBX_MAYBE_UNUSED __cold void debug_log_sub(int level, const char *function, int line, const char *fmt, ...) {
(void)level;
(void)function;
(void)line;
(void)fmt;
}
/*-----------------------------------------------------------------------------*/
static size_t tst_failed, tst_ok, tst_iterations, tst_cases, tst_cases_hole;
#ifndef NDEBUG
static size_t tst_target;
#endif
static bool check_bool(bool v, bool expect, const char *fn, unsigned line) {
if (unlikely(v != expect)) {
++tst_failed;
fflush(nullptr);
fprintf(stderr, "iteration %zi: got %s, expected %s, at %s:%u\n", tst_iterations, v ? "true" : "false",
expect ? "true" : "false", fn, line);
fflush(nullptr);
return false;
}
++tst_ok;
return true;
}
static bool check_eq(uint64_t v, uint64_t expect, const char *fn, unsigned line) {
if (unlikely(v != expect)) {
++tst_failed;
fflush(nullptr);
fprintf(stderr, "iteration %zi: %" PRIu64 " (got) != %" PRIu64 " (expected), at %s:%u\n", tst_iterations, v, expect,
fn, line);
fflush(nullptr);
return false;
}
++tst_ok;
return true;
}
#define CHECK_BOOL(T, EXPECT) check_bool((T), (EXPECT), __func__, __LINE__)
#define CHECK_TRUE(T) CHECK_BOOL(T, true)
#define CHECK_FALSE(T) CHECK_BOOL(T, false)
#define CHECK_EQ(T, EXPECT) check_eq((T), (EXPECT), __func__, __LINE__)
void trivia(void) {
rkl_t x, y;
rkl_init(&x);
rkl_init(&y);
CHECK_TRUE(rkl_check(&x));
CHECK_TRUE(rkl_empty(&x));
CHECK_EQ(rkl_len(&x), 0);
rkl_iter_t f = rkl_iterator(&x, false);
rkl_iter_t r = rkl_iterator(&x, true);
CHECK_EQ(rkl_left(&f, false), 0);
CHECK_EQ(rkl_left(&f, true), 0);
CHECK_EQ(rkl_left(&r, false), 0);
CHECK_EQ(rkl_left(&r, true), 0);
CHECK_EQ(rkl_turn(&f, false), 0);
CHECK_EQ(rkl_turn(&f, true), 0);
CHECK_EQ(rkl_turn(&r, false), 0);
CHECK_EQ(rkl_turn(&r, true), 0);
CHECK_TRUE(rkl_check(&x));
rkl_hole_t hole;
hole = rkl_hole(&f, true);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&f, false);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&r, true);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&r, false);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, MAX_TXNID);
CHECK_EQ(rkl_push(&x, 42, false), MDBX_SUCCESS);
CHECK_TRUE(rkl_check(&x));
CHECK_FALSE(rkl_empty(&x));
CHECK_EQ(rkl_len(&x), 1);
CHECK_EQ(rkl_push(&x, 42, true), MDBX_RESULT_TRUE);
CHECK_TRUE(rkl_check(&x));
f = rkl_iterator(&x, false);
r = rkl_iterator(&x, true);
CHECK_EQ(rkl_left(&f, false), 1);
CHECK_EQ(rkl_left(&f, true), 0);
CHECK_EQ(rkl_left(&r, false), 0);
CHECK_EQ(rkl_left(&r, true), 1);
CHECK_EQ(rkl_turn(&f, true), 0);
CHECK_EQ(rkl_turn(&f, false), 42);
CHECK_EQ(rkl_turn(&f, false), 0);
CHECK_EQ(rkl_turn(&f, true), 42);
CHECK_EQ(rkl_turn(&f, true), 0);
CHECK_EQ(rkl_turn(&r, false), 0);
CHECK_EQ(rkl_turn(&r, true), 42);
CHECK_EQ(rkl_turn(&r, true), 0);
CHECK_EQ(rkl_turn(&r, false), 42);
CHECK_EQ(rkl_turn(&r, false), 0);
f = rkl_iterator(&x, false);
hole = rkl_hole(&f, false);
CHECK_EQ(hole.begin, 43);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&f, false);
CHECK_EQ(hole.begin, MAX_TXNID);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&f, true);
CHECK_EQ(hole.begin, 43);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&f, true);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, 42);
hole = rkl_hole(&f, true);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, 42);
r = rkl_iterator(&x, true);
hole = rkl_hole(&r, false);
CHECK_EQ(hole.begin, MAX_TXNID);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&r, true);
CHECK_EQ(hole.begin, 43);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&r, true);
CHECK_EQ(hole.begin, 1);
CHECK_EQ(hole.end, 42);
hole = rkl_hole(&r, false);
CHECK_EQ(hole.begin, 43);
CHECK_EQ(hole.end, MAX_TXNID);
hole = rkl_hole(&r, false);
CHECK_EQ(hole.begin, MAX_TXNID);
CHECK_EQ(hole.end, MAX_TXNID);
rkl_resize(&x, 222);
CHECK_FALSE(rkl_empty(&x));
CHECK_TRUE(rkl_check(&x));
rkl_destructive_move(&x, &y);
CHECK_TRUE(rkl_check(&x));
CHECK_TRUE(rkl_check(&y));
rkl_destroy(&x);
rkl_destroy(&y);
}
/*-----------------------------------------------------------------------------*/
uint64_t prng_state;
static uint64_t prng(void) {
prng_state = prng_state * UINT64_C(6364136223846793005) + 1;
return prng_state;
}
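/* A plain 64-bit LCG: the multiplier is Knuth's MMIX constant, and with an
 * odd increment and multiplier = 1 (mod 4) the generator has the full 2^64
 * period, which is ample for this deterministic stress test. */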
static bool flipcoin(void) { return (prng() & 1) != 0; }
static bool stochastic_pass(const unsigned start, const unsigned width, const unsigned n) {
rkl_t k, c;
txl_t l = txl_alloc();
if (!CHECK_TRUE(l))
return false;
rkl_init(&k);
rkl_init(&c);
const size_t errors = tst_failed;
rkl_iter_t f = rkl_iterator(&k, false);
rkl_iter_t r = rkl_iterator(&k, true);
txnid_t lowest = UINT_MAX;
txnid_t highest = 0;
while (MDBX_PNL_GETSIZE(l) < n) {
txnid_t id = (txnid_t)(prng() % width + start);
if (id < MIN_TXNID || id >= INVALID_TXNID)
continue;
if (txl_contain(l, id)) {
if (CHECK_TRUE(rkl_contain(&k, id)) && CHECK_EQ(rkl_push(&k, id, false), MDBX_RESULT_TRUE))
continue;
break;
}
if (!CHECK_FALSE(rkl_contain(&k, id)))
break;
if (tst_iterations % (1u << 24) == 0 && tst_iterations) {
printf("done %.3fM iteration, %zu cases\n", tst_iterations / 1000000.0, tst_cases);
fflush(nullptr);
}
tst_iterations += 1;
#ifndef NDEBUG
if (tst_iterations == tst_target) {
printf("reach %zu iteration\n", tst_iterations);
fflush(nullptr);
}
#endif
if (!CHECK_EQ(rkl_push(&k, id, false), MDBX_SUCCESS))
break;
if (!CHECK_TRUE(rkl_check(&k)))
break;
if (!CHECK_EQ(txl_append(&l, id), MDBX_SUCCESS))
break;
if (!CHECK_TRUE(rkl_contain(&k, id)))
break;
lowest = (lowest < id) ? lowest : id;
highest = (highest > id) ? highest : id;
if (!CHECK_EQ(rkl_lowest(&k), lowest))
break;
if (!CHECK_EQ(rkl_highest(&k), highest))
break;
}
txl_sort(l);
CHECK_EQ(rkl_len(&k), n);
CHECK_EQ(MDBX_PNL_GETSIZE(l), n);
f = rkl_iterator(&k, false);
r = rkl_iterator(&k, true);
CHECK_EQ(rkl_left(&f, false), n);
CHECK_EQ(rkl_left(&f, true), 0);
CHECK_EQ(rkl_left(&r, false), 0);
CHECK_EQ(rkl_left(&r, true), n);
for (size_t i = 0; i < n; ++i) {
CHECK_EQ(rkl_turn(&f, false), l[n - i]);
CHECK_EQ(rkl_left(&f, false), n - i - 1);
CHECK_EQ(rkl_left(&f, true), i + 1);
CHECK_EQ(rkl_turn(&r, true), l[i + 1]);
r.pos += 1;
CHECK_EQ(rkl_turn(&r, true), l[i + 1]);
CHECK_EQ(rkl_left(&r, true), n - i - 1);
CHECK_EQ(rkl_left(&r, false), i + 1);
}
if (CHECK_EQ(rkl_copy(&k, &c), MDBX_SUCCESS)) {
for (size_t i = 1; i <= n; ++i) {
if (!CHECK_FALSE(rkl_empty(&k)))
break;
if (!CHECK_FALSE(rkl_empty(&c)))
break;
CHECK_EQ(rkl_pop(&k, true), l[i]);
CHECK_EQ(rkl_pop(&c, false), l[1 + n - i]);
}
}
CHECK_TRUE(rkl_empty(&k));
CHECK_TRUE(rkl_empty(&c));
rkl_destroy(&k);
rkl_destroy(&c);
txl_free(l);
++tst_cases;
return errors == tst_failed;
}
static bool stochastic(const size_t limit_cases, const size_t limit_loops) {
for (unsigned loop = 0; tst_cases < limit_cases || loop < limit_loops; ++loop)
for (unsigned width = 2; width < 10; ++width)
for (unsigned n = 1; n < width; ++n)
for (unsigned prev = 1, start = 0, t; start < 4242; t = start + prev, prev = start, start = t)
if (!stochastic_pass(start, 1u << width, 1u << n) || tst_failed > 42) {
puts("bailout\n");
return false;
}
return true;
}
/*-----------------------------------------------------------------------------*/
static bool bit(size_t set, size_t n) {
assert(n < CHAR_BIT * sizeof(set));
return (set >> n) & 1;
}
static size_t hamming_weight(size_t v) {
const size_t m1 = (size_t)UINT64_C(0x5555555555555555);
const size_t m2 = (size_t)UINT64_C(0x3333333333333333);
const size_t m4 = (size_t)UINT64_C(0x0f0f0f0f0f0f0f0f);
const size_t h01 = (size_t)UINT64_C(0x0101010101010101);
v -= (v >> 1) & m1;
v = (v & m2) + ((v >> 2) & m2);
v = (v + (v >> 4)) & m4;
return (v * h01) >> (sizeof(v) * 8 - 8);
}
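/* Classic SWAR popcount: fold bit-pairs, then nibbles, then bytes, and let
 * the h01 multiplication accumulate all byte sums into the top byte.
 * E.g. for v == 0xFF: 0xFF -> 0xAA -> 0x44 -> 0x08, and
 * (0x08 * h01) >> 56 == 8. */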
static bool check_hole(const size_t set, const rkl_hole_t hole, size_t *acc) {
const size_t errors = tst_failed;
++tst_iterations;
if (hole.begin > 1)
CHECK_EQ(bit(set, hole.begin - 1), 1);
if (hole.end < CHAR_BIT * sizeof(set))
CHECK_EQ(bit(set, hole.end), 1);
for (size_t n = hole.begin; n < hole.end && n < CHAR_BIT * sizeof(set); n++) {
CHECK_EQ(bit(set, n), 0);
*acc += 1;
}
return errors == tst_failed;
}
static void debug_set(const size_t set, const char *str, int iter_offset) {
#if 1
(void)set;
(void)str;
(void)iter_offset;
#else
printf("\ncase %s+%d: count %zu, holes", str, iter_offset, hamming_weight(~set) - 1);
for (size_t k, i = 1; i < CHAR_BIT * sizeof(set); ++i) {
if (!bit(set, i)) {
printf(" %zu", i);
for (k = i; k < CHAR_BIT * sizeof(set) - 1 && !bit(set, k + 1); ++k)
;
if (k > i) {
printf("-%zu", k);
i = k;
}
}
}
printf("\n");
fflush(nullptr);
#endif
}
static bool check_holes_bothsides(const size_t set, rkl_iter_t const *i) {
const size_t number_of_holes = hamming_weight(~set) - 1;
size_t acc = 0;
rkl_iter_t f = *i;
for (;;) {
rkl_hole_t hole = rkl_hole(&f, false);
if (hole.begin == hole.end)
break;
if (!check_hole(set, hole, &acc))
return false;
if (hole.end >= CHAR_BIT * sizeof(set))
break;
}
rkl_iter_t b = *i;
for (;;) {
rkl_hole_t hole = rkl_hole(&b, true);
if (hole.begin == hole.end)
break;
if (!check_hole(set, hole, &acc))
return false;
if (hole.begin == 1)
break;
}
if (!CHECK_EQ(acc, number_of_holes))
return false;
return true;
}
static bool check_holes_fourways(const size_t set, const rkl_t *rkl) {
rkl_iter_t i = rkl_iterator(rkl, false);
int o = 0;
do {
debug_set(set, "initial-forward", o++);
if (!check_holes_bothsides(set, &i))
return false;
} while (rkl_turn(&i, false));
do {
debug_set(set, "recoil-reverse", --o);
if (!check_holes_bothsides(set, &i))
return false;
} while (rkl_turn(&i, true));
i = rkl_iterator(rkl, true);
o = 0;
do {
debug_set(set, "initial-reverse", --o);
if (!check_holes_bothsides(set, &i))
return false;
} while (rkl_turn(&i, false));
do {
debug_set(set, "recoil-forward", o++);
if (!check_holes_bothsides(set, &i))
return false;
} while (rkl_turn(&i, true));
return true;
}
static bool stochastic_pass_hole(size_t set, size_t trims) {
const size_t one = 1;
set &= ~one;
if (!set)
return true;
++tst_cases_hole;
rkl_t rkl;
rkl_init(&rkl);
for (size_t n = 1; n < CHAR_BIT * sizeof(set); ++n)
if (bit(set, n))
CHECK_EQ(rkl_push(&rkl, n, false), MDBX_SUCCESS);
if (!check_holes_fourways(set, &rkl))
return false;
while (rkl_len(&rkl) > 1 && trims-- > 0) {
if (flipcoin()) {
const size_t l = (size_t)rkl_pop(&rkl, false);
if (l == 0)
break;
assert(bit(set, l));
set -= one << l;
if (!check_holes_fourways(set, &rkl))
return false;
} else {
const size_t h = (size_t)rkl_pop(&rkl, true);
if (h == 0)
break;
assert(bit(set, h));
set -= one << h;
if (!check_holes_fourways(set, &rkl))
return false;
}
}
return true;
}
static size_t prng_word(void) {
size_t word = (size_t)(prng() >> 32);
if (sizeof(word) > 4)
word = (uint64_t)word << 32 | (size_t)(prng() >> 32);
return word;
}
static bool stochastic_hole(size_t probes) {
for (size_t n = 0; n < probes; ++n) {
size_t set = prng_word();
if (!stochastic_pass_hole(set, prng() % 11))
return false;
if (!stochastic_pass_hole(set & prng_word(), prng() % 11))
return false;
if (!stochastic_pass_hole(set | prng_word(), prng() % 11))
return false;
}
return true;
}
/*-----------------------------------------------------------------------------*/
int main(int argc, const char *argv[]) {
(void)argc;
(void)argv;
#ifndef NDEBUG
// tst_target = 281870;
#endif
prng_state = (uint64_t)time(nullptr);
printf("prng-seed %" PRIu64 "\n", prng_state);
fflush(nullptr);
trivia();
stochastic(42 * 42 * 42, 42);
stochastic_hole(24 * 24 * 24);
printf("done: %zu+%zu cases, %zu iterations, %zu checks ok, %zu checks failed\n", tst_cases, tst_cases_hole,
tst_iterations, tst_ok, tst_failed);
fflush(nullptr);
return tst_failed ? EXIT_FAILURE : EXIT_SUCCESS;
}

View File

@ -460,9 +460,9 @@ int main(int argc, char *const argv[]) {
params.datalen_max = params.datalen_min;
continue;
}
if (config::parse_option(argc, argv, narg, "batch.read", params.batch_read, config::decimal, 1))
if (config::parse_option(argc, argv, narg, "batch.read", params.batch_read, config::no_scale, 1))
continue;
if (config::parse_option(argc, argv, narg, "batch.write", params.batch_write, config::decimal, 1))
if (config::parse_option(argc, argv, narg, "batch.write", params.batch_write, config::no_scale, 1))
continue;
if (config::parse_option(argc, argv, narg, "delay", params.delaystart, config::duration))
continue;

View File

@ -770,7 +770,7 @@ static bool execute_thunk(const actor_config *const_config, const mdbx_pid_t pid
size_t iter = 0;
do {
if (iter) {
prng_salt(iter);
prng_seed(config.params.prng_seed += INT32_C(0xA4F4D37B));
log_verbose("turn PRNG to %u", config.params.prng_seed);
}
iter++;

View File

@ -263,8 +263,8 @@ public:
}
static bool review_params(actor_params &params, unsigned space_id) {
(void)space_id;
// silently fix key/data length for fixed-length modes
params.prng_seed += bleach32(space_id);
if ((params.table_flags & MDBX_INTEGERKEY) && params.keylen_min != params.keylen_max)
params.keylen_min = params.keylen_max;
if ((params.table_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && params.datalen_min != params.datalen_max)

View File

@ -124,8 +124,7 @@ void prng_fill(uint64_t &state, void *ptr, size_t bytes) {
/* __thread */ uint64_t prng_state;
void prng_seed(uint64_t seed) { prng_state = seed; }
void prng_salt(unsigned salt) { prng_state += bleach32(salt) * UINT64_C(0xD14A2783862DAB); }
void prng_seed(uint64_t seed) { prng_state = bleach64(seed); }
uint32_t prng32(void) { return prng32_white(prng_state); }

View File

@ -313,7 +313,6 @@ void prng_fill(uint64_t &state, void *ptr, size_t bytes);
extern uint64_t prng_state;
void prng_seed(uint64_t seed);
void prng_salt(unsigned salt);
uint32_t prng32(void);
uint64_t prng64(void);
void prng_fill(void *ptr, size_t bytes);