mirror of
https://gitflic.ru/project/erthink/libmdbx.git
synced 2025-05-15 07:18:28 +00:00
Compare commits
198 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
9fb0919468 | ||
|
a13147d115 | ||
|
800c96f22f | ||
|
d1023dc6b5 | ||
|
859c350df0 | ||
|
76e2544cc0 | ||
|
0a96b2ad97 | ||
|
402a8e62be | ||
|
06300de34e | ||
|
da9f78d2f6 | ||
|
a5af0c1a85 | ||
|
2b36fd5974 | ||
|
3338551860 | ||
|
1c7a5e18fe | ||
|
6627d14edf | ||
|
7db9c40fe0 | ||
|
52c9ef8807 | ||
|
5c44dd201c | ||
|
f4384800b5 | ||
|
011c3072da | ||
|
02b56e185f | ||
|
576fc94fef | ||
|
a56f5acc3d | ||
|
072103ab67 | ||
|
668a1e42e3 | ||
|
dc747483dd | ||
|
89de43293d | ||
|
270cf399aa | ||
|
b5503b5670 | ||
|
a71cefc288 | ||
|
6d6a19e3c3 | ||
|
0d7d4db3f1 | ||
|
0f505c1377 | ||
|
f6ce9381af | ||
|
2ceda89b05 | ||
|
5bd99d4da2 | ||
|
a04053ee98 | ||
|
f35c1fe5bc | ||
|
4691c0b5c8 | ||
|
f91c2bb8da | ||
|
6cb1b6754e | ||
|
187bd59aa0 | ||
|
1c49548ea5 | ||
|
4b9427685a | ||
|
650569cc6a | ||
|
d8f46344b5 | ||
|
ebf1e9d8ba | ||
|
4c3df230d3 | ||
|
9ea8e9b2cf | ||
|
b8c1b835ed | ||
|
db163cbcfd | ||
|
936c25e671 | ||
|
b308559dd9 | ||
|
b4e65f5d21 | ||
|
390490edf4 | ||
|
94531a9cdc | ||
|
f8e332a205 | ||
|
021d83b841 | ||
|
4e33bad6e7 | ||
|
a313dd2fae | ||
|
2e4962a2f3 | ||
|
00917f8c96 | ||
|
999f8644f6 | ||
|
06f8573f5f | ||
|
7eb7931a23 | ||
|
e37194affe | ||
|
917e2827f5 | ||
|
2fd1772503 | ||
|
694626727f | ||
|
2aa47f20c3 | ||
|
e6891b295b | ||
|
c0b1ab1466 | ||
|
71d95d1a5f | ||
|
7a923b3d41 | ||
|
8008afc6e1 | ||
|
7ae11e0fdb | ||
|
5c1745a7cd | ||
|
23a417fe19 | ||
|
db44f4ed71 | ||
|
ef9fd1f3fb | ||
|
2e6d9fd4d4 | ||
|
83e42d03bb | ||
|
dfd265d46f | ||
|
08d10ad0a1 | ||
|
8ebedde181 | ||
|
dcf35e5306 | ||
|
aeac971f0b | ||
|
6c8047a402 | ||
|
438d185250 | ||
|
ee6843062d | ||
|
70adf71770 | ||
|
fa2c27fa08 | ||
|
7a72d1b273 | ||
|
3e91500fac | ||
|
546b48b6eb | ||
|
2ffa5cf371 | ||
|
b546dc69d2 | ||
|
42706c45a0 | ||
|
8dda33329b | ||
|
b2bd8bae38 | ||
|
1299653457 | ||
|
333069e7a8 | ||
|
436998ca83 | ||
|
b0665f7016 | ||
|
4fcfb07b97 | ||
|
ca30365d3b | ||
|
6424747636 | ||
|
183610b050 | ||
|
920d9b5b2f | ||
|
283c962fea | ||
|
8efcdeae9d | ||
|
9c161cdafd | ||
|
a3265e11dc | ||
|
709d524d21 | ||
|
e0843429a1 | ||
|
329eee4e4f | ||
|
4fd165f8d2 | ||
|
05e7a94619 | ||
|
826cdb708f | ||
|
da24fda578 | ||
|
0fa21a3c0d | ||
|
dd9f608320 | ||
|
28ca18972a | ||
|
fbb93f9cfb | ||
|
bc464521c0 | ||
|
9273e2ee60 | ||
|
e035f102ab | ||
|
1240ed2ba3 | ||
|
6ca63b46d8 | ||
|
9fee0bc3a6 | ||
|
c14bb7814f | ||
|
9b31c517e6 | ||
|
66c747e4a9 | ||
|
54d8c0d290 | ||
|
26cd5ebc43 | ||
|
806f819bae | ||
|
05cdf9d202 | ||
|
818740976b | ||
|
287bab36a1 | ||
|
5388d2273b | ||
|
d2864029da | ||
|
b63ca3c12e | ||
|
4730abe3e5 | ||
|
401454dadf | ||
|
9568209ee4 | ||
|
781c04f6e2 | ||
|
b7206c68a5 | ||
|
3a0b857e1d | ||
|
6ccbce9afc | ||
|
9d7495fa09 | ||
|
c8f6d90e18 | ||
|
778aee25fe | ||
|
cb8eec6d11 | ||
|
f6d91b3c5b | ||
|
750fab2427 | ||
|
fffa78d912 | ||
|
fc85d1c61f | ||
|
340bd080c9 | ||
|
7074b94b2e | ||
|
f39542a9f0 | ||
|
d89670bcea | ||
|
fce40169bd | ||
|
560aa72f3d | ||
|
cb7ba6b53f | ||
|
1b9ad144ea | ||
|
0233eda949 | ||
|
78552a5c84 | ||
|
beb5a81d12 | ||
|
56d1dbef45 | ||
|
761248cc21 | ||
|
72fb45e13d | ||
|
e529cd7d19 | ||
|
2c3b36da64 | ||
|
314b8ce1f0 | ||
|
7e772114bc | ||
|
0accf98ff7 | ||
|
e4054b56c3 | ||
|
950db52fe8 | ||
|
380385c1db | ||
|
10e7e5c899 | ||
|
6d92a778a5 | ||
|
c60f6afe5f | ||
|
a5bb555db3 | ||
|
b9b784c18e | ||
|
c6cd482ea0 | ||
|
2b9401e372 | ||
|
6fe7baa1b8 | ||
|
1e5fef2c76 | ||
|
0a4156fe6f | ||
|
a89d418c91 | ||
|
585ccdf716 | ||
|
81e2623a54 | ||
|
b681b59434 | ||
|
67460dd0fd | ||
|
3a1ac35009 | ||
|
3c60e1e94c | ||
|
a994a9bbcc | ||
|
84e2c70b98 |
@ -132,6 +132,8 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/preface.h"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/proto.h"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/refund.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/rkl.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/rkl.h"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/sort.h"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.h"
|
||||
@ -149,6 +151,9 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree-ops.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.h"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn-basal.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn-nested.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn-ro.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn.c"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/unaligned.h"
|
||||
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.c"
|
||||
@ -829,6 +834,8 @@ else()
|
||||
"${MDBX_SOURCE_DIR}/preface.h"
|
||||
"${MDBX_SOURCE_DIR}/proto.h"
|
||||
"${MDBX_SOURCE_DIR}/refund.c"
|
||||
"${MDBX_SOURCE_DIR}/rkl.c"
|
||||
"${MDBX_SOURCE_DIR}/rkl.h"
|
||||
"${MDBX_SOURCE_DIR}/sort.h"
|
||||
"${MDBX_SOURCE_DIR}/spill.c"
|
||||
"${MDBX_SOURCE_DIR}/spill.h"
|
||||
@ -838,6 +845,9 @@ else()
|
||||
"${MDBX_SOURCE_DIR}/tree-ops.c"
|
||||
"${MDBX_SOURCE_DIR}/txl.c"
|
||||
"${MDBX_SOURCE_DIR}/txl.h"
|
||||
"${MDBX_SOURCE_DIR}/txn-basal.c"
|
||||
"${MDBX_SOURCE_DIR}/txn-nested.c"
|
||||
"${MDBX_SOURCE_DIR}/txn-ro.c"
|
||||
"${MDBX_SOURCE_DIR}/txn.c"
|
||||
"${MDBX_SOURCE_DIR}/unaligned.h"
|
||||
"${MDBX_SOURCE_DIR}/utils.c"
|
||||
|
1067
ChangeLog-01.md
Normal file
1067
ChangeLog-01.md
Normal file
File diff suppressed because it is too large
Load Diff
1540
ChangeLog.md
1540
ChangeLog.md
File diff suppressed because it is too large
Load Diff
@ -634,11 +634,12 @@ docs/usage.md: docs/__usage.md docs/_starting.md docs/__bindings.md
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)echo -e "\\page usage Usage\n\\section getting Building & Embedding" | cat - $^ | $(SED) 's/^Bindings$$/Bindings {#bindings}/' >$@
|
||||
|
||||
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md COPYRIGHT LICENSE NOTICE $(lastword $(MAKEFILE_LIST))
|
||||
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md COPYRIGHT LICENSE NOTICE docs/favicon.ico docs/manifest.webmanifest docs/ld+json $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' RUNNING doxygen...'
|
||||
$(QUIET)rm -rf docs/html && \
|
||||
cat mdbx.h | tr '\n' '\r' | $(SED) -e 's/LIBMDBX_INLINE_API\s*(\s*\([^,]\+\),\s*\([^,]\+\),\s*(\s*\([^)]\+\)\s*)\s*)\s*{/inline \1 \2(\3) {/g' | tr '\r' '\n' >docs/mdbx.h && \
|
||||
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp COPYRIGHT LICENSE NOTICE docs/html/
|
||||
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp COPYRIGHT LICENSE NOTICE docs/favicon.ico docs/manifest.webmanifest docs/html/ && \
|
||||
$(SED) -i docs/html/index.html -e '/\/MathJax.js"><\/script>/r docs/ld+json' -e 's/<title>libmdbx: Overall<\/title>//;T;r docs/title'
|
||||
|
||||
mdbx++-dylib.o: src/config.h src/mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' CC $@'
|
||||
@ -721,6 +722,7 @@ $(DIST_DIR)/@tmp-internals.inc: $(DIST_DIR)/@tmp-essentials.inc src/version.c $(
|
||||
-e '/#include "essentials.h"/d' \
|
||||
-e '/#include "atomics-ops.h"/r src/atomics-ops.h' \
|
||||
-e '/#include "proto.h"/r src/proto.h' \
|
||||
-e '/#include "rkl.h"/r src/rkl.h' \
|
||||
-e '/#include "txl.h"/r src/txl.h' \
|
||||
-e '/#include "unaligned.h"/r src/unaligned.h' \
|
||||
-e '/#include "cogs.h"/r src/cogs.h' \
|
||||
|
60
README.md
60
README.md
@ -1,18 +1,5 @@
|
||||
<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
|
||||
|
||||
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
|
||||
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
|
||||
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
|
||||
|
||||
> Questions, feedback and suggestions are welcome to the [Telegram' group](https://t.me/libmdbx) (archive [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
|
||||
> [2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
|
||||
> [5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html)).
|
||||
> See the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md) for `NEWS` and latest updates.
|
||||
|
||||
> Donations are welcome to the Ethereum/ERC-20 `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
|
||||
> Всё будет хорошо!
|
||||
|
||||
|
||||
libmdbx
|
||||
========
|
||||
|
||||
@ -39,32 +26,44 @@ tree](https://en.wikipedia.org/wiki/B%2B_tree).
|
||||
[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging), but that might
|
||||
be a caveat for write-intensive workloads with durability requirements.
|
||||
|
||||
4. **Compact and friendly for fully embedding**. Only ≈25KLOC of `C11`,
|
||||
≈64K x86 binary code of core, no internal threads neither server process(es),
|
||||
but implements a simplified variant of the [Berkeley
|
||||
DB](https://en.wikipedia.org/wiki/Berkeley_DB) and
|
||||
[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API.
|
||||
|
||||
5. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for
|
||||
4. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for
|
||||
writers just by single
|
||||
[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords
|
||||
[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom)
|
||||
for parallel readers without atomic/interlocked operations, while
|
||||
**writing and reading transactions do not block each other**.
|
||||
|
||||
6. **Guarantee data integrity** after crash unless this was explicitly
|
||||
5. **Guarantee data integrity** after crash unless this was explicitly
|
||||
neglected in favour of write performance.
|
||||
|
||||
7. Supports Linux, Windows, MacOS, Android, iOS, FreeBSD, DragonFly, Solaris,
|
||||
6. Supports Linux, Windows, MacOS, Android, iOS, FreeBSD, DragonFly, Solaris,
|
||||
OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with
|
||||
**POSIX.1-2008**.
|
||||
|
||||
7. **Compact and friendly for fully embedding**. Only ≈25KLOC of `C11`,
|
||||
≈64K x86 binary code of core, no internal threads neither server process(es),
|
||||
but implements a simplified variant of the [Berkeley
|
||||
DB](https://en.wikipedia.org/wiki/Berkeley_DB) and
|
||||
[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API.
|
||||
|
||||
<!-- section-end -->
|
||||
|
||||
Historically, _libmdbx_ is a deeply revised and extended descendant of the amazing
|
||||
Historically, _libmdbx_ is a deeply revised and extended descendant of the legendary
|
||||
[Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
|
||||
_libmdbx_ inherits all benefits from _LMDB_, but resolves some issues and adds [a set of improvements](#improvements-beyond-lmdb).
|
||||
|
||||
[](https://t.me/libmdbx)
|
||||
|
||||
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
|
||||
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
|
||||
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
|
||||
> Donations are welcome to the Ethereum/ERC-20 `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
|
||||
> Всё будет хорошо!
|
||||
|
||||
Telegram Group archive: [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
|
||||
[2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
|
||||
[5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html).
|
||||
|
||||
## Github
|
||||
|
||||
### на Русском (мой родной язык)
|
||||
@ -126,8 +125,7 @@ of the database. All fundamental architectural problems of libmdbx/LMDB
|
||||
have been solved there, but now the active development has been
|
||||
suspended for top-three reasons:
|
||||
|
||||
1. For now _libmdbx_ «mostly» enough for all [our products](https://www.ptsecurity.com/ww-en/products/),
|
||||
and I’m busy in development of replication for scalability.
|
||||
1. For now _libmdbx_ mostly enough and I’m busy for scalability.
|
||||
2. Waiting for fresh [Elbrus CPU](https://wiki.elbrus.ru/) of [e2k architecture](https://en.wikipedia.org/wiki/Elbrus_2000),
|
||||
especially with hardware acceleration of [Streebog](https://en.wikipedia.org/wiki/Streebog) and
|
||||
[Kuznyechik](https://en.wikipedia.org/wiki/Kuznyechik), which are required for Merkle tree, etc.
|
||||
@ -556,9 +554,9 @@ Of course, in addition to this, your toolchain must ensure the reproducibility o
|
||||
For more information please refer to [reproducible-builds.org](https://reproducible-builds.org/).
|
||||
|
||||
#### Containers
|
||||
There are no special traits nor quirks if you use libmdbx ONLY inside the single container.
|
||||
But in a cross-container cases or with a host-container(s) mix the two major things MUST be
|
||||
guaranteed:
|
||||
There are no special traits nor quirks if you use _libmdbx_ ONLY inside
|
||||
the single container. But in a cross-container(s) or with a host-container(s)
|
||||
interoperability cases the three major things MUST be guaranteed:
|
||||
|
||||
1. Coherence of memory mapping content and unified page cache inside OS
|
||||
kernel for host and all container(s) operated with a DB. Basically this
|
||||
@ -574,6 +572,12 @@ in the system memory.
|
||||
including `ERROR_ACCESS_DENIED`,
|
||||
but not the `ERROR_INVALID_PARAMETER` as for an invalid/non-existent PID.
|
||||
|
||||
3. The versions/builds of _libmdbx_ and `libc`/`pthreads` (`glibc`, `musl`, etc) must be be compatible.
|
||||
- Basically, the `options:` string in the output of `mdbx_chk -V` must be the same for host and container(s).
|
||||
See `MDBX_LOCKING`, `MDBX_USE_OFDLOCKS` and other build options for details.
|
||||
- Avoid using different versions of `libc`, especially mixing different implementations, i.e. `glibc` with `musl`, etc.
|
||||
Prefer to use the same LTS version, or switch to full virtualization/isolation if in doubt.
|
||||
|
||||
#### DSO/DLL unloading and destructors of Thread-Local-Storage objects
|
||||
When building _libmdbx_ as a shared library or use static _libmdbx_ as a
|
||||
part of another dynamic library, it is advisable to make sure that your
|
||||
|
21
TODO.md
21
TODO.md
@ -1,16 +1,16 @@
|
||||
TODO
|
||||
----
|
||||
|
||||
Unfortunately, on 2022-04-15 the Github administration, without any
|
||||
warning nor explanation, deleted _libmdbx_ along with a lot of other
|
||||
projects, simultaneously blocking access for many developers. Therefore
|
||||
on 2022-04-21 we have migrated to a reliable trusted infrastructure.
|
||||
The origin for now is at[GitFlic](https://gitflic.ru/project/erthink/libmdbx)
|
||||
with backup at [ABF by ROSA Лаб](https://abf.rosalinux.ru/erthink/libmdbx).
|
||||
For the same reason ~~Github~~ is blacklisted forever.
|
||||
|
||||
So currently most of the links are broken due to noted malicious ~~Github~~ sabotage.
|
||||
|
||||
- [SWIG](https://www.swig.org/).
|
||||
- Параллельная lto-сборка с устранением предупреждений.
|
||||
- Интеграция c DTrace и аналогами.
|
||||
- Новый стиль обработки ошибок с записью "трассы" и причин.
|
||||
- Формирование отладочной информации посредством gdb.
|
||||
- Поддержка WASM.
|
||||
- Ранняя/не-отложенная очистка GC.
|
||||
- Явная и автоматические уплотнение/дефрагментация.
|
||||
- Нелинейная обработка GC.
|
||||
- Перевести курсоры на двусвязный список вместо односвязного.
|
||||
- Внутри `txn_renew()` вынести проверку когерентности mmap за/после изменение размера.
|
||||
- [Migration guide from LMDB to MDBX](https://libmdbx.dqdkfa.ru/dead-github/issues/199).
|
||||
- [Support for RAW devices](https://libmdbx.dqdkfa.ru/dead-github/issues/124).
|
||||
@ -20,6 +20,7 @@ So currently most of the links are broken due to noted malicious ~~Github~~ sabo
|
||||
Done
|
||||
----
|
||||
|
||||
- Рефакторинг gc-get/gc-put c переходом на "интервальные" списки.
|
||||
- [Engage new terminology](https://libmdbx.dqdkfa.ru/dead-github/issues/137).
|
||||
- [More flexible support of asynchronous runtime/framework(s)](https://libmdbx.dqdkfa.ru/dead-github/issues/200).
|
||||
- [Move most of `mdbx_chk` functional to the library API](https://libmdbx.dqdkfa.ru/dead-github/issues/204).
|
||||
|
734
docs/Doxyfile.in
734
docs/Doxyfile.in
File diff suppressed because it is too large
Load Diff
@ -54,7 +54,7 @@ cleans readers, as an a process aborting (especially with core dump) can
|
||||
take a long time, and checking readers cannot be performed too often due
|
||||
to performance degradation.
|
||||
|
||||
This issue will be addressed in MithrlDB and one of libmdbx releases,
|
||||
This issue will be addressed in MithrilDB and one of libmdbx releases,
|
||||
presumably in 2025. To do this, nonlinear GC recycling will be
|
||||
implemented, without stopping garbage recycling on the old MVCC snapshot
|
||||
used by a long read transaction.
|
||||
@ -92,7 +92,7 @@ free consecutive/adjacent pages through GC has been significantly
|
||||
speeded, including acceleration using NOEN/SSE2/AVX2/AVX512
|
||||
instructions.
|
||||
|
||||
This issue will be addressed in MithrlDB and refined within one of
|
||||
This issue will be addressed in MithrilDB and refined within one of
|
||||
0.15.x libmdbx releases, presumably at end of 2025.
|
||||
|
||||
|
||||
|
@ -2,7 +2,10 @@ The source code is availale on [Gitflic](https://gitflic.ru/project/erthink/libm
|
||||
Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
|
||||
Всё будет хорошо!
|
||||
|
||||
> Questions, feedback and suggestions are welcome to the [Telegram' group](https://t.me/libmdbx).
|
||||
> Questions, feedback and suggestions are welcome to the [Telegram' group](https://t.me/libmdbx) (archive [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
|
||||
> [2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
|
||||
> [5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html)).
|
||||
> See the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md) for `NEWS` and latest updates.
|
||||
|
||||
\section toc Table of Contents
|
||||
|
||||
|
BIN
docs/favicon.ico
Normal file
BIN
docs/favicon.ico
Normal file
Binary file not shown.
After Width: | Height: | Size: 4.2 KiB |
@ -1,10 +1,17 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<!DOCTYPE html>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="$langISO">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=11"/>
|
||||
<meta name="generator" content="Doxygen $doxygenversion"/>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<link rel="icon" href="favicon.ico">
|
||||
<link rel="icon" href="img/bear.png" type="image/png">
|
||||
<link rel="apple-touch-icon" href="img/bear.png">
|
||||
<meta property="og:type" content="article"/>
|
||||
<meta property="og:url" content="https://libmdbx.dqdkfa.ru/"/>
|
||||
<meta name="twitter:title" content="One of the fastest embeddable key-value engine"/>
|
||||
<meta name="twitter:description" content="MDBX surpasses the legendary LMDB in terms of reliability, features and performance. For now libmdbx is chosen by all modern Ethereum frontiers as a storage engine."/>
|
||||
<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
|
||||
<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
|
||||
<!--BEGIN PROJECT_ICON-->
|
||||
|
27
docs/ld+json
Normal file
27
docs/ld+json
Normal file
@ -0,0 +1,27 @@
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@context": "https://schema.org",
|
||||
"@type": "ItemList",
|
||||
"itemListElement": [{
|
||||
"@type": "ListItem",
|
||||
"position": 1,
|
||||
"name": "Группа в Telegram",
|
||||
"url": "https://t.me/libmdbx"
|
||||
},{
|
||||
"@type": "ListItem",
|
||||
"position": 2,
|
||||
"name": "Исходный код",
|
||||
"url": "https://gitflic.ru/project/erthink/libmdbx"
|
||||
},{
|
||||
"@type": "ListItem",
|
||||
"position": 3,
|
||||
"name": "C++ API",
|
||||
"url": "https://libmdbx.dqdkfa.ru/group__cxx__api.html"
|
||||
},{
|
||||
"@type": "ListItem",
|
||||
"position": 4,
|
||||
"name": "Mirror on Github",
|
||||
"url": "https://github.com/erthink/libmdbx"
|
||||
}]
|
||||
}
|
||||
</script>
|
6
docs/manifest.webmanifest
Normal file
6
docs/manifest.webmanifest
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"icons": [
|
||||
{ "src": "favicon.ico", "type": "image/ico", "sizes": "32x32" },
|
||||
{ "src": "img/bear.png", "type": "image/png", "sizes": "256x256" }
|
||||
]
|
||||
}
|
2
docs/title
Normal file
2
docs/title
Normal file
@ -0,0 +1,2 @@
|
||||
<title>libmdbx: One of the fastest embeddable key-value engine</title>
|
||||
<meta name="description" content="libmdbx surpasses the legendary LMDB in terms of reliability, features and performance. For now libmdbx is chosen by all modern Ethereum frontiers as a storage engine.">
|
18
mdbx.h
18
mdbx.h
@ -581,9 +581,10 @@ typedef mode_t mdbx_mode_t;
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* MDBX version 0.13.x */
|
||||
/* MDBX version 0.14.x, but it is unstable/under-development yet. */
|
||||
#define MDBX_VERSION_UNSTABLE
|
||||
#define MDBX_VERSION_MAJOR 0
|
||||
#define MDBX_VERSION_MINOR 13
|
||||
#define MDBX_VERSION_MINOR 14
|
||||
|
||||
#ifndef LIBMDBX_API
|
||||
#if defined(LIBMDBX_EXPORTS) || defined(DOXYGEN)
|
||||
@ -2774,10 +2775,10 @@ typedef struct MDBX_stat MDBX_stat;
|
||||
* Legacy mdbx_env_stat() correspond to calling \ref mdbx_env_stat_ex() with the
|
||||
* null `txn` argument.
|
||||
*
|
||||
* \param [in] env An environment handle returned by \ref mdbx_env_create()
|
||||
* \param [in] txn A transaction handle returned by \ref mdbx_txn_begin()
|
||||
* \param [in] env An environment handle returned by \ref mdbx_env_create().
|
||||
* \param [in] txn A transaction handle returned by \ref mdbx_txn_begin().
|
||||
* \param [out] stat The address of an \ref MDBX_stat structure where
|
||||
* the statistics will be copied
|
||||
* the statistics will be copied.
|
||||
* \param [in] bytes The size of \ref MDBX_stat.
|
||||
*
|
||||
* \returns A non-zero error value on failure and 0 on success. */
|
||||
@ -4196,7 +4197,10 @@ LIBMDBX_API int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency);
|
||||
* \returns A non-zero error value on failure and 0 on success,
|
||||
* some possible errors are:
|
||||
* \retval MDBX_RESULT_TRUE Transaction was aborted since it should
|
||||
* be aborted due to previous errors.
|
||||
* be aborted due to previous errors,
|
||||
* either no changes were made during the transaction,
|
||||
* and the build time option
|
||||
* \ref MDBX_NOSUCCESS_PURE_COMMIT was enabled.
|
||||
* \retval MDBX_PANIC A fatal error occurred earlier
|
||||
* and the environment must be shut down.
|
||||
* \retval MDBX_BAD_TXN Transaction is already finished or never began.
|
||||
@ -6538,6 +6542,8 @@ typedef struct MDBX_chk_table {
|
||||
struct MDBX_chk_histogram key_len;
|
||||
/// Values length histogram
|
||||
struct MDBX_chk_histogram val_len;
|
||||
/// Number of multi-values (aka duplicates) histogram
|
||||
struct MDBX_chk_histogram multival;
|
||||
} histogram;
|
||||
} MDBX_chk_table_t;
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
From 49256dcd050fd0ee67860b7bc544dabe088d08e9 Mon Sep 17 00:00:00 2001
|
||||
From 349c08cf21b66ecea851340133a1b845c25675f7 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?=
|
||||
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= <leo@yuriev.ru>
|
||||
Date: Fri, 14 Feb 2025 21:34:25 +0300
|
||||
Date: Tue, 22 Apr 2025 14:38:49 +0300
|
||||
Subject: [PATCH] package/libmdbx: new package (library/database).
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
@ -15,7 +15,7 @@ This patch adds libmdbx:
|
||||
in terms of reliability, features and performance.
|
||||
- more information at https://libmdbx.dqdkfa.ru
|
||||
|
||||
The 0.13.4 "Sigma Boy" is stable release of _libmdbx_ branch with new superior features.
|
||||
The 0.13.6 "Бузина" (Elderberry) is stable release of _libmdbx_ branch with new superior features.
|
||||
|
||||
The complete ChangeLog: https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md
|
||||
|
||||
@ -110,19 +110,19 @@ index 0000000000..a9a4ac45c5
|
||||
+ !BR2_TOOLCHAIN_GCC_AT_LEAST_4_4
|
||||
diff --git a/package/libmdbx/libmdbx.hash b/package/libmdbx/libmdbx.hash
|
||||
new file mode 100644
|
||||
index 0000000000..202937e7be
|
||||
index 0000000000..ae5266716b
|
||||
--- /dev/null
|
||||
+++ b/package/libmdbx/libmdbx.hash
|
||||
@@ -0,0 +1,6 @@
|
||||
+# Hashes from: https://libmdbx.dqdkfa.ru/release/SHA256SUMS
|
||||
+sha256 86df30ca2231c9b3ad71424bb829dca9041947f5539d4295030c653d4982c1be libmdbx-amalgamated-0.13.4.tar.xz
|
||||
+sha256 57db987de6f7ccc66a66ae28a7bda9f9fbb48ac5fb9279bcca92fd5de13075d1 libmdbx-amalgamated-0.13.6.tar.xz
|
||||
+
|
||||
+# Locally calculated
|
||||
+sha256 0d542e0c8804e39aa7f37eb00da5a762149dc682d7829451287e11b938e94594 LICENSE
|
||||
+sha256 699a62986b6c8d31124646dffd4b15872c7d3bc5eecea5994edb1f5195df49d1 NOTICE
|
||||
+sha256 651f71b46c6bb0046d2122df7f9def9cb24f4dc28c5b11cef059f66565cda30f NOTICE
|
||||
diff --git a/package/libmdbx/libmdbx.mk b/package/libmdbx/libmdbx.mk
|
||||
new file mode 100644
|
||||
index 0000000000..a8a6f3dbdf
|
||||
index 0000000000..571757262e
|
||||
--- /dev/null
|
||||
+++ b/package/libmdbx/libmdbx.mk
|
||||
@@ -0,0 +1,42 @@
|
||||
@ -132,7 +132,7 @@ index 0000000000..a8a6f3dbdf
|
||||
+#
|
||||
+################################################################################
|
||||
+
|
||||
+LIBMDBX_VERSION = 0.13.4
|
||||
+LIBMDBX_VERSION = 0.13.6
|
||||
+LIBMDBX_SOURCE = libmdbx-amalgamated-$(LIBMDBX_VERSION).tar.xz
|
||||
+LIBMDBX_SITE = https://libmdbx.dqdkfa.ru/release
|
||||
+LIBMDBX_SUPPORTS_IN_SOURCE_BUILD = NO
|
||||
@ -169,5 +169,5 @@ index 0000000000..a8a6f3dbdf
|
||||
+
|
||||
+$(eval $(cmake-package))
|
||||
--
|
||||
2.48.1
|
||||
2.49.0
|
||||
|
||||
|
@ -41,12 +41,16 @@
|
||||
#include "page-ops.c"
|
||||
#include "pnl.c"
|
||||
#include "refund.c"
|
||||
#include "rkl.c"
|
||||
#include "spill.c"
|
||||
#include "table.c"
|
||||
#include "tls.c"
|
||||
#include "tree-ops.c"
|
||||
#include "tree-search.c"
|
||||
#include "txl.c"
|
||||
#include "txn-basal.c"
|
||||
#include "txn-nested.c"
|
||||
#include "txn-ro.c"
|
||||
#include "txn.c"
|
||||
#include "utils.c"
|
||||
#include "version.c"
|
||||
|
@ -73,6 +73,7 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
|
||||
|
||||
mc->next = txn->cursors[dbi];
|
||||
txn->cursors[dbi] = mc;
|
||||
txn->flags |= txn_may_have_cursors;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -488,39 +488,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_env_flags
|
||||
}
|
||||
|
||||
if ((flags & MDBX_RDONLY) == 0) {
|
||||
MDBX_txn *txn = nullptr;
|
||||
const intptr_t bitmap_bytes =
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT;
|
||||
#else
|
||||
0;
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
const size_t base = sizeof(MDBX_txn) + sizeof(cursor_couple_t);
|
||||
const size_t size = base + bitmap_bytes +
|
||||
env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_seqs[0]) +
|
||||
sizeof(txn->dbi_state[0]));
|
||||
|
||||
txn = osal_calloc(1, size);
|
||||
if (unlikely(!txn)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
goto bailout;
|
||||
}
|
||||
txn->dbs = ptr_disp(txn, base);
|
||||
txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0]));
|
||||
txn->dbi_seqs = ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0]));
|
||||
txn->dbi_state = ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0]));
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
txn->env = env;
|
||||
txn->flags = MDBX_TXN_FINISHED;
|
||||
env->basal_txn = txn;
|
||||
txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL);
|
||||
txn->tw.repnl = pnl_alloc(MDBX_PNL_INITIAL);
|
||||
if (unlikely(!txn->tw.retired_pages || !txn->tw.repnl)) {
|
||||
env->basal_txn = txn_basal_create(env->max_dbi);
|
||||
if (unlikely(!env->basal_txn)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
goto bailout;
|
||||
}
|
||||
env->basal_txn->env = env;
|
||||
env_options_adjust_defaults(env);
|
||||
}
|
||||
|
||||
@ -716,7 +689,7 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
|
||||
#endif
|
||||
}
|
||||
|
||||
*troika = (txn && !(txn->flags & MDBX_TXN_RDONLY)) ? txn->tw.troika : meta_tap(env);
|
||||
*troika = (txn && !(txn->flags & MDBX_TXN_RDONLY)) ? txn->wr.troika : meta_tap(env);
|
||||
const meta_ptr_t head = meta_recent(env, troika);
|
||||
const meta_t *const meta0 = METAPAGE(env, 0);
|
||||
const meta_t *const meta1 = METAPAGE(env, 1);
|
||||
@ -979,16 +952,16 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return LOG_IFERR(err);
|
||||
should_unlock = true;
|
||||
env->basal_txn->tw.troika = meta_tap(env);
|
||||
env->basal_txn->wr.troika = meta_tap(env);
|
||||
eASSERT(env, !env->txn && !env->basal_txn->nested);
|
||||
env->basal_txn->txnid = env->basal_txn->tw.troika.txnid[env->basal_txn->tw.troika.recent];
|
||||
txn_snapshot_oldest(env->basal_txn);
|
||||
env->basal_txn->txnid = env->basal_txn->wr.troika.txnid[env->basal_txn->wr.troika.recent];
|
||||
txn_gc_detent(env->basal_txn);
|
||||
}
|
||||
|
||||
/* get untouched params from current TXN or DB */
|
||||
if (pagesize <= 0 || pagesize >= INT_MAX)
|
||||
pagesize = env->ps;
|
||||
const geo_t *const geo = env->txn ? &env->txn->geo : &meta_recent(env, &env->basal_txn->tw.troika).ptr_c->geometry;
|
||||
const geo_t *const geo = env->txn ? &env->txn->geo : &meta_recent(env, &env->basal_txn->wr.troika).ptr_c->geometry;
|
||||
if (size_lower < 0)
|
||||
size_lower = pgno2bytes(env, geo->lower);
|
||||
if (size_now < 0)
|
||||
@ -1203,7 +1176,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
|
||||
meta_t meta;
|
||||
memset(&meta, 0, sizeof(meta));
|
||||
if (!env->txn) {
|
||||
const meta_ptr_t head = meta_recent(env, &env->basal_txn->tw.troika);
|
||||
const meta_ptr_t head = meta_recent(env, &env->basal_txn->wr.troika);
|
||||
|
||||
uint64_t timestamp = 0;
|
||||
while ("workaround for "
|
||||
@ -1297,7 +1270,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
|
||||
env->txn->flags |= MDBX_TXN_DIRTY;
|
||||
} else {
|
||||
meta.geometry = new_geo;
|
||||
rc = dxb_sync_locked(env, env->flags, &meta, &env->basal_txn->tw.troika);
|
||||
rc = dxb_sync_locked(env, env->flags, &meta, &env->basal_txn->wr.troika);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
env->geo_in_bytes.now = pgno2bytes(env, new_geo.now = meta.geometry.now);
|
||||
env->geo_in_bytes.upper = pgno2bytes(env, new_geo.upper = meta.geometry.upper);
|
||||
|
@ -147,6 +147,9 @@ void env_options_adjust_dp_limit(MDBX_env *env) {
|
||||
if (env->options.dp_limit < CURSOR_STACK_SIZE * 4)
|
||||
env->options.dp_limit = CURSOR_STACK_SIZE * 4;
|
||||
}
|
||||
#ifdef MDBX_DEBUG_DPL_LIMIT
|
||||
env->options.dp_limit = MDBX_DEBUG_DPL_LIMIT;
|
||||
#endif /* MDBX_DEBUG_DPL_LIMIT */
|
||||
if (env->options.dp_initial > env->options.dp_limit && env->options.dp_initial > default_dp_initial(env))
|
||||
env->options.dp_initial = env->options.dp_limit;
|
||||
env->options.need_dp_limit_adjust = false;
|
||||
|
@ -411,7 +411,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *
|
||||
}
|
||||
|
||||
if (is_modifable(txn, page)) {
|
||||
if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
|
||||
if (new_data && eq_fast(&present_data, new_data)) {
|
||||
/* если данные совпадают, то ничего делать не надо */
|
||||
*old_data = *new_data;
|
||||
goto bailout;
|
||||
|
747
src/api-txn.c
747
src/api-txn.c
@ -10,10 +10,11 @@ __attribute__((__no_sanitize_thread__, __noinline__))
|
||||
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
|
||||
{
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = check_env(txn->env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR((rc > 0) ? -rc : rc);
|
||||
|
||||
MDBX_env *env = txn->env;
|
||||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) {
|
||||
if (percent)
|
||||
*percent = (int)((txn->geo.first_unallocated * UINT64_C(100) + txn->geo.end_pgno / 2) / txn->geo.end_pgno);
|
||||
@ -21,15 +22,15 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
|
||||
}
|
||||
|
||||
txnid_t lag;
|
||||
troika_t troika = meta_tap(env);
|
||||
troika_t troika = meta_tap(txn->env);
|
||||
do {
|
||||
const meta_ptr_t head = meta_recent(env, &troika);
|
||||
const meta_ptr_t head = meta_recent(txn->env, &troika);
|
||||
if (percent) {
|
||||
const pgno_t maxpg = head.ptr_v->geometry.now;
|
||||
*percent = (int)((head.ptr_v->geometry.first_unallocated * UINT64_C(100) + maxpg / 2) / maxpg);
|
||||
}
|
||||
lag = (head.txnid - txn->txnid) / xMDBX_TXNID_STEP;
|
||||
} while (unlikely(meta_should_retry(env, &troika)));
|
||||
} while (unlikely(meta_should_retry(txn->env, &troika)));
|
||||
|
||||
return (lag > INT_MAX) ? INT_MAX : (int)lag;
|
||||
}
|
||||
@ -55,8 +56,8 @@ MDBX_txn_flags_t mdbx_txn_flags(const MDBX_txn *txn) {
|
||||
assert(0 == (int)(txn->flags & MDBX_TXN_INVALID));
|
||||
|
||||
MDBX_txn_flags_t flags = txn->flags;
|
||||
if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->to.reader &&
|
||||
safe64_read(&txn->to.reader->tid) == MDBX_TID_TXN_OUSTED)
|
||||
if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->ro.slot &&
|
||||
safe64_read(&txn->ro.slot->tid) == MDBX_TID_TXN_OUSTED)
|
||||
flags |= MDBX_TXN_OUSTED;
|
||||
return flags;
|
||||
}
|
||||
@ -66,6 +67,10 @@ int mdbx_txn_reset(MDBX_txn *txn) {
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
rc = check_env(txn->env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
/* This call is only valid for read-only txns */
|
||||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
|
||||
return LOG_IFERR(MDBX_EINVAL);
|
||||
@ -85,8 +90,6 @@ int mdbx_txn_break(MDBX_txn *txn) {
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
if (txn->flags & MDBX_TXN_RDONLY)
|
||||
break;
|
||||
txn = txn->nested;
|
||||
} while (txn);
|
||||
return MDBX_SUCCESS;
|
||||
@ -117,6 +120,11 @@ int mdbx_txn_park(MDBX_txn *txn, bool autounpark) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
rc = check_env(txn->env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
|
||||
return LOG_IFERR(MDBX_TXN_INVALID);
|
||||
|
||||
@ -125,7 +133,7 @@ int mdbx_txn_park(MDBX_txn *txn, bool autounpark) {
|
||||
return LOG_IFERR(rc ? rc : MDBX_OUSTED);
|
||||
}
|
||||
|
||||
return LOG_IFERR(txn_park(txn, autounpark));
|
||||
return LOG_IFERR(txn_ro_park(txn, autounpark));
|
||||
}
|
||||
|
||||
int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
|
||||
@ -133,10 +141,15 @@ int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED - MDBX_TXN_ERROR);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
rc = check_env(txn->env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
if (unlikely(!F_ISSET(txn->flags, MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
rc = txn_unpark(txn);
|
||||
rc = txn_ro_unpark(txn);
|
||||
if (likely(rc != MDBX_OUSTED) || !restart_if_ousted)
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
@ -146,22 +159,24 @@ int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
|
||||
}
|
||||
|
||||
int mdbx_txn_renew(MDBX_txn *txn) {
|
||||
if (unlikely(!txn))
|
||||
return LOG_IFERR(MDBX_EINVAL);
|
||||
int rc = check_txn(txn, 0);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
if (unlikely(txn->signature != txn_signature))
|
||||
return LOG_IFERR(MDBX_EBADSIGN);
|
||||
rc = check_env(txn->env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
|
||||
return LOG_IFERR(MDBX_EINVAL);
|
||||
|
||||
if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) {
|
||||
int rc = mdbx_txn_reset(txn);
|
||||
rc = mdbx_txn_reset(txn);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
int rc = txn_renew(txn, MDBX_TXN_RDONLY);
|
||||
rc = txn_renew(txn, MDBX_TXN_RDONLY);
|
||||
if (rc == MDBX_SUCCESS) {
|
||||
tASSERT(txn, txn->owner == (txn->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self());
|
||||
DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
|
||||
@ -172,7 +187,7 @@ int mdbx_txn_renew(MDBX_txn *txn) {
|
||||
}
|
||||
|
||||
int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) {
|
||||
int rc = check_txn(txn, MDBX_TXN_FINISHED);
|
||||
int rc = check_txn(txn, 0);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
@ -197,6 +212,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, M
|
||||
if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */
|
||||
return LOG_IFERR(MDBX_EACCESS);
|
||||
|
||||
/* Reuse preallocated write txn. However, do not touch it until
|
||||
* txn_renew() succeeds, since it currently may be active. */
|
||||
MDBX_txn *txn = nullptr;
|
||||
if (parent) {
|
||||
/* Nested transactions: Max 1 child, write txns only, no writemap */
|
||||
@ -212,202 +229,126 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, M
|
||||
}
|
||||
return LOG_IFERR(rc);
|
||||
}
|
||||
|
||||
if (env->options.spill_parent4child_denominator) {
|
||||
/* Spill dirty-pages of parent to provide dirtyroom for child txn */
|
||||
rc = txn_spill(parent, nullptr, parent->tw.dirtylist->length / env->options.spill_parent4child_denominator);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return LOG_IFERR(rc);
|
||||
}
|
||||
tASSERT(parent, audit_ex(parent, 0, false) == 0);
|
||||
if (unlikely(parent->env != env))
|
||||
return LOG_IFERR(MDBX_BAD_TXN);
|
||||
|
||||
flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS | MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP);
|
||||
} else if ((flags & MDBX_TXN_RDONLY) == 0) {
|
||||
/* Reuse preallocated write txn. However, do not touch it until
|
||||
* txn_renew() succeeds, since it currently may be active. */
|
||||
txn = env->basal_txn;
|
||||
goto renew;
|
||||
}
|
||||
|
||||
const intptr_t bitmap_bytes =
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT;
|
||||
#else
|
||||
0;
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to));
|
||||
const size_t base =
|
||||
(flags & MDBX_TXN_RDONLY) ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) : sizeof(MDBX_txn);
|
||||
const size_t size = base +
|
||||
((flags & MDBX_TXN_RDONLY) ? (size_t)bitmap_bytes + env->max_dbi * sizeof(txn->dbi_seqs[0]) : 0) +
|
||||
env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_state[0]));
|
||||
txn = osal_malloc(size);
|
||||
if (unlikely(txn == nullptr))
|
||||
return LOG_IFERR(MDBX_ENOMEM);
|
||||
#if MDBX_DEBUG
|
||||
memset(txn, 0xCD, size);
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(txn, size);
|
||||
#endif /* MDBX_DEBUG */
|
||||
MDBX_ANALYSIS_ASSUME(size > base);
|
||||
memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base);
|
||||
txn->dbs = ptr_disp(txn, base);
|
||||
txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0]));
|
||||
#if MDBX_DEBUG
|
||||
txn->cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */
|
||||
#endif
|
||||
txn->dbi_state = ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0]));
|
||||
txn->flags = flags;
|
||||
txn->env = env;
|
||||
|
||||
if (parent) {
|
||||
tASSERT(parent, dpl_check(parent));
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
txn->dbi_sparse = parent->dbi_sparse;
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
txn->dbi_seqs = parent->dbi_seqs;
|
||||
txn->geo = parent->geo;
|
||||
rc = dpl_alloc(txn);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
const size_t len = MDBX_PNL_GETSIZE(parent->tw.repnl) + parent->tw.loose_count;
|
||||
txn->tw.repnl = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
|
||||
if (unlikely(!txn->tw.repnl))
|
||||
rc = MDBX_ENOMEM;
|
||||
}
|
||||
rc = txn_nested_create(parent, flags);
|
||||
txn = parent->nested;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
nested_failed:
|
||||
pnl_free(txn->tw.repnl);
|
||||
dpl_free(txn);
|
||||
osal_free(txn);
|
||||
return LOG_IFERR(rc);
|
||||
int err = txn_end(txn, TXN_END_FAIL_BEGIN_NESTED);
|
||||
return err ? err : rc;
|
||||
}
|
||||
|
||||
/* Move loose pages to reclaimed list */
|
||||
if (parent->tw.loose_count) {
|
||||
do {
|
||||
page_t *lp = parent->tw.loose_pages;
|
||||
tASSERT(parent, lp->flags == P_LOOSE);
|
||||
rc = pnl_insert_span(&parent->tw.repnl, lp->pgno, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto nested_failed;
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
parent->tw.loose_pages = page_next(lp);
|
||||
/* Remove from dirty list */
|
||||
page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1);
|
||||
} while (parent->tw.loose_pages);
|
||||
parent->tw.loose_count = 0;
|
||||
#if MDBX_ENABLE_REFUND
|
||||
parent->tw.loose_refund_wl = 0;
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
tASSERT(parent, dpl_check(parent));
|
||||
}
|
||||
txn->tw.dirtyroom = parent->tw.dirtyroom;
|
||||
txn->tw.dirtylru = parent->tw.dirtylru;
|
||||
|
||||
dpl_sort(parent);
|
||||
if (parent->tw.spilled.list)
|
||||
spill_purge(parent);
|
||||
|
||||
tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.repnl) >= MDBX_PNL_GETSIZE(parent->tw.repnl));
|
||||
memcpy(txn->tw.repnl, parent->tw.repnl, MDBX_PNL_SIZEOF(parent->tw.repnl));
|
||||
eASSERT(env, pnl_check_allocated(txn->tw.repnl, (txn->geo.first_unallocated /* LY: intentional assignment
|
||||
here, only for assertion */
|
||||
= parent->geo.first_unallocated) -
|
||||
MDBX_ENABLE_REFUND));
|
||||
|
||||
txn->tw.gc.time_acc = parent->tw.gc.time_acc;
|
||||
txn->tw.gc.last_reclaimed = parent->tw.gc.last_reclaimed;
|
||||
if (parent->tw.gc.retxl) {
|
||||
txn->tw.gc.retxl = parent->tw.gc.retxl;
|
||||
parent->tw.gc.retxl = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.gc.retxl);
|
||||
}
|
||||
|
||||
txn->tw.retired_pages = parent->tw.retired_pages;
|
||||
parent->tw.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages);
|
||||
|
||||
txn->txnid = parent->txnid;
|
||||
txn->front_txnid = parent->front_txnid + 1;
|
||||
#if MDBX_ENABLE_REFUND
|
||||
txn->tw.loose_refund_wl = 0;
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
txn->canary = parent->canary;
|
||||
parent->flags |= MDBX_TXN_HAS_CHILD;
|
||||
parent->nested = txn;
|
||||
txn->parent = parent;
|
||||
txn->owner = parent->owner;
|
||||
txn->tw.troika = parent->tw.troika;
|
||||
|
||||
txn->cursors[FREE_DBI] = nullptr;
|
||||
txn->cursors[MAIN_DBI] = nullptr;
|
||||
txn->dbi_state[FREE_DBI] = parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
|
||||
txn->dbi_state[MAIN_DBI] = parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
|
||||
memset(txn->dbi_state + CORE_DBS, 0, (txn->n_dbi = parent->n_dbi) - CORE_DBS);
|
||||
memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS);
|
||||
|
||||
tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length ==
|
||||
(parent->parent ? parent->parent->tw.dirtyroom : parent->env->options.dp_limit));
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
|
||||
env->txn = txn;
|
||||
tASSERT(parent, parent->cursors[FREE_DBI] == nullptr);
|
||||
rc = parent->cursors[MAIN_DBI] ? cursor_shadow(parent->cursors[MAIN_DBI], txn, MAIN_DBI) : MDBX_SUCCESS;
|
||||
if (AUDIT_ENABLED() && ASSERT_ENABLED()) {
|
||||
txn->signature = txn_signature;
|
||||
tASSERT(txn, audit_ex(txn, 0, false) == 0);
|
||||
}
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
txn_end(txn, TXN_END_FAIL_BEGINCHILD);
|
||||
} else { /* MDBX_TXN_RDONLY */
|
||||
txn->dbi_seqs = ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0]));
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
renew:
|
||||
rc = txn_renew(txn, flags);
|
||||
}
|
||||
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (txn != env->basal_txn)
|
||||
osal_free(txn);
|
||||
} else {
|
||||
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
|
||||
eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
|
||||
else if (flags & MDBX_TXN_RDONLY)
|
||||
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
|
||||
/* Win32: SRWL flag */ txn_shrink_allowed)) == 0);
|
||||
else {
|
||||
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | MDBX_NOMETASYNC |
|
||||
MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
|
||||
assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed);
|
||||
txn = env->basal_txn;
|
||||
if (flags & MDBX_TXN_RDONLY) {
|
||||
txn = txn_alloc(flags, env);
|
||||
if (unlikely(!txn))
|
||||
return LOG_IFERR(MDBX_ENOMEM);
|
||||
}
|
||||
rc = txn_renew(txn, flags);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (txn != env->basal_txn)
|
||||
osal_free(txn);
|
||||
return LOG_IFERR(rc);
|
||||
}
|
||||
txn->signature = txn_signature;
|
||||
txn->userctx = context;
|
||||
*ret = txn;
|
||||
DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
|
||||
(flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root,
|
||||
txn->dbs[FREE_DBI].root);
|
||||
}
|
||||
|
||||
return LOG_IFERR(rc);
|
||||
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
|
||||
eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
|
||||
else if (flags & MDBX_TXN_RDONLY)
|
||||
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
|
||||
/* Win32: SRWL flag */ txn_shrink_allowed)) == 0);
|
||||
else {
|
||||
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | txn_may_have_cursors |
|
||||
MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
|
||||
assert(!txn->wr.spilled.list && !txn->wr.spilled.least_removed);
|
||||
}
|
||||
txn->signature = txn_signature;
|
||||
txn->userctx = context;
|
||||
*ret = txn;
|
||||
DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
|
||||
(flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root,
|
||||
txn->dbs[FREE_DBI].root);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static void latency_gcprof(MDBX_commit_latency *latency, const MDBX_txn *txn) {
|
||||
MDBX_env *const env = txn->env;
|
||||
if (latency && likely(env->lck) && MDBX_ENABLE_PROFGC) {
|
||||
pgop_stat_t *const ptr = &env->lck->pgops;
|
||||
latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter;
|
||||
latency->gc_prof.work_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic);
|
||||
latency->gc_prof.work_xtime_cpu = osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu);
|
||||
latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps;
|
||||
latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages;
|
||||
latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt;
|
||||
|
||||
latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter;
|
||||
latency->gc_prof.self_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic);
|
||||
latency->gc_prof.self_xtime_cpu = osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu);
|
||||
latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps;
|
||||
latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages;
|
||||
latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt;
|
||||
|
||||
latency->gc_prof.wloops = ptr->gc_prof.wloops;
|
||||
latency->gc_prof.coalescences = ptr->gc_prof.coalescences;
|
||||
latency->gc_prof.wipes = ptr->gc_prof.wipes;
|
||||
latency->gc_prof.flushes = ptr->gc_prof.flushes;
|
||||
latency->gc_prof.kicks = ptr->gc_prof.kicks;
|
||||
|
||||
latency->gc_prof.pnl_merge_work.time = osal_monotime_to_16dot16(ptr->gc_prof.work.pnl_merge.time);
|
||||
latency->gc_prof.pnl_merge_work.calls = ptr->gc_prof.work.pnl_merge.calls;
|
||||
latency->gc_prof.pnl_merge_work.volume = ptr->gc_prof.work.pnl_merge.volume;
|
||||
latency->gc_prof.pnl_merge_self.time = osal_monotime_to_16dot16(ptr->gc_prof.self.pnl_merge.time);
|
||||
latency->gc_prof.pnl_merge_self.calls = ptr->gc_prof.self.pnl_merge.calls;
|
||||
latency->gc_prof.pnl_merge_self.volume = ptr->gc_prof.self.pnl_merge.volume;
|
||||
|
||||
if (txn == env->basal_txn)
|
||||
memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof));
|
||||
}
|
||||
}
|
||||
|
||||
static void latency_init(MDBX_commit_latency *latency, struct commit_timestamp *ts) {
|
||||
ts->start = 0;
|
||||
ts->gc_cpu = 0;
|
||||
if (latency) {
|
||||
ts->start = osal_monotime();
|
||||
memset(latency, 0, sizeof(*latency));
|
||||
}
|
||||
ts->prep = ts->gc = ts->audit = ts->write = ts->sync = ts->start;
|
||||
}
|
||||
|
||||
static void latency_done(MDBX_commit_latency *latency, struct commit_timestamp *ts) {
|
||||
if (latency) {
|
||||
latency->preparation = (ts->prep > ts->start) ? osal_monotime_to_16dot16(ts->prep - ts->start) : 0;
|
||||
latency->gc_wallclock = (ts->gc > ts->prep) ? osal_monotime_to_16dot16(ts->gc - ts->prep) : 0;
|
||||
latency->gc_cputime = ts->gc_cpu ? osal_monotime_to_16dot16(ts->gc_cpu) : 0;
|
||||
latency->audit = (ts->audit > ts->gc) ? osal_monotime_to_16dot16(ts->audit - ts->gc) : 0;
|
||||
latency->write = (ts->write > ts->audit) ? osal_monotime_to_16dot16(ts->write - ts->audit) : 0;
|
||||
latency->sync = (ts->sync > ts->write) ? osal_monotime_to_16dot16(ts->sync - ts->write) : 0;
|
||||
const uint64_t ts_end = osal_monotime();
|
||||
latency->ending = (ts_end > ts->sync) ? osal_monotime_to_16dot16(ts_end - ts->sync) : 0;
|
||||
latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_end - ts->start);
|
||||
}
|
||||
}
|
||||
|
||||
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||||
STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR - MDBX_TXN_PARKED);
|
||||
const uint64_t ts_0 = latency ? osal_monotime() : 0;
|
||||
uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0;
|
||||
|
||||
/* txn_end() mode for a commit which writes nothing */
|
||||
unsigned end_mode = TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE;
|
||||
struct commit_timestamp ts;
|
||||
latency_init(latency, &ts);
|
||||
|
||||
int rc = check_txn(txn, MDBX_TXN_FINISHED);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (rc == MDBX_BAD_TXN && (txn->flags & MDBX_TXN_RDONLY)) {
|
||||
if (rc == MDBX_BAD_TXN && F_ISSET(txn->flags, MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) {
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
goto fail;
|
||||
}
|
||||
bailout:
|
||||
if (latency)
|
||||
memset(latency, 0, sizeof(*latency));
|
||||
return LOG_IFERR(rc);
|
||||
}
|
||||
|
||||
@ -415,14 +356,17 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||||
if (MDBX_ENV_CHECKPID && unlikely(env->pid != osal_getpid())) {
|
||||
env->flags |= ENV_FATAL_ERROR;
|
||||
rc = MDBX_PANIC;
|
||||
goto bailout;
|
||||
return LOG_IFERR(rc);
|
||||
}
|
||||
|
||||
if (unlikely(txn->flags & MDBX_TXN_RDONLY)) {
|
||||
if (txn->flags & MDBX_TXN_ERROR) {
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
goto fail;
|
||||
if (txn->flags & MDBX_TXN_RDONLY) {
|
||||
if (unlikely(txn->parent || (txn->flags & MDBX_TXN_HAS_CHILD) || txn == env->txn || txn == env->basal_txn)) {
|
||||
ERROR("attempt to commit %s txn %p", "strange read-only", (void *)txn);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
latency_gcprof(latency, txn);
|
||||
rc = (txn->flags & MDBX_TXN_ERROR) ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
|
||||
txn_end(txn, TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE);
|
||||
goto done;
|
||||
}
|
||||
|
||||
@ -436,7 +380,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||||
|
||||
if (unlikely(txn->flags & MDBX_TXN_ERROR)) {
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
goto fail;
|
||||
fail:
|
||||
latency_gcprof(latency, txn);
|
||||
int err = txn_abort(txn);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
rc = err;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (txn->nested) {
|
||||
@ -447,370 +396,38 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||||
}
|
||||
|
||||
if (unlikely(txn != env->txn)) {
|
||||
DEBUG("%s", "attempt to commit unknown transaction");
|
||||
rc = MDBX_EINVAL;
|
||||
goto fail;
|
||||
ERROR("attempt to commit %s txn %p", "unknown", (void *)txn);
|
||||
return MDBX_EINVAL;
|
||||
}
|
||||
|
||||
if (txn->parent) {
|
||||
tASSERT(txn, audit_ex(txn, 0, false) == 0);
|
||||
eASSERT(env, txn != env->basal_txn);
|
||||
MDBX_txn *const parent = txn->parent;
|
||||
eASSERT(env, parent->signature == txn_signature);
|
||||
eASSERT(env, parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0);
|
||||
eASSERT(env, dpl_check(txn));
|
||||
|
||||
if (txn->tw.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && parent->n_dbi == txn->n_dbi) {
|
||||
/* fast completion of pure nested transaction */
|
||||
VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid);
|
||||
|
||||
tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0);
|
||||
tASSERT(txn, memcmp(&parent->canary, &txn->canary, sizeof(parent->canary)) == 0);
|
||||
tASSERT(txn, !txn->tw.spilled.list || MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0);
|
||||
tASSERT(txn, txn->tw.loose_count == 0);
|
||||
|
||||
/* Update parent's DBs array */
|
||||
eASSERT(env, parent->n_dbi == txn->n_dbi);
|
||||
TXN_FOREACH_DBI_ALL(txn, dbi) {
|
||||
tASSERT(txn, (txn->dbi_state[dbi] & (DBI_CREAT | DBI_DIRTY)) == 0);
|
||||
if (txn->dbi_state[dbi] & DBI_FRESH) {
|
||||
parent->dbs[dbi] = txn->dbs[dbi];
|
||||
/* preserve parent's status */
|
||||
const uint8_t state = txn->dbi_state[dbi] | DBI_FRESH;
|
||||
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
|
||||
parent->dbi_state[dbi], state);
|
||||
parent->dbi_state[dbi] = state;
|
||||
}
|
||||
}
|
||||
txn_done_cursors(txn, true);
|
||||
end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE | TXN_END_EOTDONE;
|
||||
goto done;
|
||||
if (unlikely(txn->parent->nested != txn || txn->parent->env != env)) {
|
||||
ERROR("attempt to commit %s txn %p", "strange nested", (void *)txn);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
|
||||
/* Preserve space for spill list to avoid parent's state corruption
|
||||
* if allocation fails. */
|
||||
const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages;
|
||||
tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages));
|
||||
const size_t retired_delta = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len;
|
||||
if (retired_delta) {
|
||||
rc = pnl_need(&txn->tw.repnl, retired_delta);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (txn->tw.spilled.list) {
|
||||
if (parent->tw.spilled.list) {
|
||||
rc = pnl_need(&parent->tw.spilled.list, MDBX_PNL_GETSIZE(txn->tw.spilled.list));
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
}
|
||||
spill_purge(txn);
|
||||
}
|
||||
|
||||
if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent &&
|
||||
!dpl_reserve(parent, txn->tw.dirtylist->length + parent->tw.dirtylist->length))) {
|
||||
rc = MDBX_ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
parent->tw.gc.retxl = txn->tw.gc.retxl;
|
||||
txn->tw.gc.retxl = nullptr;
|
||||
|
||||
parent->tw.retired_pages = txn->tw.retired_pages;
|
||||
txn->tw.retired_pages = nullptr;
|
||||
|
||||
pnl_free(parent->tw.repnl);
|
||||
parent->tw.repnl = txn->tw.repnl;
|
||||
txn->tw.repnl = nullptr;
|
||||
parent->tw.gc.time_acc = txn->tw.gc.time_acc;
|
||||
parent->tw.gc.last_reclaimed = txn->tw.gc.last_reclaimed;
|
||||
|
||||
parent->geo = txn->geo;
|
||||
parent->canary = txn->canary;
|
||||
parent->flags |= txn->flags & MDBX_TXN_DIRTY;
|
||||
|
||||
/* Move loose pages to parent */
|
||||
#if MDBX_ENABLE_REFUND
|
||||
parent->tw.loose_refund_wl = txn->tw.loose_refund_wl;
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
parent->tw.loose_count = txn->tw.loose_count;
|
||||
parent->tw.loose_pages = txn->tw.loose_pages;
|
||||
|
||||
/* Merge our cursors into parent's and close them */
|
||||
txn_done_cursors(txn, true);
|
||||
end_mode |= TXN_END_EOTDONE;
|
||||
|
||||
/* Update parent's DBs array */
|
||||
eASSERT(env, parent->n_dbi == txn->n_dbi);
|
||||
TXN_FOREACH_DBI_ALL(txn, dbi) {
|
||||
if (txn->dbi_state[dbi] != (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))) {
|
||||
eASSERT(env, (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) != 0 ||
|
||||
(txn->dbi_state[dbi] | DBI_STALE) ==
|
||||
(parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY)));
|
||||
parent->dbs[dbi] = txn->dbs[dbi];
|
||||
/* preserve parent's status */
|
||||
const uint8_t state = txn->dbi_state[dbi] | (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
|
||||
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
|
||||
parent->dbi_state[dbi], state);
|
||||
parent->dbi_state[dbi] = state;
|
||||
}
|
||||
}
|
||||
|
||||
if (latency) {
|
||||
ts_1 = osal_monotime();
|
||||
ts_2 = /* no gc-update */ ts_1;
|
||||
ts_3 = /* no audit */ ts_2;
|
||||
ts_4 = /* no write */ ts_3;
|
||||
ts_5 = /* no sync */ ts_4;
|
||||
}
|
||||
txn_merge(parent, txn, parent_retired_len);
|
||||
env->txn = parent;
|
||||
parent->nested = nullptr;
|
||||
tASSERT(parent, dpl_check(parent));
|
||||
|
||||
#if MDBX_ENABLE_REFUND
|
||||
txn_refund(parent);
|
||||
if (ASSERT_ENABLED()) {
|
||||
/* Check parent's loose pages not suitable for refund */
|
||||
for (page_t *lp = parent->tw.loose_pages; lp; lp = page_next(lp)) {
|
||||
tASSERT(parent, lp->pgno < parent->tw.loose_refund_wl && lp->pgno + 1 < parent->geo.first_unallocated);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
}
|
||||
/* Check parent's reclaimed pages not suitable for refund */
|
||||
if (MDBX_PNL_GETSIZE(parent->tw.repnl))
|
||||
tASSERT(parent, MDBX_PNL_MOST(parent->tw.repnl) + 1 < parent->geo.first_unallocated);
|
||||
}
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
|
||||
txn->signature = 0;
|
||||
osal_free(txn);
|
||||
tASSERT(parent, audit_ex(parent, 0, false) == 0);
|
||||
rc = MDBX_SUCCESS;
|
||||
goto provide_latency;
|
||||
}
|
||||
|
||||
if (!txn->tw.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom : env->options.dp_limit));
|
||||
}
|
||||
txn_done_cursors(txn, false);
|
||||
end_mode |= TXN_END_EOTDONE;
|
||||
|
||||
if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) &&
|
||||
(txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) {
|
||||
TXN_FOREACH_DBI_ALL(txn, i) { tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); }
|
||||
#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT
|
||||
rc = txn_end(txn, end_mode);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
goto provide_latency;
|
||||
#else
|
||||
latency_gcprof(latency, txn);
|
||||
rc = txn_nested_join(txn, latency ? &ts : nullptr);
|
||||
goto done;
|
||||
#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */
|
||||
}
|
||||
|
||||
DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, (void *)txn,
|
||||
(void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
|
||||
|
||||
if (txn->n_dbi > CORE_DBS) {
|
||||
/* Update table root pointers */
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
cx.outer.next = txn->cursors[MAIN_DBI];
|
||||
txn->cursors[MAIN_DBI] = &cx.outer;
|
||||
TXN_FOREACH_DBI_USER(txn, i) {
|
||||
if ((txn->dbi_state[i] & DBI_DIRTY) == 0)
|
||||
continue;
|
||||
tree_t *const db = &txn->dbs[i];
|
||||
DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->mod_txnid, txn->txnid);
|
||||
/* Может быть mod_txnid > front после коммита вложенных тразакций */
|
||||
db->mod_txnid = txn->txnid;
|
||||
MDBX_val data = {db, sizeof(tree_t)};
|
||||
rc = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
}
|
||||
|
||||
ts_1 = latency ? osal_monotime() : 0;
|
||||
|
||||
gcu_t gcu_ctx;
|
||||
gc_cputime = latency ? osal_cputime(nullptr) : 0;
|
||||
rc = gc_update_init(txn, &gcu_ctx);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
rc = gc_update(txn, &gcu_ctx);
|
||||
gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
tASSERT(txn, txn->tw.loose_count == 0);
|
||||
txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[FREE_DBI].mod_txnid;
|
||||
|
||||
txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[MAIN_DBI].mod_txnid;
|
||||
|
||||
ts_2 = latency ? osal_monotime() : 0;
|
||||
ts_3 = ts_2;
|
||||
if (AUDIT_ENABLED()) {
|
||||
rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true);
|
||||
ts_3 = osal_monotime();
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
bool need_flush_for_nometasync = false;
|
||||
const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
|
||||
const uint32_t meta_sync_txnid = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
|
||||
/* sync prev meta */
|
||||
if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) {
|
||||
/* Исправление унаследованного от LMDB недочета:
|
||||
*
|
||||
* Всё хорошо, если все процессы работающие с БД не используют WRITEMAP.
|
||||
* Тогда мета-страница (обновленная, но не сброшенная на диск) будет
|
||||
* сохранена в результате fdatasync() при записи данных этой транзакции.
|
||||
*
|
||||
* Всё хорошо, если все процессы работающие с БД используют WRITEMAP
|
||||
* без MDBX_AVOID_MSYNC.
|
||||
* Тогда мета-страница (обновленная, но не сброшенная на диск) будет
|
||||
* сохранена в результате msync() при записи данных этой транзакции.
|
||||
*
|
||||
* Если же в процессах работающих с БД используется оба метода, как sync()
|
||||
* в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то
|
||||
* становится невозможным обеспечить фиксацию на диске мета-страницы
|
||||
* предыдущей транзакции и данных текущей транзакции, за счет одной
|
||||
* sync-операцией выполняемой после записи данных текущей транзакции.
|
||||
* Соответственно, требуется явно обновлять мета-страницу, что полностью
|
||||
* уничтожает выгоду от NOMETASYNC. */
|
||||
const uint32_t txnid_dist = ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) ? MDBX_NOMETASYNC_LAZY_FD
|
||||
: MDBX_NOMETASYNC_LAZY_WRITEMAP;
|
||||
/* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync()
|
||||
* или msync() для гарантированной фиксации на диске мета-страницы,
|
||||
* которая была "лениво" отправлена на запись в предыдущей транзакции,
|
||||
* но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */
|
||||
if (
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
!env->ioring.overlapped_fd &&
|
||||
#endif
|
||||
meta_sync_txnid == (uint32_t)head.txnid - txnid_dist)
|
||||
need_flush_for_nometasync = true;
|
||||
else {
|
||||
rc = meta_sync(env, head);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
ERROR("txn-%s: error %d", "presync-meta", rc);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (txn->tw.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
tASSERT(txn, txn->tw.loose_count == 0);
|
||||
|
||||
mdbx_filehandle_t fd =
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
(void)need_flush_for_nometasync;
|
||||
#else
|
||||
(need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE ||
|
||||
txn->tw.dirtylist->length > env->options.writethrough_threshold ||
|
||||
atomic_load64(&env->lck->unsynced_pages, mo_Relaxed))
|
||||
? env->lazy_fd
|
||||
: env->dsync_fd;
|
||||
#endif /* Windows */
|
||||
|
||||
iov_ctx_t write_ctx;
|
||||
rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, txn->tw.dirtylist->pages_including_loose, fd, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
ERROR("txn-%s: error %d", "iov-init", rc);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
rc = txn_write(txn, &write_ctx);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
ERROR("txn-%s: error %d", "write", rc);
|
||||
goto fail;
|
||||
}
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
|
||||
if (!env->lck->eoos_timestamp.weak)
|
||||
env->lck->eoos_timestamp.weak = osal_monotime();
|
||||
}
|
||||
|
||||
/* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
|
||||
ts_4 = latency ? osal_monotime() : 0;
|
||||
|
||||
meta_t meta;
|
||||
memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8);
|
||||
meta.reserve16 = head.ptr_c->reserve16;
|
||||
meta.validator_id = head.ptr_c->validator_id;
|
||||
meta.extra_pagehdr = head.ptr_c->extra_pagehdr;
|
||||
unaligned_poke_u64(4, meta.pages_retired,
|
||||
unaligned_peek_u64(4, head.ptr_c->pages_retired) + MDBX_PNL_GETSIZE(txn->tw.retired_pages));
|
||||
meta.geometry = txn->geo;
|
||||
meta.trees.gc = txn->dbs[FREE_DBI];
|
||||
meta.trees.main = txn->dbs[MAIN_DBI];
|
||||
meta.canary = txn->canary;
|
||||
memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid));
|
||||
|
||||
txnid_t commit_txnid = txn->txnid;
|
||||
#if MDBX_ENABLE_BIGFOOT
|
||||
if (gcu_ctx.bigfoot > txn->txnid) {
|
||||
commit_txnid = gcu_ctx.bigfoot;
|
||||
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid));
|
||||
}
|
||||
#endif
|
||||
meta.unsafe_sign = DATASIGN_NONE;
|
||||
meta_set_txnid(env, &meta, commit_txnid);
|
||||
|
||||
rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, &txn->tw.troika);
|
||||
|
||||
ts_5 = latency ? osal_monotime() : 0;
|
||||
rc = txn_basal_commit(txn, latency ? &ts : nullptr);
|
||||
latency_gcprof(latency, txn);
|
||||
int end = TXN_END_COMMITTED | TXN_END_UPDATE;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
env->flags |= ENV_FATAL_ERROR;
|
||||
ERROR("txn-%s: error %d", "sync", rc);
|
||||
goto fail;
|
||||
end = TXN_END_ABORT;
|
||||
if (rc == MDBX_RESULT_TRUE) {
|
||||
end = TXN_END_PURE_COMMIT | TXN_END_UPDATE;
|
||||
rc = MDBX_NOSUCCESS_PURE_COMMIT ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE;
|
||||
int err = txn_end(txn, end);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
rc = err;
|
||||
|
||||
done:
|
||||
if (latency)
|
||||
txn_take_gcprof(txn, latency);
|
||||
rc = txn_end(txn, end_mode);
|
||||
|
||||
provide_latency:
|
||||
if (latency) {
|
||||
latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0;
|
||||
latency->gc_wallclock = (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0;
|
||||
latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0;
|
||||
latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0;
|
||||
latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0;
|
||||
latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0;
|
||||
const uint64_t ts_6 = osal_monotime();
|
||||
latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0;
|
||||
latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0);
|
||||
}
|
||||
latency_done(latency, &ts);
|
||||
return LOG_IFERR(rc);
|
||||
|
||||
fail:
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
if (latency)
|
||||
txn_take_gcprof(txn, latency);
|
||||
txn_abort(txn);
|
||||
goto provide_latency;
|
||||
}
|
||||
|
||||
int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
|
||||
@ -848,10 +465,10 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
|
||||
info->txn_reader_lag = head.txnid - info->txn_id;
|
||||
info->txn_space_dirty = info->txn_space_retired = 0;
|
||||
uint64_t reader_snapshot_pages_retired = 0;
|
||||
if (txn->to.reader &&
|
||||
((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->to.reader->tid) != MDBX_TID_TXN_OUSTED) &&
|
||||
if (txn->ro.slot &&
|
||||
((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->ro.slot->tid) != MDBX_TID_TXN_OUSTED) &&
|
||||
head_retired >
|
||||
(reader_snapshot_pages_retired = atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed))) {
|
||||
(reader_snapshot_pages_retired = atomic_load64(&txn->ro.slot->snapshot_pages_retired, mo_Relaxed))) {
|
||||
info->txn_space_dirty = info->txn_space_retired =
|
||||
pgno2bytes(env, (pgno_t)(head_retired - reader_snapshot_pages_retired));
|
||||
|
||||
@ -878,7 +495,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
|
||||
if (snap_txnid < next_reader && snap_tid >= MDBX_TID_TXN_OUSTED) {
|
||||
next_reader = snap_txnid;
|
||||
retired_next_reader = pgno2bytes(
|
||||
env, (pgno_t)(snap_retired - atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed)));
|
||||
env, (pgno_t)(snap_retired - atomic_load64(&txn->ro.slot->snapshot_pages_retired, mo_Relaxed)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -889,31 +506,33 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
|
||||
info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now);
|
||||
info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper);
|
||||
info->txn_space_retired =
|
||||
pgno2bytes(env, txn->nested ? (size_t)txn->tw.retired_pages : MDBX_PNL_GETSIZE(txn->tw.retired_pages));
|
||||
info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom);
|
||||
pgno2bytes(env, txn->nested ? (size_t)txn->wr.retired_pages : MDBX_PNL_GETSIZE(txn->wr.retired_pages));
|
||||
info->txn_space_leftover = pgno2bytes(env, txn->wr.dirtyroom);
|
||||
info->txn_space_dirty =
|
||||
pgno2bytes(env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
|
||||
: (txn->tw.writemap_dirty_npages + txn->tw.writemap_spilled_npages));
|
||||
pgno2bytes(env, txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose
|
||||
: (txn->wr.writemap_dirty_npages + txn->wr.writemap_spilled_npages));
|
||||
info->txn_reader_lag = INT64_MAX;
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (scan_rlt && lck) {
|
||||
txnid_t oldest_snapshot = txn->txnid;
|
||||
txnid_t oldest_reading = txn->txnid;
|
||||
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||||
if (snap_nreaders) {
|
||||
oldest_snapshot = txn_snapshot_oldest(txn);
|
||||
if (oldest_snapshot == txn->txnid - 1) {
|
||||
/* check if there is at least one reader */
|
||||
bool exists = false;
|
||||
txn_gc_detent(txn);
|
||||
oldest_reading = txn->env->gc.detent;
|
||||
if (oldest_reading == txn->wr.troika.txnid[txn->wr.troika.recent]) {
|
||||
/* Если самый старый используемый снимок является предыдущим, т. е. непосредственно предшествующим текущей
|
||||
* транзакции, то просматриваем таблицу читателей чтобы выяснить действительно ли снимок используется
|
||||
* читателями. */
|
||||
oldest_reading = txn->txnid;
|
||||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||||
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->txnid > safe64_read(&lck->rdt[i].txnid)) {
|
||||
exists = true;
|
||||
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->env->gc.detent == safe64_read(&lck->rdt[i].txnid)) {
|
||||
oldest_reading = txn->env->gc.detent;
|
||||
break;
|
||||
}
|
||||
}
|
||||
oldest_snapshot += !exists;
|
||||
}
|
||||
}
|
||||
info->txn_reader_lag = txn->txnid - oldest_snapshot;
|
||||
info->txn_reader_lag = txn->txnid - oldest_reading;
|
||||
}
|
||||
}
|
||||
|
||||
|
32
src/audit.c
32
src/audit.c
@ -24,12 +24,11 @@ static size_t audit_db_used(const tree_t *db) {
|
||||
return db ? (size_t)db->branch_pages + (size_t)db->leaf_pages + (size_t)db->large_pages : 0;
|
||||
}
|
||||
|
||||
__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) {
|
||||
__cold static int audit_ex_locked(MDBX_txn *txn, const size_t retired_stored, const bool dont_filter_gc) {
|
||||
const MDBX_env *const env = txn->env;
|
||||
size_t pending = 0;
|
||||
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
|
||||
pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.repnl) +
|
||||
(MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored);
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
const size_t pending = txn->wr.loose_count + MDBX_PNL_GETSIZE(txn->wr.repnl) +
|
||||
(MDBX_PNL_GETSIZE(txn->wr.retired_pages) - retired_stored);
|
||||
|
||||
cursor_couple_t cx;
|
||||
int rc = cursor_init(&cx.outer, txn, FREE_DBI);
|
||||
@ -40,17 +39,16 @@ __cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, bool don
|
||||
MDBX_val key, data;
|
||||
rc = outer_first(&cx.outer, &key, &data);
|
||||
while (rc == MDBX_SUCCESS) {
|
||||
if (!dont_filter_gc) {
|
||||
if (unlikely(key.iov_len != sizeof(txnid_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
txnid_t id = unaligned_peek_u64(4, key.iov_base);
|
||||
if (txn->tw.gc.retxl ? txl_contain(txn->tw.gc.retxl, id) : (id <= txn->tw.gc.last_reclaimed))
|
||||
goto skip;
|
||||
if (unlikely(key.iov_len != sizeof(txnid_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
gc += *(pgno_t *)data.iov_base;
|
||||
skip:
|
||||
const txnid_t id = unaligned_peek_u64(4, key.iov_base);
|
||||
const size_t len = *(pgno_t *)data.iov_base;
|
||||
const bool acc = dont_filter_gc || !gc_is_reclaimed(txn, id);
|
||||
TRACE("%s id %" PRIaTXN " len %zu", acc ? "acc" : "skip", id, len);
|
||||
if (acc)
|
||||
gc += len;
|
||||
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT);
|
||||
}
|
||||
tASSERT(txn, rc == MDBX_NOTFOUND);
|
||||
@ -89,8 +87,8 @@ __cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, bool don
|
||||
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
|
||||
ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + "
|
||||
"%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)",
|
||||
txn->txnid, pending, txn->tw.loose_count, MDBX_PNL_GETSIZE(txn->tw.repnl),
|
||||
txn->tw.retired_pages ? MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, retired_stored);
|
||||
txn->txnid, pending, txn->wr.loose_count, MDBX_PNL_GETSIZE(txn->wr.repnl),
|
||||
txn->wr.retired_pages ? MDBX_PNL_GETSIZE(txn->wr.retired_pages) : 0, retired_stored);
|
||||
ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu"
|
||||
"(gc) + %zu(count) = %zu(total) <> %zu"
|
||||
"(allocated)",
|
||||
|
@ -8,7 +8,7 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD
|
||||
5 |0000 0020| |TXN_PARKED |INTEGERDUP|NODUPDATA | | |P_DUPFIX | |
|
||||
6 |0000 0040| |TXN_AUTOUNPARK|REVERSEDUP|CURRENT |DBI_OLDEN | |P_SUBP | |
|
||||
7 |0000 0080| |TXN_DRAINED_GC|DB_VALID |ALLDUPS |DBI_LINDO | | | |
|
||||
8 |0000 0100| _MAY_MOVE | | | | | | | <= |
|
||||
8 |0000 0100| _MAY_MOVE |TXN_CURSORS | | | | | | <= |
|
||||
9 |0000 0200| _MAY_UNMAP| | | | | | | <= |
|
||||
10|0000 0400| | | | | | | | |
|
||||
11|0000 0800| | | | | | | | |
|
||||
|
39
src/chk.c
39
src/chk.c
@ -159,6 +159,19 @@ __cold static MDBX_chk_line_t *MDBX_PRINTF_ARGS(2, 3) chk_print(MDBX_chk_line_t
|
||||
return line;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED __cold static void chk_println_va(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity,
|
||||
const char *fmt, va_list args) {
|
||||
chk_line_end(chk_print_va(chk_line_begin(scope, severity), fmt, args));
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED __cold static void chk_println(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity,
|
||||
const char *fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
chk_println_va(scope, severity, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
__cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, const char *prefix, const uint64_t value,
|
||||
const char *suffix) {
|
||||
static const char sf[] = "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */
|
||||
@ -455,9 +468,8 @@ __cold static void chk_dispose(MDBX_chk_internal_t *chk) {
|
||||
chk->cb->table_dispose(chk->usr, tbl);
|
||||
tbl->cookie = nullptr;
|
||||
}
|
||||
if (tbl != &chk->table_gc && tbl != &chk->table_main) {
|
||||
if (tbl != &chk->table_gc && tbl != &chk->table_main)
|
||||
osal_free(tbl);
|
||||
}
|
||||
}
|
||||
}
|
||||
osal_free(chk->v2a_buf.iov_base);
|
||||
@ -1127,6 +1139,7 @@ __cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, MDBX_chk_t
|
||||
const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, tbl->flags);
|
||||
MDBX_val prev_key = {nullptr, 0}, prev_data = {nullptr, 0};
|
||||
MDBX_val key, data;
|
||||
size_t dups_count = 0;
|
||||
err = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST);
|
||||
while (err == MDBX_SUCCESS) {
|
||||
err = chk_check_break(scope);
|
||||
@ -1150,6 +1163,12 @@ __cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, MDBX_chk_t
|
||||
}
|
||||
|
||||
if (prev_key.iov_base) {
|
||||
if (key.iov_base == prev_key.iov_base)
|
||||
dups_count += 1;
|
||||
else {
|
||||
histogram_acc(dups_count, &tbl->histogram.multival);
|
||||
dups_count = 0;
|
||||
}
|
||||
if (prev_data.iov_base && !bad_data && (tbl->flags & MDBX_DUPFIXED) && prev_data.iov_len != data.iov_len) {
|
||||
chk_object_issue(scope, "entry", record_count, "different data length", "%" PRIuPTR " != %" PRIuPTR,
|
||||
prev_data.iov_len, data.iov_len);
|
||||
@ -1236,17 +1255,27 @@ __cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, MDBX_chk_t
|
||||
err = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT);
|
||||
}
|
||||
|
||||
if (prev_key.iov_base)
|
||||
histogram_acc(dups_count, &tbl->histogram.multival);
|
||||
|
||||
err = (err != MDBX_NOTFOUND) ? chk_error_rc(scope, err, "mdbx_cursor_get") : MDBX_SUCCESS;
|
||||
if (err == MDBX_SUCCESS && record_count != db->items)
|
||||
chk_scope_issue(scope, "different number of entries %" PRIuSIZE " != %" PRIu64, record_count, db->items);
|
||||
bailout:
|
||||
if (cursor) {
|
||||
if (handler) {
|
||||
if (tbl->histogram.key_len.count) {
|
||||
if (record_count) {
|
||||
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info);
|
||||
line = histogram_dist(line, &tbl->histogram.key_len, "key length density", "0/1", false);
|
||||
chk_line_feed(line);
|
||||
line = histogram_dist(line, &tbl->histogram.val_len, "value length density", "0/1", false);
|
||||
if (tbl->histogram.multival.amount) {
|
||||
chk_line_feed(line);
|
||||
line = histogram_dist(line, &tbl->histogram.multival, "number of multi-values density", "single", false);
|
||||
chk_line_feed(line);
|
||||
line = chk_print(line, "number of keys %" PRIuSIZE ", average values per key %.1f",
|
||||
tbl->histogram.multival.count, record_count / (double)tbl->histogram.multival.count);
|
||||
}
|
||||
chk_line_end(line);
|
||||
}
|
||||
if (scope->stage == MDBX_chk_maindb)
|
||||
@ -1301,9 +1330,9 @@ __cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, MDBX_chk_table_t
|
||||
(number + 1) * sizeof(pgno_t), data->iov_len);
|
||||
number = data->iov_len / sizeof(pgno_t) - 1;
|
||||
} else if (data->iov_len - (number + 1) * sizeof(pgno_t) >=
|
||||
/* LY: allow gap up to one page. it is ok
|
||||
/* LY: allow gap up to two page. it is ok
|
||||
* and better than shink-and-retry inside gc_update() */
|
||||
usr->env->ps)
|
||||
usr->env->ps * 2)
|
||||
chk_object_issue(scope, "entry", txnid, "extra idl space",
|
||||
"%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", (number + 1) * sizeof(pgno_t),
|
||||
data->iov_len);
|
||||
|
10
src/cogs.h
10
src/cogs.h
@ -250,9 +250,15 @@ MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *data_page(const void *dat
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline meta_t *page_meta(page_t *mp) { return (meta_t *)page_data(mp); }
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) { return mp->lower >> 1; }
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) {
|
||||
assert(mp->lower <= mp->upper);
|
||||
return mp->lower >> 1;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) { return mp->upper - mp->lower; }
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) {
|
||||
assert(mp->lower <= mp->upper);
|
||||
return mp->upper - mp->lower;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_space(const MDBX_env *env) {
|
||||
STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
|
||||
|
208
src/cursor.c
208
src/cursor.c
@ -6,12 +6,12 @@
|
||||
#include "internals.h"
|
||||
|
||||
__cold int cursor_validate(const MDBX_cursor *mc) {
|
||||
if (!mc->txn->tw.dirtylist) {
|
||||
if (!mc->txn->wr.dirtylist) {
|
||||
cASSERT(mc, (mc->txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
} else {
|
||||
cASSERT(mc, (mc->txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
cASSERT(mc, mc->txn->tw.dirtyroom + mc->txn->tw.dirtylist->length ==
|
||||
(mc->txn->parent ? mc->txn->parent->tw.dirtyroom : mc->txn->env->options.dp_limit));
|
||||
cASSERT(mc, mc->txn->wr.dirtyroom + mc->txn->wr.dirtylist->length ==
|
||||
(mc->txn->parent ? mc->txn->parent->wr.dirtyroom : mc->txn->env->options.dp_limit));
|
||||
}
|
||||
|
||||
cASSERT(mc, (mc->checking & z_updating) ? mc->top + 1 <= mc->tree->height : mc->top + 1 == mc->tree->height);
|
||||
@ -184,79 +184,74 @@ __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, const MDBX_va
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
int cursor_shadow(MDBX_cursor *mc, MDBX_txn *nested, const size_t dbi) {
|
||||
int cursor_shadow(MDBX_cursor *cursor, MDBX_txn *nested, const size_t dbi) {
|
||||
tASSERT(nested, cursor->signature == cur_signature_live);
|
||||
tASSERT(nested, cursor->txn != nested);
|
||||
cASSERT(cursor, cursor->txn->flags & txn_may_have_cursors);
|
||||
cASSERT(cursor, dbi == cursor_dbi(cursor));
|
||||
tASSERT(nested, dbi > FREE_DBI && dbi < nested->n_dbi);
|
||||
const size_t size = mc->subcur ? sizeof(MDBX_cursor) + sizeof(subcur_t) : sizeof(MDBX_cursor);
|
||||
for (MDBX_cursor *bk; mc; mc = bk->next) {
|
||||
cASSERT(mc, mc != mc->next);
|
||||
if (mc->signature != cur_signature_live) {
|
||||
ENSURE(nested->env, mc->signature == cur_signature_wait4eot);
|
||||
bk = mc;
|
||||
continue;
|
||||
}
|
||||
bk = osal_malloc(size);
|
||||
if (unlikely(!bk))
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
const size_t size = cursor->subcur ? sizeof(MDBX_cursor) + sizeof(subcur_t) : sizeof(MDBX_cursor);
|
||||
MDBX_cursor *const shadow = osal_malloc(size);
|
||||
if (unlikely(!shadow))
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
#if MDBX_DEBUG
|
||||
memset(bk, 0xCD, size);
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(bk, size);
|
||||
memset(shadow, 0xCD, size);
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(shadow, size);
|
||||
#endif /* MDBX_DEBUG */
|
||||
*bk = *mc;
|
||||
mc->backup = bk;
|
||||
mc->txn = nested;
|
||||
mc->tree = &nested->dbs[dbi];
|
||||
mc->dbi_state = &nested->dbi_state[dbi];
|
||||
subcur_t *mx = mc->subcur;
|
||||
if (mx) {
|
||||
*(subcur_t *)(bk + 1) = *mx;
|
||||
mx->cursor.txn = nested;
|
||||
mx->cursor.dbi_state = &nested->dbi_state[dbi];
|
||||
}
|
||||
mc->next = nested->cursors[dbi];
|
||||
nested->cursors[dbi] = mc;
|
||||
*shadow = *cursor;
|
||||
cursor->backup = shadow;
|
||||
cursor->txn = nested;
|
||||
cursor->tree = &nested->dbs[dbi];
|
||||
cursor->dbi_state = &nested->dbi_state[dbi];
|
||||
subcur_t *subcur = cursor->subcur;
|
||||
if (subcur) {
|
||||
*(subcur_t *)(shadow + 1) = *subcur;
|
||||
subcur->cursor.txn = nested;
|
||||
subcur->cursor.dbi_state = &nested->dbi_state[dbi];
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_cursor *cursor_eot(MDBX_cursor *mc, MDBX_txn *txn, const bool merge) {
|
||||
MDBX_cursor *const next = mc->next;
|
||||
const unsigned stage = mc->signature;
|
||||
MDBX_cursor *const bk = mc->backup;
|
||||
ENSURE(txn->env, stage == cur_signature_live || (stage == cur_signature_wait4eot && bk));
|
||||
tASSERT(txn, mc->txn == txn);
|
||||
if (bk) {
|
||||
subcur_t *mx = mc->subcur;
|
||||
tASSERT(txn, mc->txn->parent != nullptr);
|
||||
tASSERT(txn, bk->txn == txn->parent);
|
||||
/* Zap: Using uninitialized memory '*mc->backup'. */
|
||||
MDBX_cursor *cursor_eot(MDBX_cursor *cursor, MDBX_txn *txn) {
|
||||
MDBX_cursor *const next = cursor->next;
|
||||
const unsigned stage = cursor->signature;
|
||||
MDBX_cursor *const shadow = cursor->backup;
|
||||
ENSURE(txn->env, stage == cur_signature_live || (stage == cur_signature_wait4eot && shadow));
|
||||
tASSERT(txn, cursor->txn == txn);
|
||||
if (shadow) {
|
||||
subcur_t *subcur = cursor->subcur;
|
||||
tASSERT(txn, txn->parent != nullptr && shadow->txn == txn->parent);
|
||||
/* Zap: Using uninitialized memory '*subcur->backup'. */
|
||||
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001);
|
||||
ENSURE(txn->env, bk->signature == cur_signature_live);
|
||||
tASSERT(txn, mx == bk->subcur);
|
||||
if (merge) {
|
||||
ENSURE(txn->env, shadow->signature == cur_signature_live);
|
||||
tASSERT(txn, subcur == shadow->subcur);
|
||||
if ((txn->flags & MDBX_TXN_ERROR) == 0) {
|
||||
/* Update pointers to parent txn */
|
||||
mc->next = bk->next;
|
||||
mc->backup = bk->backup;
|
||||
mc->txn = bk->txn;
|
||||
mc->tree = bk->tree;
|
||||
mc->dbi_state = bk->dbi_state;
|
||||
if (mx) {
|
||||
mx->cursor.txn = bk->txn;
|
||||
mx->cursor.dbi_state = bk->dbi_state;
|
||||
cursor->next = shadow->next;
|
||||
cursor->backup = shadow->backup;
|
||||
cursor->txn = shadow->txn;
|
||||
cursor->tree = shadow->tree;
|
||||
cursor->dbi_state = shadow->dbi_state;
|
||||
if (subcur) {
|
||||
subcur->cursor.txn = shadow->txn;
|
||||
subcur->cursor.dbi_state = shadow->dbi_state;
|
||||
}
|
||||
} else {
|
||||
/* Restore from backup, i.e. rollback/abort nested txn */
|
||||
*mc = *bk;
|
||||
mc->signature = stage /* Promote (cur_signature_wait4eot) state to parent txn */;
|
||||
if (mx)
|
||||
*mx = *(subcur_t *)(bk + 1);
|
||||
*cursor = *shadow;
|
||||
cursor->signature = stage /* Promote (cur_signature_wait4eot) state to parent txn */;
|
||||
if (subcur)
|
||||
*subcur = *(subcur_t *)(shadow + 1);
|
||||
}
|
||||
bk->signature = 0;
|
||||
osal_free(bk);
|
||||
shadow->signature = 0;
|
||||
osal_free(shadow);
|
||||
} else {
|
||||
ENSURE(mc->txn->env, stage == cur_signature_live);
|
||||
mc->signature = cur_signature_ready4dispose /* Cursor may be reused */;
|
||||
mc->next = mc;
|
||||
cursor_drown((cursor_couple_t *)mc);
|
||||
ENSURE(cursor->txn->env, stage == cur_signature_live);
|
||||
cursor->signature = cur_signature_ready4dispose /* Cursor may be reused */;
|
||||
cursor->next = cursor;
|
||||
cursor_drown((cursor_couple_t *)cursor);
|
||||
}
|
||||
return next;
|
||||
}
|
||||
@ -643,7 +638,7 @@ static __always_inline int cursor_step(const bool inner, const bool forward, MDB
|
||||
inner_gone(mc);
|
||||
} else {
|
||||
if (mc->flags & z_hollow) {
|
||||
cASSERT(mc, !inner_pointed(mc));
|
||||
cASSERT(mc, !inner_pointed(mc) || inner_hollow(mc));
|
||||
return MDBX_ENODATA;
|
||||
}
|
||||
|
||||
@ -771,7 +766,7 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
|
||||
goto skip_check_samedata;
|
||||
}
|
||||
}
|
||||
if (!(flags & MDBX_RESERVE) && unlikely(cmp_lenfast(¤t_data, data) == 0))
|
||||
if (!(flags & MDBX_RESERVE) && unlikely(eq_fast(¤t_data, data)))
|
||||
return MDBX_SUCCESS /* the same data, nothing to update */;
|
||||
skip_check_samedata:;
|
||||
}
|
||||
@ -783,8 +778,9 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
|
||||
rc = MDBX_NO_ROOT;
|
||||
} else if ((flags & MDBX_CURRENT) == 0) {
|
||||
bool exact = false;
|
||||
MDBX_val last_key, old_data;
|
||||
MDBX_val old_data;
|
||||
if ((flags & MDBX_APPEND) && mc->tree->items > 0) {
|
||||
MDBX_val last_key;
|
||||
old_data.iov_base = nullptr;
|
||||
old_data.iov_len = 0;
|
||||
rc = (mc->flags & z_inner) ? inner_last(mc, &last_key) : outer_last(mc, &last_key, &old_data);
|
||||
@ -802,51 +798,53 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
|
||||
}
|
||||
}
|
||||
} else {
|
||||
csr_t csr =
|
||||
/* olddata may not be updated in case DUPFIX-page of dupfix-table */
|
||||
cursor_seek(mc, (MDBX_val *)key, &old_data, MDBX_SET);
|
||||
csr_t csr = cursor_seek(mc, (MDBX_val *)key, &old_data, MDBX_SET);
|
||||
rc = csr.err;
|
||||
exact = csr.exact;
|
||||
}
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
if (exact) {
|
||||
if (unlikely(flags & MDBX_NOOVERWRITE)) {
|
||||
DEBUG("duplicate key [%s]", DKEY_DEBUG(key));
|
||||
*data = old_data;
|
||||
return MDBX_KEYEXIST;
|
||||
}
|
||||
if (unlikely(mc->flags & z_inner)) {
|
||||
/* nested subtree of DUPSORT-database with the same key,
|
||||
* nothing to update */
|
||||
eASSERT(env, data->iov_len == 0 && (old_data.iov_len == 0 ||
|
||||
/* olddata may not be updated in case
|
||||
DUPFIX-page of dupfix-table */
|
||||
(mc->tree->flags & MDBX_DUPFIXED)));
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
if (unlikely(flags & MDBX_ALLDUPS) && inner_pointed(mc)) {
|
||||
err = cursor_del(mc, MDBX_ALLDUPS);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
if (exact) {
|
||||
cASSERT(mc, rc == MDBX_SUCCESS);
|
||||
if (unlikely(flags & MDBX_NOOVERWRITE)) {
|
||||
DEBUG("duplicate key [%s]", DKEY_DEBUG(key));
|
||||
*data = old_data;
|
||||
return MDBX_KEYEXIST;
|
||||
}
|
||||
if (unlikely(mc->flags & z_inner)) {
|
||||
/* nested subtree of DUPSORT-database with the same key, nothing to update */
|
||||
cASSERT(mc, !"Should not happen since");
|
||||
return (flags & MDBX_NODUPDATA) ? MDBX_KEYEXIST : MDBX_SUCCESS;
|
||||
}
|
||||
if (inner_pointed(mc)) {
|
||||
if (unlikely(flags & MDBX_ALLDUPS)) {
|
||||
rc = cursor_del(mc, MDBX_ALLDUPS);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
flags -= MDBX_ALLDUPS;
|
||||
cASSERT(mc, mc->top + 1 == mc->tree->height);
|
||||
rc = (mc->top >= 0) ? MDBX_NOTFOUND : MDBX_NO_ROOT;
|
||||
exact = false;
|
||||
} else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) {
|
||||
/* checking for early exit without dirtying pages */
|
||||
if (unlikely(eq_fast(data, &old_data))) {
|
||||
cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0);
|
||||
if (mc->subcur) {
|
||||
if (flags & MDBX_NODUPDATA)
|
||||
return MDBX_KEYEXIST;
|
||||
if (flags & MDBX_APPENDDUP)
|
||||
return MDBX_EKEYMISMATCH;
|
||||
}
|
||||
} else if ((flags & (MDBX_RESERVE | MDBX_MULTIPLE)) == 0) {
|
||||
old_data = *data;
|
||||
csr_t csr = cursor_seek(&mc->subcur->cursor, &old_data, nullptr, MDBX_SET_RANGE);
|
||||
if (unlikely(csr.exact)) {
|
||||
cASSERT(mc, csr.err == MDBX_SUCCESS);
|
||||
if (flags & MDBX_NODUPDATA)
|
||||
return MDBX_KEYEXIST;
|
||||
if (flags & MDBX_APPENDDUP)
|
||||
return MDBX_EKEYMISMATCH;
|
||||
/* the same data, nothing to update */
|
||||
return MDBX_SUCCESS;
|
||||
} else if (csr.err != MDBX_SUCCESS && unlikely(csr.err != MDBX_NOTFOUND)) {
|
||||
be_poor(mc);
|
||||
return csr.err;
|
||||
}
|
||||
cASSERT(mc, mc->clc->v.cmp(data, &old_data) != 0);
|
||||
}
|
||||
} else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) {
|
||||
if (unlikely(eq_fast(data, &old_data))) {
|
||||
cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0);
|
||||
/* the same data, nothing to update */
|
||||
return (mc->subcur && (flags & MDBX_NODUPDATA)) ? MDBX_KEYEXIST : MDBX_SUCCESS;
|
||||
}
|
||||
cASSERT(mc, mc->clc->v.cmp(data, &old_data) != 0);
|
||||
}
|
||||
} else if (unlikely(rc != MDBX_NOTFOUND))
|
||||
return rc;
|
||||
@ -1052,6 +1050,7 @@ __hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsig
|
||||
return MDBX_EKEYMISMATCH;
|
||||
} else if (eq_fast(data, &old_data)) {
|
||||
cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0);
|
||||
cASSERT(mc, !"Should not happen since" || batch_dupfix_done);
|
||||
if (flags & MDBX_NODUPDATA)
|
||||
return MDBX_KEYEXIST;
|
||||
/* data is match exactly byte-to-byte, nothing to update */
|
||||
@ -1727,6 +1726,7 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
|
||||
|
||||
csr_t ret;
|
||||
ret.exact = false;
|
||||
/* coverity[logical_vs_bitwise] */
|
||||
if (unlikely(key->iov_len < mc->clc->k.lmin ||
|
||||
(key->iov_len > mc->clc->k.lmax &&
|
||||
(mc->clc->k.lmin == mc->clc->k.lmax || MDBX_DEBUG || MDBX_FORCE_ASSERTIONS)))) {
|
||||
@ -1781,8 +1781,7 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
|
||||
}
|
||||
int cmp = mc->clc->k.cmp(&aligned_key, &nodekey);
|
||||
if (unlikely(cmp == 0)) {
|
||||
/* Probably happens rarely, but first node on the page
|
||||
* was the one we wanted. */
|
||||
/* Probably happens rarely, but first node on the page was the one we wanted. */
|
||||
mc->ki[mc->top] = 0;
|
||||
ret.exact = true;
|
||||
goto got_node;
|
||||
@ -1845,8 +1844,9 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
|
||||
* Поэтому переводим курсор в неустановленное состояние, но без сброса
|
||||
* top, что позволяет работать fastpath при последующем поиске по дереву
|
||||
* страниц. */
|
||||
mc->flags = z_hollow | (mc->flags & z_clear_mask);
|
||||
inner_gone(mc);
|
||||
mc->flags |= z_hollow;
|
||||
if (inner_pointed(mc))
|
||||
mc->subcur->cursor.flags |= z_hollow;
|
||||
ret.err = MDBX_NOTFOUND;
|
||||
return ret;
|
||||
}
|
||||
|
@ -151,7 +151,7 @@ MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_hollow(const
|
||||
cASSERT(mc, mc->top >= 0);
|
||||
cASSERT(mc, (mc->flags & z_eof_hard) || mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
|
||||
} else if (mc->subcur)
|
||||
cASSERT(mc, is_poor(&mc->subcur->cursor));
|
||||
cASSERT(mc, is_poor(&mc->subcur->cursor) || (is_pointed(mc) && mc->subcur->cursor.flags < 0));
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -307,8 +307,8 @@ static inline int cursor_check_rw(const MDBX_cursor *mc) {
|
||||
return cursor_check(mc, (MDBX_TXN_BLOCKED - MDBX_TXN_PARKED) | MDBX_TXN_RDONLY);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL MDBX_cursor *cursor_eot(MDBX_cursor *mc, MDBX_txn *txn, const bool merge);
|
||||
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *mc, MDBX_txn *nested, const size_t dbi);
|
||||
MDBX_INTERNAL MDBX_cursor *cursor_eot(MDBX_cursor *cursor, MDBX_txn *txn);
|
||||
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *cursor, MDBX_txn *nested, const size_t dbi);
|
||||
|
||||
MDBX_INTERNAL MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc, MDBX_cursor *cdst);
|
||||
|
||||
|
14
src/dbi.c
14
src/dbi.c
@ -87,19 +87,12 @@ __noinline int dbi_import(MDBX_txn *txn, const size_t dbi) {
|
||||
if (parent) {
|
||||
/* вложенная пишущая транзакция */
|
||||
int rc = dbi_check(parent, dbi);
|
||||
/* копируем состояние table очищая new-флаги. */
|
||||
/* копируем состояние dbi-хендла очищая new-флаги. */
|
||||
eASSERT(env, txn->dbi_seqs == parent->dbi_seqs);
|
||||
txn->dbi_state[dbi] = parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
txn->dbs[dbi] = parent->dbs[dbi];
|
||||
if (parent->cursors[dbi]) {
|
||||
rc = cursor_shadow(parent->cursors[dbi], txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
/* не получилось забекапить курсоры */
|
||||
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE;
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
}
|
||||
}
|
||||
rc = txn_shadow_cursors(parent, dbi);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -183,7 +176,7 @@ int dbi_defer_release(MDBX_env *const env, defer_free_item_t *const chain) {
|
||||
}
|
||||
|
||||
/* Export or close DBI handles opened in this txn. */
|
||||
int dbi_update(MDBX_txn *txn, int keep) {
|
||||
int dbi_update(MDBX_txn *txn, bool keep) {
|
||||
MDBX_env *const env = txn->env;
|
||||
tASSERT(txn, !txn->parent && txn == env->basal_txn);
|
||||
bool locked = false;
|
||||
@ -223,6 +216,7 @@ int dbi_update(MDBX_txn *txn, int keep) {
|
||||
|
||||
if (locked) {
|
||||
size_t i = env->n_dbi;
|
||||
eASSERT(env, env->n_dbi >= CORE_DBS);
|
||||
while ((env->dbs_flags[i - 1] & DB_VALID) == 0) {
|
||||
--i;
|
||||
eASSERT(env, i >= CORE_DBS);
|
||||
|
43
src/dbi.h
43
src/dbi.h
@ -43,30 +43,35 @@ static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) {
|
||||
return dbi_bitmap_ctz_fallback(txn, bmi);
|
||||
}
|
||||
|
||||
static inline bool dbi_foreach_step(const MDBX_txn *const txn, size_t *bitmap_item, size_t *dbi) {
|
||||
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
|
||||
if (*bitmap_item & 1) {
|
||||
*bitmap_item >>= 1;
|
||||
return txn->dbi_state[*dbi] != 0;
|
||||
}
|
||||
if (*bitmap_item) {
|
||||
size_t bitmap_skip = dbi_bitmap_ctz(txn, *bitmap_item);
|
||||
*bitmap_item >>= bitmap_skip;
|
||||
*dbi += bitmap_skip - 1;
|
||||
} else {
|
||||
*dbi = (*dbi - 1) | (bitmap_chunk - 1);
|
||||
*bitmap_item = txn->dbi_sparse[(1 + *dbi) / bitmap_chunk];
|
||||
if (*bitmap_item == 0)
|
||||
*dbi += bitmap_chunk;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* LY: Макрос целенаправленно сделан с одним циклом, чтобы сохранить возможность
|
||||
* использования оператора break */
|
||||
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
|
||||
for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->dbi_sparse[0]), bitmap_item = TXN->dbi_sparse[0] >> FROM, \
|
||||
I = FROM; \
|
||||
I < TXN->n_dbi; ++I) \
|
||||
if (bitmap_item == 0) { \
|
||||
I = (I - 1) | (bitmap_chunk - 1); \
|
||||
bitmap_item = TXN->dbi_sparse[(1 + I) / bitmap_chunk]; \
|
||||
if (!bitmap_item) \
|
||||
/* coverity[const_overflow] */ \
|
||||
I += bitmap_chunk; \
|
||||
continue; \
|
||||
} else if ((bitmap_item & 1) == 0) { \
|
||||
size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \
|
||||
bitmap_item >>= bitmap_skip; \
|
||||
I += bitmap_skip - 1; \
|
||||
continue; \
|
||||
} else if (bitmap_item >>= 1, TXN->dbi_state[I])
|
||||
for (size_t bitmap_item = TXN->dbi_sparse[0] >> FROM, I = FROM; I < TXN->n_dbi; ++I) \
|
||||
if (dbi_foreach_step(TXN, &bitmap_item, &I))
|
||||
|
||||
#else
|
||||
|
||||
#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \
|
||||
for (size_t I = SKIP; I < TXN->n_dbi; ++I) \
|
||||
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
|
||||
for (size_t I = FROM; I < TXN->n_dbi; ++I) \
|
||||
if (TXN->dbi_state[I])
|
||||
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
@ -82,7 +87,7 @@ struct dbi_snap_result {
|
||||
};
|
||||
MDBX_INTERNAL struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi);
|
||||
|
||||
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, int keep);
|
||||
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, bool keep);
|
||||
|
||||
static inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) {
|
||||
STATIC_ASSERT((int)DBI_DIRTY == MDBX_DBI_DIRTY && (int)DBI_STALE == MDBX_DBI_STALE &&
|
||||
|
70
src/dpl.c
70
src/dpl.c
@ -28,9 +28,9 @@ static inline size_t dpl_bytes2size(const ptrdiff_t bytes) {
|
||||
}
|
||||
|
||||
void dpl_free(MDBX_txn *txn) {
|
||||
if (likely(txn->tw.dirtylist)) {
|
||||
osal_free(txn->tw.dirtylist);
|
||||
txn->tw.dirtylist = nullptr;
|
||||
if (likely(txn->wr.dirtylist)) {
|
||||
osal_free(txn->wr.dirtylist);
|
||||
txn->wr.dirtylist = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -39,14 +39,14 @@ dpl_t *dpl_reserve(MDBX_txn *txn, size_t size) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
size_t bytes = dpl_size2bytes((size < PAGELIST_LIMIT) ? size : PAGELIST_LIMIT);
|
||||
dpl_t *const dl = osal_realloc(txn->tw.dirtylist, bytes);
|
||||
dpl_t *const dl = osal_realloc(txn->wr.dirtylist, bytes);
|
||||
if (likely(dl)) {
|
||||
#ifdef osal_malloc_usable_size
|
||||
bytes = osal_malloc_usable_size(dl);
|
||||
#endif /* osal_malloc_usable_size */
|
||||
dl->detent = dpl_bytes2size(bytes);
|
||||
tASSERT(txn, txn->tw.dirtylist == nullptr || dl->length <= dl->detent);
|
||||
txn->tw.dirtylist = dl;
|
||||
tASSERT(txn, txn->wr.dirtylist == nullptr || dl->length <= dl->detent);
|
||||
txn->wr.dirtylist = dl;
|
||||
}
|
||||
return dl;
|
||||
}
|
||||
@ -57,15 +57,17 @@ int dpl_alloc(MDBX_txn *txn) {
|
||||
|
||||
const size_t wanna = (txn->env->options.dp_initial < txn->geo.upper) ? txn->env->options.dp_initial : txn->geo.upper;
|
||||
#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG
|
||||
if (txn->tw.dirtylist)
|
||||
if (txn->wr.dirtylist)
|
||||
/* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */
|
||||
txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0;
|
||||
txn->wr.dirtylist->sorted = txn->wr.dirtylist->length = 0;
|
||||
#endif /* asertions enabled */
|
||||
if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || txn->tw.dirtylist->detent > wanna + wanna) &&
|
||||
if (unlikely(!txn->wr.dirtylist || txn->wr.dirtylist->detent < wanna || txn->wr.dirtylist->detent > wanna + wanna) &&
|
||||
unlikely(!dpl_reserve(txn, wanna)))
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
dpl_clear(txn->tw.dirtylist);
|
||||
/* LY: wr.dirtylist не может быть nullptr, так как либо уже выделен, либо будет выделен в dpl_reserve(). */
|
||||
/* coverity[var_deref_model] */
|
||||
dpl_clear(txn->wr.dirtylist);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
@ -79,7 +81,7 @@ __hot __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
const size_t unsorted = dl->length - dl->sorted;
|
||||
if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || unlikely(!dp_radixsort(dl->items + 1, dl->length))) {
|
||||
@ -133,7 +135,7 @@ __hot __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
if (AUDIT_ENABLED()) {
|
||||
for (const dp_t *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
|
||||
@ -175,7 +177,7 @@ __hot __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) {
|
||||
|
||||
const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
const dpl_t *dl = txn->tw.dirtylist;
|
||||
const dpl_t *dl = txn->wr.dirtylist;
|
||||
if (dl) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
@ -198,7 +200,7 @@ void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
assert((intptr_t)i > 0 && i <= dl->length);
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
dl->pages_including_loose -= npages;
|
||||
@ -214,10 +216,10 @@ int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, siz
|
||||
const dp_t dp = {page, pgno, (pgno_t)npages};
|
||||
if ((txn->flags & MDBX_WRITEMAP) == 0) {
|
||||
size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->tw.dirtylru;
|
||||
*ptr = txn->wr.dirtylru;
|
||||
}
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
tASSERT(txn, dl->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
|
||||
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
if (AUDIT_ENABLED()) {
|
||||
@ -313,7 +315,7 @@ int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, siz
|
||||
|
||||
__cold bool dpl_check(MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
const dpl_t *const dl = txn->tw.dirtylist;
|
||||
const dpl_t *const dl = txn->wr.dirtylist;
|
||||
if (!dl) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
return true;
|
||||
@ -322,7 +324,7 @@ __cold bool dpl_check(MDBX_txn *txn) {
|
||||
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
tASSERT(txn,
|
||||
txn->tw.dirtyroom + dl->length == (txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
|
||||
txn->wr.dirtyroom + dl->length == (txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
|
||||
|
||||
if (!AUDIT_ENABLED())
|
||||
return true;
|
||||
@ -362,28 +364,28 @@ __cold bool dpl_check(MDBX_txn *txn) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t rpa = pnl_search(txn->tw.repnl, dp->pgno, txn->geo.first_unallocated);
|
||||
tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.repnl) || txn->tw.repnl[rpa] != dp->pgno);
|
||||
if (rpa <= MDBX_PNL_GETSIZE(txn->tw.repnl) && unlikely(txn->tw.repnl[rpa] == dp->pgno))
|
||||
const size_t rpa = pnl_search(txn->wr.repnl, dp->pgno, txn->geo.first_unallocated);
|
||||
tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->wr.repnl) || txn->wr.repnl[rpa] != dp->pgno);
|
||||
if (rpa <= MDBX_PNL_GETSIZE(txn->wr.repnl) && unlikely(txn->wr.repnl[rpa] == dp->pgno))
|
||||
return false;
|
||||
if (num > 1) {
|
||||
const size_t rpb = pnl_search(txn->tw.repnl, dp->pgno + num - 1, txn->geo.first_unallocated);
|
||||
const size_t rpb = pnl_search(txn->wr.repnl, dp->pgno + num - 1, txn->geo.first_unallocated);
|
||||
tASSERT(txn, rpa == rpb);
|
||||
if (unlikely(rpa != rpb))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
tASSERT(txn, loose == txn->tw.loose_count);
|
||||
if (unlikely(loose != txn->tw.loose_count))
|
||||
tASSERT(txn, loose == txn->wr.loose_count);
|
||||
if (unlikely(loose != txn->wr.loose_count))
|
||||
return false;
|
||||
|
||||
tASSERT(txn, pages == dl->pages_including_loose);
|
||||
if (unlikely(pages != dl->pages_including_loose))
|
||||
return false;
|
||||
|
||||
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) {
|
||||
const page_t *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]);
|
||||
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->wr.retired_pages); ++i) {
|
||||
const page_t *const dp = debug_dpl_find(txn, txn->wr.retired_pages[i]);
|
||||
tASSERT(txn, !dp);
|
||||
if (unlikely(dp))
|
||||
return false;
|
||||
@ -395,11 +397,11 @@ __cold bool dpl_check(MDBX_txn *txn) {
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__noinline void dpl_lru_reduce(MDBX_txn *txn) {
|
||||
NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1);
|
||||
VERBOSE("lru-reduce %u -> %u", txn->wr.dirtylru, txn->wr.dirtylru >> 1);
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
do {
|
||||
txn->tw.dirtylru >>= 1;
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
txn->wr.dirtylru >>= 1;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr >>= 1;
|
||||
@ -411,7 +413,7 @@ __noinline void dpl_lru_reduce(MDBX_txn *txn) {
|
||||
void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) {
|
||||
if (MDBX_PNL_GETSIZE(pl) && txn->wr.dirtylist->length) {
|
||||
tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->geo.first_unallocated << spilled));
|
||||
dpl_t *dl = dpl_sort(txn);
|
||||
|
||||
@ -466,9 +468,9 @@ void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
|
||||
}
|
||||
}
|
||||
dl->sorted = dpl_setlen(dl, w - 1);
|
||||
txn->tw.dirtyroom += r - w;
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
|
||||
txn->wr.dirtyroom += r - w;
|
||||
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -477,7 +479,7 @@ void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
|
||||
void dpl_release_shadows(MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
MDBX_env *env = txn->env;
|
||||
dpl_t *const dl = txn->tw.dirtylist;
|
||||
dpl_t *const dl = txn->wr.dirtylist;
|
||||
|
||||
for (size_t i = 1; i <= dl->length; i++)
|
||||
page_shadow_release(env, dl->items[i].ptr, dpl_npages(dl, i));
|
||||
|
22
src/dpl.h
22
src/dpl.h
@ -46,14 +46,14 @@ static inline dpl_t *dpl_sort(const MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
tASSERT(txn, dl->length <= PAGELIST_LIMIT);
|
||||
tASSERT(txn, dl->sorted <= dl->length);
|
||||
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno);
|
||||
|
||||
@ -68,11 +68,11 @@ MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl, siz
|
||||
return dpl_npages(dl, i) + dl->items[i].pgno;
|
||||
}
|
||||
|
||||
static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
tASSERT(txn, dl->sorted == dl->length);
|
||||
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
size_t const n = dpl_search(txn, pgno);
|
||||
@ -96,7 +96,7 @@ static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn, pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
dpl_t *dl = txn->wr.dirtylist;
|
||||
size_t i = dpl_search(txn, pgno);
|
||||
tASSERT(txn, (int)i > 0);
|
||||
return (dl->items[i].pgno == pgno) ? i : 0;
|
||||
@ -105,7 +105,7 @@ MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn, p
|
||||
MDBX_INTERNAL void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages);
|
||||
|
||||
static inline void dpl_remove(const MDBX_txn *txn, size_t i) {
|
||||
dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i));
|
||||
dpl_remove_ex(txn, i, dpl_npages(txn->wr.dirtylist, i));
|
||||
}
|
||||
|
||||
MDBX_INTERNAL int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, size_t npages);
|
||||
@ -114,19 +114,19 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL bool dpl_check(MDBX_txn *txn);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) {
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
const dpl_t *dl = txn->tw.dirtylist;
|
||||
const dpl_t *dl = txn->wr.dirtylist;
|
||||
assert((intptr_t)i > 0 && i <= dl->length);
|
||||
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
return txn->tw.dirtylru - (uint32_t)*ptr;
|
||||
return txn->wr.dirtylru - (uint32_t)*ptr;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void dpl_lru_reduce(MDBX_txn *txn);
|
||||
|
||||
static inline uint32_t dpl_lru_turn(MDBX_txn *txn) {
|
||||
txn->tw.dirtylru += 1;
|
||||
if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && (txn->flags & MDBX_WRITEMAP) == 0)
|
||||
txn->wr.dirtylru += 1;
|
||||
if (unlikely(txn->wr.dirtylru > UINT32_MAX / 3) && (txn->flags & MDBX_WRITEMAP) == 0)
|
||||
dpl_lru_reduce(txn);
|
||||
return txn->tw.dirtylru;
|
||||
return txn->wr.dirtylru;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled);
|
||||
|
18
src/dxb.c
18
src/dxb.c
@ -370,7 +370,7 @@ void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) {
|
||||
return;
|
||||
} else if (env_owned_wrtxn(env)) {
|
||||
/* inside write-txn */
|
||||
last = meta_recent(env, &env->basal_txn->tw.troika).ptr_v->geometry.first_unallocated;
|
||||
last = meta_recent(env, &env->basal_txn->wr.troika).ptr_v->geometry.first_unallocated;
|
||||
} else if (env->flags & MDBX_RDONLY) {
|
||||
/* read-only mode, no write-txn, no wlock mutex */
|
||||
last = NUM_METAS;
|
||||
@ -1061,16 +1061,17 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
|
||||
#endif /* MADV_DONTNEED || POSIX_MADV_DONTNEED */
|
||||
|
||||
/* LY: check conditions to shrink datafile */
|
||||
const pgno_t backlog_gap = 3 + pending->trees.gc.height * 3;
|
||||
const pgno_t stockpile_gap = 3 + pending->trees.gc.height * 3;
|
||||
pgno_t shrink_step = 0;
|
||||
if (pending->geometry.shrink_pv && pending->geometry.now - pending->geometry.first_unallocated >
|
||||
(shrink_step = pv2pages(pending->geometry.shrink_pv)) + backlog_gap) {
|
||||
if (pending->geometry.now > largest_pgno && pending->geometry.now - largest_pgno > shrink_step + backlog_gap) {
|
||||
(shrink_step = pv2pages(pending->geometry.shrink_pv)) + stockpile_gap) {
|
||||
if (pending->geometry.now > largest_pgno &&
|
||||
pending->geometry.now - largest_pgno > shrink_step + stockpile_gap) {
|
||||
const pgno_t aligner =
|
||||
pending->geometry.grow_pv ? /* grow_step */ pv2pages(pending->geometry.grow_pv) : shrink_step;
|
||||
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
|
||||
const pgno_t with_stockpile_gap = largest_pgno + stockpile_gap;
|
||||
const pgno_t aligned =
|
||||
pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - with_backlog_gap % aligner);
|
||||
pgno_align2os_pgno(env, (size_t)with_stockpile_gap + aligner - with_stockpile_gap % aligner);
|
||||
const pgno_t bottom = (aligned > pending->geometry.lower) ? aligned : pending->geometry.lower;
|
||||
if (pending->geometry.now > bottom) {
|
||||
if (TROIKA_HAVE_STEADY(troika))
|
||||
@ -1290,6 +1291,7 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
|
||||
}
|
||||
|
||||
uint64_t timestamp = 0;
|
||||
/* coverity[array_null] */
|
||||
while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") {
|
||||
rc = coherency_check_written(env, pending->unsafe_txnid, target,
|
||||
bytes2pgno(env, ptr_dist(target, env->dxb_mmap.base)), ×tamp);
|
||||
@ -1306,8 +1308,8 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
|
||||
|
||||
*troika = meta_tap(env);
|
||||
for (MDBX_txn *txn = env->basal_txn; txn; txn = txn->nested)
|
||||
if (troika != &txn->tw.troika)
|
||||
txn->tw.troika = *troika;
|
||||
if (troika != &txn->wr.troika)
|
||||
txn->wr.troika = *troika;
|
||||
|
||||
/* LY: shrink datafile if needed */
|
||||
if (unlikely(shrink)) {
|
||||
|
17
src/env.c
17
src/env.c
@ -76,7 +76,7 @@ retry:;
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
const troika_t troika = (txn_owned || should_unlock) ? env->basal_txn->tw.troika : meta_tap(env);
|
||||
const troika_t troika = (txn_owned || should_unlock) ? env->basal_txn->wr.troika : meta_tap(env);
|
||||
const meta_ptr_t head = meta_recent(env, &troika);
|
||||
const uint64_t unsynced_pages = atomic_load64(&env->lck->unsynced_pages, mo_Relaxed);
|
||||
if (unsynced_pages == 0) {
|
||||
@ -158,13 +158,13 @@ retry:;
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.wops.weak += wops;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
env->basal_txn->tw.troika = meta_tap(env);
|
||||
env->basal_txn->wr.troika = meta_tap(env);
|
||||
eASSERT(env, !env->txn && !env->basal_txn->nested);
|
||||
goto retry;
|
||||
}
|
||||
eASSERT(env, head.txnid == recent_committed_txnid(env));
|
||||
env->basal_txn->txnid = head.txnid;
|
||||
txn_snapshot_oldest(env->basal_txn);
|
||||
txn_gc_detent(env->basal_txn);
|
||||
flags |= txn_shrink_allowed;
|
||||
}
|
||||
|
||||
@ -182,7 +182,7 @@ retry:;
|
||||
DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, data_page(head.ptr_c)->pgno,
|
||||
durable_caption(head.ptr_c), unsynced_pages);
|
||||
meta_t meta = *head.ptr_c;
|
||||
rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->tw.troika);
|
||||
rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->wr.troika);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
}
|
||||
@ -524,7 +524,7 @@ __cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
|
||||
env->defer_free = nullptr;
|
||||
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
|
||||
|
||||
if (!(env->flags & MDBX_RDONLY))
|
||||
if ((env->flags & MDBX_RDONLY) == 0)
|
||||
osal_ioring_destroy(&env->ioring);
|
||||
|
||||
env->lck = nullptr;
|
||||
@ -593,12 +593,7 @@ __cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
|
||||
env->pathname.buffer = nullptr;
|
||||
}
|
||||
if (env->basal_txn) {
|
||||
dpl_free(env->basal_txn);
|
||||
txl_free(env->basal_txn->tw.gc.retxl);
|
||||
pnl_free(env->basal_txn->tw.retired_pages);
|
||||
pnl_free(env->basal_txn->tw.spilled.list);
|
||||
pnl_free(env->basal_txn->tw.repnl);
|
||||
osal_free(env->basal_txn);
|
||||
txn_basal_destroy(env->basal_txn);
|
||||
env->basal_txn = nullptr;
|
||||
}
|
||||
}
|
||||
|
@ -30,8 +30,10 @@ typedef struct iov_ctx iov_ctx_t;
|
||||
|
||||
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
|
||||
#define MDBX_WORDBITS 64
|
||||
#define MDBX_WORDBITS_LN2 6
|
||||
#else
|
||||
#define MDBX_WORDBITS 32
|
||||
#define MDBX_WORDBITS_LN2 5
|
||||
#endif /* MDBX_WORDBITS */
|
||||
|
||||
#include "options.h"
|
||||
|
360
src/gc-get.c
360
src/gc-get.c
@ -570,14 +570,11 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t s
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#define ALLOC_COALESCE 4 /* внутреннее состояние */
|
||||
#define ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */
|
||||
#define ALLOC_LIFO 16 /* внутреннее состояние */
|
||||
|
||||
static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, const uint8_t flags) {
|
||||
static inline bool is_reclaimable(MDBX_txn *txn, const MDBX_cursor *mc, const uint8_t flags) {
|
||||
/* If txn is updating the GC, then the retired-list cannot play catch-up with
|
||||
* itself by growing while trying to save it. */
|
||||
if (mc->tree == &txn->dbs[FREE_DBI] && !(flags & ALLOC_RESERVE) && !(mc->flags & z_gcu_preparation))
|
||||
STATIC_ASSERT(ALLOC_RESERVE == z_gcu_preparation);
|
||||
if (mc->tree == &txn->dbs[FREE_DBI] && !((flags | mc->flags) & z_gcu_preparation))
|
||||
return false;
|
||||
|
||||
/* avoid search inside empty tree and while tree is updating,
|
||||
@ -590,12 +587,10 @@ static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, const uint
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { return txl_contain(txn->tw.gc.retxl, id); }
|
||||
|
||||
__hot static pgno_t repnl_get_single(MDBX_txn *txn) {
|
||||
const size_t len = MDBX_PNL_GETSIZE(txn->tw.repnl);
|
||||
const size_t len = MDBX_PNL_GETSIZE(txn->wr.repnl);
|
||||
assert(len > 0);
|
||||
pgno_t *target = MDBX_PNL_EDGE(txn->tw.repnl);
|
||||
pgno_t *target = MDBX_PNL_EDGE(txn->wr.repnl);
|
||||
const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 1 : -1;
|
||||
|
||||
/* Есть ТРИ потенциально выигрышные, но противо-направленные тактики:
|
||||
@ -663,7 +658,7 @@ __hot static pgno_t repnl_get_single(MDBX_txn *txn) {
|
||||
#else
|
||||
/* вырезаем элемент с перемещением хвоста */
|
||||
const pgno_t pgno = *scan;
|
||||
MDBX_PNL_SETSIZE(txn->tw.repnl, len - 1);
|
||||
MDBX_PNL_SETSIZE(txn->wr.repnl, len - 1);
|
||||
while (++scan <= target)
|
||||
scan[-1] = *scan;
|
||||
return pgno;
|
||||
@ -676,44 +671,44 @@ __hot static pgno_t repnl_get_single(MDBX_txn *txn) {
|
||||
const pgno_t pgno = *target;
|
||||
#if MDBX_PNL_ASCENDING
|
||||
/* вырезаем элемент с перемещением хвоста */
|
||||
MDBX_PNL_SETSIZE(txn->tw.repnl, len - 1);
|
||||
for (const pgno_t *const end = txn->tw.repnl + len - 1; target <= end; ++target)
|
||||
MDBX_PNL_SETSIZE(txn->wr.repnl, len - 1);
|
||||
for (const pgno_t *const end = txn->wr.repnl + len - 1; target <= end; ++target)
|
||||
*target = target[1];
|
||||
#else
|
||||
/* перемещать хвост не нужно, просто усекам список */
|
||||
MDBX_PNL_SETSIZE(txn->tw.repnl, len - 1);
|
||||
MDBX_PNL_SETSIZE(txn->wr.repnl, len - 1);
|
||||
#endif
|
||||
return pgno;
|
||||
}
|
||||
|
||||
__hot static pgno_t repnl_get_sequence(MDBX_txn *txn, const size_t num, uint8_t flags) {
|
||||
const size_t len = MDBX_PNL_GETSIZE(txn->tw.repnl);
|
||||
pgno_t *edge = MDBX_PNL_EDGE(txn->tw.repnl);
|
||||
const size_t len = MDBX_PNL_GETSIZE(txn->wr.repnl);
|
||||
pgno_t *edge = MDBX_PNL_EDGE(txn->wr.repnl);
|
||||
assert(len >= num && num > 1);
|
||||
const size_t seq = num - 1;
|
||||
#if !MDBX_PNL_ASCENDING
|
||||
if (edge[-(ptrdiff_t)seq] - *edge == seq) {
|
||||
if (unlikely(flags & ALLOC_RESERVE))
|
||||
return P_INVALID;
|
||||
assert(edge == scan4range_checker(txn->tw.repnl, seq));
|
||||
assert(edge == scan4range_checker(txn->wr.repnl, seq));
|
||||
/* перемещать хвост не нужно, просто усекам список */
|
||||
MDBX_PNL_SETSIZE(txn->tw.repnl, len - num);
|
||||
MDBX_PNL_SETSIZE(txn->wr.repnl, len - num);
|
||||
return *edge;
|
||||
}
|
||||
#endif
|
||||
pgno_t *target = scan4seq_impl(edge, len, seq);
|
||||
assert(target == scan4range_checker(txn->tw.repnl, seq));
|
||||
assert(target == scan4range_checker(txn->wr.repnl, seq));
|
||||
if (target) {
|
||||
if (unlikely(flags & ALLOC_RESERVE))
|
||||
return P_INVALID;
|
||||
const pgno_t pgno = *target;
|
||||
/* вырезаем найденную последовательность с перемещением хвоста */
|
||||
MDBX_PNL_SETSIZE(txn->tw.repnl, len - num);
|
||||
MDBX_PNL_SETSIZE(txn->wr.repnl, len - num);
|
||||
#if MDBX_PNL_ASCENDING
|
||||
for (const pgno_t *const end = txn->tw.repnl + len - num; target <= end; ++target)
|
||||
for (const pgno_t *const end = txn->wr.repnl + len - num; target <= end; ++target)
|
||||
*target = target[num];
|
||||
#else
|
||||
for (const pgno_t *const end = txn->tw.repnl + len; ++target <= end;)
|
||||
for (const pgno_t *const end = txn->wr.repnl + len; ++target <= end;)
|
||||
target[-(ptrdiff_t)num] = *target;
|
||||
#endif
|
||||
return pgno;
|
||||
@ -721,6 +716,10 @@ __hot static pgno_t repnl_get_sequence(MDBX_txn *txn, const size_t num, uint8_t
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool gc_repnl_has_span(const MDBX_txn *txn, const size_t num) {
|
||||
return (num > 1) ? repnl_get_sequence((MDBX_txn *)txn, num, ALLOC_RESERVE) != 0 : !MDBX_PNL_IS_EMPTY(txn->wr.repnl);
|
||||
}
|
||||
|
||||
static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn, const MDBX_cursor *const mc,
|
||||
const pgno_t pgno, const size_t num) {
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
@ -762,7 +761,7 @@ static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn
|
||||
* обновляться PTE с последующей генерацией page-fault и чтением данных из
|
||||
* грязной I/O очереди. Из-за этого штраф за лишнюю запись может быть
|
||||
* сравним с избегаемым ненужным чтением. */
|
||||
if (txn->tw.prefault_write_activated) {
|
||||
if (txn->wr.prefault_write_activated) {
|
||||
void *const pattern = ptr_disp(env->page_auxbuf, need_clean ? env->ps : env->ps * 2);
|
||||
size_t file_offset = pgno2bytes(env, pgno);
|
||||
if (likely(num == 1)) {
|
||||
@ -823,7 +822,7 @@ static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn
|
||||
|
||||
ret.err = page_dirty(txn, ret.page, (pgno_t)num);
|
||||
bailout:
|
||||
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
size_t majflt_after;
|
||||
prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before;
|
||||
@ -842,8 +841,15 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
|
||||
prof->spe_counter += 1;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
|
||||
/* Если взведен флажок ALLOC_RESERVE, то требуется только обеспечение соответствующего резерва в txn->wr.repnl
|
||||
* и/или txn->wr.gc.reclaimed, но без выделения и возврата страницы. При этом возможны три варианта вызова:
|
||||
* 1. num == 0 — требуется слот для возврата в GC остатков ранее переработанных/извлеченных страниц,
|
||||
* при этом нет смысла перерабатывать длинные записи, так как тогда дефицит свободных id/слотов не уменьшится;
|
||||
* 2. num == 1 — требуется увеличение резерва перед обновлением GC;
|
||||
* 3. num > 1 — требуется последовательность страниц для сохранения retired-страниц
|
||||
* при выключенном MDBX_ENABLE_BIGFOOT. */
|
||||
eASSERT(env, num > 0 || (flags & ALLOC_RESERVE));
|
||||
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
|
||||
size_t newnext;
|
||||
const uint64_t monotime_begin = (MDBX_ENABLE_PROFGC || (num > 1 && env->options.gc_time_limit)) ? osal_monotime() : 0;
|
||||
@ -858,21 +864,20 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
prof->xpages += 1;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
if (MDBX_PNL_GETSIZE(txn->tw.repnl) >= num) {
|
||||
eASSERT(env, MDBX_PNL_LAST(txn->tw.repnl) < txn->geo.first_unallocated &&
|
||||
MDBX_PNL_FIRST(txn->tw.repnl) < txn->geo.first_unallocated);
|
||||
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
|
||||
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
|
||||
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
|
||||
pgno = repnl_get_sequence(txn, num, flags);
|
||||
if (likely(pgno))
|
||||
goto done;
|
||||
}
|
||||
} else {
|
||||
eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.repnl) == 0);
|
||||
eASSERT(env, !(flags & ALLOC_RESERVE) || num == 0);
|
||||
eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || (flags & ALLOC_RESERVE));
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
if (unlikely(!is_gc_usable(txn, mc, flags))) {
|
||||
if (unlikely(!is_reclaimable(txn, mc, flags))) {
|
||||
eASSERT(env, (txn->flags & txn_gc_drained) || num > 1);
|
||||
goto no_gc;
|
||||
}
|
||||
@ -880,22 +885,19 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
|
||||
eASSERT(env, (flags & (ALLOC_COALESCE | ALLOC_LIFO | ALLOC_SHOULD_SCAN)) == 0);
|
||||
flags += (env->flags & MDBX_LIFORECLAIM) ? ALLOC_LIFO : 0;
|
||||
|
||||
if (/* Не коагулируем записи при подготовке резерва для обновления GC.
|
||||
* Иначе попытка увеличить резерв может приводить к необходимости ещё
|
||||
* большего резерва из-за увеличения списка переработанных страниц. */
|
||||
(flags & ALLOC_RESERVE) == 0) {
|
||||
if (txn->dbs[FREE_DBI].branch_pages && MDBX_PNL_GETSIZE(txn->tw.repnl) < env->maxgc_large1page / 2)
|
||||
flags += ALLOC_COALESCE;
|
||||
}
|
||||
/* Не коагулируем записи в случае запроса слота для возврата страниц в GC. Иначе попытка увеличить резерв
|
||||
* может приводить к необходимости ещё большего резерва из-за увеличения списка переработанных страниц. */
|
||||
if (num > 0 && txn->dbs[FREE_DBI].branch_pages && MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2)
|
||||
flags += ALLOC_COALESCE;
|
||||
|
||||
MDBX_cursor *const gc = ptr_disp(env->basal_txn, sizeof(MDBX_txn));
|
||||
MDBX_cursor *const gc = txn_gc_cursor(txn);
|
||||
eASSERT(env, mc != gc && gc->next == gc);
|
||||
gc->txn = txn;
|
||||
gc->dbi_state = txn->dbi_state;
|
||||
gc->top_and_flags = z_fresh_mark;
|
||||
|
||||
txn->tw.prefault_write_activated = env->options.prefault_write;
|
||||
if (txn->tw.prefault_write_activated) {
|
||||
txn->wr.prefault_write_activated = !env->incore && env->options.prefault_write;
|
||||
if (txn->wr.prefault_write_activated) {
|
||||
/* Проверка посредством minicore() существенно снижает затраты, но в
|
||||
* простейших случаях (тривиальный бенчмарк) интегральная производительность
|
||||
* становится вдвое меньше. А на платформах без mincore() и с проблемной
|
||||
@ -908,48 +910,41 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
|
||||
(txn->dbs[FREE_DBI].branch_pages == 0 && txn->geo.now < 1234) ||
|
||||
/* Не суетимся если страница в зоне включенного упреждающего чтения */
|
||||
(readahead_enabled && pgno + num < readahead_edge))
|
||||
txn->tw.prefault_write_activated = false;
|
||||
txn->wr.prefault_write_activated = false;
|
||||
}
|
||||
|
||||
retry_gc_refresh_oldest:;
|
||||
txnid_t oldest = txn_snapshot_oldest(txn);
|
||||
retry_gc_have_oldest:
|
||||
if (unlikely(oldest >= txn->txnid)) {
|
||||
ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN " for current-txnid %" PRIaTXN, oldest, txn->txnid);
|
||||
retry_gc_refresh_detent:
|
||||
txn_gc_detent(txn);
|
||||
retry_gc_have_detent:
|
||||
if (unlikely(txn->env->gc.detent >= txn->txnid)) {
|
||||
FATAL("unexpected/invalid gc-detent %" PRIaTXN " for current-txnid %" PRIaTXN, txn->env->gc.detent, txn->txnid);
|
||||
ret.err = MDBX_PROBLEM;
|
||||
goto fail;
|
||||
}
|
||||
const txnid_t detent = oldest + 1;
|
||||
|
||||
txnid_t id = 0;
|
||||
MDBX_cursor_op op = MDBX_FIRST;
|
||||
if (flags & ALLOC_LIFO) {
|
||||
if (!txn->tw.gc.retxl) {
|
||||
txn->tw.gc.retxl = txl_alloc();
|
||||
if (unlikely(!txn->tw.gc.retxl)) {
|
||||
ret.err = MDBX_ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
/* Begin lookup backward from oldest reader */
|
||||
id = detent - 1;
|
||||
id = txn->env->gc.detent;
|
||||
op = MDBX_SET_RANGE;
|
||||
} else if (txn->tw.gc.last_reclaimed) {
|
||||
} else {
|
||||
/* Continue lookup forward from last-reclaimed */
|
||||
id = txn->tw.gc.last_reclaimed + 1;
|
||||
if (id >= detent)
|
||||
goto depleted_gc;
|
||||
op = MDBX_SET_RANGE;
|
||||
id = rkl_highest(&txn->wr.gc.reclaimed);
|
||||
if (id) {
|
||||
id += 1;
|
||||
op = MDBX_SET_RANGE;
|
||||
if (id >= txn->env->gc.detent)
|
||||
goto depleted_gc;
|
||||
}
|
||||
}
|
||||
|
||||
next_gc:;
|
||||
MDBX_val key;
|
||||
key.iov_base = &id;
|
||||
key.iov_len = sizeof(id);
|
||||
|
||||
next_gc:
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
prof->rsteps += 1;
|
||||
prof->rsteps += 1
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
;
|
||||
MDBX_val key = {.iov_base = &id, .iov_len = sizeof(id)};
|
||||
|
||||
/* Seek first/next GC record */
|
||||
ret.err = cursor_ops(gc, &key, nullptr, op);
|
||||
@ -967,15 +962,18 @@ next_gc:;
|
||||
ret.err = MDBX_CORRUPTED;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
id = unaligned_peek_u64(4, key.iov_base);
|
||||
if (flags & ALLOC_LIFO) {
|
||||
op = MDBX_PREV;
|
||||
if (id >= detent || is_already_reclaimed(txn, id))
|
||||
if (id >= txn->env->gc.detent || gc_is_reclaimed(txn, id))
|
||||
goto next_gc;
|
||||
} else {
|
||||
op = MDBX_NEXT;
|
||||
if (unlikely(id >= detent))
|
||||
if (unlikely(id >= txn->env->gc.detent))
|
||||
goto depleted_gc;
|
||||
op = MDBX_NEXT;
|
||||
if (gc_is_reclaimed(txn, id))
|
||||
goto next_gc;
|
||||
}
|
||||
txn->flags &= ~txn_gc_drained;
|
||||
|
||||
@ -994,59 +992,75 @@ next_gc:;
|
||||
}
|
||||
|
||||
const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl);
|
||||
TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl));
|
||||
TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl));
|
||||
|
||||
if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl) >= env->maxgc_large1page)) {
|
||||
/* Don't try to coalesce too much. */
|
||||
if (unlikely(!num)) {
|
||||
/* TODO: Проверка критериев пункта 2 сформулированного в gc_provide_slots().
|
||||
* Сейчас тут сильно упрощенная и не совсем верная проверка, так как пока недоступна информация о кол-ве имеющихся
|
||||
* слотов и их дефиците для возврата wr.repl. */
|
||||
if (gc_len > env->maxgc_large1page / 4 * 3
|
||||
/* если запись достаточно длинная, то переработка слота не особо увеличит место для возврата wr.repl, и т.п. */
|
||||
&& MDBX_PNL_GETSIZE(txn->wr.repnl) + gc_len > env->maxgc_large1page /* не помещается в хвост */) {
|
||||
DEBUG("avoid reclaiming %" PRIaTXN " slot, since it is too long (%zu)", id, gc_len);
|
||||
ret.err = MDBX_NOTFOUND;
|
||||
goto reserve_done;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) /* Don't try to coalesce too much. */ >=
|
||||
env->maxgc_large1page)) {
|
||||
if (flags & ALLOC_SHOULD_SCAN) {
|
||||
eASSERT(env, flags & ALLOC_COALESCE);
|
||||
eASSERT(env, !(flags & ALLOC_RESERVE));
|
||||
eASSERT(env, num > 0);
|
||||
eASSERT(env, (flags & ALLOC_COALESCE) /* && !(flags & ALLOC_RESERVE) */ && num > 0);
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
env->lck->pgops.gc_prof.coalescences += 1;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
TRACE("clear %s %s", "ALLOC_COALESCE", "since got threshold");
|
||||
if (MDBX_PNL_GETSIZE(txn->tw.repnl) >= num) {
|
||||
eASSERT(env, MDBX_PNL_LAST(txn->tw.repnl) < txn->geo.first_unallocated &&
|
||||
MDBX_PNL_FIRST(txn->tw.repnl) < txn->geo.first_unallocated);
|
||||
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
|
||||
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
|
||||
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
|
||||
if (likely(num == 1)) {
|
||||
pgno = repnl_get_single(txn);
|
||||
pgno = (flags & ALLOC_RESERVE) ? P_INVALID : repnl_get_single(txn);
|
||||
goto done;
|
||||
}
|
||||
pgno = repnl_get_sequence(txn, num, flags);
|
||||
if (likely(pgno))
|
||||
goto done;
|
||||
}
|
||||
flags -= ALLOC_COALESCE | ALLOC_SHOULD_SCAN;
|
||||
}
|
||||
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(txn->tw.repnl) >= env->options.rp_augment_limit) &&
|
||||
flags &= ~(ALLOC_COALESCE | ALLOC_SHOULD_SCAN);
|
||||
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(txn->wr.repnl) >= env->options.rp_augment_limit) &&
|
||||
((/* not a slot-request from gc-update */ num &&
|
||||
/* have enough unallocated space */ txn->geo.upper >= txn->geo.first_unallocated + num &&
|
||||
monotime_since_cached(monotime_begin, &now_cache) + txn->tw.gc.time_acc >= env->options.gc_time_limit) ||
|
||||
gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl) >= PAGELIST_LIMIT)) {
|
||||
monotime_since_cached(monotime_begin, &now_cache) + txn->wr.gc.spent >= env->options.gc_time_limit) ||
|
||||
gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) >= PAGELIST_LIMIT)) {
|
||||
/* Stop reclaiming to avoid large/overflow the page list. This is a rare
|
||||
* case while search for a continuously multi-page region in a
|
||||
* large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */
|
||||
* case while search for a continuously multi-page region in a large database,
|
||||
* see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */
|
||||
NOTICE("stop reclaiming %s: %zu (current) + %zu "
|
||||
"(chunk) -> %zu, rp_augment_limit %u",
|
||||
likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl) < PAGELIST_LIMIT) ? "since rp_augment_limit was reached"
|
||||
"(chunk) >= %zu, rp_augment_limit %u",
|
||||
likely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) < PAGELIST_LIMIT) ? "since rp_augment_limit was reached"
|
||||
: "to avoid PNL overflow",
|
||||
MDBX_PNL_GETSIZE(txn->tw.repnl), gc_len, gc_len + MDBX_PNL_GETSIZE(txn->tw.repnl),
|
||||
MDBX_PNL_GETSIZE(txn->wr.repnl), gc_len, gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl),
|
||||
env->options.rp_augment_limit);
|
||||
goto depleted_gc;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remember ID of readed GC record */
|
||||
txn->tw.gc.last_reclaimed = id;
|
||||
if (flags & ALLOC_LIFO) {
|
||||
ret.err = txl_append(&txn->tw.gc.retxl, id);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
}
|
||||
ret.err = rkl_push(&txn->wr.gc.reclaimed, id,
|
||||
false /* Вместо false, тут можно передавать/использовать (flags & ALLOC_LIFO) == 0, тогда
|
||||
* дыры/пропуски в идентификаторах GC будут образовывать непрерывные интервалы в wr.gc.reclaimed,
|
||||
* что обеспечит больше свободных идентификаторов/слотов для возврата страниц. Однако, это
|
||||
* также приведёт к пустым попыткам удаления отсутствующих записей в gc_clear_reclaimed(),
|
||||
* а далее к перекладыванию этих сплошных интервалов поэлементно в ready4reuse.
|
||||
* Поэтому смысла в этом решительно нет. Следует либо формировать сплошные интервалы при
|
||||
* работе gc_clear_reclaimed(), особенно в FIFO-режиме, либо искать их только в gc_provide_ids() */);
|
||||
TRACE("%" PRIaTXN " len %zu pushed to txn-rkl, err %d", id, gc_len, ret.err);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
/* Append PNL from GC record to tw.repnl */
|
||||
ret.err = pnl_need(&txn->tw.repnl, gc_len);
|
||||
/* Append PNL from GC record to wr.repnl */
|
||||
ret.err = pnl_need(&txn->wr.repnl, gc_len);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
@ -1061,53 +1075,56 @@ next_gc:;
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
const uint64_t merge_begin = osal_monotime();
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
pnl_merge(txn->tw.repnl, gc_pnl);
|
||||
pnl_merge(txn->wr.repnl, gc_pnl);
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
prof->pnl_merge.calls += 1;
|
||||
prof->pnl_merge.volume += MDBX_PNL_GETSIZE(txn->tw.repnl);
|
||||
prof->pnl_merge.volume += MDBX_PNL_GETSIZE(txn->wr.repnl);
|
||||
prof->pnl_merge.time += osal_monotime() - merge_begin;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
flags |= ALLOC_SHOULD_SCAN;
|
||||
if (AUDIT_ENABLED()) {
|
||||
if (unlikely(!pnl_check(txn->tw.repnl, txn->geo.first_unallocated))) {
|
||||
if (unlikely(!pnl_check(txn->wr.repnl, txn->geo.first_unallocated))) {
|
||||
ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid txn retired-list");
|
||||
ret.err = MDBX_CORRUPTED;
|
||||
goto fail;
|
||||
}
|
||||
} else {
|
||||
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated));
|
||||
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated));
|
||||
}
|
||||
eASSERT(env, dpl_check(txn));
|
||||
|
||||
eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.repnl) == 0 || MDBX_PNL_MOST(txn->tw.repnl) < txn->geo.first_unallocated);
|
||||
if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.repnl) &&
|
||||
unlikely(MDBX_PNL_MOST(txn->tw.repnl) == txn->geo.first_unallocated - 1)) {
|
||||
eASSERT(env, MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || MDBX_PNL_MOST(txn->wr.repnl) < txn->geo.first_unallocated);
|
||||
if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->wr.repnl) &&
|
||||
unlikely(MDBX_PNL_MOST(txn->wr.repnl) == txn->geo.first_unallocated - 1)) {
|
||||
/* Refund suitable pages into "unallocated" space */
|
||||
txn_refund(txn);
|
||||
}
|
||||
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
|
||||
/* Done for a kick-reclaim mode, actually no page needed */
|
||||
if (unlikely(num == 0)) {
|
||||
eASSERT(env, ret.err == MDBX_SUCCESS);
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
|
||||
goto early_exit;
|
||||
}
|
||||
|
||||
/* TODO: delete reclaimed records */
|
||||
/* TODO: удаление загруженных из GC записей */
|
||||
|
||||
eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT);
|
||||
if (flags & ALLOC_COALESCE) {
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
|
||||
goto next_gc;
|
||||
if (MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2) {
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
|
||||
goto next_gc;
|
||||
}
|
||||
flags -= ALLOC_COALESCE;
|
||||
}
|
||||
|
||||
scan:
|
||||
if ((flags & ALLOC_RESERVE) && num < 2) {
|
||||
/* Если был нужен только slot/id для gc_reclaim_slot() или gc_reserve4stockpile() */
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "reserve-done", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
|
||||
ret.err = MDBX_SUCCESS;
|
||||
goto reserve_done;
|
||||
}
|
||||
|
||||
eASSERT(env, flags & ALLOC_SHOULD_SCAN);
|
||||
eASSERT(env, num > 0);
|
||||
if (MDBX_PNL_GETSIZE(txn->tw.repnl) >= num) {
|
||||
eASSERT(env, MDBX_PNL_LAST(txn->tw.repnl) < txn->geo.first_unallocated &&
|
||||
MDBX_PNL_FIRST(txn->tw.repnl) < txn->geo.first_unallocated);
|
||||
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
|
||||
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
|
||||
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
|
||||
if (likely(num == 1)) {
|
||||
eASSERT(env, !(flags & ALLOC_RESERVE));
|
||||
pgno = repnl_get_single(txn);
|
||||
@ -1118,17 +1135,16 @@ scan:
|
||||
goto done;
|
||||
}
|
||||
flags -= ALLOC_SHOULD_SCAN;
|
||||
if (ret.err == MDBX_SUCCESS) {
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
|
||||
if ((txn->flags & txn_gc_drained) == 0) {
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
|
||||
goto next_gc;
|
||||
}
|
||||
|
||||
depleted_gc:
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, MDBX_PNL_GETSIZE(txn->tw.repnl));
|
||||
ret.err = MDBX_NOTFOUND;
|
||||
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
|
||||
txn->flags |= txn_gc_drained;
|
||||
if (flags & ALLOC_SHOULD_SCAN)
|
||||
goto scan;
|
||||
txn->flags |= txn_gc_drained;
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
@ -1143,11 +1159,11 @@ depleted_gc:
|
||||
newnext = txn->geo.first_unallocated + num;
|
||||
|
||||
/* Does reclaiming stopped at the last steady point? */
|
||||
const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
|
||||
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
|
||||
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && detent == prefer_steady.txnid + 1) {
|
||||
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s, detent %" PRIaTXN, recent.txnid,
|
||||
durable_caption(recent.ptr_c), prefer_steady.txnid, durable_caption(prefer_steady.ptr_c), detent);
|
||||
const meta_ptr_t recent = meta_recent(env, &txn->wr.troika);
|
||||
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->wr.troika);
|
||||
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && txn->env->gc.detent == prefer_steady.txnid) {
|
||||
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s", recent.txnid, durable_caption(recent.ptr_c),
|
||||
prefer_steady.txnid, durable_caption(prefer_steady.ptr_c));
|
||||
const pgno_t autosync_threshold = atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
|
||||
const uint64_t autosync_period = atomic_load64(&env->lck->autosync_period, mo_Relaxed);
|
||||
uint64_t eoos_timestamp;
|
||||
@ -1166,12 +1182,12 @@ depleted_gc:
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
env->lck->pgops.gc_prof.wipes += 1;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
ret.err = meta_wipe_steady(env, detent);
|
||||
ret.err = meta_wipe_steady(env, txn->env->gc.detent);
|
||||
DEBUG("gc-wipe-steady, rc %d", ret.err);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->tw.troika).ptr_c);
|
||||
goto retry_gc_refresh_oldest;
|
||||
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->wr.troika).ptr_c);
|
||||
goto retry_gc_refresh_detent;
|
||||
}
|
||||
if ((autosync_threshold && atomic_load64(&env->lck->unsynced_pages, mo_Relaxed) >= autosync_threshold) ||
|
||||
(autosync_period && (eoos_timestamp = atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
|
||||
@ -1183,21 +1199,18 @@ depleted_gc:
|
||||
env->lck->pgops.gc_prof.flushes += 1;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
meta_t meta = *recent.ptr_c;
|
||||
ret.err = dxb_sync_locked(env, env->flags & MDBX_WRITEMAP, &meta, &txn->tw.troika);
|
||||
ret.err = dxb_sync_locked(env, env->flags & MDBX_WRITEMAP, &meta, &txn->wr.troika);
|
||||
DEBUG("gc-make-steady, rc %d", ret.err);
|
||||
eASSERT(env, ret.err != MDBX_RESULT_TRUE);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->tw.troika).ptr_c);
|
||||
goto retry_gc_refresh_oldest;
|
||||
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->wr.troika).ptr_c);
|
||||
goto retry_gc_refresh_detent;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(true == atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease))) {
|
||||
oldest = txn_snapshot_oldest(txn);
|
||||
if (oldest >= detent)
|
||||
goto retry_gc_have_oldest;
|
||||
}
|
||||
if (unlikely(true == atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease)) && txn_gc_detent(txn))
|
||||
goto retry_gc_have_detent;
|
||||
|
||||
/* Avoid kick lagging reader(s) if is enough unallocated space
|
||||
* at the end of database file. */
|
||||
@ -1206,11 +1219,8 @@ depleted_gc:
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (oldest < txn->txnid - xMDBX_TXNID_STEP) {
|
||||
oldest = mvcc_kick_laggards(env, oldest);
|
||||
if (oldest >= detent)
|
||||
goto retry_gc_have_oldest;
|
||||
}
|
||||
if (txn->txnid - txn->env->gc.detent > xMDBX_TXNID_STEP && mvcc_kick_laggards(env, txn->env->gc.detent))
|
||||
goto retry_gc_refresh_detent;
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
@ -1263,7 +1273,7 @@ done:
|
||||
if (likely((flags & ALLOC_RESERVE) == 0)) {
|
||||
if (pgno) {
|
||||
eASSERT(env, pgno + num <= txn->geo.first_unallocated && pgno >= NUM_METAS);
|
||||
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
} else {
|
||||
pgno = txn->geo.first_unallocated;
|
||||
txn->geo.first_unallocated += (pgno_t)num;
|
||||
@ -1275,32 +1285,42 @@ done:
|
||||
if (unlikely(ret.err != MDBX_SUCCESS)) {
|
||||
fail:
|
||||
eASSERT(env, ret.err != MDBX_SUCCESS);
|
||||
eASSERT(env, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
int level;
|
||||
const char *what;
|
||||
if (flags & ALLOC_RESERVE) {
|
||||
level = (flags & ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
|
||||
what = num ? "reserve-pages" : "fetch-slot";
|
||||
} else {
|
||||
if (flags & ALLOC_UNIMPORTANT)
|
||||
level = MDBX_LOG_DEBUG;
|
||||
else if (flags & ALLOC_RESERVE)
|
||||
level = MDBX_LOG_NOTICE;
|
||||
else {
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
level = MDBX_LOG_ERROR;
|
||||
what = "pages";
|
||||
}
|
||||
if (LOG_ENABLED(level))
|
||||
debug_log(level, __func__, __LINE__,
|
||||
"unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags "
|
||||
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
|
||||
"branch %zu, leaf %zu, large %zu, entries %zu\n",
|
||||
num, what, flags, ret.err, txn->flags, MDBX_PNL_GETSIZE(txn->tw.repnl), txn->tw.loose_count,
|
||||
txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages,
|
||||
(size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages,
|
||||
(size_t)txn->dbs[FREE_DBI].items);
|
||||
if (LOG_ENABLED(level)) {
|
||||
if (num)
|
||||
debug_log(level, __func__, __LINE__,
|
||||
"unable %s %zu, alloc-flags 0x%x, err %d, txn-flags "
|
||||
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
|
||||
"branch %zu, leaf %zu, large %zu, entries %zu\n",
|
||||
(flags & ALLOC_RESERVE) ? "reserve" : "alloc", num, flags, ret.err, txn->flags,
|
||||
MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count, txn->dbs[FREE_DBI].height,
|
||||
(size_t)txn->dbs[FREE_DBI].branch_pages, (size_t)txn->dbs[FREE_DBI].leaf_pages,
|
||||
(size_t)txn->dbs[FREE_DBI].large_pages, (size_t)txn->dbs[FREE_DBI].items);
|
||||
else
|
||||
debug_log(level, __func__, __LINE__,
|
||||
"unable fetch-slot, alloc-flags 0x%x, err %d, txn-flags "
|
||||
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
|
||||
"branch %zu, leaf %zu, large %zu, entries %zu\n",
|
||||
flags, ret.err, txn->flags, MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count,
|
||||
txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages,
|
||||
(size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages,
|
||||
(size_t)txn->dbs[FREE_DBI].items);
|
||||
}
|
||||
ret.page = nullptr;
|
||||
}
|
||||
if (num > 1)
|
||||
txn->tw.gc.time_acc += monotime_since_cached(monotime_begin, &now_cache);
|
||||
txn->wr.gc.spent += monotime_since_cached(monotime_begin, &now_cache);
|
||||
} else {
|
||||
early_exit:
|
||||
reserve_done:
|
||||
DEBUG("return nullptr for %zu pages for ALLOC_%s, rc %d", num, num ? "RESERVE" : "SLOT", ret.err);
|
||||
ret.page = nullptr;
|
||||
}
|
||||
@ -1317,20 +1337,20 @@ __hot pgr_t gc_alloc_single(const MDBX_cursor *const mc) {
|
||||
tASSERT(txn, F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY));
|
||||
|
||||
/* If there are any loose pages, just use them */
|
||||
while (likely(txn->tw.loose_pages)) {
|
||||
while (likely(txn->wr.loose_pages)) {
|
||||
#if MDBX_ENABLE_REFUND
|
||||
if (unlikely(txn->tw.loose_refund_wl > txn->geo.first_unallocated)) {
|
||||
if (unlikely(txn->wr.loose_refund_wl > txn->geo.first_unallocated)) {
|
||||
txn_refund(txn);
|
||||
if (!txn->tw.loose_pages)
|
||||
if (!txn->wr.loose_pages)
|
||||
break;
|
||||
}
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
|
||||
page_t *lp = txn->tw.loose_pages;
|
||||
page_t *lp = txn->wr.loose_pages;
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->env->ps);
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
txn->tw.loose_pages = page_next(lp);
|
||||
txn->tw.loose_count--;
|
||||
txn->wr.loose_pages = page_next(lp);
|
||||
txn->wr.loose_count--;
|
||||
DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, cursor_dbi_dbg(mc), lp->pgno);
|
||||
tASSERT(txn, lp->pgno < txn->geo.first_unallocated);
|
||||
tASSERT(txn, lp->pgno >= NUM_METAS);
|
||||
@ -1340,7 +1360,7 @@ __hot pgr_t gc_alloc_single(const MDBX_cursor *const mc) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (likely(MDBX_PNL_GETSIZE(txn->tw.repnl) > 0))
|
||||
if (likely(MDBX_PNL_GETSIZE(txn->wr.repnl) > 0))
|
||||
return page_alloc_finalize(txn->env, txn, mc, repnl_get_single(txn), 1);
|
||||
|
||||
return gc_alloc_ex(mc, 1, ALLOC_DEFAULT);
|
||||
|
1971
src/gc-put.c
1971
src/gc-put.c
File diff suppressed because it is too large
Load Diff
70
src/gc.h
70
src/gc.h
@ -5,14 +5,37 @@
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* Гистограмма решения нарезки фрагментов для ситуации нехватки идентификаторов/слотов. */
|
||||
typedef struct gc_dense_histogram {
|
||||
/* Размер массива одновременно задаёт максимальный размер последовательностей,
|
||||
* с которыми решается задача распределения.
|
||||
*
|
||||
* Использование длинных последовательностей контрпродуктивно, так как такие последовательности будут
|
||||
* создавать/воспроизводить/повторять аналогичные затруднения при последующей переработке. Однако,
|
||||
* в редких ситуациях это может быть единственным выходом. */
|
||||
unsigned end;
|
||||
pgno_t array[31];
|
||||
} gc_dense_histogram_t;
|
||||
|
||||
typedef struct gc_update_context {
|
||||
unsigned loop;
|
||||
pgno_t prev_first_unallocated;
|
||||
unsigned goodchunk;
|
||||
bool dense;
|
||||
size_t reserve_adj;
|
||||
pgno_t prev_first_unallocated;
|
||||
size_t retired_stored;
|
||||
size_t amount, reserved, cleaned_slot, reused_slot, fill_idx;
|
||||
txnid_t cleaned_id, rid;
|
||||
size_t return_reserved_lo, return_reserved_hi;
|
||||
txnid_t gc_first;
|
||||
intptr_t return_left;
|
||||
#ifndef MDBX_DEBUG_GCU
|
||||
#define MDBX_DEBUG_GCU 0
|
||||
#endif
|
||||
#if MDBX_DEBUG_GCU
|
||||
struct {
|
||||
txnid_t prev;
|
||||
unsigned n;
|
||||
} dbg;
|
||||
#endif /* MDBX_DEBUG_GCU */
|
||||
rkl_t ready4reuse, sequel;
|
||||
#if MDBX_ENABLE_BIGFOOT
|
||||
txnid_t bigfoot;
|
||||
#endif /* MDBX_ENABLE_BIGFOOT */
|
||||
@ -20,21 +43,38 @@ typedef struct gc_update_context {
|
||||
MDBX_cursor cursor;
|
||||
cursor_couple_t couple;
|
||||
};
|
||||
gc_dense_histogram_t dense_histogram;
|
||||
} gcu_t;
|
||||
|
||||
static inline int gc_update_init(MDBX_txn *txn, gcu_t *ctx) {
|
||||
memset(ctx, 0, offsetof(gcu_t, cursor));
|
||||
ctx->dense = txn->txnid <= MIN_TXNID;
|
||||
#if MDBX_ENABLE_BIGFOOT
|
||||
ctx->bigfoot = txn->txnid;
|
||||
#endif /* MDBX_ENABLE_BIGFOOT */
|
||||
return cursor_init(&ctx->cursor, txn, FREE_DBI);
|
||||
}
|
||||
MDBX_INTERNAL int gc_put_init(MDBX_txn *txn, gcu_t *ctx);
|
||||
MDBX_INTERNAL void gc_put_destroy(gcu_t *ctx);
|
||||
|
||||
#define ALLOC_DEFAULT 0 /* штатное/обычное выделение страниц */
|
||||
#define ALLOC_UNIMPORTANT 1 /* запрос неважен, невозможность выделения не приведет к ошибке транзакции */
|
||||
#define ALLOC_RESERVE 2 /* подготовка резерва для обновления GC, без аллокации */
|
||||
#define ALLOC_COALESCE 4 /* внутреннее состояние/флажок */
|
||||
#define ALLOC_SHOULD_SCAN 8 /* внутреннее состояние/флажок */
|
||||
#define ALLOC_LIFO 16 /* внутреннее состояние/флажок */
|
||||
|
||||
#define ALLOC_DEFAULT 0
|
||||
#define ALLOC_RESERVE 1
|
||||
#define ALLOC_UNIMPORTANT 2
|
||||
MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags);
|
||||
|
||||
MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc);
|
||||
MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_stockpile(const MDBX_txn *txn) {
|
||||
return MDBX_PNL_GETSIZE(txn->wr.repnl) + txn->wr.loose_count;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_chunk_bytes(const size_t chunk) {
|
||||
return (chunk + 1) * sizeof(pgno_t);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL bool gc_repnl_has_span(const MDBX_txn *txn, const size_t num);
|
||||
|
||||
static inline bool gc_is_reclaimed(const MDBX_txn *txn, const txnid_t id) {
|
||||
return rkl_contain(&txn->wr.gc.reclaimed, id) || rkl_contain(&txn->wr.gc.comeback, id);
|
||||
}
|
||||
|
||||
static inline txnid_t txnid_min(txnid_t a, txnid_t b) { return (a < b) ? a : b; }
|
||||
|
||||
static inline txnid_t txnid_max(txnid_t a, txnid_t b) { return (a > b) ? a : b; }
|
||||
|
@ -41,11 +41,12 @@ typedef struct node_search_result {
|
||||
|
||||
typedef struct bind_reader_slot_result {
|
||||
int err;
|
||||
reader_slot_t *rslot;
|
||||
reader_slot_t *slot;
|
||||
} bsr_t;
|
||||
|
||||
#include "atomics-ops.h"
|
||||
#include "proto.h"
|
||||
#include "rkl.h"
|
||||
#include "txl.h"
|
||||
#include "unaligned.h"
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
@ -155,7 +156,8 @@ enum txn_flags {
|
||||
txn_rw_begin_flags = MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY,
|
||||
txn_shrink_allowed = UINT32_C(0x40000000),
|
||||
txn_parked = MDBX_TXN_PARKED,
|
||||
txn_gc_drained = 0x40 /* GC was depleted up to oldest reader */,
|
||||
txn_gc_drained = 0x80 /* GC was depleted up to oldest reader */,
|
||||
txn_may_have_cursors = 0x100,
|
||||
txn_state_flags = MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD |
|
||||
MDBX_TXN_INVALID | txn_gc_drained
|
||||
};
|
||||
@ -205,17 +207,16 @@ struct MDBX_txn {
|
||||
|
||||
union {
|
||||
struct {
|
||||
/* For read txns: This thread/txn's reader table slot, or nullptr. */
|
||||
reader_slot_t *reader;
|
||||
} to;
|
||||
/* For read txns: This thread/txn's slot table slot, or nullptr. */
|
||||
reader_slot_t *slot;
|
||||
} ro;
|
||||
struct {
|
||||
troika_t troika;
|
||||
pnl_t __restrict repnl; /* Reclaimed GC pages */
|
||||
struct {
|
||||
/* The list of reclaimed txn-ids from GC */
|
||||
txl_t __restrict retxl;
|
||||
txnid_t last_reclaimed; /* ID of last used record */
|
||||
uint64_t time_acc;
|
||||
rkl_t reclaimed; /* The list of reclaimed txn-ids from GC */
|
||||
uint64_t spent; /* Time spent reading and searching GC */
|
||||
rkl_t comeback; /* The list of ids of records returned into GC during commit, etc */
|
||||
} gc;
|
||||
bool prefault_write_activated;
|
||||
#if MDBX_ENABLE_REFUND
|
||||
@ -235,7 +236,7 @@ struct MDBX_txn {
|
||||
/* The list of loose pages that became unused and may be reused
|
||||
* in this transaction, linked through `page_next()`. */
|
||||
page_t *__restrict loose_pages;
|
||||
/* Number of loose pages (tw.loose_pages) */
|
||||
/* Number of loose pages (wr.loose_pages) */
|
||||
size_t loose_count;
|
||||
union {
|
||||
struct {
|
||||
@ -249,7 +250,7 @@ struct MDBX_txn {
|
||||
size_t writemap_spilled_npages;
|
||||
};
|
||||
/* In write txns, next is located the array of cursors for each DB */
|
||||
} tw;
|
||||
} wr;
|
||||
};
|
||||
};
|
||||
|
||||
@ -285,13 +286,14 @@ struct MDBX_cursor {
|
||||
};
|
||||
/* флаги проверки, в том числе биты для проверки типа листовых страниц. */
|
||||
uint8_t checking;
|
||||
uint8_t pad;
|
||||
|
||||
/* Указывает на txn->dbi_state[] для DBI этого курсора.
|
||||
* Модификатор __restrict тут полезен и безопасен в текущем понимании,
|
||||
* так как пересечение возможно только с dbi_state транзакции,
|
||||
* и происходит по-чтению до последующего изменения/записи. */
|
||||
uint8_t *__restrict dbi_state;
|
||||
/* Связь списка отслеживания курсоров в транзакции */
|
||||
/* Связь списка отслеживания курсоров в транзакции. */
|
||||
MDBX_txn *txn;
|
||||
/* Указывает на tree->dbs[] для DBI этого курсора. */
|
||||
tree_t *tree;
|
||||
@ -360,15 +362,14 @@ struct MDBX_env {
|
||||
uint16_t subpage_reserve_prereq;
|
||||
uint16_t subpage_reserve_limit;
|
||||
atomic_pgno_t mlocked_pgno;
|
||||
uint8_t ps2ln; /* log2 of DB page size */
|
||||
int8_t stuck_meta; /* recovery-only: target meta page or less that zero */
|
||||
uint16_t merge_threshold, merge_threshold_gc; /* pages emptier than this are
|
||||
candidates for merging */
|
||||
unsigned max_readers; /* size of the reader table */
|
||||
MDBX_dbi max_dbi; /* size of the DB table */
|
||||
uint32_t pid; /* process ID of this env */
|
||||
osal_thread_key_t me_txkey; /* thread-key for readers */
|
||||
struct { /* path to the DB files */
|
||||
uint8_t ps2ln; /* log2 of DB page size */
|
||||
int8_t stuck_meta; /* recovery-only: target meta page or less that zero */
|
||||
uint16_t merge_threshold; /* pages emptier than this are candidates for merging */
|
||||
unsigned max_readers; /* size of the reader table */
|
||||
MDBX_dbi max_dbi; /* size of the DB table */
|
||||
uint32_t pid; /* process ID of this env */
|
||||
osal_thread_key_t me_txkey; /* thread-key for readers */
|
||||
struct { /* path to the DB files */
|
||||
pathchar_t *lck, *dxb, *specified;
|
||||
void *buffer;
|
||||
} pathname;
|
||||
@ -465,6 +466,9 @@ struct MDBX_env {
|
||||
/* --------------------------------------------------- mostly volatile part */
|
||||
|
||||
MDBX_txn *txn; /* current write transaction */
|
||||
struct {
|
||||
txnid_t detent;
|
||||
} gc;
|
||||
osal_fastmutex_t dbi_lock;
|
||||
unsigned n_dbi; /* number of DBs opened */
|
||||
|
||||
@ -536,7 +540,9 @@ MDBX_MAYBE_UNUSED static void static_checks(void) {
|
||||
STATIC_ASSERT(offsetof(lck_t, cached_oldest) % MDBX_CACHELINE_SIZE == 0);
|
||||
STATIC_ASSERT(offsetof(lck_t, rdt_length) % MDBX_CACHELINE_SIZE == 0);
|
||||
#endif /* MDBX_LOCKING */
|
||||
#if FLEXIBLE_ARRAY_MEMBERS
|
||||
STATIC_ASSERT(offsetof(lck_t, rdt) % MDBX_CACHELINE_SIZE == 0);
|
||||
#endif /* FLEXIBLE_ARRAY_MEMBERS */
|
||||
|
||||
#if FLEXIBLE_ARRAY_MEMBERS
|
||||
STATIC_ASSERT(NODESIZE == offsetof(node_t, payload));
|
||||
@ -545,11 +551,7 @@ MDBX_MAYBE_UNUSED static void static_checks(void) {
|
||||
STATIC_ASSERT(sizeof(clc_t) == 3 * sizeof(void *));
|
||||
STATIC_ASSERT(sizeof(kvx_t) == 8 * sizeof(void *));
|
||||
|
||||
#if MDBX_WORDBITS == 64
|
||||
#define KVX_SIZE_LN2 6
|
||||
#else
|
||||
#define KVX_SIZE_LN2 5
|
||||
#endif
|
||||
#define KVX_SIZE_LN2 MDBX_WORDBITS_LN2
|
||||
STATIC_ASSERT(sizeof(kvx_t) == (1u << KVX_SIZE_LN2));
|
||||
}
|
||||
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */
|
||||
|
@ -186,7 +186,7 @@ typedef struct reader_slot {
|
||||
/* The header for the reader table (a memory-mapped lock file). */
|
||||
typedef struct shared_lck {
|
||||
/* Stamp identifying this as an MDBX file.
|
||||
* It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */
|
||||
* It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
|
||||
uint64_t magic_and_version;
|
||||
|
||||
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
|
||||
|
@ -49,7 +49,7 @@
|
||||
* = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её
|
||||
* снятия при получении F_RDLCK через F_SETLKW.
|
||||
* - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки
|
||||
* lck-файла:
|
||||
* lck-файла:
|
||||
* + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
|
||||
* посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
|
||||
* + для ЭКСКЛЮЗИВНОГО режима блокировка всего dxb-файла
|
||||
|
@ -1,6 +1,6 @@
|
||||
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>.
|
||||
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
|
||||
.TH MDBX_CHK 1 "2024-08-29" "MDBX 0.13"
|
||||
.TH MDBX_CHK 1 "2025-01-14" "MDBX 0.14"
|
||||
.SH NAME
|
||||
mdbx_chk \- MDBX checking tool
|
||||
.SH SYNOPSIS
|
||||
|
@ -2,7 +2,7 @@
|
||||
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
|
||||
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
|
||||
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
|
||||
.TH MDBX_COPY 1 "2024-08-29" "MDBX 0.13"
|
||||
.TH MDBX_COPY 1 "2025-01-14" "MDBX 0.14"
|
||||
.SH NAME
|
||||
mdbx_copy \- MDBX environment copy tool
|
||||
.SH SYNOPSIS
|
||||
|
@ -1,7 +1,7 @@
|
||||
.\" Copyright 2021-2025 Leonid Yuriev <leo@yuriev.ru>.
|
||||
.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved.
|
||||
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
|
||||
.TH MDBX_DROP 1 "2024-08-29" "MDBX 0.13"
|
||||
.TH MDBX_DROP 1 "2025-01-14" "MDBX 0.14"
|
||||
.SH NAME
|
||||
mdbx_drop \- MDBX database delete tool
|
||||
.SH SYNOPSIS
|
||||
|
@ -2,7 +2,7 @@
|
||||
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
|
||||
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
|
||||
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
|
||||
.TH MDBX_DUMP 1 "2024-08-29" "MDBX 0.13"
|
||||
.TH MDBX_DUMP 1 "2025-01-14" "MDBX 0.14"
|
||||
.SH NAME
|
||||
mdbx_dump \- MDBX environment export tool
|
||||
.SH SYNOPSIS
|
||||
@ -12,6 +12,8 @@ mdbx_dump \- MDBX environment export tool
|
||||
[\c
|
||||
.BR \-q ]
|
||||
[\c
|
||||
.BR \-c ]
|
||||
[\c
|
||||
.BI \-f \ file\fR]
|
||||
[\c
|
||||
.BR \-l ]
|
||||
@ -41,6 +43,9 @@ Write the library version number to the standard output, and exit.
|
||||
.BR \-q
|
||||
Be quiet.
|
||||
.TP
|
||||
.BR \-c
|
||||
Concise mode without repeating keys in a dump, but incompatible with Berkeley DB and LMDB.
|
||||
.TP
|
||||
.BR \-f \ file
|
||||
Write to the specified file instead of to the standard output.
|
||||
.TP
|
||||
|
@ -2,7 +2,7 @@
|
||||
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
|
||||
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
|
||||
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
|
||||
.TH MDBX_LOAD 1 "2024-08-29" "MDBX 0.13"
|
||||
.TH MDBX_LOAD 1 "2025-01-14" "MDBX 0.14"
|
||||
.SH NAME
|
||||
mdbx_load \- MDBX environment import tool
|
||||
.SH SYNOPSIS
|
||||
|
@ -2,7 +2,7 @@
|
||||
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
|
||||
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
|
||||
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
|
||||
.TH MDBX_STAT 1 "2024-08-29" "MDBX 0.13"
|
||||
.TH MDBX_STAT 1 "2025-01-14" "MDBX 0.14"
|
||||
.SH NAME
|
||||
mdbx_stat \- MDBX environment status tool
|
||||
.SH SYNOPSIS
|
||||
|
@ -252,9 +252,9 @@ __cold int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto) {
|
||||
/* force oldest refresh */
|
||||
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
|
||||
|
||||
env->basal_txn->tw.troika = meta_tap(env);
|
||||
env->basal_txn->wr.troika = meta_tap(env);
|
||||
for (MDBX_txn *scan = env->basal_txn->nested; scan; scan = scan->nested)
|
||||
scan->tw.troika = env->basal_txn->tw.troika;
|
||||
scan->wr.troika = env->basal_txn->wr.troika;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -50,23 +50,23 @@ bsr_t mvcc_bind_slot(MDBX_env *env) {
|
||||
}
|
||||
}
|
||||
|
||||
result.rslot = &env->lck->rdt[slot];
|
||||
result.slot = &env->lck->rdt[slot];
|
||||
/* Claim the reader slot, carefully since other code
|
||||
* uses the reader table un-mutexed: First reset the
|
||||
* slot, next publish it in lck->rdt_length. After
|
||||
* that, it is safe for mdbx_env_close() to touch it.
|
||||
* When it will be closed, we can finally claim it. */
|
||||
atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
|
||||
safe64_reset(&result.rslot->txnid, true);
|
||||
atomic_store32(&result.slot->pid, 0, mo_AcquireRelease);
|
||||
safe64_reset(&result.slot->txnid, true);
|
||||
if (slot == nreaders)
|
||||
env->lck->rdt_length.weak = (uint32_t)++nreaders;
|
||||
result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
|
||||
atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
|
||||
result.slot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
|
||||
atomic_store32(&result.slot->pid, env->pid, mo_AcquireRelease);
|
||||
lck_rdt_unlock(env);
|
||||
|
||||
if (likely(env->flags & ENV_TXKEY)) {
|
||||
eASSERT(env, env->registered_reader_pid == env->pid);
|
||||
thread_rthc_set(env->me_txkey, result.rslot);
|
||||
thread_rthc_set(env->me_txkey, result.slot);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@ -300,7 +300,7 @@ __cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, int *d
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
|
||||
__cold bool mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
|
||||
DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
|
||||
osal_memory_fence(mo_AcquireRelease, false);
|
||||
MDBX_hsr_func *const callback = env->hsr_callback;
|
||||
@ -308,7 +308,7 @@ __cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
|
||||
bool notify_eof_of_loop = false;
|
||||
int retry = 0;
|
||||
do {
|
||||
const txnid_t steady = env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
|
||||
const txnid_t steady = env->txn->wr.troika.txnid[env->txn->wr.troika.prefer_steady];
|
||||
env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
|
||||
oldest = mvcc_shapshot_oldest(env, steady);
|
||||
eASSERT(env, oldest < env->basal_txn->txnid);
|
||||
@ -374,7 +374,7 @@ __cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
|
||||
if (safe64_read(&stucked->txnid) != straggler || !pid)
|
||||
continue;
|
||||
|
||||
const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
|
||||
const meta_ptr_t head = meta_recent(env, &env->txn->wr.troika);
|
||||
const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
|
||||
const uint64_t head_retired = unaligned_peek_u64(4, head.ptr_c->pages_retired);
|
||||
const size_t space = (head_retired > hold_retired) ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0;
|
||||
@ -410,5 +410,5 @@ __cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
|
||||
NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, straggler, oldest, turn);
|
||||
callback(env, env->txn, 0, 0, straggler, (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
|
||||
}
|
||||
return oldest;
|
||||
return oldest > straggler;
|
||||
}
|
||||
|
14
src/node.c
14
src/node.c
@ -50,14 +50,9 @@ int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, const MDBX
|
||||
is_subpage(mp) ? "sub-" : "", mp->pgno, indx, pgno, key ? key->iov_len : 0, DKEY_DEBUG(key));
|
||||
|
||||
cASSERT(mc, page_type(mp) == P_BRANCH);
|
||||
cASSERT(mc, mp->txnid >= mc->txn->front_txnid);
|
||||
STATIC_ASSERT(NODESIZE % 2 == 0);
|
||||
|
||||
/* Move higher pointers up one slot. */
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
cASSERT(mc, nkeys >= indx);
|
||||
for (size_t i = nkeys; i > indx; --i)
|
||||
mp->entries[i] = mp->entries[i - 1];
|
||||
|
||||
/* Adjust free space offsets. */
|
||||
const size_t branch_bytes = branch_size(mc->txn->env, key);
|
||||
const intptr_t lower = mp->lower + sizeof(indx_t);
|
||||
@ -66,6 +61,13 @@ int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, const MDBX
|
||||
mc->txn->flags |= MDBX_TXN_ERROR;
|
||||
return MDBX_PAGE_FULL;
|
||||
}
|
||||
|
||||
/* Move higher pointers up one slot. */
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
cASSERT(mc, nkeys >= indx);
|
||||
for (size_t i = nkeys; i > indx; --i)
|
||||
mp->entries[i] = mp->entries[i - 1];
|
||||
|
||||
mp->lower = (indx_t)lower;
|
||||
mp->entries[indx] = mp->upper = (indx_t)upper;
|
||||
|
||||
|
@ -257,6 +257,14 @@
|
||||
#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1
|
||||
#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */
|
||||
|
||||
/** if enabled then treats the commit of pure (nothing changes) transactions as special
|
||||
* cases and return \ref MDBX_RESULT_TRUE instead of \ref MDBX_SUCCESS. */
|
||||
#ifndef MDBX_NOSUCCESS_PURE_COMMIT
|
||||
#define MDBX_NOSUCCESS_PURE_COMMIT 0
|
||||
#elif !(MDBX_NOSUCCESS_PURE_COMMIT == 0 || MDBX_NOSUCCESS_PURE_COMMIT == 1)
|
||||
#error MDBX_NOSUCCESS_PURE_COMMIT must be defined as 0 or 1
|
||||
#endif /* MDBX_NOSUCCESS_PURE_COMMIT */
|
||||
|
||||
/** if enabled then instead of the returned error `MDBX_REMOTE`, only a warning is issued, when
|
||||
* the database being opened in non-read-only mode is located in a file system exported via NFS. */
|
||||
#ifndef MDBX_ENABLE_NON_READONLY_EXPORT
|
||||
|
@ -248,7 +248,7 @@ __cold void mdbx_panic(const char *fmt, ...) {
|
||||
unlikely(num < 1 || !message) ? "<troubles with panic-message preparation>" : message;
|
||||
|
||||
if (globals.logger.ptr)
|
||||
debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message);
|
||||
debug_log(MDBX_LOG_FATAL, "mdbx-panic", 0, "%s", const_message);
|
||||
|
||||
while (1) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
@ -262,7 +262,7 @@ __cold void mdbx_panic(const char *fmt, ...) {
|
||||
#endif
|
||||
FatalExit(ERROR_UNHANDLED_ERROR);
|
||||
#else
|
||||
__assert_fail(const_message, "mdbx", 0, "panic");
|
||||
__assert_fail(const_message, "mdbx-panic", 0, const_message);
|
||||
abort();
|
||||
#endif
|
||||
}
|
||||
|
@ -443,8 +443,8 @@ static __always_inline pgr_t page_get_inline(const uint16_t ILL, const MDBX_curs
|
||||
|
||||
const size_t i = dpl_search(spiller, pgno);
|
||||
tASSERT(txn, (intptr_t)i > 0);
|
||||
if (spiller->tw.dirtylist->items[i].pgno == pgno) {
|
||||
r.page = spiller->tw.dirtylist->items[i].ptr;
|
||||
if (spiller->wr.dirtylist->items[i].pgno == pgno) {
|
||||
r.page = spiller->wr.dirtylist->items[i].ptr;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -457,6 +457,8 @@ static __always_inline pgr_t page_get_inline(const uint16_t ILL, const MDBX_curs
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
TRACE("dbi %zu, mc %p, page %u, %p", cursor_dbi(mc), __Wpedantic_format_voidptr(mc), pgno,
|
||||
__Wpedantic_format_voidptr(r.page));
|
||||
if (unlikely(mc->checking & z_pagecheck))
|
||||
return check_page_complete(ILL, r.page, mc, front);
|
||||
|
||||
|
@ -144,14 +144,14 @@ __cold pgr_t __must_check_result page_unspill(MDBX_txn *const txn, const page_t
|
||||
}
|
||||
|
||||
__hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
|
||||
tASSERT(txn, is_modifable(txn, mp) && txn->tw.dirtylist);
|
||||
tASSERT(txn, is_modifable(txn, mp) && txn->wr.dirtylist);
|
||||
tASSERT(txn, !is_largepage(mp) && !is_subpage(mp));
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
const size_t n = dpl_search(txn, mp->pgno);
|
||||
if (MDBX_AVOID_MSYNC && unlikely(txn->tw.dirtylist->items[n].pgno != mp->pgno)) {
|
||||
if (MDBX_AVOID_MSYNC && unlikely(txn->wr.dirtylist->items[n].pgno != mp->pgno)) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP));
|
||||
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1);
|
||||
tASSERT(txn, n > 0 && n <= txn->wr.dirtylist->length + 1);
|
||||
VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->env->lck->pgops.unspill.weak += 1;
|
||||
@ -159,11 +159,11 @@ __hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
|
||||
return page_dirty(txn, (page_t *)mp, 1);
|
||||
}
|
||||
|
||||
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length);
|
||||
tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->pgno && txn->tw.dirtylist->items[n].ptr == mp);
|
||||
tASSERT(txn, n > 0 && n <= txn->wr.dirtylist->length);
|
||||
tASSERT(txn, txn->wr.dirtylist->items[n].pgno == mp->pgno && txn->wr.dirtylist->items[n].ptr == mp);
|
||||
if (!MDBX_AVOID_MSYNC || (txn->flags & MDBX_WRITEMAP) == 0) {
|
||||
size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->tw.dirtylru;
|
||||
size_t *const ptr = ptr_disp(txn->wr.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->wr.dirtylru;
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
@ -179,7 +179,7 @@ __hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *c
|
||||
page_t *np;
|
||||
if (is_frozen(txn, mp)) {
|
||||
/* CoW the page */
|
||||
rc = pnl_need(&txn->tw.retired_pages, 1);
|
||||
rc = pnl_need(&txn->wr.retired_pages, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
const pgr_t par = gc_alloc_single(mc);
|
||||
@ -191,7 +191,7 @@ __hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *c
|
||||
const pgno_t pgno = np->pgno;
|
||||
DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno, pgno);
|
||||
tASSERT(txn, mp->pgno != pgno);
|
||||
pnl_append_prereserved(txn->tw.retired_pages, mp->pgno);
|
||||
pnl_append_prereserved(txn->wr.retired_pages, mp->pgno);
|
||||
/* Update the parent page, if any, to point to the new page */
|
||||
if (likely(mc->top)) {
|
||||
page_t *parent = mc->pg[mc->top - 1];
|
||||
@ -227,7 +227,7 @@ __hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *c
|
||||
}
|
||||
|
||||
DEBUG("clone db %d page %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno);
|
||||
tASSERT(txn, txn->tw.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
|
||||
tASSERT(txn, txn->wr.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
|
||||
/* No - copy it */
|
||||
np = page_shadow_alloc(txn, 1);
|
||||
if (unlikely(!np)) {
|
||||
@ -369,7 +369,7 @@ static inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) {
|
||||
* страница не примыкает к какой-либо из уже находящийся в reclaimed.
|
||||
* 2) стоит подумать над тем, чтобы при большом loose-списке отбрасывать
|
||||
половину в reclaimed. */
|
||||
return txn->tw.loose_count < txn->env->options.dp_loose_limit &&
|
||||
return txn->wr.loose_count < txn->env->options.dp_loose_limit &&
|
||||
(!MDBX_ENABLE_REFUND ||
|
||||
/* skip pages near to the end in favor of compactification */
|
||||
txn->geo.first_unallocated > pgno + txn->env->options.dp_loose_limit ||
|
||||
@ -417,14 +417,14 @@ int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, page_t *mp /* maybe null
|
||||
status = frozen;
|
||||
if (ASSERT_ENABLED()) {
|
||||
for (MDBX_txn *scan = txn; scan; scan = scan->parent) {
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(scan, pgno));
|
||||
tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
|
||||
tASSERT(txn, !txn->wr.spilled.list || !spill_search(scan, pgno));
|
||||
tASSERT(txn, !scan->wr.dirtylist || !debug_dpl_find(scan, pgno));
|
||||
}
|
||||
}
|
||||
goto status_done;
|
||||
} else if (pageflags && txn->tw.dirtylist) {
|
||||
} else if (pageflags && txn->wr.dirtylist) {
|
||||
if ((di = dpl_exist(txn, pgno)) != 0) {
|
||||
mp = txn->tw.dirtylist->items[di].ptr;
|
||||
mp = txn->wr.dirtylist->items[di].ptr;
|
||||
tASSERT(txn, is_modifable(txn, mp));
|
||||
status = modifable;
|
||||
goto status_done;
|
||||
@ -461,16 +461,16 @@ int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, page_t *mp /* maybe null
|
||||
tASSERT(txn, !is_spilled(txn, mp));
|
||||
tASSERT(txn, !is_shadowed(txn, mp));
|
||||
tASSERT(txn, !debug_dpl_find(txn, pgno));
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
|
||||
tASSERT(txn, !txn->wr.spilled.list || !spill_search(txn, pgno));
|
||||
} else if (is_modifable(txn, mp)) {
|
||||
status = modifable;
|
||||
if (txn->tw.dirtylist)
|
||||
if (txn->wr.dirtylist)
|
||||
di = dpl_exist(txn, pgno);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) || !is_spilled(txn, mp));
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
|
||||
tASSERT(txn, !txn->wr.spilled.list || !spill_search(txn, pgno));
|
||||
} else if (is_shadowed(txn, mp)) {
|
||||
status = shadowed;
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
|
||||
tASSERT(txn, !txn->wr.spilled.list || !spill_search(txn, pgno));
|
||||
tASSERT(txn, !debug_dpl_find(txn, pgno));
|
||||
} else {
|
||||
tASSERT(txn, is_spilled(txn, mp));
|
||||
@ -504,7 +504,7 @@ status_done:
|
||||
if (status == frozen) {
|
||||
retire:
|
||||
DEBUG("retire %zu page %" PRIaPGNO, npages, pgno);
|
||||
rc = pnl_append_span(&txn->tw.retired_pages, pgno, npages);
|
||||
rc = pnl_append_span(&txn->wr.retired_pages, pgno, npages);
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
return rc;
|
||||
}
|
||||
@ -560,17 +560,17 @@ status_done:
|
||||
if (status == modifable) {
|
||||
/* Dirty page from this transaction */
|
||||
/* If suitable we can reuse it through loose list */
|
||||
if (likely(npages == 1 && suitable4loose(txn, pgno)) && (di || !txn->tw.dirtylist)) {
|
||||
if (likely(npages == 1 && suitable4loose(txn, pgno)) && (di || !txn->wr.dirtylist)) {
|
||||
DEBUG("loosen dirty page %" PRIaPGNO, pgno);
|
||||
if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
|
||||
memset(page_data(mp), -1, txn->env->ps - PAGEHDRSZ);
|
||||
mp->txnid = INVALID_TXNID;
|
||||
mp->flags = P_LOOSE;
|
||||
page_next(mp) = txn->tw.loose_pages;
|
||||
txn->tw.loose_pages = mp;
|
||||
txn->tw.loose_count++;
|
||||
page_next(mp) = txn->wr.loose_pages;
|
||||
txn->wr.loose_pages = mp;
|
||||
txn->wr.loose_count++;
|
||||
#if MDBX_ENABLE_REFUND
|
||||
txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) ? pgno + 2 : txn->tw.loose_refund_wl;
|
||||
txn->wr.loose_refund_wl = (pgno + 2 > txn->wr.loose_refund_wl) ? pgno + 2 : txn->wr.loose_refund_wl;
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->env->ps - PAGEHDRSZ);
|
||||
MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), txn->env->ps - PAGEHDRSZ);
|
||||
@ -608,8 +608,8 @@ status_done:
|
||||
|
||||
reclaim:
|
||||
DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno);
|
||||
rc = pnl_insert_span(&txn->tw.repnl, pgno, npages);
|
||||
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
rc = pnl_insert_span(&txn->wr.repnl, pgno, npages);
|
||||
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
return rc;
|
||||
}
|
||||
@ -660,10 +660,10 @@ status_done:
|
||||
__hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
mp->txnid = txn->front_txnid;
|
||||
if (!txn->tw.dirtylist) {
|
||||
if (!txn->wr.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
txn->tw.writemap_dirty_npages += npages;
|
||||
tASSERT(txn, txn->tw.spilled.list == nullptr);
|
||||
txn->wr.writemap_dirty_npages += npages;
|
||||
tASSERT(txn, txn->wr.spilled.list == nullptr);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
@ -671,29 +671,29 @@ __hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npage
|
||||
#if xMDBX_DEBUG_SPILLING == 2
|
||||
txn->env->debug_dirtied_act += 1;
|
||||
ENSURE(txn->env, txn->env->debug_dirtied_act < txn->env->debug_dirtied_est);
|
||||
ENSURE(txn->env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
|
||||
ENSURE(txn->env, txn->wr.dirtyroom + txn->wr.loose_count > 0);
|
||||
#endif /* xMDBX_DEBUG_SPILLING == 2 */
|
||||
|
||||
int rc;
|
||||
if (unlikely(txn->tw.dirtyroom == 0)) {
|
||||
if (txn->tw.loose_count) {
|
||||
page_t *lp = txn->tw.loose_pages;
|
||||
if (unlikely(txn->wr.dirtyroom == 0)) {
|
||||
if (txn->wr.loose_count) {
|
||||
page_t *lp = txn->wr.loose_pages;
|
||||
DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->pgno);
|
||||
rc = pnl_insert_span(&txn->tw.repnl, lp->pgno, 1);
|
||||
rc = pnl_insert_span(&txn->wr.repnl, lp->pgno, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
size_t di = dpl_search(txn, lp->pgno);
|
||||
tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp);
|
||||
tASSERT(txn, txn->wr.dirtylist->items[di].ptr == lp);
|
||||
dpl_remove(txn, di);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
txn->tw.loose_pages = page_next(lp);
|
||||
txn->tw.loose_count--;
|
||||
txn->tw.dirtyroom++;
|
||||
txn->wr.loose_pages = page_next(lp);
|
||||
txn->wr.loose_count--;
|
||||
txn->wr.dirtyroom++;
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
|
||||
page_shadow_release(txn->env, lp, 1);
|
||||
} else {
|
||||
ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length);
|
||||
ERROR("Dirtyroom is depleted, DPL length %zu", txn->wr.dirtylist->length);
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
|
||||
page_shadow_release(txn->env, mp, npages);
|
||||
return MDBX_TXN_FULL;
|
||||
@ -706,7 +706,7 @@ __hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npage
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return rc;
|
||||
}
|
||||
txn->tw.dirtyroom--;
|
||||
txn->wr.dirtyroom--;
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ static inline int page_touch(MDBX_cursor *mc) {
|
||||
}
|
||||
|
||||
if (is_modifable(txn, mp)) {
|
||||
if (!txn->tw.dirtylist) {
|
||||
if (!txn->wr.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
@ -114,14 +114,14 @@ static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp, const s
|
||||
mp->txnid = INVALID_TXNID;
|
||||
mp->flags = P_BAD;
|
||||
|
||||
if (txn->tw.dirtylist) {
|
||||
if (txn->wr.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
tASSERT(txn, MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp));
|
||||
tASSERT(txn, MDBX_AVOID_MSYNC || (di && txn->wr.dirtylist->items[di].ptr == mp));
|
||||
if (!MDBX_AVOID_MSYNC || di) {
|
||||
dpl_remove_ex(txn, di, npages);
|
||||
txn->tw.dirtyroom++;
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
|
||||
txn->wr.dirtyroom++;
|
||||
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) {
|
||||
page_shadow_release(txn->env, mp, npages);
|
||||
return;
|
||||
@ -129,7 +129,7 @@ static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp, const s
|
||||
}
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di);
|
||||
txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) ? npages : txn->tw.writemap_dirty_npages;
|
||||
txn->wr.writemap_dirty_npages -= (txn->wr.writemap_dirty_npages > npages) ? npages : txn->wr.writemap_dirty_npages;
|
||||
}
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
|
||||
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), pgno2bytes(txn->env, npages) - PAGEHDRSZ);
|
||||
|
22
src/pnl.c
22
src/pnl.c
@ -23,6 +23,13 @@ void pnl_free(pnl_t pnl) {
|
||||
osal_free(pnl - 1);
|
||||
}
|
||||
|
||||
pnl_t pnl_clone(const pnl_t src) {
|
||||
pnl_t pl = pnl_alloc(MDBX_PNL_ALLOCLEN(src));
|
||||
if (likely(pl))
|
||||
memcpy(pl, src, MDBX_PNL_SIZEOF(src));
|
||||
return pl;
|
||||
}
|
||||
|
||||
void pnl_shrink(pnl_t __restrict *__restrict ppnl) {
|
||||
assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
|
||||
pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2);
|
||||
@ -234,3 +241,18 @@ __hot __noinline size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno) {
|
||||
assert(!MDBX_PNL_ORDERED(it[0], pgno));
|
||||
return it - begin + 1;
|
||||
}
|
||||
|
||||
size_t pnl_maxspan(const pnl_t pnl) {
|
||||
size_t len = MDBX_PNL_GETSIZE(pnl);
|
||||
if (len > 1) {
|
||||
size_t span = 1, left = len - span;
|
||||
const pgno_t *scan = MDBX_PNL_BEGIN(pnl);
|
||||
do {
|
||||
const bool contiguous = MDBX_PNL_CONTIGUOUS(*scan, scan[span], span);
|
||||
span += contiguous;
|
||||
scan += 1 - contiguous;
|
||||
} while (--left);
|
||||
len = span;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
15
src/pnl.h
15
src/pnl.h
@ -45,16 +45,18 @@ typedef const pgno_t *const_pnl_t;
|
||||
#define MDBX_PNL_EDGE(pl) ((pl) + 1)
|
||||
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
|
||||
#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
|
||||
#define MDBX_PNL_CONTIGUOUS(prev, next, span) ((next) - (prev)) == (span))
|
||||
#else
|
||||
#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl))
|
||||
#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
|
||||
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
|
||||
#define MDBX_PNL_CONTIGUOUS(prev, next, span) (((prev) - (next)) == (span))
|
||||
#endif
|
||||
|
||||
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
|
||||
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
|
||||
assert(size > 0 && size <= PAGELIST_LIMIT);
|
||||
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
|
||||
|
||||
@ -69,7 +71,7 @@ MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
|
||||
return bytes;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
|
||||
size_t size = bytes / sizeof(pgno_t);
|
||||
assert(size > 3 && size <= PAGELIST_LIMIT + /* alignment gap */ 65536);
|
||||
size -= 3;
|
||||
@ -83,6 +85,8 @@ MDBX_INTERNAL pnl_t pnl_alloc(size_t size);
|
||||
|
||||
MDBX_INTERNAL void pnl_free(pnl_t pnl);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL pnl_t pnl_clone(const pnl_t src);
|
||||
|
||||
MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl, const size_t wanna);
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline int __must_check_result pnl_need(pnl_t __restrict *__restrict ppnl, size_t num) {
|
||||
@ -110,7 +114,7 @@ MDBX_INTERNAL int __must_check_result pnl_append_span(__restrict pnl_t *ppnl, pg
|
||||
|
||||
MDBX_INTERNAL int __must_check_result pnl_insert_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n);
|
||||
|
||||
MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
|
||||
|
||||
MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl);
|
||||
|
||||
@ -126,7 +130,8 @@ MDBX_MAYBE_UNUSED static inline void pnl_sort(pnl_t pnl, size_t limit4check) {
|
||||
(void)limit4check;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno, size_t limit) {
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno,
|
||||
size_t limit) {
|
||||
assert(pnl_check_allocated(pnl, limit));
|
||||
if (MDBX_HAVE_CMOV) {
|
||||
/* cmov-ускоренный бинарный поиск может читать (но не использовать) один
|
||||
@ -144,3 +149,5 @@ MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno,
|
||||
}
|
||||
|
||||
MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t pnl_maxspan(const pnl_t pnl);
|
||||
|
64
src/proto.h
64
src/proto.h
@ -15,9 +15,8 @@ MDBX_INTERNAL bsr_t mvcc_bind_slot(MDBX_env *env);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest);
|
||||
MDBX_INTERNAL txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady);
|
||||
MDBX_INTERNAL pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page);
|
||||
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler);
|
||||
MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rlocked, int *dead);
|
||||
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
|
||||
MDBX_INTERNAL bool mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
|
||||
|
||||
/* dxb.c */
|
||||
MDBX_INTERNAL int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bits);
|
||||
@ -39,37 +38,54 @@ static inline void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) {
|
||||
#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */
|
||||
|
||||
/* txn.c */
|
||||
MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
|
||||
MDBX_INTERNAL txnid_t txn_snapshot_oldest(const MDBX_txn *const txn);
|
||||
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags);
|
||||
MDBX_INTERNAL int txn_park(MDBX_txn *txn, bool autounpark);
|
||||
MDBX_INTERNAL int txn_unpark(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits);
|
||||
MDBX_INTERNAL void txn_done_cursors(MDBX_txn *txn, const bool merge);
|
||||
|
||||
#define TXN_END_NAMES \
|
||||
{"committed", "empty-commit", "abort", "reset", "fail-begin", "fail-beginchild", "ousted", nullptr}
|
||||
{"committed", "pure-commit", "abort", "reset", "fail-begin", "fail-begin-nested", "ousted", nullptr}
|
||||
enum {
|
||||
/* txn_end operation number, for logging */
|
||||
TXN_END_COMMITTED,
|
||||
TXN_END_PURE_COMMIT,
|
||||
TXN_END_ABORT,
|
||||
TXN_END_RESET,
|
||||
TXN_END_FAIL_BEGIN,
|
||||
TXN_END_FAIL_BEGINCHILD,
|
||||
TXN_END_OUSTED,
|
||||
TXN_END_COMMITTED /* 0 */,
|
||||
TXN_END_PURE_COMMIT /* 1 */,
|
||||
TXN_END_ABORT /* 2 */,
|
||||
TXN_END_RESET /* 3 */,
|
||||
TXN_END_FAIL_BEGIN /* 4 */,
|
||||
TXN_END_FAIL_BEGIN_NESTED /* 5 */,
|
||||
TXN_END_OUSTED /* 6 */,
|
||||
|
||||
TXN_END_OPMASK = 0x07 /* mask for txn_end() operation number */,
|
||||
TXN_END_UPDATE = 0x10 /* update env state (DBIs) */,
|
||||
TXN_END_FREE = 0x20 /* free txn unless it is env.basal_txn */,
|
||||
TXN_END_EOTDONE = 0x40 /* txn's cursors already closed */,
|
||||
TXN_END_SLOT = 0x80 /* release any reader slot if NOSTICKYTHREADS */
|
||||
TXN_END_SLOT = 0x40 /* release any reader slot if NOSTICKYTHREADS */
|
||||
};
|
||||
|
||||
struct commit_timestamp {
|
||||
uint64_t start, prep, gc, audit, write, sync, gc_cpu;
|
||||
};
|
||||
|
||||
MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
|
||||
MDBX_INTERNAL bool txn_gc_detent(const MDBX_txn *const txn);
|
||||
MDBX_INTERNAL int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits);
|
||||
MDBX_INTERNAL void txn_done_cursors(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_shadow_cursors(const MDBX_txn *parent, const size_t dbi);
|
||||
MDBX_INTERNAL MDBX_cursor *txn_gc_cursor(MDBX_txn *txn);
|
||||
|
||||
MDBX_INTERNAL MDBX_txn *txn_alloc(const MDBX_txn_flags_t flags, MDBX_env *env);
|
||||
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags);
|
||||
MDBX_INTERNAL int txn_end(MDBX_txn *txn, unsigned mode);
|
||||
MDBX_INTERNAL int txn_write(MDBX_txn *txn, iov_ctx_t *ctx);
|
||||
MDBX_INTERNAL void txn_take_gcprof(const MDBX_txn *txn, MDBX_commit_latency *latency);
|
||||
MDBX_INTERNAL void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len);
|
||||
|
||||
MDBX_INTERNAL int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags);
|
||||
MDBX_INTERNAL void txn_nested_abort(MDBX_txn *nested);
|
||||
MDBX_INTERNAL int txn_nested_join(MDBX_txn *txn, struct commit_timestamp *ts);
|
||||
|
||||
MDBX_INTERNAL MDBX_txn *txn_basal_create(const size_t max_dbi);
|
||||
MDBX_INTERNAL void txn_basal_destroy(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_basal_start(MDBX_txn *txn, unsigned flags);
|
||||
MDBX_INTERNAL int txn_basal_commit(MDBX_txn *txn, struct commit_timestamp *ts);
|
||||
MDBX_INTERNAL int txn_basal_end(MDBX_txn *txn, unsigned mode);
|
||||
|
||||
MDBX_INTERNAL int txn_ro_park(MDBX_txn *txn, bool autounpark);
|
||||
MDBX_INTERNAL int txn_ro_unpark(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_ro_start(MDBX_txn *txn, unsigned flags);
|
||||
MDBX_INTERNAL int txn_ro_end(MDBX_txn *txn, unsigned mode);
|
||||
|
||||
/* env.c */
|
||||
MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode);
|
||||
|
54
src/refund.c
54
src/refund.c
@ -7,7 +7,7 @@
|
||||
static void refund_reclaimed(MDBX_txn *txn) {
|
||||
/* Scanning in descend order */
|
||||
pgno_t first_unallocated = txn->geo.first_unallocated;
|
||||
const pnl_t pnl = txn->tw.repnl;
|
||||
const pnl_t pnl = txn->wr.repnl;
|
||||
tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == first_unallocated - 1);
|
||||
#if MDBX_PNL_ASCENDING
|
||||
size_t i = MDBX_PNL_GETSIZE(pnl);
|
||||
@ -28,16 +28,16 @@ static void refund_reclaimed(MDBX_txn *txn) {
|
||||
VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, txn->geo.first_unallocated - first_unallocated,
|
||||
txn->geo.first_unallocated, first_unallocated);
|
||||
txn->geo.first_unallocated = first_unallocated;
|
||||
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - 1));
|
||||
tASSERT(txn, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - 1));
|
||||
}
|
||||
|
||||
static void refund_loose(MDBX_txn *txn) {
|
||||
tASSERT(txn, txn->tw.loose_pages != nullptr);
|
||||
tASSERT(txn, txn->tw.loose_count > 0);
|
||||
tASSERT(txn, txn->wr.loose_pages != nullptr);
|
||||
tASSERT(txn, txn->wr.loose_count > 0);
|
||||
|
||||
dpl_t *const dl = txn->tw.dirtylist;
|
||||
dpl_t *const dl = txn->wr.dirtylist;
|
||||
if (dl) {
|
||||
tASSERT(txn, dl->length >= txn->tw.loose_count);
|
||||
tASSERT(txn, dl->length >= txn->wr.loose_count);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
@ -46,22 +46,22 @@ static void refund_loose(MDBX_txn *txn) {
|
||||
pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
|
||||
pnl_t suitable = onstack;
|
||||
|
||||
if (!dl || dl->length - dl->sorted > txn->tw.loose_count) {
|
||||
if (!dl || dl->length - dl->sorted > txn->wr.loose_count) {
|
||||
/* Dirty list is useless since unsorted. */
|
||||
if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) {
|
||||
suitable = pnl_alloc(txn->tw.loose_count);
|
||||
if (pnl_bytes2size(sizeof(onstack)) < txn->wr.loose_count) {
|
||||
suitable = pnl_alloc(txn->wr.loose_count);
|
||||
if (unlikely(!suitable))
|
||||
return /* this is not a reason for transaction fail */;
|
||||
}
|
||||
|
||||
/* Collect loose-pages which may be refunded. */
|
||||
tASSERT(txn, txn->geo.first_unallocated >= MIN_PAGENO + txn->tw.loose_count);
|
||||
tASSERT(txn, txn->geo.first_unallocated >= MIN_PAGENO + txn->wr.loose_count);
|
||||
pgno_t most = MIN_PAGENO;
|
||||
size_t w = 0;
|
||||
for (const page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) {
|
||||
for (const page_t *lp = txn->wr.loose_pages; lp; lp = page_next(lp)) {
|
||||
tASSERT(txn, lp->flags == P_LOOSE);
|
||||
tASSERT(txn, txn->geo.first_unallocated > lp->pgno);
|
||||
if (likely(txn->geo.first_unallocated - txn->tw.loose_count <= lp->pgno)) {
|
||||
if (likely(txn->geo.first_unallocated - txn->wr.loose_count <= lp->pgno)) {
|
||||
tASSERT(txn, w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) : MDBX_PNL_ALLOCLEN(suitable)));
|
||||
suitable[++w] = lp->pgno;
|
||||
most = (lp->pgno > most) ? lp->pgno : most;
|
||||
@ -90,11 +90,11 @@ static void refund_loose(MDBX_txn *txn) {
|
||||
const size_t refunded = txn->geo.first_unallocated - most;
|
||||
DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, most, txn->geo.first_unallocated);
|
||||
txn->geo.first_unallocated = most;
|
||||
txn->tw.loose_count -= refunded;
|
||||
txn->wr.loose_count -= refunded;
|
||||
if (dl) {
|
||||
txn->tw.dirtyroom += refunded;
|
||||
txn->wr.dirtyroom += refunded;
|
||||
dl->pages_including_loose -= refunded;
|
||||
assert(txn->tw.dirtyroom <= txn->env->options.dp_limit);
|
||||
assert(txn->wr.dirtyroom <= txn->env->options.dp_limit);
|
||||
|
||||
/* Filter-out dirty list */
|
||||
size_t r = 0;
|
||||
@ -115,8 +115,8 @@ static void refund_loose(MDBX_txn *txn) {
|
||||
}
|
||||
}
|
||||
dpl_setlen(dl, w);
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
|
||||
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
|
||||
}
|
||||
goto unlink_loose;
|
||||
}
|
||||
@ -141,15 +141,15 @@ static void refund_loose(MDBX_txn *txn) {
|
||||
if (dl->sorted != dl->length) {
|
||||
const size_t refunded = dl->sorted - dl->length;
|
||||
dl->sorted = dl->length;
|
||||
txn->tw.loose_count -= refunded;
|
||||
txn->tw.dirtyroom += refunded;
|
||||
txn->wr.loose_count -= refunded;
|
||||
txn->wr.dirtyroom += refunded;
|
||||
dl->pages_including_loose -= refunded;
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit));
|
||||
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
|
||||
|
||||
/* Filter-out loose chain & dispose refunded pages. */
|
||||
unlink_loose:
|
||||
for (page_t *__restrict *__restrict link = &txn->tw.loose_pages; *link;) {
|
||||
for (page_t *__restrict *__restrict link = &txn->wr.loose_pages; *link;) {
|
||||
page_t *dp = *link;
|
||||
tASSERT(txn, dp->flags == P_LOOSE);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
|
||||
@ -168,21 +168,21 @@ static void refund_loose(MDBX_txn *txn) {
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
if (suitable != onstack)
|
||||
pnl_free(suitable);
|
||||
txn->tw.loose_refund_wl = txn->geo.first_unallocated;
|
||||
txn->wr.loose_refund_wl = txn->geo.first_unallocated;
|
||||
}
|
||||
|
||||
bool txn_refund(MDBX_txn *txn) {
|
||||
const pgno_t before = txn->geo.first_unallocated;
|
||||
|
||||
if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->geo.first_unallocated)
|
||||
if (txn->wr.loose_pages && txn->wr.loose_refund_wl > txn->geo.first_unallocated)
|
||||
refund_loose(txn);
|
||||
|
||||
while (true) {
|
||||
if (MDBX_PNL_GETSIZE(txn->tw.repnl) == 0 || MDBX_PNL_MOST(txn->tw.repnl) != txn->geo.first_unallocated - 1)
|
||||
if (MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || MDBX_PNL_MOST(txn->wr.repnl) != txn->geo.first_unallocated - 1)
|
||||
break;
|
||||
|
||||
refund_reclaimed(txn);
|
||||
if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->geo.first_unallocated)
|
||||
if (!txn->wr.loose_pages || txn->wr.loose_refund_wl <= txn->geo.first_unallocated)
|
||||
break;
|
||||
|
||||
const pgno_t memo = txn->geo.first_unallocated;
|
||||
@ -194,7 +194,7 @@ bool txn_refund(MDBX_txn *txn) {
|
||||
if (before == txn->geo.first_unallocated)
|
||||
return false;
|
||||
|
||||
if (txn->tw.spilled.list)
|
||||
if (txn->wr.spilled.list)
|
||||
/* Squash deleted pagenums if we refunded any */
|
||||
spill_purge(txn);
|
||||
|
||||
|
639
src/rkl.c
Normal file
639
src/rkl.c
Normal file
@ -0,0 +1,639 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2025
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static inline size_t rkl_size2bytes(const size_t size) {
|
||||
assert(size > 0 && size <= txl_max * 2);
|
||||
size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * size, txl_granulate * sizeof(txnid_t)) -
|
||||
MDBX_ASSUME_MALLOC_OVERHEAD;
|
||||
return bytes;
|
||||
}
|
||||
|
||||
static inline size_t rkl_bytes2size(const size_t bytes) {
|
||||
size_t size = bytes / sizeof(txnid_t);
|
||||
assert(size > 0 && size <= txl_max * 2);
|
||||
return size;
|
||||
}
|
||||
|
||||
/* Puts an rkl into its initial empty state, with the embedded inplace[]
 * array as the backing storage for the list part. */
void rkl_init(rkl_t *rkl) {
  rkl->list = rkl->inplace;
  rkl->list_limit = ARRAY_LENGTH(rkl->inplace);
  rkl_clear(rkl);
}
|
||||
|
||||
/* Empties the container without touching its buffer: the solid interval
 * becomes the canonical impossible range [UINT64_MAX, 0) and the list
 * part becomes zero-length. */
void rkl_clear(rkl_t *rkl) {
  rkl->list_length = 0;
  rkl->solid_end = 0;
  rkl->solid_begin = UINT64_MAX;
}
|
||||
|
||||
/* Detaches and releases the list buffer; nothing to free when the list
 * still lives in the embedded inplace[] array. */
void rkl_destroy(rkl_t *rkl) {
  void *const detached = rkl->list;
  rkl->list = nullptr;
  if (detached != rkl->inplace)
    osal_free(detached);
}
|
||||
|
||||
/* True when the solid interval [solid_begin, solid_end) holds no ids. */
static inline bool solid_empty(const rkl_t *rkl) { return rkl->solid_begin >= rkl->solid_end; }
|
||||
|
||||
/* Strict ordering predicate for ids kept in the sorted list part. */
#define RKL_ORDERED(first, last) ((first) < (last))

/* Instantiates rkl_bsearch(): a lower-bound-style binary search over an
 * array of txnid_t, generated by the project's SEARCH_IMPL template. */
SEARCH_IMPL(rkl_bsearch, txnid_t, txnid_t, RKL_ORDERED)
|
||||
|
||||
/* Moves the whole content of `src` into `dst`, stealing src's heap buffer
 * when it has one; `src` is left re-initialized (empty).  Any heap buffer
 * previously owned by `dst` is released first. */
void rkl_destructive_move(rkl_t *src, rkl_t *dst) {
  assert(rkl_check(src));
  dst->solid_begin = src->solid_begin;
  dst->solid_end = src->solid_end;
  dst->list_length = src->list_length;
  if (dst->list != dst->inplace)
    osal_free(dst->list);
  if (src->list != src->inplace) {
    /* src's list is on the heap: transfer ownership of the buffer */
    dst->list = src->list;
    dst->list_limit = src->list_limit;
  } else {
    /* src uses its embedded buffer: copy it into dst's embedded buffer */
    dst->list = dst->inplace;
    dst->list_limit = ARRAY_LENGTH(src->inplace);
    memcpy(dst->inplace, src->list, sizeof(dst->inplace));
  }
  /* reset src to an empty self-contained state (drops the stolen pointer) */
  rkl_init(src);
}
|
||||
|
||||
/* Changes the capacity of the list part to (at least) `wanna_size` slots,
 * preserving the existing elements and switching between the embedded
 * inplace[] buffer and a heap allocation as appropriate.
 *
 * Returns MDBX_SUCCESS; MDBX_TXN_FULL when the request exceeds txl_max;
 * MDBX_PROBLEM on an attempt to shrink below the current length;
 * MDBX_ENOMEM on allocation failure. */
static int rkl_resize(rkl_t *rkl, size_t wanna_size) {
  assert(wanna_size > rkl->list_length);
  assert(rkl_check(rkl));
  STATIC_ASSERT(txl_max < INT_MAX / sizeof(txnid_t));
  if (unlikely(wanna_size > txl_max)) {
    ERROR("rkl too long (%zu >= %zu)", wanna_size, (size_t)txl_max);
    return MDBX_TXN_FULL;
  }
  if (unlikely(wanna_size < rkl->list_length)) {
    ERROR("unable shrink rkl to %zu since length is %u", wanna_size, rkl->list_length);
    return MDBX_PROBLEM;
  }

  if (unlikely(wanna_size <= ARRAY_LENGTH(rkl->inplace))) {
    /* Small enough for the embedded buffer: migrate the data back from the
     * heap (if it is there) and free the heap allocation. */
    if (rkl->list != rkl->inplace) {
      assert(rkl->list_limit > ARRAY_LENGTH(rkl->inplace) && rkl->list_length <= ARRAY_LENGTH(rkl->inplace));
      memcpy(rkl->inplace, rkl->list, sizeof(rkl->inplace));
      rkl->list_limit = ARRAY_LENGTH(rkl->inplace);
      osal_free(rkl->list);
      rkl->list = rkl->inplace;
    } else {
      assert(rkl->list_limit == ARRAY_LENGTH(rkl->inplace));
    }
    return MDBX_SUCCESS;
  }

  if (wanna_size != rkl->list_limit) {
    /* (Re)allocate on the heap; on the first migration off the embedded
     * buffer, copy its content into the fresh allocation. */
    size_t bytes = rkl_size2bytes(wanna_size);
    void *ptr = (rkl->list == rkl->inplace) ? osal_malloc(bytes) : osal_realloc(rkl->list, bytes);
    if (unlikely(!ptr))
      return MDBX_ENOMEM;
#ifdef osal_malloc_usable_size
    /* take advantage of any extra bytes the allocator actually granted */
    bytes = osal_malloc_usable_size(ptr);
#endif /* osal_malloc_usable_size */
    rkl->list_limit = rkl_bytes2size(bytes);
    if (rkl->list == rkl->inplace)
      memcpy(ptr, rkl->inplace, sizeof(rkl->inplace));
    rkl->list = ptr;
  }
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Copies `src` into `dst` (dst is (re)initialized first).  Returns
 * MDBX_SUCCESS, or an error propagated from the capacity grow. */
int rkl_copy(const rkl_t *src, rkl_t *dst) {
  assert(rkl_check(src));
  rkl_init(dst);
  if (!rkl_empty(src)) {
    if (dst->list_limit < src->list_length) {
      /* grows to src's limit (which is >= src's length), i.e. mirrors
       * src's capacity rather than allocating the tight minimum */
      int err = rkl_resize(dst, src->list_limit);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
    }
    memcpy(dst->list, src->list, sizeof(txnid_t) * src->list_length);
    dst->list_length = src->list_length;
    dst->solid_begin = src->solid_begin;
    dst->solid_end = src->solid_end;
  }
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Total number of ids kept: width of the solid interval plus list length. */
size_t rkl_len(const rkl_t *rkl) {
  if (rkl_empty(rkl))
    return 0;
  return (rkl->solid_end - rkl->solid_begin) + rkl->list_length;
}
|
||||
|
||||
/* Membership test: `id` is present either inside the solid interval or
 * in the sorted list part. */
__hot bool rkl_contain(const rkl_t *rkl, txnid_t id) {
  assert(rkl_check(rkl));
  if (id >= rkl->solid_begin && id < rkl->solid_end)
    return true;
  if (rkl->list_length == 0)
    return false;

  const txnid_t *const probe = rkl_bsearch(rkl->list, rkl->list_length, id);
  const txnid_t *const tail = rkl->list + rkl->list_length;
  assert(probe >= rkl->list && probe <= tail);
  if (probe != rkl->list)
    assert(RKL_ORDERED(probe[-1], id));
  if (probe == tail)
    return false;
  assert(!RKL_ORDERED(probe[0], id));
  return *probe == id;
}
|
||||
|
||||
/* Looks up `id` and positions `*iter` at it (when found) or at the place
 * where it would be inserted (when absent).  Returns true iff found.
 *
 * A logical iterator position counts ids in ascending order across both
 * storages, so the offset inside the solid interval and the number of
 * preceding list elements are summed. */
__hot bool rkl_find(const rkl_t *rkl, txnid_t id, rkl_iter_t *iter) {
  assert(rkl_check(rkl));
  *iter = rkl_iterator(rkl, false);
  if (id >= rkl->solid_begin) {
    if (id < rkl->solid_end) {
      /* hit inside the solid interval: its offset plus the number of
       * list elements preceding the interval (cached in solid_offset) */
      iter->pos = iter->solid_offset + (unsigned)(id - rkl->solid_begin);
      return true;
    }
    /* id lies above the interval: the whole interval precedes it, and the
     * list-element count is added by the bsearch below */
    iter->pos = (unsigned)(rkl->solid_end - rkl->solid_begin);
  }
  if (rkl->list_length) {
    const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, id);
    const txnid_t *const end = rkl->list + rkl->list_length;
    assert(it >= rkl->list && it <= end);
    if (it != rkl->list)
      assert(RKL_ORDERED(it[-1], id));
    /* number of list elements strictly below id */
    iter->pos += (unsigned)(it - rkl->list);
    if (it != end) {
      assert(!RKL_ORDERED(it[0], id));
      return *it == id;
    }
  }
  return false;
}
|
||||
|
||||
/* Pops the smallest element (the head) of the sorted list part, shifting
 * the remaining elements down one slot, and returns it.
 *
 * Fixed an off-by-one: the previous `do … while (++i <= rkl->list_length)`
 * ran one extra iteration, reading rkl->list[old_length] — one slot past
 * the last valid element — which is outside the allocation whenever the
 * list was full (list_length == list_limit, e.g. a full inplace[] buffer). */
static inline txnid_t list_remove_first(rkl_t *rkl) {
  assert(rkl->list_length > 0);
  const txnid_t first = rkl->list[0];
  if (--rkl->list_length) {
    /* TODO: a headroom (or a begin/end/buffer triplet instead of a length
     * and a pointer) would avoid this shifting entirely. */
    for (size_t i = 0; i < rkl->list_length; ++i)
      rkl->list[i] = rkl->list[i + 1];
  }
  return first;
}
|
||||
|
||||
/* Post-removal hook: when the container has just become completely empty,
 * restores the canonical empty state of the solid interval.  The removed
 * id `out` is passed through unchanged. */
static inline txnid_t after_cut(rkl_t *rkl, const txnid_t out) {
  const bool emptied = rkl->list_length == 0 && rkl->solid_begin == rkl->solid_end;
  if (emptied) {
    rkl->solid_begin = UINT64_MAX;
    rkl->solid_end = 0;
  }
  return out;
}
|
||||
|
||||
/* Installs the solid interval as [solid_begin, solid_end) after `id`
 * adjoined it, absorbing any list elements that adjoin the interval's new
 * boundaries.  Returns MDBX_SUCCESS, or MDBX_RESULT_TRUE when `id` turns
 * out to duplicate an element already stored in the list. */
static int extend_solid(rkl_t *rkl, txnid_t solid_begin, txnid_t solid_end, const txnid_t id) {
  if (rkl->list_length) {
    const txnid_t *i = rkl_bsearch(rkl->list, rkl->list_length, id);
    const txnid_t *const end = rkl->list + rkl->list_length;
    /* if the head or tail of the list adjoins the solid interval,
     * move those elements out of the list into the interval */
    txnid_t *f = (txnid_t *)i;
    while (f > rkl->list && f[-1] >= solid_begin - 1) {
      f -= 1;
      solid_begin -= 1;
      if (unlikely(*f != solid_begin))
        return MDBX_RESULT_TRUE; /* the element equals id: a duplicate */
    }
    txnid_t *t = (txnid_t *)i;
    while (t < end && *t <= solid_end) {
      if (unlikely(*t != solid_end))
        return MDBX_RESULT_TRUE; /* the element equals id: a duplicate */
      solid_end += 1;
      t += 1;
    }
    if (f < t) {
      /* close the gap [f, t) left by the absorbed elements */
      rkl->list_length -= t - f;
      while (t < end)
        *f++ = *t++;
    }
  }

  rkl->solid_begin = solid_begin;
  rkl->solid_end = solid_end;
  assert(rkl_check(rkl));
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Inserts `id` into the set.  Returns MDBX_SUCCESS, MDBX_RESULT_TRUE when
 * the id is already present (duplicate), or an error from buffer growth.
 * `known_continuous` means the caller already knows that id adjoins the
 * solid interval, so the adjacency check may be skipped. */
int rkl_push(rkl_t *rkl, const txnid_t id, const bool known_continuous) {
  assert(id >= MIN_TXNID && id < INVALID_TXNID);
  assert(rkl_check(rkl));

  if (rkl->solid_begin >= rkl->solid_end) {
    /* the solid interval is empty */
    return extend_solid(rkl, id, id + 1, id);
  } else if (id < rkl->solid_begin) {
    if (known_continuous || id + 1 == rkl->solid_begin)
      /* id adjoins solid_begin */
      return extend_solid(rkl, id, rkl->solid_end, id);
  } else if (id >= rkl->solid_end) {
    if (known_continuous || id == rkl->solid_end)
      /* id adjoins solid_end */
      return extend_solid(rkl, rkl->solid_begin, id + 1, id);
  } else {
    /* id falls inside [solid_begin, solid_end), i.e. a duplicate */
    return MDBX_RESULT_TRUE;
  }

  if (rkl->list_length == 1 && rkl->solid_end == rkl->solid_begin + 1 &&
      (rkl->list[0] == id + 1 || rkl->list[0] == id - 1)) {
    /* The list holds a single element which the pushed id adjoins, while
     * the solid interval also holds a single element.  It is better to
     * swap the roles of the list element and the solid interval. */
    const txnid_t couple = (rkl->list[0] == id - 1) ? id - 1 : id;
    rkl->list[0] = rkl->solid_begin;
    rkl->solid_begin = couple;
    rkl->solid_end = couple + 2;
    assert(rkl_check(rkl));
    return MDBX_SUCCESS;
  }

  if (unlikely(rkl->list_length == rkl->list_limit)) {
    /* out of room: double the buffer size (with floors/ceilings) */
    size_t x2 = (rkl->list_limit + 1) << 1;
    x2 = (x2 > 62) ? x2 : 62;
    x2 = (x2 < txl_max) ? x2 : txl_max;
    x2 = (x2 > rkl->list_length) ? x2 : rkl->list_length + 42;
    int err = rkl_resize(rkl, x2);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    assert(rkl->list_limit > rkl->list_length);
  }

  size_t i = rkl->list_length;
  /* look for the insertion point scanning backwards from the end of the
   * list, shifting elements up along the way */
  while (i > 0) {
    if (RKL_ORDERED(id, rkl->list[i - 1])) {
      rkl->list[i] = rkl->list[i - 1];
      i -= 1;
      continue;
    }
    if (unlikely(id == rkl->list[i - 1])) {
      /* duplicate: undo the shifting done above and bail out */
      while (++i < rkl->list_length)
        rkl->list[i - 1] = rkl->list[i];
      return MDBX_RESULT_TRUE;
    }
    break;
  }

  rkl->list[i] = id;
  rkl->list_length++;
  assert(rkl_check(rkl));

  /* After the insertion a long run of consecutive ids may have formed in
   * the list, which may be worth exchanging with the solid interval. */
  if (rkl->list_length > (MDBX_DEBUG ? 2 : 16) &&
      ((i > 0 && rkl->list[i - 1] == id - 1) || (i + 1 < rkl->list_length && rkl->list[i + 1] == id + 1))) {
    /* measure the consecutive run [from, to) around the inserted id */
    txnid_t new_solid_begin = id;
    size_t from = i;
    while (from > 0 && rkl->list[from - 1] == new_solid_begin - 1) {
      from -= 1;
      new_solid_begin -= 1;
    }
    txnid_t new_solid_end = id + 1;
    size_t to = i + 1;
    while (to < rkl->list_length && rkl->list[to] == new_solid_end) {
      to += 1;
      new_solid_end += 1;
    }

    const size_t new_solid_len = to - from;
    if (new_solid_len > 3) {
      const size_t old_solid_len = rkl->solid_end - rkl->solid_begin;
      if (new_solid_len > old_solid_len) {
        /* The new consecutive run is longer than the current interval.
         * The swap is considered profitable when it is cheaper than the
         * likely course of events of inserting the next element into
         * the list. */
        const size_t old_solid_pos = rkl_bsearch(rkl->list, rkl->list_length, rkl->solid_begin) - rkl->list;
        const size_t swap_cost =
            /* list elements after the run being extracted from the list,
             * which have to be moved */
            rkl->list_length - to +
            /* list elements after the position where the old interval is
             * put back into the list, which have to be moved */
            ((from > old_solid_pos) ? from - old_solid_pos : 0)
            /* elements of the old interval to be inserted into the list */
            + old_solid_len;
        /* elements to be moved for inserting one more/next element */
        const size_t new_insert_cost = rkl->list_length - i;
        /* coverity[logical_vs_bitwise] */
        if (unlikely(swap_cost < new_insert_cost) || MDBX_DEBUG) {
          /* The extracted run is longer than the inserted one, hence:
           *  - the list gets shorter;
           *  - the tail always moves towards the beginning;
           *  - if the leading elements must be spread apart, there is
           *    enough room and the remaining trailing elements will not
           *    be overwritten. */
          size_t moved = 0;
          if (from > old_solid_pos) {
            /* the inserted run is closer to the beginning: spread the
             * head elements apart to make room for it */
            moved = from - old_solid_pos;
            do {
              from -= 1;
              rkl->list[from + old_solid_len] = rkl->list[from];
            } while (from > old_solid_pos);
          } else if (from + new_solid_len < old_solid_pos) {
            /* the inserted run is farther from the beginning: pull part
             * of the tail after the extracted run forward */
            do
              rkl->list[from++] = rkl->list[to++];
            while (from < old_solid_pos - new_solid_len);
          }

          /* put the old interval's ids into the list */
          i = 0;
          do
            rkl->list[from++] = rkl->solid_begin + i++;
          while (i != old_solid_len);

          /* shift the remaining tail */
          while (to < rkl->list_length)
            rkl->list[moved + from++] = rkl->list[to++];

          rkl->list_length = rkl->list_length - new_solid_len + old_solid_len;
          rkl->solid_begin = new_solid_begin;
          rkl->solid_end = new_solid_end;
          assert(rkl_check(rkl));
        }
      }
    }
  }
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Removes and returns the highest (or lowest) id, picking between the
 * list part and the solid interval; returns 0 when the set is empty. */
txnid_t rkl_pop(rkl_t *rkl, const bool highest_not_lowest) {
  assert(rkl_check(rkl));

  if (rkl->list_length) {
    assert(rkl->solid_begin <= rkl->solid_end);
    if (highest_not_lowest) {
      /* the list's tail wins when it lies above the solid interval */
      if (solid_empty(rkl) || rkl->solid_end < rkl->list[rkl->list_length - 1])
        return after_cut(rkl, rkl->list[rkl->list_length -= 1]);
    } else {
      /* the list's head wins when it lies below the solid interval */
      if (solid_empty(rkl) || rkl->solid_begin > rkl->list[0])
        return after_cut(rkl, list_remove_first(rkl));
    }
  }

  if (solid_empty(rkl)) {
    /* a non-empty list with an empty interval was handled above */
    assert(rkl_empty(rkl));
    return 0;
  }
  return after_cut(rkl, highest_not_lowest ? --rkl->solid_end : rkl->solid_begin++);
}
|
||||
|
||||
txnid_t rkl_lowest(const rkl_t *rkl) {
|
||||
if (rkl->list_length)
|
||||
return (solid_empty(rkl) || rkl->list[0] < rkl->solid_begin) ? rkl->list[0] : rkl->solid_begin;
|
||||
return !solid_empty(rkl) ? rkl->solid_begin : INVALID_TXNID;
|
||||
}
|
||||
|
||||
txnid_t rkl_highest(const rkl_t *rkl) {
|
||||
if (rkl->list_length)
|
||||
return (solid_empty(rkl) || rkl->list[rkl->list_length - 1] >= rkl->solid_end) ? rkl->list[rkl->list_length - 1]
|
||||
: rkl->solid_end - 1;
|
||||
return !solid_empty(rkl) ? rkl->solid_end - 1 : 0;
|
||||
}
|
||||
|
||||
/* Adds every id of `src` into `dst`.  A duplicate aborts the merge with
 * MDBX_RESULT_TRUE unless `ignore_duplicates` is set; other push errors
 * always abort. */
int rkl_merge(rkl_t *dst, const rkl_t *src, bool ignore_duplicates) {
  /* push the list part, iterating from the end towards the beginning */
  for (size_t i = src->list_length; i > 0; --i) {
    const int err = rkl_push(dst, src->list[i - 1], false);
    if (unlikely(err != MDBX_SUCCESS) && (!ignore_duplicates || err != MDBX_RESULT_TRUE))
      return err;
  }

  /* push the solid interval id by id, in ascending order */
  for (txnid_t id = src->solid_begin; id < src->solid_end; ++id) {
    const int err = rkl_push(dst, id, false);
    if (unlikely(err != MDBX_SUCCESS) && (!ignore_duplicates || err != MDBX_RESULT_TRUE))
      return err;
  }
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Builds an iterator over the set; `reverse` starts it past the last id
 * (for backward traversal), otherwise at the first.
 *
 * solid_offset caches how many list elements precede the solid interval,
 * so a logical position can be mapped to either storage in O(1). */
rkl_iter_t rkl_iterator(const rkl_t *rkl, const bool reverse) {
  rkl_iter_t iter = {.rkl = rkl, .pos = reverse ? rkl_len(rkl) : 0, .solid_offset = 0};
  if (!solid_empty(rkl) && rkl->list_length) {
    /* count list elements below solid_begin via binary search */
    const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, rkl->solid_begin);
    const txnid_t *const end = rkl->list + rkl->list_length;
    assert(it >= rkl->list && it <= end && (it == end || *it > rkl->solid_begin));
    iter.solid_offset = it - rkl->list;
  }
  return iter;
}
|
||||
|
||||
/* Advances the iterator one step (backward when `reverse`) and returns the
 * id at the traversed position, or 0 when the boundary is reached.
 *
 * Logical positions below solid_offset map into the list's head part, the
 * next solid_len positions map into the solid interval, and the remainder
 * maps into the list's tail part (shifted down by solid_len). */
txnid_t rkl_turn(rkl_iter_t *iter, const bool reverse) {
  assert((unsigned)reverse == (unsigned)!!reverse);
  /* the position to read: current one forward, previous one backward */
  size_t pos = iter->pos - reverse;
  if (unlikely(pos >= rkl_len(iter->rkl)))
    return 0;

  iter->pos = pos + !reverse;
  assert(iter->pos <= rkl_len(iter->rkl));

  const size_t solid_len = iter->rkl->solid_end - iter->rkl->solid_begin;
  if (iter->rkl->list_length) {
    if (pos < iter->solid_offset)
      return iter->rkl->list[pos];
    else if (pos < iter->solid_offset + solid_len)
      return iter->rkl->solid_begin + pos - iter->solid_offset;
    else
      return iter->rkl->list[pos - solid_len];
  }

  /* no list part: the position must land inside the solid interval */
  assert(pos < solid_len);
  return iter->rkl->solid_begin + pos;
}
|
||||
|
||||
/* Number of ids remaining ahead of the iterator in the given direction. */
size_t rkl_left(rkl_iter_t *iter, const bool reverse) {
  const size_t total = rkl_len(iter->rkl);
  assert(iter->pos <= total);
  if (reverse)
    return iter->pos;
  return total - iter->pos;
}
|
||||
|
||||
/* Debug tracing for rkl_hole(): disabled by default (the `#if 1` branch
 * expands to a no-op); flip the condition to get a printf trace of every
 * hole returned, tagged with direction and source line. */
#if 1
#define DEBUG_HOLE(hole)                                                                                               \
  do {                                                                                                                 \
  } while (0)
#else
#define DEBUG_HOLE(hole)                                                                                               \
  do {                                                                                                                 \
    printf(" return-%sward: %d, ", reverse ? "back" : "for", __LINE__);                                                \
    if (hole.begin == hole.end)                                                                                        \
      printf("empty-hole\n");                                                                                          \
    else if (hole.end - hole.begin == 1)                                                                               \
      printf("hole %" PRIaTXN "\n", hole.begin);                                                                       \
    else                                                                                                               \
      printf("hole %" PRIaTXN "-%" PRIaTXN "\n", hole.begin, hole.end - 1);                                            \
    fflush(nullptr);                                                                                                   \
  } while (0)
#endif
|
||||
|
||||
rkl_hole_t rkl_hole(rkl_iter_t *iter, const bool reverse) {
|
||||
assert((unsigned)reverse == (unsigned)!!reverse);
|
||||
rkl_hole_t hole;
|
||||
const size_t len = rkl_len(iter->rkl);
|
||||
size_t pos = iter->pos;
|
||||
if (unlikely(pos >= len)) {
|
||||
if (len == 0) {
|
||||
hole.begin = 1;
|
||||
hole.end = MAX_TXNID;
|
||||
iter->pos = 0;
|
||||
DEBUG_HOLE(hole);
|
||||
return hole;
|
||||
} else if (pos == len && reverse) {
|
||||
/* шаг назад из позиции на конце rkl */
|
||||
} else if (reverse) {
|
||||
hole.begin = 1;
|
||||
hole.end = 1 /* rkl_lowest(iter->rkl); */;
|
||||
iter->pos = 0;
|
||||
DEBUG_HOLE(hole);
|
||||
return hole;
|
||||
} else {
|
||||
hole.begin = MAX_TXNID /* rkl_highest(iter->rkl) + 1 */;
|
||||
hole.end = MAX_TXNID;
|
||||
iter->pos = len;
|
||||
DEBUG_HOLE(hole);
|
||||
return hole;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t solid_len = iter->rkl->solid_end - iter->rkl->solid_begin;
|
||||
if (iter->rkl->list_length) {
|
||||
/* список элементов не пуст */
|
||||
txnid_t here, there;
|
||||
for (size_t next;; pos = next) {
|
||||
next = reverse ? pos - 1 : pos + 1;
|
||||
if (pos < iter->solid_offset) {
|
||||
/* текущая позиция перед непрерывным интервалом */
|
||||
here = iter->rkl->list[pos];
|
||||
if (next == iter->solid_offset) {
|
||||
/* в следующей позиции начинается непрерывный интерал (при поиске вперед) */
|
||||
assert(!reverse);
|
||||
hole.begin = here + 1;
|
||||
hole.end = iter->rkl->solid_begin;
|
||||
next += solid_len;
|
||||
assert(hole.begin < hole.end /* зазор обязан быть, иначе это ошибка не-слияния */);
|
||||
/* зазор между элементом списка перед сплошным интервалом и началом интервала */
|
||||
iter->pos = next - 1;
|
||||
DEBUG_HOLE(hole);
|
||||
return hole;
|
||||
}
|
||||
if (next >= len)
|
||||
/* уперлись в конец или начало rkl */
|
||||
break;
|
||||
/* следующая позиция также перед непрерывным интервалом */
|
||||
there = iter->rkl->list[next];
|
||||
} else if (pos >= iter->solid_offset + solid_len) {
|
||||
/* текущая позиция после непрерывного интервала */
|
||||
here = (pos < len) ? iter->rkl->list[pos - solid_len] : MAX_TXNID;
|
||||
if (next >= len)
|
||||
/* уперлись в конец или начало rkl */
|
||||
break;
|
||||
if (next == iter->solid_offset + solid_len - 1) {
|
||||
/* в следующей позиции конец непрерывного интервала (при поиске назад) */
|
||||
assert(reverse);
|
||||
hole.begin = iter->rkl->solid_end;
|
||||
hole.end = here;
|
||||
pos = iter->solid_offset;
|
||||
assert(hole.begin < hole.end /* зазор обязан быть, иначе это ошибка не-слияния */);
|
||||
/* зазор между элементом списка после сплошного интервала и концом интервала */
|
||||
iter->pos = pos;
|
||||
DEBUG_HOLE(hole);
|
||||
return hole;
|
||||
}
|
||||
/* следующая позиция также после непрерывного интервала */
|
||||
there = iter->rkl->list[next - solid_len];
|
||||
} else if (reverse) {
|
||||
/* текущая позиция внутри непрерывного интервала и поиск назад */
|
||||
next = iter->solid_offset - 1;
|
||||
here = iter->rkl->solid_begin;
|
||||
if (next >= len)
|
||||
/* нет элементов списка перед непрерывным интервалом */
|
||||
break;
|
||||
/* предыдущая позиция перед непрерывным интервалом */
|
||||
there = iter->rkl->list[next];
|
||||
} else {
|
||||
/* текущая позиция внутри непрерывного интервала и поиск вперед */
|
||||
next = iter->solid_offset + solid_len;
|
||||
here = iter->rkl->solid_end - 1;
|
||||
if (next >= len)
|
||||
/* нет элементов списка после непрерывного интервала */
|
||||
break;
|
||||
/* следующая позиция после непрерывного интервала */
|
||||
there = iter->rkl->list[next - solid_len];
|
||||
}
|
||||
|
||||
hole.begin = (reverse ? there : here) + 1;
|
||||
hole.end = reverse ? here : there;
|
||||
if (hole.begin < hole.end) {
|
||||
/* есть зазор между текущей и следующей позицией */
|
||||
iter->pos = next;
|
||||
DEBUG_HOLE(hole);
|
||||
return hole;
|
||||
}
|
||||
}
|
||||
|
||||
if (reverse) {
|
||||
/* уперлись в начало rkl, возвращаем зазор перед началом rkl */
|
||||
hole.begin = 1;
|
||||
hole.end = here;
|
||||
iter->pos = 0;
|
||||
DEBUG_HOLE(hole);
|
||||
} else {
|
||||
/* уперлись в конец rkl, возвращаем зазор после конца rkl */
|
||||
hole.begin = here + 1;
|
||||
hole.end = MAX_TXNID;
|
||||
iter->pos = len;
|
||||
DEBUG_HOLE(hole);
|
||||
}
|
||||
return hole;
|
||||
}
|
||||
|
||||
/* список элементов пуст, но есть непрерывный интервал */
|
||||
iter->pos = reverse ? 0 : len;
|
||||
if (reverse && pos < len) {
|
||||
/* возвращаем зазор перед непрерывным интервалом */
|
||||
hole.begin = 1;
|
||||
hole.end = iter->rkl->solid_begin;
|
||||
DEBUG_HOLE(hole);
|
||||
} else {
|
||||
/* возвращаем зазор после непрерывного интервала */
|
||||
hole.begin = iter->rkl->solid_end;
|
||||
hole.end = MAX_TXNID;
|
||||
DEBUG_HOLE(hole);
|
||||
}
|
||||
return hole;
|
||||
}
|
||||
|
||||
bool rkl_check(const rkl_t *rkl) {
|
||||
if (!rkl)
|
||||
return false;
|
||||
if (rkl->list == rkl->inplace && unlikely(rkl->list_limit != ARRAY_LENGTH(rkl->inplace)))
|
||||
return false;
|
||||
if (unlikely(rkl->list_limit < ARRAY_LENGTH(rkl->inplace)))
|
||||
return false;
|
||||
|
||||
if (rkl_empty(rkl))
|
||||
return rkl->list_length == 0 && solid_empty(rkl);
|
||||
|
||||
if (rkl->list_length) {
|
||||
for (size_t i = 1; i < rkl->list_length; ++i)
|
||||
if (unlikely(!RKL_ORDERED(rkl->list[i - 1], rkl->list[i])))
|
||||
return false;
|
||||
if (!solid_empty(rkl) && rkl->solid_begin - 1 <= rkl->list[rkl->list_length - 1] &&
|
||||
rkl->solid_end >= rkl->list[0]) {
|
||||
/* непрерывный интервал "плавает" внутри списка, т.е. находится между какими-то соседними значениями */
|
||||
const txnid_t *it = rkl_bsearch(rkl->list, rkl->list_length, rkl->solid_begin);
|
||||
const txnid_t *const end = rkl->list + rkl->list_length;
|
||||
if (it < rkl->list || it > end)
|
||||
return false;
|
||||
if (it > rkl->list && it[-1] >= rkl->solid_begin)
|
||||
return false;
|
||||
if (it < end && it[0] <= rkl->solid_end)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
76
src/rkl.h
Normal file
76
src/rkl.h
Normal file
@ -0,0 +1,76 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2025
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* Сортированный набор txnid, использующий внутри комбинацию непрерывного интервала и списка.
|
||||
* Обеспечивает хранение id записей при переработке, очистку и обновлении GC, включая возврат остатков переработанных
|
||||
* страниц.
|
||||
*
|
||||
* При переработке GC записи преимущественно выбираются последовательно, но это не гарантируется. В LIFO-режиме
|
||||
* переработка и добавление записей в rkl происходит преимущественно в обратном порядке, но из-за завершения читающих
|
||||
* транзакций могут быть «скачки» в прямом направлении. В FIFO-режиме записи GC перерабатываются в прямом порядке и при
|
||||
* этом линейно, но не обязательно строго последовательно, при этом гарантируется что между добавляемыми в rkl
|
||||
* идентификаторами в GC нет записей, т.е. между первой (минимальный id) и последней (максимальный id) в GC нет записей
|
||||
* и весь интервал может быть использован для возврата остатков страниц в GC.
|
||||
*
|
||||
* Таким образом, комбинация линейного интервала и списка (отсортированного в порядке возрастания элементов) является
|
||||
* рациональным решением, близким к теоретически оптимальному пределу.
|
||||
*
|
||||
* Реализация rkl достаточно проста/прозрачная, если не считать неочевидную «магию» обмена непрерывного интервала и
|
||||
* образующихся в списке последовательностей. Однако, именно этот автоматически выполняемый без лишних операций обмен
|
||||
* оправдывает все накладные расходы. */
|
||||
typedef struct MDBX_rkl {
|
||||
txnid_t solid_begin, solid_end; /* начало и конец непрерывной последовательности solid_begin ... solid_end-1. */
|
||||
unsigned list_length; /* текущая длина списка. */
|
||||
unsigned list_limit; /* размер буфера выделенного под список, равен ARRAY_LENGTH(inplace) когда list == inplace. */
|
||||
txnid_t *list; /* список отдельных элементов в порядке возрастания (наименьший в начале). */
|
||||
txnid_t inplace[4 + 8]; /* статический массив для коротких списков, чтобы избавиться от выделения/освобождения памяти
|
||||
* в большинстве случаев. */
|
||||
} rkl_t;
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_init(rkl_t *rkl);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_clear(rkl_t *rkl);
|
||||
static inline void rkl_clear_and_shrink(rkl_t *rkl) { rkl_clear(rkl); /* TODO */ }
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_destroy(rkl_t *rkl);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_destructive_move(rkl_t *dst, rkl_t *src);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_copy(const rkl_t *src, rkl_t *dst);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool rkl_empty(const rkl_t *rkl) {
|
||||
return rkl->solid_begin > rkl->solid_end;
|
||||
}
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL bool rkl_check(const rkl_t *rkl);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t rkl_len(const rkl_t *rkl);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL txnid_t rkl_lowest(const rkl_t *rkl);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL txnid_t rkl_highest(const rkl_t *rkl);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline txnid_t rkl_edge(const rkl_t *rkl,
|
||||
const bool highest_not_lowest) {
|
||||
return highest_not_lowest ? rkl_highest(rkl) : rkl_lowest(rkl);
|
||||
}
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_push(rkl_t *rkl, const txnid_t id,
|
||||
const bool known_continuous);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL txnid_t rkl_pop(rkl_t *rkl, const bool highest_not_lowest);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_merge(rkl_t *dst, const rkl_t *src, bool ignore_duplicates);
|
||||
|
||||
/* Итератор для rkl.
|
||||
* Обеспечивает изоляцию внутреннего устройства rkl от остального кода, чем существенно его упрощает.
|
||||
* Фактически именно использованием rkl с итераторами ликвидируется "ребус" исторически образовавшийся в gc-update. */
|
||||
typedef struct MDBX_rkl_iter {
|
||||
const rkl_t *rkl;
|
||||
unsigned pos;
|
||||
unsigned solid_offset;
|
||||
} rkl_iter_t;
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result rkl_iter_t rkl_iterator(const rkl_t *rkl, const bool reverse);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result txnid_t rkl_turn(rkl_iter_t *iter, const bool reverse);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t rkl_left(rkl_iter_t *iter, const bool reverse);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool rkl_find(const rkl_t *rkl, const txnid_t id, rkl_iter_t *iter);
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION __must_check_result MDBX_INTERNAL bool rkl_contain(const rkl_t *rkl,
|
||||
txnid_t id);
|
||||
|
||||
typedef struct MDBX_rkl_hole {
|
||||
txnid_t begin;
|
||||
txnid_t end;
|
||||
} rkl_hole_t;
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result rkl_hole_t rkl_hole(rkl_iter_t *iter, const bool reverse);
|
124
src/spill.c
124
src/spill.c
@ -4,42 +4,42 @@
|
||||
#include "internals.h"
|
||||
|
||||
void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) {
|
||||
tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && txn->tw.spilled.least_removed > 0);
|
||||
txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) ? idx : txn->tw.spilled.least_removed;
|
||||
txn->tw.spilled.list[idx] |= 1;
|
||||
MDBX_PNL_SETSIZE(txn->tw.spilled.list,
|
||||
MDBX_PNL_GETSIZE(txn->tw.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
|
||||
tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->wr.spilled.list) && txn->wr.spilled.least_removed > 0);
|
||||
txn->wr.spilled.least_removed = (idx < txn->wr.spilled.least_removed) ? idx : txn->wr.spilled.least_removed;
|
||||
txn->wr.spilled.list[idx] |= 1;
|
||||
MDBX_PNL_SETSIZE(txn->wr.spilled.list,
|
||||
MDBX_PNL_GETSIZE(txn->wr.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->wr.spilled.list)));
|
||||
|
||||
while (unlikely(npages > 1)) {
|
||||
const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1;
|
||||
const pgno_t pgno = (txn->wr.spilled.list[idx] >> 1) + 1;
|
||||
if (MDBX_PNL_ASCENDING) {
|
||||
if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || (txn->tw.spilled.list[idx] >> 1) != pgno)
|
||||
if (++idx > MDBX_PNL_GETSIZE(txn->wr.spilled.list) || (txn->wr.spilled.list[idx] >> 1) != pgno)
|
||||
return;
|
||||
} else {
|
||||
if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno)
|
||||
if (--idx < 1 || (txn->wr.spilled.list[idx] >> 1) != pgno)
|
||||
return;
|
||||
txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) ? idx : txn->tw.spilled.least_removed;
|
||||
txn->wr.spilled.least_removed = (idx < txn->wr.spilled.least_removed) ? idx : txn->wr.spilled.least_removed;
|
||||
}
|
||||
txn->tw.spilled.list[idx] |= 1;
|
||||
MDBX_PNL_SETSIZE(txn->tw.spilled.list,
|
||||
MDBX_PNL_GETSIZE(txn->tw.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
|
||||
txn->wr.spilled.list[idx] |= 1;
|
||||
MDBX_PNL_SETSIZE(txn->wr.spilled.list,
|
||||
MDBX_PNL_GETSIZE(txn->wr.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->wr.spilled.list)));
|
||||
--npages;
|
||||
}
|
||||
}
|
||||
|
||||
pnl_t spill_purge(MDBX_txn *txn) {
|
||||
tASSERT(txn, txn->tw.spilled.least_removed > 0);
|
||||
const pnl_t sl = txn->tw.spilled.list;
|
||||
if (txn->tw.spilled.least_removed != INT_MAX) {
|
||||
tASSERT(txn, txn->wr.spilled.least_removed > 0);
|
||||
const pnl_t sl = txn->wr.spilled.list;
|
||||
if (txn->wr.spilled.least_removed != INT_MAX) {
|
||||
size_t len = MDBX_PNL_GETSIZE(sl), r, w;
|
||||
for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) {
|
||||
for (w = r = txn->wr.spilled.least_removed; r <= len; ++r) {
|
||||
sl[w] = sl[r];
|
||||
w += 1 - (sl[r] & 1);
|
||||
}
|
||||
for (size_t i = 1; i < w; ++i)
|
||||
tASSERT(txn, (sl[i] & 1) == 0);
|
||||
MDBX_PNL_SETSIZE(sl, w - 1);
|
||||
txn->tw.spilled.least_removed = INT_MAX;
|
||||
txn->wr.spilled.least_removed = INT_MAX;
|
||||
} else {
|
||||
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i)
|
||||
tASSERT(txn, (sl[i] & 1) == 0);
|
||||
@ -57,7 +57,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, const size_t np
|
||||
const pgno_t pgno = dp->pgno;
|
||||
int err = iov_page(txn, ctx, dp, npages);
|
||||
if (likely(err == MDBX_SUCCESS))
|
||||
err = spill_append_span(&txn->tw.spilled.list, pgno, npages);
|
||||
err = spill_append_span(&txn->wr.spilled.list, pgno, npages);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -72,25 +72,29 @@ static size_t spill_cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc
|
||||
intptr_t i = 0;
|
||||
do {
|
||||
mp = mc->pg[i];
|
||||
TRACE("dbi %zu, mc-%p[%zu], page %u %p", cursor_dbi(mc), __Wpedantic_format_voidptr(mc), i, mp->pgno,
|
||||
__Wpedantic_format_voidptr(mp));
|
||||
tASSERT(txn, !is_subpage(mp));
|
||||
if (is_modifable(txn, mp)) {
|
||||
size_t const n = dpl_search(txn, mp->pgno);
|
||||
if (txn->tw.dirtylist->items[n].pgno == mp->pgno &&
|
||||
if (txn->wr.dirtylist->items[n].pgno == mp->pgno &&
|
||||
/* не считаем дважды */ dpl_age(txn, n)) {
|
||||
size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->tw.dirtylru;
|
||||
size_t *const ptr = ptr_disp(txn->wr.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->wr.dirtylru;
|
||||
tASSERT(txn, dpl_age(txn, n) == 0);
|
||||
++keep;
|
||||
DEBUG("keep page %" PRIaPGNO " (%p), dbi %zu, %scursor %p[%zu]", mp->pgno, __Wpedantic_format_voidptr(mp),
|
||||
cursor_dbi(mc), is_inner(mc) ? "sub-" : "", __Wpedantic_format_voidptr(mc), i);
|
||||
}
|
||||
}
|
||||
} while (++i <= mc->top);
|
||||
|
||||
tASSERT(txn, is_leaf(mp));
|
||||
if (!mc->subcur || mc->ki[mc->top] >= page_numkeys(mp))
|
||||
break;
|
||||
if (!(node_flags(page_node(mp, mc->ki[mc->top])) & N_TREE))
|
||||
if (!inner_pointed(mc))
|
||||
break;
|
||||
mc = &mc->subcur->cursor;
|
||||
if (is_subpage(mc->pg[0]))
|
||||
break;
|
||||
}
|
||||
return keep;
|
||||
}
|
||||
@ -115,7 +119,7 @@ static size_t spill_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
|
||||
* ...
|
||||
* > 255 = must not be spilled. */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static unsigned spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) {
|
||||
dpl_t *const dl = txn->tw.dirtylist;
|
||||
dpl_t *const dl = txn->wr.dirtylist;
|
||||
const uint32_t age = dpl_age(txn, i);
|
||||
const size_t npages = dpl_npages(dl, i);
|
||||
const pgno_t pgno = dl->items[i].pgno;
|
||||
@ -178,14 +182,14 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (unlikely(txn->tw.loose_count >=
|
||||
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages)))
|
||||
if (unlikely(txn->wr.loose_count >=
|
||||
(txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose : txn->wr.writemap_dirty_npages)))
|
||||
goto done;
|
||||
|
||||
const size_t dirty_entries = txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1;
|
||||
const size_t dirty_entries = txn->wr.dirtylist ? (txn->wr.dirtylist->length - txn->wr.loose_count) : 1;
|
||||
const size_t dirty_npages =
|
||||
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages) -
|
||||
txn->tw.loose_count;
|
||||
(txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose : txn->wr.writemap_dirty_npages) -
|
||||
txn->wr.loose_count;
|
||||
const size_t need_spill_entries = spill_gate(txn->env, wanna_spill_entries, dirty_entries);
|
||||
const size_t need_spill_npages = spill_gate(txn->env, wanna_spill_npages, dirty_npages);
|
||||
|
||||
@ -196,17 +200,17 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
if (txn->flags & MDBX_WRITEMAP) {
|
||||
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", dirty_entries, dirty_npages);
|
||||
const MDBX_env *env = txn->env;
|
||||
tASSERT(txn, txn->tw.spilled.list == nullptr);
|
||||
tASSERT(txn, txn->wr.spilled.list == nullptr);
|
||||
rc = osal_msync(&txn->env->dxb_mmap, 0, pgno_align2os_bytes(env, txn->geo.first_unallocated), MDBX_SYNC_KICK);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
#if MDBX_AVOID_MSYNC
|
||||
MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
|
||||
MDBX_ANALYSIS_ASSUME(txn->wr.dirtylist != nullptr);
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
env->lck->unsynced_pages.weak += txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
|
||||
dpl_clear(txn->tw.dirtylist);
|
||||
txn->tw.dirtyroom = env->options.dp_limit - txn->tw.loose_count;
|
||||
for (page_t *lp = txn->tw.loose_pages; lp != nullptr; lp = page_next(lp)) {
|
||||
env->lck->unsynced_pages.weak += txn->wr.dirtylist->pages_including_loose - txn->wr.loose_count;
|
||||
dpl_clear(txn->wr.dirtylist);
|
||||
txn->wr.dirtyroom = env->options.dp_limit - txn->wr.loose_count;
|
||||
for (page_t *lp = txn->wr.loose_pages; lp != nullptr; lp = page_next(lp)) {
|
||||
tASSERT(txn, lp->flags == P_LOOSE);
|
||||
rc = dpl_append(txn, lp->pgno, lp, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
@ -216,22 +220,22 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
}
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
#else
|
||||
tASSERT(txn, txn->tw.dirtylist == nullptr);
|
||||
env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
|
||||
txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages;
|
||||
txn->tw.writemap_dirty_npages = 0;
|
||||
tASSERT(txn, txn->wr.dirtylist == nullptr);
|
||||
env->lck->unsynced_pages.weak += txn->wr.writemap_dirty_npages;
|
||||
txn->wr.writemap_spilled_npages += txn->wr.writemap_dirty_npages;
|
||||
txn->wr.writemap_dirty_npages = 0;
|
||||
#endif /* MDBX_AVOID_MSYNC */
|
||||
goto done;
|
||||
}
|
||||
|
||||
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", need_spill_entries, need_spill_npages);
|
||||
MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
|
||||
tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1);
|
||||
tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages);
|
||||
if (!txn->tw.spilled.list) {
|
||||
txn->tw.spilled.least_removed = INT_MAX;
|
||||
txn->tw.spilled.list = pnl_alloc(need_spill);
|
||||
if (unlikely(!txn->tw.spilled.list)) {
|
||||
MDBX_ANALYSIS_ASSUME(txn->wr.dirtylist != nullptr);
|
||||
tASSERT(txn, txn->wr.dirtylist->length - txn->wr.loose_count >= 1);
|
||||
tASSERT(txn, txn->wr.dirtylist->pages_including_loose - txn->wr.loose_count >= need_spill_npages);
|
||||
if (!txn->wr.spilled.list) {
|
||||
txn->wr.spilled.least_removed = INT_MAX;
|
||||
txn->wr.spilled.list = pnl_alloc(need_spill);
|
||||
if (unlikely(!txn->wr.spilled.list)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
bailout:
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
@ -240,7 +244,7 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
} else {
|
||||
/* purge deleted slots */
|
||||
spill_purge(txn);
|
||||
rc = pnl_reserve(&txn->tw.spilled.list, need_spill);
|
||||
rc = pnl_reserve(&txn->wr.spilled.list, need_spill);
|
||||
(void)rc /* ignore since the resulting list may be shorter
|
||||
and pnl_append() will increase pnl on demand */
|
||||
;
|
||||
@ -251,9 +255,9 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
|
||||
/* Preserve pages which may soon be dirtied again */
|
||||
const size_t unspillable = spill_txn_keep(txn, m0);
|
||||
if (unspillable + txn->tw.loose_count >= dl->length) {
|
||||
if (unspillable + txn->wr.loose_count >= dl->length) {
|
||||
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
|
||||
if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
|
||||
if (likely(txn->wr.dirtyroom + txn->wr.loose_count >= need))
|
||||
return MDBX_SUCCESS;
|
||||
#endif /* xMDBX_DEBUG_SPILLING */
|
||||
ERROR("all %zu dirty pages are unspillable since referenced "
|
||||
@ -293,7 +297,7 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
age_max = (age_max >= age) ? age_max : age;
|
||||
}
|
||||
|
||||
VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
|
||||
VERBOSE("lru-head %u, age-max %u", txn->wr.dirtylru, age_max);
|
||||
|
||||
/* half of 8-bit radix-sort */
|
||||
pgno_t radix_entries[256], radix_npages[256];
|
||||
@ -388,8 +392,8 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS);
|
||||
|
||||
dl->sorted = dpl_setlen(dl, w);
|
||||
txn->tw.dirtyroom += spilled_entries;
|
||||
txn->tw.dirtylist->pages_including_loose -= spilled_npages;
|
||||
txn->wr.dirtyroom += spilled_entries;
|
||||
txn->wr.dirtylist->pages_including_loose -= spilled_npages;
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
|
||||
if (!iov_empty(&ctx)) {
|
||||
@ -400,10 +404,10 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
goto bailout;
|
||||
|
||||
txn->env->lck->unsynced_pages.weak += spilled_npages;
|
||||
pnl_sort(txn->tw.spilled.list, (size_t)txn->geo.first_unallocated << 1);
|
||||
pnl_sort(txn->wr.spilled.list, (size_t)txn->geo.first_unallocated << 1);
|
||||
txn->flags |= MDBX_TXN_SPILLS;
|
||||
NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", spilled_entries, spilled_npages,
|
||||
txn->tw.dirtyroom);
|
||||
txn->wr.dirtyroom);
|
||||
} else {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
@ -414,18 +418,18 @@ __cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intp
|
||||
}
|
||||
|
||||
#if xMDBX_DEBUG_SPILLING == 2
|
||||
if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
|
||||
if (txn->wr.loose_count + txn->wr.dirtyroom <= need / 2 + 1)
|
||||
ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; "
|
||||
"needed %zu, spillable %zu; "
|
||||
"spilled %u dirty-entries, now have %zu dirty-room",
|
||||
dl->length + spilled_entries, dl->length,
|
||||
(txn->parent && txn->parent->tw.dirtylist) ? (intptr_t)txn->parent->tw.dirtylist->length : -1,
|
||||
txn->tw.loose_count, need, spillable_entries, spilled_entries, txn->tw.dirtyroom);
|
||||
ENSURE(txn->env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
|
||||
(txn->parent && txn->parent->wr.dirtylist) ? (intptr_t)txn->parent->wr.dirtylist->length : -1,
|
||||
txn->wr.loose_count, need, spillable_entries, spilled_entries, txn->wr.dirtyroom);
|
||||
ENSURE(txn->env, txn->wr.loose_count + txn->wr.dirtyroom > need / 2);
|
||||
#endif /* xMDBX_DEBUG_SPILLING */
|
||||
|
||||
done:
|
||||
return likely(txn->tw.dirtyroom + txn->tw.loose_count > ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
|
||||
return likely(txn->wr.dirtyroom + txn->wr.loose_count > ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
|
||||
? MDBX_SUCCESS
|
||||
: MDBX_TXN_FULL;
|
||||
}
|
||||
|
10
src/spill.h
10
src/spill.h
@ -13,7 +13,7 @@ MDBX_INTERNAL int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, con
|
||||
|
||||
static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
const pnl_t pnl = txn->tw.spilled.list;
|
||||
const pnl_t pnl = txn->wr.spilled.list;
|
||||
if (likely(!pnl))
|
||||
return 0;
|
||||
pgno <<= 1;
|
||||
@ -22,7 +22,7 @@ static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) {
|
||||
}
|
||||
|
||||
static inline bool spill_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
|
||||
const pnl_t pnl = txn->tw.spilled.list;
|
||||
const pnl_t pnl = txn->wr.spilled.list;
|
||||
if (likely(!pnl))
|
||||
return false;
|
||||
const size_t len = MDBX_PNL_GETSIZE(pnl);
|
||||
@ -56,10 +56,10 @@ static inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, const si
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, !m0 || cursor_is_tracked(m0));
|
||||
|
||||
const intptr_t wanna_spill_entries = txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0;
|
||||
const intptr_t wanna_spill_entries = txn->wr.dirtylist ? (need - txn->wr.dirtyroom - txn->wr.loose_count) : 0;
|
||||
const intptr_t wanna_spill_npages =
|
||||
need + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages) -
|
||||
txn->tw.loose_count - txn->env->options.dp_limit;
|
||||
need + (txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose : txn->wr.writemap_dirty_npages) -
|
||||
txn->wr.loose_count - txn->env->options.dp_limit;
|
||||
|
||||
/* production mode */
|
||||
if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1)
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#define PRINT 1
|
||||
#define GLOBAL 2
|
||||
#define CONCISE 4
|
||||
static int mode = GLOBAL;
|
||||
|
||||
typedef struct flagbit {
|
||||
@ -55,42 +56,23 @@ static void signal_handler(int sig) {
|
||||
|
||||
#endif /* !WINDOWS */
|
||||
|
||||
static const char hexc[] = "0123456789abcdef";
|
||||
|
||||
static void dumpbyte(unsigned char c) {
|
||||
putchar(hexc[c >> 4]);
|
||||
putchar(hexc[c & 15]);
|
||||
}
|
||||
|
||||
static void text(MDBX_val *v) {
|
||||
unsigned char *c, *end;
|
||||
|
||||
static void dumpval(const MDBX_val *v) {
|
||||
static const char digits[] = "0123456789abcdef";
|
||||
putchar(' ');
|
||||
c = v->iov_base;
|
||||
end = c + v->iov_len;
|
||||
while (c < end) {
|
||||
if (isprint(*c) && *c != '\\') {
|
||||
putchar(*c);
|
||||
} else {
|
||||
putchar('\\');
|
||||
dumpbyte(*c);
|
||||
for (const unsigned char *c = v->iov_base, *end = c + v->iov_len; c < end; ++c) {
|
||||
if (mode & PRINT) {
|
||||
if (isprint(*c) && *c != '\\') {
|
||||
putchar(*c);
|
||||
continue;
|
||||
} else
|
||||
putchar('\\');
|
||||
}
|
||||
c++;
|
||||
putchar(digits[*c >> 4]);
|
||||
putchar(digits[*c & 15]);
|
||||
}
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
static void dumpval(MDBX_val *v) {
|
||||
unsigned char *c, *end;
|
||||
|
||||
putchar(' ');
|
||||
c = v->iov_base;
|
||||
end = c + v->iov_len;
|
||||
while (c < end)
|
||||
dumpbyte(*c++);
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
bool quiet = false, rescue = false;
|
||||
const char *prog;
|
||||
static void error(const char *func, int rc) {
|
||||
@ -185,12 +167,19 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
|
||||
rc = MDBX_EINTR;
|
||||
break;
|
||||
}
|
||||
if (mode & PRINT) {
|
||||
text(&key);
|
||||
text(&data);
|
||||
} else {
|
||||
dumpval(&key);
|
||||
dumpval(&data);
|
||||
dumpval(&key);
|
||||
dumpval(&data);
|
||||
if ((flags & MDBX_DUPSORT) && (mode & CONCISE)) {
|
||||
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT_DUP)) == MDBX_SUCCESS) {
|
||||
if (user_break) {
|
||||
rc = MDBX_EINTR;
|
||||
break;
|
||||
}
|
||||
putchar(' ');
|
||||
dumpval(&data);
|
||||
}
|
||||
if (rc != MDBX_NOTFOUND)
|
||||
break;
|
||||
}
|
||||
}
|
||||
printf("DATA=END\n");
|
||||
@ -206,10 +195,12 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
|
||||
static void usage(void) {
|
||||
fprintf(stderr,
|
||||
"usage: %s "
|
||||
"[-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s table] [-u|U] "
|
||||
"[-V] [-q] [-c] [-f file] [-l] [-p] [-r] [-a|-s table] [-u|U] "
|
||||
"dbpath\n"
|
||||
" -V\t\tprint version and exit\n"
|
||||
" -q\t\tbe quiet\n"
|
||||
" -c\t\tconcise mode without repeating keys,\n"
|
||||
" \t\tbut incompatible with Berkeley DB and LMDB\n"
|
||||
" -f\t\twrite to file instead of stdout\n"
|
||||
" -l\t\tlist tables and exit\n"
|
||||
" -p\t\tuse printable characters\n"
|
||||
@ -268,6 +259,7 @@ int main(int argc, char *argv[]) {
|
||||
"s:"
|
||||
"V"
|
||||
"r"
|
||||
"c"
|
||||
"q")) != EOF) {
|
||||
switch (i) {
|
||||
case 'V':
|
||||
@ -298,6 +290,9 @@ int main(int argc, char *argv[]) {
|
||||
break;
|
||||
case 'n':
|
||||
break;
|
||||
case 'c':
|
||||
mode |= CONCISE;
|
||||
break;
|
||||
case 'p':
|
||||
mode |= PRINT;
|
||||
break;
|
||||
|
@ -380,7 +380,16 @@ __hot static int readline(MDBX_val *out, MDBX_val *buf) {
|
||||
return badend();
|
||||
}
|
||||
}
|
||||
if (fgets(buf->iov_base, (int)buf->iov_len, stdin) == nullptr)
|
||||
|
||||
/* modern concise mode, where space in second position mean the same (previously) value */
|
||||
c = fgetc(stdin);
|
||||
if (c == EOF)
|
||||
return errno ? errno : EOF;
|
||||
if (c == ' ')
|
||||
return (ungetc(c, stdin) == c) ? MDBX_SUCCESS : (errno ? errno : EOF);
|
||||
|
||||
*(char *)buf->iov_base = c;
|
||||
if (fgets((char *)buf->iov_base + 1, (int)buf->iov_len - 1, stdin) == nullptr)
|
||||
return errno ? errno : EOF;
|
||||
lineno++;
|
||||
|
||||
@ -721,8 +730,8 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
int batch = 0;
|
||||
MDBX_val key = {.iov_base = nullptr, .iov_len = 0}, data = {.iov_base = nullptr, .iov_len = 0};
|
||||
while (err == MDBX_SUCCESS) {
|
||||
MDBX_val key, data;
|
||||
err = readline(&key, &kbuf);
|
||||
if (err == EOF)
|
||||
break;
|
||||
|
@ -38,11 +38,10 @@ static MDBX_cursor *cursor_clone(const MDBX_cursor *csrc, cursor_couple_t *coupl
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
void recalculate_merge_thresholds(MDBX_env *env) {
|
||||
const size_t bytes = page_space(env);
|
||||
env->merge_threshold = (uint16_t)(bytes - (bytes * env->options.merge_threshold_16dot16_percent >> 16));
|
||||
env->merge_threshold_gc =
|
||||
(uint16_t)(bytes - ((env->options.merge_threshold_16dot16_percent > 19005) ? bytes / 3 /* 33 % */
|
||||
: bytes / 4 /* 25 % */));
|
||||
const size_t whole_page_space = page_space(env);
|
||||
env->merge_threshold =
|
||||
(uint16_t)(whole_page_space - (whole_page_space * env->options.merge_threshold_16dot16_percent >> 16));
|
||||
eASSERT(env, env->merge_threshold >= whole_page_space / 2 && env->merge_threshold <= whole_page_space / 64 * 63);
|
||||
}
|
||||
|
||||
int tree_drop(MDBX_cursor *mc, const bool may_have_tables) {
|
||||
@ -56,7 +55,7 @@ int tree_drop(MDBX_cursor *mc, const bool may_have_tables) {
|
||||
if (!(may_have_tables | mc->tree->large_pages))
|
||||
cursor_pop(mc);
|
||||
|
||||
rc = pnl_need(&txn->tw.retired_pages,
|
||||
rc = pnl_need(&txn->wr.retired_pages,
|
||||
(size_t)mc->tree->branch_pages + (size_t)mc->tree->leaf_pages + (size_t)mc->tree->large_pages);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
@ -446,8 +445,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
||||
cASSERT(cdst, cdst->top > 0);
|
||||
cASSERT(cdst, cdst->top + 1 < cdst->tree->height || is_leaf(cdst->pg[cdst->tree->height - 1]));
|
||||
cASSERT(csrc, csrc->top + 1 < csrc->tree->height || is_leaf(csrc->pg[csrc->tree->height - 1]));
|
||||
cASSERT(cdst,
|
||||
csrc->txn->env->options.prefer_waf_insteadof_balance || page_room(pdst) >= page_used(cdst->txn->env, psrc));
|
||||
cASSERT(cdst, cursor_dbi(csrc) == FREE_DBI || csrc->txn->env->options.prefer_waf_insteadof_balance ||
|
||||
page_room(pdst) >= page_used(cdst->txn->env, psrc));
|
||||
const int pagetype = page_type(psrc);
|
||||
|
||||
/* Move all nodes from src to dst */
|
||||
@ -680,8 +679,18 @@ int tree_rebalance(MDBX_cursor *mc) {
|
||||
const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1;
|
||||
|
||||
/* Pages emptier than this are candidates for merging. */
|
||||
size_t room_threshold =
|
||||
likely(mc->tree != &mc->txn->dbs[FREE_DBI]) ? mc->txn->env->merge_threshold : mc->txn->env->merge_threshold_gc;
|
||||
size_t room_threshold = mc->txn->env->merge_threshold;
|
||||
bool minimize_waf = mc->txn->env->options.prefer_waf_insteadof_balance;
|
||||
if (unlikely(mc->tree == &mc->txn->dbs[FREE_DBI])) {
|
||||
/* В случае GC всегда минимизируем WAF, а рыхлые страницы объединяем только при наличии запаса в gc_stockpile().
|
||||
* Это позволяет уменьшить WAF и избавиться от лишних действий/циклов как при переработке GC,
|
||||
* так и при возврате неиспользованных страниц. Сбалансированность b-tree при этом почти не деградирует,
|
||||
* ибо добавление/удаление/обновление запиcей происходит почти всегда только по краям. */
|
||||
minimize_waf = true;
|
||||
room_threshold = page_space(mc->txn->env);
|
||||
if (gc_stockpile(mc->txn) > mc->tree->height + mc->tree->height)
|
||||
room_threshold >>= 1;
|
||||
}
|
||||
|
||||
const size_t numkeys = page_numkeys(tp);
|
||||
const size_t room = page_room(tp);
|
||||
@ -802,10 +811,26 @@ int tree_rebalance(MDBX_cursor *mc) {
|
||||
const size_t right_room = right ? page_room(right) : 0;
|
||||
const size_t left_nkeys = left ? page_numkeys(left) : 0;
|
||||
const size_t right_nkeys = right ? page_numkeys(right) : 0;
|
||||
|
||||
/* Нужно выбрать между правой и левой страницами для слияния текущей или перемещения узла в текущую.
|
||||
* Таким образом, нужно выбрать один из четырёх вариантов согласно критериям.
|
||||
*
|
||||
* Если включен minimize_waf, то стараемся не вовлекать чистые страницы,
|
||||
* пренебрегая идеальностью баланса ради уменьшения WAF.
|
||||
*
|
||||
* При этом отдельные варианты могут быть не доступны, либо "не сработать" из-за того что:
|
||||
* - в какой-то branch-странице не хватит места из-за распространения/обновления первых ключей,
|
||||
* которые хранятся в родительских страницах;
|
||||
* - при включенном minimize_waf распространение/обновление первых ключей
|
||||
* потребуется разделение какой-либо странице, что увеличит WAF и поэтому обесценивает дальнейшее
|
||||
* следование minimize_waf. */
|
||||
|
||||
bool involve = !(left && right);
|
||||
retry:
|
||||
cASSERT(mc, mc->top > 0);
|
||||
if (left_room > room_threshold && left_room >= right_room && (is_modifable(mc->txn, left) || involve)) {
|
||||
const bool consider_left = left && (involve || is_modifable(mc->txn, left));
|
||||
const bool consider_right = right && (involve || is_modifable(mc->txn, right));
|
||||
if (consider_left && left_room > room_threshold && left_room >= right_room) {
|
||||
/* try merge with left */
|
||||
cASSERT(mc, left_nkeys >= minkeys);
|
||||
mn->pg[mn->top] = left;
|
||||
@ -825,7 +850,7 @@ retry:
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
if (right_room > room_threshold && (is_modifable(mc->txn, right) || involve)) {
|
||||
if (consider_right && right_room > room_threshold) {
|
||||
/* try merge with right */
|
||||
cASSERT(mc, right_nkeys >= minkeys);
|
||||
mn->pg[mn->top] = right;
|
||||
@ -843,8 +868,7 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
if (left_nkeys > minkeys && (right_nkeys <= left_nkeys || right_room >= left_room) &&
|
||||
(is_modifable(mc->txn, left) || involve)) {
|
||||
if (consider_left && left_nkeys > minkeys && (right_nkeys <= left_nkeys || right_room >= left_room)) {
|
||||
/* try move from left */
|
||||
mn->pg[mn->top] = left;
|
||||
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1);
|
||||
@ -860,7 +884,7 @@ retry:
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
if (right_nkeys > minkeys && (is_modifable(mc->txn, right) || involve)) {
|
||||
if (consider_right && right_nkeys > minkeys) {
|
||||
/* try move from right */
|
||||
mn->pg[mn->top] = right;
|
||||
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1);
|
||||
@ -884,17 +908,20 @@ retry:
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
if (mc->txn->env->options.prefer_waf_insteadof_balance && likely(room_threshold > 0)) {
|
||||
if (minimize_waf && room_threshold > 0) {
|
||||
/* Если включен minimize_waf, то переходим к попыткам слияния с сильно
|
||||
* заполненными страницами до вовлечения чистых страниц (не измененных в этой транзакции) */
|
||||
room_threshold = 0;
|
||||
goto retry;
|
||||
}
|
||||
if (likely(!involve) &&
|
||||
(likely(mc->tree != &mc->txn->dbs[FREE_DBI]) || mc->txn->tw.loose_pages || MDBX_PNL_GETSIZE(mc->txn->tw.repnl) ||
|
||||
(mc->flags & z_gcu_preparation) || (mc->txn->flags & txn_gc_drained) || room_threshold)) {
|
||||
if (!involve) {
|
||||
/* Теперь допускаем вовлечение чистых страниц (не измененных в этой транзакции),
|
||||
* что улучшает баланс в дереве, но увеличивает WAF. */
|
||||
involve = true;
|
||||
goto retry;
|
||||
}
|
||||
if (likely(room_threshold > 0)) {
|
||||
if (room_threshold > 0) {
|
||||
/* Если не нашли подходящей соседней, то допускаем слияние с сильно заполненными страницами */
|
||||
room_threshold = 0;
|
||||
goto retry;
|
||||
}
|
||||
@ -1228,6 +1255,7 @@ int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, MDBX_val *const ne
|
||||
|
||||
/* root split? */
|
||||
prev_top += mc->top - top;
|
||||
cASSERT(mn, prev_top <= mn->top && prev_top <= mc->top);
|
||||
|
||||
/* Right page might now have changed parent.
|
||||
* Check if left page also changed parent. */
|
||||
|
@ -63,14 +63,14 @@ static int txl_reserve(txl_t __restrict *__restrict ptxl, const size_t wanna) {
|
||||
return MDBX_ENOMEM;
|
||||
}
|
||||
|
||||
static __always_inline int __must_check_result txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
|
||||
static inline int __must_check_result txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
|
||||
assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max && MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
|
||||
assert(num <= PAGELIST_LIMIT);
|
||||
const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptxl) + num;
|
||||
return likely(MDBX_PNL_ALLOCLEN(*ptxl) >= wanna) ? MDBX_SUCCESS : txl_reserve(ptxl, wanna);
|
||||
}
|
||||
|
||||
static __always_inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
|
||||
static inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
|
||||
assert(MDBX_PNL_GETSIZE(txl) < MDBX_PNL_ALLOCLEN(txl));
|
||||
txl[0] += 1;
|
||||
MDBX_PNL_LAST(txl) = id;
|
||||
|
10
src/txl.h
10
src/txl.h
@ -15,12 +15,12 @@ enum txl_rules {
|
||||
txl_max = (1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)
|
||||
};
|
||||
|
||||
MDBX_INTERNAL txl_t txl_alloc(void);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL txl_t txl_alloc(void);
|
||||
|
||||
MDBX_INTERNAL void txl_free(txl_t txl);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void txl_free(txl_t txl);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, txnid_t id);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, txnid_t id);
|
||||
|
||||
MDBX_INTERNAL void txl_sort(txl_t txl);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void txl_sort(txl_t txl);
|
||||
|
||||
MDBX_INTERNAL bool txl_contain(const txl_t txl, txnid_t id);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool txl_contain(const txl_t txl, txnid_t id);
|
||||
|
366
src/txn-basal.c
Normal file
366
src/txn-basal.c
Normal file
@ -0,0 +1,366 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
dpl_t *const dl = dpl_sort(txn);
|
||||
int rc = MDBX_SUCCESS;
|
||||
size_t r, w, total_npages = 0;
|
||||
for (w = 0, r = 1; r <= dl->length; ++r) {
|
||||
page_t *dp = dl->items[r].ptr;
|
||||
if (dp->flags & P_LOOSE) {
|
||||
dl->items[++w] = dl->items[r];
|
||||
continue;
|
||||
}
|
||||
unsigned npages = dpl_npages(dl, r);
|
||||
total_npages += npages;
|
||||
rc = iov_page(txn, ctx, dp, npages);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (!iov_empty(ctx)) {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
rc = iov_write(ctx);
|
||||
}
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->env->lazy_fd) {
|
||||
txn->env->lck->unsynced_pages.weak += total_npages;
|
||||
if (!txn->env->lck->eoos_timestamp.weak)
|
||||
txn->env->lck->eoos_timestamp.weak = osal_monotime();
|
||||
}
|
||||
|
||||
txn->wr.dirtylist->pages_including_loose -= total_npages;
|
||||
while (r <= dl->length)
|
||||
dl->items[++w] = dl->items[r++];
|
||||
|
||||
dl->sorted = dpl_setlen(dl, w);
|
||||
txn->wr.dirtyroom += r - 1 - w;
|
||||
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
|
||||
tASSERT(txn, txn->wr.dirtylist->length == txn->wr.loose_count);
|
||||
tASSERT(txn, txn->wr.dirtylist->pages_including_loose == txn->wr.loose_count);
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold MDBX_txn *txn_basal_create(const size_t max_dbi) {
|
||||
MDBX_txn *txn = nullptr;
|
||||
const intptr_t bitmap_bytes =
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
ceil_powerof2(max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT;
|
||||
#else
|
||||
0;
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
const size_t base = sizeof(MDBX_txn) + /* GC cursor */ sizeof(cursor_couple_t);
|
||||
const size_t size =
|
||||
base + bitmap_bytes +
|
||||
max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_seqs[0]) + sizeof(txn->dbi_state[0]));
|
||||
|
||||
txn = osal_calloc(1, size);
|
||||
if (unlikely(!txn))
|
||||
return txn;
|
||||
|
||||
rkl_init(&txn->wr.gc.reclaimed);
|
||||
rkl_init(&txn->wr.gc.comeback);
|
||||
txn->dbs = ptr_disp(txn, base);
|
||||
txn->cursors = ptr_disp(txn->dbs, max_dbi * sizeof(txn->dbs[0]));
|
||||
txn->dbi_seqs = ptr_disp(txn->cursors, max_dbi * sizeof(txn->cursors[0]));
|
||||
txn->dbi_state = ptr_disp(txn, size - max_dbi * sizeof(txn->dbi_state[0]));
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
txn->flags = MDBX_TXN_FINISHED;
|
||||
txn->wr.retired_pages = pnl_alloc(MDBX_PNL_INITIAL);
|
||||
txn->wr.repnl = pnl_alloc(MDBX_PNL_INITIAL);
|
||||
if (unlikely(!txn->wr.retired_pages || !txn->wr.repnl)) {
|
||||
txn_basal_destroy(txn);
|
||||
txn = nullptr;
|
||||
}
|
||||
|
||||
return txn;
|
||||
}
|
||||
|
||||
__cold void txn_basal_destroy(MDBX_txn *txn) {
|
||||
dpl_free(txn);
|
||||
rkl_destroy(&txn->wr.gc.reclaimed);
|
||||
rkl_destroy(&txn->wr.gc.comeback);
|
||||
pnl_free(txn->wr.retired_pages);
|
||||
pnl_free(txn->wr.spilled.list);
|
||||
pnl_free(txn->wr.repnl);
|
||||
osal_free(txn);
|
||||
}
|
||||
|
||||
int txn_basal_start(MDBX_txn *txn, unsigned flags) {
|
||||
MDBX_env *const env = txn->env;
|
||||
|
||||
txn->wr.troika = meta_tap(env);
|
||||
const meta_ptr_t head = meta_recent(env, &txn->wr.troika);
|
||||
uint64_t timestamp = 0;
|
||||
/* coverity[array_null] */
|
||||
while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") {
|
||||
int err = coherency_fetch_head(txn, head, ×tamp);
|
||||
if (likely(err == MDBX_SUCCESS))
|
||||
break;
|
||||
if (unlikely(err != MDBX_RESULT_TRUE))
|
||||
return err;
|
||||
}
|
||||
eASSERT(env, meta_txnid(head.ptr_v) == txn->txnid);
|
||||
txn->txnid = safe64_txnid_next(txn->txnid);
|
||||
if (unlikely(txn->txnid > MAX_TXNID)) {
|
||||
ERROR("txnid overflow, raise %d", MDBX_TXN_FULL);
|
||||
return MDBX_TXN_FULL;
|
||||
}
|
||||
|
||||
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
|
||||
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
|
||||
txn->flags = flags;
|
||||
txn->nested = nullptr;
|
||||
txn->wr.loose_pages = nullptr;
|
||||
txn->wr.loose_count = 0;
|
||||
#if MDBX_ENABLE_REFUND
|
||||
txn->wr.loose_refund_wl = 0;
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
MDBX_PNL_SETSIZE(txn->wr.retired_pages, 0);
|
||||
txn->wr.spilled.list = nullptr;
|
||||
txn->wr.spilled.least_removed = 0;
|
||||
txn->wr.gc.spent = 0;
|
||||
tASSERT(txn, rkl_empty(&txn->wr.gc.reclaimed));
|
||||
txn->env->gc.detent = 0;
|
||||
env->txn = txn;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int txn_basal_end(MDBX_txn *txn, unsigned mode) {
|
||||
MDBX_env *const env = txn->env;
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | txn_may_have_cursors)) == 0 && txn->owner);
|
||||
ENSURE(env, txn->txnid >= /* paranoia is appropriate here */ env->lck->cached_oldest.weak);
|
||||
dxb_sanitize_tail(env, nullptr);
|
||||
|
||||
txn->flags = MDBX_TXN_FINISHED;
|
||||
env->txn = nullptr;
|
||||
pnl_free(txn->wr.spilled.list);
|
||||
txn->wr.spilled.list = nullptr;
|
||||
rkl_clear_and_shrink(&txn->wr.gc.reclaimed);
|
||||
rkl_clear_and_shrink(&txn->wr.gc.comeback);
|
||||
|
||||
eASSERT(env, txn->parent == nullptr);
|
||||
pnl_shrink(&txn->wr.retired_pages);
|
||||
pnl_shrink(&txn->wr.repnl);
|
||||
if (!(env->flags & MDBX_WRITEMAP))
|
||||
dpl_release_shadows(txn);
|
||||
|
||||
/* Export or close DBI handles created in this txn */
|
||||
int err = dbi_update(txn, (mode & TXN_END_UPDATE) != 0);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
ERROR("unexpected error %d during export the state of dbi-handles to env", err);
|
||||
err = MDBX_PROBLEM;
|
||||
}
|
||||
|
||||
/* The writer mutex was locked in mdbx_txn_begin. */
|
||||
lck_txn_unlock(env);
|
||||
return err;
|
||||
}
|
||||
|
||||
int txn_basal_commit(MDBX_txn *txn, struct commit_timestamp *ts) {
|
||||
MDBX_env *const env = txn->env;
|
||||
tASSERT(txn, txn == env->basal_txn && !txn->parent && !txn->nested);
|
||||
if (!txn->wr.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length == env->options.dp_limit);
|
||||
}
|
||||
|
||||
if (txn->flags & txn_may_have_cursors)
|
||||
txn_done_cursors(txn);
|
||||
|
||||
bool need_flush_for_nometasync = false;
|
||||
const meta_ptr_t head = meta_recent(env, &txn->wr.troika);
|
||||
const uint32_t meta_sync_txnid = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
|
||||
/* sync prev meta */
|
||||
if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) {
|
||||
/* Исправление унаследованного от LMDB недочета:
|
||||
*
|
||||
* Всё хорошо, если все процессы работающие с БД не используют WRITEMAP.
|
||||
* Тогда мета-страница (обновленная, но не сброшенная на диск) будет
|
||||
* сохранена в результате fdatasync() при записи данных этой транзакции.
|
||||
*
|
||||
* Всё хорошо, если все процессы работающие с БД используют WRITEMAP
|
||||
* без MDBX_AVOID_MSYNC.
|
||||
* Тогда мета-страница (обновленная, но не сброшенная на диск) будет
|
||||
* сохранена в результате msync() при записи данных этой транзакции.
|
||||
*
|
||||
* Если же в процессах работающих с БД используется оба метода, как sync()
|
||||
* в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то
|
||||
* становится невозможным обеспечить фиксацию на диске мета-страницы
|
||||
* предыдущей транзакции и данных текущей транзакции, за счет одной
|
||||
* sync-операцией выполняемой после записи данных текущей транзакции.
|
||||
* Соответственно, требуется явно обновлять мета-страницу, что полностью
|
||||
* уничтожает выгоду от NOMETASYNC. */
|
||||
const uint32_t txnid_dist = ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) ? MDBX_NOMETASYNC_LAZY_FD
|
||||
: MDBX_NOMETASYNC_LAZY_WRITEMAP;
|
||||
/* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync()
|
||||
* или msync() для гарантированной фиксации на диске мета-страницы,
|
||||
* которая была "лениво" отправлена на запись в предыдущей транзакции,
|
||||
* но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */
|
||||
if (
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
!env->ioring.overlapped_fd &&
|
||||
#endif
|
||||
meta_sync_txnid == (uint32_t)head.txnid - txnid_dist)
|
||||
need_flush_for_nometasync = true;
|
||||
else {
|
||||
int err = meta_sync(env, head);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
ERROR("txn-%s: error %d", "presync-meta", err);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((!txn->wr.dirtylist || txn->wr.dirtylist->length == 0) &&
|
||||
(txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_NOSYNC | MDBX_TXN_NOMETASYNC)) == 0 &&
|
||||
!need_flush_for_nometasync && !head.is_steady && !AUDIT_ENABLED()) {
|
||||
TXN_FOREACH_DBI_ALL(txn, i) { tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); }
|
||||
/* fast completion of pure transaction */
|
||||
return MDBX_NOSUCCESS_PURE_COMMIT ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, (void *)txn,
|
||||
(void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
|
||||
|
||||
if (txn->n_dbi > CORE_DBS) {
|
||||
/* Update table root pointers */
|
||||
cursor_couple_t cx;
|
||||
int err = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
cx.outer.next = txn->cursors[MAIN_DBI];
|
||||
txn->cursors[MAIN_DBI] = &cx.outer;
|
||||
TXN_FOREACH_DBI_USER(txn, i) {
|
||||
if ((txn->dbi_state[i] & DBI_DIRTY) == 0)
|
||||
continue;
|
||||
tree_t *const db = &txn->dbs[i];
|
||||
DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->mod_txnid, txn->txnid);
|
||||
/* Может быть mod_txnid > front после коммита вложенных тразакций */
|
||||
db->mod_txnid = txn->txnid;
|
||||
MDBX_val data = {db, sizeof(tree_t)};
|
||||
err = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
return err;
|
||||
}
|
||||
}
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
}
|
||||
|
||||
if (ts) {
|
||||
ts->prep = osal_monotime();
|
||||
ts->gc_cpu = osal_cputime(nullptr);
|
||||
}
|
||||
|
||||
gcu_t gcu_ctx;
|
||||
int rc = gc_put_init(txn, &gcu_ctx);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = gc_update(txn, &gcu_ctx);
|
||||
|
||||
#if MDBX_ENABLE_BIGFOOT
|
||||
const txnid_t commit_txnid = gcu_ctx.bigfoot;
|
||||
if (commit_txnid > txn->txnid)
|
||||
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid));
|
||||
#else
|
||||
const txnid_t commit_txnid = txn->txnid;
|
||||
#endif
|
||||
gc_put_destroy(&gcu_ctx);
|
||||
|
||||
if (ts)
|
||||
ts->gc_cpu = osal_cputime(nullptr) - ts->gc_cpu;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
tASSERT(txn, txn->wr.loose_count == 0);
|
||||
txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[FREE_DBI].mod_txnid;
|
||||
txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[MAIN_DBI].mod_txnid;
|
||||
|
||||
if (ts) {
|
||||
ts->gc = osal_monotime();
|
||||
ts->audit = ts->gc;
|
||||
}
|
||||
if (AUDIT_ENABLED()) {
|
||||
rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->wr.retired_pages), true);
|
||||
if (ts)
|
||||
ts->audit = osal_monotime();
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (txn->wr.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
tASSERT(txn, txn->wr.loose_count == 0);
|
||||
|
||||
mdbx_filehandle_t fd =
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
(void)need_flush_for_nometasync;
|
||||
#else
|
||||
(need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE ||
|
||||
txn->wr.dirtylist->length > env->options.writethrough_threshold ||
|
||||
atomic_load64(&env->lck->unsynced_pages, mo_Relaxed))
|
||||
? env->lazy_fd
|
||||
: env->dsync_fd;
|
||||
#endif /* Windows */
|
||||
|
||||
iov_ctx_t write_ctx;
|
||||
rc = iov_init(txn, &write_ctx, txn->wr.dirtylist->length, txn->wr.dirtylist->pages_including_loose, fd, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
ERROR("txn-%s: error %d", "iov-init", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = txn_write(txn, &write_ctx);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
ERROR("txn-%s: error %d", "write", rc);
|
||||
return rc;
|
||||
}
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
env->lck->unsynced_pages.weak += txn->wr.writemap_dirty_npages;
|
||||
if (!env->lck->eoos_timestamp.weak)
|
||||
env->lck->eoos_timestamp.weak = osal_monotime();
|
||||
}
|
||||
|
||||
/* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
|
||||
if (ts)
|
||||
ts->write = osal_monotime();
|
||||
|
||||
meta_t meta;
|
||||
memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8);
|
||||
meta.reserve16 = head.ptr_c->reserve16;
|
||||
meta.validator_id = head.ptr_c->validator_id;
|
||||
meta.extra_pagehdr = head.ptr_c->extra_pagehdr;
|
||||
unaligned_poke_u64(4, meta.pages_retired,
|
||||
unaligned_peek_u64(4, head.ptr_c->pages_retired) + MDBX_PNL_GETSIZE(txn->wr.retired_pages));
|
||||
meta.geometry = txn->geo;
|
||||
meta.trees.gc = txn->dbs[FREE_DBI];
|
||||
meta.trees.main = txn->dbs[MAIN_DBI];
|
||||
meta.canary = txn->canary;
|
||||
memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid));
|
||||
|
||||
meta.unsafe_sign = DATASIGN_NONE;
|
||||
meta_set_txnid(env, &meta, commit_txnid);
|
||||
|
||||
rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, &txn->wr.troika);
|
||||
|
||||
if (ts)
|
||||
ts->sync = osal_monotime();
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
env->flags |= ENV_FATAL_ERROR;
|
||||
ERROR("txn-%s: error %d", "sync", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
595
src/txn-nested.c
Normal file
595
src/txn-nested.c
Normal file
@ -0,0 +1,595 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/* Merge pageset of the nested txn into parent */
|
||||
static void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0);
|
||||
dpl_t *const src = dpl_sort(txn);
|
||||
|
||||
/* Remove refunded pages from parent's dirty list */
|
||||
dpl_t *const dst = dpl_sort(parent);
|
||||
if (MDBX_ENABLE_REFUND) {
|
||||
size_t n = dst->length;
|
||||
while (n && dst->items[n].pgno >= parent->geo.first_unallocated) {
|
||||
const unsigned npages = dpl_npages(dst, n);
|
||||
page_shadow_release(txn->env, dst->items[n].ptr, npages);
|
||||
--n;
|
||||
}
|
||||
parent->wr.dirtyroom += dst->sorted - n;
|
||||
dst->sorted = dpl_setlen(dst, n);
|
||||
tASSERT(parent, parent->wr.dirtyroom + parent->wr.dirtylist->length ==
|
||||
(parent->parent ? parent->parent->wr.dirtyroom : parent->env->options.dp_limit));
|
||||
}
|
||||
|
||||
/* Remove reclaimed pages from parent's dirty list */
|
||||
const pnl_t reclaimed_list = parent->wr.repnl;
|
||||
dpl_sift(parent, reclaimed_list, false);
|
||||
|
||||
/* Move retired pages from parent's dirty & spilled list to reclaimed */
|
||||
size_t r, w, d, s, l;
|
||||
for (r = w = parent_retired_len; ++r <= MDBX_PNL_GETSIZE(parent->wr.retired_pages);) {
|
||||
const pgno_t pgno = parent->wr.retired_pages[r];
|
||||
const size_t di = dpl_exist(parent, pgno);
|
||||
const size_t si = !di ? spill_search(parent, pgno) : 0;
|
||||
unsigned npages;
|
||||
const char *kind;
|
||||
if (di) {
|
||||
page_t *dp = dst->items[di].ptr;
|
||||
tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_SPILLED)) == 0);
|
||||
npages = dpl_npages(dst, di);
|
||||
page_wash(parent, di, dp, npages);
|
||||
kind = "dirty";
|
||||
l = 1;
|
||||
if (unlikely(npages > l)) {
|
||||
/* OVERFLOW-страница могла быть переиспользована по частям. Тогда
|
||||
* в retired-списке может быть только начало последовательности,
|
||||
* а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому
|
||||
* переносим в reclaimed с проверкой на обрыв последовательности.
|
||||
* В любом случае, все осколки будут учтены и отфильтрованы, т.е. если
|
||||
* страница была разбита на части, то важно удалить dirty-элемент,
|
||||
* а все осколки будут учтены отдельно. */
|
||||
|
||||
/* Список retired страниц не сортирован, но для ускорения сортировки
|
||||
* дополняется в соответствии с MDBX_PNL_ASCENDING */
|
||||
#if MDBX_PNL_ASCENDING
|
||||
const size_t len = MDBX_PNL_GETSIZE(parent->wr.retired_pages);
|
||||
while (r < len && parent->wr.retired_pages[r + 1] == pgno + l) {
|
||||
++r;
|
||||
if (++l == npages)
|
||||
break;
|
||||
}
|
||||
#else
|
||||
while (w > parent_retired_len && parent->wr.retired_pages[w - 1] == pgno + l) {
|
||||
--w;
|
||||
if (++l == npages)
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else if (unlikely(si)) {
|
||||
l = npages = 1;
|
||||
spill_remove(parent, si, 1);
|
||||
kind = "spilled";
|
||||
} else {
|
||||
parent->wr.retired_pages[++w] = pgno;
|
||||
continue;
|
||||
}
|
||||
|
||||
DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, kind, pgno);
|
||||
int err = pnl_insert_span(&parent->wr.repnl, pgno, l);
|
||||
ENSURE(txn->env, err == MDBX_SUCCESS);
|
||||
}
|
||||
MDBX_PNL_SETSIZE(parent->wr.retired_pages, w);
|
||||
|
||||
/* Filter-out parent spill list */
|
||||
if (parent->wr.spilled.list && MDBX_PNL_GETSIZE(parent->wr.spilled.list) > 0) {
|
||||
const pnl_t sl = spill_purge(parent);
|
||||
size_t len = MDBX_PNL_GETSIZE(sl);
|
||||
if (len) {
|
||||
/* Remove refunded pages from parent's spill list */
|
||||
if (MDBX_ENABLE_REFUND && MDBX_PNL_MOST(sl) >= (parent->geo.first_unallocated << 1)) {
|
||||
#if MDBX_PNL_ASCENDING
|
||||
size_t i = MDBX_PNL_GETSIZE(sl);
|
||||
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl));
|
||||
do {
|
||||
if ((sl[i] & 1) == 0)
|
||||
DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
|
||||
i -= 1;
|
||||
} while (i && sl[i] >= (parent->geo.first_unallocated << 1));
|
||||
MDBX_PNL_SETSIZE(sl, i);
|
||||
#else
|
||||
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl));
|
||||
size_t i = 0;
|
||||
do {
|
||||
++i;
|
||||
if ((sl[i] & 1) == 0)
|
||||
DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
|
||||
} while (i < len && sl[i + 1] >= (parent->geo.first_unallocated << 1));
|
||||
MDBX_PNL_SETSIZE(sl, len -= i);
|
||||
memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0]));
|
||||
#endif
|
||||
}
|
||||
tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->geo.first_unallocated << 1));
|
||||
|
||||
/* Remove reclaimed pages from parent's spill list */
|
||||
s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list);
|
||||
/* Scanning from end to begin */
|
||||
while (s && r) {
|
||||
if (sl[s] & 1) {
|
||||
--s;
|
||||
continue;
|
||||
}
|
||||
const pgno_t spilled_pgno = sl[s] >> 1;
|
||||
const pgno_t reclaimed_pgno = reclaimed_list[r];
|
||||
if (reclaimed_pgno != spilled_pgno) {
|
||||
const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno);
|
||||
s -= !cmp;
|
||||
r -= cmp;
|
||||
} else {
|
||||
DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, reclaimed_pgno);
|
||||
spill_remove(parent, s, 1);
|
||||
--s;
|
||||
--r;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove anything in our dirty list from parent's spill list */
|
||||
/* Scanning spill list in descend order */
|
||||
const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1;
|
||||
s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1;
|
||||
d = src->length;
|
||||
while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_GETSIZE(sl))) {
|
||||
if (sl[s] & 1) {
|
||||
s += step;
|
||||
continue;
|
||||
}
|
||||
const pgno_t spilled_pgno = sl[s] >> 1;
|
||||
const pgno_t dirty_pgno_form = src->items[d].pgno;
|
||||
const unsigned npages = dpl_npages(src, d);
|
||||
const pgno_t dirty_pgno_to = dirty_pgno_form + npages;
|
||||
if (dirty_pgno_form > spilled_pgno) {
|
||||
--d;
|
||||
continue;
|
||||
}
|
||||
if (dirty_pgno_to <= spilled_pgno) {
|
||||
s += step;
|
||||
continue;
|
||||
}
|
||||
|
||||
DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, dirty_pgno_form);
|
||||
spill_remove(parent, s, 1);
|
||||
s += step;
|
||||
}
|
||||
|
||||
/* Squash deleted pagenums if we deleted any */
|
||||
spill_purge(parent);
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove anything in our spill list from parent's dirty list */
|
||||
if (txn->wr.spilled.list) {
|
||||
tASSERT(txn, pnl_check_allocated(txn->wr.spilled.list, (size_t)parent->geo.first_unallocated << 1));
|
||||
dpl_sift(parent, txn->wr.spilled.list, true);
|
||||
tASSERT(parent, parent->wr.dirtyroom + parent->wr.dirtylist->length ==
|
||||
(parent->parent ? parent->parent->wr.dirtyroom : parent->env->options.dp_limit));
|
||||
}
|
||||
|
||||
/* Find length of merging our dirty list with parent's and release
|
||||
* filter-out pages */
|
||||
for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) {
|
||||
page_t *sp = src->items[s].ptr;
|
||||
tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_LOOSE | P_SPILLED)) == 0);
|
||||
const unsigned s_npages = dpl_npages(src, s);
|
||||
const pgno_t s_pgno = src->items[s].pgno;
|
||||
|
||||
page_t *dp = dst->items[d].ptr;
|
||||
tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_SPILLED)) == 0);
|
||||
const unsigned d_npages = dpl_npages(dst, d);
|
||||
const pgno_t d_pgno = dst->items[d].pgno;
|
||||
|
||||
if (d_pgno >= s_pgno + s_npages) {
|
||||
--d;
|
||||
++l;
|
||||
} else if (d_pgno + d_npages <= s_pgno) {
|
||||
if (sp->flags != P_LOOSE) {
|
||||
sp->txnid = parent->front_txnid;
|
||||
sp->flags &= ~P_SPILLED;
|
||||
}
|
||||
--s;
|
||||
++l;
|
||||
} else {
|
||||
dst->items[d--].ptr = nullptr;
|
||||
page_shadow_release(txn->env, dp, d_npages);
|
||||
}
|
||||
}
|
||||
assert(dst->sorted == dst->length);
|
||||
tASSERT(parent, dst->detent >= l + d + s);
|
||||
dst->sorted = l + d + s; /* the merged length */
|
||||
|
||||
while (s > 0) {
|
||||
page_t *sp = src->items[s].ptr;
|
||||
tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | P_LOOSE | P_SPILLED)) == 0);
|
||||
if (sp->flags != P_LOOSE) {
|
||||
sp->txnid = parent->front_txnid;
|
||||
sp->flags &= ~P_SPILLED;
|
||||
}
|
||||
--s;
|
||||
}
|
||||
|
||||
/* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */
|
||||
if (dst->sorted >= dst->length) {
|
||||
/* from end to begin with dst extending */
|
||||
for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) {
|
||||
if (unlikely(l <= d)) {
|
||||
/* squash to get a gap of free space for merge */
|
||||
for (r = w = 1; r <= d; ++r)
|
||||
if (dst->items[r].ptr) {
|
||||
if (w != r) {
|
||||
dst->items[w] = dst->items[r];
|
||||
dst->items[r].ptr = nullptr;
|
||||
}
|
||||
++w;
|
||||
}
|
||||
VERBOSE("squash to begin for extending-merge %zu -> %zu", d, w - 1);
|
||||
d = w - 1;
|
||||
continue;
|
||||
}
|
||||
assert(l > d);
|
||||
if (dst->items[d].ptr) {
|
||||
dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) ? dst->items[d--] : src->items[s--];
|
||||
} else
|
||||
--d;
|
||||
}
|
||||
if (s > 0) {
|
||||
assert(l == s);
|
||||
while (d > 0) {
|
||||
assert(dst->items[d].ptr == nullptr);
|
||||
--d;
|
||||
}
|
||||
do {
|
||||
assert(l > 0);
|
||||
dst->items[l--] = src->items[s--];
|
||||
} while (s > 0);
|
||||
} else {
|
||||
assert(l == d);
|
||||
while (l > 0) {
|
||||
assert(dst->items[l].ptr != nullptr);
|
||||
--l;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* from begin to end with shrinking (a lot of new large/overflow pages) */
|
||||
for (l = s = d = 1; s <= src->length && d <= dst->length;) {
|
||||
if (unlikely(l >= d)) {
|
||||
/* squash to get a gap of free space for merge */
|
||||
for (r = w = dst->length; r >= d; --r)
|
||||
if (dst->items[r].ptr) {
|
||||
if (w != r) {
|
||||
dst->items[w] = dst->items[r];
|
||||
dst->items[r].ptr = nullptr;
|
||||
}
|
||||
--w;
|
||||
}
|
||||
VERBOSE("squash to end for shrinking-merge %zu -> %zu", d, w + 1);
|
||||
d = w + 1;
|
||||
continue;
|
||||
}
|
||||
assert(l < d);
|
||||
if (dst->items[d].ptr) {
|
||||
dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) ? dst->items[d++] : src->items[s++];
|
||||
} else
|
||||
++d;
|
||||
}
|
||||
if (s <= src->length) {
|
||||
assert(dst->sorted - l == src->length - s);
|
||||
while (d <= dst->length) {
|
||||
assert(dst->items[d].ptr == nullptr);
|
||||
--d;
|
||||
}
|
||||
do {
|
||||
assert(l <= dst->sorted);
|
||||
dst->items[l++] = src->items[s++];
|
||||
} while (s <= src->length);
|
||||
} else {
|
||||
assert(dst->sorted - l == dst->length - d);
|
||||
while (l <= dst->sorted) {
|
||||
assert(l <= d && d <= dst->length && dst->items[d].ptr);
|
||||
dst->items[l++] = dst->items[d++];
|
||||
}
|
||||
}
|
||||
}
|
||||
parent->wr.dirtyroom -= dst->sorted - dst->length;
|
||||
assert(parent->wr.dirtyroom <= parent->env->options.dp_limit);
|
||||
dpl_setlen(dst, dst->sorted);
|
||||
parent->wr.dirtylru = txn->wr.dirtylru;
|
||||
|
||||
/* В текущем понимании выгоднее пересчитать кол-во страниц,
|
||||
* чем подмешивать лишние ветвления и вычисления в циклы выше. */
|
||||
dst->pages_including_loose = 0;
|
||||
for (r = 1; r <= dst->length; ++r)
|
||||
dst->pages_including_loose += dpl_npages(dst, r);
|
||||
|
||||
tASSERT(parent, dpl_check(parent));
|
||||
dpl_free(txn);
|
||||
|
||||
if (txn->wr.spilled.list) {
|
||||
if (parent->wr.spilled.list) {
|
||||
/* Must not fail since space was preserved above. */
|
||||
pnl_merge(parent->wr.spilled.list, txn->wr.spilled.list);
|
||||
pnl_free(txn->wr.spilled.list);
|
||||
} else {
|
||||
parent->wr.spilled.list = txn->wr.spilled.list;
|
||||
parent->wr.spilled.least_removed = txn->wr.spilled.least_removed;
|
||||
}
|
||||
tASSERT(parent, dpl_check(parent));
|
||||
}
|
||||
|
||||
if (parent->wr.spilled.list) {
|
||||
assert(pnl_check_allocated(parent->wr.spilled.list, (size_t)parent->geo.first_unallocated << 1));
|
||||
if (MDBX_PNL_GETSIZE(parent->wr.spilled.list))
|
||||
parent->flags |= MDBX_TXN_SPILLS;
|
||||
}
|
||||
}
|
||||
|
||||
/* Creates and initializes a nested write-transaction on top of `parent`.
 *
 * The child inherits the parent's txnid, geometry, canary, DBI state and a
 * snapshot of the reclaimed-pages list (repnl).  Before the snapshot is taken,
 * the parent's loose pages are folded back into its repnl and washed out of
 * its dirty list, so the child starts from a consistent view.
 *
 * Returns MDBX_SUCCESS or an error code; on allocation failure the partially
 * initialized child is left for the caller's error path (NOTE(review):
 * cleanup of `txn` on the early-return paths is presumably handled by the
 * caller — confirm against the call site of txn_nested_create). */
int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags) {
  if (parent->env->options.spill_parent4child_denominator) {
    /* Spill dirty-pages of parent to provide dirtyroom for child txn */
    int err =
        txn_spill(parent, nullptr, parent->wr.dirtylist->length / parent->env->options.spill_parent4child_denominator);
    if (unlikely(err != MDBX_SUCCESS))
      return LOG_IFERR(err);
  }
  tASSERT(parent, audit_ex(parent, 0, false) == 0);

  MDBX_txn *const txn = txn_alloc(flags, parent->env);
  if (unlikely(!txn))
    return LOG_IFERR(MDBX_ENOMEM);

  tASSERT(parent, dpl_check(parent));
  /* The child runs under the same txnid as the parent; front_txnid is bumped
   * so freshly dirtied pages can be told apart from the parent's. */
  txn->txnid = parent->txnid;
  txn->front_txnid = parent->front_txnid + 1;
  txn->canary = parent->canary;
  parent->flags |= MDBX_TXN_HAS_CHILD;
  parent->nested = txn;
  txn->parent = parent;
  txn->env->txn = txn;
  txn->owner = parent->owner;
  txn->wr.troika = parent->wr.troika;
  rkl_init(&txn->wr.gc.reclaimed);

#if MDBX_ENABLE_DBI_SPARSE
  txn->dbi_sparse = parent->dbi_sparse;
#endif /* MDBX_ENABLE_DBI_SPARSE */
  txn->dbi_seqs = parent->dbi_seqs;
  txn->geo = parent->geo;

  int err = dpl_alloc(txn);
  if (unlikely(err != MDBX_SUCCESS))
    return LOG_IFERR(err);

  /* Size the child's repnl so it can hold the parent's repnl plus all loose
   * pages that are about to be merged into it below. */
  const size_t len = MDBX_PNL_GETSIZE(parent->wr.repnl) + parent->wr.loose_count;
  txn->wr.repnl = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
  if (unlikely(!txn->wr.repnl))
    return LOG_IFERR(MDBX_ENOMEM);

  /* Move loose pages to reclaimed list */
  if (parent->wr.loose_count) {
    do {
      page_t *lp = parent->wr.loose_pages;
      tASSERT(parent, lp->flags == P_LOOSE);
      err = pnl_insert_span(&parent->wr.repnl, lp->pgno, 1);
      if (unlikely(err != MDBX_SUCCESS))
        return LOG_IFERR(err);
      MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
      VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
      parent->wr.loose_pages = page_next(lp);
      /* Remove from dirty list */
      page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1);
    } while (parent->wr.loose_pages);
    parent->wr.loose_count = 0;
#if MDBX_ENABLE_REFUND
    parent->wr.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
    tASSERT(parent, dpl_check(parent));
  }
#if MDBX_ENABLE_REFUND
  txn->wr.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
  txn->wr.dirtyroom = parent->wr.dirtyroom;
  txn->wr.dirtylru = parent->wr.dirtylru;

  dpl_sort(parent);
  if (parent->wr.spilled.list)
    spill_purge(parent);

  /* Snapshot the parent's repnl into the child (capacity was reserved above). */
  tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->wr.repnl) >= MDBX_PNL_GETSIZE(parent->wr.repnl));
  memcpy(txn->wr.repnl, parent->wr.repnl, MDBX_PNL_SIZEOF(parent->wr.repnl));
  /* coverity[assignment_where_comparison_intended] */
  tASSERT(txn, pnl_check_allocated(txn->wr.repnl, (txn->geo.first_unallocated /* LY: intentional assignment
                                                     here, only for assertion */
                                                   = parent->geo.first_unallocated) -
                                                      MDBX_ENABLE_REFUND));

  txn->wr.gc.spent = parent->wr.gc.spent;
  rkl_init(&txn->wr.gc.comeback);
  err = rkl_copy(&parent->wr.gc.reclaimed, &txn->wr.gc.reclaimed);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* The child takes over the retired-pages list; the parent keeps only the
   * previous length, smuggled through the pointer field, so the delta can be
   * computed (and the list restored) on join/abort. */
  txn->wr.retired_pages = parent->wr.retired_pages;
  parent->wr.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->wr.retired_pages);

  txn->cursors[FREE_DBI] = nullptr;
  txn->cursors[MAIN_DBI] = nullptr;
  /* Strip per-txn freshness bits when inheriting core DBI states. */
  txn->dbi_state[FREE_DBI] = parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
  txn->dbi_state[MAIN_DBI] = parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
  memset(txn->dbi_state + CORE_DBS, 0, (txn->n_dbi = parent->n_dbi) - CORE_DBS);
  memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS);

  tASSERT(parent, parent->wr.dirtyroom + parent->wr.dirtylist->length ==
                      (parent->parent ? parent->parent->wr.dirtyroom : parent->env->options.dp_limit));
  tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
                   (txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
  tASSERT(parent, parent->cursors[FREE_DBI] == nullptr);
  // TODO: shadow GC' cursor
  return txn_shadow_cursors(parent, MAIN_DBI);
}
|
||||
|
||||
/* Aborts a nested transaction and releases everything it owns.
 *
 * Restores the parent's retired-pages list: the parent stored only its old
 * length (as an integer disguised in the pointer field — see
 * txn_nested_create), so the nested list is truncated back to that length
 * and handed back.  Then shadow copies, the dirty list, the repnl and the
 * txn object itself are freed. */
void txn_nested_abort(MDBX_txn *nested) {
  MDBX_txn *const parent = nested->parent;
  tASSERT(nested, !(nested->flags & txn_may_have_cursors));
  nested->signature = 0;
  nested->owner = 0;

  rkl_destroy(&nested->wr.gc.reclaimed);

  if (nested->wr.retired_pages) {
    /* parent->wr.retired_pages currently holds the pre-nesting length, not a
     * pointer; truncate the list to drop pages retired inside the child. */
    tASSERT(parent, MDBX_PNL_GETSIZE(nested->wr.retired_pages) >= (uintptr_t)parent->wr.retired_pages);
    MDBX_PNL_SETSIZE(nested->wr.retired_pages, (uintptr_t)parent->wr.retired_pages);
    parent->wr.retired_pages = nested->wr.retired_pages;
  }

  tASSERT(parent, dpl_check(parent));
  tASSERT(parent, audit_ex(parent, 0, false) == 0);
  dpl_release_shadows(nested);
  dpl_free(nested);
  pnl_free(nested->wr.repnl);
  osal_free(nested);
}
|
||||
|
||||
/* Commits a nested transaction into its parent ("join").
 *
 * Fast path: a pure (nothing dirtied, no DBI added) child only propagates
 * fresh DBI descriptors and ends.  Otherwise all allocations that could fail
 * are performed FIRST (repnl growth, parent's spill-list growth, parent's
 * dirty-list reserve), so that once the hand-over of lists begins the parent
 * state can no longer be corrupted by an allocation failure.
 *
 * `ts`, when non-null, receives commit-stage timestamps (all collapsed to a
 * single point since a nested join performs no GC-update/audit/write/sync).
 * Returns MDBX_SUCCESS, MDBX_ENOMEM, or an error from pnl_need(). */
int txn_nested_join(MDBX_txn *txn, struct commit_timestamp *ts) {
  MDBX_env *const env = txn->env;
  MDBX_txn *const parent = txn->parent;
  tASSERT(txn, audit_ex(txn, 0, false) == 0);
  eASSERT(env, txn != env->basal_txn);
  eASSERT(env, parent->signature == txn_signature);
  eASSERT(env, parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0);
  eASSERT(env, dpl_check(txn));

  if (txn->wr.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && parent->n_dbi == txn->n_dbi) {
    /* Fast path: the child dirtied nothing — just sync DBI descriptors. */
    VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid);

    tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0);
    tASSERT(txn, memcmp(&parent->canary, &txn->canary, sizeof(parent->canary)) == 0);
    tASSERT(txn, !txn->wr.spilled.list || MDBX_PNL_GETSIZE(txn->wr.spilled.list) == 0);
    tASSERT(txn, txn->wr.loose_count == 0);

    /* Update parent's DBs array */
    eASSERT(env, parent->n_dbi == txn->n_dbi);
    TXN_FOREACH_DBI_ALL(txn, dbi) {
      tASSERT(txn, (txn->dbi_state[dbi] & (DBI_CREAT | DBI_DIRTY)) == 0);
      if (txn->dbi_state[dbi] & DBI_FRESH) {
        parent->dbs[dbi] = txn->dbs[dbi];
        /* preserve parent's status */
        const uint8_t state = txn->dbi_state[dbi] | DBI_FRESH;
        DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
              parent->dbi_state[dbi], state);
        parent->dbi_state[dbi] = state;
      }
    }
    return txn_end(txn, TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE);
  }

  /* Preserve space for spill list to avoid parent's state corruption
   * if allocation fails. */
  const size_t parent_retired_len = (uintptr_t)parent->wr.retired_pages;
  tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->wr.retired_pages));
  const size_t retired_delta = MDBX_PNL_GETSIZE(txn->wr.retired_pages) - parent_retired_len;
  if (retired_delta) {
    int err = pnl_need(&txn->wr.repnl, retired_delta);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }

  if (txn->wr.spilled.list) {
    if (parent->wr.spilled.list) {
      /* Pre-grow so the later pnl_merge() in txn_merge() cannot fail. */
      int err = pnl_need(&parent->wr.spilled.list, MDBX_PNL_GETSIZE(txn->wr.spilled.list));
      if (unlikely(err != MDBX_SUCCESS))
        return err;
    }
    spill_purge(txn);
  }

  if (unlikely(txn->wr.dirtylist->length + parent->wr.dirtylist->length > parent->wr.dirtylist->detent &&
               !dpl_reserve(parent, txn->wr.dirtylist->length + parent->wr.dirtylist->length))) {
    return MDBX_ENOMEM;
  }

  //-------------------------------------------------------------------------
  /* Point of no return: from here on nothing may fail. */

  parent->wr.retired_pages = txn->wr.retired_pages;
  txn->wr.retired_pages = nullptr;

  pnl_free(parent->wr.repnl);
  parent->wr.repnl = txn->wr.repnl;
  txn->wr.repnl = nullptr;
  parent->wr.gc.spent = txn->wr.gc.spent;
  rkl_destructive_move(&txn->wr.gc.reclaimed, &parent->wr.gc.reclaimed);

  parent->geo = txn->geo;
  parent->canary = txn->canary;
  parent->flags |= txn->flags & MDBX_TXN_DIRTY;

  /* Move loose pages to parent */
#if MDBX_ENABLE_REFUND
  parent->wr.loose_refund_wl = txn->wr.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
  parent->wr.loose_count = txn->wr.loose_count;
  parent->wr.loose_pages = txn->wr.loose_pages;

  if (txn->flags & txn_may_have_cursors)
    /* Merge our cursors into parent's and close them */
    txn_done_cursors(txn);

  /* Update parent's DBs array */
  eASSERT(env, parent->n_dbi == txn->n_dbi);
  TXN_FOREACH_DBI_ALL(txn, dbi) {
    if (txn->dbi_state[dbi] != (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))) {
      eASSERT(env,
              (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) != 0 ||
                  (txn->dbi_state[dbi] | DBI_STALE) == (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY)));
      parent->dbs[dbi] = txn->dbs[dbi];
      /* preserve parent's status */
      const uint8_t state = txn->dbi_state[dbi] | (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
      DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still",
            parent->dbi_state[dbi], state);
      parent->dbi_state[dbi] = state;
    }
  }

  if (ts) {
    /* A nested join does no I/O: all stage timestamps collapse to one. */
    ts->prep = osal_monotime();
    ts->gc = /* no gc-update */ ts->prep;
    ts->audit = /* no audit */ ts->gc;
    ts->write = /* no write */ ts->audit;
    ts->sync = /* no sync */ ts->write;
  }
  txn_merge(parent, txn, parent_retired_len);
  tASSERT(parent, parent->flags & MDBX_TXN_HAS_CHILD);
  parent->flags -= MDBX_TXN_HAS_CHILD;
  env->txn = parent;
  parent->nested = nullptr;
  tASSERT(parent, dpl_check(parent));

#if MDBX_ENABLE_REFUND
  txn_refund(parent);
  if (ASSERT_ENABLED()) {
    /* Check parent's loose pages not suitable for refund */
    for (page_t *lp = parent->wr.loose_pages; lp; lp = page_next(lp)) {
      tASSERT(parent, lp->pgno < parent->wr.loose_refund_wl && lp->pgno + 1 < parent->geo.first_unallocated);
      MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
      VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
    }
    /* Check parent's reclaimed pages not suitable for refund */
    if (MDBX_PNL_GETSIZE(parent->wr.repnl))
      tASSERT(parent, MDBX_PNL_MOST(parent->wr.repnl) + 1 < parent->geo.first_unallocated);
  }
#endif /* MDBX_ENABLE_REFUND */

  txn->signature = 0;
  osal_free(txn);
  tASSERT(parent, audit_ex(parent, 0, false) == 0);
  return MDBX_SUCCESS;
}
|
289
src/txn-ro.c
Normal file
289
src/txn-ro.c
Normal file
@ -0,0 +1,289 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/* Ensures the read-only txn has a valid MVCC reader slot.
 *
 * Resolution order:
 *  1. a slot already attached to the txn (revalidated: same pid, txnid field
 *     in the "invalid/idle" range);
 *  2. no LCK mapping at all (exclusive/lck-less mode) — no slot needed;
 *  3. the thread-local cached slot (ENV_TXKEY, i.e. sticky-threads mode);
 *  4. otherwise bind a fresh slot via mvcc_bind_slot().
 * Returns MDBX_SUCCESS, MDBX_BAD_RSLOT on a stale/foreign slot, or the
 * error from mvcc_bind_slot(). */
static inline int txn_ro_rslot(MDBX_txn *txn) {
  reader_slot_t *slot = txn->ro.slot;
  STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(slot->tid));
  if (likely(slot)) {
    /* txnid >= SAFE64_INVALID_THRESHOLD means the slot is idle (not pinning
     * a snapshot), which is what we expect before starting. */
    if (likely(slot->pid.weak == txn->env->pid && slot->txnid.weak >= SAFE64_INVALID_THRESHOLD)) {
      tASSERT(txn, slot->pid.weak == osal_getpid());
      tASSERT(txn, slot->tid.weak == ((txn->env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
      return MDBX_SUCCESS;
    }
    return MDBX_BAD_RSLOT;
  }

  if (unlikely(!txn->env->lck_mmap.lck))
    /* exclusive mode without the reader table — nothing to bind */
    return MDBX_SUCCESS;

  MDBX_env *const env = txn->env;
  if (env->flags & ENV_TXKEY) {
    eASSERT(env, !(env->flags & MDBX_NOSTICKYTHREADS));
    /* Try the slot cached in this thread's TLS. */
    slot = thread_rthc_get(env->me_txkey);
    if (likely(slot)) {
      if (likely(slot->pid.weak == env->pid && slot->txnid.weak >= SAFE64_INVALID_THRESHOLD)) {
        tASSERT(txn, slot->pid.weak == osal_getpid());
        tASSERT(txn, slot->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
        txn->ro.slot = slot;
        return MDBX_SUCCESS;
      }
      /* A live-but-mismatched slot is an error unless the legacy multi-open
       * debug mode allows discarding the stale TLS entry and rebinding. */
      if (unlikely(slot->pid.weak) || !(globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN))
        return MDBX_BAD_RSLOT;
      thread_rthc_set(env->me_txkey, nullptr);
    }
  } else {
    eASSERT(env, (env->flags & MDBX_NOSTICKYTHREADS));
  }

  bsr_t brs = mvcc_bind_slot(env);
  if (likely(brs.err == MDBX_SUCCESS)) {
    tASSERT(txn, brs.slot->pid.weak == osal_getpid());
    tASSERT(txn, brs.slot->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
  }
  txn->ro.slot = brs.slot;
  return brs.err;
}
|
||||
|
||||
/* Seizes a coherent MVCC snapshot for a read-only txn.
 *
 * Lock-free retry loop: publish the candidate txnid in the reader slot
 * FIRST (so the GC cannot recycle the snapshot from under us), then verify
 * that the meta-head did not move meanwhile; retry up to 42 times before
 * bailing out with MDBX_PROBLEM.  In recovery mode (env->stuck_meta >= 0)
 * the pinned meta-page is used instead of the most recent one. */
static inline int txn_ro_seize(MDBX_txn *txn) {
  /* Seek & fetch the last meta */
  troika_t troika = meta_tap(txn->env);
  uint64_t timestamp = 0;
  size_t loop = 0;
  do {
    MDBX_env *const env = txn->env;
    const meta_ptr_t head = likely(env->stuck_meta < 0) ? /* regular */ meta_recent(env, &troika)
                                                        : /* recovery mode */ meta_ptr(env, env->stuck_meta);
    reader_slot_t *const r = txn->ro.slot;
    if (likely(r != nullptr)) {
      /* Publish the snapshot data, then the txnid (safe64_write), making the
       * slot visible to writers/GC before the coherency check below. */
      safe64_reset(&r->txnid, true);
      atomic_store32(&r->snapshot_pages_used, head.ptr_v->geometry.first_unallocated, mo_Relaxed);
      atomic_store64(&r->snapshot_pages_retired, unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired), mo_Relaxed);
      safe64_write(&r->txnid, head.txnid);
      eASSERT(env, r->pid.weak == osal_getpid());
      eASSERT(env, r->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
      eASSERT(env, r->txnid.weak == head.txnid ||
                       (r->txnid.weak >= SAFE64_INVALID_THRESHOLD && head.txnid < env->lck->cached_oldest.weak));
      atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
    } else {
      /* exclusive mode without lck */
      eASSERT(env, !env->lck_mmap.lck && env->lck == lckless_stub(env));
    }
    jitter4testing(true);

    if (unlikely(meta_should_retry(env, &troika))) {
      timestamp = 0;
      continue;
    }

    /* Snap the state from current meta-head */
    int err = coherency_fetch_head(txn, head, &timestamp);
    jitter4testing(false);
    if (unlikely(err != MDBX_SUCCESS)) {
      if (err != MDBX_RESULT_TRUE)
        return err;
      continue; /* MDBX_RESULT_TRUE — transient incoherence, retry */
    }

    const uint64_t snap_oldest = atomic_load64(&env->lck->cached_oldest, mo_AcquireRelease);
    if (unlikely(txn->txnid < snap_oldest)) {
      /* The seized snapshot is already older than the oldest tracked one. */
      if (env->stuck_meta >= 0) {
        ERROR("target meta-page %i is referenced to an obsolete MVCC-snapshot "
              "%" PRIaTXN " < cached-oldest %" PRIaTXN,
              env->stuck_meta, txn->txnid, snap_oldest);
        return MDBX_MVCC_RETARDED;
      }
      continue;
    }

    /* Success only if the slot still pins exactly the txnid we fetched. */
    if (!r || likely(txn->txnid == atomic_load64(&r->txnid, mo_Relaxed)))
      return MDBX_SUCCESS;

  } while (likely(++loop < 42));

  ERROR("bailout waiting for valid snapshot (%s)", "meta-pages are too volatile");
  return MDBX_PROBLEM;
}
|
||||
|
||||
/* Starts (or merely prepares) a read-only transaction.
 *
 * Binds a reader slot, rejects overlapping read-txns on a writer's thread
 * (unless the legacy-overlap debug flag allows it), seizes an MVCC snapshot
 * and validates the resulting txnid.  With MDBX_TXN_RDONLY_PREPARE the txn
 * is only prepared (slot bound, snapshot NOT seized) and left in the
 * FINISHED state for a later renew.
 * On failure the slot's txnid is reset so no stale snapshot stays pinned. */
int txn_ro_start(MDBX_txn *txn, unsigned flags) {
  MDBX_env *const env = txn->env;
  eASSERT(env, flags & MDBX_TXN_RDONLY);
  eASSERT(env, (flags & ~(txn_ro_begin_flags | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0);
  txn->flags = flags;

  int err = txn_ro_rslot(txn);
  if (unlikely(err != MDBX_SUCCESS))
    goto bailout;

  STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY);
  reader_slot_t *r = txn->ro.slot;
  if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) {
    /* Prepare-only: keep the slot but don't pin a snapshot yet. */
    eASSERT(env, txn->txnid == 0);
    eASSERT(env, txn->owner == 0);
    eASSERT(env, txn->n_dbi == 0);
    if (likely(r)) {
      eASSERT(env, r->snapshot_pages_used.weak == 0);
      eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
      atomic_store32(&r->snapshot_pages_used, 0, mo_Relaxed);
    }
    txn->flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
    return MDBX_SUCCESS;
  }

  txn->owner = likely(r) ? (uintptr_t)r->tid.weak : ((env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self());
  /* Refuse a read-txn on the thread that owns the active write-txn, unless
   * explicitly permitted by the legacy-overlap debug flag. */
  if ((env->flags & MDBX_NOSTICKYTHREADS) == 0 && env->txn && unlikely(env->basal_txn->owner == txn->owner) &&
      (globals.runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) {
    err = MDBX_TXN_OVERLAPPING;
    goto bailout;
  }

  err = txn_ro_seize(txn);
  if (unlikely(err != MDBX_SUCCESS))
    goto bailout;

  if (unlikely(txn->txnid < MIN_TXNID || txn->txnid > MAX_TXNID)) {
    ERROR("%s", "environment corrupted by died writer, must shutdown!");
    err = MDBX_CORRUPTED;
    goto bailout;
  }

  return MDBX_SUCCESS;

bailout:
  tASSERT(txn, err != MDBX_SUCCESS);
  txn->txnid = INVALID_TXNID;
  if (likely(txn->ro.slot))
    /* Unpin any partially-seized snapshot so the GC is not blocked. */
    safe64_reset(&txn->ro.slot->txnid, true);
  return err;
}
|
||||
|
||||
/* Finishes a read-only transaction.
 *
 * Releases the MVCC reader slot (unpinning the snapshot), handling the
 * parked case by first re-acquiring the slot's tid from the parked marker.
 * `mode` is a TXN_END_* bitmask: TXN_END_SLOT additionally detaches the
 * slot from the txn, TXN_END_FREE frees the txn object, and the
 * TXN_END_OUSTED op-code marks the txn as ousted in its final flags.
 * Always returns MDBX_SUCCESS. */
int txn_ro_end(MDBX_txn *txn, unsigned mode) {
  MDBX_env *const env = txn->env;
  tASSERT(txn, (txn->flags & txn_may_have_cursors) == 0);
  txn->n_dbi = 0; /* prevent further DBI activity */
  if (txn->ro.slot) {
    reader_slot_t *slot = txn->ro.slot;
    if (unlikely(!env->lck))
      txn->ro.slot = nullptr;
    else {
      eASSERT(env, slot->pid.weak == env->pid);
      if (likely((txn->flags & MDBX_TXN_FINISHED) == 0)) {
        if (likely((txn->flags & MDBX_TXN_PARKED) == 0)) {
          ENSURE(env, txn->txnid >=
                          /* paranoia is appropriate here */ env->lck->cached_oldest.weak);
          eASSERT(env, txn->txnid == slot->txnid.weak && slot->txnid.weak >= env->lck->cached_oldest.weak);
        } else {
          /* Parked txn: if it was ousted meanwhile, reflect that in `mode`. */
          if ((mode & TXN_END_OPMASK) != TXN_END_OUSTED && safe64_read(&slot->tid) == MDBX_TID_TXN_OUSTED)
            mode = (mode & ~TXN_END_OPMASK) | TXN_END_OUSTED;
          /* Re-own the slot: replace the parked marker in tid with our owner
           * id, retrying until both fields read back consistently. */
          do {
            safe64_reset(&slot->txnid, false);
            atomic_store64(&slot->tid, txn->owner, mo_AcquireRelease);
            atomic_yield();
          } while (
              unlikely(safe64_read(&slot->txnid) < SAFE64_INVALID_THRESHOLD || safe64_read(&slot->tid) != txn->owner));
        }
        dxb_sanitize_tail(env, nullptr);
        atomic_store32(&slot->snapshot_pages_used, 0, mo_Relaxed);
        /* Unpin the snapshot and nudge writers to refresh the reader table. */
        safe64_reset(&slot->txnid, true);
        atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
      } else {
        eASSERT(env, slot->pid.weak == env->pid);
        eASSERT(env, slot->txnid.weak >= SAFE64_INVALID_THRESHOLD);
      }
      if (mode & TXN_END_SLOT) {
        /* Without TLS-keyed slots the slot is released entirely (pid = 0). */
        if ((env->flags & ENV_TXKEY) == 0)
          atomic_store32(&slot->pid, 0, mo_Relaxed);
        txn->ro.slot = nullptr;
      }
    }
  }
#if defined(_WIN32) || defined(_WIN64)
  if (txn->flags & txn_shrink_allowed)
    imports.srwl_ReleaseShared(&env->remap_guard);
#endif
  txn->flags = ((mode & TXN_END_OPMASK) != TXN_END_OUSTED) ? MDBX_TXN_RDONLY | MDBX_TXN_FINISHED
                                                           : MDBX_TXN_RDONLY | MDBX_TXN_FINISHED | MDBX_TXN_OUSTED;
  txn->owner = 0;
  if (mode & TXN_END_FREE) {
    txn->signature = 0;
    osal_free(txn);
  }
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Parks a running read-only transaction.
 *
 * Validates the reader slot (pid/tid/txnid must match this txn), then
 * replaces the slot's tid with the MDBX_TID_TXN_PARKED marker so a writer
 * may oust this reader if it blocks recycling.  With `autounpark` the txn
 * is flagged to be transparently unparked on next use.
 * Returns MDBX_SUCCESS, MDBX_BAD_TXN (wrong state), MDBX_PROBLEM (foreign
 * pid in slot) or MDBX_BAD_RSLOT (slot mismatch). */
int txn_ro_park(MDBX_txn *txn, bool autounpark) {
  reader_slot_t *const rslot = txn->ro.slot;
  tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) == MDBX_TXN_RDONLY);
  tASSERT(txn, txn->ro.slot->tid.weak < MDBX_TID_TXN_OUSTED);
  if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != MDBX_TXN_RDONLY))
    return MDBX_BAD_TXN;

  const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
  const uint64_t tid = atomic_load64(&rslot->tid, mo_Relaxed);
  const uint64_t txnid = atomic_load64(&rslot->txnid, mo_Relaxed);
  if (unlikely(pid != txn->env->pid)) {
    ERROR("unexpected pid %u%s%u", pid, " != must ", txn->env->pid);
    return MDBX_PROBLEM;
  }
  if (unlikely(tid != txn->owner || txnid != txn->txnid)) {
    ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%0zx"
          " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
          tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
    return MDBX_BAD_RSLOT;
  }

  /* Publish the parked marker; the refresh flag tells writers to rescan. */
  atomic_store64(&rslot->tid, MDBX_TID_TXN_PARKED, mo_AcquireRelease);
  atomic_store32(&txn->env->lck->rdt_refresh_flag, true, mo_Relaxed);
  txn->flags += autounpark ? MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK : MDBX_TXN_PARKED;
  return MDBX_SUCCESS;
}
|
||||
|
||||
/* Unparks a parked read-only transaction.
 *
 * Attempts to CAS the slot's tid from the MDBX_TID_TXN_PARKED marker back
 * to this txn's owner id.  If the reader was meanwhile ousted by a writer
 * (tid == MDBX_TID_TXN_OUSTED or the pinned txnid was invalidated), the
 * txn is ended with the OUSTED op-code and MDBX_OUSTED is returned.
 * Returns MDBX_SUCCESS on successful unpark, MDBX_BAD_TXN for a wrong
 * state, MDBX_PROBLEM for a foreign pid, MDBX_OUSTED (or a txn_end error)
 * when the snapshot was lost.
 *
 * Fix: the first diagnostic printed MDBX_TID_TXN_OUSTED as the expected
 * ("must") tid although the failed check is against MDBX_TID_TXN_PARKED —
 * the OUSTED case has already been handled by the preceding break. */
int txn_ro_unpark(MDBX_txn *txn) {
  if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) !=
               (MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
    return MDBX_BAD_TXN;

  for (reader_slot_t *const rslot = txn->ro.slot; rslot; atomic_yield()) {
    const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
    uint64_t tid = safe64_read(&rslot->tid);
    uint64_t txnid = safe64_read(&rslot->txnid);
    if (unlikely(pid != txn->env->pid)) {
      ERROR("unexpected pid %u%s%u", pid, " != expected ", txn->env->pid);
      return MDBX_PROBLEM;
    }
    if (unlikely(tid == MDBX_TID_TXN_OUSTED || txnid >= SAFE64_INVALID_THRESHOLD))
      /* Ousted by a writer (or snapshot invalidated) — fall to txn_end(). */
      break;
    if (unlikely(tid != MDBX_TID_TXN_PARKED || txnid != txn->txnid)) {
      /* Slot no longer carries our parked marker/txnid: report what the slot
       * should have contained (the PARKED marker and our txnid). */
      ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%" PRIx64 " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, tid, " != must ",
            MDBX_TID_TXN_PARKED, txnid, " != must ", txn->txnid);
      break;
    }
    if (unlikely((txn->flags & MDBX_TXN_ERROR)))
      break;

    /* Reclaim the slot: swap the PARKED marker back to our owner id.  On a
     * 32-bit CAS target the high half is written first and rolled back if
     * the low-half CAS loses the race. */
#if MDBX_64BIT_CAS
    if (unlikely(!atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, txn->owner)))
      continue;
#else
    atomic_store32(&rslot->tid.high, (uint32_t)((uint64_t)txn->owner >> 32), mo_Relaxed);
    if (unlikely(!atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)txn->owner))) {
      atomic_store32(&rslot->tid.high, (uint32_t)(MDBX_TID_TXN_PARKED >> 32), mo_AcquireRelease);
      continue;
    }
#endif
    /* Re-verify after the CAS: both fields must now name us. */
    txnid = safe64_read(&rslot->txnid);
    tid = safe64_read(&rslot->tid);
    if (unlikely(txnid != txn->txnid || tid != txn->owner)) {
      ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%zx"
            " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
            tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
      break;
    }
    txn->flags &= ~(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK);
    return MDBX_SUCCESS;
  }

  int err = txn_end(txn, TXN_END_OUSTED | TXN_END_RESET | TXN_END_UPDATE);
  return err ? err : MDBX_OUSTED;
}
|
11
src/utils.c
11
src/utils.c
@ -3,6 +3,17 @@
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/* Returns ceil(log2(value_uintptr)), i.e. the exponent of the smallest
 * power of two that is >= value_uintptr.  Valid for 0 < value < INT32_MAX. */
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned ceil_log2n(size_t value_uintptr) {
  assert(value_uintptr > 0 && value_uintptr < INT32_MAX);
  /* Smear the top set bit of (value - 1) rightward so all lower bits are set;
   * adding one then yields the next power of two >= value. */
  size_t mask = value_uintptr - 1;
  for (unsigned shift = 1; shift <= 16; shift <<= 1)
    mask |= mask >> shift;
  return log2n_powerof2(mask + 1);
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned log2n_powerof2(size_t value_uintptr) {
|
||||
assert(value_uintptr > 0 && value_uintptr < INT32_MAX && is_powerof2(value_uintptr));
|
||||
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
|
||||
|
@ -58,6 +58,8 @@ MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED static inline size_t ceil_powerof2
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned log2n_powerof2(size_t value_uintptr);
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned ceil_log2n(size_t value_uintptr);
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v);
|
||||
|
||||
struct monotime_cache {
|
||||
|
@ -3,11 +3,16 @@
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
#if MDBX_VERSION_MAJOR != ${MDBX_VERSION_MAJOR} || MDBX_VERSION_MINOR != ${MDBX_VERSION_MINOR}
|
||||
#if !defined(MDBX_VERSION_UNSTABLE) && \
|
||||
(MDBX_VERSION_MAJOR != ${MDBX_VERSION_MAJOR} || MDBX_VERSION_MINOR != ${MDBX_VERSION_MINOR})
|
||||
#error "API version mismatch! Had `git fetch --tags` done?"
|
||||
#endif
|
||||
|
||||
static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY);
|
||||
static const char sourcery[] =
|
||||
#ifdef MDBX_VERSION_UNSTABLE
|
||||
"UNSTABLE@"
|
||||
#endif
|
||||
MDBX_STRINGIFY(MDBX_BUILD_SOURCERY);
|
||||
|
||||
__dll_export
|
||||
#ifdef __attribute_used__
|
||||
|
@ -298,6 +298,7 @@ else()
|
||||
add_extra_test(upsert_alldups SOURCE extra/upsert_alldups.c)
|
||||
add_extra_test(dupfix_addodd SOURCE extra/dupfix_addodd.c)
|
||||
endif()
|
||||
add_extra_test(details_rkl SOURCE extra/details_rkl.c)
|
||||
if(MDBX_BUILD_CXX)
|
||||
if(NOT WIN32 OR NOT MDBX_CXX_STANDARD LESS 17)
|
||||
add_extra_test(cursor_closing TIMEOUT 10800)
|
||||
|
@ -72,6 +72,7 @@ void configure_actor(unsigned &last_space_id, const actor_testcase testcase, con
|
||||
log_trace("configure_actor: space %lu for %s", space_id, testcase2str(testcase));
|
||||
global::actors.emplace_back(actor_config(testcase, params, unsigned(space_id), wait4id));
|
||||
global::databases.insert(params.pathname_db);
|
||||
params.prng_seed += bleach64(space_id);
|
||||
}
|
||||
|
||||
void testcase_setup(const char *casename, const actor_params ¶ms, unsigned &last_space_id) {
|
||||
|
@ -23,7 +23,13 @@
|
||||
#define RELIEF_FACTOR 1
|
||||
#endif
|
||||
|
||||
#define NN (1000 / RELIEF_FACTOR)
|
||||
static const auto NN = 1000u / RELIEF_FACTOR;
|
||||
|
||||
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
|
||||
static const auto N = std::min(17u, std::thread::hardware_concurrency());
|
||||
#else
|
||||
static const auto N = 3u;
|
||||
#endif
|
||||
|
||||
static void logger_nofmt(MDBX_log_level_t loglevel, const char *function, int line, const char *msg,
|
||||
unsigned length) noexcept {
|
||||
@ -107,6 +113,7 @@ bool case0(mdbx::env env) {
|
||||
* 4. Ждем завершения фоновых потоков.
|
||||
* 5. Закрываем оставшиеся курсоры и закрываем БД. */
|
||||
|
||||
size_t global_seed = size_t(std::chrono::high_resolution_clock::now().time_since_epoch().count());
|
||||
thread_local size_t salt;
|
||||
|
||||
static size_t prng() {
|
||||
@ -262,7 +269,7 @@ void case1_write_cycle(mdbx::txn_managed txn, std::deque<mdbx::map_handle> &dbi,
|
||||
pre.unbind();
|
||||
if (!pre.txn())
|
||||
pre.bind(txn, dbi[prng(dbi.size())]);
|
||||
for (auto i = 0; i < NN; ++i) {
|
||||
for (auto i = 0u; i < NN; ++i) {
|
||||
auto k = mdbx::default_buffer::wrap(prng(NN));
|
||||
auto v = mdbx::default_buffer::wrap(prng(NN));
|
||||
if (pre.find_multivalue(k, v, false))
|
||||
@ -284,7 +291,16 @@ void case1_write_cycle(mdbx::txn_managed txn, std::deque<mdbx::map_handle> &dbi,
|
||||
}
|
||||
|
||||
bool case1_thread(mdbx::env env, std::deque<mdbx::map_handle> dbi, mdbx::cursor pre) {
|
||||
salt = size_t(std::chrono::high_resolution_clock::now().time_since_epoch().count());
|
||||
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
|
||||
mdbx::error::success_or_throw(mdbx_txn_lock(env, false));
|
||||
std::hash<std::thread::id> hasher;
|
||||
salt = global_seed ^ hasher(std::this_thread::get_id());
|
||||
std::cout << "thread " << std::this_thread::get_id() << ", salt " << salt << std::endl << std::flush;
|
||||
mdbx_txn_unlock(env);
|
||||
#else
|
||||
salt = global_seed;
|
||||
#endif
|
||||
|
||||
std::vector<MDBX_cursor *> pool;
|
||||
for (auto loop = 0; loop < 333 / RELIEF_FACTOR; ++loop) {
|
||||
for (auto read = 0; read < 333 / RELIEF_FACTOR; ++read) {
|
||||
@ -311,12 +327,7 @@ bool case1(mdbx::env env) {
|
||||
bool ok = true;
|
||||
std::deque<mdbx::map_handle> dbi;
|
||||
std::vector<mdbx::cursor_managed> cursors;
|
||||
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
|
||||
static const auto N = 10;
|
||||
#else
|
||||
static const auto N = 3;
|
||||
#endif
|
||||
for (auto t = 0; t < N; ++t) {
|
||||
for (auto t = 0u; t < N; ++t) {
|
||||
auto txn = env.start_write();
|
||||
auto table = txn.create_map(std::to_string(t), mdbx::key_mode::ordinal, mdbx::value_mode::multi_samelength);
|
||||
auto cursor = txn.open_cursor(table);
|
||||
@ -331,7 +342,7 @@ bool case1(mdbx::env env) {
|
||||
#if defined(__cpp_lib_latch) && __cpp_lib_latch >= 201907L
|
||||
std::latch s(1);
|
||||
std::vector<std::thread> threads;
|
||||
for (auto t = 1; t < N; ++t) {
|
||||
for (auto t = 1u; t < cursors.size(); ++t) {
|
||||
case1_cycle_dbi(dbi);
|
||||
threads.push_back(std::thread([&, t]() {
|
||||
s.wait();
|
||||
@ -382,7 +393,7 @@ int doit() {
|
||||
mdbx::env::remove(db_filename);
|
||||
|
||||
mdbx::env_managed env(db_filename, mdbx::env_managed::create_parameters(),
|
||||
mdbx::env::operate_parameters(42, 0, mdbx::env::nested_transactions));
|
||||
mdbx::env::operate_parameters(N + 2, 0, mdbx::env::nested_transactions));
|
||||
|
||||
bool ok = case0(env);
|
||||
ok = case1(env) && ok;
|
||||
|
488
test/extra/details_rkl.c
Normal file
488
test/extra/details_rkl.c
Normal file
@ -0,0 +1,488 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2025
|
||||
|
||||
#define debug_log debug_log_sub
|
||||
|
||||
#include "../../src/rkl.c"
|
||||
#include "../../src/txl.c"
|
||||
|
||||
MDBX_MAYBE_UNUSED __cold void debug_log_sub(int level, const char *function, int line, const char *fmt, ...) {
|
||||
(void)level;
|
||||
(void)function;
|
||||
(void)line;
|
||||
(void)fmt;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------*/
|
||||
|
||||
static size_t tst_failed, tst_ok, tst_iterations, tst_cases, tst_cases_hole;
|
||||
#ifndef NDEBUG
|
||||
static size_t tst_target;
|
||||
#endif
|
||||
|
||||
static bool check_bool(bool v, bool expect, const char *fn, unsigned line) {
|
||||
if (unlikely(v != expect)) {
|
||||
++tst_failed;
|
||||
fflush(nullptr);
|
||||
fprintf(stderr, "iteration %zi: got %s, expected %s, at %s:%u\n", tst_iterations, v ? "true" : "false",
|
||||
expect ? "true" : "false", fn, line);
|
||||
fflush(nullptr);
|
||||
return false;
|
||||
}
|
||||
++tst_ok;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool check_eq(uint64_t v, uint64_t expect, const char *fn, unsigned line) {
|
||||
if (unlikely(v != expect)) {
|
||||
++tst_failed;
|
||||
fflush(nullptr);
|
||||
fprintf(stderr, "iteration %zi: %" PRIu64 " (got) != %" PRIu64 " (expected), at %s:%u\n", tst_iterations, v, expect,
|
||||
fn, line);
|
||||
fflush(nullptr);
|
||||
return false;
|
||||
}
|
||||
++tst_ok;
|
||||
return true;
|
||||
}
|
||||
|
||||
#define CHECK_BOOL(T, EXPECT) check_bool((T), (EXPECT), __func__, __LINE__)
|
||||
#define CHECK_TRUE(T) CHECK_BOOL(T, true)
|
||||
#define CHECK_FALSE(T) CHECK_BOOL(T, false)
|
||||
#define CHECK_EQ(T, EXPECT) check_eq((T), (EXPECT), __func__, __LINE__)
|
||||
|
||||
void trivia(void) {
|
||||
rkl_t x, y;
|
||||
|
||||
rkl_init(&x);
|
||||
rkl_init(&y);
|
||||
CHECK_TRUE(rkl_check(&x));
|
||||
CHECK_TRUE(rkl_empty(&x));
|
||||
CHECK_EQ(rkl_len(&x), 0);
|
||||
|
||||
rkl_iter_t f = rkl_iterator(&x, false);
|
||||
rkl_iter_t r = rkl_iterator(&x, true);
|
||||
CHECK_EQ(rkl_left(&f, false), 0);
|
||||
CHECK_EQ(rkl_left(&f, true), 0);
|
||||
CHECK_EQ(rkl_left(&r, false), 0);
|
||||
CHECK_EQ(rkl_left(&r, true), 0);
|
||||
CHECK_EQ(rkl_turn(&f, false), 0);
|
||||
CHECK_EQ(rkl_turn(&f, true), 0);
|
||||
CHECK_EQ(rkl_turn(&r, false), 0);
|
||||
CHECK_EQ(rkl_turn(&r, true), 0);
|
||||
CHECK_TRUE(rkl_check(&x));
|
||||
|
||||
rkl_hole_t hole;
|
||||
hole = rkl_hole(&f, true);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&f, false);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&r, true);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&r, false);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
|
||||
CHECK_EQ(rkl_push(&x, 42, false), MDBX_SUCCESS);
|
||||
CHECK_TRUE(rkl_check(&x));
|
||||
CHECK_FALSE(rkl_empty(&x));
|
||||
CHECK_EQ(rkl_len(&x), 1);
|
||||
CHECK_EQ(rkl_push(&x, 42, true), MDBX_RESULT_TRUE);
|
||||
CHECK_TRUE(rkl_check(&x));
|
||||
|
||||
f = rkl_iterator(&x, false);
|
||||
r = rkl_iterator(&x, true);
|
||||
CHECK_EQ(rkl_left(&f, false), 1);
|
||||
CHECK_EQ(rkl_left(&f, true), 0);
|
||||
CHECK_EQ(rkl_left(&r, false), 0);
|
||||
CHECK_EQ(rkl_left(&r, true), 1);
|
||||
|
||||
CHECK_EQ(rkl_turn(&f, true), 0);
|
||||
CHECK_EQ(rkl_turn(&f, false), 42);
|
||||
CHECK_EQ(rkl_turn(&f, false), 0);
|
||||
CHECK_EQ(rkl_turn(&f, true), 42);
|
||||
CHECK_EQ(rkl_turn(&f, true), 0);
|
||||
|
||||
CHECK_EQ(rkl_turn(&r, false), 0);
|
||||
CHECK_EQ(rkl_turn(&r, true), 42);
|
||||
CHECK_EQ(rkl_turn(&r, true), 0);
|
||||
CHECK_EQ(rkl_turn(&r, false), 42);
|
||||
CHECK_EQ(rkl_turn(&r, false), 0);
|
||||
|
||||
f = rkl_iterator(&x, false);
|
||||
hole = rkl_hole(&f, false);
|
||||
CHECK_EQ(hole.begin, 43);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&f, false);
|
||||
CHECK_EQ(hole.begin, MAX_TXNID);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&f, true);
|
||||
CHECK_EQ(hole.begin, 43);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&f, true);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, 42);
|
||||
hole = rkl_hole(&f, true);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, 42);
|
||||
|
||||
r = rkl_iterator(&x, true);
|
||||
hole = rkl_hole(&r, false);
|
||||
CHECK_EQ(hole.begin, MAX_TXNID);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&r, true);
|
||||
CHECK_EQ(hole.begin, 43);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&r, true);
|
||||
CHECK_EQ(hole.begin, 1);
|
||||
CHECK_EQ(hole.end, 42);
|
||||
hole = rkl_hole(&r, false);
|
||||
CHECK_EQ(hole.begin, 43);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
hole = rkl_hole(&r, false);
|
||||
CHECK_EQ(hole.begin, MAX_TXNID);
|
||||
CHECK_EQ(hole.end, MAX_TXNID);
|
||||
|
||||
rkl_resize(&x, 222);
|
||||
CHECK_FALSE(rkl_empty(&x));
|
||||
CHECK_TRUE(rkl_check(&x));
|
||||
|
||||
rkl_destructive_move(&x, &y);
|
||||
CHECK_TRUE(rkl_check(&x));
|
||||
CHECK_TRUE(rkl_check(&y));
|
||||
rkl_destroy(&x);
|
||||
rkl_destroy(&y);
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------*/
|
||||
|
||||
uint64_t prng_state;
|
||||
|
||||
static uint64_t prng(void) {
|
||||
prng_state = prng_state * UINT64_C(6364136223846793005) + 1;
|
||||
return prng_state;
|
||||
}
|
||||
|
||||
static bool flipcoin(void) { return (bool)prng() & 1; }
|
||||
|
||||
static bool stochastic_pass(const unsigned start, const unsigned width, const unsigned n) {
|
||||
rkl_t k, c;
|
||||
txl_t l = txl_alloc();
|
||||
if (!CHECK_TRUE(l))
|
||||
return false;
|
||||
|
||||
rkl_init(&k);
|
||||
rkl_init(&c);
|
||||
const size_t errors = tst_failed;
|
||||
|
||||
rkl_iter_t f = rkl_iterator(&k, false);
|
||||
rkl_iter_t r = rkl_iterator(&k, true);
|
||||
|
||||
txnid_t lowest = UINT_MAX;
|
||||
txnid_t highest = 0;
|
||||
while (MDBX_PNL_GETSIZE(l) < n) {
|
||||
txnid_t id = (txnid_t)(prng() % width + start);
|
||||
if (id < MIN_TXNID || id >= INVALID_TXNID)
|
||||
continue;
|
||||
if (txl_contain(l, id)) {
|
||||
if (CHECK_TRUE(rkl_contain(&k, id)) && CHECK_EQ(rkl_push(&k, id, false), MDBX_RESULT_TRUE))
|
||||
continue;
|
||||
break;
|
||||
}
|
||||
if (!CHECK_FALSE(rkl_contain(&k, id)))
|
||||
break;
|
||||
|
||||
if (tst_iterations % (1u << 24) == 0 && tst_iterations) {
|
||||
printf("done %.3fM iteration, %zu cases\n", tst_iterations / 1000000.0, tst_cases);
|
||||
fflush(nullptr);
|
||||
}
|
||||
tst_iterations += 1;
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (tst_iterations == tst_target) {
|
||||
printf("reach %zu iteration\n", tst_iterations);
|
||||
fflush(nullptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!CHECK_EQ(rkl_push(&k, id, false), MDBX_SUCCESS))
|
||||
break;
|
||||
if (!CHECK_TRUE(rkl_check(&k)))
|
||||
break;
|
||||
if (!CHECK_EQ(txl_append(&l, id), MDBX_SUCCESS))
|
||||
break;
|
||||
if (!CHECK_TRUE(rkl_contain(&k, id)))
|
||||
break;
|
||||
|
||||
lowest = (lowest < id) ? lowest : id;
|
||||
highest = (highest > id) ? highest : id;
|
||||
if (!CHECK_EQ(rkl_lowest(&k), lowest))
|
||||
break;
|
||||
if (!CHECK_EQ(rkl_highest(&k), highest))
|
||||
break;
|
||||
}
|
||||
|
||||
txl_sort(l);
|
||||
CHECK_EQ(rkl_len(&k), n);
|
||||
CHECK_EQ(MDBX_PNL_GETSIZE(l), n);
|
||||
|
||||
f = rkl_iterator(&k, false);
|
||||
r = rkl_iterator(&k, true);
|
||||
CHECK_EQ(rkl_left(&f, false), n);
|
||||
CHECK_EQ(rkl_left(&f, true), 0);
|
||||
CHECK_EQ(rkl_left(&r, false), 0);
|
||||
CHECK_EQ(rkl_left(&r, true), n);
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
CHECK_EQ(rkl_turn(&f, false), l[n - i]);
|
||||
CHECK_EQ(rkl_left(&f, false), n - i - 1);
|
||||
CHECK_EQ(rkl_left(&f, true), i + 1);
|
||||
|
||||
CHECK_EQ(rkl_turn(&r, true), l[i + 1]);
|
||||
r.pos += 1;
|
||||
CHECK_EQ(rkl_turn(&r, true), l[i + 1]);
|
||||
CHECK_EQ(rkl_left(&r, true), n - i - 1);
|
||||
CHECK_EQ(rkl_left(&r, false), i + 1);
|
||||
}
|
||||
|
||||
if (CHECK_EQ(rkl_copy(&k, &c), MDBX_SUCCESS)) {
|
||||
for (size_t i = 1; i <= n; ++i) {
|
||||
if (!CHECK_FALSE(rkl_empty(&k)))
|
||||
break;
|
||||
if (!CHECK_FALSE(rkl_empty(&c)))
|
||||
break;
|
||||
CHECK_EQ(rkl_pop(&k, true), l[i]);
|
||||
CHECK_EQ(rkl_pop(&c, false), l[1 + n - i]);
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_TRUE(rkl_empty(&k));
|
||||
CHECK_TRUE(rkl_empty(&c));
|
||||
|
||||
rkl_destroy(&k);
|
||||
rkl_destroy(&c);
|
||||
txl_free(l);
|
||||
|
||||
++tst_cases;
|
||||
return errors == tst_failed;
|
||||
}
|
||||
|
||||
static bool stochastic(const size_t limit_cases, const size_t limit_loops) {
|
||||
for (unsigned loop = 0; tst_cases < limit_cases || loop < limit_loops; ++loop)
|
||||
for (unsigned width = 2; width < 10; ++width)
|
||||
for (unsigned n = 1; n < width; ++n)
|
||||
for (unsigned prev = 1, start = 0, t; start < 4242; t = start + prev, prev = start, start = t)
|
||||
if (!stochastic_pass(start, 1u << width, 1u << n) || tst_failed > 42) {
|
||||
puts("bailout\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------*/
|
||||
|
||||
static bool bit(size_t set, size_t n) {
|
||||
assert(n < CHAR_BIT * sizeof(set));
|
||||
return (set >> n) & 1;
|
||||
}
|
||||
|
||||
static size_t hamming_weight(size_t v) {
|
||||
const size_t m1 = (size_t)UINT64_C(0x5555555555555555);
|
||||
const size_t m2 = (size_t)UINT64_C(0x3333333333333333);
|
||||
const size_t m4 = (size_t)UINT64_C(0x0f0f0f0f0f0f0f0f);
|
||||
const size_t h01 = (size_t)UINT64_C(0x0101010101010101);
|
||||
v -= (v >> 1) & m1;
|
||||
v = (v & m2) + ((v >> 2) & m2);
|
||||
v = (v + (v >> 4)) & m4;
|
||||
return (v * h01) >> (sizeof(v) * 8 - 8);
|
||||
}
|
||||
|
||||
static bool check_hole(const size_t set, const rkl_hole_t hole, size_t *acc) {
|
||||
const size_t errors = tst_failed;
|
||||
++tst_iterations;
|
||||
|
||||
if (hole.begin > 1)
|
||||
CHECK_EQ(bit(set, hole.begin - 1), 1);
|
||||
if (hole.end < CHAR_BIT * sizeof(set))
|
||||
CHECK_EQ(bit(set, hole.end), 1);
|
||||
|
||||
for (size_t n = hole.begin; n < hole.end && n < CHAR_BIT * sizeof(set); n++) {
|
||||
CHECK_EQ(bit(set, n), 0);
|
||||
*acc += 1;
|
||||
}
|
||||
|
||||
return errors == tst_failed;
|
||||
}
|
||||
|
||||
static void debug_set(const size_t set, const char *str, int iter_offset) {
|
||||
#if 1
|
||||
(void)set;
|
||||
(void)str;
|
||||
(void)iter_offset;
|
||||
#else
|
||||
printf("\ncase %s+%d: count %zu, holes", str, iter_offset, hamming_weight(~set) - 1);
|
||||
for (size_t k, i = 1; i < CHAR_BIT * sizeof(set); ++i) {
|
||||
if (!bit(set, i)) {
|
||||
printf(" %zu", i);
|
||||
for (k = i; k < CHAR_BIT * sizeof(set) - 1 && !bit(set, k + 1); ++k)
|
||||
;
|
||||
if (k > i) {
|
||||
printf("-%zu", k);
|
||||
i = k;
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
fflush(nullptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
static bool check_holes_bothsides(const size_t set, rkl_iter_t const *i) {
|
||||
const size_t number_of_holes = hamming_weight(~set) - 1;
|
||||
size_t acc = 0;
|
||||
|
||||
rkl_iter_t f = *i;
|
||||
for (;;) {
|
||||
rkl_hole_t hole = rkl_hole(&f, false);
|
||||
if (hole.begin == hole.end)
|
||||
break;
|
||||
if (!check_hole(set, hole, &acc))
|
||||
return false;
|
||||
if (hole.end >= CHAR_BIT * sizeof(set))
|
||||
break;
|
||||
}
|
||||
|
||||
rkl_iter_t b = *i;
|
||||
for (;;) {
|
||||
rkl_hole_t hole = rkl_hole(&b, true);
|
||||
if (hole.begin == hole.end)
|
||||
break;
|
||||
if (!check_hole(set, hole, &acc))
|
||||
return false;
|
||||
if (hole.begin == 1)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!CHECK_EQ(acc, number_of_holes))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool check_holes_fourways(const size_t set, const rkl_t *rkl) {
|
||||
rkl_iter_t i = rkl_iterator(rkl, false);
|
||||
int o = 0;
|
||||
do {
|
||||
debug_set(set, "initial-forward", o++);
|
||||
if (!check_holes_bothsides(set, &i))
|
||||
return false;
|
||||
} while (rkl_turn(&i, false));
|
||||
|
||||
do {
|
||||
debug_set(set, "recoil-reverse", --o);
|
||||
if (!check_holes_bothsides(set, &i))
|
||||
return false;
|
||||
} while (rkl_turn(&i, true));
|
||||
|
||||
i = rkl_iterator(rkl, true);
|
||||
o = 0;
|
||||
do {
|
||||
debug_set(set, "initial-reverse", --o);
|
||||
if (!check_holes_bothsides(set, &i))
|
||||
return false;
|
||||
} while (rkl_turn(&i, false));
|
||||
|
||||
do {
|
||||
debug_set(set, "recoil-forward", o++);
|
||||
if (!check_holes_bothsides(set, &i))
|
||||
return false;
|
||||
} while (rkl_turn(&i, true));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool stochastic_pass_hole(size_t set, size_t trims) {
|
||||
const size_t one = 1;
|
||||
set &= ~one;
|
||||
if (!set)
|
||||
return true;
|
||||
|
||||
++tst_cases_hole;
|
||||
|
||||
rkl_t rkl;
|
||||
rkl_init(&rkl);
|
||||
for (size_t n = 1; n < CHAR_BIT * sizeof(set); ++n)
|
||||
if (bit(set, n))
|
||||
CHECK_EQ(rkl_push(&rkl, n, false), MDBX_SUCCESS);
|
||||
|
||||
if (!check_holes_fourways(set, &rkl))
|
||||
return false;
|
||||
|
||||
while (rkl_len(&rkl) > 1 && trims-- > 0) {
|
||||
if (flipcoin()) {
|
||||
const size_t l = (size_t)rkl_pop(&rkl, false);
|
||||
if (l == 0)
|
||||
break;
|
||||
assert(bit(set, l));
|
||||
set -= one << l;
|
||||
if (!check_holes_fourways(set, &rkl))
|
||||
return false;
|
||||
} else {
|
||||
|
||||
const size_t h = (size_t)rkl_pop(&rkl, true);
|
||||
if (h == 0)
|
||||
break;
|
||||
assert(bit(set, h));
|
||||
set -= one << h;
|
||||
if (!check_holes_fourways(set, &rkl))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static size_t prng_word(void) {
|
||||
size_t word = (size_t)(prng() >> 32);
|
||||
if (sizeof(word) > 4)
|
||||
word = (uint64_t)word << 32 | (size_t)(prng() >> 32);
|
||||
return word;
|
||||
}
|
||||
|
||||
static bool stochastic_hole(size_t probes) {
|
||||
for (size_t n = 0; n < probes; ++n) {
|
||||
size_t set = prng_word();
|
||||
if (!stochastic_pass_hole(set, prng() % 11))
|
||||
return false;
|
||||
if (!stochastic_pass_hole(set & prng_word(), prng() % 11))
|
||||
return false;
|
||||
if (!stochastic_pass_hole(set | prng_word(), prng() % 11))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------*/
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
#ifndef NDEBUG
|
||||
// tst_target = 281870;
|
||||
#endif
|
||||
prng_state = (uint64_t)time(nullptr);
|
||||
printf("prng-seed %" PRIu64 "\n", prng_state);
|
||||
fflush(nullptr);
|
||||
|
||||
trivia();
|
||||
stochastic(42 * 42 * 42, 42);
|
||||
stochastic_hole(24 * 24 * 24);
|
||||
printf("done: %zu+%zu cases, %zu iterations, %zu checks ok, %zu checks failed\n", tst_cases, tst_cases_hole,
|
||||
tst_iterations, tst_ok, tst_failed);
|
||||
fflush(nullptr);
|
||||
return tst_failed ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
}
|
@ -460,9 +460,9 @@ int main(int argc, char *const argv[]) {
|
||||
params.datalen_max = params.datalen_min;
|
||||
continue;
|
||||
}
|
||||
if (config::parse_option(argc, argv, narg, "batch.read", params.batch_read, config::no_scale, 1))
|
||||
if (config::parse_option(argc, argv, narg, "batch.read", params.batch_read, config::decimal, 1))
|
||||
continue;
|
||||
if (config::parse_option(argc, argv, narg, "batch.write", params.batch_write, config::no_scale, 1))
|
||||
if (config::parse_option(argc, argv, narg, "batch.write", params.batch_write, config::decimal, 1))
|
||||
continue;
|
||||
if (config::parse_option(argc, argv, narg, "delay", params.delaystart, config::duration))
|
||||
continue;
|
||||
|
@ -770,7 +770,7 @@ static bool execute_thunk(const actor_config *const_config, const mdbx_pid_t pid
|
||||
size_t iter = 0;
|
||||
do {
|
||||
if (iter) {
|
||||
prng_seed(config.params.prng_seed += INT32_C(0xA4F4D37B));
|
||||
prng_salt(iter);
|
||||
log_verbose("turn PRNG to %u", config.params.prng_seed);
|
||||
}
|
||||
iter++;
|
||||
|
@ -263,8 +263,8 @@ public:
|
||||
}
|
||||
|
||||
static bool review_params(actor_params ¶ms, unsigned space_id) {
|
||||
(void)space_id;
|
||||
// silently fix key/data length for fixed-length modes
|
||||
params.prng_seed += bleach32(space_id);
|
||||
if ((params.table_flags & MDBX_INTEGERKEY) && params.keylen_min != params.keylen_max)
|
||||
params.keylen_min = params.keylen_max;
|
||||
if ((params.table_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && params.datalen_min != params.datalen_max)
|
||||
|
@ -124,7 +124,8 @@ void prng_fill(uint64_t &state, void *ptr, size_t bytes) {
|
||||
|
||||
/* __thread */ uint64_t prng_state;
|
||||
|
||||
void prng_seed(uint64_t seed) { prng_state = bleach64(seed); }
|
||||
void prng_seed(uint64_t seed) { prng_state = seed; }
|
||||
void prng_salt(unsigned salt) { prng_state += bleach32(salt) * UINT64_C(0xD14A2783862DAB); }
|
||||
|
||||
uint32_t prng32(void) { return prng32_white(prng_state); }
|
||||
|
||||
|
@ -313,6 +313,7 @@ void prng_fill(uint64_t &state, void *ptr, size_t bytes);
|
||||
|
||||
extern uint64_t prng_state;
|
||||
void prng_seed(uint64_t seed);
|
||||
void prng_salt(unsigned salt);
|
||||
uint32_t prng32(void);
|
||||
uint64_t prng64(void);
|
||||
void prng_fill(void *ptr, size_t bytes);
|
||||
|
Loading…
Reference in New Issue
Block a user