Mirror of https://gitflic.ru/project/erthink/libmdbx.git (synced 2025-06-24 08:48:52 +00:00)
mdbx: license change and source-code restructuring.
This commit is contained in:
parent e9f5c0c308
commit 3de3d425a1

AUTHORS (34 lines changed)
@ -1,34 +0,0 @@
Contributors
============

- Alexey Naumov <alexey.naumov@gmail.com>
- Andrew Ashikhmin <andrey.ashikhmin@gmail.com>
- Chris Mikkelson <cmikk@qwest.net>
- Claude Brisson <claude.brisson@gmail.com>
- David Barbour <dmbarbour@gmail.com>
- David Wilson <dw@botanicus.net>
- dreamsxin <dreamsxin@126.com>
- Hallvard Furuseth <hallvard@openldap.org>, <h.b.furuseth@usit.uio.no>
- Heiko Becker <heirecka@exherbo.org>
- Howard Chu <hyc@openldap.org>, <hyc@symas.com>
- Ignacio Casal Quinteiro <ignacio.casal@nice-software.com>
- James Rouzier <rouzier@gmail.com>
- Jean-Christophe DUBOIS <jcd@tribudubois.net>
- John Hewson <john@jahewson.com>
- Klaus Malorny <klaus.malorny@knipp.de>
- Kurt Zeilenga <kurt.zeilenga@isode.com>
- Leonid Yuriev <leo@yuriev.ru>, <lyuryev@ptsecurity.ru>
- Lorenz Bauer <lmb@cloudflare.com>
- Luke Yeager <lyeager@nvidia.com>
- Martin Hedenfalk <martin@bzero.se>
- Ondrej Kuznik <ondrej.kuznik@acision.com>
- Orivej Desh <orivej@gmx.fr>
- Oskari Timperi <oskari.timperi@iki.fi>
- Pavel Medvedev <pmedvedev@gmail.com>
- Philipp Storz <philipp.storz@bareos.com>
- Quanah Gibson-Mount <quanah@openldap.org>
- Salvador Ortiz <sog@msg.com.mx>
- Sebastien Launay <sebastien@slaunay.fr>
- Vladimir Romanov <vromanov@gmail.com>
- Zano Foundation <crypto.sowle@gmail.com>
- 장세연 <sasgas@castis.com>
CMakeLists.txt (248 lines changed)
@ -1,16 +1,5 @@
|
||||
##
|
||||
## Copyright 2020-2024 Leonid Yuriev <leo@yuriev.ru>
|
||||
## and other libmdbx authors: please see AUTHORS file.
|
||||
## All rights reserved.
|
||||
##
|
||||
## Redistribution and use in source and binary forms, with or without
|
||||
## modification, are permitted only as authorized by the OpenLDAP
|
||||
## Public License.
|
||||
##
|
||||
## A copy of this license is available in the file LICENSE in the
|
||||
## top-level directory of the distribution or, alternatively, at
|
||||
## <http://www.OpenLDAP.org/license.html>.
|
||||
##
|
||||
## Copyright (c) 2020-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
|
||||
## SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
##
|
||||
## libmdbx = { Revised and extended descendant of Symas LMDB. }
|
||||
@ -69,14 +58,109 @@ else()
|
||||
endif()
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/COPYRIGHT" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/NOTICE" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h++" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/CMakeLists.txt" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/core.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/alloy.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-cursor.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-env.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-extra.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-key-transform.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-txn.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/atomics-ops.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/atomics-types.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/audit.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/chk.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cogs.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cogs.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/coherency.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cold.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/config.h.in" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/copy.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cursor.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cursor.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dbi.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dbi.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/debug_begin.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/debug_end.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dpl.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dpl.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dxb.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/env-opts.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/env.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/essentials.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc-get.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc-put.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/global.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/internals.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/layout-dxb.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/layout-lck.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck-posix.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck-windows.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/logging_and_debug.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/logging_and_debug.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_chk.1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_copy.1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_drop.1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_dump.1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_load.1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_stat.1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx.c++" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/meta.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/meta.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/misc.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mvcc-readers.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/node.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/node.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ntdll.def" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/options.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/osal.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/osal.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-get.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-iov.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-iov.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-ops.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-ops.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-search.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/pnl.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/pnl.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/preface.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/proto.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/range-estimate.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/refund.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/sort.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/subdb.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tls.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tls.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/chk.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/copy.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/drop.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/dump.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/load.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/stat.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/wingetopt.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/wingetopt.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/unaligned.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/version.c.in" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx_chk.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx.c++")
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/walk.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/walk.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.h")
|
||||
set(MDBX_AMALGAMATED_SOURCE FALSE)
|
||||
find_program(GIT git)
|
||||
if(NOT GIT)
|
||||
@ -84,21 +168,27 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND
|
||||
endif()
|
||||
set(MDBX_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
|
||||
elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/NOTICE" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.c++" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/config.h.in" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/man1" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_chk.c")
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h++" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_chk.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_copy.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_dump.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_load.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_stat.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_drop.c" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ntdll.def" AND
|
||||
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/config.h.in")
|
||||
set(MDBX_AMALGAMATED_SOURCE TRUE)
|
||||
set(MDBX_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
else()
|
||||
message(FATAL_ERROR "\n"
|
||||
"Please don't use tarballs nor zips which are automatically provided by Github! "
|
||||
"These archives do not contain version information and thus are unfit to build libmdbx. "
|
||||
"You can vote for ability of disabling auto-creation such unsuitable archives at https://github.community/t/disable-tarball\n"
|
||||
"Instead of above, just clone the git repository, either download a tarball or zip with the properly amalgamated source core. "
|
||||
"For embedding libmdbx use a git-submodule or the amalgamated source code.\n"
|
||||
"Please, avoid using any other techniques.")
|
||||
"The set of libmdbx source code files is incomplete! "
|
||||
"Instead just follow the https://libmdbx.dqdkfa.ru/usage.html "
|
||||
"PLEASE, AVOID USING ANY OTHER TECHNIQUES.")
|
||||
endif()
|
||||
|
||||
if(DEFINED PROJECT_NAME)
|
||||
@ -600,13 +690,88 @@ else()
|
||||
include_directories("${MDBX_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
|
||||
else()
|
||||
list(APPEND LIBMDBX_SOURCES
|
||||
"${MDBX_SOURCE_DIR}/api-cursor.c"
|
||||
"${MDBX_SOURCE_DIR}/api-env.c"
|
||||
"${MDBX_SOURCE_DIR}/api-extra.c"
|
||||
"${MDBX_SOURCE_DIR}/api-key-transform.c"
|
||||
"${MDBX_SOURCE_DIR}/api-txn.c"
|
||||
"${MDBX_SOURCE_DIR}/atomics-ops.h"
|
||||
"${MDBX_SOURCE_DIR}/atomics-types.h"
|
||||
"${MDBX_SOURCE_DIR}/audit.c"
|
||||
"${MDBX_SOURCE_DIR}/chk.c"
|
||||
"${MDBX_SOURCE_DIR}/cogs.c"
|
||||
"${MDBX_SOURCE_DIR}/cogs.h"
|
||||
"${MDBX_SOURCE_DIR}/coherency.c"
|
||||
"${MDBX_SOURCE_DIR}/cold.c"
|
||||
"${MDBX_SOURCE_DIR}/copy.c"
|
||||
"${MDBX_SOURCE_DIR}/cursor.c"
|
||||
"${MDBX_SOURCE_DIR}/cursor.h"
|
||||
"${MDBX_SOURCE_DIR}/dbi.c"
|
||||
"${MDBX_SOURCE_DIR}/dbi.h"
|
||||
"${MDBX_SOURCE_DIR}/dpl.c"
|
||||
"${MDBX_SOURCE_DIR}/dpl.h"
|
||||
"${MDBX_SOURCE_DIR}/dxb.c"
|
||||
"${MDBX_SOURCE_DIR}/env-opts.c"
|
||||
"${MDBX_SOURCE_DIR}/env.c"
|
||||
"${MDBX_SOURCE_DIR}/essentials.h"
|
||||
"${MDBX_SOURCE_DIR}/gc-get.c"
|
||||
"${MDBX_SOURCE_DIR}/gc-put.c"
|
||||
"${MDBX_SOURCE_DIR}/gc.h"
|
||||
"${MDBX_SOURCE_DIR}/global.c"
|
||||
"${MDBX_SOURCE_DIR}/internals.h"
|
||||
"${MDBX_SOURCE_DIR}/layout-dxb.h"
|
||||
"${MDBX_SOURCE_DIR}/layout-lck.h"
|
||||
"${MDBX_SOURCE_DIR}/lck.c"
|
||||
"${MDBX_SOURCE_DIR}/lck.h"
|
||||
"${MDBX_SOURCE_DIR}/logging_and_debug.c"
|
||||
"${MDBX_SOURCE_DIR}/logging_and_debug.h"
|
||||
"${MDBX_SOURCE_DIR}/meta.c"
|
||||
"${MDBX_SOURCE_DIR}/meta.h"
|
||||
"${MDBX_SOURCE_DIR}/misc.c"
|
||||
"${MDBX_SOURCE_DIR}/mvcc-readers.c"
|
||||
"${MDBX_SOURCE_DIR}/node.c"
|
||||
"${MDBX_SOURCE_DIR}/node.h"
|
||||
"${MDBX_SOURCE_DIR}/options.h"
|
||||
"${MDBX_SOURCE_DIR}/osal.c"
|
||||
"${MDBX_SOURCE_DIR}/osal.h"
|
||||
"${MDBX_SOURCE_DIR}/page-get.c"
|
||||
"${MDBX_SOURCE_DIR}/page-iov.c"
|
||||
"${MDBX_SOURCE_DIR}/page-iov.h"
|
||||
"${MDBX_SOURCE_DIR}/page-ops.c"
|
||||
"${MDBX_SOURCE_DIR}/page-ops.h"
|
||||
"${MDBX_SOURCE_DIR}/page-search.c"
|
||||
"${MDBX_SOURCE_DIR}/pnl.c"
|
||||
"${MDBX_SOURCE_DIR}/pnl.h"
|
||||
"${MDBX_SOURCE_DIR}/preface.h"
|
||||
"${MDBX_SOURCE_DIR}/proto.h"
|
||||
"${MDBX_SOURCE_DIR}/range-estimate.c"
|
||||
"${MDBX_SOURCE_DIR}/refund.c"
|
||||
"${MDBX_SOURCE_DIR}/sort.h"
|
||||
"${MDBX_SOURCE_DIR}/spill.c"
|
||||
"${MDBX_SOURCE_DIR}/spill.h"
|
||||
"${MDBX_SOURCE_DIR}/subdb.c"
|
||||
"${MDBX_SOURCE_DIR}/tls.c"
|
||||
"${MDBX_SOURCE_DIR}/tls.h"
|
||||
"${MDBX_SOURCE_DIR}/tree.c"
|
||||
"${MDBX_SOURCE_DIR}/txl.c"
|
||||
"${MDBX_SOURCE_DIR}/txl.h"
|
||||
"${MDBX_SOURCE_DIR}/txn.c"
|
||||
"${MDBX_SOURCE_DIR}/unaligned.h"
|
||||
"${MDBX_SOURCE_DIR}/utils.c"
|
||||
"${MDBX_SOURCE_DIR}/utils.h"
|
||||
"${MDBX_SOURCE_DIR}/walk.c"
|
||||
"${MDBX_SOURCE_DIR}/walk.h"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/version.c"
|
||||
"${MDBX_SOURCE_DIR}/options.h" "${MDBX_SOURCE_DIR}/base.h"
|
||||
"${MDBX_SOURCE_DIR}/internals.h" "${MDBX_SOURCE_DIR}/osal.h"
|
||||
"${MDBX_SOURCE_DIR}/core.c" "${MDBX_SOURCE_DIR}/osal.c"
|
||||
"${MDBX_SOURCE_DIR}/lck-posix.c")
|
||||
)
|
||||
if(NOT MSVC)
|
||||
list(APPEND LIBMDBX_SOURCES "${MDBX_SOURCE_DIR}/lck-posix.c")
|
||||
endif()
|
||||
if(NOT APPLE)
|
||||
list(APPEND LIBMDBX_SOURCES "${MDBX_SOURCE_DIR}/lck-windows.c")
|
||||
list(APPEND LIBMDBX_SOURCES
|
||||
"${MDBX_SOURCE_DIR}/windows-import.h"
|
||||
"${MDBX_SOURCE_DIR}/windows-import.c"
|
||||
"${MDBX_SOURCE_DIR}/lck-windows.c"
|
||||
)
|
||||
endif()
|
||||
include_directories("${MDBX_SOURCE_DIR}")
|
||||
endif()
|
||||
@ -747,20 +912,23 @@ endif()
|
||||
|
||||
# build mdbx-tools
|
||||
if(MDBX_BUILD_TOOLS)
|
||||
if(NOT MDBX_AMALGAMATED_SOURCE AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
set(WINGETOPT_SRC ${MDBX_SOURCE_DIR}/wingetopt.c ${MDBX_SOURCE_DIR}/wingetopt.h)
|
||||
else()
|
||||
set(WINGETOPT_SRC "")
|
||||
set(WINGETOPT_SRC "")
|
||||
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
set(WINGETOPT_SRC ${MDBX_SOURCE_DIR}/tools/wingetopt.c ${MDBX_SOURCE_DIR}/tools/wingetopt.h)
|
||||
endif()
|
||||
|
||||
foreach(TOOL mdbx_chk mdbx_copy mdbx_stat mdbx_dump mdbx_load mdbx_drop)
|
||||
add_executable(${TOOL} mdbx.h ${MDBX_SOURCE_DIR}/${TOOL}.c ${WINGETOPT_SRC})
|
||||
foreach(TOOL chk copy stat dump load drop)
|
||||
if(MDBX_AMALGAMATED_SOURCE)
|
||||
add_executable(mdbx_${TOOL} mdbx.h ${MDBX_SOURCE_DIR}/mdbx_${TOOL}.c)
|
||||
else()
|
||||
add_executable(mdbx_${TOOL} mdbx.h ${MDBX_SOURCE_DIR}/tools/${TOOL}.c ${WINGETOPT_SRC})
|
||||
endif()
|
||||
if(NOT C_FALLBACK_GNU11 AND NOT C_FALLBACK_11)
|
||||
set_target_properties(${TOOL} PROPERTIES
|
||||
set_target_properties(mdbx_${TOOL} PROPERTIES
|
||||
C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON)
|
||||
endif()
|
||||
target_setup_options(${TOOL})
|
||||
target_link_libraries(${TOOL} ${TOOL_MDBX_LIB})
|
||||
target_setup_options(mdbx_${TOOL})
|
||||
target_link_libraries(mdbx_${TOOL} ${TOOL_MDBX_LIB})
|
||||
endforeach()
|
||||
if(LIB_MATH)
|
||||
target_link_libraries(mdbx_chk ${LIB_MATH})
|
||||
|
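For illustration, since the FATAL_ERROR text above steers users toward a git clone/submodule or the amalgamated sources, a minimal consumer CMakeLists.txt for a git-submodule checkout might look roughly like the sketch below; the libmdbx/ path and the myapp target are hypothetical, and the static-library target is assumed to be the mdbx-static target defined by libmdbx's own CMakeLists.txt.

cmake_minimum_required(VERSION 3.14)
project(myapp LANGUAGES C)
# Assumed path of a git submodule (or of a properly amalgamated source drop).
add_subdirectory(libmdbx)
add_executable(myapp main.c)
# Link against the static-library target assumed to be provided by libmdbx's build.
target_link_libraries(myapp PRIVATE mdbx-static)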
COPYRIGHT (158 lines changed)
@ -1,7 +1,138 @@
|
||||
Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>.
|
||||
Copyright 2011-2015 Howard Chu, Symas Corp.
|
||||
Copyright 2015,2016 Peter-Service R&D LLC.
|
||||
All rights reserved.
|
||||
Copyright (c) 2015-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
СМЕНА ЛИЦЕНЗИИ (THE LICENSE CHANGE)
|
||||
|
||||
OpenLDAP Public License → Apache 2.0
|
||||
|
||||
Briefly:
|
||||
Historically, in 2015 the early MDBX source code was derived from the
"LMDB engine" created by Howard Chu <hyc@symas.com> in 2011-2015,
which was based on btree.c written by Martin Hedenfalk <martin@bzero.se>.
|
||||
|
||||
By 2024, the MDBX source code has actually been rewritten and has so
|
||||
little in common with the original LMDB that I thought it admissible
|
||||
to change the license. Below are more detailed explanations.
|
||||
|
||||
Кратко:
|
||||
Исторически в 2015 году ранний исходный MDBX был заимствован из «LMDB
|
||||
engine», созданной Howard Chu <hyc@symas.com> в 2011-2015, на основе
|
||||
btree.c созданного Martin Hedenfalk <martin@bzero.se> в 2009-2010.
|
||||
|
||||
К 2024 году исходный код MDBX фактически переписан и имеет настолько
|
||||
мало общего с первоначальным заимствованием из LMDB, что я счел
|
||||
уместным сменить лицензию. Ниже более подробные пояснения.
|
||||
|
||||
---
|
||||
|
||||
Первоисточник текста формулирован на Русском языке, который является
|
||||
родным для автора. Предполагается что все заинтересованные могут легко
|
||||
воспользоваться машинным переводом, который при всех недостатках сможет
|
||||
донести суть, намерения и местами даже передать тональность.
|
||||
|
||||
The original source of this text is in Russian, which is the author's
|
||||
native language. It is assumed that all concerned can easily use machine
|
||||
translation, which, with all the disadvantages, will be able to convey
|
||||
the essence, intentions and, in some places, even convey the tonality of
|
||||
a wording.
|
||||
|
||||
1. Reasons

1.1. The Apache-2.0 license is one of the most popular ones, as it
contains a number of clarifications that make it easier to use the
source code in derivative works and large projects. I find these
features of the Apache-2.0 license valuable and convenient enough, and
accordingly consider the switch to the Apache-2.0 license beneficial
overall.

1.2. The OpenLDAP project has a certain renown, including, regrettably,
a reputation among specialists for extremely poor code quality and for
failures once usage departs from simple/basic scenarios. Hence, in the
eyes of part of the audience, using the OpenLDAP license casts a shadow
on the quality of the libmdbx code, even though the library's source
code has been rewritten, among other goals, to improve quality,
reliability, stability and testability.

Let me note that this is not the place to debate how objective such
opinions and their causes are, nor to judge the competence of the
specialists voicing them. However, the very fact of this negative
connotation attached to code quality whenever OpenLDAP is mentioned has
to be stated here, with no intention whatsoever to offend or slight the
OpenLDAP contributors.

1.3. In terms of source code, libmdbx has by now become quite a
different product, about which it is now more accurate to say that its
development is inspired by LMDB rather than based on borrowed code.
Changing the license of the rewritten code underlines that it really is
new source code.

2. Legitimacy

2.1. The original OpenLDAP 2.8 license and the current Apache 2.0
license coincide in their basic terms, while the Apache 2.0 license
specifies, defines and clarifies many aspects. Therefore I am inclined
to treat the license change as a clarification, and not as a
fundamental change that could infringe anyone's rights.

2.2. From a procedural point of view, I have the right to change the
license of the new source code written by me. At the same time, there
objectively exist both technical and legal difficulties in separating
the "new" code from the "borrowed" code, as well as in
identifying/classifying code that is in the public domain and/or is a
commonly used embodiment of "mathematical models and other public
knowledge".

Based on my own subjective assessment of the code base, including the
proportions of "new", "borrowed" and "commonly used" source code, I
consider the license change permissible. At the same time, I understand
and admit that grounds can be found to treat the situation as a "glass
half full/empty". I therefore declare my readiness to accept claims and
to resolve them by completely rewriting any remaining source code that
falls under the "borrowed" criteria and whose contributors object to
the license change.

2.3. Regardless of the origin history of each line of source code and
its literal authorship, please do not regard this license change, and
the related technical actions, as an attempt at plagiarism,
appropriation of someone else's work, appropriation of authorship, or
belittlement of the contribution of other authors/contributors.
Certainly, the MDBX/libmdbx project would not have appeared without
LMDB and all the participants of the LMDB project, especially Howard
Chu, Hallvard Furuseth and Martin Hedenfalk. However much the source
code is rewritten, it will still rest on the basic ideas and
incorporate the core concepts of LMDB.

3. Consequences and current requirements

It is all very simple. The requirements of the new license will have to
be met in accordance with clause 4 of the Apache 2.0 license.

In particular, when using/distributing libmdbx it will be necessary to
ship the files with the license text and the NOTICE file, and to give
users the ability to review their contents in the works/products that
use libmdbx.
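As a sketch only, a product embedding libmdbx could satisfy this from its own CMake build as shown below; the libmdbx/ source path, the myapp name and the install destination are assumptions, not part of this file.

# Hypothetical fragment of a consumer's CMakeLists.txt:
# ship libmdbx's license text and NOTICE with the product, per clause 4 of Apache-2.0.
install(FILES
        "${CMAKE_CURRENT_SOURCE_DIR}/libmdbx/LICENSE"
        "${CMAKE_CURRENT_SOURCE_DIR}/libmdbx/NOTICE"
        DESTINATION share/doc/myapp/libmdbx)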

-----------------------------------------------------------------------

For reference, the copyright notices from the originally borrowed code
are reproduced below.

---
|
||||
|
||||
Original source code was derived from LMDB in 2015,
|
||||
and later evolutionarily rewritten in 2015-2024:
|
||||
Copyright (c) 2011-2015 Howard Chu, Symas Corp. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted only as authorized by the OpenLDAP
|
||||
@ -11,12 +142,17 @@ A copy of this license is available in the file LICENSE in the
|
||||
top-level directory of the distribution or, alternatively, at
|
||||
<http://www.OpenLDAP.org/license.html>.
|
||||
|
||||
OpenLDAP is a registered trademark of the OpenLDAP Foundation.
|
||||
LMDB itself derived code from btree.c written by Martin Hedenfalk:
|
||||
Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
|
||||
|
||||
Individual files and/or contributed packages may be copyright by
|
||||
other parties and/or subject to additional restrictions.
|
||||
Permission to use, copy, modify, and distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
This work also contains materials derived from public sources.
|
||||
|
||||
Additional information about OpenLDAP can be obtained at
|
||||
<http://www.openldap.org/>.
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
GNUmakefile (196 lines changed)
@ -61,7 +61,7 @@ MDBX_BUILD_CXX ?= YES
|
||||
CFLAGS ?= $(strip $(eval CFLAGS := -std=gnu11 -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -pthread -Wno-error=attributes $$(shell for opt in -fno-semantic-interposition -Wno-unused-command-line-argument -Wno-tautological-compare; do [ -z "$$$$($(CC) '-DMDBX_BUILD_FLAGS="probe"' $$$${opt} -c $(SRC_PROBE_C) -o /dev/null >/dev/null 2>&1 || echo failed)" ] && echo "$$$${opt} "; done)$(CFLAGS_EXTRA))$(CFLAGS))
|
||||
|
||||
# choosing C++ standard with variable expansion trick (seems this work two times per session for GNU Make 3.81)
|
||||
CXXSTD ?= $(eval CXXSTD := $$(shell for std in gnu++23 c++23 gnu++2b c++2b gnu++20 c++20 gnu++2a c++2a gnu++17 c++17 gnu++1z c++1z gnu++14 c++14 gnu++1y c++1y gnu+11 c++11 gnu++0x c++0x; do $(CXX) -std=$$$${std} -c $(SRC_PROBE_CXX) -o /dev/null 2>probe4std-$$$${std}.err >/dev/null && echo "-std=$$$${std}" && exit; done))$(CXXSTD)
|
||||
CXXSTD ?= $(eval CXXSTD := $$(shell for std in gnu++23 c++23 gnu++2b c++2b gnu++20 c++20 gnu++2a c++2a gnu++17 c++17 gnu++1z c++1z gnu++14 c++14 gnu++1y c++1y gnu+11 c++11 gnu++0x c++0x; do $(CXX) -std=$$$${std} -DMDBX_BUILD_CXX=1 -c $(SRC_PROBE_CXX) -o /dev/null 2>probe4std-$$$${std}.err >/dev/null && echo "-std=$$$${std}" && exit; done))$(CXXSTD)
|
||||
CXXFLAGS ?= $(strip $(CXXSTD) $(filter-out -std=gnu11,$(CFLAGS)))
|
||||
|
||||
# libraries and options for linking
|
||||
@ -121,7 +121,8 @@ endef
|
||||
SO_SUFFIX := $(shell $(uname2sosuffix))
|
||||
HEADERS := mdbx.h mdbx.h++
|
||||
LIBRARIES := libmdbx.a libmdbx.$(SO_SUFFIX)
|
||||
TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk mdbx_drop
|
||||
TOOLS := chk copy drop dump load stat
|
||||
MDBX_TOOLS := $(addprefix mdbx_,$(TOOLS))
|
||||
MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 mdbx_chk.1 mdbx_drop.1
|
||||
TIP := // TIP:
|
||||
|
||||
@ -148,7 +149,7 @@ else
|
||||
$(info $(TIP) Use `make V=1` for verbose.)
|
||||
endif
|
||||
|
||||
all: show-options $(LIBRARIES) $(TOOLS)
|
||||
all: show-options $(LIBRARIES) $(MDBX_TOOLS)
|
||||
|
||||
help:
|
||||
@echo " make all - build libraries and tools"
|
||||
@ -234,26 +235,26 @@ options:
|
||||
ifeq ($(wildcard mdbx.c),mdbx.c)
|
||||
#< dist-cutoff-end
|
||||
@echo "## in README and source code (see mdbx.c) if you do."
|
||||
@grep -h '#ifndef MDBX_' mdbx.c | grep -v BUILD | uniq | sed 's/#ifndef / /'
|
||||
@grep -h '#ifndef MDBX_' mdbx.c | grep -v BUILD | sort -u | sed 's/#ifndef / /'
|
||||
#> dist-cutoff-begin
|
||||
else
|
||||
@echo "## in README and source code (see src/options.h) if you do."
|
||||
@grep -h '#ifndef MDBX_' src/internals.h src/options.h | grep -v BUILD | uniq | sed 's/#ifndef / /'
|
||||
@grep -h '#ifndef MDBX_' src/*.h | grep -v BUILD | sort -u | sed 's/#ifndef / /'
|
||||
endif
|
||||
#< dist-cutoff-end
|
||||
|
||||
lib libs libmdbx mdbx: libmdbx.a libmdbx.$(SO_SUFFIX)
|
||||
|
||||
tools: $(TOOLS)
|
||||
tools-static: $(addsuffix .static,$(TOOLS)) $(addsuffix .static-lto,$(TOOLS))
|
||||
tools: $(MDBX_TOOLS)
|
||||
tools-static: $(addsuffix .static,$(MDBX_TOOLS)) $(addsuffix .static-lto,$(MDBX_TOOLS))
|
||||
|
||||
strip: all
|
||||
@echo ' STRIP libmdbx.$(SO_SUFFIX) $(TOOLS)'
|
||||
$(TRACE )strip libmdbx.$(SO_SUFFIX) $(TOOLS)
|
||||
@echo ' STRIP libmdbx.$(SO_SUFFIX) $(MDBX_TOOLS)'
|
||||
$(TRACE )strip libmdbx.$(SO_SUFFIX) $(MDBX_TOOLS)
|
||||
|
||||
clean:
|
||||
@echo ' REMOVE ...'
|
||||
$(QUIET)rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *.$(SO_SUFFIX) *.dSYM *~ tmp.db/* \
|
||||
$(QUIET)rm -rf $(MDBX_TOOLS) mdbx_test @* *.[ao] *.[ls]o *.$(SO_SUFFIX) *.dSYM *~ tmp.db/* \
|
||||
*.gcov *.log *.err src/*.o test/*.o mdbx_example dist \
|
||||
config.h src/config.h src/version.c *.tar* buildflags.tag \
|
||||
mdbx_*.static mdbx_*.static-lto
|
||||
@ -284,27 +285,28 @@ ifeq ($(wildcard mdbx.c),mdbx.c)
|
||||
# Amalgamated source code, i.e. distributed after `make dist`
|
||||
MAN_SRCDIR := man1/
|
||||
|
||||
config.h: buildflags.tag mdbx.c $(lastword $(MAKEFILE_LIST))
|
||||
config.h: buildflags.tag mdbx.c $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)(echo '#define MDBX_BUILD_TIMESTAMP "$(MDBX_BUILD_TIMESTAMP)"' \
|
||||
&& echo "#define MDBX_BUILD_FLAGS \"$$(cat buildflags.tag)\"" \
|
||||
&& echo '#define MDBX_BUILD_COMPILER "$(shell (LC_ALL=C $(CC) --version || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
|
||||
&& echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; (LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || (LC_ALL=C $(CC) --version | grep -qi e2k && echo E2K) || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
|
||||
&& echo '#define MDBX_BUILD_CXX $(call select_by,MDBX_BUILD_CXX,1,0)' \
|
||||
) >$@
|
||||
|
||||
mdbx-dylib.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST))
|
||||
mdbx-dylib.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' CC $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c mdbx.c -o $@
|
||||
|
||||
mdbx-static.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST))
|
||||
mdbx-static.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' CC $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c mdbx.c -o $@
|
||||
|
||||
mdbx++-dylib.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
|
||||
mdbx++-dylib.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' CC $@'
|
||||
$(QUIET)$(CXX) $(CXXFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c mdbx.c++ -o $@
|
||||
|
||||
mdbx++-static.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
|
||||
mdbx++-static.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' CC $@'
|
||||
$(QUIET)$(CXX) $(CXXFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c mdbx.c++ -o $@
|
||||
|
||||
@ -351,9 +353,9 @@ define uname2titer
|
||||
esac
|
||||
endef
|
||||
|
||||
DIST_EXTRA := LICENSE README.md CMakeLists.txt GNUmakefile Makefile ChangeLog.md VERSION.txt config.h.in ntdll.def \
|
||||
DIST_EXTRA := LICENSE NOTICE README.md CMakeLists.txt GNUmakefile Makefile ChangeLog.md VERSION.txt config.h.in ntdll.def \
|
||||
$(addprefix man1/, $(MANPAGES)) cmake/compiler.cmake cmake/profile.cmake cmake/utils.cmake
|
||||
DIST_SRC := mdbx.h mdbx.h++ mdbx.c mdbx.c++ $(addsuffix .c, $(TOOLS))
|
||||
DIST_SRC := mdbx.h mdbx.h++ mdbx.c mdbx.c++ $(addsuffix .c, $(MDBX_TOOLS))
|
||||
|
||||
TEST_DB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.db
|
||||
TEST_LOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log
|
||||
@ -362,20 +364,20 @@ TEST_ITER := $(shell $(uname2titer))
|
||||
TEST_SRC := test/osal-$(TEST_OSAL).c++ $(filter-out $(wildcard test/osal-*.c++),$(wildcard test/*.c++)) $(call select_by,MDBX_BUILD_CXX,,src/mdbx.c++)
|
||||
TEST_INC := $(wildcard test/*.h++)
|
||||
TEST_OBJ := $(patsubst %.c++,%.o,$(TEST_SRC))
|
||||
TAR ?= $(shell which gnu-tar || echo tar)
|
||||
TAR ?= $(shell which gnu-tar 2>&- || echo tar)
|
||||
ZIP ?= $(shell which zip || echo "echo 'Please install zip'")
|
||||
CLANG_FORMAT ?= $(shell (which clang-format-14 || which clang-format-13 || which clang-format) 2>/dev/null)
|
||||
CLANG_FORMAT ?= $(shell (which clang-format-19 || which clang-format) 2>/dev/null)
|
||||
|
||||
reformat:
|
||||
@echo ' RUNNING clang-format...'
|
||||
$(QUIET)if [ -n "$(CLANG_FORMAT)" ]; then \
|
||||
git ls-files | grep -E '\.(c|c++|h|h++)(\.in)?$$' | xargs -r $(CLANG_FORMAT) -i --style=file; \
|
||||
else \
|
||||
echo "clang-format version 13..14 not found for 'reformat'"; \
|
||||
echo "clang-format version 19 not found for 'reformat'"; \
|
||||
fi
|
||||
|
||||
MAN_SRCDIR := src/man1/
|
||||
ALLOY_DEPS := $(shell git ls-files src/)
|
||||
ALLOY_DEPS := $(shell git ls-files src/ | grep -e /tools -e /man -v)
|
||||
git_DIR := $(shell if [ -d .git ]; then echo .git; elif [ -s .git -a -f .git ]; then grep '^gitdir: ' .git | cut -d ':' -f 2; else echo git_directory_is_absent; fi)
|
||||
MDBX_GIT_VERSION = $(shell set -o pipefail; git describe --tags '--match=v[0-9]*' 2>&- | sed -n 's|^v*\([0-9]\{1,\}\.[0-9]\{1,\}\.[0-9]\{1,\}\)\(.*\)|\1|p' || echo 'Please fetch tags and/or use non-obsolete git version')
|
||||
MDBX_GIT_REVISION = $(shell set -o pipefail; git rev-list `git describe --tags --abbrev=0`..HEAD --count 2>&- || echo 'Please fetch tags and/or use non-obsolete git version')
|
||||
@ -392,11 +394,11 @@ MDBX_SMOKE_EXTRA ?=
|
||||
check: DESTDIR = $(shell pwd)/@check-install
|
||||
check: test dist install
|
||||
|
||||
smoke-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1)
|
||||
smoke-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1 -UNDEBUG -DMDBX_DEBUG=0)
|
||||
smoke-assertion: smoke
|
||||
test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1)
|
||||
test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1 -UNDEBUG -DMDBX_DEBUG=0)
|
||||
test-assertion: smoke
|
||||
long-test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1)
|
||||
long-test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1 -UNDEBUG -DMDBX_DEBUG=0)
|
||||
long-test-assertion: smoke
|
||||
|
||||
smoke: build-test
|
||||
@ -424,7 +426,7 @@ smoke-fault: build-test
|
||||
|
||||
test: build-test
|
||||
@echo ' RUNNING `test/long_stochastic.sh --loops 2`...'
|
||||
$(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --extra --skip-make --taillog >$(TEST_LOG) || (cat $(TEST_LOG) && false)
|
||||
$(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --skip-make --taillog >$(TEST_LOG) || (cat $(TEST_LOG) && false)
|
||||
|
||||
long-test: test-long
|
||||
test-long: build-test
|
||||
@ -439,7 +441,7 @@ test-valgrind: test-memcheck
|
||||
test-memcheck: CFLAGS_EXTRA=-Ofast -DENABLE_MEMCHECK
|
||||
test-memcheck: build-test
|
||||
@echo ' RUNNING `test/long_stochastic.sh --with-valgrind --loops 2`...'
|
||||
$(QUIET)test/long_stochastic.sh --with-valgrind --extra --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false)
|
||||
$(QUIET)test/long_stochastic.sh --with-valgrind --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false)
|
||||
|
||||
memcheck: smoke-memcheck
|
||||
smoke-memcheck: VALGRIND=valgrind --trace-children=yes --log-file=valgrind-%p.log --leak-check=full --track-origins=yes --read-var-info=yes --error-exitcode=42 --suppressions=test/valgrind_suppress.txt
|
||||
@ -480,23 +482,27 @@ build-test: all mdbx_example mdbx_test
|
||||
define test-rule
|
||||
$(patsubst %.c++,%.o,$(1)): $(1) $(TEST_INC) $(HEADERS) $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' CC $$@'
|
||||
$(QUIET)$$(CXX) $$(CXXFLAGS) $$(MDBX_BUILD_OPTIONS) -c $(1) -o $$@
|
||||
$(QUIET)$$(CXX) $$(CXXFLAGS) $$(MDBX_BUILD_OPTIONS) -DMDBX_BUILD_CXX=1 -DMDBX_WITHOUT_MSVC_CRT=0 -c $(1) -o $$@
|
||||
|
||||
endef
|
||||
$(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file))))
|
||||
|
||||
mdbx_%: src/mdbx_%.c libmdbx.a
|
||||
@echo ' CC+LD $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' $^ $(EXE_LDFLAGS) $(LIBS) -o $@
|
||||
define tool-rule
|
||||
mdbx_$(1): src/tools/$(1).c libmdbx.a
|
||||
@echo ' CC+LD $$@'
|
||||
$(QUIET)$$(CC) $$(CFLAGS) $$(MDBX_BUILD_OPTIONS) -Isrc '-DMDBX_CONFIG_H="config.h"' $$^ $$(EXE_LDFLAGS) $$(LIBS) -o $$@
|
||||
|
||||
mdbx_%.static: src/mdbx_%.c mdbx-static.o
|
||||
@echo ' CC+LD $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' $^ $(EXE_LDFLAGS) $(LIBS) -static -Wl,--strip-all -o $@
|
||||
mdbx_$(1).static: src/tools/$(1).c mdbx-static.o
|
||||
@echo ' CC+LD $$@'
|
||||
$(QUIET)$$(CC) $$(CFLAGS) $$(MDBX_BUILD_OPTIONS) -Isrc '-DMDBX_CONFIG_H="config.h"' $$^ $$(EXE_LDFLAGS) $$(LIBS) -static -Wl,--strip-all -o $$@
|
||||
|
||||
mdbx_%.static-lto: src/mdbx_%.c src/config.h src/version.c src/alloy.c $(ALLOY_DEPS)
|
||||
@echo ' CC+LD $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) -Os -flto $(MDBX_BUILD_OPTIONS) '-DLIBMDBX_API=' '-DMDBX_CONFIG_H="config.h"' \
|
||||
$< src/alloy.c $(EXE_LDFLAGS) $(LIBS) -static -Wl,--strip-all -o $@
|
||||
mdbx_$(1).static-lto: src/tools/$(1).c src/config.h src/version.c src/alloy.c $(ALLOY_DEPS)
|
||||
@echo ' CC+LD $$@'
|
||||
$(QUIET)$$(CC) $$(CFLAGS) -Os -flto $$(MDBX_BUILD_OPTIONS) -Isrc '-DLIBMDBX_API=' '-DMDBX_CONFIG_H="config.h"' \
|
||||
$$< src/alloy.c $$(EXE_LDFLAGS) $$(LIBS) -static -Wl,--strip-all -o $$@
|
||||
|
||||
endef
|
||||
$(foreach file,$(TOOLS),$(eval $(call tool-rule,$(file))))
|
||||
|
||||
mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX)
|
||||
@echo ' LD $@'
|
||||
@ -506,16 +512,13 @@ $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags:
|
||||
@echo '*** ' >&2
|
||||
@echo '*** Please don''t use tarballs nor zips which are automatically provided by Github !' >&2
|
||||
@echo '*** These archives do not contain version information and thus are unfit to build libmdbx.' >&2
|
||||
@echo '*** You can vote for ability of disabling auto-creation such unsuitable archives at https://github.community/t/disable-tarball' >&2
|
||||
@echo '*** ' >&2
|
||||
@echo '*** Instead of above, just clone the git repository, either download a tarball or zip with the properly amalgamated source core.' >&2
|
||||
@echo '*** For embedding libmdbx use a git-submodule or the amalgamated source code.' >&2
|
||||
@echo '*** ' >&2
|
||||
@echo '*** Please, avoid using any other techniques.' >&2
|
||||
@echo '*** Instead just follow the https://libmdbx.dqdkfa.ru/usage.html' >&2
|
||||
@echo '*** PLEASE, AVOID USING ANY OTHER TECHNIQUES.' >&2
|
||||
@echo '*** ' >&2
|
||||
@false
|
||||
|
||||
src/version.c: src/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags
|
||||
src/version.c: src/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags LICENSE NOTICE
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)sed \
|
||||
-e "s|@MDBX_GIT_TIMESTAMP@|$(MDBX_GIT_TIMESTAMP)|" \
|
||||
@ -528,20 +531,21 @@ src/version.c: src/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(g
|
||||
-e "s|\$${MDBX_VERSION_REVISION}|$(MDBX_GIT_REVISION)|" \
|
||||
src/version.c.in >$@
|
||||
|
||||
src/config.h: buildflags.tag src/version.c $(lastword $(MAKEFILE_LIST))
|
||||
src/config.h: buildflags.tag src/version.c $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)(echo '#define MDBX_BUILD_TIMESTAMP "$(MDBX_BUILD_TIMESTAMP)"' \
|
||||
&& echo "#define MDBX_BUILD_FLAGS \"$$(cat buildflags.tag)\"" \
|
||||
&& echo '#define MDBX_BUILD_COMPILER "$(shell (LC_ALL=C $(CC) --version || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
|
||||
&& echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; (LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || (LC_ALL=C $(CC) --version | grep -qi e2k && echo E2K) || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
|
||||
&& echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' \
|
||||
&& echo '#define MDBX_BUILD_CXX $(call select_by,MDBX_BUILD_CXX,1,0)' \
|
||||
) >$@
|
||||
|
||||
mdbx-dylib.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
|
||||
mdbx-dylib.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' CC $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c src/alloy.c -o $@
|
||||
|
||||
mdbx-static.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
|
||||
mdbx-static.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
|
||||
@echo ' CC $@'
|
||||
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c src/alloy.c -o $@
|
||||
|
||||
@ -570,9 +574,9 @@ docs/contrib.fame: src/version.c $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)echo "" > $@ && git fame --show-email --format=md --silent-progress -w -M -C | grep '^|' >> $@
|
||||
|
||||
docs/overall.md: docs/__overview.md docs/_toc.md docs/__mithril.md docs/__history.md AUTHORS docs/contrib.fame LICENSE $(lastword $(MAKEFILE_LIST))
|
||||
docs/overall.md: docs/__overview.md docs/_toc.md docs/__mithril.md docs/__history.md COPYRIGHT LICENSE NOTICE $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)echo -e "\\mainpage Overall\n\\section brief Brief" | cat - $(filter %.md, $^) >$@ && echo -e "\n\n\nLicense\n=======\n" | cat AUTHORS docs/contrib.fame - LICENSE >>$@
|
||||
$(QUIET)echo -e "\\mainpage Overall\n\\section brief Brief" | cat - $(filter %.md, $^) >$@ && echo -e "\n\n\nLicense\n=======\n" | cat - LICENSE >>$@
|
||||
|
||||
docs/intro.md: docs/_preface.md docs/__characteristics.md docs/__improvements.md docs/_restrictions.md docs/__performance.md
|
||||
@echo ' MAKE $@'
|
||||
@ -582,11 +586,11 @@ docs/usage.md: docs/__usage.md docs/_starting.md docs/__bindings.md
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)echo -e "\\page usage Usage\n\\section getting Building & Embedding" | cat - $^ | sed 's/^Bindings$$/Bindings {#bindings}/' >$@
|
||||
|
||||
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md AUTHORS LICENSE $(lastword $(MAKEFILE_LIST))
|
||||
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md COPYRIGHT LICENSE NOTICE $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' RUNNING doxygen...'
|
||||
$(QUIET)rm -rf docs/html && \
|
||||
cat mdbx.h | tr '\n' '\r' | sed -e 's/LIBMDBX_INLINE_API\s*(\s*\([^,]\+\),\s*\([^,]\+\),\s*(\s*\([^)]\+\)\s*)\s*)\s*{/inline \1 \2(\3) {/g' | tr '\r' '\n' >docs/mdbx.h && \
|
||||
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp AUTHORS LICENSE docs/html/
|
||||
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp COPYRIGHT LICENSE NOTICE docs/html/
|
||||
|
||||
mdbx++-dylib.o: src/config.h src/mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' CC $@'
|
||||
@ -617,7 +621,7 @@ release-assets: libmdbx-amalgamated-$(MDBX_GIT_VERSION).zpaq \
|
||||
|
||||
dist-checked.tag: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA))
|
||||
@echo -n ' VERIFY amalgamated sources...'
|
||||
$(QUIET)rm -rf $@ dist/@tmp-shared_internals.inc \
|
||||
$(QUIET)rm -rf $@ dist/@tmp-essentials.inc dist/@tmp-internals.inc \
|
||||
&& if grep -R "define xMDBX_ALLOY" dist | grep -q MDBX_BUILD_SOURCERY; then echo "sed output is WRONG!" >&2; exit 2; fi \
|
||||
&& rm -rf dist-check && cp -r -p dist dist-check && ($(MAKE) IOARENA=false CXXSTD=$(CXXSTD) -C dist-check >dist-check.log 2>dist-check.err || (cat dist-check.err && exit 1)) \
|
||||
&& touch $@ || (echo " FAILED! See dist-check.log and dist-check.err" >&2; exit 2) && echo " Ok"
|
||||
@ -634,7 +638,6 @@ dist-checked.tag: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA))
|
||||
@echo ' CREATE $@'
|
||||
$(QUIET)$(TAR) -c $(shell LC_ALL=C $(TAR) --help | grep -q -- '--owner' && echo '--owner=0 --group=0') -f - -C dist $(DIST_SRC) $(DIST_EXTRA) | bzip2 -9 -z >$@
|
||||
|
||||
|
||||
%.zip: dist-checked.tag
|
||||
@echo ' CREATE $@'
|
||||
$(QUIET)rm -rf $@ && (cd dist && $(ZIP) -9 ../$@ $(DIST_SRC) $(DIST_EXTRA)) &>zip.log
|
||||
@ -643,52 +646,81 @@ dist-checked.tag: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA))
|
||||
@echo ' CREATE $@'
|
||||
$(QUIET)rm -rf $@ && (cd dist && zpaq a ../$@ $(DIST_SRC) $(DIST_EXTRA) -m59) &>zpaq.log
|
||||
|
||||
dist/mdbx.h: mdbx.h src/version.c $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' COPY $@'
|
||||
$(QUIET)mkdir -p dist && cp $< $@
|
||||
|
||||
dist/mdbx.h++: mdbx.h++ src/version.c $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' COPY $@'
|
||||
$(QUIET)mkdir -p dist && cp $< $@
|
||||
|
||||
dist/@tmp-shared_internals.inc: src/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
|
||||
dist/@tmp-essentials.inc: src/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' ALLOYING...'
|
||||
$(QUIET)mkdir -p dist \
|
||||
&& echo '#define xMDBX_ALLOY 1' >dist/@tmp-sed.inc && echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' >>dist/@tmp-sed.inc \
|
||||
&& (grep -v '#include ' src/alloy.c && echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' \
|
||||
&& sed \
|
||||
-e '/#pragma once/r dist/@tmp-sed.inc' \
|
||||
-e 's|#include "../mdbx.h"|@INCLUDE "mdbx.h"|' \
|
||||
-e '/#include "base.h"/r src/base.h' \
|
||||
-e '/#include "preface.h"/r src/preface.h' \
|
||||
-e '/#include "osal.h"/r src/osal.h' \
|
||||
-e '/#include "options.h"/r src/options.h' \
|
||||
-e '/#include "atomics-types.h"/r src/atomics-types.h' \
|
||||
-e '/#include "layout-dxb.h"/r src/layout-dxb.h' \
|
||||
-e '/#include "layout-lck.h"/r src/layout-lck.h' \
|
||||
-e '/#include "logging_and_debug.h"/r src/logging_and_debug.h' \
|
||||
-e '/#include "utils.h"/r src/utils.h' \
|
||||
-e '/#include "pnl.h"/r src/pnl.h' \
|
||||
src/essentials.h \
|
||||
| sed \
|
||||
-e '/#pragma once/d' -e '/#include "/d' \
|
||||
-e '/ clang-format o/d' -e '/ \*INDENT-O/d' \
|
||||
src/internals.h >$@ \
|
||||
&& rm -rf dist/@tmp-sed.inc
|
||||
| grep -v '^/// ') >$@
|
||||
|
||||
dist/mdbx.c: dist/@tmp-shared_internals.inc $(lastword $(MAKEFILE_LIST))
|
||||
dist/@tmp-internals.inc: dist/@tmp-essentials.inc src/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
|
||||
$(QUIET)(cat dist/@tmp-essentials.inc \
|
||||
&& sed \
|
||||
-e '/#include "essentials.h"/d' \
|
||||
-e '/#include "atomics-ops.h"/r src/atomics-ops.h' \
|
||||
-e '/#include "proto.h"/r src/proto.h' \
|
||||
-e '/#include "txl.h"/r src/txl.h' \
|
||||
-e '/#include "unaligned.h"/r src/unaligned.h' \
|
||||
-e '/#include "cogs.h"/r src/cogs.h' \
|
||||
-e '/#include "cursor.h"/r src/cursor.h' \
|
||||
-e '/#include "dbi.h"/r src/dbi.h' \
|
||||
-e '/#include "dpl.h"/r src/dpl.h' \
|
||||
-e '/#include "gc.h"/r src/gc.h' \
|
||||
-e '/#include "lck.h"/r src/lck.h' \
|
||||
-e '/#include "meta.h"/r src/meta.h' \
|
||||
-e '/#include "node.h"/r src/node.h' \
|
||||
-e '/#include "page-iov.h"/r src/page-iov.h' \
|
||||
-e '/#include "page-ops.h"/r src/page-ops.h' \
|
||||
-e '/#include "spill.h"/r src/spill.h' \
|
||||
-e '/#include "sort.h"/r src/sort.h' \
|
||||
-e '/#include "tls.h"/r src/tls.h' \
|
||||
-e '/#include "walk.h"/r src/walk.h' \
|
||||
-e '/#include "windows-import.h"/r src/windows-import.h' \
|
||||
src/internals.h \
|
||||
| sed \
|
||||
-e '/#pragma once/d' -e '/#include "/d' \
|
||||
-e '/ clang-format o/d' -e '/ \*INDENT-O/d' \
|
||||
| grep -v '^/// ') >$@
|
||||
|
||||
dist/mdbx.c: dist/@tmp-internals.inc $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)mkdir -p dist && (cat dist/@tmp-shared_internals.inc \
|
||||
&& cat src/core.c src/osal.c src/version.c src/lck-windows.c src/lck-posix.c | sed \
|
||||
$(QUIET)(cat dist/@tmp-internals.inc $(shell git ls-files src/*.c | grep -v alloy) src/version.c | sed \
|
||||
-e '/#include "debug_begin.h"/r src/debug_begin.h' \
|
||||
-e '/#include "debug_end.h"/r src/debug_end.h' \
|
||||
) | sed -e '/#include "/d;/#pragma once/d' -e 's|@INCLUDE|#include|' \
|
||||
-e '/ clang-format o/d;/ \*INDENT-O/d' >$@
|
||||
|
||||
dist/mdbx.c++: dist/@tmp-shared_internals.inc src/mdbx.c++ $(lastword $(MAKEFILE_LIST))
|
||||
dist/mdbx.c++: dist/@tmp-essentials.inc src/mdbx.c++ $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' MAKE $@'
|
||||
$(QUIET)mkdir -p dist && (cat dist/@tmp-shared_internals.inc && cat src/mdbx.c++) \
|
||||
| sed -e '/#include "/d;/#pragma once/d' -e 's|@INCLUDE|#include|;s|"mdbx.h"|"mdbx.h++"|' \
|
||||
$(QUIET)cat dist/@tmp-essentials.inc src/mdbx.c++ | sed \
|
||||
-e '/#define xMDBX_ALLOY/d' \
|
||||
-e '/#include "/d;/#pragma once/d' \
|
||||
-e 's|@INCLUDE|#include|;s|"mdbx.h"|"mdbx.h++"|' \
|
||||
-e '/ clang-format o/d;/ \*INDENT-O/d' >$@
|
||||
|
||||
define dist-tool-rule
|
||||
dist/$(1).c: src/$(1).c src/wingetopt.h src/wingetopt.c \
|
||||
dist/@tmp-shared_internals.inc $(lastword $(MAKEFILE_LIST))
|
||||
dist/mdbx_$(1).c: src/tools/$(1).c src/tools/wingetopt.h src/tools/wingetopt.c \
|
||||
dist/@tmp-internals.inc $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' MAKE $$@'
|
||||
$(QUIET)mkdir -p dist && sed \
|
||||
-e '/#include "internals.h"/r dist/@tmp-shared_internals.inc' \
|
||||
-e '/#include "wingetopt.h"/r src/wingetopt.c' \
|
||||
-e '/#include "essentials.h"/r dist/@tmp-essentials.inc' \
|
||||
-e '/#include "wingetopt.h"/r src/tools/wingetopt.c' \
|
||||
-e '/ clang-format o/d' -e '/ \*INDENT-O/d' \
|
||||
src/$(1).c \
|
||||
src/tools/$(1).c \
|
||||
| sed -e '/#include "/d;/#pragma once/d;/#define xMDBX_ALLOY/d' -e 's|@INCLUDE|#include|' \
|
||||
-e '/ clang-format o/d;/ \*INDENT-O/d' >$$@
|
||||
|
||||
@ -696,12 +728,12 @@ endef
|
||||
$(foreach file,$(TOOLS),$(eval $(call dist-tool-rule,$(file))))
|
||||
|
||||
define dist-extra-rule
|
||||
dist/$(1): $(1)
|
||||
dist/$(1): $(1) src/version.c $(lastword $(MAKEFILE_LIST))
|
||||
@echo ' REFINE $$@'
|
||||
$(QUIET)mkdir -p $$(dir $$@) && sed -e '/^#> dist-cutoff-begin/,/^#< dist-cutoff-end/d' $$< >$$@
|
||||
|
||||
endef
|
||||
$(foreach file,$(filter-out man1/% VERSION.txt %.in ntdll.def,$(DIST_EXTRA)),$(eval $(call dist-extra-rule,$(file))))
|
||||
$(foreach file,mdbx.h mdbx.h++ $(filter-out man1/% VERSION.txt %.in ntdll.def,$(DIST_EXTRA)),$(eval $(call dist-extra-rule,$(file))))
|
||||
|
||||
dist/VERSION.txt: src/version.c
|
||||
@echo ' MAKE $@'
|
||||
@ -763,10 +795,10 @@ cross-qemu:
|
||||
|
||||
#< dist-cutoff-end
|
||||
|
||||
install: $(LIBRARIES) $(TOOLS) $(HEADERS)
|
||||
install: $(LIBRARIES) $(MDBX_TOOLS) $(HEADERS)
|
||||
@echo ' INSTALLING...'
|
||||
$(QUIET)mkdir -p $(DESTDIR)$(prefix)/bin$(suffix) && \
|
||||
$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(TOOLS) $(DESTDIR)$(prefix)/bin$(suffix)/ && \
|
||||
$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(MDBX_TOOLS) $(DESTDIR)$(prefix)/bin$(suffix)/ && \
|
||||
mkdir -p $(DESTDIR)$(prefix)/lib$(suffix)/ && \
|
||||
$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(filter-out libmdbx.a,$(LIBRARIES)) $(DESTDIR)$(prefix)/lib$(suffix)/ && \
|
||||
mkdir -p $(DESTDIR)$(prefix)/lib$(suffix)/ && \
|
||||
@ -784,7 +816,7 @@ install-no-strip: install
|
||||
|
||||
uninstall:
|
||||
@echo ' UNINSTALLING/REMOVE...'
|
||||
$(QUIET)rm -f $(addprefix $(DESTDIR)$(prefix)/bin$(suffix)/,$(TOOLS)) \
|
||||
$(QUIET)rm -f $(addprefix $(DESTDIR)$(prefix)/bin$(suffix)/,$(MDBX_TOOLS)) \
|
||||
$(addprefix $(DESTDIR)$(prefix)/lib$(suffix)/,$(LIBRARIES)) \
|
||||
$(addprefix $(DESTDIR)$(prefix)/include/,$(HEADERS)) \
|
||||
$(addprefix $(DESTDIR)$(mandir)/man1/,$(MANPAGES))
|
||||
|
LICENSE (206 lines changed)
@ -1,47 +1,177 @@
|
||||
The OpenLDAP Public License
|
||||
Version 2.8, 17 August 2003
|
||||
|
||||
Redistribution and use of this software and associated documentation
|
||||
("Software"), with or without modification, are permitted provided
|
||||
that the following conditions are met:
|
||||
Apache License
|
||||
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

The replaced text of the previous OpenLDAP Public License, removed by this
change, included the following conditions and notices:

1. Redistributions in source form must retain copyright statements
and notices,

2. Redistributions in binary form must reproduce applicable copyright
statements and notices, this list of conditions, and the following
disclaimer in the documentation and/or other materials provided
with the distribution, and

3. Redistributions must contain a verbatim copy of this document.

The OpenLDAP Foundation may revise this license from time to time.
Each revision is distinguished by a version number. You may use
this Software under terms of this license revision or under the
terms of any subsequent revision of the license.

THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS
CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S)
OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

The names of the authors and copyright holders must not be used in
advertising or otherwise to promote the sale, use or other dealing
in this Software without specific, written prior permission. Title
to copyright in this Software shall at all times remain with copyright
holders.

OpenLDAP is a registered trademark of the OpenLDAP Foundation.

Copyright 1999-2003 The OpenLDAP Foundation, Redwood City,
California, USA. All Rights Reserved. Permission to copy and
distribute verbatim copies of this document is granted.
23
NOTICE
Normal file
23
NOTICE
Normal file
@ -0,0 +1,23 @@
|
||||
libmdbx (aka MDBX) is an extremely fast, compact, powerful, embeddable,
|
||||
transactional key-value storage engine with open-source code. MDBX has a
|
||||
specific set of properties and capabilities, focused on creating unique
|
||||
lightweight solutions.
|
||||
|
||||
Please visit https://libmdbx.dqdkfa.ru for more information, changelog,
|
||||
documentation, C++ API description and links to the original git repo
|
||||
with the source code. Questions, feedback and suggestions are welcome
|
||||
to the Telegram group https://t.me/libmdbx.
|
||||
|
||||
Since 2017 _libmdbx_ development is funded by [Positive Technologies](https://www.ptsecurity.com)
|
||||
and used inside company products. Всё будет хорошо!
|
||||
|
||||
Copyright 2015-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
For notes about the license change, credits and acknowledgments,
|
||||
please refer to the COPYRIGHT file within original libmdbx source code
|
||||
repository https://gitflic.ru/project/erthink/libmdbx
|
||||
|
||||
On 2022-04-15 the Github administration, without any warning nor
|
||||
explanation, deleted _libmdbx_ along with a lot of other projects,
|
||||
simultaneously blocking access for many developers.
|
||||
For the same reason ~~Github~~ is blacklisted forever.
|
75
README.md
75
README.md
@ -1,36 +1,5 @@
|
||||
<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
|
||||
|
||||
### Время учить Русский
|
||||
|
||||
Начиная с 2021 года наблюдается устойчивые тенденции к распространению
|
||||
недостоверной информации о _libmdbx_ в странах ~~НАТО~~,
|
||||
политизированной критика, а также отказу от использования библиотеки в
|
||||
пользу LMDB, несмотря на явные проблемы с одной стороны и преимущества с
|
||||
другой. Поэтому начиная с 17 марта 2024 года прекращается
|
||||
документирование и сопровождение проекта на английском языке. Новый
|
||||
функционал будет документироваться только на русском языке, однако,
|
||||
целенаправленного переписывания/перевода документации пока не
|
||||
планируется.
|
||||
|
||||
### The origin has been migrated to [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
|
||||
Since on 2022-04-15 the Github administration, without any warning
|
||||
nor explanation, deleted _libmdbx_ along with a lot of other projects,
|
||||
simultaneously blocking access for many developers.
|
||||
For the same reason ~~Github~~ is blacklisted forever.
|
||||
|
||||
GitFlic's developers plan to support other languages,
|
||||
including English 和 中文, in the near future.
|
||||
|
||||
### Основной репозиторий перемещен на [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
|
||||
Так как 15 апреля 2022 администрация Github без предупреждения и
|
||||
объяснения причин удалила _libmdbx_ вместе с массой других проектов,
|
||||
одновременно заблокировав доступ многим разработчикам.
|
||||
По этой же причине ~~Github~~ навсегда занесен в черный список.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.*
|
||||
|
||||
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
|
||||
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
|
||||
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
|
||||
@ -40,6 +9,8 @@ including English 和 中文, in the near future.
|
||||
> For NEWS take a look to the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md)
|
||||
> or the [TODO](https://gitflic.ru/project/erthink/libmdbx/blob?file=TODO.md).
|
||||
|
||||
*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.*
|
||||
|
||||
|
||||
libmdbx
|
||||
========
|
||||
@ -48,7 +19,7 @@ libmdbx
|
||||
|
||||
_libmdbx_ is an extremely fast, compact, powerful, embedded, transactional
|
||||
[key-value database](https://en.wikipedia.org/wiki/Key-value_database),
|
||||
with [permissive license](https://gitflic.ru/project/erthink/libmdbx/blob?file=LICENSE).
|
||||
with [Apache 2.0 license](https://gitflic.ru/project/erthink/libmdbx/blob?file=LICENSE).
|
||||
_libmdbx_ has a specific set of properties and capabilities,
|
||||
focused on creating unique lightweight solutions.
|
||||
|
||||
@ -144,15 +115,14 @@ $ objdump -f -h -j .text libmdbx.so
|
||||
libmdbx.so: формат файла elf64-e2k
|
||||
архитектура: elbrus-v6:64, флаги 0x00000150:
|
||||
HAS_SYMS, DYNAMIC, D_PAGED
|
||||
начальный адрес 0x0000000000021680
|
||||
начальный адрес 0x00000000??????00
|
||||
|
||||
Разделы:
|
||||
Idx Name Разм VMA LMA Фа смещ. Выр.
|
||||
10 .text 000ddd28 0000000000021680 0000000000021680 00021680 2**3
|
||||
CONTENTS, ALLOC, LOAD, READONLY, CODE
|
||||
Idx Name Разм VMA LMA Фа смещ. Выр. Флаги
|
||||
10 .text 000e7460 0000000000025c00 0000000000025c00 00025c00 2**10 CONTENTS, ALLOC, LOAD, READONLY, CODE
|
||||
|
||||
$ cc --version
|
||||
lcc:1.26.12:Jun-05-2022:e2k-v6-linux
|
||||
lcc:1.27.14:Jan-31-2024:e2k-v6-linux
|
||||
gcc (GCC) 9.3.0 compatible
|
||||
```
|
||||
|
||||
@ -276,7 +246,7 @@ out-of-the-box, not silently and catastrophically break down. The list
|
||||
below is pruned down to the improvements most notable and obvious from
|
||||
the user's point of view.
|
||||
|
||||
## Added Features
|
||||
## Some Added Features
|
||||
|
||||
1. Keys could be more than 2 times longer than _LMDB_.
|
||||
> For DB with default page size _libmdbx_ supports keys up to 2022 bytes
|
||||
@ -319,8 +289,7 @@ be found between a `KEY1` and a `KEY2`. This is a prerequisite for build
|
||||
and/or optimize query execution plans.
|
||||
> _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys.
|
||||
|
||||
8. `mdbx_chk` utility for database integrity check.
|
||||
Since version 0.9.1, the utility supports checking the database using any of the three meta pages and the ability to switch to it.
|
||||
8. Database integrity check API along with the standalone `mdbx_chk` utility.
|
||||
|
||||
9. Support for opening databases in the exclusive mode, including on a network share.
|
||||
|
||||
@ -410,12 +379,26 @@ The origin for now is at [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
|
||||
with backup at [ABF by ROSA Лаб](https://abf.rosalinux.ru/erthink/libmdbx).
|
||||
For the same reason ~~Github~~ is blacklisted forever.
|
||||
|
||||
Начиная с 2021 года наблюдаются устойчивые тенденции к распространению
|
||||
недостоверной информации о libmdbx в странах НАТО, политизированной
|
||||
критики, а также отказу от использования библиотеки в пользу LMDB,
|
||||
несмотря на явные проблемы с одной стороны и преимущества с другой.
|
||||
Поэтому, начиная с 17 марта 2024 года, прекращается документирование и
|
||||
сопровождение проекта на английском языке. Новая функциональность будет
|
||||
документироваться только на русском языке, однако, целенаправленного
|
||||
переписывания/перевода документации пока не планируется.
|
||||
|
||||
Since May 2024, starting with version v0.13, _libmdbx_ has been re-licensed under the Apache-2.0 license.
|
||||
Please refer to the `COPYRIGHT` file for license change explanations.
|
||||
|
||||
|
||||
## Acknowledgments
|
||||
Howard Chu <hyc@openldap.org> is the author of LMDB, from which
|
||||
originated the _libmdbx_ in 2015.
|
||||
Howard Chu <hyc@openldap.org> and Hallvard Furuseth
|
||||
<hallvard@openldap.org> are the authors of _LMDB_, from which _libmdbx_
|
||||
was forked in 2015.
|
||||
|
||||
Martin Hedenfalk <martin@bzero.se> is the author of `btree.c` code, which
|
||||
was used to begin development of LMDB.
|
||||
was used to begin development of _LMDB_.
|
||||
|
||||
<!-- section-end -->
|
||||
|
||||
@ -523,8 +506,10 @@ There are no special traits nor quirks if you use libmdbx ONLY inside the single
|
||||
But in a cross-container cases or with a host-container(s) mix the two major things MUST be
|
||||
guaranteed:
|
||||
|
||||
1. Coherence of memory mapping content and unified page cache inside OS kernel for host and all container(s) operated with a DB.
|
||||
Basically this means must be only a single physical copy of each memory mapped DB' page in the system memory.
|
||||
1. Coherence of memory mapping content and unified page cache inside OS
kernel for host and all container(s) operated with a DB. Basically this
means there must be only a single physical copy of each memory-mapped DB
page in the system memory.
|
||||
|
||||
2. Uniqueness of [PID](https://en.wikipedia.org/wiki/Process_identifier) values and/or a common space for ones:
|
||||
- for POSIX systems: PID uniqueness for all processes operated with a DB.
|
||||
|
@ -1,17 +1,5 @@
|
||||
## Copyright (c) 2012-2024 Leonid Yuriev <leo@yuriev.ru>.
|
||||
##
|
||||
## Licensed under the Apache License, Version 2.0 (the "License");
|
||||
## you may not use this file except in compliance with the License.
|
||||
## You may obtain a copy of the License at
|
||||
##
|
||||
## http://www.apache.org/licenses/LICENSE-2.0
|
||||
##
|
||||
## Unless required by applicable law or agreed to in writing, software
|
||||
## distributed under the License is distributed on an "AS IS" BASIS,
|
||||
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
## See the License for the specific language governing permissions and
|
||||
## limitations under the License.
|
||||
##
|
||||
## Copyright (c) 2010-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
|
||||
## SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
if(CMAKE_VERSION VERSION_LESS 3.8.2)
|
||||
cmake_minimum_required(VERSION 3.0.2)
|
||||
|
@ -1,17 +1,5 @@
|
||||
## Copyright (c) 2012-2024 Leonid Yuriev <leo@yuriev.ru>.
|
||||
##
|
||||
## Licensed under the Apache License, Version 2.0 (the "License");
|
||||
## you may not use this file except in compliance with the License.
|
||||
## You may obtain a copy of the License at
|
||||
##
|
||||
## http://www.apache.org/licenses/LICENSE-2.0
|
||||
##
|
||||
## Unless required by applicable law or agreed to in writing, software
|
||||
## distributed under the License is distributed on an "AS IS" BASIS,
|
||||
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
## See the License for the specific language governing permissions and
|
||||
## limitations under the License.
|
||||
##
|
||||
## Copyright (c) 2012-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
|
||||
## SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
if(CMAKE_VERSION VERSION_LESS 3.8.2)
|
||||
cmake_minimum_required(VERSION 3.0.2)
|
||||
|
@ -1,17 +1,5 @@
|
||||
## Copyright (c) 2012-2024 Leonid Yuriev <leo@yuriev.ru>.
|
||||
##
|
||||
## Licensed under the Apache License, Version 2.0 (the "License");
|
||||
## you may not use this file except in compliance with the License.
|
||||
## You may obtain a copy of the License at
|
||||
##
|
||||
## http://www.apache.org/licenses/LICENSE-2.0
|
||||
##
|
||||
## Unless required by applicable law or agreed to in writing, software
|
||||
## distributed under the License is distributed on an "AS IS" BASIS,
|
||||
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
## See the License for the specific language governing permissions and
|
||||
## limitations under the License.
|
||||
##
|
||||
## Copyright (c) 2012-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
|
||||
## SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
if(CMAKE_VERSION VERSION_LESS 3.8.2)
|
||||
cmake_minimum_required(VERSION 3.0.2)
|
||||
|
313
mdbx.h
313
mdbx.h
@ -1,11 +1,10 @@
|
||||
/**
|
||||
|
||||
_libmdbx_ is an extremely fast, compact, powerful, embedded,
|
||||
_libmdbx_ (aka MDBX) is an extremely fast, compact, powerful, embeddable,
|
||||
transactional [key-value
|
||||
store](https://en.wikipedia.org/wiki/Key-value_database) database, with
|
||||
[permissive license](./LICENSE). _MDBX_ has a specific set of properties and
|
||||
capabilities, focused on creating unique lightweight solutions with
|
||||
extraordinary performance.
|
||||
store](https://en.wikipedia.org/wiki/Key-value_database), with [Apache 2.0
|
||||
license](./LICENSE). _MDBX_ has a specific set of properties and capabilities,
|
||||
focused on creating unique lightweight solutions with extraordinary performance.
|
||||
|
||||
_libmdbx_ is superior to [LMDB](https://bit.ly/26ts7tL) in terms of features
|
||||
and reliability, not inferior in performance. In comparison to LMDB, _libmdbx_
|
||||
@ -14,60 +13,24 @@ break down. _libmdbx_ supports Linux, Windows, MacOS, OSX, iOS, Android,
|
||||
FreeBSD, DragonFly, Solaris, OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other
|
||||
systems compliant with POSIX.1-2008.
|
||||
|
||||
The origin has been migrated to
|
||||
[GitFlic](https://gitflic.ru/project/erthink/libmdbx) since on 2022-04-15
|
||||
the Github administration, without any warning nor explanation, deleted libmdbx
|
||||
along with a lot of other projects, simultaneously blocking access for many
|
||||
developers. For the same reason ~~Github~~ is blacklisted forever.
|
||||
Please visit https://libmdbx.dqdkfa.ru for more information, documentation,
|
||||
C++ API description and links to the origin git repo with the source code.
|
||||
Questions, feedback and suggestions are welcome to the Telegram group
|
||||
https://t.me/libmdbx.
|
||||
|
||||
_The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо._
|
||||
|
||||
\note The origin has been migrated to
|
||||
[GitFlic](https://gitflic.ru/project/erthink/libmdbx) since on 2022-04-15 the
|
||||
Github administration, without any warning nor explanation, deleted libmdbx
|
||||
along with a lot of other projects, simultaneously blocking access for many
|
||||
developers. For the same reason ~~Github~~ is blacklisted forever.
|
||||
|
||||
\section copyright LICENSE & COPYRIGHT
|
||||
|
||||
\authors Copyright (c) 2015-2024, Leonid Yuriev <leo@yuriev.ru>
|
||||
and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file.
|
||||
|
||||
\copyright Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted only as authorized by the OpenLDAP Public License.
|
||||
|
||||
A copy of this license is available in the file LICENSE in the
|
||||
top-level directory of the distribution or, alternatively, at
|
||||
<http://www.OpenLDAP.org/license.html>.
|
||||
|
||||
---
|
||||
|
||||
This code is derived from "LMDB engine" written by
|
||||
Howard Chu (Symas Corporation), which itself derived from btree.c
|
||||
written by Martin Hedenfalk.
|
||||
|
||||
---
|
||||
|
||||
Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted only as authorized by the OpenLDAP
|
||||
Public License.
|
||||
|
||||
A copy of this license is available in the file LICENSE in the
|
||||
top-level directory of the distribution or, alternatively, at
|
||||
<http://www.OpenLDAP.org/license.html>.
|
||||
|
||||
---
|
||||
|
||||
Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
|
||||
|
||||
Permission to use, copy, modify, and distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
\copyright SPDX-License-Identifier: Apache-2.0
|
||||
\note Please refer to the COPYRIGHT file for explanations of the license change,
|
||||
credits and acknowledgments.
|
||||
\author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
@ -98,7 +61,7 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
/* clang-format off */
|
||||
/**
|
||||
\file mdbx.h
|
||||
\brief The libmdbx C API header file
|
||||
\brief The libmdbx C API header file.
|
||||
|
||||
\defgroup c_api C API
|
||||
@{
|
||||
@ -359,6 +322,14 @@ typedef mode_t mdbx_mode_t;
|
||||
#endif
|
||||
#endif /* MDBX_DEPRECATED */
|
||||
|
||||
#ifndef MDBX_DEPRECATED_ENUM
|
||||
#if !defined(DOXYGEN) && (!defined(_MSC_VER) || _MSC_VER >= 1930)
|
||||
#define MDBX_DEPRECATED_ENUM MDBX_DEPRECATED
|
||||
#else
|
||||
#define MDBX_DEPRECATED_ENUM /* avoid madness MSVC */
|
||||
#endif
|
||||
#endif /* MDBX_DEPRECATED_ENUM */
|
||||
|
||||
#ifndef __dll_export
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) || \
|
||||
defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
|
||||
@ -393,7 +364,8 @@ typedef mode_t mdbx_mode_t;
|
||||
|
||||
/** \brief Auxiliary macro for robustly defining both the inline version of an API
 * function and a non-inline fallback dll-exported version for applications linked
|
||||
* with old version of libmdbx, with a strictly ODR-common implementation. */
|
||||
* with old version of libmdbx, with a strictly ODR-common implementation. Thus,
|
||||
* we emulate __extern_inline for all compilers, including non-GNU ones. */
|
||||
#if defined(LIBMDBX_INTERNALS) && !defined(LIBMDBX_NO_EXPORTS_LEGACY_API)
|
||||
#define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) \
|
||||
/* proto of exported which uses common impl */ LIBMDBX_API TYPE NAME ARGS; \
|
||||
@ -888,7 +860,7 @@ enum MDBX_constants {
|
||||
/** Log level
|
||||
 * \note Levels more detailed than (i.e. greater than) \ref MDBX_LOG_NOTICE
 * require libmdbx to be built with the \ref MDBX_DEBUG option.
|
||||
enum MDBX_log_level_t {
|
||||
typedef enum MDBX_log_level {
|
||||
/** Critical conditions, i.e. assertion failures.
|
||||
* \note libmdbx always produces such messages regardless
|
||||
* of \ref MDBX_DEBUG build option. */
|
||||
@ -938,17 +910,14 @@ enum MDBX_log_level_t {
|
||||
|
||||
/** for \ref mdbx_setup_debug() only: Don't change current settings */
|
||||
MDBX_LOG_DONTCHANGE = -1
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
typedef enum MDBX_log_level_t MDBX_log_level_t;
|
||||
#endif
|
||||
} MDBX_log_level_t;
|
||||
|
||||
/** \brief Runtime debug flags
|
||||
*
|
||||
* \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an
|
||||
* effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only if
|
||||
* libmdbx built with \ref MDBX_DEBUG. */
|
||||
enum MDBX_debug_flags_t {
|
||||
typedef enum MDBX_debug_flags {
|
||||
MDBX_DBG_NONE = 0,
|
||||
|
||||
/** Enable assertion checks.
|
||||
@ -986,12 +955,8 @@ enum MDBX_debug_flags_t {
|
||||
|
||||
/** for mdbx_setup_debug() only: Don't change current settings */
|
||||
MDBX_DBG_DONTCHANGE = -1
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
typedef enum MDBX_debug_flags_t MDBX_debug_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t)
|
||||
#endif
|
||||
} MDBX_debug_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags)
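
Since the flag enums are now plain typedefs usable identically from C and C++, combining them with bitwise operators and feeding them to the setup call is straightforward. Below is a minimal C sketch; it uses only names shown in this header, but the third argument of `mdbx_setup_debug()` (a logger callback) is passed as `NULL` purely for illustration and is an assumption of the sketch.

```c
#include "mdbx.h"

/* Raise logging to the NOTICE level and enable assertion/audit checks.
 * Levels more detailed than MDBX_LOG_NOTICE and the MDBX_DBG_* checks
 * only take effect when libmdbx is built with MDBX_DEBUG. */
static void tune_mdbx_diagnostics(void) {
  mdbx_setup_debug(MDBX_LOG_NOTICE, MDBX_DBG_ASSERT | MDBX_DBG_AUDIT,
                   /* logger callback: NULL here just for the sketch */ NULL);
}
```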
|
||||
|
||||
/** \brief A debug-logger callback function,
|
||||
* called before printing the message and aborting.
|
||||
@ -1086,7 +1051,7 @@ MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env,
|
||||
* \ingroup c_opening
|
||||
* \anchor env_flags
|
||||
* \see mdbx_env_open() \see mdbx_env_set_flags() */
|
||||
enum MDBX_env_flags_t {
|
||||
typedef enum MDBX_env_flags {
|
||||
MDBX_ENV_DEFAULTS = 0,
|
||||
|
||||
/** Extra validation of DB structure and pages content.
|
||||
@ -1210,7 +1175,7 @@ enum MDBX_env_flags_t {
|
||||
|
||||
/** Отвязывает транзакции от потоков/threads насколько это возможно.
|
||||
*
|
||||
* Эта опция предназначена для приложений, которые мультиплексируют множество
|
||||
* Опция предназначена для приложений, которые мультиплексируют множество
|
||||
* пользовательских легковесных потоков выполнения по отдельным потокам
|
||||
* операционной системы, например как это происходит в средах выполнения
|
||||
* GoLang и Rust. Таким приложениям также рекомендуется сериализовать
|
||||
@ -1278,10 +1243,9 @@ enum MDBX_env_flags_t {
|
||||
* Этот флаг вступает в силу при открытии среды и не может быть изменен после.
|
||||
*/
|
||||
MDBX_NOSTICKYTHREADS = UINT32_C(0x200000),
|
||||
#ifndef _MSC_VER /* avoid madness MSVC */
|
||||
|
||||
/** \deprecated Please use \ref MDBX_NOSTICKYTHREADS instead. */
|
||||
MDBX_NOTLS MDBX_DEPRECATED = MDBX_NOSTICKYTHREADS,
|
||||
#endif /* avoid madness MSVC */
|
||||
MDBX_NOTLS MDBX_DEPRECATED_ENUM = MDBX_NOSTICKYTHREADS,
|
||||
|
||||
/** Don't do readahead.
|
||||
*
|
||||
@ -1327,7 +1291,6 @@ enum MDBX_env_flags_t {
|
||||
* This flag may be changed at any time using `mdbx_env_set_flags()`. */
|
||||
MDBX_NOMEMINIT = UINT32_C(0x1000000),
|
||||
|
||||
#ifndef _MSC_VER /* avoid madness MSVC */
|
||||
/** Aims to coalesce a Garbage Collection items.
|
||||
* \deprecated Always enabled since v0.12 and deprecated since v0.13.
|
||||
*
|
||||
@ -1339,8 +1302,7 @@ enum MDBX_env_flags_t {
|
||||
* Unallocated space and reducing the database file.
|
||||
*
|
||||
* This flag may be changed at any time using mdbx_env_set_flags(). */
|
||||
MDBX_COALESCE MDBX_DEPRECATED = UINT32_C(0x2000000),
|
||||
#endif /* avoid madness MSVC */
|
||||
MDBX_COALESCE MDBX_DEPRECATED_ENUM = UINT32_C(0x2000000),
|
||||
|
||||
/** LIFO policy for recycling a Garbage Collection items.
|
||||
*
|
||||
@ -1543,19 +1505,14 @@ enum MDBX_env_flags_t {
|
||||
MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | UINT32_C(0x100000),
|
||||
|
||||
/** end of sync_modes @} */
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_opening */
|
||||
typedef enum MDBX_env_flags_t MDBX_env_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags_t)
|
||||
#endif
|
||||
} MDBX_env_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags)
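
For orientation, a minimal C sketch of creating and opening an environment with the flags above; the database path and the 0644 file mode are placeholders, error handling is reduced to early returns, and the `mdbx_env_open()`/`mdbx_env_close()` prototypes are assumed from the C API rather than quoted from this hunk.

```c
#include "mdbx.h"

/* Open an environment with default flags plus MDBX_NOSTICKYTHREADS, which
 * is recommended above for runtimes that multiplex lightweight user tasks
 * (GoLang- or Rust-style schedulers) over OS threads. */
static int open_env(MDBX_env **env) {
  int rc = mdbx_env_create(env);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_env_open(*env, "./example-db",
                     MDBX_ENV_DEFAULTS | MDBX_NOSTICKYTHREADS, 0644);
  if (rc != MDBX_SUCCESS)
    mdbx_env_close(*env);
  return rc;
}
```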
|
||||
|
||||
/** Transaction flags
|
||||
* \ingroup c_transactions
|
||||
* \anchor txn_flags
|
||||
* \see mdbx_txn_begin() \see mdbx_txn_flags() */
|
||||
enum MDBX_txn_flags_t {
|
||||
typedef enum MDBX_txn_flags {
|
||||
/** Start read-write transaction.
|
||||
*
|
||||
* Only one write transaction may be active at a time. Writes are fully
|
||||
@ -1627,18 +1584,14 @@ enum MDBX_txn_flags_t {
|
||||
* \note Transaction state flag. Returned from \ref mdbx_txn_flags()
|
||||
* but can't be used with \ref mdbx_txn_begin(). */
|
||||
MDBX_TXN_BLOCKED = MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
typedef enum MDBX_txn_flags_t MDBX_txn_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags_t)
|
||||
#endif
|
||||
} MDBX_txn_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags)
|
||||
|
||||
/** \brief Database flags
|
||||
* \ingroup c_dbi
|
||||
* \anchor db_flags
|
||||
* \see mdbx_dbi_open() */
|
||||
enum MDBX_db_flags_t {
|
||||
typedef enum MDBX_db_flags {
|
||||
/** Variable length unique keys with usual byte-by-byte string comparison. */
|
||||
MDBX_DB_DEFAULTS = 0,
|
||||
|
||||
@ -1681,19 +1634,14 @@ enum MDBX_db_flags_t {
|
||||
* sub-database will be opened with flags which it was created, and then an
|
||||
* application could determine the actual flags by \ref mdbx_dbi_flags(). */
|
||||
MDBX_DB_ACCEDE = MDBX_ACCEDE
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_dbi */
|
||||
typedef enum MDBX_db_flags_t MDBX_db_flags_t;
|
||||
#else
|
||||
} MDBX_db_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_db_flags_t)
|
||||
#endif
|
||||
|
||||
/** \brief Data changing flags
|
||||
* \ingroup c_crud
|
||||
* \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
|
||||
* \see mdbx_put() \see mdbx_cursor_put() \see mdbx_replace() */
|
||||
enum MDBX_put_flags_t {
|
||||
typedef enum MDBX_put_flags {
|
||||
/** Upsertion by default (without any other flags) */
|
||||
MDBX_UPSERT = 0,
|
||||
|
||||
@ -1731,18 +1679,13 @@ enum MDBX_put_flags_t {
|
||||
/** Only for \ref MDBX_DUPFIXED.
|
||||
* Store multiple data items in one call. */
|
||||
MDBX_MULTIPLE = UINT32_C(0x80000)
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_crud */
|
||||
typedef enum MDBX_put_flags_t MDBX_put_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags_t)
|
||||
#endif
|
||||
} MDBX_put_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags)
|
||||
|
||||
/** \brief Environment copy flags
|
||||
* \ingroup c_extra
|
||||
* \see mdbx_env_copy() \see mdbx_env_copy2fd() */
|
||||
enum MDBX_copy_flags_t {
|
||||
typedef enum MDBX_copy_flags {
|
||||
MDBX_CP_DEFAULTS = 0,
|
||||
|
||||
/** Copy with compactification: Omit free space from copy and renumber all
|
||||
@ -1751,19 +1694,14 @@ enum MDBX_copy_flags_t {
|
||||
|
||||
/** Force to make resizable copy, i.e. dynamic size instead of fixed */
|
||||
MDBX_CP_FORCE_DYNAMIC_SIZE = 2u
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_extra */
|
||||
typedef enum MDBX_copy_flags_t MDBX_copy_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags_t)
|
||||
#endif
|
||||
} MDBX_copy_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags)
|
||||
|
||||
/** \brief Cursor operations
|
||||
* \ingroup c_cursors
|
||||
* This is the set of all operations for retrieving data using a cursor.
|
||||
* \see mdbx_cursor_get() */
|
||||
enum MDBX_cursor_op {
|
||||
typedef enum MDBX_cursor_op {
|
||||
/** Position at first key/data item */
|
||||
MDBX_FIRST,
|
||||
|
||||
@ -1875,18 +1813,14 @@ enum MDBX_cursor_op {
|
||||
MDBX_TO_PAIR_EQUAL,
|
||||
MDBX_TO_PAIR_GREATER_OR_EQUAL,
|
||||
MDBX_TO_PAIR_GREATER_THAN
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_cursors */
|
||||
typedef enum MDBX_cursor_op MDBX_cursor_op;
|
||||
#endif
|
||||
} MDBX_cursor_op;
|
||||
|
||||
/** \brief Errors and return codes
|
||||
* \ingroup c_err
|
||||
*
|
||||
* BerkeleyDB uses -30800 to -30999, we'll go under them
|
||||
* \see mdbx_strerror() \see mdbx_strerror_r() \see mdbx_liberr2str() */
|
||||
enum MDBX_error_t {
|
||||
typedef enum MDBX_error {
|
||||
/** Successful result */
|
||||
MDBX_SUCCESS = 0,
|
||||
|
||||
@ -2062,11 +1996,7 @@ enum MDBX_error_t {
|
||||
MDBX_EREMOTE = ENOTBLK,
|
||||
MDBX_EDEADLK = EDEADLK
|
||||
#endif /* !Windows */
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_err */
|
||||
typedef enum MDBX_error_t MDBX_error_t;
|
||||
#endif
|
||||
} MDBX_error_t;
|
||||
|
||||
/** MDBX_MAP_RESIZED
|
||||
* \ingroup c_err
|
||||
@ -2158,7 +2088,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv);
|
||||
/** \brief MDBX environment extra runtime options.
|
||||
* \ingroup c_settings
|
||||
* \see mdbx_env_set_option() \see mdbx_env_get_option() */
|
||||
enum MDBX_option_t {
|
||||
typedef enum MDBX_option {
|
||||
/** \brief Controls the maximum number of named databases for the environment.
|
||||
*
|
||||
 * \details By default only the unnamed key-value database could be used and
|
||||
@ -2323,10 +2253,11 @@ enum MDBX_option_t {
|
||||
* \details This option controls the in-process threshold of minimum page
|
||||
* fill, as used space of percentage of a page. Neighbour pages emptier than
|
||||
* this value are candidates for merging. The threshold value is specified
|
||||
* in 1/65536 of percent, which is equivalent to the 16-dot-16 fixed point
|
||||
* format. The specified value must be in the range from 12.5% (almost empty)
|
||||
* to 50% (half empty) which corresponds to the range from 8192 and to 32768
|
||||
* in units respectively.
|
||||
* in 1/65536 points of a whole page, which is equivalent to the 16-dot-16
|
||||
* fixed point format.
|
||||
* The specified value must be in the range from 12.5% (almost empty page)
|
||||
* to 50% (half empty page) which corresponds to the range from 8192 and
|
||||
* to 32768 in units respectively.
|
||||
* \see MDBX_opt_prefer_waf_insteadof_balance */
|
||||
MDBX_opt_merge_threshold_16dot16_percent,
|
||||
|
||||
@ -2414,11 +2345,7 @@ enum MDBX_option_t {
|
||||
*
|
||||
* \see MDBX_opt_merge_threshold_16dot16_percent */
|
||||
MDBX_opt_prefer_waf_insteadof_balance
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_settings */
|
||||
typedef enum MDBX_option_t MDBX_option_t;
|
||||
#endif
|
||||
} MDBX_option_t;
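
As a worked example of the 16-dot-16 convention used by several options above: a page-fill percentage maps to `percent / 100 * 65536` units, so 12.5% is 8192 and 50% is 32768. A small C sketch applying it through `mdbx_env_set_option()` follows; the conversion helper is hypothetical, only the option name and the setter come from this header.

```c
#include <stdint.h>
#include "mdbx.h"

/* Convert a percentage into 16-dot-16 fixed-point units:
 * 12.5% -> 8192, 25% -> 16384, 50% -> 32768. */
static uint64_t percent_to_16dot16(double percent) {
  return (uint64_t)(percent / 100.0 * 65536.0 + 0.5);
}

/* Treat neighbour pages emptier than 25% as candidates for merging. */
static int set_merge_threshold(MDBX_env *env) {
  return mdbx_env_set_option(env, MDBX_opt_merge_threshold_16dot16_percent,
                             percent_to_16dot16(25.0));
}
```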
|
||||
|
||||
/** \brief Sets the value of an extra runtime option for an environment.
|
||||
* \ingroup c_settings
|
||||
@ -2533,7 +2460,7 @@ LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
/** \brief Deletion modes for \ref mdbx_env_delete().
|
||||
* \ingroup c_extra
|
||||
* \see mdbx_env_delete() */
|
||||
enum MDBX_env_delete_mode_t {
|
||||
typedef enum MDBX_env_delete_mode {
|
||||
/** \brief Just delete the environment's files and directory if any.
|
||||
* \note On POSIX systems, processes already working with the database will
|
||||
 * continue to work without interference until they close the environment.
|
||||
@ -2547,11 +2474,7 @@ enum MDBX_env_delete_mode_t {
|
||||
/** \brief Wait until other processes close the environment before deletion.
|
||||
*/
|
||||
MDBX_ENV_WAIT_FOR_UNUSED = 2,
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_extra */
|
||||
typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t;
|
||||
#endif
|
||||
} MDBX_env_delete_mode_t;
|
||||
|
||||
/** \brief Delete the environment's files in a proper and multiprocess-safe way.
|
||||
* \ingroup c_extra
|
||||
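
A hedged sketch of using the deletion modes above; it assumes the two-argument `mdbx_env_delete(pathname, mode)` form of the call, and the path is a placeholder.

```c
#include "mdbx.h"

/* Remove a database in a multiprocess-safe way, waiting for other
 * processes to close the environment first. */
static int drop_database(const char *pathname) {
  return mdbx_env_delete(pathname, MDBX_ENV_WAIT_FOR_UNUSED);
}
```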
@ -2662,7 +2585,7 @@ struct MDBX_stat {
|
||||
uint32_t ms_depth; /**< Depth (height) of the B-tree */
|
||||
uint64_t ms_branch_pages; /**< Number of internal (non-leaf) pages */
|
||||
uint64_t ms_leaf_pages; /**< Number of leaf pages */
|
||||
uint64_t ms_overflow_pages; /**< Number of overflow pages */
|
||||
uint64_t ms_overflow_pages; /**< Number of large/overflow pages */
|
||||
uint64_t ms_entries; /**< Number of data items */
|
||||
uint64_t ms_mod_txnid; /**< Transaction ID of committed last modification */
|
||||
};
|
||||
@ -3122,7 +3045,7 @@ LIBMDBX_API int mdbx_env_resurrect_after_fork(MDBX_env *env);
|
||||
* \ingroup c_settings
|
||||
* \anchor warmup_flags
|
||||
* \see mdbx_env_warmup() */
|
||||
enum MDBX_warmup_flags_t {
|
||||
typedef enum MDBX_warmup_flags {
|
||||
/** By default \ref mdbx_env_warmup() just asks the OS kernel to asynchronously
|
||||
* prefetch database pages. */
|
||||
MDBX_warmup_default = 0,
|
||||
@ -3165,12 +3088,8 @@ enum MDBX_warmup_flags_t {
|
||||
|
||||
/** Release the lock that was performed before by \ref MDBX_warmup_lock. */
|
||||
MDBX_warmup_release = 16,
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t)
|
||||
#endif
|
||||
} MDBX_warmup_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags)
|
||||
|
||||
/** \brief Warms up the database by loading pages into memory, optionally locking
 * them. \ingroup c_settings
|
||||
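
A sketch of the warmup call under stated assumptions: the parameter order (environment, optional transaction, flags, 16-dot-16 timeout) and the meaning of a zero timeout are assumptions of this sketch rather than quotes from this hunk; only MDBX_warmup_default and MDBX_warmup_lock appear in the enum above.

```c
#include "mdbx.h"

/* Prefetch database pages and try to lock them in memory.
 * The (env, txn, flags, timeout_seconds_16dot16) signature and the
 * zero timeout are assumptions of this sketch. */
static int warmup_and_pin(MDBX_env *env) {
  return mdbx_env_warmup(env, /* txn */ NULL, MDBX_warmup_lock,
                         /* timeout_seconds_16dot16 */ 0);
}
```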
@ -3564,7 +3483,7 @@ MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
|
||||
mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags);
|
||||
|
||||
/** \brief Returns maximal data size in bytes to fit in a leaf-page or
|
||||
* single overflow/large-page with the given page size and database flags,
|
||||
* single large/overflow-page with the given page size and database flags,
|
||||
* or -1 if pagesize is invalid.
|
||||
* \ingroup c_statinfo
|
||||
* \see db_flags */
|
||||
@ -3740,7 +3659,7 @@ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
|
||||
mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags);
|
||||
|
||||
/** \brief Returns maximal data size in bytes to fit in a leaf-page or
|
||||
* single overflow/large-page for specified database flags.
|
||||
* single large/overflow-page for specified database flags.
|
||||
* \ingroup c_statinfo
|
||||
*
|
||||
* \param [in] env An environment handle returned by \ref mdbx_env_create().
|
||||
@ -4578,7 +4497,7 @@ LIBMDBX_API int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi,
|
||||
/** \brief DBI state bits returned by \ref mdbx_dbi_flags_ex()
|
||||
* \ingroup c_statinfo
|
||||
* \see mdbx_dbi_flags_ex() */
|
||||
enum MDBX_dbi_state_t {
|
||||
typedef enum MDBX_dbi_state {
|
||||
/** DB was written in this txn */
|
||||
MDBX_DBI_DIRTY = 0x01,
|
||||
/** Cached Named-DB record is older than txnID */
|
||||
@ -4587,13 +4506,8 @@ enum MDBX_dbi_state_t {
|
||||
MDBX_DBI_FRESH = 0x04,
|
||||
/** Named-DB handle created in this txn */
|
||||
MDBX_DBI_CREAT = 0x08,
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_statinfo */
|
||||
typedef enum MDBX_dbi_state_t MDBX_dbi_state_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state_t)
|
||||
#endif
|
||||
} MDBX_dbi_state_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state)
|
||||
|
||||
/** \brief Retrieve the DB flags and status for a database handle.
|
||||
* \ingroup c_statinfo
|
||||
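
A short C sketch of querying the state bits above; it assumes the `mdbx_dbi_flags_ex(txn, dbi, &flags, &state)` form with two `unsigned` out-parameters, which should be treated as an assumption rather than a quotation of the prototype.

```c
#include <stdio.h>
#include "mdbx.h"

/* Report whether a named table handle was created or written inside the
 * given transaction, using the MDBX_dbi_state_t bits defined above. */
static void report_dbi_state(const MDBX_txn *txn, MDBX_dbi dbi) {
  unsigned flags = 0, state = 0;
  if (mdbx_dbi_flags_ex(txn, dbi, &flags, &state) == MDBX_SUCCESS) {
    if (state & MDBX_DBI_CREAT)
      printf("dbi %u: created in this txn\n", (unsigned)dbi);
    if (state & MDBX_DBI_DIRTY)
      printf("dbi %u: written in this txn\n", (unsigned)dbi);
  }
}
```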
@ -5005,6 +4919,7 @@ LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor,
|
||||
* \see mdbx_cursor_renew()
|
||||
* \see mdbx_cursor_bind()
|
||||
* \see mdbx_cursor_close()
|
||||
* \see mdbx_cursor_reset()
|
||||
*
|
||||
 * \note In contrast to LMDB, MDBX requires that any opened cursor can be
 * reused and must be freed explicitly, regardless of whether it was opened in a
|
||||
@ -5017,6 +4932,20 @@ LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor,
|
||||
* \returns A non-zero error value on failure and 0 on success. */
|
||||
LIBMDBX_API int mdbx_cursor_unbind(MDBX_cursor *cursor);
|
||||
|
||||
/** \brief Сбрасывает состояние курсора.
|
||||
* \ingroup c_cursors
|
||||
*
|
||||
* В результате сброса курсор становится неустановленным и не позволяет
|
||||
* выполнять операции относительного позиционирования, получения или изменения
|
||||
* данных, до установки на позицию не зависящую от текущей. Что позволяет
|
||||
* приложению пресекать дальнейшие операции без предварительного
|
||||
* позиционирования курсора.
|
||||
*
|
||||
* \param [in] cursor Указатель на курсор.
|
||||
*
|
||||
* \returns Результат операции сканирования, либо код ошибки. */
|
||||
LIBMDBX_API int mdbx_cursor_reset(MDBX_cursor *cursor);
|
||||
|
||||
/** \brief Create a cursor handle for the specified transaction and DBI handle.
|
||||
* \ingroup c_cursors
|
||||
*
|
||||
@ -5197,6 +5126,21 @@ LIBMDBX_API int mdbx_cursor_compare(const MDBX_cursor *left,
|
||||
LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key,
|
||||
MDBX_val *data, MDBX_cursor_op op);
|
||||
|
||||
/** \brief Служебная функция для использования в утилитах.
|
||||
* \ingroup c_extra
|
||||
*
|
||||
* При использовании определяемых пользователем функций сравнения (aka custom
|
||||
* comparison functions) проверка порядка ключей может приводить к неверным
|
||||
* результатам и возврате ошибки \ref MDBX_CORRUPTED.
|
||||
*
|
||||
* Эта функция отключает контроль порядка следования ключей на страницах при
|
||||
* чтении страниц БД для этого курсора, и таким образом, позволяет прочитать
|
||||
* данные при отсутствии/недоступности использованных функций сравнения.
|
||||
* \see avoid_custom_comparators
|
||||
*
|
||||
* \returns Результат операции сканирования, либо код ошибки. */
|
||||
LIBMDBX_API int mdbx_cursor_ignord(MDBX_cursor *cursor);
|
||||
|
||||
/** \brief Тип предикативных функций обратного вызова используемых
|
||||
* \ref mdbx_cursor_scan() и \ref mdbx_cursor_scan_from() для пробирования
|
||||
* пар ключ-значения.
|
||||
@ -5424,18 +5368,16 @@ LIBMDBX_API int mdbx_cursor_scan_from(MDBX_cursor *cursor,
|
||||
 * \param [in] limit The size of the pairs buffer as a number of items,
 * not as a number of pairs.
|
||||
* \param [in] op A cursor operation \ref MDBX_cursor_op (only
|
||||
* \ref MDBX_FIRST, \ref MDBX_NEXT, \ref MDBX_GET_CURRENT
|
||||
* are supported).
|
||||
* \ref MDBX_FIRST and \ref MDBX_NEXT are supported).
|
||||
*
|
||||
* \returns A non-zero error value on failure and 0 on success,
|
||||
* some possible errors are:
|
||||
* \retval MDBX_THREAD_MISMATCH Given transaction is not owned
|
||||
* by current thread.
|
||||
* \retval MDBX_NOTFOUND No more key-value pairs are available.
|
||||
 * \retval MDBX_NOTFOUND No key-value pairs are available.
|
||||
* \retval MDBX_ENODATA The cursor is already at the end of data.
|
||||
* \retval MDBX_RESULT_TRUE The specified limit is less than the available
|
||||
* key-value pairs on the current page/position
|
||||
* that the cursor points to.
|
||||
* \retval MDBX_RESULT_TRUE The returned chunk is the last one,
|
||||
* and there are no pairs left.
|
||||
* \retval MDBX_EINVAL An invalid parameter was specified. */
|
||||
LIBMDBX_API int mdbx_cursor_get_batch(MDBX_cursor *cursor, size_t *count,
|
||||
MDBX_val *pairs, size_t limit,
|
||||
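
To make the batch semantics above concrete, here is a hedged C sketch of draining a table with `mdbx_cursor_get_batch()`. It assumes the trailing `MDBX_cursor_op` parameter implied by the description, and it treats `MDBX_RESULT_TRUE` as "this chunk is the last one" per the \retval note; both points are this sketch's reading rather than a quotation.

```c
#include "mdbx.h"

/* Iterate a table in chunks: pairs[] is filled with alternating key/value
 * items, so `count` is a number of MDBX_val slots (two per key-value pair). */
static int walk_table(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_cursor *cursor = NULL;
  int rc = mdbx_cursor_open(txn, dbi, &cursor);
  if (rc != MDBX_SUCCESS)
    return rc;

  MDBX_val pairs[64];
  size_t count = 0;
  MDBX_cursor_op op = MDBX_FIRST;
  for (;;) {
    rc = mdbx_cursor_get_batch(cursor, &count, pairs,
                               sizeof(pairs) / sizeof(pairs[0]), op);
    if (rc != MDBX_SUCCESS && rc != MDBX_RESULT_TRUE)
      break;
    for (size_t i = 0; i + 1 < count; i += 2) {
      /* pairs[i] is a key, pairs[i + 1] is the corresponding value */
    }
    if (rc == MDBX_RESULT_TRUE) {
      rc = MDBX_SUCCESS; /* last chunk delivered, nothing left to fetch */
      break;
    }
    op = MDBX_NEXT;
  }

  mdbx_cursor_close(cursor);
  return (rc == MDBX_NOTFOUND || rc == MDBX_ENODATA) ? MDBX_SUCCESS : rc;
}
```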
@ -6166,7 +6108,7 @@ LIBMDBX_API int mdbx_preopen_snapinfoW(const wchar_t *pathname,
|
||||
* \note Данный API еще не зафиксирован, в последующих версиях могут быть
|
||||
* незначительные доработки и изменения.
|
||||
* \see mdbx_env_chk() */
|
||||
enum MDBX_chk_flags_t {
|
||||
typedef enum MDBX_chk_flags {
|
||||
/** Режим проверки по-умолчанию, в том числе в режиме только-чтения. */
|
||||
MDBX_CHK_DEFAULTS = 0,
|
||||
|
||||
@ -6184,18 +6126,13 @@ enum MDBX_chk_flags_t {
|
||||
* \note Требуется при проверке унаследованных БД созданных с использованием
|
||||
* нестандартных (пользовательских) функций сравнения ключей или значений. */
|
||||
MDBX_CHK_IGNORE_ORDER = 8
|
||||
};
|
||||
#ifndef __cplusplus
|
||||
/** \ingroup c_opening */
|
||||
typedef enum MDBX_chk_flags_t MDBX_chk_flags_t;
|
||||
#else
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags_t)
|
||||
#endif
|
||||
} MDBX_chk_flags_t;
|
||||
DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags)
|
||||
|
||||
/** \brief Уровни логирование/детализации информации,
|
||||
* поставляемой через обратные вызовы при проверке целостности базы данных.
|
||||
* \see mdbx_env_chk() */
|
||||
enum MDBX_chk_severity {
|
||||
typedef enum MDBX_chk_severity {
|
||||
MDBX_chk_severity_prio_shift = 4,
|
||||
MDBX_chk_severity_kind_mask = 0xF,
|
||||
MDBX_chk_fatal = 0x00u,
|
||||
@ -6209,25 +6146,25 @@ enum MDBX_chk_severity {
|
||||
MDBX_chk_verbose = 0x78u,
|
||||
MDBX_chk_details = 0x89u,
|
||||
MDBX_chk_extra = 0x9Au
|
||||
};
|
||||
} MDBX_chk_severity_t;
|
||||
|
||||
/** \brief Стадии проверки,
|
||||
* сообщаемые через обратные вызовы при проверке целостности базы данных.
|
||||
* \see mdbx_env_chk() */
|
||||
enum MDBX_chk_stage {
|
||||
typedef enum MDBX_chk_stage {
|
||||
MDBX_chk_none,
|
||||
MDBX_chk_init,
|
||||
MDBX_chk_lock,
|
||||
MDBX_chk_meta,
|
||||
MDBX_chk_traversal_tree,
|
||||
MDBX_chk_traversal_freedb,
|
||||
MDBX_chk_tree,
|
||||
MDBX_chk_gc,
|
||||
MDBX_chk_space,
|
||||
MDBX_chk_traversal_maindb,
|
||||
MDBX_chk_traversal_subdbs,
|
||||
MDBX_chk_maindb,
|
||||
MDBX_chk_subdbs,
|
||||
MDBX_chk_conclude,
|
||||
MDBX_chk_unlock,
|
||||
MDBX_chk_finalize
|
||||
};
|
||||
} MDBX_chk_stage_t;
|
||||
|
||||
/** \brief Виртуальная строка отчета, формируемого при проверке целостности базы
|
||||
* данных. \see mdbx_env_chk() */
|
||||
@ -6251,8 +6188,8 @@ typedef struct MDBX_chk_scope {
|
||||
MDBX_chk_issue_t *issues;
|
||||
struct MDBX_chk_internal *internal;
|
||||
const void *object;
|
||||
enum MDBX_chk_stage stage;
|
||||
enum MDBX_chk_severity verbosity;
|
||||
MDBX_chk_stage_t stage;
|
||||
MDBX_chk_severity_t verbosity;
|
||||
size_t subtotal_issues;
|
||||
union {
|
||||
void *ptr;
|
||||
@ -6373,11 +6310,11 @@ typedef struct MDBX_chk_callbacks {
|
||||
size_t entry_number, const MDBX_val *key,
|
||||
const MDBX_val *value);
|
||||
|
||||
int (*stage_begin)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage);
|
||||
int (*stage_end)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage, int err);
|
||||
int (*stage_begin)(MDBX_chk_context_t *ctx, MDBX_chk_stage_t);
|
||||
int (*stage_end)(MDBX_chk_context_t *ctx, MDBX_chk_stage_t, int err);
|
||||
|
||||
MDBX_chk_line_t *(*print_begin)(MDBX_chk_context_t *ctx,
|
||||
enum MDBX_chk_severity severity);
|
||||
MDBX_chk_severity_t severity);
|
||||
void (*print_flush)(MDBX_chk_line_t *);
|
||||
void (*print_done)(MDBX_chk_line_t *);
|
||||
void (*print_chars)(MDBX_chk_line_t *, const char *str, size_t len);
|
||||
@ -6417,8 +6354,8 @@ typedef struct MDBX_chk_callbacks {
|
||||
* \returns Нулевое значение в случае успеха, иначе код ошибки. */
|
||||
LIBMDBX_API int mdbx_env_chk(MDBX_env *env, const MDBX_chk_callbacks_t *cb,
|
||||
MDBX_chk_context_t *ctx,
|
||||
const enum MDBX_chk_flags_t flags,
|
||||
enum MDBX_chk_severity verbosity,
|
||||
const MDBX_chk_flags_t flags,
|
||||
MDBX_chk_severity_t verbosity,
|
||||
unsigned timeout_seconds_16dot16);
|
||||
|
||||
/** \brief Вспомогательная функция для подсчета проблем детектируемых
|
||||
|
41
mdbx.h++
41
mdbx.h++
@ -1,8 +1,8 @@
|
||||
/// \file mdbx.h++
|
||||
/// \brief The libmdbx C++ API header file.
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2020-2024
|
||||
///
|
||||
/// \author Copyright (c) 2020-2024, Leonid Yuriev <leo@yuriev.ru>.
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \file mdbx.h++
|
||||
/// \brief The libmdbx C++ API header file.
|
||||
///
|
||||
/// Tested with:
|
||||
/// - Elbrus LCC >= 1.23 (http://www.mcst.ru/lcc);
|
||||
@ -2329,14 +2329,16 @@ public:
|
||||
|
||||
buffer(const char *c_str, bool make_reference,
|
||||
const allocator_type &allocator = allocator_type())
|
||||
: buffer(::mdbx::slice(c_str), make_reference, allocator) {}
|
||||
: buffer(::mdbx::slice(c_str), make_reference, allocator){}
|
||||
|
||||
#if defined(DOXYGEN) || \
|
||||
(defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L)
|
||||
template <class CHAR, class T>
|
||||
buffer(const ::std::basic_string_view<CHAR, T> &view, bool make_reference,
|
||||
const allocator_type &allocator = allocator_type())
|
||||
: buffer(::mdbx::slice(view), make_reference, allocator) {}
|
||||
template <class CHAR, class T>
|
||||
buffer(const ::std::basic_string_view<CHAR, T> &view,
|
||||
bool make_reference,
|
||||
const allocator_type &allocator = allocator_type())
|
||||
: buffer(::mdbx::slice(view), make_reference, allocator) {
|
||||
}
|
||||
#endif /* __cpp_lib_string_view >= 201606L */
|
||||
|
||||
MDBX_CXX20_CONSTEXPR
|
||||
@ -2362,15 +2364,16 @@ public:
|
||||
|
||||
MDBX_CXX20_CONSTEXPR
|
||||
buffer(const char *c_str, const allocator_type &allocator = allocator_type())
|
||||
: buffer(::mdbx::slice(c_str), allocator) {}
|
||||
: buffer(::mdbx::slice(c_str), allocator){}
|
||||
|
||||
#if defined(DOXYGEN) || \
|
||||
(defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L)
|
||||
template <class CHAR, class T>
|
||||
MDBX_CXX20_CONSTEXPR
|
||||
buffer(const ::std::basic_string_view<CHAR, T> &view,
|
||||
const allocator_type &allocator = allocator_type())
|
||||
: buffer(::mdbx::slice(view), allocator) {}
|
||||
template <class CHAR, class T>
|
||||
MDBX_CXX20_CONSTEXPR
|
||||
buffer(const ::std::basic_string_view<CHAR, T> &view,
|
||||
const allocator_type &allocator = allocator_type())
|
||||
: buffer(::mdbx::slice(view), allocator) {
|
||||
}
|
||||
#endif /* __cpp_lib_string_view >= 201606L */
|
||||
|
||||
buffer(size_t head_room, size_t tail_room,
|
||||
@ -3819,17 +3822,17 @@ public:
|
||||
static inline size_t pairsize4page_max(const env &, value_mode);
|
||||
|
||||
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
|
||||
/// single overflow/large-page for specified size and database flags.
|
||||
/// single large/overflow-page for specified size and database flags.
|
||||
static inline size_t valsize4page_max(intptr_t pagesize,
|
||||
MDBX_db_flags_t flags);
|
||||
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
|
||||
/// single overflow/large-page for specified page size and values mode.
|
||||
/// single large/overflow-page for specified page size and values mode.
|
||||
static inline size_t valsize4page_max(intptr_t pagesize, value_mode);
|
||||
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
|
||||
/// single overflow/large-page for given environment and database flags.
|
||||
/// single large/overflow-page for given environment and database flags.
|
||||
static inline size_t valsize4page_max(const env &, MDBX_db_flags_t flags);
|
||||
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
|
||||
/// single overflow/large-page for specified page size and values mode.
|
||||
/// single large/overflow-page for specified page size and values mode.
|
||||
static inline size_t valsize4page_max(const env &, value_mode);
|
||||
|
||||
/// \brief Returns the maximal write transaction size (i.e. limit for
|
||||
|
@ -1,184 +0,0 @@
|
||||
cmake_minimum_required(VERSION 2.8.7)
|
||||
set(TARGET mdbx)
|
||||
project(${TARGET})
|
||||
|
||||
set(MDBX_VERSION_MAJOR 0)
|
||||
set(MDBX_VERSION_MINOR 3)
|
||||
set(MDBX_VERSION_RELEASE 1)
|
||||
set(MDBX_VERSION_REVISION 0)
|
||||
|
||||
set(MDBX_VERSION_STRING ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE})
|
||||
|
||||
enable_language(C)
|
||||
enable_language(CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED on)
|
||||
|
||||
add_definitions(-DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 -D_GNU_SOURCE=1)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
get_directory_property(hasParent PARENT_DIRECTORY)
|
||||
if(hasParent)
|
||||
set(STANDALONE_BUILD 0)
|
||||
else()
|
||||
set(STANDALONE_BUILD 1)
|
||||
enable_testing()
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES GNU)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g3")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wextra")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
|
||||
endif()
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wpointer-arith")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat-security")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Woverloaded-virtual")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wwrite-strings")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=20")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wunused-function -Wunused-variable -Wunused-value -Wmissing-declarations")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wcast-qual")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finline-functions-called-once")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-packed-bitfield-compat")
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g3")
|
||||
endif()
|
||||
|
||||
if (COVERAGE)
|
||||
if (NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
|
||||
message(FATAL_ERROR "Coverage requires -DCMAKE_BUILD_TYPE=Debug Current value=${CMAKE_BUILD_TYPE}")
|
||||
endif()
|
||||
|
||||
message(STATUS "Setting coverage compiler flags")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ggdb3 -O0 --coverage -fprofile-arcs -ftest-coverage")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -ggdb3 -O0 --coverage -fprofile-arcs -ftest-coverage")
|
||||
add_definitions(-DCOVERAGE_TEST)
|
||||
endif()
|
||||
|
||||
if (NOT TRAVIS)
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=leak -fstack-protector-strong -static-libasan")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(${TARGET}_SRC
|
||||
mdbx.h
|
||||
src/bits.h
|
||||
src/defs.h
|
||||
src/lck-linux.c
|
||||
src/mdbx.c
|
||||
src/osal.c
|
||||
src/osal.h
|
||||
src/version.c
|
||||
)
|
||||
|
||||
add_library(${TARGET}_STATIC STATIC
|
||||
${${TARGET}_SRC}
|
||||
)
|
||||
|
||||
add_library(${TARGET} ALIAS ${TARGET}_STATIC)
|
||||
|
||||
add_library(${TARGET}_SHARED SHARED
|
||||
${${TARGET}_SRC}
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET}_SHARED PROPERTIES
|
||||
VERSION ${MDBX_VERSION_STRING}
|
||||
SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}
|
||||
OUTPUT_NAME ${TARGET}
|
||||
CLEAN_DIRECT_OUTPUT 1
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET}_STATIC PROPERTIES
|
||||
VERSION ${MDBX_VERSION_STRING}
|
||||
SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}
|
||||
OUTPUT_NAME ${TARGET}
|
||||
CLEAN_DIRECT_OUTPUT 1
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET}_STATIC PUBLIC
|
||||
${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_include_directories(${TARGET}_SHARED PUBLIC
|
||||
${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
target_link_libraries(${TARGET}_STATIC ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET}_SHARED ${CMAKE_THREAD_LIBS_INIT})
|
||||
if(UNIX AND NOT APPLE)
|
||||
target_link_libraries(${TARGET}_STATIC rt)
|
||||
target_link_libraries(${TARGET}_SHARED rt)
|
||||
endif()
|
||||
|
||||
install(TARGETS ${TARGET}_STATIC DESTINATION ${CMAKE_INSTALL_PREFIX}/lib64 COMPONENT mdbx)
|
||||
install(TARGETS ${TARGET}_SHARED DESTINATION ${CMAKE_INSTALL_PREFIX}/lib64 COMPONENT mdbx)
|
||||
install(FILES mdbx.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include COMPONENT mdbx-devel)
|
||||
|
||||
add_subdirectory(src/tools)
|
||||
add_subdirectory(test)
|
||||
add_subdirectory(test/pcrf)
|
||||
add_subdirectory(tutorial)
|
||||
|
||||
##############################################################################
|
||||
|
||||
set(CPACK_GENERATOR "RPM")
|
||||
set(CPACK_RPM_COMPONENT_INSTALL ON)
|
||||
|
||||
# Version
|
||||
if (NOT "$ENV{BUILD_NUMBER}" STREQUAL "")
|
||||
set(CPACK_PACKAGE_RELEASE $ENV{BUILD_NUMBER})
|
||||
else()
|
||||
if (NOT "$ENV{CI_PIPELINE_ID}" STREQUAL "")
|
||||
set(CPACK_PACKAGE_RELEASE $ENV{CI_PIPELINE_ID})
|
||||
else()
|
||||
set(CPACK_PACKAGE_RELEASE 1)
|
||||
endif()
|
||||
endif()
|
||||
set(CPACK_RPM_PACKAGE_RELEASE ${CPACK_PACKAGE_RELEASE})
|
||||
|
||||
set(CPACK_PACKAGE_VERSION ${MDBX_VERSION_STRING})
|
||||
set(CPACK_PACKAGE_VERSION_FULL ${CPACK_PACKAGE_VERSION}-${CPACK_PACKAGE_RELEASE})
|
||||
|
||||
set(CPACK_RPM_mdbx-devel_PACKAGE_REQUIRES "mdbx = ${CPACK_PACKAGE_VERSION}")
|
||||
|
||||
set(CPACK_RPM_SPEC_INSTALL_POST "/bin/true")
|
||||
set(CPACK_RPM_mdbx_PACKAGE_NAME mdbx)
|
||||
set(CPACK_RPM_mdbx-devel_PACKAGE_NAME mdbx-devel)
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "The revised and extended descendant of Symas LMDB")
|
||||
|
||||
set(CPACK_PACKAGE_VENDOR "???")
|
||||
set(CPACK_PACKAGE_CONTACT "Vladimir Romanov")
|
||||
set(CPACK_PACKAGE_RELOCATABLE false)
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "")
|
||||
set(CPACK_RPM_PACKAGE_GROUP "Applications/Database")
|
||||
|
||||
set(CPACK_RPM_mdbx_FILE_NAME "${CPACK_RPM_mdbx_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_FULL}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm")
|
||||
set(CPACK_RPM_mdbx-devel_FILE_NAME "${CPACK_RPM_mdbx-devel_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_FULL}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm")
|
||||
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
|
||||
/usr/local
|
||||
/usr/local/bin
|
||||
/usr/local/lib64
|
||||
/usr/local/include
|
||||
/usr/local/man
|
||||
/usr/local/man/man1
|
||||
)
|
||||
|
||||
include(CPack)
|
@ -1,18 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
CONFIG=$1
|
||||
|
||||
if [[ -z "${CONFIG}" ]]; then
|
||||
CONFIG=Debug
|
||||
fi
|
||||
if [[ -r /opt/rh/devtoolset-6/enable ]]; then
|
||||
source /opt/rh/devtoolset-6/enable
|
||||
fi
|
||||
#rm -f -r build || true
|
||||
mkdir -p cmake-build-${CONFIG}
|
||||
pushd cmake-build-${CONFIG} &> /dev/null
|
||||
if [[ ! -r Makefile ]]; then
|
||||
cmake .. -DCMAKE_BUILD_TYPE=${CONFIG}
|
||||
fi
|
||||
make -j8 || exit 1
|
||||
popd &> /dev/null
|
@ -1,25 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
CONFIG=$1
|
||||
|
||||
if [[ -z "${CONFIG}" ]]; then
|
||||
CONFIG=Debug
|
||||
fi
|
||||
|
||||
DIRNAME=`dirname ${BASH_SOURCE[0]}`
|
||||
DIRNAME=`readlink --canonicalize ${DIRNAME}`
|
||||
|
||||
if [[ -r /opt/rh/devtoolset-6/enable ]]; then
|
||||
source /opt/rh/devtoolset-6/enable
|
||||
fi
|
||||
|
||||
mkdir -p cmake-build-${CONFIG}
|
||||
pushd cmake-build-${CONFIG} &> /dev/null
|
||||
if [[ ! -r Makefile ]]; then
|
||||
cmake .. -DCMAKE_BUILD_TYPE=${CONFIG}
|
||||
fi
|
||||
rm -f *.rpm
|
||||
make -j8 package || exit 1
|
||||
rm -f *-Unspecified.rpm
|
||||
popd &> /dev/null
|
67
src/alloy.c
@ -1,25 +1,52 @@
/*
 * Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#define xMDBX_ALLOY 1 /* alloyed build */
#include "internals.h" /* must be included first */

#include "core.c"
#include "osal.c"
#include "version.c"

#if defined(_WIN32) || defined(_WIN64)
#include "lck-windows.c"
#else
#include "api-cursor.c"
#include "api-env.c"
#include "api-extra.c"
#include "api-key-transform.c"
#include "api-txn.c"
#include "audit.c"
#include "chk.c"
#include "cogs.c"
#include "coherency.c"
#include "cold.c"
#include "copy.c"
#include "cursor.c"
#include "dbi.c"
#include "dpl.c"
#include "dxb.c"
#include "env-opts.c"
#include "env.c"
#include "gc-get.c"
#include "gc-put.c"
#include "global.c"
#include "lck-posix.c"
#endif
#include "lck-windows.c"
#include "lck.c"
#include "logging_and_debug.c"
#include "meta.c"
#include "misc.c"
#include "mvcc-readers.c"
#include "node.c"
#include "osal.c"
#include "page-get.c"
#include "page-iov.c"
#include "page-ops.c"
#include "page-search.c"
#include "pnl.c"
#include "range-estimate.c"
#include "refund.c"
#include "spill.c"
#include "subdb.c"
#include "tls.c"
#include "tree.c"
#include "txl.c"
#include "txn.c"
#include "utils.c"
#include "version.c"
#include "walk.c"
#include "windows-import.c"
797
src/api-cursor.c
Normal file
@ -0,0 +1,797 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
MDBX_cursor *mdbx_cursor_create(void *context) {
|
||||
cursor_couple_t *couple = osal_calloc(1, sizeof(cursor_couple_t));
|
||||
if (unlikely(!couple))
|
||||
return nullptr;
|
||||
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(couple, sizeof(cursor_couple_t));
|
||||
couple->outer.signature = cur_signature_ready4dispose;
|
||||
couple->outer.next = &couple->outer;
|
||||
couple->userctx = context;
|
||||
couple->outer.top_and_flags = z_poor_mark;
|
||||
couple->inner.cursor.top_and_flags = z_poor_mark | z_inner;
|
||||
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.backup,
|
||||
sizeof(couple->outer.backup));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.tree, sizeof(couple->outer.tree));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.clc, sizeof(couple->outer.clc));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.dbi_state,
|
||||
sizeof(couple->outer.dbi_state));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.subcur,
|
||||
sizeof(couple->outer.subcur));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.txn, sizeof(couple->outer.txn));
|
||||
return &couple->outer;
|
||||
}
|
||||
|
||||
int mdbx_cursor_renew(const MDBX_txn *txn, MDBX_cursor *mc) {
|
||||
return likely(mc)
|
||||
? mdbx_cursor_bind(txn, mc, (kvx_t *)mc->clc - txn->env->kvs)
|
||||
: MDBX_EINVAL;
|
||||
}
|
||||
|
||||
int mdbx_cursor_reset(MDBX_cursor *mc) {
|
||||
if (unlikely(!mc))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_ready4dispose &&
|
||||
mc->signature != cur_signature_live))
|
||||
return MDBX_EBADSIGN;
|
||||
|
||||
cursor_couple_t *couple = (cursor_couple_t *)mc;
|
||||
couple->outer.top_and_flags = z_poor_mark;
|
||||
couple->inner.cursor.top_and_flags = z_poor_mark | z_inner;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
|
||||
if (unlikely(!mc))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_ready4dispose &&
|
||||
mc->signature != cur_signature_live))
|
||||
return MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = dbi_check(txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(dbi == FREE_DBI && !(txn->flags & MDBX_TXN_RDONLY)))
|
||||
return MDBX_EACCESS;
|
||||
|
||||
if (unlikely(mc->backup)) /* Cursor from parent transaction */ {
|
||||
cASSERT(mc, mc->signature == cur_signature_live);
|
||||
if (unlikely(cursor_dbi(mc) != dbi ||
|
||||
/* paranoia */ mc->signature != cur_signature_live ||
|
||||
mc->txn != txn))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
cASSERT(mc, mc->tree == &txn->dbs[dbi]);
|
||||
cASSERT(mc, mc->clc == &txn->env->kvs[dbi].clc);
|
||||
cASSERT(mc, cursor_dbi(mc) == dbi);
|
||||
return likely(cursor_dbi(mc) == dbi &&
|
||||
/* paranoia */ mc->signature == cur_signature_live &&
|
||||
mc->txn == txn)
|
||||
? MDBX_SUCCESS
|
||||
: MDBX_EINVAL /* Disallow change DBI in nested transactions */;
|
||||
}
|
||||
|
||||
if (mc->signature == cur_signature_live) {
|
||||
rc = mdbx_cursor_unbind(mc);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
cASSERT(mc, mc->next == mc);
|
||||
|
||||
rc = cursor_init(mc, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
mc->next = txn->cursors[dbi];
|
||||
txn->cursors[dbi] = mc;
|
||||
return MDBX_SUCCESS;
|
||||
}
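The create/bind pair above lets a cursor be allocated once and re-attached to successive transactions instead of being re-created each time. A minimal sketch of that pattern, assuming `env` and `dbi` were opened elsewhere (error handling trimmed):

MDBX_cursor *cur = mdbx_cursor_create(NULL /* no user context */);
for (int i = 0; i < 3; ++i) {
  MDBX_txn *txn;
  if (mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS)
    break;
  if (mdbx_cursor_bind(txn, cur, dbi) == MDBX_SUCCESS) {
    MDBX_val key, val;
    while (mdbx_cursor_get(cur, &key, &val, MDBX_NEXT) == MDBX_SUCCESS) {
      /* ... consume key/val ... */
    }
    mdbx_cursor_unbind(cur); /* detach, but keep the allocation */
  }
  mdbx_txn_abort(txn);
}
mdbx_cursor_close(cur);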
|
||||
|
||||
int mdbx_cursor_unbind(MDBX_cursor *mc) {
|
||||
if (unlikely(!mc))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_SUCCESS
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
if (unlikely(mc->backup)) /* Cursor from parent transaction */
|
||||
return MDBX_EINVAL;
|
||||
|
||||
eASSERT(nullptr, mc->txn && mc->txn->signature == txn_signature);
|
||||
cASSERT(mc, mc->signature == cur_signature_live);
|
||||
cASSERT(mc, !mc->backup);
|
||||
if (unlikely(!mc->txn || mc->txn->signature != txn_signature)) {
|
||||
ERROR("Wrong cursor's transaction %p 0x%x",
|
||||
__Wpedantic_format_voidptr(mc->txn),
|
||||
mc->txn ? mc->txn->signature : 0);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
if (mc->next != mc) {
|
||||
const size_t dbi = (kvx_t *)mc->clc - mc->txn->env->kvs;
|
||||
cASSERT(mc, cursor_dbi(mc) == dbi);
|
||||
cASSERT(mc, dbi < mc->txn->n_dbi);
|
||||
if (dbi < mc->txn->n_dbi) {
|
||||
MDBX_cursor **prev = &mc->txn->cursors[dbi];
|
||||
while (*prev && *prev != mc)
|
||||
prev = &(*prev)->next;
|
||||
cASSERT(mc, *prev == mc);
|
||||
*prev = mc->next;
|
||||
}
|
||||
mc->next = mc;
|
||||
}
|
||||
mc->signature = cur_signature_ready4dispose;
|
||||
mc->flags = 0;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int mdbx_cursor_open(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) {
|
||||
if (unlikely(!ret))
|
||||
return MDBX_EINVAL;
|
||||
*ret = nullptr;
|
||||
|
||||
MDBX_cursor *const mc = mdbx_cursor_create(nullptr);
|
||||
if (unlikely(!mc))
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
int rc = mdbx_cursor_bind(txn, mc, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
mdbx_cursor_close(mc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
*ret = mc;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
void mdbx_cursor_close(MDBX_cursor *mc) {
|
||||
if (likely(mc)) {
|
||||
ENSURE(nullptr, mc->signature == cur_signature_live ||
|
||||
mc->signature == cur_signature_ready4dispose);
|
||||
MDBX_txn *const txn = mc->txn;
|
||||
if (!mc->backup) {
|
||||
mc->txn = nullptr;
|
||||
/* Unlink from txn, if tracked. */
|
||||
if (mc->next != mc) {
|
||||
ENSURE(txn->env, check_txn(txn, 0) == MDBX_SUCCESS);
|
||||
const size_t dbi = (kvx_t *)mc->clc - txn->env->kvs;
|
||||
tASSERT(txn, dbi < txn->n_dbi);
|
||||
if (dbi < txn->n_dbi) {
|
||||
MDBX_cursor **prev = &txn->cursors[dbi];
|
||||
while (*prev && *prev != mc)
|
||||
prev = &(*prev)->next;
|
||||
tASSERT(txn, *prev == mc);
|
||||
*prev = mc->next;
|
||||
}
|
||||
mc->next = mc;
|
||||
}
|
||||
mc->signature = 0;
|
||||
osal_free(mc);
|
||||
} else {
|
||||
/* Cursor closed before nested txn ends */
|
||||
tASSERT(txn, mc->signature == cur_signature_live);
|
||||
ENSURE(txn->env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
|
||||
mc->signature = cur_signature_wait4eot;
|
||||
}
|
||||
}
|
||||
}
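Taken together, open/get/close give the usual forward-iteration pattern; a minimal sketch assuming an already-opened environment and table (includes and error handling trimmed):

MDBX_txn *txn = NULL;
MDBX_cursor *cur = NULL;
int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn);
if (rc == MDBX_SUCCESS)
  rc = mdbx_cursor_open(txn, dbi, &cur);
if (rc == MDBX_SUCCESS) {
  MDBX_val key, data;
  /* MDBX_FIRST positions at the smallest key, MDBX_NEXT walks forward. */
  for (rc = mdbx_cursor_get(cur, &key, &data, MDBX_FIRST); rc == MDBX_SUCCESS;
       rc = mdbx_cursor_get(cur, &key, &data, MDBX_NEXT))
    printf("%.*s\n", (int)key.iov_len, (const char *)key.iov_base); /* assumes textual keys */
  if (rc == MDBX_NOTFOUND)
    rc = MDBX_SUCCESS; /* ordinary end of iteration */
}
mdbx_cursor_close(cur);
mdbx_txn_abort(txn);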
|
||||
|
||||
int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) {
|
||||
if (unlikely(!src))
|
||||
return MDBX_EINVAL;
|
||||
if (unlikely(src->signature != cur_signature_live))
|
||||
return (src->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = mdbx_cursor_bind(src->txn, dest, cursor_dbi(src));
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
assert(dest->tree == src->tree);
|
||||
assert(cursor_dbi(dest) == cursor_dbi(src));
|
||||
again:
|
||||
assert(dest->clc == src->clc);
|
||||
assert(dest->txn == src->txn);
|
||||
dest->top_and_flags = src->top_and_flags;
|
||||
for (intptr_t i = 0; i <= src->top; ++i) {
|
||||
dest->ki[i] = src->ki[i];
|
||||
dest->pg[i] = src->pg[i];
|
||||
}
|
||||
|
||||
if (src->subcur) {
|
||||
dest->subcur->nested_tree = src->subcur->nested_tree;
|
||||
src = &src->subcur->cursor;
|
||||
dest = &dest->subcur->cursor;
|
||||
goto again;
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind) {
|
||||
int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) {
|
||||
while (txn->cursors[i]) {
|
||||
MDBX_cursor *mc = txn->cursors[i];
|
||||
ENSURE(nullptr, mc->signature == cur_signature_live &&
|
||||
(mc->next != mc) && !mc->backup);
|
||||
rc = likely(rc < INT_MAX) ? rc + 1 : rc;
|
||||
txn->cursors[i] = mc->next;
|
||||
mc->next = mc;
|
||||
if (unbind) {
|
||||
mc->signature = cur_signature_ready4dispose;
|
||||
mc->flags = 0;
|
||||
} else {
|
||||
mc->signature = 0;
|
||||
osal_free(mc);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
eASSERT(nullptr, rc < 0);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r,
|
||||
bool ignore_multival) {
|
||||
const int incomparable = INT16_MAX + 1;
|
||||
if (unlikely(!l))
|
||||
return r ? -incomparable * 9 : 0;
|
||||
else if (unlikely(!r))
|
||||
return incomparable * 9;
|
||||
|
||||
if (unlikely(l->signature != cur_signature_live))
|
||||
return (r->signature == cur_signature_live) ? -incomparable * 8 : 0;
|
||||
if (unlikely(r->signature != cur_signature_live))
|
||||
return (l->signature == cur_signature_live) ? incomparable * 8 : 0;
|
||||
|
||||
if (unlikely(l->clc != r->clc)) {
|
||||
if (l->txn->env != r->txn->env)
|
||||
return (l->txn->env > r->txn->env) ? incomparable * 7 : -incomparable * 7;
|
||||
if (l->txn->txnid != r->txn->txnid)
|
||||
return (l->txn->txnid > r->txn->txnid) ? incomparable * 6
|
||||
: -incomparable * 6;
|
||||
return (l->clc > r->clc) ? incomparable * 5 : -incomparable * 5;
|
||||
}
|
||||
assert(cursor_dbi(l) == cursor_dbi(r));
|
||||
|
||||
int diff = is_pointed(l) - is_pointed(r);
|
||||
if (unlikely(diff))
|
||||
return (diff > 0) ? incomparable * 4 : -incomparable * 4;
|
||||
if (unlikely(!is_pointed(l)))
|
||||
return 0;
|
||||
|
||||
intptr_t detent = (l->top <= r->top) ? l->top : r->top;
|
||||
for (intptr_t i = 0; i <= detent; ++i) {
|
||||
diff = l->ki[i] - r->ki[i];
|
||||
if (diff)
|
||||
return diff;
|
||||
}
|
||||
if (unlikely(l->top != r->top))
|
||||
return (l->top > r->top) ? incomparable * 3 : -incomparable * 3;
|
||||
|
||||
assert((l->subcur != nullptr) == (r->subcur != nullptr));
|
||||
if (unlikely((l->subcur != nullptr) != (r->subcur != nullptr)))
|
||||
return l->subcur ? incomparable * 2 : -incomparable * 2;
|
||||
if (ignore_multival || !l->subcur)
|
||||
return 0;
|
||||
|
||||
#if MDBX_DEBUG
|
||||
if (is_pointed(&l->subcur->cursor)) {
|
||||
const page_t *mp = l->pg[l->top];
|
||||
const node_t *node = page_node(mp, l->ki[l->top]);
|
||||
assert(node_flags(node) & N_DUPDATA);
|
||||
}
|
||||
if (is_pointed(&r->subcur->cursor)) {
|
||||
const page_t *mp = r->pg[r->top];
|
||||
const node_t *node = page_node(mp, r->ki[r->top]);
|
||||
assert(node_flags(node) & N_DUPDATA);
|
||||
}
|
||||
#endif /* MDBX_DEBUG */
|
||||
|
||||
l = &l->subcur->cursor;
|
||||
r = &r->subcur->cursor;
|
||||
diff = is_pointed(l) - is_pointed(r);
|
||||
if (unlikely(diff))
|
||||
return (diff > 0) ? incomparable * 2 : -incomparable * 2;
|
||||
if (unlikely(!is_pointed(l)))
|
||||
return 0;
|
||||
|
||||
detent = (l->top <= r->top) ? l->top : r->top;
|
||||
for (intptr_t i = 0; i <= detent; ++i) {
|
||||
diff = l->ki[i] - r->ki[i];
|
||||
if (diff)
|
||||
return diff;
|
||||
}
|
||||
if (unlikely(l->top != r->top))
|
||||
return (l->top > r->top) ? incomparable : -incomparable;
|
||||
|
||||
return (l->flags & z_eof_hard) - (r->flags & z_eof_hard);
|
||||
}
|
||||
|
||||
/* Return the count of duplicate data items for the current key */
|
||||
int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(countp == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if ((*countp = is_filled(mc)) > 0) {
|
||||
if (!inner_hollow(mc)) {
|
||||
const page_t *mp = mc->pg[mc->top];
|
||||
const node_t *node = page_node(mp, mc->ki[mc->top]);
|
||||
cASSERT(mc, node_flags(node) & N_DUPDATA);
|
||||
*countp = unlikely(mc->subcur->nested_tree.items > PTRDIFF_MAX)
|
||||
? PTRDIFF_MAX
|
||||
: (size_t)mc->subcur->nested_tree.items;
|
||||
}
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
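A short sketch of the counter with an MDBX_DUPSORT table, assuming `cur` is bound to a live transaction (the key contents are illustrative):

MDBX_val key = {"user:42", 7}, data;
if (mdbx_cursor_get(cur, &key, &data, MDBX_SET_KEY) == MDBX_SUCCESS) {
  size_t values = 0;
  /* Number of duplicates under the current key, or 1 for a plain entry. */
  if (mdbx_cursor_count(cur, &values) == MDBX_SUCCESS)
    printf("key has %zu value(s)\n", values);
}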
|
||||
|
||||
int mdbx_cursor_on_first(const MDBX_cursor *mc) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
for (intptr_t i = 0; i <= mc->top; ++i) {
|
||||
if (mc->ki[i])
|
||||
return MDBX_RESULT_FALSE;
|
||||
}
|
||||
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
if (is_filled(mc) && mc->subcur) {
|
||||
mc = &mc->subcur->cursor;
|
||||
for (intptr_t i = 0; i <= mc->top; ++i) {
|
||||
if (mc->ki[i])
|
||||
return MDBX_RESULT_FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
int mdbx_cursor_on_last(const MDBX_cursor *mc) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
for (intptr_t i = 0; i <= mc->top; ++i) {
|
||||
size_t nkeys = page_numkeys(mc->pg[i]);
|
||||
if (mc->ki[i] < nkeys - 1)
|
||||
return MDBX_RESULT_FALSE;
|
||||
}
|
||||
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
if (is_filled(mc) && mc->subcur) {
|
||||
mc = &mc->subcur->cursor;
|
||||
for (intptr_t i = 0; i <= mc->top; ++i) {
|
||||
size_t nkeys = page_numkeys(mc->pg[i]);
|
||||
if (mc->ki[i] < nkeys - 1)
|
||||
return MDBX_RESULT_FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
int mdbx_cursor_eof(const MDBX_cursor *mc) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
return is_eof(mc) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE;
|
||||
}
|
||||
|
||||
int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
|
||||
MDBX_cursor_op op) {
|
||||
if (unlikely(mc == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(cursor_dbi_changed(mc)))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
return cursor_ops(mc, key, data, op);
|
||||
}
|
||||
|
||||
__hot static int scan_confinue(MDBX_cursor *mc, MDBX_predicate_func *predicate,
|
||||
void *context, void *arg, MDBX_val *key,
|
||||
MDBX_val *value, MDBX_cursor_op turn_op) {
|
||||
int rc;
|
||||
switch (turn_op) {
|
||||
case MDBX_NEXT:
|
||||
case MDBX_NEXT_NODUP:
|
||||
for (;;) {
|
||||
rc = predicate(context, key, value, arg);
|
||||
if (rc != MDBX_RESULT_FALSE)
|
||||
return rc;
|
||||
rc = outer_next(mc, key, value, turn_op);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
|
||||
}
|
||||
|
||||
case MDBX_PREV:
|
||||
case MDBX_PREV_NODUP:
|
||||
for (;;) {
|
||||
rc = predicate(context, key, value, arg);
|
||||
if (rc != MDBX_RESULT_FALSE)
|
||||
return rc;
|
||||
rc = outer_prev(mc, key, value, turn_op);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
|
||||
}
|
||||
|
||||
case MDBX_NEXT_DUP:
|
||||
if (mc->subcur)
|
||||
for (;;) {
|
||||
rc = predicate(context, key, value, arg);
|
||||
if (rc != MDBX_RESULT_FALSE)
|
||||
return rc;
|
||||
rc = inner_next(&mc->subcur->cursor, value);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
|
||||
}
|
||||
return MDBX_NOTFOUND;
|
||||
|
||||
case MDBX_PREV_DUP:
|
||||
if (mc->subcur)
|
||||
for (;;) {
|
||||
rc = predicate(context, key, value, arg);
|
||||
if (rc != MDBX_RESULT_FALSE)
|
||||
return rc;
|
||||
rc = inner_prev(&mc->subcur->cursor, value);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
|
||||
}
|
||||
return MDBX_NOTFOUND;
|
||||
|
||||
default:
|
||||
for (;;) {
|
||||
rc = predicate(context, key, value, arg);
|
||||
if (rc != MDBX_RESULT_FALSE)
|
||||
return rc;
|
||||
rc = cursor_ops(mc, key, value, turn_op);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate,
|
||||
void *context, MDBX_cursor_op start_op,
|
||||
MDBX_cursor_op turn_op, void *arg) {
|
||||
if (unlikely(!predicate))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
const unsigned valid_start_mask =
|
||||
1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST |
|
||||
1 << MDBX_LAST_DUP | 1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE;
|
||||
if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
const unsigned valid_turn_mask =
|
||||
1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP |
|
||||
1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP |
|
||||
1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE;
|
||||
if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
MDBX_val key = {nullptr, 0}, value = {nullptr, 0};
|
||||
int rc = mdbx_cursor_get(mc, &key, &value, start_op);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
return scan_confinue(mc, predicate, context, arg, &key, &value, turn_op);
|
||||
}
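A sketch of a predicate-driven scan; the callback's parameter order follows the predicate(context, key, value, arg) calls above (see MDBX_predicate_func in mdbx.h for the authoritative typedef):

/* Return MDBX_RESULT_TRUE to stop on the current pair, MDBX_RESULT_FALSE
 * to continue; any other value aborts the scan and is returned as-is. */
static int longer_than(void *context, MDBX_val *key, MDBX_val *value, void *arg) {
  (void)context;
  (void)key;
  return (value->iov_len > *(const size_t *)arg) ? MDBX_RESULT_TRUE
                                                 : MDBX_RESULT_FALSE;
}

/* ... with a cursor bound to a live transaction ... */
size_t threshold = 1024;
int rc = mdbx_cursor_scan(cur, longer_than, /* context */ NULL,
                          MDBX_FIRST, MDBX_NEXT, &threshold);
if (rc == MDBX_RESULT_TRUE)
  printf("found a value larger than %zu bytes\n", threshold);
else if (rc == MDBX_RESULT_FALSE)
  printf("no match\n");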
|
||||
|
||||
int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate,
|
||||
void *context, MDBX_cursor_op from_op, MDBX_val *key,
|
||||
MDBX_val *value, MDBX_cursor_op turn_op, void *arg) {
|
||||
if (unlikely(!predicate || !key))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
const unsigned valid_start_mask =
|
||||
1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY |
|
||||
1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND |
|
||||
1 << MDBX_SET_UPPERBOUND;
|
||||
if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN &&
|
||||
((1 << from_op) & valid_start_mask) == 0))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
const unsigned valid_turn_mask =
|
||||
1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP |
|
||||
1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP |
|
||||
1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE;
|
||||
if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
int rc = mdbx_cursor_get(mc, key, value, from_op);
|
||||
if (unlikely(MDBX_IS_ERROR(rc)))
|
||||
return rc;
|
||||
|
||||
cASSERT(mc, key != nullptr);
|
||||
MDBX_val stub;
|
||||
if (!value) {
|
||||
value = &stub;
|
||||
rc = cursor_ops(mc, key, value, MDBX_GET_CURRENT);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
return scan_confinue(mc, predicate, context, arg, key, value, turn_op);
|
||||
}
|
||||
|
||||
int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs,
|
||||
size_t limit, MDBX_cursor_op op) {
|
||||
if (unlikely(!count))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
*count = 0;
|
||||
if (unlikely(mc == nullptr || limit < 4 || limit > INTPTR_MAX - 2))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(cursor_dbi_changed(mc)))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
if (unlikely(mc->subcur))
|
||||
return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */;
|
||||
|
||||
switch (op) {
|
||||
case MDBX_NEXT:
|
||||
if (unlikely(is_eof(mc)))
|
||||
return is_pointed(mc) ? MDBX_NOTFOUND : MDBX_ENODATA;
|
||||
break;
|
||||
|
||||
case MDBX_FIRST:
|
||||
if (!is_filled(mc)) {
|
||||
rc = outer_first(mc, nullptr, nullptr);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
DEBUG("unhandled/unimplemented cursor operation %u", op);
|
||||
return MDBX_EINVAL;
|
||||
}
|
||||
|
||||
const page_t *mp = mc->pg[mc->top];
|
||||
size_t nkeys = page_numkeys(mp);
|
||||
size_t ki = mc->ki[mc->top];
|
||||
size_t n = 0;
|
||||
while (n + 2 <= limit) {
|
||||
cASSERT(mc, ki < nkeys);
|
||||
if (unlikely(ki >= nkeys))
|
||||
goto sibling;
|
||||
|
||||
const node_t *leaf = page_node(mp, ki);
|
||||
pairs[n] = get_key(leaf);
|
||||
rc = node_read(mc, leaf, &pairs[n + 1], mp);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
n += 2;
|
||||
if (++ki == nkeys) {
|
||||
sibling:
|
||||
rc = cursor_sibling_right(mc);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
if (rc == MDBX_NOTFOUND)
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
mp = mc->pg[mc->top];
|
||||
DEBUG("next page is %" PRIaPGNO ", key index %u", mp->pgno,
|
||||
mc->ki[mc->top]);
|
||||
if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) {
|
||||
ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
|
||||
mp->pgno, mp->flags);
|
||||
rc = MDBX_CORRUPTED;
|
||||
goto bailout;
|
||||
}
|
||||
nkeys = page_numkeys(mp);
|
||||
ki = 0;
|
||||
}
|
||||
}
|
||||
mc->ki[mc->top] = (indx_t)ki;
|
||||
|
||||
bailout:
|
||||
*count = n;
|
||||
return rc;
|
||||
}
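A usage sketch: the pairs[] array is filled with interleaved key/value slots, so `limit` should be even and at least 4; MDBX_RESULT_TRUE reports that the end of the table was reached with a (possibly partial) final batch:

MDBX_val pairs[64]; /* interleaved key/value slots, i.e. up to 32 pairs */
size_t got = 0;
int rc = mdbx_cursor_get_batch(cur, &got, pairs, 64, MDBX_FIRST);
while (rc == MDBX_SUCCESS || rc == MDBX_RESULT_TRUE) {
  for (size_t i = 0; i + 1 < got; i += 2) {
    /* pairs[i] is a key, pairs[i + 1] is its value */
  }
  if (rc == MDBX_RESULT_TRUE)
    break; /* nothing left to the right of the cursor */
  rc = mdbx_cursor_get_batch(cur, &got, pairs, 64, MDBX_NEXT);
}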
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) {
|
||||
if (unlikely(!mc))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_ready4dispose &&
|
||||
mc->signature != cur_signature_live))
|
||||
return MDBX_EBADSIGN;
|
||||
|
||||
cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer);
|
||||
couple->userctx = ctx;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) {
|
||||
if (unlikely(!mc))
|
||||
return nullptr;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_ready4dispose &&
|
||||
mc->signature != cur_signature_live))
|
||||
return nullptr;
|
||||
|
||||
cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer);
|
||||
return couple->userctx;
|
||||
}
|
||||
|
||||
MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) {
|
||||
if (unlikely(!mc || mc->signature != cur_signature_live))
|
||||
return nullptr;
|
||||
MDBX_txn *txn = mc->txn;
|
||||
if (unlikely(!txn || txn->signature != txn_signature))
|
||||
return nullptr;
|
||||
if (unlikely(txn->flags & MDBX_TXN_FINISHED))
|
||||
return nullptr;
|
||||
return txn;
|
||||
}
|
||||
|
||||
MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) {
|
||||
if (unlikely(!mc || mc->signature != cur_signature_live))
|
||||
return UINT_MAX;
|
||||
return cursor_dbi(mc);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
|
||||
MDBX_put_flags_t flags) {
|
||||
if (unlikely(mc == nullptr || key == nullptr || data == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn_rw(mc->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(cursor_dbi_changed(mc)))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
cASSERT(mc, cursor_is_tracked(mc));
|
||||
|
||||
/* Check this first so counter will always be zero on any early failures. */
|
||||
if (unlikely(flags & MDBX_MULTIPLE)) {
|
||||
if (unlikely(flags & MDBX_RESERVE))
|
||||
return MDBX_EINVAL;
|
||||
if (unlikely(!(mc->tree->flags & MDBX_DUPFIXED)))
|
||||
return MDBX_INCOMPATIBLE;
|
||||
const size_t dcount = data[1].iov_len;
|
||||
if (unlikely(dcount < 2 || data->iov_len == 0))
|
||||
return MDBX_BAD_VALSIZE;
|
||||
if (unlikely(mc->tree->dupfix_size != data->iov_len) &&
|
||||
mc->tree->dupfix_size)
|
||||
return MDBX_BAD_VALSIZE;
|
||||
if (unlikely(dcount >
|
||||
MAX_MAPSIZE / 2 /
|
||||
(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) - NODESIZE))) {
|
||||
/* checking for multiplication overflow */
|
||||
if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len))
|
||||
return MDBX_TOO_LARGE;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & MDBX_RESERVE) {
|
||||
if (unlikely(mc->tree->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP |
|
||||
MDBX_INTEGERDUP | MDBX_DUPFIXED)))
|
||||
return MDBX_INCOMPATIBLE;
|
||||
data->iov_base = nullptr;
|
||||
}
|
||||
|
||||
if (unlikely(mc->txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
|
||||
return (mc->txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
|
||||
|
||||
return cursor_put_checklen(mc, key, data, flags);
|
||||
}
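For MDBX_MULTIPLE the data argument is a two-element array, as the checks above show: data[0] describes one fixed-size element and points at the packed array, while data[1].iov_len carries the element count. A sketch assuming `cur` is bound to a write transaction on an MDBX_DUPFIXED table:

uint64_t values[4] = {1, 2, 3, 4};
MDBX_val key = {"counters", 8};
MDBX_val data[2];
data[0].iov_base = values;           /* start of the packed array */
data[0].iov_len = sizeof(values[0]); /* size of a single element */
data[1].iov_base = NULL;
data[1].iov_len = 4;                 /* number of elements to store */
int rc = mdbx_cursor_put(cur, &key, data, MDBX_MULTIPLE);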
|
||||
|
||||
int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
|
||||
if (unlikely(!mc))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn_rw(mc->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(cursor_dbi_changed(mc)))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
return cursor_del(mc, flags);
|
||||
}
|
||||
|
||||
__cold int mdbx_cursor_ignord(MDBX_cursor *mc) {
|
||||
if (unlikely(!mc))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(mc->signature != cur_signature_live))
|
||||
return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
mc->checking |= z_ignord;
|
||||
if (mc->subcur)
|
||||
mc->subcur->cursor.checking |= z_ignord;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
1399
src/api-env.c
Normal file
File diff suppressed because it is too large
117
src/api-extra.c
Normal file
@ -0,0 +1,117 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* Readers API */
|
||||
|
||||
__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func,
|
||||
void *ctx) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!func))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
int serial = 0;
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (likely(lck)) {
|
||||
const size_t snap_nreaders =
|
||||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||||
for (size_t i = 0; i < snap_nreaders; i++) {
|
||||
const reader_slot_t *r = lck->rdt + i;
|
||||
retry_reader:;
|
||||
const uint32_t pid = atomic_load32(&r->pid, mo_AcquireRelease);
|
||||
if (!pid)
|
||||
continue;
|
||||
txnid_t txnid = safe64_read(&r->txnid);
|
||||
const uint64_t tid = atomic_load64(&r->tid, mo_Relaxed);
|
||||
const pgno_t pages_used =
|
||||
atomic_load32(&r->snapshot_pages_used, mo_Relaxed);
|
||||
const uint64_t reader_pages_retired =
|
||||
atomic_load64(&r->snapshot_pages_retired, mo_Relaxed);
|
||||
if (unlikely(txnid != safe64_read(&r->txnid) ||
|
||||
pid != atomic_load32(&r->pid, mo_AcquireRelease) ||
|
||||
tid != atomic_load64(&r->tid, mo_Relaxed) ||
|
||||
pages_used !=
|
||||
atomic_load32(&r->snapshot_pages_used, mo_Relaxed) ||
|
||||
reader_pages_retired !=
|
||||
atomic_load64(&r->snapshot_pages_retired, mo_Relaxed)))
|
||||
goto retry_reader;
|
||||
|
||||
eASSERT(env, txnid > 0);
|
||||
if (txnid >= SAFE64_INVALID_THRESHOLD)
|
||||
txnid = 0;
|
||||
|
||||
size_t bytes_used = 0;
|
||||
size_t bytes_retained = 0;
|
||||
uint64_t lag = 0;
|
||||
if (txnid) {
|
||||
troika_t troika = meta_tap(env);
|
||||
retry_header:;
|
||||
const meta_ptr_t head = meta_recent(env, &troika);
|
||||
const uint64_t head_pages_retired =
|
||||
unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired);
|
||||
if (unlikely(meta_should_retry(env, &troika) ||
|
||||
head_pages_retired != unaligned_peek_u64_volatile(
|
||||
4, head.ptr_v->pages_retired)))
|
||||
goto retry_header;
|
||||
|
||||
lag = (head.txnid - txnid) / xMDBX_TXNID_STEP;
|
||||
bytes_used = pgno2bytes(env, pages_used);
|
||||
bytes_retained = (head_pages_retired > reader_pages_retired)
|
||||
? pgno2bytes(env, (pgno_t)(head_pages_retired -
|
||||
reader_pages_retired))
|
||||
: 0;
|
||||
}
|
||||
rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid),
|
||||
txnid, lag, bytes_used, bytes_retained);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
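A sketch of enumerating the reader slots; the callback's parameter names here are descriptive guesses matching the nine arguments passed to func() above (see MDBX_reader_list_func in mdbx.h for the authoritative typedef):

static int print_reader(void *ctx, int num, int slot, mdbx_pid_t pid,
                        mdbx_tid_t thread, uint64_t txnid, uint64_t lag,
                        size_t bytes_used, size_t bytes_retained) {
  (void)ctx; (void)slot; (void)thread;
  printf("#%d pid=%u txnid=%llu lag=%llu used=%zu retained=%zu\n", num,
         (unsigned)pid, (unsigned long long)txnid, (unsigned long long)lag,
         bytes_used, bytes_retained);
  return MDBX_SUCCESS; /* any other value stops the enumeration */
}

/* ... MDBX_RESULT_TRUE from mdbx_reader_list() means no readers at all */
int rc = mdbx_reader_list(env, print_reader, NULL);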
|
||||
|
||||
__cold int mdbx_reader_check(MDBX_env *env, int *dead) {
|
||||
if (dead)
|
||||
*dead = 0;
|
||||
return mvcc_cleanup_dead(env, false, dead);
|
||||
}
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* Locking API */
|
||||
|
||||
int mdbx_txn_lock(MDBX_env *env, bool dont_wait) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(env->flags & MDBX_RDONLY))
|
||||
return MDBX_EACCESS;
|
||||
if (unlikely(env->basal_txn->owner ||
|
||||
(env->basal_txn->flags & MDBX_TXN_FINISHED) == 0))
|
||||
return MDBX_BUSY;
|
||||
|
||||
return lck_txn_lock(env, dont_wait);
|
||||
}
|
||||
|
||||
int mdbx_txn_unlock(MDBX_env *env) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(env->flags & MDBX_RDONLY))
|
||||
return MDBX_EACCESS;
|
||||
if (unlikely(env->basal_txn->owner != osal_thread_self()))
|
||||
return MDBX_THREAD_MISMATCH;
|
||||
if (unlikely((env->basal_txn->flags & MDBX_TXN_FINISHED) == 0))
|
||||
return MDBX_BUSY;
|
||||
|
||||
lck_txn_unlock(env);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
225
src/api-key-transform.c
Normal file
@ -0,0 +1,225 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static inline double key2double(const int64_t key) {
|
||||
union {
|
||||
uint64_t u;
|
||||
double f;
|
||||
} casting;
|
||||
|
||||
casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000)
|
||||
: UINT64_C(0xffffFFFFffffFFFF) - key;
|
||||
return casting.f;
|
||||
}
|
||||
|
||||
static inline uint64_t double2key(const double *const ptr) {
|
||||
STATIC_ASSERT(sizeof(double) == sizeof(int64_t));
|
||||
const int64_t i = *(const int64_t *)ptr;
|
||||
const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i
|
||||
: i + UINT64_C(0x8000000000000000);
|
||||
if (ASSERT_ENABLED()) {
|
||||
const double f = key2double(u);
|
||||
assert(memcmp(&f, ptr, sizeof(double)) == 0);
|
||||
}
|
||||
return u;
|
||||
}
|
||||
|
||||
static inline float key2float(const int32_t key) {
|
||||
union {
|
||||
uint32_t u;
|
||||
float f;
|
||||
} casting;
|
||||
|
||||
casting.u =
|
||||
(key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key;
|
||||
return casting.f;
|
||||
}
|
||||
|
||||
static inline uint32_t float2key(const float *const ptr) {
|
||||
STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
|
||||
const int32_t i = *(const int32_t *)ptr;
|
||||
const uint32_t u =
|
||||
(i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000);
|
||||
if (ASSERT_ENABLED()) {
|
||||
const float f = key2float(u);
|
||||
assert(memcmp(&f, ptr, sizeof(float)) == 0);
|
||||
}
|
||||
return u;
|
||||
}
|
||||
|
||||
uint64_t mdbx_key_from_double(const double ieee754_64bit) {
|
||||
return double2key(&ieee754_64bit);
|
||||
}
|
||||
|
||||
uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) {
|
||||
return double2key(ieee754_64bit);
|
||||
}
|
||||
|
||||
uint32_t mdbx_key_from_float(const float ieee754_32bit) {
|
||||
return float2key(&ieee754_32bit);
|
||||
}
|
||||
|
||||
uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) {
|
||||
return float2key(ieee754_32bit);
|
||||
}
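These transforms produce unsigned keys whose integer (and memcmp) order matches the numeric order of the original values, which is what MDBX_INTEGERKEY tables expect. An illustrative check:

uint64_t a = mdbx_key_from_double(-1.5);
uint64_t b = mdbx_key_from_double(0.0);
uint64_t c = mdbx_key_from_double(2.25);
assert(a < b && b < c); /* ordering of keys mirrors ordering of doubles */

MDBX_val key = {&b, sizeof(b)};
/* ... usable with a table opened with MDBX_INTEGERKEY, e.g.
 *     mdbx_put(txn, dbi, &key, &value, MDBX_UPSERT); */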
|
||||
|
||||
#define IEEE754_DOUBLE_MANTISSA_SIZE 52
|
||||
#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF
|
||||
#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF
|
||||
#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000)
|
||||
#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF)
|
||||
#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF)
|
||||
|
||||
static inline int clz64(uint64_t value) {
|
||||
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl)
|
||||
if (sizeof(value) == sizeof(int))
|
||||
return __builtin_clz(value);
|
||||
if (sizeof(value) == sizeof(long))
|
||||
return __builtin_clzl(value);
|
||||
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \
|
||||
__has_builtin(__builtin_clzll)
|
||||
return __builtin_clzll(value);
|
||||
#endif /* have(long long) && long long == uint64_t */
|
||||
#endif /* GNU C */
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
unsigned long index;
|
||||
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
|
||||
_BitScanReverse64(&index, value);
|
||||
return 63 - index;
|
||||
#else
|
||||
if (value > UINT32_MAX) {
|
||||
_BitScanReverse(&index, (uint32_t)(value >> 32));
|
||||
return 31 - index;
|
||||
}
|
||||
_BitScanReverse(&index, (uint32_t)value);
|
||||
return 63 - index;
|
||||
#endif
|
||||
#endif /* MSVC */
|
||||
|
||||
value |= value >> 1;
|
||||
value |= value >> 2;
|
||||
value |= value >> 4;
|
||||
value |= value >> 8;
|
||||
value |= value >> 16;
|
||||
value |= value >> 32;
|
||||
static const uint8_t debruijn_clz64[64] = {
|
||||
63, 16, 62, 7, 15, 36, 61, 3, 6, 14, 22, 26, 35, 47, 60, 2,
|
||||
9, 5, 28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1,
|
||||
17, 8, 37, 4, 23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18,
|
||||
38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0};
|
||||
return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58];
|
||||
}
|
||||
|
||||
static inline uint64_t round_mantissa(const uint64_t u64, int shift) {
|
||||
assert(shift < 0 && u64 > 0);
|
||||
shift = -shift;
|
||||
const unsigned half = 1 << (shift - 1);
|
||||
const unsigned lsb = 1 & (unsigned)(u64 >> shift);
|
||||
const unsigned tie2even = 1 ^ lsb;
|
||||
return (u64 + half - tie2even) >> shift;
|
||||
}
|
||||
|
||||
uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
|
||||
const uint64_t bias = UINT64_C(0x8000000000000000);
|
||||
if (json_integer > 0) {
|
||||
const uint64_t u64 = json_integer;
|
||||
int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
|
||||
uint64_t mantissa = u64 << shift;
|
||||
if (unlikely(shift < 0)) {
|
||||
mantissa = round_mantissa(u64, shift);
|
||||
if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
|
||||
mantissa = round_mantissa(u64, --shift);
|
||||
}
|
||||
|
||||
assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
|
||||
mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
|
||||
const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS +
|
||||
IEEE754_DOUBLE_MANTISSA_SIZE - shift;
|
||||
assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
|
||||
const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) +
|
||||
(mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
|
||||
#if !defined(_MSC_VER) || \
|
||||
defined( \
|
||||
_DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
|
||||
symbol __except1 referenced in function __ftol3_except */
|
||||
assert(key == mdbx_key_from_double((double)json_integer));
|
||||
#endif /* Workaround for MSVC */
|
||||
return key;
|
||||
}
|
||||
|
||||
if (json_integer < 0) {
|
||||
const uint64_t u64 = -json_integer;
|
||||
int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
|
||||
uint64_t mantissa = u64 << shift;
|
||||
if (unlikely(shift < 0)) {
|
||||
mantissa = round_mantissa(u64, shift);
|
||||
if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
|
||||
mantissa = round_mantissa(u64, --shift);
|
||||
}
|
||||
|
||||
assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
|
||||
mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
|
||||
const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS +
|
||||
IEEE754_DOUBLE_MANTISSA_SIZE - shift;
|
||||
assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
|
||||
const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) -
|
||||
(mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
|
||||
#if !defined(_MSC_VER) || \
|
||||
defined( \
|
||||
_DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
|
||||
symbol __except1 referenced in function __ftol3_except */
|
||||
assert(key == mdbx_key_from_double((double)json_integer));
|
||||
#endif /* Workaround for MSVC */
|
||||
return key;
|
||||
}
|
||||
|
||||
return bias;
|
||||
}
|
||||
|
||||
int64_t mdbx_jsonInteger_from_key(const MDBX_val v) {
|
||||
assert(v.iov_len == 8);
|
||||
const uint64_t key = unaligned_peek_u64(2, v.iov_base);
|
||||
const uint64_t bias = UINT64_C(0x8000000000000000);
|
||||
const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1;
|
||||
const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 -
|
||||
(IEEE754_DOUBLE_EXPONENTA_MAX &
|
||||
(int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE));
|
||||
if (unlikely(shift < 1))
|
||||
return (key < bias) ? INT64_MIN : INT64_MAX;
|
||||
if (unlikely(shift > 63))
|
||||
return 0;
|
||||
|
||||
const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK)
|
||||
<< (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) +
|
||||
bias;
|
||||
const int64_t absolute = unscaled >> shift;
|
||||
const int64_t value = (key < bias) ? -absolute : absolute;
|
||||
assert(key == mdbx_key_from_jsonInteger(value) ||
|
||||
(mdbx_key_from_jsonInteger(value - 1) < key &&
|
||||
key < mdbx_key_from_jsonInteger(value + 1)));
|
||||
return value;
|
||||
}
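The conversion is exact for integers up to 2^53 and rounds (while preserving order) beyond that, staying consistent with mdbx_key_from_double(). A round-trip sketch:

const int64_t small = 42, big = (INT64_C(1) << 60) + 1;
uint64_t k_small = mdbx_key_from_jsonInteger(small);
uint64_t k_big = mdbx_key_from_jsonInteger(big);
assert(k_small < k_big);
assert(k_small == mdbx_key_from_double(42.0)); /* exact below 2^53 */

MDBX_val v = {&k_big, sizeof(k_big)};
int64_t back = mdbx_jsonInteger_from_key(v);
/* `back` equals `big` up to the double rounding asserted in the code above */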
|
||||
|
||||
double mdbx_double_from_key(const MDBX_val v) {
|
||||
assert(v.iov_len == 8);
|
||||
return key2double(unaligned_peek_u64(2, v.iov_base));
|
||||
}
|
||||
|
||||
float mdbx_float_from_key(const MDBX_val v) {
|
||||
assert(v.iov_len == 4);
|
||||
return key2float(unaligned_peek_u32(2, v.iov_base));
|
||||
}
|
||||
|
||||
int32_t mdbx_int32_from_key(const MDBX_val v) {
|
||||
assert(v.iov_len == 4);
|
||||
return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000));
|
||||
}
|
||||
|
||||
int64_t mdbx_int64_from_key(const MDBX_val v) {
|
||||
assert(v.iov_len == 8);
|
||||
return (int64_t)(unaligned_peek_u64(2, v.iov_base) -
|
||||
UINT64_C(0x8000000000000000));
|
||||
}
|
508
src/api-txn.c
Normal file
@ -0,0 +1,508 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
#ifdef __SANITIZE_THREAD__
|
||||
/* LY: avoid tsan-trap by txn, mm_last_pg and geo.first_unallocated */
|
||||
__attribute__((__no_sanitize_thread__, __noinline__))
|
||||
#endif
|
||||
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
|
||||
{
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc > 0) ? -rc : rc;
|
||||
|
||||
MDBX_env *env = txn->env;
|
||||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) {
|
||||
if (percent)
|
||||
*percent = (int)((txn->geo.first_unallocated * UINT64_C(100) +
|
||||
txn->geo.end_pgno / 2) /
|
||||
txn->geo.end_pgno);
|
||||
return 0;
|
||||
}
|
||||
|
||||
txnid_t lag;
|
||||
troika_t troika = meta_tap(env);
|
||||
do {
|
||||
const meta_ptr_t head = meta_recent(env, &troika);
|
||||
if (percent) {
|
||||
const pgno_t maxpg = head.ptr_v->geometry.now;
|
||||
*percent = (int)((head.ptr_v->geometry.first_unallocated * UINT64_C(100) +
|
||||
maxpg / 2) /
|
||||
maxpg);
|
||||
}
|
||||
lag = (head.txnid - txn->txnid) / xMDBX_TXNID_STEP;
|
||||
} while (unlikely(meta_should_retry(env, &troika)));
|
||||
|
||||
return (lag > INT_MAX) ? INT_MAX : (int)lag;
|
||||
}
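A sketch of polling a long-lived read transaction, assuming `ro_txn` is such a transaction; negative results are negated error codes per the convention above:

int percent = 0;
int lag = mdbx_txn_straggler(ro_txn, &percent);
if (lag >= 0)
  printf("reader is %d transaction(s) behind, %d%% of the geometry is used\n",
         lag, percent);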
|
||||
|
||||
__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi,
|
||||
uint32_t *mask) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!mask))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
if ((cx.outer.tree->flags & MDBX_DUPSORT) == 0)
|
||||
return MDBX_RESULT_TRUE;
|
||||
|
||||
MDBX_val key, data;
|
||||
rc = outer_first(&cx.outer, &key, &data);
|
||||
*mask = 0;
|
||||
while (rc == MDBX_SUCCESS) {
|
||||
const node_t *node =
|
||||
page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
|
||||
const tree_t *db = node_data(node);
|
||||
const unsigned flags = node_flags(node);
|
||||
switch (flags) {
|
||||
case N_BIGDATA:
|
||||
case 0:
|
||||
/* single-value entry, deep = 0 */
|
||||
*mask |= 1 << 0;
|
||||
break;
|
||||
case N_DUPDATA:
|
||||
/* single sub-page, deep = 1 */
|
||||
*mask |= 1 << 1;
|
||||
break;
|
||||
case N_DUPDATA | N_SUBDATA:
|
||||
/* sub-tree */
|
||||
*mask |= 1 << UNALIGNED_PEEK_16(db, tree_t, height);
|
||||
break;
|
||||
default:
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid node-size", flags);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP);
|
||||
}
|
||||
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
|
||||
}
|
||||
|
||||
int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(canary == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
*canary = txn->canary;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
|
||||
MDBX_val *data) {
|
||||
DKBUF_DEBUG;
|
||||
DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));
|
||||
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!key || !data))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
return cursor_seek(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err;
|
||||
}
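The simplest point lookup, assuming `txn` and `dbi` are valid (the key contents are illustrative):

MDBX_val key = {"answer", 6}, data;
int rc = mdbx_get(txn, dbi, &key, &data);
if (rc == MDBX_SUCCESS)
  printf("%.*s\n", (int)data.iov_len, (const char *)data.iov_base);
else if (rc == MDBX_NOTFOUND)
  puts("no such key");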
|
||||
|
||||
int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
|
||||
MDBX_val *data) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!key || !data))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(txn->flags & MDBX_TXN_BLOCKED))
|
||||
return MDBX_BAD_TXN;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
return cursor_ops(&cx.outer, key, data, MDBX_SET_LOWERBOUND);
|
||||
}
|
||||
|
||||
int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
|
||||
MDBX_val *data, size_t *values_count) {
|
||||
DKBUF_DEBUG;
|
||||
DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));
|
||||
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!key || !data))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = cursor_seek(&cx.outer, key, data, MDBX_SET_KEY).err;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (values_count)
|
||||
*values_count = 0;
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (values_count) {
|
||||
*values_count = 1;
|
||||
if (inner_pointed(&cx.outer))
|
||||
*values_count =
|
||||
(sizeof(*values_count) >= sizeof(cx.inner.nested_tree.items) ||
|
||||
cx.inner.nested_tree.items <= PTRDIFF_MAX)
|
||||
? (size_t)cx.inner.nested_tree.items
|
||||
: PTRDIFF_MAX;
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) {
|
||||
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (likely(canary)) {
|
||||
if (txn->canary.x == canary->x && txn->canary.y == canary->y &&
|
||||
txn->canary.z == canary->z)
|
||||
return MDBX_SUCCESS;
|
||||
txn->canary.x = canary->x;
|
||||
txn->canary.y = canary->y;
|
||||
txn->canary.z = canary->z;
|
||||
}
|
||||
txn->canary.v = txn->txnid;
|
||||
txn->flags |= MDBX_TXN_DIRTY;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
/* Reports whether the given address lies inside a "dirty" page of the
 * specified write transaction. Ultimately this allows avoiding needless
 * copying of data from non-dirty pages.
 *
 * "Dirty" pages are those already modified during the write transaction,
 * so any further change may overwrite them. Therefore functions that
 * perform modifications must NOT receive pointers to data located in such
 * pages as arguments. Non-dirty pages, in turn, are copied before being
 * modified.
 *
 * In other words, data from "dirty" pages must either be copied before
 * being passed as arguments for further modifications, or be rejected at
 * the argument-validation stage.
 *
 * Thus the function makes it possible both to avoid needless copying and
 * to perform a more thorough validation of arguments.
 *
 * IMPORTANT: The pointer passed in must point to the beginning of the
 * data. Only then is it guaranteed that the actual page header is
 * physically located within the same memory page, including for
 * multi-page P_LARGE pages holding long data. */
|
||||
int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
const MDBX_env *env = txn->env;
|
||||
const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base);
|
||||
if (offset >= 0) {
|
||||
const pgno_t pgno = bytes2pgno(env, offset);
|
||||
if (likely(pgno < txn->geo.first_unallocated)) {
|
||||
const page_t *page = pgno2page(env, pgno);
|
||||
if (unlikely(page->pgno != pgno || (page->flags & P_ILL_BITS) != 0)) {
|
||||
/* The ptr pointed into middle of a large page,
|
||||
* not to the beginning of a data. */
|
||||
return MDBX_EINVAL;
|
||||
}
|
||||
return ((txn->flags & MDBX_TXN_RDONLY) || !is_modifable(txn, page))
|
||||
? MDBX_RESULT_FALSE
|
||||
: MDBX_RESULT_TRUE;
|
||||
}
|
||||
if ((size_t)offset < env->dxb_mmap.limit) {
|
||||
/* The pointer addresses something within the mmap but beyond the
 * allocated pages. This can happen if mdbx_is_dirty() is called after
 * an operation during which a dirty page was returned to the
 * unallocated space. */
|
||||
return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
/* The page is outside the used mmap range, i.e. either an invalid address
* was passed to the function, or the address is in a shadow page that was
* allocated via malloc().
*
* In MDBX_WRITE_MAP mode such a page is definitely "not dirty", while in
* modes without MDBX_WRITE_MAP it is definitely "not clean". */
|
||||
return (txn->flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL
|
||||
: MDBX_RESULT_TRUE;
|
||||
}
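/* A minimal usage sketch (not part of this commit): it assumes an already
 * started write transaction `txn` and a value `v` previously obtained from
 * the database; the helper name `copy_if_dirty` and the caller-supplied
 * buffer are hypothetical. It shows the pattern described above: snapshot
 * data that lives in a dirty page before feeding it back into a modifying
 * call, and leave data from not-dirty pages as-is. */
#if 0 /* illustration only */
static int copy_if_dirty(const MDBX_txn *txn, MDBX_val *v, void *buf,
                         size_t buf_len) {
  const int rc = mdbx_is_dirty(txn, v->iov_base);
  if (rc == MDBX_RESULT_TRUE) {
    /* the value lives in a dirty page: copy it out before further updates */
    if (v->iov_len > buf_len)
      return MDBX_EINVAL;
    memcpy(buf, v->iov_base, v->iov_len);
    v->iov_base = buf;
    return MDBX_SUCCESS;
  }
  return (rc == MDBX_RESULT_FALSE) ? MDBX_SUCCESS : rc;
}
#endif /* illustration only */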
|
||||
|
||||
int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
|
||||
const MDBX_val *data) {
|
||||
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!key))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(dbi <= FREE_DBI))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
|
||||
return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
MDBX_val proxy;
|
||||
MDBX_cursor_op op = MDBX_SET;
|
||||
unsigned flags = MDBX_ALLDUPS;
|
||||
if (data) {
|
||||
proxy = *data;
|
||||
data = &proxy;
|
||||
op = MDBX_GET_BOTH;
|
||||
flags = 0;
|
||||
}
|
||||
rc = cursor_seek(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
cx.outer.next = txn->cursors[dbi];
|
||||
txn->cursors[dbi] = &cx.outer;
|
||||
rc = cursor_del(&cx.outer, flags);
|
||||
txn->cursors[dbi] = cx.outer.next;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
|
||||
MDBX_put_flags_t flags) {
|
||||
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!key || !data))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(dbi <= FREE_DBI))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP |
MDBX_CURRENT | MDBX_MULTIPLE)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
|
||||
return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
cx.outer.next = txn->cursors[dbi];
|
||||
txn->cursors[dbi] = &cx.outer;
|
||||
|
||||
/* LY: support for update (explicit overwrite) */
|
||||
if (flags & MDBX_CURRENT) {
|
||||
rc = cursor_seek(&cx.outer, (MDBX_val *)key, nullptr, MDBX_SET).err;
|
||||
if (likely(rc == MDBX_SUCCESS) && (txn->dbs[dbi].flags & MDBX_DUPSORT) &&
|
||||
(flags & MDBX_ALLDUPS) == 0) {
|
||||
/* LY: allows update (explicit overwrite) only for unique keys */
|
||||
node_t *node =
|
||||
page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
|
||||
if (node_flags(node) & N_DUPDATA) {
|
||||
tASSERT(txn, inner_pointed(&cx.outer) &&
|
||||
cx.outer.subcur->nested_tree.items > 1);
|
||||
rc = MDBX_EMULTIVAL;
|
||||
if ((flags & MDBX_NOOVERWRITE) == 0) {
|
||||
flags -= MDBX_CURRENT;
|
||||
rc = cursor_del(&cx.outer, MDBX_ALLDUPS);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = cursor_put_checklen(&cx.outer, key, data, flags);
|
||||
txn->cursors[dbi] = cx.outer.next;
|
||||
|
||||
return rc;
|
||||
}
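/* A minimal usage sketch (not part of this commit): the transaction `txn`,
 * handle `dbi` and the key/value literals are assumed/hypothetical. It
 * exercises the MDBX_CURRENT path handled above: an explicit overwrite of an
 * existing record, which returns MDBX_NOTFOUND if the key is absent; per the
 * code above, for an MDBX_DUPSORT table holding several values under the key
 * it either replaces them all or, with MDBX_NOOVERWRITE added, fails with
 * MDBX_EMULTIVAL. */
#if 0 /* illustration only */
static int overwrite_existing(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_val key = {.iov_base = "counter", .iov_len = 7};
  uint64_t fresh = 42;
  MDBX_val data = {.iov_base = &fresh, .iov_len = sizeof(fresh)};
  return mdbx_put(txn, dbi, &key, &data, MDBX_CURRENT);
}
#endif /* illustration only */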
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/* Allows updating or deleting an existing record while receiving the
* previous data value in old_data. If new_data is null, a deletion is
* performed, otherwise an update/insert.
*
* The current value may reside in an already modified (dirty) page. In that
* case the page will be overwritten during the update and the old value
* itself lost. Therefore old_data must initially be given an extra buffer
* for copying the old value out. If the supplied buffer is too small, the
* function returns -1 and sets old_data->iov_len to the required size.
*
* For non-unique keys a second usage scenario is also possible, where
* old_data selects the specific record among those with the same key to be
* deleted/updated. To request this scenario, MDBX_CURRENT and
* MDBX_NOOVERWRITE must be specified in flags simultaneously. Exactly this
* combination was chosen because it is otherwise meaningless, which allows
* such a request to be identified.
*
* The function could be replaced by the corresponding cursor operations
* after two improvements (TODO):
* - external allocation of cursors, including on the stack (without malloc);
* - obtaining the dirty status of a page by address (being aware of
*   MUTABLE/WRITEABLE).
*/
|
||||
|
||||
int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
|
||||
MDBX_val *new_data, MDBX_val *old_data,
|
||||
MDBX_put_flags_t flags, MDBX_preserve_func preserver,
|
||||
void *preserver_context) {
|
||||
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!key || !old_data || old_data == new_data))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(old_data->iov_base == nullptr && old_data->iov_len))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(new_data == nullptr &&
|
||||
(flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(dbi <= FREE_DBI))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
if (unlikely(flags &
|
||||
~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
|
||||
MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
cx.outer.next = txn->cursors[dbi];
|
||||
txn->cursors[dbi] = &cx.outer;
|
||||
|
||||
MDBX_val present_key = *key;
|
||||
if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) {
|
||||
/* old_data holds the value selecting the specific duplicate */
|
||||
if (unlikely(!(txn->dbs[dbi].flags & MDBX_DUPSORT))) {
|
||||
rc = MDBX_EINVAL;
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
/* drop the extra bit, it only marked the requested mode */
|
||||
flags -= MDBX_NOOVERWRITE;
|
||||
|
||||
rc = cursor_seek(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err;
|
||||
if (rc != MDBX_SUCCESS)
|
||||
goto bailout;
|
||||
} else {
|
||||
/* old_data is a buffer for saving the previous value */
|
||||
if (unlikely(new_data && old_data->iov_base == new_data->iov_base))
|
||||
return MDBX_EINVAL;
|
||||
MDBX_val present_data;
|
||||
rc = cursor_seek(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
old_data->iov_base = nullptr;
|
||||
old_data->iov_len = 0;
|
||||
if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT))
|
||||
goto bailout;
|
||||
} else if (flags & MDBX_NOOVERWRITE) {
|
||||
rc = MDBX_KEYEXIST;
|
||||
*old_data = present_data;
|
||||
goto bailout;
|
||||
} else {
|
||||
page_t *page = cx.outer.pg[cx.outer.top];
|
||||
if (txn->dbs[dbi].flags & MDBX_DUPSORT) {
|
||||
if (flags & MDBX_CURRENT) {
|
||||
/* disallow update/delete for multi-values */
|
||||
node_t *node = page_node(page, cx.outer.ki[cx.outer.top]);
|
||||
if (node_flags(node) & N_DUPDATA) {
|
||||
tASSERT(txn, inner_pointed(&cx.outer) &&
|
||||
cx.outer.subcur->nested_tree.items > 1);
|
||||
if (cx.outer.subcur->nested_tree.items > 1) {
|
||||
rc = MDBX_EMULTIVAL;
|
||||
goto bailout;
|
||||
}
|
||||
}
|
||||
/* In LMDB the MDBX_CURRENT flag here would lead to replacing
* the data without regard to the MDBX_DUPSORT ordering, but here
* it is permissible in any case, since we have verified that the
* key has only one value. */
|
||||
}
|
||||
}
|
||||
|
||||
if (is_modifable(txn, page)) {
|
||||
if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
|
||||
/* if the data matches, there is nothing to do */
|
||||
*old_data = *new_data;
|
||||
goto bailout;
|
||||
}
|
||||
rc = preserver ? preserver(preserver_context, old_data,
|
||||
present_data.iov_base, present_data.iov_len)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
} else {
|
||||
*old_data = present_data;
|
||||
}
|
||||
flags |= MDBX_CURRENT;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(new_data))
|
||||
rc = cursor_put_checklen(&cx.outer, key, new_data, flags);
|
||||
else
|
||||
rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
|
||||
|
||||
bailout:
|
||||
txn->cursors[dbi] = cx.outer.next;
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int default_value_preserver(void *context, MDBX_val *target,
|
||||
const void *src, size_t bytes) {
|
||||
(void)context;
|
||||
if (unlikely(target->iov_len < bytes)) {
|
||||
target->iov_base = nullptr;
|
||||
target->iov_len = bytes;
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
memcpy(target->iov_base, src, target->iov_len = bytes);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
|
||||
MDBX_val *new_data, MDBX_val *old_data,
|
||||
MDBX_put_flags_t flags) {
|
||||
return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags,
|
||||
default_value_preserver, nullptr);
|
||||
}
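/* A minimal usage sketch (not part of this commit): `txn`, `dbi`, the key
 * literal and the buffer sizes are assumed/hypothetical. It exercises the
 * contract documented above: when the buffer passed via old_data is too
 * small, default_value_preserver() makes mdbx_replace() return
 * MDBX_RESULT_TRUE (-1) with old_data->iov_len set to the required size,
 * and since nothing has been modified yet the call can simply be retried
 * with a larger buffer. */
#if 0 /* illustration only */
static int replace_with_copy(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *new_data) {
  MDBX_val key = {.iov_base = "config", .iov_len = 6};
  char small[16];
  MDBX_val old_data = {.iov_base = small, .iov_len = sizeof(small)};
  int rc = mdbx_replace(txn, dbi, &key, new_data, &old_data, MDBX_CURRENT);
  if (rc == MDBX_RESULT_TRUE) {
    /* old_data.iov_len now holds the size of the previous value */
    void *bigger = malloc(old_data.iov_len);
    if (!bigger)
      return MDBX_ENOMEM;
    old_data.iov_base = bigger;
    rc = mdbx_replace(txn, dbi, &key, new_data, &old_data, MDBX_CURRENT);
    /* ... use the preserved old value, then free(bigger) ... */
  }
  return rc;
}
#endif /* illustration only */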
|
390
src/atomics-ops.h
Normal file
@ -0,0 +1,390 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
#ifndef __cplusplus
|
||||
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
#define osal_memory_fence(order, write) \
|
||||
atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order))
|
||||
#else /* MDBX_HAVE_C11ATOMICS */
|
||||
#define osal_memory_fence(order, write) \
|
||||
do { \
|
||||
osal_compiler_barrier(); \
|
||||
if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed \
|
||||
: mo_AcquireRelease)) \
|
||||
osal_memory_barrier(); \
|
||||
} while (0)
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
|
||||
#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__)
|
||||
#define atomic_store32(p, value, order) \
|
||||
({ \
|
||||
const uint32_t value_to_store = (value); \
|
||||
atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store, \
|
||||
mo_c11_store(order)); \
|
||||
value_to_store; \
|
||||
})
|
||||
#define atomic_load32(p, order) \
|
||||
atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order))
|
||||
#define atomic_store64(p, value, order) \
|
||||
({ \
|
||||
const uint64_t value_to_store = (value); \
|
||||
atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store, \
|
||||
mo_c11_store(order)); \
|
||||
value_to_store; \
|
||||
})
|
||||
#define atomic_load64(p, order) \
|
||||
atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order))
|
||||
#endif /* LCC && MDBX_HAVE_C11ATOMICS */
|
||||
|
||||
#ifndef atomic_store32
|
||||
MDBX_MAYBE_UNUSED static __always_inline uint32_t
|
||||
atomic_store32(mdbx_atomic_uint32_t *p, const uint32_t value,
|
||||
enum mdbx_memory_order order) {
|
||||
STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4);
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
|
||||
atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
|
||||
#else /* MDBX_HAVE_C11ATOMICS */
|
||||
if (order != mo_Relaxed)
|
||||
osal_compiler_barrier();
|
||||
p->weak = value;
|
||||
osal_memory_fence(order, true);
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
return value;
|
||||
}
|
||||
#endif /* atomic_store32 */
|
||||
|
||||
#ifndef atomic_load32
|
||||
MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
|
||||
const volatile mdbx_atomic_uint32_t *p, enum mdbx_memory_order order) {
|
||||
STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4);
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
|
||||
return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
|
||||
#else /* MDBX_HAVE_C11ATOMICS */
|
||||
osal_memory_fence(order, false);
|
||||
const uint32_t value = p->weak;
|
||||
if (order != mo_Relaxed)
|
||||
osal_compiler_barrier();
|
||||
return value;
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
}
|
||||
#endif /* atomic_load32 */
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* safe read/write volatile 64-bit fields on 32-bit architectures. */
|
||||
|
||||
/* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
|
||||
* #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
|
||||
#ifndef xMDBX_TXNID_STEP
|
||||
#if MDBX_64BIT_CAS
|
||||
#define xMDBX_TXNID_STEP 1u
|
||||
#else
|
||||
#define xMDBX_TXNID_STEP 2u
|
||||
#endif
|
||||
#endif /* xMDBX_TXNID_STEP */
|
||||
|
||||
#ifndef atomic_store64
|
||||
MDBX_MAYBE_UNUSED static __always_inline uint64_t
|
||||
atomic_store64(mdbx_atomic_uint64_t *p, const uint64_t value,
|
||||
enum mdbx_memory_order order) {
|
||||
STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
|
||||
#if MDBX_64BIT_ATOMIC
|
||||
#if __GNUC_PREREQ(11, 0)
|
||||
STATIC_ASSERT(__alignof__(mdbx_atomic_uint64_t) >= sizeof(uint64_t));
|
||||
#endif /* GNU C >= 11 */
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
|
||||
atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order));
|
||||
#else /* MDBX_HAVE_C11ATOMICS */
|
||||
if (order != mo_Relaxed)
|
||||
osal_compiler_barrier();
|
||||
p->weak = value;
|
||||
osal_memory_fence(order, true);
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
#else /* !MDBX_64BIT_ATOMIC */
|
||||
osal_compiler_barrier();
|
||||
atomic_store32(&p->low, (uint32_t)value, mo_Relaxed);
|
||||
jitter4testing(true);
|
||||
atomic_store32(&p->high, (uint32_t)(value >> 32), order);
|
||||
jitter4testing(true);
|
||||
#endif /* !MDBX_64BIT_ATOMIC */
|
||||
return value;
|
||||
}
|
||||
#endif /* atomic_store64 */
|
||||
|
||||
#ifndef atomic_load64
|
||||
MDBX_MAYBE_UNUSED static
|
||||
#if MDBX_64BIT_ATOMIC
|
||||
__always_inline
|
||||
#endif /* MDBX_64BIT_ATOMIC */
|
||||
uint64_t
|
||||
atomic_load64(const volatile mdbx_atomic_uint64_t *p,
|
||||
enum mdbx_memory_order order) {
|
||||
STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
|
||||
#if MDBX_64BIT_ATOMIC
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p)));
|
||||
return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order));
|
||||
#else /* MDBX_HAVE_C11ATOMICS */
|
||||
osal_memory_fence(order, false);
|
||||
const uint64_t value = p->weak;
|
||||
if (order != mo_Relaxed)
|
||||
osal_compiler_barrier();
|
||||
return value;
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
#else /* !MDBX_64BIT_ATOMIC */
|
||||
osal_compiler_barrier();
|
||||
uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32;
|
||||
jitter4testing(true);
|
||||
value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
|
||||
: mo_AcquireRelease);
|
||||
jitter4testing(true);
|
||||
for (;;) {
|
||||
osal_compiler_barrier();
|
||||
uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32;
|
||||
jitter4testing(true);
|
||||
again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
|
||||
: mo_AcquireRelease);
|
||||
jitter4testing(true);
|
||||
if (likely(value == again))
|
||||
return value;
|
||||
value = again;
|
||||
}
|
||||
#endif /* !MDBX_64BIT_ATOMIC */
|
||||
}
|
||||
#endif /* atomic_load64 */
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline void atomic_yield(void) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
YieldProcessor();
|
||||
#elif defined(__ia32__) || defined(__e2k__)
|
||||
__builtin_ia32_pause();
|
||||
#elif defined(__ia64__)
|
||||
#if defined(__HP_cc__) || defined(__HP_aCC__)
|
||||
_Asm_hint(_HINT_PAUSE);
|
||||
#else
|
||||
__asm__ __volatile__("hint @pause");
|
||||
#endif
|
||||
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || \
|
||||
defined(__ARM_ARCH_6K__)
|
||||
#ifdef __CC_ARM
|
||||
__yield();
|
||||
#else
|
||||
__asm__ __volatile__("yield");
|
||||
#endif
|
||||
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \
|
||||
__mips_isa_rev >= 2
|
||||
__asm__ __volatile__("pause");
|
||||
#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \
|
||||
defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \
|
||||
defined(__MWERKS__) || defined(__sgi)
|
||||
__asm__ __volatile__(".word 0x00000140");
|
||||
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
|
||||
sched_yield();
|
||||
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
|
||||
pthread_yield();
|
||||
#endif
|
||||
}
|
||||
|
||||
#if MDBX_64BIT_CAS
|
||||
MDBX_MAYBE_UNUSED static __always_inline bool
|
||||
atomic_cas64(mdbx_atomic_uint64_t *p, uint64_t c, uint64_t v) {
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t));
|
||||
assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
|
||||
return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v);
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
return __sync_bool_compare_and_swap(&p->weak, c, v);
|
||||
#elif defined(_MSC_VER)
|
||||
return c == (uint64_t)_InterlockedCompareExchange64(
|
||||
(volatile __int64 *)&p->weak, v, c);
|
||||
#elif defined(__APPLE__)
|
||||
return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak);
|
||||
#else
|
||||
#error FIXME: Unsupported compiler
|
||||
#endif
|
||||
}
|
||||
#endif /* MDBX_64BIT_CAS */
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline bool
|
||||
atomic_cas32(mdbx_atomic_uint32_t *p, uint32_t c, uint32_t v) {
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
|
||||
assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
|
||||
return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v);
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
return __sync_bool_compare_and_swap(&p->weak, c, v);
|
||||
#elif defined(_MSC_VER)
|
||||
STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
|
||||
return c ==
|
||||
(uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c);
|
||||
#elif defined(__APPLE__)
|
||||
return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak);
|
||||
#else
|
||||
#error FIXME: Unsupported compiler
|
||||
#endif
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline uint32_t
|
||||
atomic_add32(mdbx_atomic_uint32_t *p, uint32_t v) {
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
|
||||
assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
|
||||
return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v);
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
return __sync_fetch_and_add(&p->weak, v);
|
||||
#elif defined(_MSC_VER)
|
||||
STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
|
||||
return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
|
||||
#elif defined(__APPLE__)
|
||||
return OSAtomicAdd32Barrier(v, &p->weak);
|
||||
#else
|
||||
#error FIXME: Unsupported compiler
|
||||
#endif
|
||||
}
|
||||
|
||||
#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline uint64_t
|
||||
safe64_txnid_next(uint64_t txnid) {
|
||||
txnid += xMDBX_TXNID_STEP;
|
||||
#if !MDBX_64BIT_CAS
|
||||
/* avoid overflow of low-part in safe64_reset() */
|
||||
txnid += (UINT32_MAX == (uint32_t)txnid);
|
||||
#endif
|
||||
return txnid;
|
||||
}
|
||||
|
||||
/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */
|
||||
MDBX_MAYBE_UNUSED static __always_inline void
|
||||
safe64_reset(mdbx_atomic_uint64_t *p, bool single_writer) {
|
||||
if (single_writer) {
|
||||
#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64
|
||||
atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
|
||||
#else
|
||||
atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
|
||||
#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */
|
||||
} else {
|
||||
#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC
|
||||
/* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
|
||||
atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
|
||||
#elif MDBX_64BIT_CAS
|
||||
/* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
|
||||
atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
|
||||
#else
|
||||
/* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1
|
||||
* and overflow was preserved in safe64_txnid_next() */
|
||||
STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
|
||||
atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
|
||||
atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
|
||||
atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
|
||||
#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */
|
||||
}
|
||||
assert(p->weak >= SAFE64_INVALID_THRESHOLD);
|
||||
jitter4testing(true);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline bool
|
||||
safe64_reset_compare(mdbx_atomic_uint64_t *p, uint64_t compare) {
|
||||
/* LY: This function is used to reset `txnid` from an hsr-handler in case
* of asynchronous cancellation of a read transaction. Therefore, there may
* be a collision between the cleanup performed here and the asynchronous
* termination and restarting of the read transaction in another
* process/thread. In general we MUST NOT reset the `txnid` if a new
* transaction was started (i.e. if `txnid` was changed). */
|
||||
#if MDBX_64BIT_CAS
|
||||
bool rc = atomic_cas64(p, compare, UINT64_MAX);
|
||||
#else
|
||||
/* LY: There is no golden ratio here, since a shared mutex would be too
* costly: that way we would have to acquire/release it for every update
* of txnid, i.e. twice for each read transaction. */
|
||||
bool rc = false;
|
||||
if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare &&
|
||||
atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) {
|
||||
if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) !=
|
||||
(uint32_t)compare))
|
||||
atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32));
|
||||
else
|
||||
rc = true;
|
||||
}
|
||||
#endif /* MDBX_64BIT_CAS */
|
||||
jitter4testing(true);
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline void
|
||||
safe64_write(mdbx_atomic_uint64_t *p, const uint64_t v) {
|
||||
assert(p->weak >= SAFE64_INVALID_THRESHOLD);
|
||||
#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS
|
||||
atomic_store64(p, v, mo_AcquireRelease);
|
||||
#else /* MDBX_64BIT_ATOMIC */
|
||||
osal_compiler_barrier();
|
||||
/* update low-part but still value >= SAFE64_INVALID_THRESHOLD */
|
||||
atomic_store32(&p->low, (uint32_t)v, mo_Relaxed);
|
||||
assert(p->weak >= SAFE64_INVALID_THRESHOLD);
|
||||
jitter4testing(true);
|
||||
/* update high-part from SAFE64_INVALID_THRESHOLD to actual value */
|
||||
atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease);
|
||||
#endif /* MDBX_64BIT_ATOMIC */
|
||||
assert(p->weak == v);
|
||||
jitter4testing(true);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline uint64_t
|
||||
safe64_read(const mdbx_atomic_uint64_t *p) {
|
||||
jitter4testing(true);
|
||||
uint64_t v;
|
||||
do
|
||||
v = atomic_load64(p, mo_AcquireRelease);
|
||||
while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak));
|
||||
return v;
|
||||
}
|
||||
|
||||
#if 0 /* unused for now */
|
||||
MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) {
|
||||
#if MDBX_WORDBITS >= 64
|
||||
return v < SAFE64_INVALID_THRESHOLD;
|
||||
#else
|
||||
return (v >> 32) != UINT32_MAX;
|
||||
#endif /* MDBX_WORDBITS */
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __always_inline bool
|
||||
safe64_is_valid_ptr(const mdbx_atomic_uint64_t *p) {
|
||||
#if MDBX_64BIT_ATOMIC
|
||||
return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD;
|
||||
#else
|
||||
return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX;
|
||||
#endif /* MDBX_64BIT_ATOMIC */
|
||||
}
|
||||
#endif /* unused for now */
|
||||
|
||||
/* non-atomic write with safety for reading a half-updated value */
|
||||
MDBX_MAYBE_UNUSED static __always_inline void
|
||||
safe64_update(mdbx_atomic_uint64_t *p, const uint64_t v) {
|
||||
#if MDBX_64BIT_ATOMIC
|
||||
atomic_store64(p, v, mo_Relaxed);
|
||||
#else
|
||||
safe64_reset(p, true);
|
||||
safe64_write(p, v);
|
||||
#endif /* MDBX_64BIT_ATOMIC */
|
||||
}
|
||||
|
||||
/* non-atomic increment with safety for reading a half-updated value */
|
||||
MDBX_MAYBE_UNUSED static
|
||||
#if MDBX_64BIT_ATOMIC
|
||||
__always_inline
|
||||
#endif /* MDBX_64BIT_ATOMIC */
|
||||
void
|
||||
safe64_inc(mdbx_atomic_uint64_t *p, const uint64_t v) {
|
||||
assert(v > 0);
|
||||
safe64_update(p, safe64_read(p) + v);
|
||||
}
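/* A brief sketch (not part of this commit) of how the helpers above are
 * meant to be paired; the variable and function names are hypothetical,
 * the calls are the ones defined above. On builds without 64-bit atomics
 * a writer publishes a new txnid via safe64_update(), which first drives
 * the slot above SAFE64_INVALID_THRESHOLD and only then stores the halves,
 * while readers use safe64_read(), which retries until it observes a
 * consistent (non-torn) value. */
#if 0 /* illustration only */
static mdbx_atomic_uint64_t hypothetical_slot;

static void writer_publish(uint64_t txnid) {
  /* never exposes a half-written txnid to concurrent readers */
  safe64_update(&hypothetical_slot, txnid);
}

static uint64_t reader_snapshot(void) {
  /* values >= SAFE64_INVALID_THRESHOLD mean "being reset/updated" */
  return safe64_read(&hypothetical_slot);
}
#endif /* illustration only */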
|
||||
|
||||
#endif /* !__cplusplus */
|
99
src/atomics-types.h
Normal file
@ -0,0 +1,99 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
#ifndef MDBX_64BIT_ATOMIC
|
||||
#error "The MDBX_64BIT_ATOMIC must be defined before"
|
||||
#endif /* MDBX_64BIT_ATOMIC */
|
||||
|
||||
#ifndef MDBX_64BIT_CAS
|
||||
#error "The MDBX_64BIT_CAS must be defined before"
|
||||
#endif /* MDBX_64BIT_CAS */
|
||||
|
||||
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
|
||||
#include <cstdatomic>
|
||||
#define MDBX_HAVE_C11ATOMICS
|
||||
#elif !defined(__cplusplus) && \
|
||||
(__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
|
||||
!defined(__STDC_NO_ATOMICS__) && \
|
||||
(__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
|
||||
!(defined(__GNUC__) || defined(__clang__)))
|
||||
#include <stdatomic.h>
|
||||
#define MDBX_HAVE_C11ATOMICS
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
|
||||
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
|
||||
'size_t' to 'LONGLONG' */
|
||||
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
|
||||
'std::size_t', possible loss of data */
|
||||
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
|
||||
'long', possible loss of data */
|
||||
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
|
||||
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
|
||||
#elif defined(__APPLE__)
|
||||
#include <libkern/OSAtomic.h>
|
||||
#else
|
||||
#error FIXME atomic-ops
|
||||
#endif
|
||||
|
||||
typedef enum mdbx_memory_order {
|
||||
mo_Relaxed,
|
||||
mo_AcquireRelease
|
||||
/* , mo_SequentialConsistency */
|
||||
} mdbx_memory_order_t;
|
||||
|
||||
typedef union {
|
||||
volatile uint32_t weak;
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
volatile _Atomic uint32_t c11a;
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
} mdbx_atomic_uint32_t;
|
||||
|
||||
typedef union {
|
||||
volatile uint64_t weak;
|
||||
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
|
||||
volatile _Atomic uint64_t c11a;
|
||||
#endif
|
||||
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
|
||||
__anonymous_struct_extension__ struct {
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
mdbx_atomic_uint32_t low, high;
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
mdbx_atomic_uint32_t high, low;
|
||||
#else
|
||||
#error "FIXME: Unsupported byte order"
|
||||
#endif /* __BYTE_ORDER__ */
|
||||
};
|
||||
#endif
|
||||
} mdbx_atomic_uint64_t;
|
||||
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
|
||||
/* Crutches for C11 atomic compiler's bugs */
|
||||
#if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
|
||||
#define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
|
||||
#define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
|
||||
#elif defined(__clang__) && __clang__ < 8
|
||||
#define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
|
||||
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
|
||||
#else
|
||||
#define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
|
||||
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
|
||||
#endif /* Crutches for C11 atomic compiler's bugs */
|
||||
|
||||
#define mo_c11_store(fence) \
|
||||
(((fence) == mo_Relaxed) ? memory_order_relaxed \
|
||||
: ((fence) == mo_AcquireRelease) ? memory_order_release \
|
||||
: memory_order_seq_cst)
|
||||
#define mo_c11_load(fence) \
|
||||
(((fence) == mo_Relaxed) ? memory_order_relaxed \
|
||||
: ((fence) == mo_AcquireRelease) ? memory_order_acquire \
|
||||
: memory_order_seq_cst)
|
||||
|
||||
#endif /* MDBX_HAVE_C11ATOMICS */
|
||||
|
||||
#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)
|
164
src/audit.c
Normal file
@ -0,0 +1,164 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold static tree_t *audit_db_dig(const MDBX_txn *txn, const size_t dbi,
|
||||
tree_t *fallback) {
|
||||
const MDBX_txn *dig = txn;
|
||||
do {
|
||||
tASSERT(txn, txn->n_dbi == dig->n_dbi);
|
||||
const uint8_t state = dbi_state(dig, dbi);
|
||||
if (state & DBI_LINDO)
|
||||
switch (state & (DBI_VALID | DBI_STALE | DBI_OLDEN)) {
|
||||
case DBI_VALID:
|
||||
case DBI_OLDEN:
|
||||
return dig->dbs + dbi;
|
||||
case 0:
|
||||
return nullptr;
|
||||
case DBI_VALID | DBI_STALE:
|
||||
case DBI_OLDEN | DBI_STALE:
|
||||
break;
|
||||
default:
|
||||
tASSERT(txn, !!"unexpected dig->dbi_state[dbi]");
|
||||
}
|
||||
dig = dig->parent;
|
||||
} while (dig);
|
||||
return fallback;
|
||||
}
|
||||
|
||||
static size_t audit_db_used(const tree_t *db) {
|
||||
return db ? (size_t)db->branch_pages + (size_t)db->leaf_pages +
|
||||
(size_t)db->large_pages
|
||||
: 0;
|
||||
}
|
||||
|
||||
__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored,
|
||||
bool dont_filter_gc) {
|
||||
const MDBX_env *const env = txn->env;
|
||||
size_t pending = 0;
|
||||
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
|
||||
pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) +
|
||||
(MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored);
|
||||
|
||||
cursor_couple_t cx;
|
||||
int rc = cursor_init(&cx.outer, txn, FREE_DBI);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
size_t gc = 0;
|
||||
MDBX_val key, data;
|
||||
rc = outer_first(&cx.outer, &key, &data);
|
||||
while (rc == MDBX_SUCCESS) {
|
||||
if (!dont_filter_gc) {
|
||||
if (unlikely(key.iov_len != sizeof(txnid_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid GC-key size", (unsigned)key.iov_len);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
txnid_t id = unaligned_peek_u64(4, key.iov_base);
|
||||
if (txn->tw.gc.reclaimed) {
|
||||
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed); ++i)
|
||||
if (id == txn->tw.gc.reclaimed[i])
|
||||
goto skip;
|
||||
} else if (id <= txn->tw.gc.last_reclaimed)
|
||||
goto skip;
|
||||
}
|
||||
gc += *(pgno_t *)data.iov_base;
|
||||
skip:
|
||||
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT);
|
||||
}
|
||||
tASSERT(txn, rc == MDBX_NOTFOUND);
|
||||
|
||||
const size_t done_bitmap_size = (txn->n_dbi + CHAR_BIT - 1) / CHAR_BIT;
|
||||
uint8_t *const done_bitmap = alloca(done_bitmap_size);
|
||||
memset(done_bitmap, 0, done_bitmap_size);
|
||||
if (txn->parent) {
|
||||
tASSERT(txn, txn->n_dbi == txn->parent->n_dbi &&
|
||||
txn->n_dbi == txn->env->txn->n_dbi);
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
tASSERT(txn, txn->dbi_sparse == txn->parent->dbi_sparse &&
|
||||
txn->dbi_sparse == txn->env->txn->dbi_sparse);
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
}
|
||||
|
||||
size_t used = NUM_METAS +
|
||||
audit_db_used(audit_db_dig(txn, FREE_DBI, nullptr)) +
|
||||
audit_db_used(audit_db_dig(txn, MAIN_DBI, nullptr));
|
||||
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = tree_search(&cx.outer, nullptr, Z_FIRST);
|
||||
while (rc == MDBX_SUCCESS) {
|
||||
page_t *mp = cx.outer.pg[cx.outer.top];
|
||||
for (size_t k = 0; k < page_numkeys(mp); k++) {
|
||||
node_t *node = page_node(mp, k);
|
||||
if (node_flags(node) != N_SUBDATA)
|
||||
continue;
|
||||
if (unlikely(node_ds(node) != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid dupsort sub-tree node size", (unsigned)node_ds(node));
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
tree_t reside;
|
||||
const tree_t *db = memcpy(&reside, node_data(node), sizeof(reside));
|
||||
const MDBX_val name = {node_key(node), node_ks(node)};
|
||||
for (size_t dbi = CORE_DBS; dbi < env->n_dbi; ++dbi) {
|
||||
if (dbi >= txn->n_dbi || !(env->dbs_flags[dbi] & DB_VALID))
|
||||
continue;
|
||||
if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name))
|
||||
continue;
|
||||
|
||||
done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT;
|
||||
db = audit_db_dig(txn, dbi, &reside);
|
||||
break;
|
||||
}
|
||||
used += audit_db_used(db);
|
||||
}
|
||||
rc = cursor_sibling_right(&cx.outer);
|
||||
}
|
||||
tASSERT(txn, rc == MDBX_NOTFOUND);
|
||||
|
||||
for (size_t dbi = CORE_DBS; dbi < txn->n_dbi; ++dbi) {
|
||||
if (done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT))
|
||||
continue;
|
||||
const tree_t *db = audit_db_dig(txn, dbi, nullptr);
|
||||
if (db)
|
||||
used += audit_db_used(db);
|
||||
else if (dbi_state(txn, dbi))
|
||||
WARNING("audit %s@%" PRIaTXN
|
||||
": unable to account dbi %zd / \"%*s\", state 0x%02x",
|
||||
txn->parent ? "nested-" : "", txn->txnid, dbi,
|
||||
(int)env->kvs[dbi].name.iov_len,
|
||||
(const char *)env->kvs[dbi].name.iov_base, dbi_state(txn, dbi));
|
||||
}
|
||||
|
||||
if (pending + gc + used == txn->geo.first_unallocated)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
|
||||
ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + "
|
||||
"%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)",
|
||||
txn->txnid, pending, txn->tw.loose_count,
|
||||
MDBX_PNL_GETSIZE(txn->tw.relist),
|
||||
txn->tw.retired_pages ? MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0,
|
||||
retired_stored);
|
||||
ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu"
|
||||
"(gc) + %zu(count) = %zu(total) <> %zu"
|
||||
"(allocated)",
|
||||
txn->txnid, pending, gc, used, pending + gc + used,
|
||||
(size_t)txn->geo.first_unallocated);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
|
||||
__cold int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) {
|
||||
MDBX_env *const env = txn->env;
|
||||
int rc = osal_fastmutex_acquire(&env->dbi_lock);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
rc = audit_ex_locked(txn, retired_stored, dont_filter_gc);
|
||||
ENSURE(txn->env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
|
||||
}
|
||||
return rc;
|
||||
}
|
12
src/bits.md
@ -1,13 +1,13 @@
|
||||
N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | MRESIZE |
|
||||
--|---------|-----------|--------------|----------|-----------|------------|---------|----------|---------|
|
||||
0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | |
|
||||
1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | |
|
||||
2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| |
|
||||
0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |N_BIGDATA|P_BRANCH | |
|
||||
1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |N_SUBDATA|P_LEAF | |
|
||||
2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |N_DUPDATA|P_LARGE | |
|
||||
3 |0000 0008|ALLOC_SSCAN|TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | |
|
||||
4 |0000 0010|ALLOC_FIFO |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | |
|
||||
5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA | | |P_LEAF2 | |
|
||||
5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA | | |P_DUPFIX | |
|
||||
6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_OLDEN | |P_SUBP | |
|
||||
7 |0000 0080| | | |ALLDUPS |DBI_LINDO | | | |
|
||||
7 |0000 0080| | |DB_VALID |ALLDUPS |DBI_LINDO | | | |
|
||||
8 |0000 0100| _MAY_MOVE | | | | | | | <= |
|
||||
9 |0000 0200| _MAY_UNMAP| | | | | | | <= |
|
||||
10|0000 0400| | | | | | | | |
|
||||
@ -15,7 +15,7 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD
|
||||
12|0000 1000| | | | | | | | |
|
||||
13|0000 2000|VALIDATION | | | | | |P_SPILLED | |
|
||||
14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | |
|
||||
15|0000 8000| | |DB_VALID | | | |P_FROZEN | |
|
||||
15|0000 8000| | | | | | |P_FROZEN | |
|
||||
16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | |
|
||||
17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND | | <= |
|
||||
18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP | | | | |
|
||||
|
353
src/cogs.c
Normal file
@ -0,0 +1,353 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* Pack/Unpack 16-bit values for Grow step & Shrink threshold */
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t me2v(size_t m, size_t e) {
|
||||
assert(m < 2048 && e < 8);
|
||||
return (pgno_t)(32768 + ((m + 1) << (e + 8)));
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t v2me(size_t v, size_t e) {
|
||||
assert(v > (e ? me2v(2047, e - 1) : 32768));
|
||||
assert(v <= me2v(2047, e));
|
||||
size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8);
|
||||
m -= m > 0;
|
||||
assert(m < 2048 && e < 8);
|
||||
// f e d c b a 9 8 7 6 5 4 3 2 1 0
|
||||
// 1 e e e m m m m m m m m m m m 1
|
||||
const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1));
|
||||
assert(pv != 65535);
|
||||
return pv;
|
||||
}
|
||||
|
||||
/* Convert 16-bit packed (exponential quantized) value to number of pages */
|
||||
pgno_t pv2pages(uint16_t pv) {
|
||||
if ((pv & 0x8001) != 0x8001)
|
||||
return pv;
|
||||
if (pv == 65535)
|
||||
return 65536;
|
||||
// f e d c b a 9 8 7 6 5 4 3 2 1 0
|
||||
// 1 e e e m m m m m m m m m m m 1
|
||||
return me2v((pv >> 1) & 2047, (pv >> 12) & 7);
|
||||
}
|
||||
|
||||
/* Convert number of pages to 16-bit packed (exponential quantized) value */
|
||||
uint16_t pages2pv(size_t pages) {
|
||||
if (pages < 32769 || (pages < 65536 && (pages & 1) == 0))
|
||||
return (uint16_t)pages;
|
||||
if (pages <= me2v(2047, 0))
|
||||
return v2me(pages, 0);
|
||||
if (pages <= me2v(2047, 1))
|
||||
return v2me(pages, 1);
|
||||
if (pages <= me2v(2047, 2))
|
||||
return v2me(pages, 2);
|
||||
if (pages <= me2v(2047, 3))
|
||||
return v2me(pages, 3);
|
||||
if (pages <= me2v(2047, 4))
|
||||
return v2me(pages, 4);
|
||||
if (pages <= me2v(2047, 5))
|
||||
return v2me(pages, 5);
|
||||
if (pages <= me2v(2047, 6))
|
||||
return v2me(pages, 6);
|
||||
return (pages < me2v(2046, 7)) ? v2me(pages, 7) : 65533;
|
||||
}
|
||||
|
||||
__cold bool pv2pages_verify(void) {
|
||||
bool ok = true, dump_translation = false;
|
||||
for (size_t i = 0; i < 65536; ++i) {
|
||||
size_t pages = pv2pages(i);
|
||||
size_t x = pages2pv(pages);
|
||||
size_t xp = pv2pages(x);
|
||||
if (pages != xp) {
|
||||
ERROR("%zu => %zu => %zu => %zu\n", i, pages, x, xp);
|
||||
ok = false;
|
||||
} else if (dump_translation && !(x == i || (x % 2 == 0 && x < 65536))) {
|
||||
DEBUG("%zu => %zu => %zu => %zu\n", i, pages, x, xp);
|
||||
}
|
||||
}
|
||||
return ok;
|
||||
}
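/* A small illustrative check (not part of this commit) that restates, for a
 * single arbitrary count, the invariant pv2pages_verify() above checks
 * exhaustively: quantization may round a large page count, but a count that
 * has already been rounded to a representable value survives another
 * pack/unpack round trip unchanged. The count 123456 is arbitrary. */
#if 0 /* illustration only */
static bool quantization_is_stable(void) {
  const size_t pages = 123456;
  const pgno_t rounded = pv2pages(pages2pv(pages));
  return pv2pages(pages2pv((size_t)rounded)) == rounded;
}
#endif /* illustration only */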
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION size_t bytes_align2os_bytes(const MDBX_env *env,
|
||||
size_t bytes) {
|
||||
return ceil_powerof2(
|
||||
bytes, (env->ps > globals.sys_pagesize) ? env->ps : globals.sys_pagesize);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION size_t pgno_align2os_bytes(const MDBX_env *env,
|
||||
size_t pgno) {
|
||||
return ceil_powerof2(pgno2bytes(env, pgno), globals.sys_pagesize);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION pgno_t pgno_align2os_pgno(const MDBX_env *env,
|
||||
size_t pgno) {
|
||||
return bytes2pgno(env, pgno_align2os_bytes(env, pgno));
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static __always_inline int
|
||||
cmp_int_inline(const size_t expected_alignment, const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
if (likely(a->iov_len == b->iov_len)) {
|
||||
if (sizeof(size_t) > 7 && likely(a->iov_len == 8))
|
||||
return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base),
|
||||
unaligned_peek_u64(expected_alignment, b->iov_base));
|
||||
if (likely(a->iov_len == 4))
|
||||
return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base),
|
||||
unaligned_peek_u32(expected_alignment, b->iov_base));
|
||||
if (sizeof(size_t) < 8 && likely(a->iov_len == 8))
|
||||
return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base),
|
||||
unaligned_peek_u64(expected_alignment, b->iov_base));
|
||||
}
|
||||
ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP",
|
||||
a->iov_base, a->iov_len, b->iov_base, b->iov_len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_unaligned(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
return cmp_int_inline(1, a, b);
|
||||
}
|
||||
|
||||
#ifndef cmp_int_align2
|
||||
/* Compare two items pointing at 2-byte aligned unsigned int's. */
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align2(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
return cmp_int_inline(2, a, b);
|
||||
}
|
||||
#endif /* cmp_int_align2 */
|
||||
|
||||
#ifndef cmp_int_align4
|
||||
/* Compare two items pointing at 4-byte aligned unsigned int's. */
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align4(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
return cmp_int_inline(4, a, b);
|
||||
}
|
||||
#endif /* cmp_int_align4 */
|
||||
|
||||
/* Compare two items lexically */
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lexical(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
if (a->iov_len == b->iov_len)
|
||||
return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0;
|
||||
|
||||
const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1;
|
||||
const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
|
||||
int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0;
|
||||
return likely(diff_data) ? diff_data : diff_len;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
|
||||
tail3le(const uint8_t *p, size_t l) {
|
||||
STATIC_ASSERT(sizeof(unsigned) > 2);
|
||||
// 1: 0 0 0
|
||||
// 2: 0 1 1
|
||||
// 3: 0 1 2
|
||||
return p[0] | p[l >> 1] << 8 | p[l - 1] << 16;
|
||||
}
|
||||
|
||||
/* Compare two items in reverse byte order */
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_reverse(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
size_t left = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
|
||||
if (likely(left)) {
|
||||
const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len);
|
||||
const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len);
|
||||
while (left >= sizeof(size_t)) {
|
||||
pa -= sizeof(size_t);
|
||||
pb -= sizeof(size_t);
|
||||
left -= sizeof(size_t);
|
||||
STATIC_ASSERT(sizeof(size_t) == 4 || sizeof(size_t) == 8);
|
||||
if (sizeof(size_t) == 4) {
|
||||
uint32_t xa = unaligned_peek_u32(1, pa);
|
||||
uint32_t xb = unaligned_peek_u32(1, pb);
|
||||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
|
||||
xa = osal_bswap32(xa);
|
||||
xb = osal_bswap32(xb);
|
||||
#endif /* __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ */
|
||||
if (xa != xb)
|
||||
return (xa < xb) ? -1 : 1;
|
||||
} else {
|
||||
uint64_t xa = unaligned_peek_u64(1, pa);
|
||||
uint64_t xb = unaligned_peek_u64(1, pb);
|
||||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
|
||||
xa = osal_bswap64(xa);
|
||||
xb = osal_bswap64(xb);
|
||||
#endif /* __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ */
|
||||
if (xa != xb)
|
||||
return (xa < xb) ? -1 : 1;
|
||||
}
|
||||
}
|
||||
if (sizeof(size_t) == 8 && left >= 4) {
|
||||
pa -= 4;
|
||||
pb -= 4;
|
||||
left -= 4;
|
||||
uint32_t xa = unaligned_peek_u32(1, pa);
|
||||
uint32_t xb = unaligned_peek_u32(1, pb);
|
||||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
|
||||
xa = osal_bswap32(xa);
|
||||
xb = osal_bswap32(xb);
|
||||
#endif /* __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ */
|
||||
if (xa != xb)
|
||||
return (xa < xb) ? -1 : 1;
|
||||
}
|
||||
if (left) {
|
||||
unsigned xa = tail3le(pa - left, left);
|
||||
unsigned xb = tail3le(pb - left, left);
|
||||
if (xa != xb)
|
||||
return (xa < xb) ? -1 : 1;
|
||||
}
|
||||
}
|
||||
return CMP2INT(a->iov_len, b->iov_len);
|
||||
}
|
||||
|
||||
/* Fast non-lexically comparator */
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lenfast(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
int diff = CMP2INT(a->iov_len, b->iov_len);
|
||||
return (likely(diff) || a->iov_len == 0)
|
||||
? diff
|
||||
: memcmp(a->iov_base, b->iov_base, a->iov_len);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION __hot bool
|
||||
eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l) {
|
||||
if (likely(l > 3)) {
|
||||
if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9))
|
||||
return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) |
|
||||
(unaligned_peek_u32(1, a + l - 4) -
|
||||
unaligned_peek_u32(1, b + l - 4))) == 0;
|
||||
if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17))
|
||||
return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) |
|
||||
(unaligned_peek_u64(1, a + l - 8) -
|
||||
unaligned_peek_u64(1, b + l - 8))) == 0;
|
||||
return memcmp(a, b, l) == 0;
|
||||
}
|
||||
if (likely(l))
|
||||
return tail3le(a, l) == tail3le(b, l);
|
||||
return true;
|
||||
}
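/* A short note (not part of this commit) on the trick used above: for
 * lengths 4..8 the two 4-byte windows [0,4) and [l-4,l) overlap yet jointly
 * cover every byte, so comparing just those two words is equivalent to
 * memcmp() == 0; the same holds for lengths 8..16 with two 8-byte windows.
 * The hypothetical self-check below merely restates that equivalence. */
#if 0 /* illustration only */
static bool eq_windows_equivalent(const uint8_t *a, const uint8_t *b,
                                  size_t l) {
  assert(l >= 4 && l <= 8);
  const bool by_windows =
      memcmp(a, b, 4) == 0 && memcmp(a + l - 4, b + l - 4, 4) == 0;
  return by_windows == (memcmp(a, b, l) == 0);
}
#endif /* illustration only */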
|
||||
|
||||
int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
|
||||
return eq_fast(a, b) ? 0 : 1;
|
||||
}
|
||||
|
||||
int cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b) {
|
||||
return eq_fast(a, b) ? 0 : -1;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__cold void update_mlcnt(const MDBX_env *env,
|
||||
const pgno_t new_aligned_mlocked_pgno,
|
||||
const bool lock_not_release) {
|
||||
for (;;) {
|
||||
const pgno_t mlock_pgno_before =
|
||||
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease);
|
||||
eASSERT(env,
|
||||
pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before);
|
||||
eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) ==
|
||||
new_aligned_mlocked_pgno);
|
||||
if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno)
|
||||
: (mlock_pgno_before <= new_aligned_mlocked_pgno))
|
||||
break;
|
||||
if (likely(atomic_cas32(&((MDBX_env *)env)->mlocked_pgno, mlock_pgno_before,
|
||||
new_aligned_mlocked_pgno)))
|
||||
for (;;) {
|
||||
mdbx_atomic_uint32_t *const mlcnt = env->lck->mlcnt;
|
||||
const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed);
|
||||
const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed);
|
||||
if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) {
|
||||
eASSERT(env, lock_not_release);
|
||||
if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1)))
|
||||
continue;
|
||||
}
|
||||
if (new_aligned_mlocked_pgno == 0 &&
|
||||
(snap_locked - snap_unlocked) > 0) {
|
||||
eASSERT(env, !lock_not_release);
|
||||
if (unlikely(
|
||||
!atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1)))
|
||||
continue;
|
||||
}
|
||||
NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u",
|
||||
lock_not_release ? "lock" : "unlock",
|
||||
lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno,
|
||||
lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_before,
|
||||
snap_locked - snap_unlocked,
|
||||
atomic_load32(mlcnt + 0, mo_Relaxed) -
|
||||
atomic_load32(mlcnt + 1, mo_Relaxed));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__cold void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno,
|
||||
const size_t end_bytes) {
|
||||
if (atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) > aligned_pgno) {
|
||||
int err = MDBX_ENOSYS;
|
||||
const size_t munlock_begin = pgno2bytes(env, aligned_pgno);
|
||||
const size_t munlock_size = end_bytes - munlock_begin;
|
||||
eASSERT(env, end_bytes % globals.sys_pagesize == 0 &&
|
||||
munlock_begin % globals.sys_pagesize == 0 &&
|
||||
munlock_size % globals.sys_pagesize == 0);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
err =
|
||||
VirtualUnlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size)
|
||||
? MDBX_SUCCESS
|
||||
: (int)GetLastError();
|
||||
if (err == ERROR_NOT_LOCKED)
|
||||
err = MDBX_SUCCESS;
|
||||
#elif defined(_POSIX_MEMLOCK_RANGE)
|
||||
err = munlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size)
|
||||
? errno
|
||||
: MDBX_SUCCESS;
|
||||
#endif
|
||||
if (likely(err == MDBX_SUCCESS))
|
||||
update_mlcnt(env, aligned_pgno, false);
|
||||
else {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size,
|
||||
err);
|
||||
#else
|
||||
WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__cold void munlock_all(const MDBX_env *env) {
|
||||
munlock_after(env, 0, bytes_align2os_bytes(env, env->dxb_mmap.current));
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
uint32_t combine_durability_flags(const uint32_t a, const uint32_t b) {
|
||||
uint32_t r = a | b;
|
||||
|
||||
/* avoid false MDBX_UTTERLY_NOSYNC */
|
||||
if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
|
||||
!F_ISSET(b, MDBX_UTTERLY_NOSYNC))
|
||||
r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC;
|
||||
|
||||
/* convert DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */
|
||||
if ((r & (MDBX_WRITEMAP | DEPRECATED_MAPASYNC)) ==
|
||||
(MDBX_WRITEMAP | DEPRECATED_MAPASYNC) &&
|
||||
!F_ISSET(r, MDBX_UTTERLY_NOSYNC))
|
||||
r = (r - DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC;
|
||||
|
||||
/* force MDBX_NOMETASYNC if NOSYNC enabled */
|
||||
if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC))
|
||||
r |= MDBX_NOMETASYNC;
|
||||
|
||||
assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) &&
|
||||
!F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
|
||||
!F_ISSET(b, MDBX_UTTERLY_NOSYNC)));
|
||||
return r;
|
||||
}
|
558
src/cogs.h
Normal file
@ -0,0 +1,558 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL pgno_t pv2pages(uint16_t pv);
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint16_t pages2pv(size_t pages);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool pv2pages_verify(void);
|
||||
|
||||
/*------------------------------------------------------------------------------
* Nodes, Keys & Values length limitation factors:
*
* BRANCH_NODE_MAX
*   A branch page must contain at least two nodes, each holding a key and a
*   child page number. But a page can't be split if it contains fewer than
*   4 keys, i.e. a page should not overflow before the fourth key is added.
*   Therefore, at least 3 branch nodes should fit into a single branch page.
*   Further, the first node of a branch page doesn't contain a key, i.e. the
*   first node always requires space just for itself. Thus:
*       PAGESPACE = pagesize - page_hdr_len;
*       BRANCH_NODE_MAX = even_floor(
*         (PAGESPACE - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
*       KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
*
* LEAF_NODE_MAX
*   A leaf node must fit into a single leaf page, while a value may be placed
*   on a large/overflow page. However, it may be necessary to insert a nearly
*   page-sized node between two large nodes that already fill up a page. In
*   this case the page must be split in two if some pair of nodes fits on one
*   page, or otherwise the page should be split into THREE, with a single
*   node on each of them. Such 1-into-3 page splitting is costly and complex,
*   since it requires TWO insertions into the parent page, which could lead
*   to splitting it as well, and so on up to the root. Therefore
*   double-splitting is avoided here and the maximum node size is half of the
*   leaf page space:
*       LEAF_NODE_MAX = even_floor(PAGESPACE / 2 - sizeof(indx_t));
*       DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX;
*
*  - A SubDatabase node must fit into one leaf page:
*       SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(tree_t);
*
*  - Dupsort values are themselves keys in a dupsort sub-db and can't be
*    longer than KEYLEN_MAX. But a dupsort node must not be greater than
*    LEAF_NODE_MAX, since a dupsort value can't be placed on a large/overflow
*    page:
*       DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
*                                 max(DATALEN_NO_OVERFLOW, sizeof(tree_t));
*/
|
||||
|
||||
#define PAGESPACE(pagesize) ((pagesize) - PAGEHDRSZ)
|
||||
|
||||
#define BRANCH_NODE_MAX(pagesize) \
|
||||
(EVEN_FLOOR((PAGESPACE(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - \
|
||||
sizeof(indx_t)))
|
||||
|
||||
#define LEAF_NODE_MAX(pagesize) \
|
||||
(EVEN_FLOOR(PAGESPACE(pagesize) / 2) - sizeof(indx_t))
|
||||
|
||||
#define MAX_GC1OVPAGE(pagesize) (PAGESPACE(pagesize) / sizeof(pgno_t) - 1)
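/* A minimal sketch (not part of this commit) of how the limits explained
 * above can be inspected: it just evaluates the macros for a hypothetical
 * 4 KiB page and relies on PAGEHDRSZ, NODESIZE, indx_t and EVEN_FLOOR being
 * visible via essentials.h, plus <stdio.h> for printf(); no concrete numbers
 * are asserted since they depend on those internal constants. */
#if 0 /* illustration only */
static void dump_node_limits(void) {
  const size_t pagesize = 4096;
  printf("PAGESPACE=%zu BRANCH_NODE_MAX=%zu LEAF_NODE_MAX=%zu\n",
         (size_t)PAGESPACE(pagesize), (size_t)BRANCH_NODE_MAX(pagesize),
         (size_t)LEAF_NODE_MAX(pagesize));
}
#endif /* illustration only */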
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
|
||||
assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE &&
|
||||
is_powerof2(pagesize));
|
||||
STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE >= 8);
|
||||
if (flags & MDBX_INTEGERKEY)
|
||||
return 8 /* sizeof(uint64_t) */;
|
||||
|
||||
const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE;
|
||||
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE -
|
||||
/* sizeof(uint64) as a key */ 8 >
|
||||
sizeof(tree_t));
|
||||
if (flags &
|
||||
(MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
|
||||
const intptr_t max_dupsort_leaf_key =
|
||||
LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(tree_t);
|
||||
return (max_branch_key < max_dupsort_leaf_key) ? max_branch_key
|
||||
: max_dupsort_leaf_key;
|
||||
}
|
||||
return max_branch_key;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
env_keysize_max(const MDBX_env *env, MDBX_db_flags_t flags) {
|
||||
size_t size_max;
|
||||
if (flags & MDBX_INTEGERKEY)
|
||||
size_max = 8 /* sizeof(uint64_t) */;
|
||||
else {
|
||||
const intptr_t max_branch_key = env->branch_nodemax - NODESIZE;
|
||||
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE -
|
||||
/* sizeof(uint64) as a key */ 8 >
|
||||
sizeof(tree_t));
|
||||
if (flags &
|
||||
(MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
|
||||
const intptr_t max_dupsort_leaf_key =
|
||||
env->leaf_nodemax - NODESIZE - sizeof(tree_t);
|
||||
size_max = (max_branch_key < max_dupsort_leaf_key) ? max_branch_key
|
||||
: max_dupsort_leaf_key;
|
||||
} else
|
||||
size_max = max_branch_key;
|
||||
}
|
||||
eASSERT(env, size_max == keysize_max(env->ps, flags));
|
||||
return size_max;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
keysize_min(MDBX_db_flags_t flags) {
|
||||
return (flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
valsize_min(MDBX_db_flags_t flags) {
|
||||
if (flags & MDBX_INTEGERDUP)
|
||||
return 4 /* sizeof(uint32_t) */;
|
||||
else if (flags & MDBX_DUPFIXED)
|
||||
return sizeof(indx_t);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
|
||||
assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE &&
|
||||
is_powerof2(pagesize));
|
||||
|
||||
if (flags & MDBX_INTEGERDUP)
|
||||
return 8 /* sizeof(uint64_t) */;
|
||||
|
||||
if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
|
||||
return keysize_max(pagesize, 0);
|
||||
|
||||
const unsigned page_ln2 = log2n_powerof2(pagesize);
|
||||
const size_t hard = 0x7FF00000ul;
|
||||
const size_t hard_pages = hard >> page_ln2;
|
||||
STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO);
|
||||
const size_t pages_limit = PAGELIST_LIMIT / 4;
|
||||
const size_t limit =
|
||||
(hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
|
||||
return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
env_valsize_max(const MDBX_env *env, MDBX_db_flags_t flags) {
|
||||
size_t size_max;
|
||||
if (flags & MDBX_INTEGERDUP)
|
||||
size_max = 8 /* sizeof(uint64_t) */;
|
||||
else if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
|
||||
size_max = env_keysize_max(env, 0);
|
||||
else {
|
||||
const size_t hard = 0x7FF00000ul;
|
||||
const size_t hard_pages = hard >> env->ps2ln;
|
||||
STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO);
|
||||
const size_t pages_limit = PAGELIST_LIMIT / 4;
|
||||
const size_t limit =
|
||||
(hard_pages < pages_limit) ? hard : (pages_limit << env->ps2ln);
|
||||
size_max = (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
|
||||
}
|
||||
eASSERT(env, size_max == valsize_max(env->ps, flags));
|
||||
return size_max;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) {
|
||||
size_t node_bytes = node_size(key, data);
|
||||
if (node_bytes > env->leaf_nodemax)
|
||||
/* put on large/overflow page */
|
||||
node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t);
|
||||
|
||||
return node_bytes + sizeof(indx_t);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
branch_size(const MDBX_env *env, const MDBX_val *key) {
|
||||
/* Size of a node in a branch page with a given key.
|
||||
 * This is just the node header plus the key; there is no data. */
|
||||
size_t node_bytes = node_size(key, nullptr);
|
||||
if (unlikely(node_bytes > env->branch_nodemax)) {
|
||||
/* put on large/overflow page, not implemented */
|
||||
mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes,
|
||||
env->branch_nodemax);
|
||||
node_bytes = node_size(key, nullptr) + sizeof(pgno_t);
|
||||
}
|
||||
|
||||
return node_bytes + sizeof(indx_t);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t
|
||||
flags_db2sub(uint16_t db_flags) {
|
||||
uint16_t sub_flags = db_flags & MDBX_DUPFIXED;
|
||||
|
||||
/* MDBX_INTEGERDUP => MDBX_INTEGERKEY */
|
||||
#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2
|
||||
STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) ==
|
||||
MDBX_INTEGERKEY);
|
||||
sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY;
|
||||
|
||||
/* MDBX_REVERSEDUP => MDBX_REVERSEKEY */
|
||||
#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5
|
||||
STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) ==
|
||||
MDBX_REVERSEKEY);
|
||||
sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY;
|
||||
|
||||
return sub_flags;
|
||||
}
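/* Editorial illustration (not part of the original sources): with the flag
 * relations implied by the STATIC_ASSERTs above (MDBX_INTEGERDUP >> 2 ==
 * MDBX_INTEGERKEY and MDBX_REVERSEDUP >> 5 == MDBX_REVERSEKEY), a table
 * opened with MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP yields
 * sub_flags == MDBX_DUPFIXED | MDBX_INTEGERKEY for its nested sub-tree,
 * i.e. the "values as keys" of the dupsort sub-db inherit integer ordering. */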
|
||||
|
||||
static inline bool check_sdb_flags(unsigned flags) {
|
||||
switch (flags & ~(MDBX_REVERSEKEY | MDBX_INTEGERKEY)) {
|
||||
default:
|
||||
NOTICE("invalid db-flags 0x%x", flags);
|
||||
return false;
|
||||
case MDBX_DUPSORT:
|
||||
case MDBX_DUPSORT | MDBX_REVERSEDUP:
|
||||
case MDBX_DUPSORT | MDBX_DUPFIXED:
|
||||
case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
|
||||
case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
|
||||
case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
|
||||
case MDBX_DB_DEFAULTS:
|
||||
return (flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) !=
|
||||
(MDBX_REVERSEKEY | MDBX_INTEGERKEY);
|
||||
}
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t pgno2bytes(const MDBX_env *env,
|
||||
size_t pgno) {
|
||||
eASSERT(env, (1u << env->ps2ln) == env->ps);
|
||||
return ((size_t)pgno) << env->ps2ln;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline page_t *pgno2page(const MDBX_env *env,
|
||||
size_t pgno) {
|
||||
return ptr_disp(env->dxb_mmap.base, pgno2bytes(env, pgno));
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t bytes2pgno(const MDBX_env *env,
|
||||
size_t bytes) {
|
||||
eASSERT(env, (env->ps >> env->ps2ln) == 1);
|
||||
return (pgno_t)(bytes >> env->ps2ln);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t
|
||||
bytes_align2os_bytes(const MDBX_env *env, size_t bytes);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t
|
||||
pgno_align2os_bytes(const MDBX_env *env, size_t pgno);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL pgno_t
|
||||
pgno_align2os_pgno(const MDBX_env *env, size_t pgno);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
|
||||
largechunk_npages(const MDBX_env *env, size_t bytes) {
|
||||
return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val get_key(const node_t *node) {
|
||||
MDBX_val key;
|
||||
key.iov_len = node_ks(node);
|
||||
key.iov_base = node_key(node);
|
||||
return key;
|
||||
}
|
||||
|
||||
static inline void get_key_optional(const node_t *node,
|
||||
MDBX_val *keyptr /* __may_null */) {
|
||||
if (keyptr)
|
||||
*keyptr = get_key(node);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline void *page_data(const page_t *mp) {
|
||||
return ptr_disp(mp, PAGEHDRSZ);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *
|
||||
data_page(const void *data) {
|
||||
return container_of(data, page_t, entries);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline meta_t *page_meta(page_t *mp) {
|
||||
return (meta_t *)page_data(mp);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) {
|
||||
return mp->lower >> 1;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) {
|
||||
return mp->upper - mp->lower;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
page_space(const MDBX_env *env) {
|
||||
STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
|
||||
return env->ps - PAGEHDRSZ;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_used(const MDBX_env *env,
|
||||
const page_t *mp) {
|
||||
return page_space(env) - page_room(mp);
|
||||
}
|
||||
|
||||
/* The space used in the page, in tenths of a percent (1000 == completely full). */
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline unsigned
|
||||
page_fill_percentum_x10(const MDBX_env *env, const page_t *mp) {
|
||||
const size_t space = page_space(env);
|
||||
return (unsigned)((page_used(env, mp) * 1000 + space / 2) / space);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline node_t *page_node(const page_t *mp,
|
||||
size_t i) {
|
||||
assert(page_type_compat(mp) == P_LEAF || page_type(mp) == P_BRANCH);
|
||||
assert(page_numkeys(mp) > i);
|
||||
assert(mp->entries[i] % 2 == 0);
|
||||
return ptr_disp(mp, mp->entries[i] + PAGEHDRSZ);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline void *
|
||||
page_dupfix_ptr(const page_t *mp, size_t i, size_t keysize) {
|
||||
assert(page_type_compat(mp) == (P_LEAF | P_DUPFIX) && i == (indx_t)i &&
|
||||
mp->dupfix_ksize == keysize);
|
||||
(void)keysize;
|
||||
return ptr_disp(mp, PAGEHDRSZ + mp->dupfix_ksize * (indx_t)i);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val
|
||||
page_dupfix_key(const page_t *mp, size_t i, size_t keysize) {
|
||||
MDBX_val r;
|
||||
r.iov_base = page_dupfix_ptr(mp, i, keysize);
|
||||
r.iov_len = mp->dupfix_ksize;
|
||||
return r;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
|
||||
cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b);
|
||||
|
||||
#if MDBX_UNALIGNED_OK < 2 || \
|
||||
(MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG))
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
|
||||
/* Compare two items pointing at 2-byte aligned unsigned ints. */
|
||||
cmp_int_align2(const MDBX_val *a, const MDBX_val *b);
|
||||
#else
|
||||
#define cmp_int_align2 cmp_int_unaligned
|
||||
#endif /* !MDBX_UNALIGNED_OK || debug */
|
||||
|
||||
#if MDBX_UNALIGNED_OK < 4 || \
|
||||
(MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG))
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
|
||||
/* Compare two items pointing at 4-byte aligned unsigned ints. */
|
||||
cmp_int_align4(const MDBX_val *a, const MDBX_val *b);
|
||||
#else
|
||||
#define cmp_int_align4 cmp_int_unaligned
|
||||
#endif /* !MDBX_UNALIGNED_OK || debug */
|
||||
|
||||
/* Compare two items lexically */
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lexical(const MDBX_val *a,
|
||||
const MDBX_val *b);
|
||||
|
||||
/* Compare two items in reverse byte order */
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_reverse(const MDBX_val *a,
|
||||
const MDBX_val *b);
|
||||
|
||||
/* Fast non-lexical comparator */
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lenfast(const MDBX_val *a,
|
||||
const MDBX_val *b);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL bool
|
||||
eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool eq_fast(const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
return unlikely(a->iov_len == b->iov_len) &&
|
||||
eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
|
||||
cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
|
||||
cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b);
|
||||
|
||||
static inline MDBX_cmp_func *builtin_keycmp(MDBX_db_flags_t flags) {
|
||||
return (flags & MDBX_REVERSEKEY) ? cmp_reverse
|
||||
: (flags & MDBX_INTEGERKEY) ? cmp_int_align2
|
||||
: cmp_lexical;
|
||||
}
|
||||
|
||||
static inline MDBX_cmp_func *builtin_datacmp(MDBX_db_flags_t flags) {
|
||||
return !(flags & MDBX_DUPSORT)
|
||||
? cmp_lenfast
|
||||
: ((flags & MDBX_INTEGERDUP)
|
||||
? cmp_int_unaligned
|
||||
: ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical));
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL uint32_t combine_durability_flags(const uint32_t a,
|
||||
const uint32_t b);
|
||||
|
||||
MDBX_CONST_FUNCTION static inline lck_t *lckless_stub(const MDBX_env *env) {
|
||||
uintptr_t stub = (uintptr_t)&env->lckless_placeholder;
|
||||
/* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */
|
||||
stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1);
|
||||
return (lck_t *)stub;
|
||||
}
|
||||
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
MDBX_MAYBE_UNUSED static inline int ignore_enosys(int err) {
|
||||
#ifdef ENOSYS
|
||||
if (err == ENOSYS)
|
||||
return MDBX_RESULT_TRUE;
|
||||
#endif /* ENOSYS */
|
||||
#ifdef ENOIMPL
|
||||
if (err == ENOIMPL)
|
||||
return MDBX_RESULT_TRUE;
|
||||
#endif /* ENOIMPL */
|
||||
#ifdef ENOTSUP
|
||||
if (err == ENOTSUP)
|
||||
return MDBX_RESULT_TRUE;
|
||||
#endif /* ENOTSUP */
|
||||
#ifdef ENOSUPP
|
||||
if (err == ENOSUPP)
|
||||
return MDBX_RESULT_TRUE;
|
||||
#endif /* ENOSUPP */
|
||||
#ifdef EOPNOTSUPP
|
||||
if (err == EOPNOTSUPP)
|
||||
return MDBX_RESULT_TRUE;
|
||||
#endif /* EOPNOTSUPP */
|
||||
if (err == EAGAIN)
|
||||
return MDBX_RESULT_TRUE;
|
||||
return err;
|
||||
}
|
||||
#endif /* !(defined(_WIN32) || defined(_WIN64)) */
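/* Editorial illustration (not part of the original sources): the intended
 * usage pattern, as seen later in this changeset around madvise/posix_madvise,
 * is to translate the various "not supported" errno values into
 * MDBX_RESULT_TRUE so the caller can silently skip an optional optimization:
 *
 *   int err = posix_madvise(addr, len, POSIX_MADV_WILLNEED)
 *                 ? ignore_enosys(errno)
 *                 : MDBX_SUCCESS;
 *   if (err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE)
 *     return err;   // a real error, not just "unsupported here"
 */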
|
||||
|
||||
static inline int check_env(const MDBX_env *env, const bool wanna_active) {
|
||||
if (unlikely(!env))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(env->signature.weak != env_signature))
|
||||
return MDBX_EBADSIGN;
|
||||
|
||||
if (unlikely(env->flags & ENV_FATAL_ERROR))
|
||||
return MDBX_PANIC;
|
||||
|
||||
if (wanna_active) {
|
||||
#if MDBX_ENV_CHECKPID
|
||||
if (unlikely(env->pid != osal_getpid()) && env->pid) {
|
||||
((MDBX_env *)env)->flags |= ENV_FATAL_ERROR;
|
||||
return MDBX_PANIC;
|
||||
}
|
||||
#endif /* MDBX_ENV_CHECKPID */
|
||||
if (unlikely((env->flags & ENV_ACTIVE) == 0))
|
||||
return MDBX_EPERM;
|
||||
eASSERT(env, env->dxb_mmap.base != nullptr);
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int check_txn(const MDBX_txn *txn, int bad_bits) {
|
||||
if (unlikely(!txn))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(txn->signature != txn_signature))
|
||||
return MDBX_EBADSIGN;
|
||||
|
||||
if (unlikely(txn->flags & bad_bits))
|
||||
return MDBX_BAD_TXN;
|
||||
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_FINISHED) ||
|
||||
(txn->flags & MDBX_NOSTICKYTHREADS) ==
|
||||
(txn->env->flags & MDBX_NOSTICKYTHREADS));
|
||||
#if MDBX_TXN_CHECKOWNER
|
||||
STATIC_ASSERT((long)MDBX_NOSTICKYTHREADS > (long)MDBX_TXN_FINISHED);
|
||||
if ((txn->flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) <
|
||||
MDBX_TXN_FINISHED &&
|
||||
unlikely(txn->owner != osal_thread_self()))
|
||||
return txn->owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN;
|
||||
#endif /* MDBX_TXN_CHECKOWNER */
|
||||
|
||||
if (bad_bits && unlikely(!txn->env->dxb_mmap.base))
|
||||
return MDBX_EPERM;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) {
|
||||
int err = check_txn(txn, bad_bits);
|
||||
if (unlikely(err))
|
||||
return err;
|
||||
|
||||
if (unlikely(txn->flags & MDBX_TXN_RDONLY))
|
||||
return MDBX_EACCESS;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
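/* Editorial illustration (not part of the original sources): the typical
 * entry-point pattern built from these helpers, as used by stat_acc() and
 * other functions later in this changeset:
 *
 *   int err = check_txn(txn, MDBX_TXN_BLOCKED);
 *   if (unlikely(err != MDBX_SUCCESS))
 *     return err;
 *   // ... the transaction is valid, owned by this thread and not blocked ...
 *
 * check_txn_rw() additionally rejects MDBX_TXN_RDONLY transactions with
 * MDBX_EACCESS, so write paths need only the single call. */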
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL void mincore_clean_cache(const MDBX_env *const env);
|
||||
|
||||
MDBX_INTERNAL void update_mlcnt(const MDBX_env *env,
|
||||
const pgno_t new_aligned_mlocked_pgno,
|
||||
const bool lock_not_release);
|
||||
|
||||
MDBX_INTERNAL void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno,
|
||||
const size_t end_bytes);
|
||||
|
||||
MDBX_INTERNAL void munlock_all(const MDBX_env *env);
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* Cache coherence and mmap invalidation */
|
||||
#ifndef MDBX_CPU_WRITEBACK_INCOHERENT
|
||||
#error "The MDBX_CPU_WRITEBACK_INCOHERENT must be defined before"
|
||||
#elif MDBX_CPU_WRITEBACK_INCOHERENT
|
||||
#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier()
|
||||
#else
|
||||
#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier()
|
||||
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void
|
||||
osal_flush_incoherent_mmap(const void *addr, size_t nbytes,
|
||||
const intptr_t pagesize) {
|
||||
#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE
|
||||
#error "The MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined before"
|
||||
#elif MDBX_MMAP_INCOHERENT_FILE_WRITE
|
||||
char *const begin = (char *)(-pagesize & (intptr_t)addr);
|
||||
char *const end =
|
||||
(char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
|
||||
int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
|
||||
eASSERT(nullptr, err == 0);
|
||||
(void)err;
|
||||
#else
|
||||
(void)pagesize;
|
||||
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
|
||||
|
||||
#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE
|
||||
#error "The MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined before"
|
||||
#elif MDBX_MMAP_INCOHERENT_CPU_CACHE
|
||||
#ifdef DCACHE
|
||||
/* MIPS has cache coherency issues.
|
||||
   * Note: for any nbytes >= the on-chip cache size, the entire cache is flushed. */
|
||||
cacheflush((void *)addr, nbytes, DCACHE);
|
||||
#else
|
||||
#error "Oops, cacheflush() not available"
|
||||
#endif /* DCACHE */
|
||||
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
|
||||
|
||||
#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
|
||||
(void)addr;
|
||||
(void)nbytes;
|
||||
#endif
|
||||
}
|
198
src/coherency.c
Normal file
@ -0,0 +1,198 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */
|
||||
static bool coherency_check(const MDBX_env *env, const txnid_t txnid,
|
||||
const volatile tree_t *trees,
|
||||
const volatile meta_t *meta, bool report) {
|
||||
const txnid_t freedb_mod_txnid = trees[FREE_DBI].mod_txnid;
|
||||
const txnid_t maindb_mod_txnid = trees[MAIN_DBI].mod_txnid;
|
||||
const pgno_t last_pgno = meta->geometry.now;
|
||||
|
||||
const pgno_t freedb_root_pgno = trees[FREE_DBI].root;
|
||||
const page_t *freedb_root =
|
||||
(env->dxb_mmap.base && freedb_root_pgno < last_pgno)
|
||||
? pgno2page(env, freedb_root_pgno)
|
||||
: nullptr;
|
||||
|
||||
const pgno_t maindb_root_pgno = trees[MAIN_DBI].root;
|
||||
const page_t *maindb_root =
|
||||
(env->dxb_mmap.base && maindb_root_pgno < last_pgno)
|
||||
? pgno2page(env, maindb_root_pgno)
|
||||
: nullptr;
|
||||
const uint64_t magic_and_version =
|
||||
unaligned_peek_u64_volatile(4, &meta->magic_and_version);
|
||||
|
||||
bool ok = true;
|
||||
if (freedb_root_pgno != P_INVALID &&
|
||||
unlikely(freedb_root_pgno >= last_pgno)) {
|
||||
if (report)
|
||||
WARNING(
|
||||
"catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN
|
||||
" %s",
|
||||
"free", freedb_root_pgno, txnid,
|
||||
(env->stuck_meta < 0)
|
||||
? "(workaround for incoherent flaw of unified page/buffer cache)"
|
||||
: "(wagering meta)");
|
||||
ok = false;
|
||||
}
|
||||
if (maindb_root_pgno != P_INVALID &&
|
||||
unlikely(maindb_root_pgno >= last_pgno)) {
|
||||
if (report)
|
||||
WARNING(
|
||||
"catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN
|
||||
" %s",
|
||||
"main", maindb_root_pgno, txnid,
|
||||
(env->stuck_meta < 0)
|
||||
? "(workaround for incoherent flaw of unified page/buffer cache)"
|
||||
: "(wagering meta)");
|
||||
ok = false;
|
||||
}
|
||||
if (unlikely(txnid < freedb_mod_txnid ||
|
||||
(!freedb_mod_txnid && freedb_root &&
|
||||
likely(magic_and_version == MDBX_DATA_MAGIC)))) {
|
||||
if (report)
|
||||
WARNING(
|
||||
"catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN
|
||||
" %s",
|
||||
"free", freedb_mod_txnid, txnid,
|
||||
(env->stuck_meta < 0)
|
||||
? "(workaround for incoherent flaw of unified page/buffer cache)"
|
||||
: "(wagering meta)");
|
||||
ok = false;
|
||||
}
|
||||
if (unlikely(txnid < maindb_mod_txnid ||
|
||||
(!maindb_mod_txnid && maindb_root &&
|
||||
likely(magic_and_version == MDBX_DATA_MAGIC)))) {
|
||||
if (report)
|
||||
WARNING(
|
||||
"catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN
|
||||
" %s",
|
||||
"main", maindb_mod_txnid, txnid,
|
||||
(env->stuck_meta < 0)
|
||||
? "(workaround for incoherent flaw of unified page/buffer cache)"
|
||||
: "(wagering meta)");
|
||||
ok = false;
|
||||
}
|
||||
if (likely(freedb_root && freedb_mod_txnid)) {
|
||||
VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->txnid));
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root, sizeof(freedb_root->txnid));
|
||||
const txnid_t root_txnid = freedb_root->txnid;
|
||||
if (unlikely(root_txnid != freedb_mod_txnid)) {
|
||||
if (report)
|
||||
WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN
|
||||
" for %sdb.mod_txnid %" PRIaTXN " %s",
|
||||
freedb_root_pgno, root_txnid, "free", freedb_mod_txnid,
|
||||
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of "
|
||||
"unified page/buffer cache)"
|
||||
: "(wagering meta)");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
if (likely(maindb_root && maindb_mod_txnid)) {
|
||||
VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->txnid));
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root, sizeof(maindb_root->txnid));
|
||||
const txnid_t root_txnid = maindb_root->txnid;
|
||||
if (unlikely(root_txnid != maindb_mod_txnid)) {
|
||||
if (report)
|
||||
WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN
|
||||
" for %sdb.mod_txnid %" PRIaTXN " %s",
|
||||
maindb_root_pgno, root_txnid, "main", maindb_mod_txnid,
|
||||
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of "
|
||||
"unified page/buffer cache)"
|
||||
: "(wagering meta)");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
if (unlikely(!ok) && report)
|
||||
env->lck->pgops.incoherence.weak =
|
||||
(env->lck->pgops.incoherence.weak >= INT32_MAX)
|
||||
? INT32_MAX
|
||||
: env->lck->pgops.incoherence.weak + 1;
|
||||
return ok;
|
||||
}
|
||||
|
||||
__cold int coherency_timeout(uint64_t *timestamp, intptr_t pgno,
|
||||
const MDBX_env *env) {
|
||||
if (likely(timestamp && *timestamp == 0))
|
||||
*timestamp = osal_monotime();
|
||||
else if (unlikely(!timestamp || osal_monotime() - *timestamp >
|
||||
osal_16dot16_to_monotime(65536 / 10))) {
|
||||
if (pgno >= 0 && pgno != env->stuck_meta)
|
||||
ERROR("bailout waiting for %" PRIuSIZE " page arrival %s", pgno,
|
||||
"(workaround for incoherent flaw of unified page/buffer cache)");
|
||||
else if (env->stuck_meta < 0)
|
||||
ERROR("bailout waiting for valid snapshot (%s)",
|
||||
"workaround for incoherent flaw of unified page/buffer cache");
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
|
||||
osal_memory_fence(mo_AcquireRelease, true);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
SwitchToThread();
|
||||
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
|
||||
sched_yield();
|
||||
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
|
||||
pthread_yield();
|
||||
#else
|
||||
usleep(42);
|
||||
#endif
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
/* check with timeout as the workaround
|
||||
* for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */
|
||||
__hot int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head,
|
||||
uint64_t *timestamp) {
|
||||
/* Copy the DB info and flags */
|
||||
txn->geo = head.ptr_v->geometry;
|
||||
memcpy(txn->dbs, &head.ptr_c->trees, sizeof(head.ptr_c->trees));
|
||||
STATIC_ASSERT(sizeof(head.ptr_c->trees) == CORE_DBS * sizeof(tree_t));
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(txn->dbs + CORE_DBS,
|
||||
txn->env->max_dbi - CORE_DBS);
|
||||
txn->canary = head.ptr_v->canary;
|
||||
|
||||
if (unlikely(!coherency_check(txn->env, head.txnid, txn->dbs, head.ptr_v,
|
||||
*timestamp == 0)))
|
||||
return coherency_timeout(timestamp, -1, txn->env);
|
||||
|
||||
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
|
||||
tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags));
|
||||
return MDBX_SUCCESS;
|
||||
}
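/* Editorial illustration (not part of the original sources): a sketch of the
 * retry discipline callers are expected to follow. The timestamp starts at
 * zero; coherency_timeout() arms it on the first miss and, once roughly a
 * tenth of a second has elapsed, gives up with MDBX_PROBLEM. Until then the
 * caller simply re-reads the recent meta and tries again:
 *
 *   uint64_t timestamp = 0;
 *   int rc;
 *   do {
 *     const meta_ptr_t head = ...;  // re-fetch the recent meta (placeholder)
 *     rc = coherency_check_head(txn, head, &timestamp);
 *   } while (rc == MDBX_RESULT_TRUE);
 */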
|
||||
|
||||
int coherency_check_written(const MDBX_env *env, const txnid_t txnid,
|
||||
const volatile meta_t *meta, const intptr_t pgno,
|
||||
uint64_t *timestamp) {
|
||||
const bool report = !(timestamp && *timestamp);
|
||||
const txnid_t head_txnid = meta_txnid(meta);
|
||||
if (unlikely(head_txnid < MIN_TXNID || head_txnid < txnid)) {
|
||||
if (report) {
|
||||
env->lck->pgops.incoherence.weak =
|
||||
(env->lck->pgops.incoherence.weak >= INT32_MAX)
|
||||
? INT32_MAX
|
||||
: env->lck->pgops.incoherence.weak + 1;
|
||||
WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s",
|
||||
(head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid,
|
||||
bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base)),
|
||||
"(workaround for incoherent flaw of unified page/buffer cache)");
|
||||
}
|
||||
return coherency_timeout(timestamp, pgno, env);
|
||||
}
|
||||
if (unlikely(
|
||||
!coherency_check(env, head_txnid, &meta->trees.gc, meta, report)))
|
||||
return coherency_timeout(timestamp, pgno, env);
|
||||
|
||||
eASSERT(env, meta->trees.gc.flags == MDBX_INTEGERKEY);
|
||||
eASSERT(env, check_sdb_flags(meta->trees.main.flags));
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
bool coherency_check_meta(const MDBX_env *env, const volatile meta_t *meta,
|
||||
bool report) {
|
||||
uint64_t timestamp = 0;
|
||||
return coherency_check_written(env, 0, meta, -1,
|
||||
report ? ×tamp : nullptr) == MDBX_SUCCESS;
|
||||
}
|
768
src/cold.c
Normal file
@ -0,0 +1,768 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold size_t mdbx_default_pagesize(void) {
|
||||
size_t pagesize = globals.sys_pagesize;
|
||||
ENSURE(nullptr, is_powerof2(pagesize));
|
||||
pagesize = (pagesize >= MDBX_MIN_PAGESIZE) ? pagesize : MDBX_MIN_PAGESIZE;
|
||||
pagesize = (pagesize <= MDBX_MAX_PAGESIZE) ? pagesize : MDBX_MAX_PAGESIZE;
|
||||
return pagesize;
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
return MIN_PAGENO * pagesize;
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
|
||||
const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
|
||||
return (limit < MAX_MAPSIZE) ? (intptr_t)limit : (intptr_t)MAX_MAPSIZE;
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
|
||||
const uint64_t pgl_limit =
|
||||
pagesize * (uint64_t)(PAGELIST_LIMIT / MDBX_GOLD_RATIO_DBL);
|
||||
const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL);
|
||||
return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit;
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
return keysize_max(pagesize, flags);
|
||||
}
|
||||
|
||||
__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (unlikely(!env || env->signature.weak != env_signature))
|
||||
return -1;
|
||||
|
||||
return (int)mdbx_limits_keysize_max((intptr_t)env->ps, flags);
|
||||
}
|
||||
|
||||
__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) {
|
||||
return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT);
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) {
|
||||
return keysize_min(flags);
|
||||
}
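/* Editorial illustration (not part of the original sources): querying the
 * key-length limits without an open environment, e.g. for a 4 KiB page:
 *
 *   const intptr_t plain = mdbx_limits_keysize_max(4096, MDBX_DB_DEFAULTS);
 *   const intptr_t dups  = mdbx_limits_keysize_max(4096, MDBX_DUPSORT);
 *   // both return -1 for an invalid page size; dups <= plain, since dupsort
 *   // keys must also leave room for a nested tree_t within a leaf node.
 */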
|
||||
|
||||
__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
return valsize_max(pagesize, flags);
|
||||
}
|
||||
|
||||
__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (unlikely(!env || env->signature.weak != env_signature))
|
||||
return -1;
|
||||
|
||||
return (int)mdbx_limits_valsize_max((intptr_t)env->ps, flags);
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) {
|
||||
return valsize_min(flags);
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
if (flags &
|
||||
(MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP))
|
||||
return BRANCH_NODE_MAX(pagesize) - NODESIZE;
|
||||
|
||||
return LEAF_NODE_MAX(pagesize) - NODESIZE;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (unlikely(!env || env->signature.weak != env_signature))
|
||||
return -1;
|
||||
|
||||
return (int)mdbx_limits_pairsize4page_max((intptr_t)env->ps, flags);
|
||||
}
|
||||
|
||||
__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (pagesize < 1)
|
||||
pagesize = (intptr_t)mdbx_default_pagesize();
|
||||
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
|
||||
pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
|
||||
!is_powerof2((size_t)pagesize)))
|
||||
return -1;
|
||||
|
||||
if (flags &
|
||||
(MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP))
|
||||
return valsize_max(pagesize, flags);
|
||||
|
||||
return PAGESPACE(pagesize);
|
||||
}
|
||||
|
||||
__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env,
|
||||
MDBX_db_flags_t flags) {
|
||||
if (unlikely(!env || env->signature.weak != env_signature))
|
||||
return -1;
|
||||
|
||||
return (int)mdbx_limits_valsize4page_max((intptr_t)env->ps, flags);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__cold static void stat_add(const tree_t *db, MDBX_stat *const st,
|
||||
const size_t bytes) {
|
||||
st->ms_depth += db->height;
|
||||
st->ms_branch_pages += db->branch_pages;
|
||||
st->ms_leaf_pages += db->leaf_pages;
|
||||
st->ms_overflow_pages += db->large_pages;
|
||||
st->ms_entries += db->items;
|
||||
if (likely(bytes >=
|
||||
offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
|
||||
st->ms_mod_txnid =
|
||||
(st->ms_mod_txnid > db->mod_txnid) ? st->ms_mod_txnid : db->mod_txnid;
|
||||
}
|
||||
|
||||
__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) {
|
||||
int err = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
cursor_couple_t cx;
|
||||
err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
const MDBX_env *const env = txn->env;
|
||||
st->ms_psize = env->ps;
|
||||
TXN_FOREACH_DBI_FROM(
|
||||
txn, dbi,
|
||||
      /* assuming GC is internal and not subject to accounting */ MAIN_DBI) {
|
||||
if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID)
|
||||
stat_add(txn->dbs + dbi, st, bytes);
|
||||
}
|
||||
|
||||
if (!(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT) &&
|
||||
txn->dbs[MAIN_DBI].items /* TODO: use `md_subs` field */) {
|
||||
|
||||
    /* scan and account for named subDBs that are not currently open */
|
||||
err = tree_search(&cx.outer, nullptr, Z_FIRST);
|
||||
while (err == MDBX_SUCCESS) {
|
||||
const page_t *mp = cx.outer.pg[cx.outer.top];
|
||||
for (size_t i = 0; i < page_numkeys(mp); i++) {
|
||||
const node_t *node = page_node(mp, i);
|
||||
if (node_flags(node) != N_SUBDATA)
|
||||
continue;
|
||||
if (unlikely(node_ds(node) != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid subDb node size", node_ds(node));
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
/* skip opened and already accounted */
|
||||
const MDBX_val name = {node_key(node), node_ks(node)};
|
||||
TXN_FOREACH_DBI_USER(txn, dbi) {
|
||||
if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID &&
|
||||
env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name) == 0) {
|
||||
node = nullptr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (node) {
|
||||
tree_t db;
|
||||
memcpy(&db, node_data(node), sizeof(db));
|
||||
stat_add(&db, st, bytes);
|
||||
}
|
||||
}
|
||||
err = cursor_sibling_right(&cx.outer);
|
||||
}
|
||||
if (unlikely(err != MDBX_NOTFOUND))
|
||||
return err;
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn,
|
||||
MDBX_stat *dest, size_t bytes) {
|
||||
if (unlikely(!dest))
|
||||
return MDBX_EINVAL;
|
||||
const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
|
||||
if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (likely(txn)) {
|
||||
if (env && unlikely(txn->env != env))
|
||||
return MDBX_EINVAL;
|
||||
return stat_acc(txn, dest, bytes);
|
||||
}
|
||||
|
||||
int err = check_env(env, true);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
if (env->txn && env_txn0_owned(env))
|
||||
/* inside write-txn */
|
||||
return stat_acc(env->txn, dest, bytes);
|
||||
|
||||
MDBX_txn *tmp_txn;
|
||||
err = mdbx_txn_begin((MDBX_env *)env, nullptr, MDBX_TXN_RDONLY, &tmp_txn);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
const int rc = stat_acc(tmp_txn, dest, bytes);
|
||||
err = mdbx_txn_abort(tmp_txn);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
return rc;
|
||||
}
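/* Editorial illustration (not part of the original sources): a minimal usage
 * sketch. Passing a txn aggregates over that snapshot; passing only the env
 * makes the function borrow the current write-txn or open a short read-txn:
 *
 *   MDBX_stat st;
 *   int rc = mdbx_env_stat_ex(env, txn, &st, sizeof(st));
 *   if (rc == MDBX_SUCCESS)
 *     printf("entries=%" PRIu64 "\n", st.ms_entries);  // needs <inttypes.h>
 */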
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
static size_t estimate_rss(size_t database_bytes) {
|
||||
return database_bytes + database_bytes / 64 +
|
||||
(512 + MDBX_WORDBITS * 16) * MEGABYTE;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
|
||||
MDBX_warmup_flags_t flags,
|
||||
unsigned timeout_seconds_16dot16) {
|
||||
if (unlikely(env == nullptr && txn == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
if (unlikely(flags >
|
||||
(MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock |
|
||||
MDBX_warmup_touchlimit | MDBX_warmup_release)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (txn) {
|
||||
int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
}
|
||||
if (env) {
|
||||
int err = check_env(env, false);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
if (txn && unlikely(txn->env != env))
|
||||
return MDBX_EINVAL;
|
||||
} else {
|
||||
env = txn->env;
|
||||
}
|
||||
|
||||
const uint64_t timeout_monotime =
|
||||
(timeout_seconds_16dot16 && (flags & MDBX_warmup_force))
|
||||
? osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16)
|
||||
: 0;
|
||||
|
||||
if (flags & MDBX_warmup_release)
|
||||
munlock_all(env);
|
||||
|
||||
pgno_t used_pgno;
|
||||
if (txn) {
|
||||
used_pgno = txn->geo.first_unallocated;
|
||||
} else {
|
||||
const troika_t troika = meta_tap(env);
|
||||
used_pgno = meta_recent(env, &troika).ptr_v->geometry.first_unallocated;
|
||||
}
|
||||
const size_t used_range = pgno_align2os_bytes(env, used_pgno);
|
||||
const pgno_t mlock_pgno = bytes2pgno(env, used_range);
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (flags & MDBX_warmup_touchlimit) {
|
||||
const size_t estimated_rss = estimate_rss(used_range);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
SIZE_T current_ws_lower, current_ws_upper;
|
||||
if (GetProcessWorkingSetSize(GetCurrentProcess(), ¤t_ws_lower,
|
||||
¤t_ws_upper) &&
|
||||
current_ws_lower < estimated_rss) {
|
||||
const SIZE_T ws_lower = estimated_rss;
|
||||
const SIZE_T ws_upper =
|
||||
(MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048)
|
||||
? ws_lower
|
||||
: ws_lower + MDBX_WORDBITS * MEGABYTE * 32;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) {
|
||||
rc = (int)GetLastError();
|
||||
WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower,
|
||||
ws_upper, rc);
|
||||
}
|
||||
}
|
||||
#endif /* Windows */
|
||||
#ifdef RLIMIT_RSS
|
||||
struct rlimit rss;
|
||||
if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) {
|
||||
rss.rlim_cur = estimated_rss;
|
||||
if (rss.rlim_max < estimated_rss)
|
||||
rss.rlim_max = estimated_rss;
|
||||
if (setrlimit(RLIMIT_RSS, &rss)) {
|
||||
rc = errno;
|
||||
WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS",
|
||||
(size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc);
|
||||
}
|
||||
}
|
||||
#endif /* RLIMIT_RSS */
|
||||
#ifdef RLIMIT_MEMLOCK
|
||||
if (flags & MDBX_warmup_lock) {
|
||||
struct rlimit memlock;
|
||||
if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 &&
|
||||
memlock.rlim_cur < estimated_rss) {
|
||||
memlock.rlim_cur = estimated_rss;
|
||||
if (memlock.rlim_max < estimated_rss)
|
||||
memlock.rlim_max = estimated_rss;
|
||||
if (setrlimit(RLIMIT_MEMLOCK, &memlock)) {
|
||||
rc = errno;
|
||||
WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK",
|
||||
(size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* RLIMIT_MEMLOCK */
|
||||
(void)estimated_rss;
|
||||
}
|
||||
|
||||
#if defined(MLOCK_ONFAULT) && \
|
||||
((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || \
|
||||
(defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \
|
||||
(defined(__linux__) || defined(__gnu_linux__))
|
||||
if ((flags & MDBX_warmup_lock) != 0 &&
|
||||
globals.linux_kernel_version >= 0x04040000 &&
|
||||
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
|
||||
if (mlock2(env->dxb_mmap.base, used_range, MLOCK_ONFAULT)) {
|
||||
rc = errno;
|
||||
WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc);
|
||||
} else {
|
||||
update_mlcnt(env, mlock_pgno, true);
|
||||
rc = MDBX_SUCCESS;
|
||||
}
|
||||
if (rc != EINVAL)
|
||||
flags -= MDBX_warmup_lock;
|
||||
}
|
||||
#endif /* MLOCK_ONFAULT */
|
||||
|
||||
int err = MDBX_ENOSYS;
|
||||
#if MDBX_ENABLE_MADVISE
|
||||
err = dxb_set_readahead(env, used_pgno, true, true);
|
||||
#else
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (imports.PrefetchVirtualMemory) {
|
||||
WIN32_MEMORY_RANGE_ENTRY hint;
|
||||
hint.VirtualAddress = env->dxb_mmap.base;
|
||||
hint.NumberOfBytes = used_range;
|
||||
if (imports.PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0))
|
||||
err = MDBX_SUCCESS;
|
||||
else {
|
||||
err = (int)GetLastError();
|
||||
ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err);
|
||||
}
|
||||
}
|
||||
#endif /* Windows */
|
||||
|
||||
#if defined(POSIX_MADV_WILLNEED)
|
||||
err = posix_madvise(env->dxb_mmap.base, used_range, POSIX_MADV_WILLNEED)
|
||||
? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
#elif defined(MADV_WILLNEED)
|
||||
err = madvise(env->dxb_mmap.base, used_range, MADV_WILLNEED)
|
||||
? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
#endif
|
||||
|
||||
#if defined(F_RDADVISE)
|
||||
if (err) {
|
||||
fcntl(env->lazy_fd, F_RDAHEAD, true);
|
||||
struct radvisory hint;
|
||||
hint.ra_offset = 0;
|
||||
hint.ra_count = unlikely(used_range > INT_MAX &&
|
||||
sizeof(used_range) > sizeof(hint.ra_count))
|
||||
? INT_MAX
|
||||
: (int)used_range;
|
||||
err = fcntl(env->lazy_fd, F_RDADVISE, &hint) ? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (err == ENOTTY)
|
||||
err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */;
|
||||
}
|
||||
#endif /* F_RDADVISE */
|
||||
#endif /* MDBX_ENABLE_MADVISE */
|
||||
if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS)
|
||||
rc = err;
|
||||
|
||||
if ((flags & MDBX_warmup_force) != 0 &&
|
||||
(rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) {
|
||||
const volatile uint8_t *ptr = env->dxb_mmap.base;
|
||||
size_t offset = 0, unused = 42;
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
if (flags & MDBX_warmup_oomsafe) {
|
||||
const int null_fd = open("/dev/null", O_WRONLY);
|
||||
if (unlikely(null_fd < 0))
|
||||
rc = errno;
|
||||
else {
|
||||
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
|
||||
for (;;) {
|
||||
unsigned i;
|
||||
for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) {
|
||||
iov[i].iov_base = (void *)(ptr + offset);
|
||||
iov[i].iov_len = 1;
|
||||
offset += globals.sys_pagesize;
|
||||
}
|
||||
if (unlikely(writev(null_fd, iov, i) < 0)) {
|
||||
rc = errno;
|
||||
if (rc == EFAULT)
|
||||
rc = ENOMEM;
|
||||
break;
|
||||
}
|
||||
if (offset >= used_range) {
|
||||
rc = MDBX_SUCCESS;
|
||||
break;
|
||||
}
|
||||
if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
close(null_fd);
|
||||
}
|
||||
} else
|
||||
#endif /* Windows */
|
||||
for (;;) {
|
||||
unused += ptr[offset];
|
||||
offset += globals.sys_pagesize;
|
||||
if (offset >= used_range) {
|
||||
rc = MDBX_SUCCESS;
|
||||
break;
|
||||
}
|
||||
if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
(void)unused;
|
||||
}
|
||||
|
||||
if ((flags & MDBX_warmup_lock) != 0 &&
|
||||
(rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) &&
|
||||
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (VirtualLock(env->dxb_mmap.base, used_range)) {
|
||||
update_mlcnt(env, mlock_pgno, true);
|
||||
rc = MDBX_SUCCESS;
|
||||
} else {
|
||||
rc = (int)GetLastError();
|
||||
WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc);
|
||||
}
|
||||
#elif defined(_POSIX_MEMLOCK_RANGE)
|
||||
if (mlock(env->dxb_mmap.base, used_range) == 0) {
|
||||
update_mlcnt(env, mlock_pgno, true);
|
||||
rc = MDBX_SUCCESS;
|
||||
} else {
|
||||
rc = errno;
|
||||
WARNING("%s(%zu) error %d", "mlock", used_range, rc);
|
||||
}
|
||||
#else
|
||||
rc = MDBX_ENOSYS;
|
||||
#endif
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
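/* Editorial illustration (not part of the original sources): prefetch the
 * used part of the database and try to pin it in RAM, giving up after about
 * ten seconds (the timeout argument is in 16.16 fixed-point seconds):
 *
 *   int rc = mdbx_env_warmup(env, nullptr,
 *                            MDBX_warmup_force | MDBX_warmup_lock,
 *                            10 << 16);
 *   // MDBX_RESULT_TRUE here means the timeout expired before completion.
 */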
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!arg))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
*arg = env->lazy_fd;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags,
|
||||
bool onoff) {
|
||||
int rc = check_env(env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(flags & ((env->flags & ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS
|
||||
: ~ENV_USABLE_FLAGS)))
|
||||
return MDBX_EPERM;
|
||||
|
||||
if (unlikely(env->flags & MDBX_RDONLY))
|
||||
return MDBX_EACCESS;
|
||||
|
||||
const bool lock_needed = (env->flags & ENV_ACTIVE) && !env_txn0_owned(env);
|
||||
bool should_unlock = false;
|
||||
if (lock_needed) {
|
||||
rc = lck_txn_lock(env, false);
|
||||
if (unlikely(rc))
|
||||
return rc;
|
||||
should_unlock = true;
|
||||
}
|
||||
|
||||
if (onoff)
|
||||
env->flags = combine_durability_flags(env->flags, flags);
|
||||
else
|
||||
env->flags &= ~flags;
|
||||
|
||||
if (should_unlock)
|
||||
lck_txn_unlock(env);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) {
|
||||
int rc = check_env(env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!arg))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
*arg = env->flags & ENV_USABLE_FLAGS;
|
||||
return MDBX_SUCCESS;
|
||||
}
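/* Editorial illustration (not part of the original sources): toggling a
 * runtime-changeable durability flag after the environment is open
 * (MDBX_SAFE_NOSYNC is used here only as an example of such a flag):
 *
 *   int rc = mdbx_env_set_flags(env, MDBX_SAFE_NOSYNC, true);   // relax
 *   ...
 *   rc = mdbx_env_set_flags(env, MDBX_SAFE_NOSYNC, false);      // restore
 *
 * mdbx_env_get_flags() reports the currently effective flags, masked to the
 * publicly usable bits. */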
|
||||
|
||||
__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) {
|
||||
int rc = check_env(env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
env->userctx = ctx;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold void *mdbx_env_get_userctx(const MDBX_env *env) {
|
||||
return env ? env->userctx : nullptr;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) {
|
||||
int rc = check_env(env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
#if MDBX_DEBUG
|
||||
env->assert_func = func;
|
||||
return MDBX_SUCCESS;
|
||||
#else
|
||||
(void)func;
|
||||
return MDBX_ENOSYS;
|
||||
#endif
|
||||
}
|
||||
|
||||
__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
|
||||
int rc = check_env(env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
env->hsr_callback = hsr;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) {
|
||||
return likely(env && env->signature.weak == env_signature) ? env->hsr_callback
|
||||
: nullptr;
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!arg))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
*arg = env->pathname.specified;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
#endif /* Windows */
|
||||
|
||||
__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!arg))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (!env->pathname_char) {
|
||||
*arg = nullptr;
|
||||
DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80;
|
||||
size_t mb_len =
|
||||
WideCharToMultiByte(CP_THREAD_ACP, flags, env->pathname.specified, -1,
|
||||
nullptr, 0, nullptr, nullptr);
|
||||
rc = mb_len ? MDBX_SUCCESS : (int)GetLastError();
|
||||
if (rc == ERROR_INVALID_FLAGS) {
|
||||
mb_len =
|
||||
WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->pathname.specified,
|
||||
-1, nullptr, 0, nullptr, nullptr);
|
||||
rc = mb_len ? MDBX_SUCCESS : (int)GetLastError();
|
||||
}
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
char *const mb_pathname = osal_malloc(mb_len);
|
||||
if (!mb_pathname)
|
||||
return MDBX_ENOMEM;
|
||||
if (mb_len != (size_t)WideCharToMultiByte(
|
||||
CP_THREAD_ACP, flags, env->pathname.specified, -1,
|
||||
mb_pathname, (int)mb_len, nullptr, nullptr)) {
|
||||
rc = (int)GetLastError();
|
||||
osal_free(mb_pathname);
|
||||
return rc;
|
||||
}
|
||||
if (env->pathname_char ||
|
||||
InterlockedCompareExchangePointer((PVOID volatile *)&env->pathname_char,
|
||||
mb_pathname, nullptr))
|
||||
osal_free(mb_pathname);
|
||||
}
|
||||
*arg = env->pathname_char;
|
||||
#else
|
||||
*arg = env->pathname.specified;
|
||||
#endif /* Windows */
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* Legacy API */
|
||||
|
||||
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
|
||||
|
||||
LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent,
|
||||
MDBX_txn_flags_t flags, MDBX_txn **ret) {
|
||||
return __inline_mdbx_txn_begin(env, parent, flags, ret);
|
||||
}
|
||||
|
||||
LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn) {
|
||||
return __inline_mdbx_txn_commit(txn);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat,
|
||||
size_t bytes) {
|
||||
return __inline_mdbx_env_stat(env, stat, bytes);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info,
|
||||
size_t bytes) {
|
||||
return __inline_mdbx_env_info(env, info, bytes);
|
||||
}
|
||||
|
||||
LIBMDBX_API int mdbx_dbi_flags(const MDBX_txn *txn, MDBX_dbi dbi,
|
||||
unsigned *flags) {
|
||||
return __inline_mdbx_dbi_flags(txn, dbi, flags);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_sync(MDBX_env *env) {
|
||||
return __inline_mdbx_env_sync(env);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_sync_poll(MDBX_env *env) {
|
||||
return __inline_mdbx_env_sync_poll(env);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_close(MDBX_env *env) {
|
||||
return __inline_mdbx_env_close(env);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
|
||||
return __inline_mdbx_env_set_mapsize(env, size);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
|
||||
return __inline_mdbx_env_set_maxdbs(env, dbs);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
|
||||
return __inline_mdbx_env_get_maxdbs(env, dbs);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_set_maxreaders(MDBX_env *env,
|
||||
unsigned readers) {
|
||||
return __inline_mdbx_env_set_maxreaders(env, readers);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_get_maxreaders(const MDBX_env *env,
|
||||
unsigned *readers) {
|
||||
return __inline_mdbx_env_get_maxreaders(env, readers);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) {
|
||||
return __inline_mdbx_env_set_syncbytes(env, threshold);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_get_syncbytes(const MDBX_env *env,
|
||||
size_t *threshold) {
|
||||
return __inline_mdbx_env_get_syncbytes(env, threshold);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_set_syncperiod(MDBX_env *env,
|
||||
unsigned seconds_16dot16) {
|
||||
return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold int mdbx_env_get_syncperiod(const MDBX_env *env,
|
||||
unsigned *seconds_16dot16) {
|
||||
return __inline_mdbx_env_get_syncperiod(env, seconds_16dot16);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold uint64_t mdbx_key_from_int64(const int64_t i64) {
|
||||
return __inline_mdbx_key_from_int64(i64);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold uint32_t mdbx_key_from_int32(const int32_t i32) {
|
||||
return __inline_mdbx_key_from_int32(i32);
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_min(void) {
|
||||
return __inline_mdbx_limits_pgsize_min();
|
||||
}
|
||||
|
||||
LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_max(void) {
|
||||
return __inline_mdbx_limits_pgsize_max();
|
||||
}
|
||||
|
||||
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
|
@ -11,6 +11,9 @@
|
||||
#cmakedefine ENABLE_ASAN
|
||||
#cmakedefine ENABLE_UBSAN
|
||||
#cmakedefine01 MDBX_FORCE_ASSERTIONS
|
||||
#if !defined(MDBX_BUILD_TEST) && !defined(MDBX_BUILD_CXX)
|
||||
#cmakedefine01 MDBX_BUILD_CXX
|
||||
#endif
|
||||
|
||||
/* Common */
|
||||
#cmakedefine01 MDBX_TXN_CHECKOWNER
|
||||
@ -37,7 +40,9 @@
|
||||
#cmakedefine01 MDBX_ENABLE_DBI_LOCKFREE
|
||||
|
||||
/* Windows */
|
||||
#if !defined(MDBX_BUILD_TEST) && !defined(MDBX_WITHOUT_MSVC_CRT)
|
||||
#cmakedefine01 MDBX_WITHOUT_MSVC_CRT
|
||||
#endif
|
||||
|
||||
/* MacOS & iOS */
|
||||
#cmakedefine01 MDBX_OSX_SPEED_INSTEADOF_DURABILITY
|
||||
|
781
src/copy.c
Normal file
@ -0,0 +1,781 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanations of the license
/// change, credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
typedef struct compacting_context {
|
||||
MDBX_env *env;
|
||||
MDBX_txn *txn;
|
||||
pgno_t first_unallocated;
|
||||
osal_condpair_t condpair;
|
||||
volatile unsigned head;
|
||||
volatile unsigned tail;
|
||||
uint8_t *write_buf[2];
|
||||
size_t write_len[2];
|
||||
/* Error code. Never cleared if set. Both threads can set nonzero
|
||||
* to fail the copy. Not mutex-protected, expects atomic int. */
|
||||
volatile int error;
|
||||
mdbx_filehandle_t fd;
|
||||
} ctx_t;
|
||||
|
||||
__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree);
|
||||
|
||||
/* Dedicated writer thread for compacting copy. */
|
||||
__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
|
||||
ctx_t *const ctx = arg;
|
||||
|
||||
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
|
||||
sigset_t sigset;
|
||||
sigemptyset(&sigset);
|
||||
sigaddset(&sigset, SIGPIPE);
|
||||
ctx->error = pthread_sigmask(SIG_BLOCK, &sigset, nullptr);
|
||||
#endif /* EPIPE */
|
||||
|
||||
osal_condpair_lock(&ctx->condpair);
|
||||
while (!ctx->error) {
|
||||
while (ctx->tail == ctx->head && !ctx->error) {
|
||||
int err = osal_condpair_wait(&ctx->condpair, true);
|
||||
if (err != MDBX_SUCCESS) {
|
||||
ctx->error = err;
|
||||
goto bailout;
|
||||
}
|
||||
}
|
||||
const unsigned toggle = ctx->tail & 1;
|
||||
size_t wsize = ctx->write_len[toggle];
|
||||
if (wsize == 0) {
|
||||
ctx->tail += 1;
|
||||
break /* EOF */;
|
||||
}
|
||||
ctx->write_len[toggle] = 0;
|
||||
uint8_t *ptr = ctx->write_buf[toggle];
|
||||
if (!ctx->error) {
|
||||
int err = osal_write(ctx->fd, ptr, wsize);
|
||||
if (err != MDBX_SUCCESS) {
|
||||
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
|
||||
if (err == EPIPE) {
|
||||
/* Collect the pending SIGPIPE,
|
||||
* otherwise at least OS X gives it to the process on thread-exit. */
|
||||
int unused;
|
||||
sigwait(&sigset, &unused);
|
||||
}
|
||||
#endif /* EPIPE */
|
||||
ctx->error = err;
|
||||
goto bailout;
|
||||
}
|
||||
}
|
||||
ctx->tail += 1;
|
||||
osal_condpair_signal(&ctx->condpair, false);
|
||||
}
|
||||
bailout:
|
||||
osal_condpair_unlock(&ctx->condpair);
|
||||
return (THREAD_RESULT)0;
|
||||
}
|
||||
|
||||
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */
|
||||
__cold static int compacting_toggle_write_buffers(ctx_t *ctx) {
|
||||
osal_condpair_lock(&ctx->condpair);
|
||||
eASSERT(ctx->env, ctx->head - ctx->tail < 2 || ctx->error);
|
||||
ctx->head += 1;
|
||||
osal_condpair_signal(&ctx->condpair, true);
|
||||
while (!ctx->error && ctx->head - ctx->tail == 2 /* both buffers in use */) {
|
||||
int err = osal_condpair_wait(&ctx->condpair, false);
|
||||
if (err != MDBX_SUCCESS)
|
||||
ctx->error = err;
|
||||
}
|
||||
osal_condpair_unlock(&ctx->condpair);
|
||||
return ctx->error;
|
||||
}
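/* Editorial note (not part of the original sources): the two write buffers
 * form a tiny ring shared with the writer thread. Invariants implied by the
 * code above:
 *   - head is advanced only by the producer, tail only by the writer thread,
 *     so 0 <= head - tail <= 2 at all times;
 *   - the buffer a counter refers to is simply (counter & 1);
 *   - head - tail == 2 means both buffers are in use and the producer must
 *     wait, while write_len[...] == 0 is the EOF marker for the writer. */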
|
||||
|
||||
static int compacting_put_bytes(ctx_t *ctx, const void *src, size_t bytes,
|
||||
pgno_t pgno, pgno_t npages) {
|
||||
assert(pgno == 0 || bytes > PAGEHDRSZ);
|
||||
while (bytes > 0) {
|
||||
const size_t side = ctx->head & 1;
|
||||
const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->write_len[side];
|
||||
if (left < (pgno ? PAGEHDRSZ : 1)) {
|
||||
int err = compacting_toggle_write_buffers(ctx);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
continue;
|
||||
}
|
||||
const size_t chunk = (bytes < left) ? bytes : left;
|
||||
void *const dst = ctx->write_buf[side] + ctx->write_len[side];
|
||||
if (src) {
|
||||
memcpy(dst, src, chunk);
|
||||
if (pgno) {
|
||||
assert(chunk > PAGEHDRSZ);
|
||||
page_t *mp = dst;
|
||||
mp->pgno = pgno;
|
||||
if (mp->txnid == 0)
|
||||
mp->txnid = ctx->txn->txnid;
|
||||
if (mp->flags == P_LARGE) {
|
||||
assert(bytes <= pgno2bytes(ctx->env, npages));
|
||||
mp->pages = npages;
|
||||
}
|
||||
pgno = 0;
|
||||
}
|
||||
src = ptr_disp(src, chunk);
|
||||
} else
|
||||
memset(dst, 0, chunk);
|
||||
bytes -= chunk;
|
||||
ctx->write_len[side] += chunk;
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static int compacting_put_page(ctx_t *ctx, const page_t *mp,
|
||||
const size_t head_bytes, const size_t tail_bytes,
|
||||
const pgno_t npages) {
|
||||
if (tail_bytes) {
|
||||
assert(head_bytes + tail_bytes <= ctx->env->ps);
|
||||
assert(npages == 1 &&
|
||||
(page_type(mp) == P_BRANCH || page_type(mp) == P_LEAF));
|
||||
} else {
|
||||
assert(head_bytes <= pgno2bytes(ctx->env, npages));
|
||||
assert((npages == 1 && page_type(mp) == (P_LEAF | P_DUPFIX)) ||
|
||||
page_type(mp) == P_LARGE);
|
||||
}
|
||||
|
||||
const pgno_t pgno = ctx->first_unallocated;
|
||||
ctx->first_unallocated += npages;
|
||||
int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
err = compacting_put_bytes(
|
||||
ctx, nullptr, pgno2bytes(ctx->env, npages) - (head_bytes + tail_bytes), 0,
|
||||
0);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
return compacting_put_bytes(ctx, ptr_disp(mp, ctx->env->ps - tail_bytes),
|
||||
tail_bytes, 0, 0);
|
||||
}
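/* Editor's note (worked example, illustration only): for a regular leaf or
 * branch page the three compacting_put_bytes() calls above emit, in order,
 * the page header plus the key-pointer array (head_bytes), a zero-filled gap
 * for the unused middle of the page, and the node data packed at the tail
 * (tail_bytes); the gap is simply the page size minus head and tail. */
#include <stddef.h>

static size_t compacting_gap_bytes(size_t page_size, size_t head_bytes,
                                   size_t tail_bytes) {
  /* e.g. page_size = 4096, head = 200, tail = 1500  ->  gap = 2396 bytes */
  return page_size - (head_bytes + tail_bytes);
}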
|
||||
|
||||
__cold static int compacting_walk(ctx_t *ctx, MDBX_cursor *mc,
|
||||
pgno_t *const parent_pgno,
|
||||
txnid_t parent_txnid) {
|
||||
mc->top = 0;
|
||||
mc->ki[0] = 0;
|
||||
int rc = page_get(mc, *parent_pgno, &mc->pg[0], parent_txnid);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = tree_search_finalize(mc, nullptr, Z_FIRST);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
/* Make cursor pages writable */
|
||||
const intptr_t deep_limit = mc->top + 1;
|
||||
void *const buf = osal_malloc(pgno2bytes(ctx->env, deep_limit + 1));
|
||||
if (buf == nullptr)
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
void *ptr = buf;
|
||||
for (intptr_t i = 0; i <= mc->top; i++) {
|
||||
page_copy(ptr, mc->pg[i], ctx->env->ps);
|
||||
mc->pg[i] = ptr;
|
||||
ptr = ptr_disp(ptr, ctx->env->ps);
|
||||
}
|
||||
/* This is writable space for a leaf page. Usually not needed. */
|
||||
page_t *const leaf = ptr;
|
||||
|
||||
while (mc->top >= 0) {
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
if (is_leaf(mp)) {
|
||||
if (!(mc->flags &
|
||||
z_inner) /* may have nested N_SUBDATA or N_BIGDATA nodes */) {
|
||||
for (size_t i = 0; i < nkeys; i++) {
|
||||
node_t *node = page_node(mp, i);
|
||||
if (node_flags(node) == N_BIGDATA) {
|
||||
/* Need writable leaf */
|
||||
if (mp != leaf) {
|
||||
mc->pg[mc->top] = leaf;
|
||||
page_copy(leaf, mp, ctx->env->ps);
|
||||
mp = leaf;
|
||||
node = page_node(mp, i);
|
||||
}
|
||||
|
||||
const pgr_t lp =
|
||||
page_get_large(mc, node_largedata_pgno(node), mp->txnid);
|
||||
if (unlikely((rc = lp.err) != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
const size_t datasize = node_ds(node);
|
||||
const pgno_t npages = largechunk_npages(ctx->env, datasize);
|
||||
poke_pgno(node_data(node), ctx->first_unallocated);
|
||||
rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0,
|
||||
npages);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
} else if (node_flags(node) & N_SUBDATA) {
|
||||
if (!MDBX_DISABLE_VALIDATION &&
|
||||
unlikely(node_ds(node) != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid dupsort sub-tree node size",
|
||||
(unsigned)node_ds(node));
|
||||
rc = MDBX_CORRUPTED;
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
/* Need writable leaf */
|
||||
if (mp != leaf) {
|
||||
mc->pg[mc->top] = leaf;
|
||||
page_copy(leaf, mp, ctx->env->ps);
|
||||
mp = leaf;
|
||||
node = page_node(mp, i);
|
||||
}
|
||||
|
||||
tree_t *nested = nullptr;
|
||||
if (node_flags(node) & N_DUPDATA) {
|
||||
rc = cursor_dupsort_setup(mc, node, mp);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
nested = &mc->subcur->nested_tree;
|
||||
rc = compacting_walk(ctx, &mc->subcur->cursor, &nested->root,
|
||||
mp->txnid);
|
||||
}
|
||||
} else {
|
||||
cASSERT(mc, (mc->flags & z_inner) == 0 && mc->subcur == 0);
|
||||
cursor_couple_t *couple =
|
||||
container_of(mc, cursor_couple_t, outer);
|
||||
nested = &couple->inner.nested_tree;
|
||||
memcpy(nested, node_data(node), sizeof(tree_t));
|
||||
rc = compacting_walk_tree(ctx, nested);
|
||||
}
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
memcpy(node_data(node), nested, sizeof(tree_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
mc->ki[mc->top]++;
|
||||
if (mc->ki[mc->top] < nkeys) {
|
||||
for (;;) {
|
||||
const node_t *node = page_node(mp, mc->ki[mc->top]);
|
||||
rc = page_get(mc, node_pgno(node), &mp, mp->txnid);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
mc->top += 1;
|
||||
if (unlikely(mc->top >= deep_limit)) {
|
||||
rc = MDBX_CURSOR_FULL;
|
||||
goto bailout;
|
||||
}
|
||||
mc->ki[mc->top] = 0;
|
||||
if (!is_branch(mp)) {
|
||||
mc->pg[mc->top] = mp;
|
||||
break;
|
||||
}
|
||||
/* Whenever we advance to a sibling branch page,
|
||||
* we must proceed all the way down to its first leaf. */
|
||||
page_copy(mc->pg[mc->top], mp, ctx->env->ps);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const pgno_t pgno = ctx->first_unallocated;
|
||||
if (likely(!is_dupfix_leaf(mp))) {
|
||||
rc = compacting_put_page(ctx, mp, PAGEHDRSZ + mp->lower,
|
||||
ctx->env->ps - (PAGEHDRSZ + mp->upper), 1);
|
||||
} else {
|
||||
rc = compacting_put_page(
|
||||
ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->dupfix_ksize, 0, 1);
|
||||
}
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
if (mc->top) {
|
||||
/* Update parent if there is one */
|
||||
node_set_pgno(page_node(mc->pg[mc->top - 1], mc->ki[mc->top - 1]), pgno);
|
||||
cursor_pop(mc);
|
||||
} else {
|
||||
/* Otherwise we're done */
|
||||
*parent_pgno = pgno;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bailout:
|
||||
osal_free(buf);
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree) {
|
||||
if (unlikely(tree->root == P_INVALID))
|
||||
return MDBX_SUCCESS; /* empty db */
|
||||
|
||||
cursor_couple_t couple;
|
||||
memset(&couple, 0, sizeof(couple));
|
||||
couple.inner.cursor.signature = ~cur_signature_live;
|
||||
kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}};
|
||||
int rc = cursor_init4walk(&couple, ctx->txn, tree, &kvx);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
couple.outer.checking |= z_ignord | z_pagecheck;
|
||||
couple.inner.cursor.checking |= z_ignord | z_pagecheck;
|
||||
if (!tree->mod_txnid)
|
||||
tree->mod_txnid = ctx->txn->txnid;
|
||||
return compacting_walk(ctx, &couple.outer, &tree->root, tree->mod_txnid);
|
||||
}
|
||||
|
||||
__cold static void compacting_fixup_meta(MDBX_env *env, meta_t *meta) {
|
||||
eASSERT(env, meta->trees.gc.mod_txnid || meta->trees.gc.root == P_INVALID);
|
||||
eASSERT(env,
|
||||
meta->trees.main.mod_txnid || meta->trees.main.root == P_INVALID);
|
||||
|
||||
/* Calculate the file size, taking the shrink/grow thresholds into account */
|
||||
if (meta->geometry.first_unallocated != meta->geometry.now) {
|
||||
meta->geometry.now = meta->geometry.first_unallocated;
|
||||
const size_t aligner =
|
||||
pv2pages(meta->geometry.grow_pv ? meta->geometry.grow_pv
|
||||
: meta->geometry.shrink_pv);
|
||||
if (aligner) {
|
||||
const pgno_t aligned = pgno_align2os_pgno(
|
||||
env, meta->geometry.first_unallocated + aligner -
|
||||
meta->geometry.first_unallocated % aligner);
|
||||
meta->geometry.now = aligned;
|
||||
}
|
||||
}
|
||||
|
||||
if (meta->geometry.now < meta->geometry.lower)
|
||||
meta->geometry.now = meta->geometry.lower;
|
||||
if (meta->geometry.now > meta->geometry.upper)
|
||||
meta->geometry.now = meta->geometry.upper;
|
||||
|
||||
/* Update signature */
|
||||
assert(meta->geometry.now >= meta->geometry.first_unallocated);
|
||||
meta_sign_as_steady(meta);
|
||||
}
|
||||
|
||||
/* Make resizable */
|
||||
__cold static void meta_make_sizeable(meta_t *meta) {
|
||||
meta->geometry.lower = MIN_PAGENO;
|
||||
if (meta->geometry.grow_pv == 0) {
|
||||
const pgno_t step = 1 + (meta->geometry.upper - meta->geometry.lower) / 42;
|
||||
meta->geometry.grow_pv = pages2pv(step);
|
||||
}
|
||||
if (meta->geometry.shrink_pv == 0) {
|
||||
const pgno_t step = pv2pages(meta->geometry.grow_pv) << 1;
|
||||
meta->geometry.shrink_pv = pages2pv(step);
|
||||
}
|
||||
}
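/* Editor's note (worked example, illustration only): meta_make_sizeable()
 * picks a growth step of one page plus 1/42 of the whole geometry range, and
 * a shrink threshold of twice that step (both are then quantized through
 * pages2pv()). For instance: */
#include <stdio.h>

static void growth_step_example(void) {
  const unsigned lower = 1, upper = 1048576;      /* geometry bounds, pages */
  const unsigned grow = 1 + (upper - lower) / 42; /* 24967 pages */
  const unsigned shrink = grow << 1;              /* 49934 pages */
  printf("grow=%u pages, shrink=%u pages\n", grow, shrink);
}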
|
||||
|
||||
__cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *read_txn,
|
||||
mdbx_filehandle_t fd, uint8_t *buffer,
|
||||
const bool dest_is_pipe,
|
||||
const MDBX_copy_flags_t flags) {
|
||||
const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
|
||||
uint8_t *const data_buffer =
|
||||
buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize);
|
||||
meta_t *const meta = meta_init_triplet(env, buffer);
|
||||
meta_set_txnid(env, meta, read_txn->txnid);
|
||||
|
||||
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
|
||||
meta_make_sizeable(meta);
|
||||
|
||||
/* copy canary sequences if present */
|
||||
if (read_txn->canary.v) {
|
||||
meta->canary = read_txn->canary;
|
||||
meta->canary.v = constmeta_txnid(meta);
|
||||
}
|
||||
|
||||
if (read_txn->dbs[MAIN_DBI].root == P_INVALID) {
|
||||
/* When the DB is empty, handle it specially to
|
||||
* fix any breakage like page leaks from ITS#8174. */
|
||||
meta->trees.main.flags = read_txn->dbs[MAIN_DBI].flags;
|
||||
compacting_fixup_meta(env, meta);
|
||||
if (dest_is_pipe) {
|
||||
int rc = osal_write(fd, buffer, meta_bytes);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
} else {
|
||||
/* Count free pages + GC pages. */
|
||||
cursor_couple_t couple;
|
||||
int rc = cursor_init(&couple.outer, read_txn, FREE_DBI);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
pgno_t gc_npages = read_txn->dbs[FREE_DBI].branch_pages +
|
||||
read_txn->dbs[FREE_DBI].leaf_pages +
|
||||
read_txn->dbs[FREE_DBI].large_pages;
|
||||
MDBX_val key, data;
|
||||
rc = outer_first(&couple.outer, &key, &data);
|
||||
while (rc == MDBX_SUCCESS) {
|
||||
const pnl_t pnl = data.iov_base;
|
||||
if (unlikely(data.iov_len % sizeof(pgno_t) ||
|
||||
data.iov_len < MDBX_PNL_SIZEOF(pnl))) {
|
||||
ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid GC-record length", data.iov_len);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
if (unlikely(!pnl_check(pnl, read_txn->geo.first_unallocated))) {
|
||||
ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid GC-record content");
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
gc_npages += MDBX_PNL_GETSIZE(pnl);
|
||||
rc = outer_next(&couple.outer, &key, &data, MDBX_NEXT);
|
||||
}
|
||||
if (unlikely(rc != MDBX_NOTFOUND))
|
||||
return rc;
|
||||
|
||||
meta->geometry.first_unallocated =
|
||||
read_txn->geo.first_unallocated - gc_npages;
|
||||
meta->trees.main = read_txn->dbs[MAIN_DBI];
|
||||
|
||||
ctx_t ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
rc = osal_condpair_init(&ctx.condpair);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF);
|
||||
ctx.write_buf[0] = data_buffer;
|
||||
ctx.write_buf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF;
|
||||
ctx.first_unallocated = NUM_METAS;
|
||||
ctx.env = env;
|
||||
ctx.fd = fd;
|
||||
ctx.txn = read_txn;
|
||||
|
||||
osal_thread_t thread;
|
||||
int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx);
|
||||
if (likely(thread_err == MDBX_SUCCESS)) {
|
||||
if (dest_is_pipe) {
|
||||
if (!meta->trees.main.mod_txnid)
|
||||
meta->trees.main.mod_txnid = read_txn->txnid;
|
||||
compacting_fixup_meta(env, meta);
|
||||
rc = osal_write(fd, buffer, meta_bytes);
|
||||
}
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = compacting_walk_tree(&ctx, &meta->trees.main);
|
||||
if (ctx.write_len[ctx.head & 1])
|
||||
/* toggle to flush non-empty buffers */
|
||||
compacting_toggle_write_buffers(&ctx);
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS) &&
|
||||
unlikely(meta->geometry.first_unallocated != ctx.first_unallocated)) {
|
||||
if (ctx.first_unallocated > meta->geometry.first_unallocated) {
|
||||
ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO
|
||||
" %c expected %" PRIaPGNO,
|
||||
"has double-used pages or other corruption",
|
||||
ctx.first_unallocated, '>', meta->geometry.first_unallocated);
|
||||
rc = MDBX_CORRUPTED; /* corrupted DB */
|
||||
}
|
||||
if (ctx.first_unallocated < meta->geometry.first_unallocated) {
|
||||
WARNING(
|
||||
"the source DB %s: post-compactification used pages %" PRIaPGNO
|
||||
" %c expected %" PRIaPGNO,
|
||||
"has page leak(s)", ctx.first_unallocated, '<',
|
||||
meta->geometry.first_unallocated);
|
||||
if (dest_is_pipe)
|
||||
/* the root recorded in the already-written meta-pages is wrong */
|
||||
rc = MDBX_CORRUPTED;
|
||||
}
|
||||
/* fixup meta */
|
||||
meta->geometry.first_unallocated = ctx.first_unallocated;
|
||||
}
|
||||
|
||||
/* toggle with empty buffers to exit thread's loop */
|
||||
eASSERT(env, (ctx.write_len[ctx.head & 1]) == 0);
|
||||
compacting_toggle_write_buffers(&ctx);
|
||||
thread_err = osal_thread_join(thread);
|
||||
eASSERT(env, (ctx.tail == ctx.head && ctx.write_len[ctx.head & 1] == 0) ||
|
||||
ctx.error);
|
||||
osal_condpair_destroy(&ctx.condpair);
|
||||
}
|
||||
if (unlikely(thread_err != MDBX_SUCCESS))
|
||||
return thread_err;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
if (unlikely(ctx.error != MDBX_SUCCESS))
|
||||
return ctx.error;
|
||||
if (!dest_is_pipe)
|
||||
compacting_fixup_meta(env, meta);
|
||||
}
|
||||
|
||||
/* Extend file if required */
|
||||
if (meta->geometry.now != meta->geometry.first_unallocated) {
|
||||
const size_t whole_size = pgno2bytes(env, meta->geometry.now);
|
||||
if (!dest_is_pipe)
|
||||
return osal_ftruncate(fd, whole_size);
|
||||
|
||||
const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated);
|
||||
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
|
||||
for (size_t offset = used_size; offset < whole_size;) {
|
||||
const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
|
||||
? (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||
: whole_size - offset;
|
||||
int rc = osal_write(fd, data_buffer, chunk);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
offset += chunk;
|
||||
}
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold static int copy_asis(MDBX_env *env, MDBX_txn *read_txn,
|
||||
mdbx_filehandle_t fd, uint8_t *buffer,
|
||||
const bool dest_is_pipe,
|
||||
const MDBX_copy_flags_t flags) {
|
||||
int rc = txn_end(read_txn, TXN_END_RESET_TMP);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
/* Temporarily block writers until we snapshot the meta pages */
|
||||
rc = lck_txn_lock(env, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = txn_renew(read_txn, MDBX_TXN_RDONLY);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
lck_txn_unlock(env);
|
||||
return rc;
|
||||
}
|
||||
|
||||
jitter4testing(false);
|
||||
const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
|
||||
const troika_t troika = meta_tap(env);
|
||||
/* Make a snapshot of the meta-pages,
 * but write them out only after the data has been flushed */
|
||||
memcpy(buffer, env->dxb_mmap.base, meta_bytes);
|
||||
meta_t *const headcopy = /* LY: get pointer to the snapshot copy */
|
||||
ptr_disp(buffer,
|
||||
ptr_dist(meta_recent(env, &troika).ptr_c, env->dxb_mmap.base));
|
||||
lck_txn_unlock(env);
|
||||
|
||||
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
|
||||
meta_make_sizeable(headcopy);
|
||||
/* Update signature to steady */
|
||||
meta_sign_as_steady(headcopy);
|
||||
|
||||
/* Copy the data */
|
||||
const size_t whole_size = pgno_align2os_bytes(env, read_txn->geo.end_pgno);
|
||||
const size_t used_size = pgno2bytes(env, read_txn->geo.first_unallocated);
|
||||
jitter4testing(false);
|
||||
|
||||
if (dest_is_pipe)
|
||||
rc = osal_write(fd, buffer, meta_bytes);
|
||||
|
||||
uint8_t *const data_buffer =
|
||||
buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize);
|
||||
#if MDBX_USE_COPYFILERANGE
|
||||
static bool copyfilerange_unavailable;
|
||||
bool not_the_same_filesystem = false;
|
||||
struct statfs statfs_info;
|
||||
if (fstatfs(fd, &statfs_info) ||
|
||||
statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f)
|
||||
/* avoid copy_file_range() on ecryptfs due to its bugs, without marking the syscall as globally unavailable */
|
||||
not_the_same_filesystem = true;
|
||||
#endif /* MDBX_USE_COPYFILERANGE */
|
||||
for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) {
|
||||
#if MDBX_USE_SENDFILE
|
||||
static bool sendfile_unavailable;
|
||||
if (dest_is_pipe && likely(!sendfile_unavailable)) {
|
||||
off_t in_offset = offset;
|
||||
const ssize_t written =
|
||||
sendfile(fd, env->lazy_fd, &in_offset, used_size - offset);
|
||||
if (likely(written > 0)) {
|
||||
offset = in_offset;
|
||||
continue;
|
||||
}
|
||||
rc = MDBX_ENODATA;
|
||||
if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE)
|
||||
break;
|
||||
sendfile_unavailable = true;
|
||||
}
|
||||
#endif /* MDBX_USE_SENDFILE */
|
||||
|
||||
#if MDBX_USE_COPYFILERANGE
|
||||
if (!dest_is_pipe && !not_the_same_filesystem &&
|
||||
likely(!copyfilerange_unavailable)) {
|
||||
off_t in_offset = offset, out_offset = offset;
|
||||
ssize_t bytes_copied = copy_file_range(
|
||||
env->lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
|
||||
if (likely(bytes_copied > 0)) {
|
||||
offset = in_offset;
|
||||
continue;
|
||||
}
|
||||
rc = MDBX_ENODATA;
|
||||
if (bytes_copied == 0)
|
||||
break;
|
||||
rc = errno;
|
||||
if (rc == EXDEV || rc == EINVAL /* workaround for ecryptfs bug(s),
                                   maybe useful for other FSes too */)
|
||||
not_the_same_filesystem = true;
|
||||
else if (ignore_enosys(rc) == MDBX_RESULT_TRUE)
|
||||
copyfilerange_unavailable = true;
|
||||
else
|
||||
break;
|
||||
}
|
||||
#endif /* MDBX_USE_COPYFILERANGE */
|
||||
|
||||
/* fallback to portable */
|
||||
const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset)
|
||||
? (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||
: used_size - offset;
|
||||
/* copy through a bounce buffer to avoid EFAULT if the source pages are swapped out */
|
||||
memcpy(data_buffer, ptr_disp(env->dxb_mmap.base, offset), chunk);
|
||||
rc = osal_write(fd, data_buffer, chunk);
|
||||
offset += chunk;
|
||||
}
|
||||
|
||||
/* Extend file if required */
|
||||
if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
|
||||
if (!dest_is_pipe)
|
||||
rc = osal_ftruncate(fd, whole_size);
|
||||
else {
|
||||
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
|
||||
for (size_t offset = used_size;
|
||||
rc == MDBX_SUCCESS && offset < whole_size;) {
|
||||
const size_t chunk =
|
||||
((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
|
||||
? (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||
: whole_size - offset;
|
||||
rc = osal_write(fd, data_buffer, chunk);
|
||||
offset += chunk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
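/* Editor's note (illustrative sketch, not part of this diff): both fast paths
 * in copy_asis() follow the same shape: try the syscall for the next chunk,
 * remember a sticky "unavailable" verdict on ENOSYS-like failures, and fall
 * back to the portable buffered copy. A stripped-down model of that control
 * flow, with hypothetical callback names: */
#include <stdbool.h>

static int copy_one_chunk(int (*try_fastpath)(void),
                          int (*portable_copy)(void),
                          bool *fastpath_unavailable) {
  if (!*fastpath_unavailable) {
    if (try_fastpath() == 0)
      return 0;                     /* fast path handled this chunk */
    *fastpath_unavailable = true;   /* e.g. ENOSYS: stop trying it again */
  }
  return portable_copy();           /* memcpy + write() fallback */
}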
|
||||
|
||||
__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
|
||||
MDBX_copy_flags_t flags) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
const int dest_is_pipe = osal_is_pipe(fd);
|
||||
if (MDBX_IS_ERROR(dest_is_pipe))
|
||||
return dest_is_pipe;
|
||||
|
||||
if (!dest_is_pipe) {
|
||||
rc = osal_fseek(fd, 0);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
const size_t buffer_size =
|
||||
pgno_align2os_bytes(env, NUM_METAS) +
|
||||
ceil_powerof2(((flags & MDBX_CP_COMPACT)
|
||||
? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||
: (size_t)MDBX_ENVCOPY_WRITEBUF),
|
||||
globals.sys_pagesize);
|
||||
|
||||
uint8_t *buffer = nullptr;
|
||||
rc = osal_memalign_alloc(globals.sys_pagesize, buffer_size, (void **)&buffer);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
MDBX_txn *read_txn = nullptr;
|
||||
/* Do the lock/unlock of the reader mutex before starting the
|
||||
* write txn. Otherwise other read txns could block writers. */
|
||||
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &read_txn);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
osal_memalign_free(buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (!dest_is_pipe) {
|
||||
/* First write a stub into the meta-pages,
 * so that an incomplete copy can never be mistaken for a valid database. */
|
||||
memset(buffer, -1, pgno2bytes(env, NUM_METAS));
|
||||
rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS));
|
||||
}
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
memset(buffer, 0, pgno2bytes(env, NUM_METAS));
|
||||
rc = ((flags & MDBX_CP_COMPACT) ? copy_with_compacting : copy_asis)(
|
||||
env, read_txn, fd, buffer, dest_is_pipe, flags);
|
||||
}
|
||||
mdbx_txn_abort(read_txn);
|
||||
|
||||
if (!dest_is_pipe) {
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
|
||||
|
||||
/* Write actual meta */
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
}
|
||||
|
||||
osal_memalign_free(buffer);
|
||||
return rc;
|
||||
}
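/* Editor's note (usage illustration, not part of this diff): a typical hot
 * backup goes through the public API and ends up in the code above; the
 * destination path "./backup.mdbx" here is just an example. */
#include "mdbx.h"
#include <stdio.h>

static int backup_example(MDBX_env *env) {
  int rc = mdbx_env_copy(env, "./backup.mdbx",
                         MDBX_CP_COMPACT | MDBX_CP_FORCE_DYNAMIC_SIZE);
  if (rc != MDBX_SUCCESS)
    fprintf(stderr, "mdbx_env_copy: %s\n", mdbx_strerror(rc));
  return rc;
}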
|
||||
|
||||
__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path,
|
||||
MDBX_copy_flags_t flags) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
wchar_t *dest_pathW = nullptr;
|
||||
int rc = osal_mb2w(dest_path, &dest_pathW);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
rc = mdbx_env_copyW(env, dest_pathW, flags);
|
||||
osal_free(dest_pathW);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path,
|
||||
MDBX_copy_flags_t flags) {
|
||||
#endif /* Windows */
|
||||
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!dest_path))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
/* The destination path must exist, but the destination file must not.
|
||||
* We don't want the OS to cache the writes, since the source data is
|
||||
* already in the OS cache. */
|
||||
mdbx_filehandle_t newfd;
|
||||
rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd,
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
(mdbx_mode_t)-1
|
||||
#else
|
||||
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
|
||||
#endif
|
||||
);
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
/* no locking required since the file is opened with ShareMode == 0 */
|
||||
#else
|
||||
if (rc == MDBX_SUCCESS) {
|
||||
MDBX_STRUCT_FLOCK lock_op;
|
||||
memset(&lock_op, 0, sizeof(lock_op));
|
||||
lock_op.l_type = F_WRLCK;
|
||||
lock_op.l_whence = SEEK_SET;
|
||||
lock_op.l_start = 0;
|
||||
lock_op.l_len = OFF_T_MAX;
|
||||
if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op)
|
||||
#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \
|
||||
(!defined(__ANDROID_API__) || __ANDROID_API__ >= 24)
|
||||
|| flock(newfd, LOCK_EX | LOCK_NB)
|
||||
#endif /* Linux */
|
||||
)
|
||||
rc = errno;
|
||||
}
|
||||
#endif /* Windows / POSIX */
|
||||
|
||||
if (rc == MDBX_SUCCESS)
|
||||
rc = mdbx_env_copy2fd(env, newfd, flags);
|
||||
|
||||
if (newfd != INVALID_HANDLE_VALUE) {
|
||||
int err = osal_closefile(newfd);
|
||||
if (rc == MDBX_SUCCESS && err != rc)
|
||||
rc = err;
|
||||
if (rc != MDBX_SUCCESS)
|
||||
(void)osal_removefile(dest_path);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
29506
src/core.c
File diff suppressed because it is too large
2451
src/cursor.c
Normal file
File diff suppressed because it is too large
398
src/cursor.h
Normal file
@ -0,0 +1,398 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* Cursor states.
 *
 * poor:
 *  - an unset cursor with an empty stack;
 *  - must be skipped in all cursor position tracking/adjustment loops;
 *  - only operations that establish an absolute position are allowed;
 *  - everything else returns ENODATA.
 *
 * Such cursors have top = -1 and flags < 0, which makes it cheap to detect
 * and skip them in tracking/adjustment loops via the condition
 * probe_cursor->top < this_cursor->top.
 *
 * hollow:
 *  - a partially initialized cursor without a position available to the user,
 *    so no operation can be performed without absolute (rather than
 *    relative) positioning;
 *  - ki[top] may be invalid, including >= page_numkeys(pg[top]).
 *
 * Such cursors have top >= 0 but flags < 0 (the z_hollow flag is set).
 *
 * pointed:
 *  - a fully initialized cursor with a concrete position holding data;
 *  - the current row can be read or deleted, and relative movement
 *    is possible;
 *  - may carry the z_after_delete, z_eof_hard and z_eof_soft flags;
 *  - the presence of z_eof_soft means the cursor has been moved past the end
 *    of the data, so the current data can be neither read nor deleted.
 *
 * Such cursors have top >= 0 and flags >= 0 (no z_hollow flag).
 *
 * filled:
 *  - a pointed cursor without the z_eof_soft flag;
 *  - there is data under the cursor, so CRUD operations are possible
 *    at the current position.
 *
 * Such cursors have top >= 0 and (unsigned)flags < z_eof_soft.
 *
 * State transitions.
 *
 *  - A cursor is reset via top_and_flags |= z_poor_mark,
 *    which is equivalent to top = -1 together with flags |= z_poor_mark;
 *  - When positioning a cursor, top is set first, while flags are set
 *    only at the very end, once no error has occurred.
 *  - Repeated first/last positioning may start by setting/zeroing only top
 *    without resetting flags, which enables the fast path inside
 *    tree_search_finalize().
 *
 *  - Subtleties around the end of data:
 *    - mdbx_cursor_get(NEXT) performs two operations (a move and a read),
 *      so moving onto the last row always succeeds, and an error is returned
 *      only by the following next().
 *      However, because of this duality the semantics of an error returned
 *      by mdbx_cursor_get(NEXT) is ambiguous, since it is unclear what the
 *      error refers to:
 *      - If it refers to reading the data, then the cursor has moved and now
 *        stands past the last row. Accordingly, reading at the current
 *        position is forbidden, and prev() brings the cursor back onto
 *        the last row;
 *      - If the error refers to the move, then the cursor has not moved and
 *        stays on the last row. Accordingly, reading at the current position
 *        is allowed, and prev() puts the cursor onto the second-to-last row.
 *      - The tricky part is that users rely (one way or another) on both
 *        behaviors, while of course expecting mdbx_cursor_eof() to return
 *        true after an MDBX_NEXT error.
 *      - A similar situation arises with MDBX_GET_RANGE, MDBX_LOWERBOUND,
 *        MDBX_GET_BOTH_RANGE and MDBX_UPPERBOUND. Here, on an unsuccessful
 *        search the cursor may/should stand past the last row.
 *      - Then there is MDBX_LAST. Here the cursor must stand on the last row
 *        and allow reading at the current position,
 *        yet mdbx_cursor_eof() must return true.
 *
 *    The solution is to use two flags, z_eof_soft and z_eof_hard:
 *    - When only z_eof_soft is set, mdbx_cursor_eof() returns true, but
 *      reading data at the current position is allowed, and prev() moves
 *      the cursor onto the second-to-last row.
 *    - When z_eof_hard is set, reading data at the current position is not
 *      allowed, mdbx_cursor_eof() also returns true, and prev() positions
 *      the cursor onto the last row. */
|
||||
enum cursor_state {
  /* This is a nested cursor for a nested tree/page and is the inner
     element of struct cursor_couple. */
  z_inner = 0x01,

  /* A GC update is being prepared, so pages may be taken from the GC
     even for FREE_DBI. */
  z_gcu_preparation = 0x02,

  /* The cursor has just been created, so auto-positioning to the
     beginning/end is allowed instead of returning an error. */
  z_fresh = 0x04,

  /* The previous operation was a deletion, so the cursor already physically
     points to the next element and the corresponding move operation must
     be ignored. */
  z_after_delete = 0x08,

  /* */
  z_disable_tree_search_fastpath = 0x10,

  /* The cursor is logically at the end of the data but physically on the last
   * row, ki[top] == page_numkeys(pg[top]) - 1, and the data at the current
   * position can still be read. */
  z_eof_soft = 0x20,

  /* The cursor is logically past the end of the data, so the next "backward"
     move must be ignored and/or result in positioning onto the last row.
     In the current state no CRUD operations are possible. */
  z_eof_hard = 0x40,

  /* There is no data under the cursor and its position is logically
     undefined; no CRUD operations are possible at the current position.
     Relative movement is forbidden. */
  z_hollow = -128 /* 0x80 */,

  /* Masks for resetting/setting the state. */
  z_clear_mask = z_inner | z_gcu_preparation,
  z_poor_mark = z_eof_hard | z_hollow | z_disable_tree_search_fastpath,
  z_fresh_mark = z_poor_mark | z_fresh
};
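/* Editor's note (illustrative sketch, assuming only the fields described in
 * the state comment above): the four documented states can be told apart from
 * top and flags alone, which is exactly what the predicates below rely on. */
static inline const char *cursor_state_name(int top, int flags) {
  if (top < 0)
    return "poor";   /* unset cursor, empty stack */
  if (flags < 0)
    return "hollow"; /* z_hollow occupies the sign bit */
  /* filled is the subset of pointed without the soft-EOF mark */
  return ((unsigned)flags < z_eof_soft) ? "filled" : "pointed (at eof)";
}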
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_inner(const MDBX_cursor *mc) {
|
||||
return (mc->flags & z_inner) != 0;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_poor(const MDBX_cursor *mc) {
|
||||
const bool r = mc->top < 0;
|
||||
cASSERT(mc, r == (mc->top_and_flags < 0));
|
||||
if (r && mc->subcur)
|
||||
cASSERT(mc, mc->subcur->cursor.flags < 0 && mc->subcur->cursor.top < 0);
|
||||
return r;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_pointed(const MDBX_cursor *mc) {
|
||||
const bool r = mc->top >= 0;
|
||||
cASSERT(mc, r == (mc->top_and_flags >= 0));
|
||||
if (!r && mc->subcur)
|
||||
cASSERT(mc, is_poor(&mc->subcur->cursor));
|
||||
return r;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_hollow(const MDBX_cursor *mc) {
|
||||
const bool r = mc->flags < 0;
|
||||
if (!r) {
|
||||
cASSERT(mc, mc->top >= 0);
|
||||
cASSERT(mc, (mc->flags & z_eof_hard) ||
|
||||
mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
|
||||
} else if (mc->subcur)
|
||||
cASSERT(mc, is_poor(&mc->subcur->cursor));
|
||||
return r;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_eof(const MDBX_cursor *mc) {
|
||||
const bool r = z_eof_soft <= (uint8_t)mc->flags;
|
||||
return r;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_filled(const MDBX_cursor *mc) {
|
||||
const bool r = z_eof_hard > (uint8_t)mc->flags;
|
||||
return r;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
inner_filled(const MDBX_cursor *mc) {
|
||||
return mc->subcur && is_filled(&mc->subcur->cursor);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
inner_pointed(const MDBX_cursor *mc) {
|
||||
return mc->subcur && is_pointed(&mc->subcur->cursor);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
inner_hollow(const MDBX_cursor *mc) {
|
||||
return !mc->subcur || is_hollow(&mc->subcur->cursor);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void inner_gone(MDBX_cursor *mc) {
|
||||
if (mc->subcur) {
|
||||
TRACE("reset inner cursor %p",
|
||||
__Wpedantic_format_voidptr(&mc->subcur->cursor));
|
||||
mc->subcur->nested_tree.root = 0;
|
||||
mc->subcur->cursor.top_and_flags = z_inner | z_poor_mark;
|
||||
}
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void be_poor(MDBX_cursor *mc) {
|
||||
const bool inner = is_inner(mc);
|
||||
if (inner) {
|
||||
mc->tree->root = 0;
|
||||
mc->top_and_flags = z_inner | z_poor_mark;
|
||||
} else {
|
||||
mc->top_and_flags |= z_poor_mark;
|
||||
inner_gone(mc);
|
||||
}
|
||||
cASSERT(mc, is_poor(mc) && !is_pointed(mc) && !is_filled(mc));
|
||||
cASSERT(mc, inner == is_inner(mc));
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void be_filled(MDBX_cursor *mc) {
|
||||
cASSERT(mc, mc->top >= 0);
|
||||
cASSERT(mc, mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
|
||||
const bool inner = is_inner(mc);
|
||||
mc->flags &= z_clear_mask;
|
||||
cASSERT(mc, is_filled(mc));
|
||||
cASSERT(mc, inner == is_inner(mc));
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline bool is_related(const MDBX_cursor *base,
|
||||
const MDBX_cursor *scan) {
|
||||
cASSERT(base, base->top >= 0);
|
||||
return base->top <= scan->top && base != scan;
|
||||
}
|
||||
|
||||
/* Cursor checking/validation flags. */
|
||||
enum cursor_checking {
|
||||
z_branch = 0x01 /* same as P_BRANCH for check_leaf_type() */,
|
||||
z_leaf = 0x02 /* same as P_LEAF for check_leaf_type() */,
|
||||
z_largepage = 0x04 /* same as P_LARGE for check_leaf_type() */,
|
||||
z_updating = 0x08 /* update/rebalance pending */,
|
||||
z_ignord = 0x10 /* don't check keys ordering */,
|
||||
z_dupfix = 0x20 /* same as P_DUPFIX for check_leaf_type() */,
|
||||
z_retiring = 0x40 /* refs to child pages may be invalid */,
|
||||
z_pagecheck = 0x80 /* perform page checking, see MDBX_VALIDATION */
|
||||
};
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_check(const MDBX_cursor *mc);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
cursor_dbi(const MDBX_cursor *mc) {
|
||||
cASSERT(mc, mc->txn && mc->txn->signature == txn_signature);
|
||||
size_t dbi = mc->dbi_state - mc->txn->dbi_state;
|
||||
cASSERT(mc, dbi < mc->txn->env->n_dbi);
|
||||
return dbi;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
cursor_dbi_changed(const MDBX_cursor *mc) {
|
||||
return dbi_changed(mc->txn, cursor_dbi(mc));
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t *
|
||||
cursor_dbi_state(const MDBX_cursor *mc) {
|
||||
return mc->dbi_state;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
cursor_is_gc(const MDBX_cursor *mc) {
|
||||
return mc->dbi_state == mc->txn->dbi_state + FREE_DBI;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
cursor_is_main(const MDBX_cursor *mc) {
|
||||
return mc->dbi_state == mc->txn->dbi_state + MAIN_DBI;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
cursor_is_core(const MDBX_cursor *mc) {
|
||||
return mc->dbi_state < mc->txn->dbi_state + CORE_DBS;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline int cursor_dbi_dbg(const MDBX_cursor *mc) {
|
||||
/* Debugging output value of a cursor's DBI: Negative for a sub-cursor. */
|
||||
const int dbi = cursor_dbi(mc);
|
||||
return (mc->flags & z_inner) ? -dbi : dbi;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline int __must_check_result
|
||||
cursor_push(MDBX_cursor *mc, page_t *mp, indx_t ki) {
|
||||
TRACE("pushing page %" PRIaPGNO " on db %d cursor %p", mp->pgno,
|
||||
cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc));
|
||||
if (unlikely(mc->top >= CURSOR_STACK_SIZE - 1)) {
|
||||
be_poor(mc);
|
||||
mc->txn->flags |= MDBX_TXN_ERROR;
|
||||
return MDBX_CURSOR_FULL;
|
||||
}
|
||||
mc->top += 1;
|
||||
mc->pg[mc->top] = mp;
|
||||
mc->ki[mc->top] = ki;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void cursor_pop(MDBX_cursor *mc) {
|
||||
TRACE("popped page %" PRIaPGNO " off db %d cursor %p", mc->pg[mc->top]->pgno,
|
||||
cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc));
|
||||
cASSERT(mc, mc->top >= 0);
|
||||
mc->top -= 1;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
check_leaf_type(const MDBX_cursor *mc, const page_t *mp) {
|
||||
return (((page_type(mp) ^ mc->checking) &
|
||||
(z_branch | z_leaf | z_largepage | z_dupfix)) == 0);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void cursor_eot(MDBX_cursor *mc, const bool merge);
|
||||
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *parent_cursor,
|
||||
MDBX_txn *nested_txn, const size_t dbi);
|
||||
|
||||
MDBX_INTERNAL MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc,
|
||||
MDBX_cursor *cdst);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_ops(MDBX_cursor *mc, MDBX_val *key,
|
||||
MDBX_val *data,
|
||||
const MDBX_cursor_op op);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_put_checklen(MDBX_cursor *mc,
|
||||
const MDBX_val *key,
|
||||
MDBX_val *data,
|
||||
unsigned flags);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_put(MDBX_cursor *mc,
|
||||
const MDBX_val *key,
|
||||
MDBX_val *data,
|
||||
unsigned flags);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_check_updating(MDBX_cursor *mc);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_del(MDBX_cursor *mc,
|
||||
unsigned flags);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_sibling_left(MDBX_cursor *mc);
|
||||
MDBX_INTERNAL int __must_check_result cursor_sibling_right(MDBX_cursor *mc);
|
||||
|
||||
typedef struct cursor_set_result {
|
||||
int err;
|
||||
bool exact;
|
||||
} csr_t;
|
||||
|
||||
MDBX_INTERNAL csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
|
||||
MDBX_cursor_op op);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result inner_first(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict data);
|
||||
MDBX_INTERNAL int __must_check_result inner_last(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict data);
|
||||
MDBX_INTERNAL int __must_check_result outer_first(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict key,
|
||||
MDBX_val *__restrict data);
|
||||
MDBX_INTERNAL int __must_check_result outer_last(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict key,
|
||||
MDBX_val *__restrict data);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result inner_next(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict data);
|
||||
MDBX_INTERNAL int __must_check_result inner_prev(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict data);
|
||||
MDBX_INTERNAL int __must_check_result outer_next(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict key,
|
||||
MDBX_val *__restrict data,
|
||||
MDBX_cursor_op op);
|
||||
MDBX_INTERNAL int __must_check_result outer_prev(MDBX_cursor *__restrict mc,
|
||||
MDBX_val *__restrict key,
|
||||
MDBX_val *__restrict data,
|
||||
MDBX_cursor_op op);
|
||||
|
||||
MDBX_INTERNAL int cursor_init4walk(cursor_couple_t *couple,
|
||||
const MDBX_txn *const txn,
|
||||
tree_t *const tree, kvx_t *const kvx);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_init(MDBX_cursor *mc,
|
||||
const MDBX_txn *txn,
|
||||
size_t dbi);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_dupsort_setup(MDBX_cursor *mc,
|
||||
const node_t *node,
|
||||
const page_t *mp);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result cursor_touch(MDBX_cursor *const mc,
|
||||
const MDBX_val *key,
|
||||
const MDBX_val *data);
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
/* Update sub-page pointer, if any, in mc->subcur.
|
||||
* Needed when the node which contains the sub-page may have moved.
|
||||
* Called with mp = mc->pg[mc->top], ki = mc->ki[mc->top]. */
|
||||
MDBX_MAYBE_UNUSED static inline void
|
||||
cursor_inner_refresh(const MDBX_cursor *mc, const page_t *mp, unsigned ki) {
|
||||
cASSERT(mc, is_leaf(mp));
|
||||
const node_t *node = page_node(mp, ki);
|
||||
if ((node_flags(node) & (N_DUPDATA | N_SUBDATA)) == N_DUPDATA)
|
||||
mc->subcur->cursor.pg[0] = node_data(node);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool cursor_is_tracked(const MDBX_cursor *mc);
|
954
src/dbi.c
Normal file
@ -0,0 +1,954 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
size_t dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi) {
|
||||
tASSERT(txn, bmi > 0);
|
||||
bmi &= -bmi;
|
||||
if (sizeof(txn->dbi_sparse[0]) > 4) {
|
||||
static const uint8_t debruijn_ctz64[64] = {
|
||||
0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
|
||||
62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
|
||||
63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
|
||||
51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12};
|
||||
return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58];
|
||||
} else {
|
||||
static const uint8_t debruijn_ctz32[32] = {
|
||||
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
|
||||
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
|
||||
return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27];
|
||||
}
|
||||
}
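/* Editor's note (self-check illustration, not part of this diff): after
 * bmi &= -bmi only the lowest set bit survives, so it is a power of two;
 * multiplying by the De Bruijn constant and keeping the top bits yields a
 * unique index into the lookup table. The 32-bit variant can be verified
 * exhaustively: */
#include <assert.h>
#include <stdint.h>

static void debruijn_ctz32_selfcheck(void) {
  static const uint8_t tab[32] = {
      0,  1,  28, 2,  29, 14, 24, 3,  30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7,  26, 12, 18, 6,  11, 5,  10, 9};
  for (unsigned i = 0; i < 32; ++i) {
    const uint32_t isolated = UINT32_C(1) << i; /* already a single bit */
    assert(tab[(UINT32_C(0x077CB531) * isolated) >> 27] == i);
  }
}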
|
||||
|
||||
struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) {
|
||||
eASSERT(env, dbi < env->n_dbi);
|
||||
struct dbi_snap_result r;
|
||||
uint32_t snap = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
|
||||
do {
|
||||
r.sequence = snap;
|
||||
r.flags = env->dbs_flags[dbi];
|
||||
snap = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
|
||||
} while (unlikely(snap != r.sequence));
|
||||
return r;
|
||||
}
|
||||
|
||||
__noinline int dbi_import(MDBX_txn *txn, const size_t dbi) {
|
||||
const MDBX_env *const env = txn->env;
|
||||
if (dbi >= env->n_dbi || !env->dbs_flags[dbi])
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
|
||||
const size_t bitmap_indx = dbi / bitmap_chunk;
|
||||
const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk;
|
||||
if (dbi >= txn->n_dbi) {
|
||||
for (size_t i = (txn->n_dbi + bitmap_chunk - 1) / bitmap_chunk;
|
||||
bitmap_indx >= i; ++i)
|
||||
txn->dbi_sparse[i] = 0;
|
||||
eASSERT(env, (txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0);
|
||||
MDBX_txn *scan = txn;
|
||||
do {
|
||||
eASSERT(env, scan->dbi_sparse == txn->dbi_sparse);
|
||||
eASSERT(env, scan->n_dbi < dbi + 1);
|
||||
scan->n_dbi = (unsigned)dbi + 1;
|
||||
scan->dbi_state[dbi] = 0;
|
||||
scan = scan->parent;
|
||||
} while (scan /* && scan->dbi_sparse == txn->dbi_sparse */);
|
||||
txn->dbi_sparse[bitmap_indx] |= bitmap_mask;
|
||||
goto lindo;
|
||||
}
|
||||
if ((txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0) {
|
||||
MDBX_txn *scan = txn;
|
||||
do {
|
||||
eASSERT(env, scan->dbi_sparse == txn->dbi_sparse);
|
||||
eASSERT(env, scan->n_dbi == txn->n_dbi);
|
||||
scan->dbi_state[dbi] = 0;
|
||||
scan = scan->parent;
|
||||
} while (scan /* && scan->dbi_sparse == txn->dbi_sparse */);
|
||||
txn->dbi_sparse[bitmap_indx] |= bitmap_mask;
|
||||
goto lindo;
|
||||
}
|
||||
#else
|
||||
if (dbi >= txn->n_dbi) {
|
||||
size_t i = txn->n_dbi;
|
||||
do
|
||||
txn->dbi_state[i] = 0;
|
||||
while (dbi >= ++i);
|
||||
txn->n_dbi = i;
|
||||
goto lindo;
|
||||
}
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
|
||||
if (!txn->dbi_state[dbi]) {
|
||||
lindo:
|
||||
/* the dbi slot has not been initialized in this transaction yet, and the handle has not been used */
|
||||
txn->cursors[dbi] = nullptr;
|
||||
MDBX_txn *const parent = txn->parent;
|
||||
if (parent) {
|
||||
/* a nested write transaction */
|
||||
int rc = dbi_check(parent, dbi);
|
||||
/* copy the subDB state, clearing the new-flags. */
|
||||
eASSERT(env, txn->dbi_seqs == parent->dbi_seqs);
|
||||
txn->dbi_state[dbi] =
|
||||
parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
txn->dbs[dbi] = parent->dbs[dbi];
|
||||
if (parent->cursors[dbi]) {
|
||||
rc = cursor_shadow(parent->cursors[dbi], txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
/* failed to back up the cursors */
|
||||
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE;
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
txn->dbi_seqs[dbi] = 0;
|
||||
txn->dbi_state[dbi] = DBI_LINDO;
|
||||
} else {
|
||||
eASSERT(env, txn->dbi_seqs[dbi] != env->dbi_seqs[dbi].weak);
|
||||
if (unlikely((txn->dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) ||
|
||||
txn->cursors[dbi])) {
|
||||
/* the handle has already been used in this transaction but was closed or
 * reopened, or there are dangling cursors while the handle is being
 * explicitly reopened */
|
||||
eASSERT(env, (txn->dbi_state[dbi] & DBI_STALE) == 0);
|
||||
txn->dbi_seqs[dbi] = env->dbi_seqs[dbi].weak;
|
||||
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO;
|
||||
return txn->cursors[dbi] ? MDBX_DANGLING_DBI : MDBX_BAD_DBI;
|
||||
}
|
||||
}
|
||||
|
||||
/* the handle has not been used in this transaction, or it is being explicitly
 * reopened while there are no dangling cursors */
|
||||
eASSERT(env, (txn->dbi_state[dbi] & DBI_LINDO) && !txn->cursors[dbi]);
|
||||
|
||||
/* read the current flags and sequence */
|
||||
struct dbi_snap_result snap = dbi_snap(env, dbi);
|
||||
txn->dbi_seqs[dbi] = snap.sequence;
|
||||
if (snap.flags & DB_VALID) {
|
||||
txn->dbs[dbi].flags = snap.flags & DB_PERSISTENT_FLAGS;
|
||||
txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
return MDBX_BAD_DBI;
|
||||
}
|
||||
|
||||
static int defer_and_release(MDBX_env *const env,
|
||||
defer_free_item_t *const chain) {
|
||||
size_t length = 0;
|
||||
defer_free_item_t *obsolete_chain = nullptr;
|
||||
#if MDBX_ENABLE_DBI_LOCKFREE
|
||||
const uint64_t now = osal_monotime();
|
||||
defer_free_item_t **scan = &env->defer_free;
|
||||
if (env->defer_free) {
|
||||
const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536);
|
||||
do {
|
||||
defer_free_item_t *item = *scan;
|
||||
if (now - item->timestamp < threshold_1second) {
|
||||
scan = &item->next;
|
||||
length += 1;
|
||||
} else {
|
||||
*scan = item->next;
|
||||
item->next = obsolete_chain;
|
||||
obsolete_chain = item;
|
||||
}
|
||||
} while (*scan);
|
||||
}
|
||||
|
||||
eASSERT(env, *scan == nullptr);
|
||||
if (chain) {
|
||||
defer_free_item_t *item = chain;
|
||||
do {
|
||||
item->timestamp = now;
|
||||
item = item->next;
|
||||
} while (item);
|
||||
*scan = chain;
|
||||
}
|
||||
#else /* MDBX_ENABLE_DBI_LOCKFREE */
|
||||
obsolete_chain = chain;
|
||||
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
|
||||
|
||||
ENSURE(env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
|
||||
if (length > 42) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
SwitchToThread();
|
||||
#else
|
||||
sched_yield();
|
||||
#endif /* Windows */
|
||||
}
|
||||
while (obsolete_chain) {
|
||||
defer_free_item_t *item = obsolete_chain;
|
||||
obsolete_chain = obsolete_chain->next;
|
||||
osal_free(item);
|
||||
}
|
||||
return chain ? MDBX_SUCCESS : MDBX_BAD_DBI;
|
||||
}
|
||||
|
||||
/* Export or close DBI handles opened in this txn. */
|
||||
int dbi_update(MDBX_txn *txn, int keep) {
|
||||
MDBX_env *const env = txn->env;
|
||||
tASSERT(txn, !txn->parent && txn == env->basal_txn);
|
||||
bool locked = false;
|
||||
defer_free_item_t *defer_chain = nullptr;
|
||||
TXN_FOREACH_DBI_USER(txn, dbi) {
|
||||
if (likely((txn->dbi_state[dbi] & DBI_CREAT) == 0))
|
||||
continue;
|
||||
if (!locked) {
|
||||
int err = osal_fastmutex_acquire(&env->dbi_lock);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
locked = true;
|
||||
if (dbi >= env->n_dbi)
|
||||
/* the handle was closed from another thread while we were acquiring the lock */
|
||||
continue;
|
||||
}
|
||||
tASSERT(txn, dbi < env->n_dbi);
|
||||
if (keep) {
|
||||
env->dbs_flags[dbi] = txn->dbs[dbi].flags | DB_VALID;
|
||||
} else {
|
||||
uint32_t seq = dbi_seq_next(env, dbi);
|
||||
defer_free_item_t *item = env->kvs[dbi].name.iov_base;
|
||||
if (item) {
|
||||
env->dbs_flags[dbi] = 0;
|
||||
env->kvs[dbi].name.iov_len = 0;
|
||||
env->kvs[dbi].name.iov_base = nullptr;
|
||||
atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
item->next = defer_chain;
|
||||
defer_chain = item;
|
||||
} else {
|
||||
eASSERT(env, env->kvs[dbi].name.iov_len == 0);
|
||||
eASSERT(env, env->dbs_flags[dbi] == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (locked) {
|
||||
size_t i = env->n_dbi;
|
||||
while ((env->dbs_flags[i - 1] & DB_VALID) == 0) {
|
||||
--i;
|
||||
eASSERT(env, i >= CORE_DBS);
|
||||
eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len &&
|
||||
!env->kvs[i].name.iov_base);
|
||||
}
|
||||
env->n_dbi = (unsigned)i;
|
||||
defer_and_release(env, defer_chain);
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags,
|
||||
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
|
||||
const MDBX_env *const env = txn->env;
|
||||
eASSERT(env, dbi < txn->n_dbi && dbi < env->n_dbi);
|
||||
eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO);
|
||||
eASSERT(env, env->dbs_flags[dbi] != DB_POISON);
|
||||
if ((env->dbs_flags[dbi] & DB_VALID) == 0) {
|
||||
eASSERT(env, !env->kvs[dbi].clc.k.cmp && !env->kvs[dbi].clc.v.cmp &&
|
||||
!env->kvs[dbi].name.iov_len &&
|
||||
!env->kvs[dbi].name.iov_base &&
|
||||
!env->kvs[dbi].clc.k.lmax && !env->kvs[dbi].clc.k.lmin &&
|
||||
!env->kvs[dbi].clc.v.lmax && !env->kvs[dbi].clc.v.lmin);
|
||||
} else {
|
||||
eASSERT(env, !(txn->dbi_state[dbi] & DBI_VALID) ||
|
||||
(txn->dbs[dbi].flags | DB_VALID) == env->dbs_flags[dbi]);
|
||||
eASSERT(env, env->kvs[dbi].name.iov_base || dbi < CORE_DBS);
|
||||
}
|
||||
|
||||
/* If the dbi has already been used, four variants are considered valid:
 * 1) user_flags equal MDBX_DB_ACCEDE
 *   = we assume the user is opening an existing subDb, and the validation
 *     code will not allow different comparators to be installed.
 * 2) user_flags are zero, and both comparators are empty/NULL or equal
 *    to the current ones
 *   = we assume the user is opening an existing subDb the old way,
 *     with zero flags, i.e. with the defaults.
 * 3) user_flags match, and the comparators are not given or are the same
 *   = we assume the user is opening the subDb specifying all parameters;
 * 4) user_flags differ, but the subDb is empty and MDBX_CREATE is given
 *   = we assume the user is re-creating the subDb;
 */
|
||||
if ((user_flags & ~MDBX_CREATE) !=
|
||||
(unsigned)(env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS)) {
|
||||
/* the flags differ, check the other conditions */
|
||||
if ((!user_flags && (!keycmp || keycmp == env->kvs[dbi].clc.k.cmp) &&
|
||||
(!datacmp || datacmp == env->kvs[dbi].clc.v.cmp)) ||
|
||||
user_flags == MDBX_DB_ACCEDE) {
|
||||
user_flags = env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS;
|
||||
} else if ((user_flags & MDBX_CREATE) == 0)
|
||||
return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
|
||||
else {
|
||||
eASSERT(env, env->dbs_flags[dbi] & DB_VALID);
|
||||
if (txn->dbi_state[dbi] & DBI_STALE) {
|
||||
int err = sdb_fetch(txn, dbi);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
}
|
||||
eASSERT(env,
|
||||
(txn->dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) ==
|
||||
(DBI_LINDO | DBI_VALID));
|
||||
if (unlikely(txn->dbs[dbi].leaf_pages))
|
||||
return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
|
||||
|
||||
/* Re-create the subDB since it is empty */
|
||||
if (unlikely(txn->cursors[dbi]))
|
||||
return MDBX_DANGLING_DBI;
|
||||
env->dbs_flags[dbi] = DB_POISON;
|
||||
atomic_store32(&env->dbi_seqs[dbi], dbi_seq_next(env, MAIN_DBI),
|
||||
mo_AcquireRelease);
|
||||
|
||||
const uint32_t seq = dbi_seq_next(env, dbi);
|
||||
const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS;
|
||||
eASSERT(env, txn->dbs[dbi].height == 0 && txn->dbs[dbi].items == 0 &&
|
||||
txn->dbs[dbi].root == P_INVALID);
|
||||
env->kvs[dbi].clc.k.cmp = keycmp ? keycmp : builtin_keycmp(user_flags);
|
||||
env->kvs[dbi].clc.v.cmp = datacmp ? datacmp : builtin_datacmp(user_flags);
|
||||
txn->dbs[dbi].flags = db_flags;
|
||||
txn->dbs[dbi].dupfix_size = 0;
|
||||
if (unlikely(sdb_setup(env, &env->kvs[dbi], &txn->dbs[dbi]))) {
|
||||
txn->dbi_state[dbi] = DBI_LINDO;
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
|
||||
env->dbs_flags[dbi] = db_flags | DB_VALID;
|
||||
atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
|
||||
txn->dbi_seqs[dbi] = seq;
|
||||
txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY;
|
||||
txn->flags |= MDBX_TXN_DIRTY;
|
||||
}
|
||||
}
|
||||
|
||||
if (!keycmp)
|
||||
keycmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.k.cmp
|
||||
: builtin_keycmp(user_flags);
|
||||
if (env->kvs[dbi].clc.k.cmp != keycmp) {
|
||||
if (env->dbs_flags[dbi] & DB_VALID)
|
||||
return MDBX_EINVAL;
|
||||
env->kvs[dbi].clc.k.cmp = keycmp;
|
||||
}
|
||||
|
||||
if (!datacmp)
|
||||
datacmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.v.cmp
|
||||
: builtin_datacmp(user_flags);
|
||||
if (env->kvs[dbi].clc.v.cmp != datacmp) {
|
||||
if (env->dbs_flags[dbi] & DB_VALID)
|
||||
return MDBX_EINVAL;
|
||||
env->kvs[dbi].clc.v.cmp = datacmp;
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
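/* Editor's note (usage illustration, not part of this diff): the binding rules
 * documented above are what the public API relies on when re-opening an
 * already-created named table; "example-table" is just a placeholder name. */
#include "mdbx.h"

static int open_existing_table(MDBX_txn *txn, MDBX_dbi *dbi) {
  /* Variant 1: accept whatever persistent flags the table was created with. */
  int rc = mdbx_dbi_open(txn, "example-table", MDBX_DB_ACCEDE, dbi);
  if (rc != MDBX_SUCCESS)
    /* Variant 2: a plain open with default (zero) flags and default
     * comparators also binds to the existing table. */
    rc = mdbx_dbi_open(txn, "example-table", MDBX_DB_DEFAULTS, dbi);
  return rc;
}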
|
||||
|
||||
static inline size_t dbi_namelen(const MDBX_val name) {
|
||||
return (name.iov_len > sizeof(defer_free_item_t)) ? name.iov_len
|
||||
: sizeof(defer_free_item_t);
|
||||
}
|
||||
|
||||
static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi,
|
||||
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp,
|
||||
MDBX_val name) {
|
||||
MDBX_env *const env = txn->env;
|
||||
|
||||
/* Cannot mix named table(s) with DUPSORT flags */
|
||||
tASSERT(txn,
|
||||
(txn->dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) ==
|
||||
(DBI_LINDO | DBI_VALID));
|
||||
if (unlikely(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT)) {
|
||||
if (unlikely((user_flags & MDBX_CREATE) == 0))
|
||||
return MDBX_NOTFOUND;
|
||||
if (unlikely(txn->dbs[MAIN_DBI].leaf_pages))
|
||||
/* MainDB contains records or has already been used. */
|
||||
return MDBX_INCOMPATIBLE;
|
||||
|
||||
/* Re-create MainDB since it is empty. */
|
||||
tASSERT(txn, txn->dbs[MAIN_DBI].height == 0 &&
|
||||
txn->dbs[MAIN_DBI].items == 0 &&
|
||||
txn->dbs[MAIN_DBI].root == P_INVALID);
|
||||
if (unlikely(txn->cursors[MAIN_DBI]))
|
||||
return MDBX_DANGLING_DBI;
|
||||
env->dbs_flags[MAIN_DBI] = DB_POISON;
|
||||
atomic_store32(&env->dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI),
|
||||
mo_AcquireRelease);
|
||||
|
||||
const uint32_t seq = dbi_seq_next(env, MAIN_DBI);
|
||||
const uint16_t main_flags =
|
||||
txn->dbs[MAIN_DBI].flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
|
||||
env->kvs[MAIN_DBI].clc.k.cmp = builtin_keycmp(main_flags);
|
||||
env->kvs[MAIN_DBI].clc.v.cmp = builtin_datacmp(main_flags);
|
||||
txn->dbs[MAIN_DBI].flags = main_flags;
|
||||
txn->dbs[MAIN_DBI].dupfix_size = 0;
|
||||
int err = sdb_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
txn->dbi_state[MAIN_DBI] = DBI_LINDO;
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
env->flags |= ENV_FATAL_ERROR;
|
||||
return err;
|
||||
}
|
||||
env->dbs_flags[MAIN_DBI] = main_flags | DB_VALID;
|
||||
txn->dbi_seqs[MAIN_DBI] =
|
||||
atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease);
|
||||
txn->dbi_state[MAIN_DBI] |= DBI_DIRTY;
|
||||
txn->flags |= MDBX_TXN_DIRTY;
|
||||
}
|
||||
|
||||
tASSERT(txn, env->kvs[MAIN_DBI].clc.k.cmp);
|
||||
|
||||
/* Is the DB already open? */
|
||||
size_t slot = env->n_dbi;
|
||||
for (size_t scan = CORE_DBS; scan < env->n_dbi; ++scan) {
|
||||
if ((env->dbs_flags[scan] & DB_VALID) == 0) {
|
||||
/* Remember this free slot */
|
||||
slot = (slot < scan) ? slot : scan;
|
||||
continue;
|
||||
}
|
||||
if (!env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[scan].name)) {
|
||||
slot = scan;
|
||||
int err = dbi_check(txn, slot);
|
||||
if (err == MDBX_BAD_DBI &&
|
||||
txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) {
|
||||
/* the handle has been used and became invalid,
 * but is now being explicitly reopened in this transaction */
|
||||
eASSERT(env, !txn->cursors[slot]);
|
||||
txn->dbi_state[slot] = DBI_LINDO;
|
||||
err = dbi_check(txn, slot);
|
||||
}
|
||||
if (err == MDBX_SUCCESS) {
|
||||
err = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
|
||||
if (likely(err == MDBX_SUCCESS)) {
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Fail, if no free slot and max hit */
|
||||
if (unlikely(slot >= env->max_dbi))
|
||||
return MDBX_DBS_FULL;
|
||||
|
||||
if (env->n_dbi == slot)
|
||||
eASSERT(env, !env->dbs_flags[slot] && !env->kvs[slot].name.iov_len &&
|
||||
!env->kvs[slot].name.iov_base);
|
||||
|
||||
env->dbs_flags[slot] = DB_POISON;
|
||||
atomic_store32(&env->dbi_seqs[slot], dbi_seq_next(env, slot),
|
||||
mo_AcquireRelease);
|
||||
memset(&env->kvs[slot], 0, sizeof(env->kvs[slot]));
|
||||
if (env->n_dbi == slot)
|
||||
env->n_dbi = (unsigned)slot + 1;
|
||||
eASSERT(env, slot < env->n_dbi);
|
||||
|
||||
int err = dbi_check(txn, slot);
|
||||
eASSERT(env, err == MDBX_BAD_DBI);
|
||||
if (err != MDBX_BAD_DBI)
|
||||
return MDBX_PROBLEM;
|
||||
|
||||
/* Find the DB info */
|
||||
MDBX_val body;
|
||||
cursor_couple_t cx;
|
||||
int rc = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
rc = cursor_seek(&cx.outer, &name, &body, MDBX_SET).err;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE))
|
||||
return rc;
|
||||
} else {
|
||||
/* make sure this is actually a table */
|
||||
node_t *node =
|
||||
page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
|
||||
if (unlikely((node_flags(node) & (N_DUPDATA | N_SUBDATA)) != N_SUBDATA))
|
||||
return MDBX_INCOMPATIBLE;
|
||||
if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid subDb node size", body.iov_len);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
memcpy(&txn->dbs[slot], body.iov_base, sizeof(tree_t));
|
||||
}
|
||||
|
||||
/* Done here so we cannot fail after creating a new DB */
|
||||
void *clone = nullptr;
|
||||
if (name.iov_len) {
|
||||
clone = osal_malloc(dbi_namelen(name));
|
||||
if (unlikely(!clone))
|
||||
return MDBX_ENOMEM;
|
||||
name.iov_base = memcpy(clone, name.iov_base, name.iov_len);
|
||||
} else
|
||||
name.iov_base = "";
|
||||
|
||||
uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH;
|
||||
if (unlikely(rc)) {
|
||||
/* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */
|
||||
tASSERT(txn, rc == MDBX_NOTFOUND);
|
||||
body.iov_base = memset(&txn->dbs[slot], 0, body.iov_len = sizeof(tree_t));
|
||||
txn->dbs[slot].root = P_INVALID;
|
||||
txn->dbs[slot].mod_txnid = txn->txnid;
|
||||
txn->dbs[slot].flags = user_flags & DB_PERSISTENT_FLAGS;
|
||||
cx.outer.next = txn->cursors[MAIN_DBI];
|
||||
txn->cursors[MAIN_DBI] = &cx.outer;
|
||||
rc = cursor_put_checklen(&cx.outer, &name, &body,
|
||||
N_SUBDATA | MDBX_NOOVERWRITE);
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
dbi_state |= DBI_DIRTY | DBI_CREAT;
|
||||
txn->flags |= MDBX_TXN_DIRTY;
|
||||
tASSERT(txn, (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) != 0);
|
||||
}
|
||||
|
||||
/* Got info, register DBI in this txn */
|
||||
const uint32_t seq = dbi_seq_next(env, slot);
|
||||
eASSERT(env,
|
||||
env->dbs_flags[slot] == DB_POISON && !txn->cursors[slot] &&
|
||||
(txn->dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO);
|
||||
txn->dbi_state[slot] = dbi_state;
|
||||
memcpy(&txn->dbs[slot], body.iov_base, sizeof(txn->dbs[slot]));
|
||||
env->dbs_flags[slot] = txn->dbs[slot].flags;
|
||||
rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
env->kvs[slot].name = name;
|
||||
env->dbs_flags[slot] = txn->dbs[slot].flags | DB_VALID;
|
||||
txn->dbi_seqs[slot] =
|
||||
atomic_store32(&env->dbi_seqs[slot], seq, mo_AcquireRelease);
|
||||
|
||||
done:
|
||||
*dbi = (MDBX_dbi)slot;
|
||||
tASSERT(txn, slot < txn->n_dbi && (env->dbs_flags[slot] & DB_VALID) != 0);
|
||||
eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS);
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
bailout:
|
||||
eASSERT(env, !txn->cursors[slot] && !env->kvs[slot].name.iov_len &&
|
||||
!env->kvs[slot].name.iov_base);
|
||||
txn->dbi_state[slot] &= DBI_LINDO | DBI_OLDEN;
|
||||
env->dbs_flags[slot] = 0;
|
||||
osal_free(clone);
|
||||
if (slot + 1 == env->n_dbi)
|
||||
txn->n_dbi = env->n_dbi = (unsigned)slot;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags,
|
||||
MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
|
||||
if (unlikely(!dbi))
|
||||
return MDBX_EINVAL;
|
||||
*dbi = 0;
|
||||
|
||||
if (user_flags != MDBX_ACCEDE &&
|
||||
unlikely(!check_sdb_flags(user_flags & ~MDBX_CREATE)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if ((user_flags & MDBX_CREATE) && unlikely(txn->flags & MDBX_TXN_RDONLY))
|
||||
return MDBX_EACCESS;
|
||||
|
||||
/* main table? */
|
||||
if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) {
|
||||
rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
*dbi = MAIN_DBI;
|
||||
return rc;
|
||||
}
|
||||
if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) {
|
||||
rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
*dbi = FREE_DBI;
|
||||
return rc;
|
||||
}
|
||||
if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META))
|
||||
return MDBX_EINVAL;
|
||||
if (unlikely(name->iov_len >
|
||||
txn->env->leaf_nodemax - NODESIZE - sizeof(tree_t)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
#if MDBX_ENABLE_DBI_LOCKFREE
|
||||
/* Is the DB already open? */
|
||||
const MDBX_env *const env = txn->env;
|
||||
size_t free_slot = env->n_dbi;
|
||||
for (size_t i = CORE_DBS; i < env->n_dbi; ++i) {
|
||||
retry:
|
||||
if ((env->dbs_flags[i] & DB_VALID) == 0) {
|
||||
free_slot = i;
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint32_t snap_seq =
|
||||
atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease);
|
||||
const uint16_t snap_flags = env->dbs_flags[i];
|
||||
const MDBX_val snap_name = env->kvs[i].name;
|
||||
if (user_flags != MDBX_ACCEDE &&
|
||||
(((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) ||
|
||||
(keycmp && keycmp != env->kvs[i].clc.k.cmp) ||
|
||||
(datacmp && datacmp != env->kvs[i].clc.v.cmp)))
|
||||
continue;
|
||||
const uint32_t main_seq =
|
||||
atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease);
|
||||
MDBX_cmp_func *const snap_cmp = env->kvs[MAIN_DBI].clc.k.cmp;
|
||||
if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base ||
|
||||
!snap_name.iov_len || !snap_cmp))
|
||||
continue;
|
||||
|
||||
const bool name_match = snap_cmp(&snap_name, name) == 0;
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
if (unlikely(
|
||||
snap_seq != atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease) ||
|
||||
main_seq !=
|
||||
atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) ||
|
||||
snap_flags != env->dbs_flags[i] ||
|
||||
snap_name.iov_base != env->kvs[i].name.iov_base ||
|
||||
snap_name.iov_len != env->kvs[i].name.iov_len))
|
||||
goto retry;
|
||||
if (name_match) {
|
||||
rc = dbi_check(txn, i);
|
||||
if (rc == MDBX_BAD_DBI && txn->dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) {
|
||||
/* хендл использовался, стал невалидным,
|
||||
* но теперь явно пере-открывается в этой транзакци */
|
||||
eASSERT(env, !txn->cursors[i]);
|
||||
txn->dbi_state[i] = DBI_LINDO;
|
||||
rc = dbi_check(txn, i);
|
||||
}
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
rc = dbi_bind(txn, i, user_flags, keycmp, datacmp);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
*dbi = (MDBX_dbi)i;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* Fail, if no free slot and max hit */
|
||||
if (unlikely(free_slot >= env->max_dbi))
|
||||
return MDBX_DBS_FULL;
|
||||
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
|
||||
|
||||
rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name);
|
||||
ENSURE(txn->env,
|
||||
osal_fastmutex_release(&txn->env->dbi_lock) == MDBX_SUCCESS);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr,
|
||||
MDBX_db_flags_t flags, MDBX_dbi *dbi,
|
||||
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
|
||||
MDBX_val thunk, *name;
|
||||
if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC ||
|
||||
name_cstr == MDBX_CHK_META)
|
||||
name = (void *)name_cstr;
|
||||
else {
|
||||
thunk.iov_len = strlen(name_cstr);
|
||||
thunk.iov_base = (void *)name_cstr;
|
||||
name = &thunk;
|
||||
}
|
||||
return dbi_open(txn, name, flags, dbi, keycmp, datacmp);
|
||||
}
|
||||
|
||||
struct dbi_rename_result {
|
||||
defer_free_item_t *defer;
|
||||
int err;
|
||||
};
|
||||
|
||||
__cold static struct dbi_rename_result
|
||||
dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) {
|
||||
struct dbi_rename_result pair;
|
||||
pair.defer = nullptr;
|
||||
pair.err = dbi_check(txn, dbi);
|
||||
if (unlikely(pair.err != MDBX_SUCCESS))
|
||||
return pair;
|
||||
|
||||
MDBX_env *const env = txn->env;
|
||||
MDBX_val old_name = env->kvs[dbi].name;
|
||||
if (env->kvs[MAIN_DBI].clc.k.cmp(&new_name, &old_name) == 0 &&
|
||||
MDBX_DEBUG == 0)
|
||||
return pair;
|
||||
|
||||
cursor_couple_t cx;
|
||||
pair.err = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||||
if (unlikely(pair.err != MDBX_SUCCESS))
|
||||
return pair;
|
||||
pair.err = cursor_seek(&cx.outer, &new_name, nullptr, MDBX_SET).err;
|
||||
if (unlikely(pair.err != MDBX_NOTFOUND)) {
|
||||
pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err;
|
||||
return pair;
|
||||
}
|
||||
|
||||
pair.defer = osal_malloc(dbi_namelen(new_name));
|
||||
if (unlikely(!pair.defer)) {
|
||||
pair.err = MDBX_ENOMEM;
|
||||
return pair;
|
||||
}
|
||||
new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len);
|
||||
|
||||
cx.outer.next = txn->cursors[MAIN_DBI];
|
||||
txn->cursors[MAIN_DBI] = &cx.outer;
|
||||
|
||||
MDBX_val data = {&txn->dbs[dbi], sizeof(tree_t)};
|
||||
pair.err = cursor_put_checklen(&cx.outer, &new_name, &data,
|
||||
N_SUBDATA | MDBX_NOOVERWRITE);
|
||||
if (likely(pair.err == MDBX_SUCCESS)) {
|
||||
pair.err = cursor_seek(&cx.outer, &old_name, nullptr, MDBX_SET).err;
|
||||
if (likely(pair.err == MDBX_SUCCESS))
|
||||
pair.err = cursor_del(&cx.outer, N_SUBDATA);
|
||||
if (likely(pair.err == MDBX_SUCCESS)) {
|
||||
pair.defer = env->kvs[dbi].name.iov_base;
|
||||
env->kvs[dbi].name = new_name;
|
||||
} else
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
}
|
||||
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
return pair;
|
||||
}
|
||||
|
||||
static defer_free_item_t *dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
|
||||
eASSERT(env, dbi >= CORE_DBS);
|
||||
if (unlikely(dbi >= env->n_dbi))
|
||||
return nullptr;
|
||||
|
||||
const uint32_t seq = dbi_seq_next(env, dbi);
|
||||
defer_free_item_t *defer_item = env->kvs[dbi].name.iov_base;
|
||||
if (likely(defer_item)) {
|
||||
env->dbs_flags[dbi] = 0;
|
||||
env->kvs[dbi].name.iov_len = 0;
|
||||
env->kvs[dbi].name.iov_base = nullptr;
|
||||
atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
defer_item->next = nullptr;
|
||||
|
||||
if (env->n_dbi == dbi + 1) {
|
||||
size_t i = env->n_dbi;
|
||||
do {
|
||||
--i;
|
||||
eASSERT(env, i >= CORE_DBS);
|
||||
eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len &&
|
||||
!env->kvs[i].name.iov_base);
|
||||
} while (i > CORE_DBS && !env->kvs[i - 1].name.iov_base);
|
||||
env->n_dbi = (unsigned)i;
|
||||
}
|
||||
}
|
||||
|
||||
return defer_item;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* API */
|
||||
|
||||
int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags,
|
||||
MDBX_dbi *dbi) {
|
||||
return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr);
|
||||
}
|
||||
|
||||
int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags,
|
||||
MDBX_dbi *dbi) {
|
||||
return dbi_open(txn, name, flags, dbi, nullptr, nullptr);
|
||||
}
|
||||
|
||||
int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags,
|
||||
MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
|
||||
MDBX_cmp_func *datacmp) {
|
||||
return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp);
|
||||
}
|
||||
|
||||
int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name,
|
||||
MDBX_db_flags_t flags, MDBX_dbi *dbi,
|
||||
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
|
||||
return dbi_open(txn, name, flags, dbi, keycmp, datacmp);
|
||||
}
|
||||
|
||||
__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) {
|
||||
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
cursor_couple_t cx;
|
||||
rc = cursor_init(&cx.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (txn->dbs[dbi].height) {
|
||||
cx.outer.next = txn->cursors[dbi];
|
||||
txn->cursors[dbi] = &cx.outer;
|
||||
rc = tree_drop(&cx.outer,
|
||||
dbi == MAIN_DBI || (cx.outer.tree->flags & MDBX_DUPSORT));
|
||||
txn->cursors[dbi] = cx.outer.next;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Invalidate the dropped DB's cursors */
|
||||
for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next)
|
||||
be_poor(mc);
|
||||
|
||||
if (!del || dbi < CORE_DBS) {
|
||||
/* reset the DB record, mark it dirty */
|
||||
txn->dbi_state[dbi] |= DBI_DIRTY;
|
||||
txn->dbs[dbi].height = 0;
|
||||
txn->dbs[dbi].branch_pages = 0;
|
||||
txn->dbs[dbi].leaf_pages = 0;
|
||||
txn->dbs[dbi].large_pages = 0;
|
||||
txn->dbs[dbi].items = 0;
|
||||
txn->dbs[dbi].root = P_INVALID;
|
||||
txn->dbs[dbi].sequence = 0;
|
||||
/* txn->dbs[dbi].mod_txnid = txn->txnid; */
|
||||
txn->flags |= MDBX_TXN_DIRTY;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_env *const env = txn->env;
|
||||
MDBX_val name = env->kvs[dbi].name;
|
||||
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
rc = cursor_seek(&cx.outer, &name, nullptr, MDBX_SET).err;
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
cx.outer.next = txn->cursors[MAIN_DBI];
|
||||
txn->cursors[MAIN_DBI] = &cx.outer;
|
||||
rc = cursor_del(&cx.outer, N_SUBDATA);
|
||||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
tASSERT(txn, txn->dbi_state[MAIN_DBI] & DBI_DIRTY);
|
||||
tASSERT(txn, txn->flags & MDBX_TXN_DIRTY);
|
||||
txn->dbi_state[dbi] = DBI_LINDO | DBI_OLDEN;
|
||||
rc = osal_fastmutex_acquire(&env->dbi_lock);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
return defer_and_release(env, dbi_close_locked(env, dbi));
|
||||
}
|
||||
}
|
||||
}
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) {
|
||||
MDBX_val thunk, *name;
|
||||
if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC ||
|
||||
name_cstr == MDBX_CHK_META)
|
||||
name = (void *)name_cstr;
|
||||
else {
|
||||
thunk.iov_len = strlen(name_cstr);
|
||||
thunk.iov_base = (void *)name_cstr;
|
||||
name = &thunk;
|
||||
}
|
||||
return mdbx_dbi_rename2(txn, dbi, name);
|
||||
}
|
||||
|
||||
int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(dbi < CORE_DBS))
|
||||
return (dbi == MAIN_DBI) ? MDBX_SUCCESS : MDBX_BAD_DBI;
|
||||
|
||||
if (unlikely(dbi >= env->max_dbi))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
if (unlikely(dbi < CORE_DBS || dbi >= env->max_dbi))
|
||||
return MDBX_BAD_DBI;
|
||||
|
||||
rc = osal_fastmutex_acquire(&env->dbi_lock);
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
rc = defer_and_release(env, dbi_close_locked(env, dbi));
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags,
|
||||
unsigned *state) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!flags || !state))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
rc = dbi_check(txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
*flags = txn->dbs[dbi].flags & DB_PERSISTENT_FLAGS;
|
||||
*state =
|
||||
txn->dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE);
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi,
|
||||
const MDBX_val *new_name) {
|
||||
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(new_name == MDBX_CHK_MAIN ||
|
||||
new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC ||
|
||||
new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META ||
|
||||
new_name->iov_base == MDBX_CHK_META))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(dbi < CORE_DBS))
|
||||
return MDBX_EINVAL;
|
||||
rc = dbi_check(txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name);
|
||||
if (pair.defer)
|
||||
pair.defer->next = nullptr;
|
||||
defer_and_release(txn->env, pair.defer);
|
||||
rc = pair.err;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void stat_get(const tree_t *db, MDBX_stat *st, size_t bytes) {
|
||||
st->ms_depth = db->height;
|
||||
st->ms_branch_pages = db->branch_pages;
|
||||
st->ms_leaf_pages = db->leaf_pages;
|
||||
st->ms_overflow_pages = db->large_pages;
|
||||
st->ms_entries = db->items;
|
||||
if (likely(bytes >=
|
||||
offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
|
||||
st->ms_mod_txnid = db->mod_txnid;
|
||||
}
|
||||
|
||||
__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest,
|
||||
size_t bytes) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!dest))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
rc = dbi_check(txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
|
||||
if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(txn->flags & MDBX_TXN_BLOCKED))
|
||||
return MDBX_BAD_TXN;
|
||||
|
||||
if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
|
||||
rc = sdb_fetch((MDBX_txn *)txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
dest->ms_psize = txn->env->ps;
|
||||
stat_get(&txn->dbs[dbi], dest, bytes);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
133
src/dbi.h
Normal file
133
src/dbi.h
Normal file
@ -0,0 +1,133 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL size_t
|
||||
dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi);
|
||||
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
|
||||
static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) {
|
||||
tASSERT(txn, bmi > 0);
|
||||
STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->dbi_sparse[0]));
|
||||
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
|
||||
if (sizeof(txn->dbi_sparse[0]) <= sizeof(int))
|
||||
return __builtin_ctz((int)bmi);
|
||||
if (sizeof(txn->dbi_sparse[0]) == sizeof(long))
|
||||
return __builtin_ctzl((long)bmi);
|
||||
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \
|
||||
__has_builtin(__builtin_ctzll)
|
||||
return __builtin_ctzll(bmi);
|
||||
#endif /* have(long long) && long long == uint64_t */
|
||||
#endif /* GNU C */
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
unsigned long index;
|
||||
if (sizeof(txn->dbi_sparse[0]) > 4) {
|
||||
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
|
||||
_BitScanForward64(&index, bmi);
|
||||
return index;
|
||||
#else
|
||||
if (bmi > UINT32_MAX) {
|
||||
_BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32));
|
||||
return index;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
_BitScanForward(&index, (uint32_t)bmi);
|
||||
return index;
|
||||
#endif /* MSVC */
|
||||
|
||||
return dbi_bitmap_ctz_fallback(txn, bmi);
|
||||
}
|
||||
|
||||
/* LY: Макрос целенаправленно сделан с одним циклом, чтобы сохранить возможность
|
||||
* использования оператора break */
|
||||
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
|
||||
for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->dbi_sparse[0]), \
|
||||
bitmap_item = TXN->dbi_sparse[0] >> FROM, I = FROM; \
|
||||
I < TXN->n_dbi; ++I) \
|
||||
if (bitmap_item == 0) { \
|
||||
I = (I - 1) | (bitmap_chunk - 1); \
|
||||
bitmap_item = TXN->dbi_sparse[(1 + I) / bitmap_chunk]; \
|
||||
if (!bitmap_item) \
|
||||
I += bitmap_chunk; \
|
||||
continue; \
|
||||
} else if ((bitmap_item & 1) == 0) { \
|
||||
size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \
|
||||
bitmap_item >>= bitmap_skip; \
|
||||
I += bitmap_skip - 1; \
|
||||
continue; \
|
||||
} else if (bitmap_item >>= 1, TXN->dbi_state[I])
|
||||
|
||||
#else
|
||||
|
||||
#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \
|
||||
for (size_t I = SKIP; I < TXN->n_dbi; ++I) \
|
||||
if (TXN->dbi_state[I])
|
||||
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
|
||||
#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0)
|
||||
#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS)
|
||||
|
||||
MDBX_INTERNAL int dbi_import(MDBX_txn *txn, const size_t dbi);
|
||||
|
||||
struct dbi_snap_result {
|
||||
uint32_t sequence;
|
||||
unsigned flags;
|
||||
};
|
||||
MDBX_INTERNAL struct dbi_snap_result dbi_snap(const MDBX_env *env,
|
||||
const size_t dbi);
|
||||
|
||||
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, int keep);
|
||||
|
||||
static inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) {
|
||||
STATIC_ASSERT(
|
||||
(int)DBI_DIRTY == MDBX_DBI_DIRTY && (int)DBI_STALE == MDBX_DBI_STALE &&
|
||||
(int)DBI_FRESH == MDBX_DBI_FRESH && (int)DBI_CREAT == MDBX_DBI_CREAT);
|
||||
|
||||
#if MDBX_ENABLE_DBI_SPARSE
|
||||
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
|
||||
const size_t bitmap_indx = dbi / bitmap_chunk;
|
||||
const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk;
|
||||
return likely(dbi < txn->n_dbi &&
|
||||
(txn->dbi_sparse[bitmap_indx] & bitmap_mask) != 0)
|
||||
? txn->dbi_state[dbi]
|
||||
: 0;
|
||||
#else
|
||||
return likely(dbi < txn->n_dbi) ? txn->dbi_state[dbi] : 0;
|
||||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||||
}
|
||||
|
||||
static inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) {
|
||||
const MDBX_env *const env = txn->env;
|
||||
eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO);
|
||||
const uint32_t snap_seq =
|
||||
atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
|
||||
return snap_seq != txn->dbi_seqs[dbi];
|
||||
}
|
||||
|
||||
static inline int dbi_check(const MDBX_txn *txn, const size_t dbi) {
|
||||
const uint8_t state = dbi_state(txn, dbi);
|
||||
if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi)))
|
||||
return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI;
|
||||
|
||||
/* Медленный путь: ленивая до-инициализацяи и импорт */
|
||||
return dbi_import((MDBX_txn *)txn, dbi);
|
||||
}
|
||||
|
||||
static inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) {
|
||||
uint32_t v = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease) + 1;
|
||||
return v ? v : 1;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL int dbi_open(MDBX_txn *txn, const MDBX_val *const name,
|
||||
unsigned user_flags, MDBX_dbi *dbi,
|
||||
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp);
|
||||
|
||||
MDBX_INTERNAL int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags,
|
||||
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp);
|
520
src/dpl.c
Normal file
520
src/dpl.c
Normal file
@ -0,0 +1,520 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static inline size_t dpl_size2bytes(ptrdiff_t size) {
|
||||
assert(size > CURSOR_STACK_SIZE && (size_t)size <= PAGELIST_LIMIT);
|
||||
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
|
||||
size += size;
|
||||
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
|
||||
STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) +
|
||||
(PAGELIST_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1)) *
|
||||
sizeof(dp_t) +
|
||||
MDBX_PNL_GRANULATE * sizeof(void *) * 2 <
|
||||
SIZE_MAX / 4 * 3);
|
||||
size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) +
|
||||
size * sizeof(dp_t),
|
||||
MDBX_PNL_GRANULATE * sizeof(void *) * 2) -
|
||||
MDBX_ASSUME_MALLOC_OVERHEAD;
|
||||
return bytes;
|
||||
}
|
||||
|
||||
static inline size_t dpl_bytes2size(const ptrdiff_t bytes) {
|
||||
size_t size = (bytes - sizeof(dpl_t)) / sizeof(dp_t);
|
||||
assert(size > CURSOR_STACK_SIZE &&
|
||||
size <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
|
||||
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
|
||||
size >>= 1;
|
||||
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
|
||||
return size;
|
||||
}
|
||||
|
||||
void dpl_free(MDBX_txn *txn) {
|
||||
if (likely(txn->tw.dirtylist)) {
|
||||
osal_free(txn->tw.dirtylist);
|
||||
txn->tw.dirtylist = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
dpl_t *dpl_reserve(MDBX_txn *txn, size_t size) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
size_t bytes =
|
||||
dpl_size2bytes((size < PAGELIST_LIMIT) ? size : PAGELIST_LIMIT);
|
||||
dpl_t *const dl = osal_realloc(txn->tw.dirtylist, bytes);
|
||||
if (likely(dl)) {
|
||||
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
|
||||
bytes = malloc_usable_size(dl);
|
||||
#endif /* malloc_usable_size */
|
||||
dl->detent = dpl_bytes2size(bytes);
|
||||
tASSERT(txn, txn->tw.dirtylist == nullptr || dl->length <= dl->detent);
|
||||
txn->tw.dirtylist = dl;
|
||||
}
|
||||
return dl;
|
||||
}
|
||||
|
||||
int dpl_alloc(MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
const size_t wanna = (txn->env->options.dp_initial < txn->geo.upper)
|
||||
? txn->env->options.dp_initial
|
||||
: txn->geo.upper;
|
||||
#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG
|
||||
if (txn->tw.dirtylist)
|
||||
/* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */
|
||||
txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0;
|
||||
#endif /* asertions enabled */
|
||||
if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna ||
|
||||
txn->tw.dirtylist->detent > wanna + wanna) &&
|
||||
unlikely(!dpl_reserve(txn, wanna)))
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
dpl_clear(txn->tw.dirtylist);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno)
|
||||
RADIXSORT_IMPL(dp, dp_t, MDBX_DPL_EXTRACT_KEY, MDBX_DPL_PREALLOC_FOR_RADIXSORT,
|
||||
1)
|
||||
|
||||
#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno)
|
||||
SORT_IMPL(dp_sort, false, dp_t, DP_SORT_CMP)
|
||||
|
||||
__hot __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
const size_t unsorted = dl->length - dl->sorted;
|
||||
if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) ||
|
||||
unlikely(!dp_radixsort(dl->items + 1, dl->length))) {
|
||||
if (dl->sorted > unsorted / 4 + 4 &&
|
||||
(MDBX_DPL_PREALLOC_FOR_RADIXSORT ||
|
||||
dl->length + unsorted < dl->detent + dpl_gap_mergesort)) {
|
||||
dp_t *const sorted_begin = dl->items + 1;
|
||||
dp_t *const sorted_end = sorted_begin + dl->sorted;
|
||||
dp_t *const end = dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT
|
||||
? dl->length + dl->length + 1
|
||||
: dl->detent + dpl_reserve_gap);
|
||||
dp_t *const tmp = end - unsorted;
|
||||
assert(dl->items + dl->length + 1 < tmp);
|
||||
/* copy unsorted to the end of allocated space and sort it */
|
||||
memcpy(tmp, sorted_end, unsorted * sizeof(dp_t));
|
||||
dp_sort(tmp, tmp + unsorted);
|
||||
/* merge two parts from end to begin */
|
||||
dp_t *__restrict w = dl->items + dl->length;
|
||||
dp_t *__restrict l = dl->items + dl->sorted;
|
||||
dp_t *__restrict r = end - 1;
|
||||
do {
|
||||
const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5);
|
||||
#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV
|
||||
*w = cmp ? *l-- : *r--;
|
||||
#else
|
||||
*w = cmp ? *l : *r;
|
||||
l -= cmp;
|
||||
r += (ptrdiff_t)cmp - 1;
|
||||
#endif
|
||||
} while (likely(--w > l));
|
||||
assert(r == tmp - 1);
|
||||
assert(dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
if (ASSERT_ENABLED())
|
||||
for (size_t i = 0; i <= dl->length; ++i)
|
||||
assert(dl->items[i].pgno < dl->items[i + 1].pgno);
|
||||
} else {
|
||||
dp_sort(dl->items + 1, dl->items + dl->length + 1);
|
||||
assert(dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
}
|
||||
} else {
|
||||
assert(dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
}
|
||||
dl->sorted = dl->length;
|
||||
return dl;
|
||||
}
|
||||
|
||||
/* Returns the index of the first dirty-page whose pgno
|
||||
* member is greater than or equal to id. */
|
||||
#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id))
|
||||
SEARCH_IMPL(dp_bsearch, dp_t, pgno_t, DP_SEARCH_CMP)
|
||||
|
||||
__hot __noinline MDBX_INTERNAL size_t dpl_search(const MDBX_txn *txn,
|
||||
pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
if (AUDIT_ENABLED()) {
|
||||
for (const dp_t *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
|
||||
assert(ptr[0].pgno < ptr[1].pgno);
|
||||
assert(ptr[0].pgno >= NUM_METAS);
|
||||
}
|
||||
}
|
||||
|
||||
switch (dl->length - dl->sorted) {
|
||||
default:
|
||||
/* sort a whole */
|
||||
dpl_sort_slowpath(txn);
|
||||
break;
|
||||
case 0:
|
||||
/* whole sorted cases */
|
||||
break;
|
||||
|
||||
#define LINEAR_SEARCH_CASE(N) \
|
||||
case N: \
|
||||
if (dl->items[dl->length - N + 1].pgno == pgno) \
|
||||
return dl->length - N + 1; \
|
||||
__fallthrough
|
||||
|
||||
/* use linear scan until the threshold */
|
||||
LINEAR_SEARCH_CASE(7); /* fall through */
|
||||
LINEAR_SEARCH_CASE(6); /* fall through */
|
||||
LINEAR_SEARCH_CASE(5); /* fall through */
|
||||
LINEAR_SEARCH_CASE(4); /* fall through */
|
||||
LINEAR_SEARCH_CASE(3); /* fall through */
|
||||
LINEAR_SEARCH_CASE(2); /* fall through */
|
||||
case 1:
|
||||
if (dl->items[dl->length].pgno == pgno)
|
||||
return dl->length;
|
||||
/* continue bsearch on the sorted part */
|
||||
break;
|
||||
}
|
||||
return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items;
|
||||
}
|
||||
|
||||
const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
const dpl_t *dl = txn->tw.dirtylist;
|
||||
if (dl) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
assert(dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
for (size_t i = dl->length; i > dl->sorted; --i)
|
||||
if (dl->items[i].pgno == pgno)
|
||||
return dl->items[i].ptr;
|
||||
|
||||
if (dl->sorted) {
|
||||
const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items;
|
||||
if (dl->items[i].pgno == pgno)
|
||||
return dl->items[i].ptr;
|
||||
}
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
assert((intptr_t)i > 0 && i <= dl->length);
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
dl->pages_including_loose -= npages;
|
||||
dl->sorted -= dl->sorted >= i;
|
||||
dl->length -= 1;
|
||||
memmove(dl->items + i, dl->items + i + 1,
|
||||
(dl->length - i + 2) * sizeof(dl->items[0]));
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
}
|
||||
|
||||
int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page,
|
||||
size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
const dp_t dp = {page, pgno, (pgno_t)npages};
|
||||
if ((txn->flags & MDBX_WRITEMAP) == 0) {
|
||||
size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->tw.dirtylru;
|
||||
}
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
tASSERT(txn, dl->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
|
||||
tASSERT(txn, dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
if (AUDIT_ENABLED()) {
|
||||
for (size_t i = dl->length; i > 0; --i) {
|
||||
assert(dl->items[i].pgno != dp.pgno);
|
||||
if (unlikely(dl->items[i].pgno == dp.pgno)) {
|
||||
ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(dl->length == dl->detent)) {
|
||||
if (unlikely(dl->detent >= PAGELIST_LIMIT)) {
|
||||
ERROR("DPL is full (PAGELIST_LIMIT %zu)", PAGELIST_LIMIT);
|
||||
return MDBX_TXN_FULL;
|
||||
}
|
||||
const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42)
|
||||
? dl->detent + dl->detent
|
||||
: dl->detent + dl->detent / 2;
|
||||
dl = dpl_reserve(txn, size);
|
||||
if (unlikely(!dl))
|
||||
return MDBX_ENOMEM;
|
||||
tASSERT(txn, dl->length < dl->detent);
|
||||
}
|
||||
|
||||
/* Сортировка нужна для быстрого поиска, используем несколько тактик:
|
||||
* 1) Сохраняем упорядоченность при естественной вставке в нужном порядке.
|
||||
* 2) Добавляем в не-сортированный хвост, который сортируем и сливаем
|
||||
* с отсортированной головой по необходимости, а пока хвост короткий
|
||||
* ищем в нём сканированием, избегая большой пересортировки.
|
||||
* 3) Если не-сортированный хвост короткий, а добавляемый элемент близок
|
||||
* к концу отсортированной головы, то выгоднее сразу вставить элемент
|
||||
* в нужное место.
|
||||
*
|
||||
* Алгоритмически:
|
||||
* - добавлять в не-сортированный хвост следует только если вставка сильно
|
||||
* дорогая, т.е. если целевая позиция элемента сильно далека от конца;
|
||||
* - для быстрой проверки достаточно сравнить добавляемый элемент с отстоящим
|
||||
* от конца на максимально-приемлемое расстояние;
|
||||
* - если список короче, либо элемент в этой позиции меньше вставляемого,
|
||||
* то следует перемещать элементы и вставлять в отсортированную голову;
|
||||
* - если не-сортированный хвост длиннее, либо элемент в этой позиции больше,
|
||||
* то следует добавлять в не-сортированный хвост. */
|
||||
|
||||
dl->pages_including_loose += npages;
|
||||
dp_t *i = dl->items + dl->length;
|
||||
|
||||
const ptrdiff_t pivot = (ptrdiff_t)dl->length - dpl_insertion_threshold;
|
||||
#if MDBX_HAVE_CMOV
|
||||
const pgno_t pivot_pgno =
|
||||
dl->items[(dl->length < dpl_insertion_threshold)
|
||||
? 0
|
||||
: dl->length - dpl_insertion_threshold]
|
||||
.pgno;
|
||||
#endif /* MDBX_HAVE_CMOV */
|
||||
|
||||
/* copy the stub beyond the end */
|
||||
i[2] = i[1];
|
||||
dl->length += 1;
|
||||
|
||||
if (likely(pivot <= (ptrdiff_t)dl->sorted) &&
|
||||
#if MDBX_HAVE_CMOV
|
||||
pivot_pgno < dp.pgno) {
|
||||
#else
|
||||
(pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) {
|
||||
#endif /* MDBX_HAVE_CMOV */
|
||||
dl->sorted += 1;
|
||||
|
||||
/* сдвигаем несортированный хвост */
|
||||
while (i >= dl->items + dl->sorted) {
|
||||
#if !defined(__GNUC__) /* пытаемся избежать вызова memmove() */
|
||||
i[1] = *i;
|
||||
#elif MDBX_WORDBITS == 64 && \
|
||||
(defined(__SIZEOF_INT128__) || \
|
||||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128))
|
||||
STATIC_ASSERT(sizeof(dp) == sizeof(__uint128_t));
|
||||
((__uint128_t *)i)[1] = *(volatile __uint128_t *)i;
|
||||
#else
|
||||
i[1].ptr = i->ptr;
|
||||
i[1].pgno = i->pgno;
|
||||
i[1].npages = i->npages;
|
||||
#endif
|
||||
--i;
|
||||
}
|
||||
/* ищем нужную позицию сдвигая отсортированные элементы */
|
||||
while (i->pgno > pgno) {
|
||||
tASSERT(txn, i > dl->items);
|
||||
i[1] = *i;
|
||||
--i;
|
||||
}
|
||||
tASSERT(txn, i->pgno < dp.pgno);
|
||||
}
|
||||
|
||||
i[1] = dp;
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
assert(dl->sorted <= dl->length);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold bool dpl_check(MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
const dpl_t *const dl = txn->tw.dirtylist;
|
||||
if (!dl) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
return true;
|
||||
}
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
tASSERT(txn, txn->tw.dirtyroom + dl->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom
|
||||
: txn->env->options.dp_limit));
|
||||
|
||||
if (!AUDIT_ENABLED())
|
||||
return true;
|
||||
|
||||
size_t loose = 0, pages = 0;
|
||||
for (size_t i = dl->length; i > 0; --i) {
|
||||
const page_t *const dp = dl->items[i].ptr;
|
||||
if (!dp)
|
||||
continue;
|
||||
|
||||
tASSERT(txn, dp->pgno == dl->items[i].pgno);
|
||||
if (unlikely(dp->pgno != dl->items[i].pgno))
|
||||
return false;
|
||||
|
||||
if ((txn->flags & MDBX_WRITEMAP) == 0) {
|
||||
const uint32_t age = dpl_age(txn, i);
|
||||
tASSERT(txn, age < UINT32_MAX / 3);
|
||||
if (unlikely(age > UINT32_MAX / 3))
|
||||
return false;
|
||||
}
|
||||
|
||||
tASSERT(txn, dp->flags == P_LOOSE || is_modifable(txn, dp));
|
||||
if (dp->flags == P_LOOSE) {
|
||||
loose += 1;
|
||||
} else if (unlikely(!is_modifable(txn, dp)))
|
||||
return false;
|
||||
|
||||
const unsigned num = dpl_npages(dl, i);
|
||||
pages += num;
|
||||
tASSERT(txn, txn->geo.first_unallocated >= dp->pgno + num);
|
||||
if (unlikely(txn->geo.first_unallocated < dp->pgno + num))
|
||||
return false;
|
||||
|
||||
if (i < dl->sorted) {
|
||||
tASSERT(txn, dl->items[i + 1].pgno >= dp->pgno + num);
|
||||
if (unlikely(dl->items[i + 1].pgno < dp->pgno + num))
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t rpa =
|
||||
pnl_search(txn->tw.relist, dp->pgno, txn->geo.first_unallocated);
|
||||
tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) ||
|
||||
txn->tw.relist[rpa] != dp->pgno);
|
||||
if (rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) &&
|
||||
unlikely(txn->tw.relist[rpa] == dp->pgno))
|
||||
return false;
|
||||
if (num > 1) {
|
||||
const size_t rpb = pnl_search(txn->tw.relist, dp->pgno + num - 1,
|
||||
txn->geo.first_unallocated);
|
||||
tASSERT(txn, rpa == rpb);
|
||||
if (unlikely(rpa != rpb))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
tASSERT(txn, loose == txn->tw.loose_count);
|
||||
if (unlikely(loose != txn->tw.loose_count))
|
||||
return false;
|
||||
|
||||
tASSERT(txn, pages == dl->pages_including_loose);
|
||||
if (unlikely(pages != dl->pages_including_loose))
|
||||
return false;
|
||||
|
||||
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) {
|
||||
const page_t *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]);
|
||||
tASSERT(txn, !dp);
|
||||
if (unlikely(dp))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__noinline void dpl_lru_reduce(MDBX_txn *txn) {
|
||||
NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1);
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
do {
|
||||
txn->tw.dirtylru >>= 1;
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
size_t *const ptr =
|
||||
ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr >>= 1;
|
||||
}
|
||||
txn = txn->parent;
|
||||
} while (txn);
|
||||
}
|
||||
|
||||
void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) {
|
||||
tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->geo.first_unallocated
|
||||
<< spilled));
|
||||
dpl_t *dl = dpl_sort(txn);
|
||||
|
||||
/* Scanning in ascend order */
|
||||
const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1;
|
||||
const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl);
|
||||
const intptr_t end = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(pl) + 1 : 0;
|
||||
tASSERT(txn, pl[begin] <= pl[end - step]);
|
||||
|
||||
size_t w, r = dpl_search(txn, pl[begin] >> spilled);
|
||||
tASSERT(txn, dl->sorted == dl->length);
|
||||
for (intptr_t i = begin; r <= dl->length;) { /* scan loop */
|
||||
assert(i != end);
|
||||
tASSERT(txn, !spilled || (pl[i] & 1) == 0);
|
||||
pgno_t pl_pgno = pl[i] >> spilled;
|
||||
pgno_t dp_pgno = dl->items[r].pgno;
|
||||
if (likely(dp_pgno != pl_pgno)) {
|
||||
const bool cmp = dp_pgno < pl_pgno;
|
||||
r += cmp;
|
||||
i += cmp ? 0 : step;
|
||||
if (likely(i != end))
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
|
||||
/* update loop */
|
||||
unsigned npages;
|
||||
w = r;
|
||||
remove_dl:
|
||||
npages = dpl_npages(dl, r);
|
||||
dl->pages_including_loose -= npages;
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
|
||||
page_shadow_release(txn->env, dl->items[r].ptr, npages);
|
||||
++r;
|
||||
next_i:
|
||||
i += step;
|
||||
if (unlikely(i == end)) {
|
||||
while (r <= dl->length)
|
||||
dl->items[w++] = dl->items[r++];
|
||||
} else {
|
||||
while (r <= dl->length) {
|
||||
assert(i != end);
|
||||
tASSERT(txn, !spilled || (pl[i] & 1) == 0);
|
||||
pl_pgno = pl[i] >> spilled;
|
||||
dp_pgno = dl->items[r].pgno;
|
||||
if (dp_pgno < pl_pgno)
|
||||
dl->items[w++] = dl->items[r++];
|
||||
else if (dp_pgno > pl_pgno)
|
||||
goto next_i;
|
||||
else
|
||||
goto remove_dl;
|
||||
}
|
||||
}
|
||||
dl->sorted = dpl_setlen(dl, w - 1);
|
||||
txn->tw.dirtyroom += r - w;
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom
|
||||
: txn->env->options.dp_limit));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dpl_release_shadows(MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
MDBX_env *env = txn->env;
|
||||
dpl_t *const dl = txn->tw.dirtylist;
|
||||
|
||||
for (size_t i = 1; i <= dl->length; i++)
|
||||
page_shadow_release(env, dl->items[i].ptr, dpl_npages(dl, i));
|
||||
|
||||
dpl_clear(dl);
|
||||
}
|
145
src/dpl.h
Normal file
145
src/dpl.h
Normal file
@ -0,0 +1,145 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
static inline size_t dpl_setlen(dpl_t *dl, size_t len) {
|
||||
static const page_t dpl_stub_pageE = {INVALID_TXNID,
|
||||
0,
|
||||
P_BAD,
|
||||
{0},
|
||||
/* pgno */ ~(pgno_t)0};
|
||||
assert(dpl_stub_pageE.flags == P_BAD && dpl_stub_pageE.pgno == P_INVALID);
|
||||
dl->length = len;
|
||||
dl->items[len + 1].ptr = (page_t *)&dpl_stub_pageE;
|
||||
dl->items[len + 1].pgno = P_INVALID;
|
||||
dl->items[len + 1].npages = 1;
|
||||
return len;
|
||||
}
|
||||
|
||||
static inline void dpl_clear(dpl_t *dl) {
|
||||
static const page_t dpl_stub_pageB = {INVALID_TXNID,
|
||||
0,
|
||||
P_BAD,
|
||||
{0},
|
||||
/* pgno */ 0};
|
||||
assert(dpl_stub_pageB.flags == P_BAD && dpl_stub_pageB.pgno == 0);
|
||||
dl->sorted = dpl_setlen(dl, 0);
|
||||
dl->pages_including_loose = 0;
|
||||
dl->items[0].ptr = (page_t *)&dpl_stub_pageB;
|
||||
dl->items[0].pgno = 0;
|
||||
dl->items[0].npages = 1;
|
||||
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL int __must_check_result dpl_alloc(MDBX_txn *txn);
|
||||
|
||||
MDBX_INTERNAL void dpl_free(MDBX_txn *txn);
|
||||
|
||||
MDBX_INTERNAL dpl_t *dpl_reserve(MDBX_txn *txn, size_t size);
|
||||
|
||||
MDBX_INTERNAL __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn);
|
||||
|
||||
static inline dpl_t *dpl_sort(const MDBX_txn *txn) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
tASSERT(txn, dl->length <= PAGELIST_LIMIT);
|
||||
tASSERT(txn, dl->sorted <= dl->length);
|
||||
tASSERT(txn, dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t *
|
||||
debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline unsigned dpl_npages(const dpl_t *dl,
|
||||
size_t i) {
|
||||
assert(0 <= (intptr_t)i && i <= dl->length);
|
||||
unsigned n = dl->items[i].npages;
|
||||
assert(n == (is_largepage(dl->items[i].ptr) ? dl->items[i].ptr->pages : 1));
|
||||
return n;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl,
|
||||
size_t i) {
|
||||
return dpl_npages(dl, i) + dl->items[i].pgno;
|
||||
}
|
||||
|
||||
static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno,
|
||||
size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
tASSERT(txn, dl->sorted == dl->length);
|
||||
tASSERT(txn, dl->items[0].pgno == 0 &&
|
||||
dl->items[dl->length + 1].pgno == P_INVALID);
|
||||
size_t const n = dpl_search(txn, pgno);
|
||||
tASSERT(txn, n >= 1 && n <= dl->length + 1);
|
||||
tASSERT(txn, pgno <= dl->items[n].pgno);
|
||||
tASSERT(txn, pgno > dl->items[n - 1].pgno);
|
||||
const bool rc =
|
||||
/* intersection with founded */ pgno + npages > dl->items[n].pgno ||
|
||||
/* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno;
|
||||
if (ASSERT_ENABLED()) {
|
||||
bool check = false;
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
const page_t *const dp = dl->items[i].ptr;
|
||||
if (!(dp->pgno /* begin */ >= /* end */ pgno + npages ||
|
||||
dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno))
|
||||
check |= true;
|
||||
}
|
||||
tASSERT(txn, check == rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn,
|
||||
pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
dpl_t *dl = txn->tw.dirtylist;
|
||||
size_t i = dpl_search(txn, pgno);
|
||||
tASSERT(txn, (int)i > 0);
|
||||
return (dl->items[i].pgno == pgno) ? i : 0;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages);
|
||||
|
||||
static inline void dpl_remove(const MDBX_txn *txn, size_t i) {
|
||||
dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i));
|
||||
}
|
||||
|
||||
MDBX_INTERNAL int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno,
|
||||
page_t *page, size_t npages);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool dpl_check(MDBX_txn *txn);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t dpl_age(const MDBX_txn *txn,
|
||||
size_t i) {
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
const dpl_t *dl = txn->tw.dirtylist;
|
||||
assert((intptr_t)i > 0 && i <= dl->length);
|
||||
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
return txn->tw.dirtylru - (uint32_t)*ptr;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void dpl_lru_reduce(MDBX_txn *txn);
|
||||
|
||||
static inline uint32_t dpl_lru_turn(MDBX_txn *txn) {
|
||||
txn->tw.dirtylru += 1;
|
||||
if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) &&
|
||||
(txn->flags & MDBX_WRITEMAP) == 0)
|
||||
dpl_lru_reduce(txn);
|
||||
return txn->tw.dirtylru;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled);
|
||||
|
||||
MDBX_INTERNAL void dpl_release_shadows(MDBX_txn *txn);
|
419
src/env-opts.c
Normal file
419
src/env-opts.c
Normal file
@ -0,0 +1,419 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold static unsigned default_rp_augment_limit(const MDBX_env *env) {
|
||||
const size_t timeframe = /* 16 секунд */ 16 << 16;
|
||||
const size_t remain_1sec =
|
||||
(env->options.gc_time_limit < timeframe)
|
||||
? timeframe - (size_t)env->options.gc_time_limit
|
||||
: 0;
|
||||
const size_t minimum = (env->maxgc_large1page * 2 > MDBX_PNL_INITIAL)
|
||||
? env->maxgc_large1page * 2
|
||||
: MDBX_PNL_INITIAL;
|
||||
const size_t one_third = env->geo_in_bytes.now / 3 >> env->ps2ln;
|
||||
const size_t augment_limit =
|
||||
(one_third > minimum)
|
||||
? minimum + (one_third - minimum) / timeframe * remain_1sec
|
||||
: minimum;
|
||||
eASSERT(env, augment_limit < PAGELIST_LIMIT);
|
||||
return pnl_bytes2size(pnl_size2bytes(augment_limit));
|
||||
}
|
||||
|
||||
static bool default_prefault_write(const MDBX_env *env) {
|
||||
return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->incore &&
|
||||
(env->flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
|
||||
}
|
||||
|
||||
static bool default_prefer_waf_insteadof_balance(const MDBX_env *env) {
|
||||
(void)env;
|
||||
return false;
|
||||
}
|
||||
|
||||
void env_options_init(MDBX_env *env) {
|
||||
env->options.rp_augment_limit = MDBX_PNL_INITIAL;
|
||||
env->options.dp_reserve_limit = MDBX_PNL_INITIAL;
|
||||
env->options.dp_initial = MDBX_PNL_INITIAL;
|
||||
env->options.spill_max_denominator = 8;
|
||||
env->options.spill_min_denominator = 8;
|
||||
env->options.spill_parent4child_denominator = 0;
|
||||
env->options.dp_loose_limit = 64;
|
||||
env->options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */;
|
||||
if (default_prefer_waf_insteadof_balance(env))
|
||||
env->options.prefer_waf_insteadof_balance = true;
|
||||
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
env->options.writethrough_threshold =
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
globals.running_on_WSL1 ? MAX_PAGENO :
|
||||
#endif /* Linux */
|
||||
MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
|
||||
#endif /* Windows */
|
||||
}
|
||||
|
||||
void env_options_adjust_defaults(MDBX_env *env) {
|
||||
if (!env->options.flags.non_auto.rp_augment_limit)
|
||||
env->options.rp_augment_limit = default_rp_augment_limit(env);
|
||||
if (!env->options.flags.non_auto.prefault_write)
|
||||
env->options.prefault_write = default_prefault_write(env);
|
||||
|
||||
const size_t basis = env->geo_in_bytes.now;
|
||||
/* TODO: use options? */
|
||||
const unsigned factor = 9;
|
||||
size_t threshold = (basis < ((size_t)65536 << factor))
|
||||
? 65536 /* minimal threshold */
|
||||
: (basis > (MEGABYTE * 4 << factor))
|
||||
? MEGABYTE * 4 /* maximal threshold */
|
||||
: basis >> factor;
|
||||
threshold =
|
||||
(threshold < env->geo_in_bytes.shrink || !env->geo_in_bytes.shrink)
|
||||
? threshold
|
||||
: env->geo_in_bytes.shrink;
|
||||
|
||||
env->madv_threshold = bytes2pgno(env, bytes_align2os_bytes(env, threshold));
}

//------------------------------------------------------------------------------

__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
                               uint64_t value) {
  int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  const bool lock_needed =
      ((env->flags & ENV_ACTIVE) && env->basal_txn && !env_txn0_owned(env));
  bool should_unlock = false;
  switch (option) {
  case MDBX_opt_sync_bytes:
    if (value == /* default */ UINT64_MAX)
      value = MAX_WRITE;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > SIZE_MAX - 65536))
      return MDBX_EINVAL;
    value = bytes2pgno(env, (size_t)value + env->ps - 1);
    if ((uint32_t)value !=
            atomic_load32(&env->lck->autosync_threshold, mo_AcquireRelease) &&
        atomic_store32(&env->lck->autosync_threshold, (uint32_t)value,
                       mo_Relaxed)
        /* Trigger sync(force=off) only if a new non-zero value has been set
         * and we are outside a transaction */
        && lock_needed) {
      err = env_sync(env, false, false);
      if (err == /* nothing to flush to disk */ MDBX_RESULT_TRUE)
        err = MDBX_SUCCESS;
    }
    break;

  case MDBX_opt_sync_period:
    if (value == /* default */ UINT64_MAX)
      value = 2780315 /* 42.42424 seconds */;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > UINT32_MAX))
      return MDBX_EINVAL;
    value = osal_16dot16_to_monotime((uint32_t)value);
    if (value != atomic_load64(&env->lck->autosync_period, mo_AcquireRelease) &&
        atomic_store64(&env->lck->autosync_period, value, mo_Relaxed)
        /* Trigger sync(force=off) only if a new non-zero value has been set
         * and we are outside a transaction */
        && lock_needed) {
      err = env_sync(env, false, false);
      if (err == /* nothing to flush to disk */ MDBX_RESULT_TRUE)
        err = MDBX_SUCCESS;
    }
    break;

  case MDBX_opt_max_db:
    if (value == /* default */ UINT64_MAX)
      value = 42;
    if (unlikely(value > MDBX_MAX_DBI))
      return MDBX_EINVAL;
    if (unlikely(env->dxb_mmap.base))
      return MDBX_EPERM;
    env->max_dbi = (unsigned)value + CORE_DBS;
    break;

  case MDBX_opt_max_readers:
    if (value == /* default */ UINT64_MAX)
      value = MDBX_READERS_LIMIT;
    if (unlikely(value < 1 || value > MDBX_READERS_LIMIT))
      return MDBX_EINVAL;
    if (unlikely(env->dxb_mmap.base))
      return MDBX_EPERM;
    env->max_readers = (unsigned)value;
    break;

  case MDBX_opt_dp_reserve_limit:
    if (value == /* default */ UINT64_MAX)
      value = INT_MAX;
    if (unlikely(value > INT_MAX))
      return MDBX_EINVAL;
    if (env->options.dp_reserve_limit != (unsigned)value) {
      if (lock_needed) {
        err = lck_txn_lock(env, false);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
        should_unlock = true;
      }
      env->options.dp_reserve_limit = (unsigned)value;
      while (env->shadow_reserve_len > env->options.dp_reserve_limit) {
        eASSERT(env, env->shadow_reserve != nullptr);
        page_t *dp = env->shadow_reserve;
        MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->ps);
        VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *));
        env->shadow_reserve = page_next(dp);
        void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
        osal_free(ptr);
        env->shadow_reserve_len -= 1;
      }
    }
    break;

  case MDBX_opt_rp_augment_limit:
    if (value == /* default */ UINT64_MAX) {
      env->options.flags.non_auto.rp_augment_limit = 0;
      env->options.rp_augment_limit = default_rp_augment_limit(env);
    } else if (unlikely(value > PAGELIST_LIMIT))
      return MDBX_EINVAL;
    else {
      env->options.flags.non_auto.rp_augment_limit = 1;
      env->options.rp_augment_limit = (unsigned)value;
    }
    break;

  case MDBX_opt_gc_time_limit:
    if (value == /* default */ UINT64_MAX)
      value = 0;
    if (unlikely(value > UINT32_MAX))
      return MDBX_EINVAL;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    value = osal_16dot16_to_monotime((uint32_t)value);
    if (value != env->options.gc_time_limit) {
      if (env->txn && lock_needed)
        return MDBX_EPERM;
      env->options.gc_time_limit = value;
      if (!env->options.flags.non_auto.rp_augment_limit)
        env->options.rp_augment_limit = default_rp_augment_limit(env);
    }
    break;

  case MDBX_opt_txn_dp_limit:
  case MDBX_opt_txn_dp_initial:
    if (value == /* default */ UINT64_MAX)
      value = PAGELIST_LIMIT;
    if (unlikely(value > PAGELIST_LIMIT || value < CURSOR_STACK_SIZE * 4))
      return MDBX_EINVAL;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (lock_needed) {
      err = lck_txn_lock(env, false);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      should_unlock = true;
    }
    if (env->txn)
      err = MDBX_EPERM /* unable to change during a transaction */;
    else {
      const pgno_t value32 = (pgno_t)value;
      if (option == MDBX_opt_txn_dp_initial &&
          env->options.dp_initial != value32) {
        env->options.dp_initial = value32;
        if (env->options.dp_limit < value32) {
          env->options.dp_limit = value32;
          env->options.flags.non_auto.dp_limit = 1;
        }
      }
      if (option == MDBX_opt_txn_dp_limit && env->options.dp_limit != value32) {
        env->options.dp_limit = value32;
        env->options.flags.non_auto.dp_limit = 1;
        if (env->options.dp_initial > value32)
          env->options.dp_initial = value32;
      }
    }
    break;

  case MDBX_opt_spill_max_denominator:
    if (value == /* default */ UINT64_MAX)
      value = 8;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.spill_max_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_min_denominator:
    if (value == /* default */ UINT64_MAX)
      value = 8;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.spill_min_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_parent4child_denominator:
    if (value == /* default */ UINT64_MAX)
      value = 0;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.spill_parent4child_denominator = (uint8_t)value;
    break;

  case MDBX_opt_loose_limit:
    if (value == /* default */ UINT64_MAX)
      value = 64;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.dp_loose_limit = (uint8_t)value;
    break;

  case MDBX_opt_merge_threshold_16dot16_percent:
    if (value == /* default */ UINT64_MAX)
      value = 65536 / 4 /* 25% */;
    if (unlikely(value < 8192 || value > 32768))
      return MDBX_EINVAL;
    env->options.merge_threshold_16dot16_percent = (unsigned)value;
    recalculate_merge_thresholds(env);
    break;

  case MDBX_opt_writethrough_threshold:
#if defined(_WIN32) || defined(_WIN64)
    /* Allow "setting" the default value, as well as the value matching
     * the behaviour implied by the current MDBX_NOMETASYNC setting. */
    if (value == /* default */ UINT64_MAX &&
        value != ((env->flags & MDBX_NOMETASYNC) ? 0 : UINT_MAX))
      err = MDBX_EINVAL;
#else
    if (value == /* default */ UINT64_MAX)
      value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
    if (value != (unsigned)value)
      err = MDBX_EINVAL;
    else
      env->options.writethrough_threshold = (unsigned)value;
#endif
    break;

  case MDBX_opt_prefault_write_enable:
    if (value == /* default */ UINT64_MAX) {
      env->options.prefault_write = default_prefault_write(env);
      env->options.flags.non_auto.prefault_write = false;
    } else if (value > 1)
      err = MDBX_EINVAL;
    else {
      env->options.prefault_write = value != 0;
      env->options.flags.non_auto.prefault_write = true;
    }
    break;

  case MDBX_opt_prefer_waf_insteadof_balance:
    if (value == /* default */ UINT64_MAX)
      env->options.prefer_waf_insteadof_balance =
          default_prefer_waf_insteadof_balance(env);
    else if (value > 1)
      err = MDBX_EINVAL;
    else
      env->options.prefer_waf_insteadof_balance = value != 0;
    break;

  default:
    return MDBX_EINVAL;
  }

  if (should_unlock)
    lck_txn_unlock(env);
  return err;
}
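/* A minimal, hedged illustration (not taken from the diff itself) of the
 * 16.16 fixed-point encoding assumed by MDBX_opt_sync_period and
 * MDBX_opt_gc_time_limit above: the upper 16 bits hold whole seconds and the
 * lower 16 bits hold 1/65536-th fractions, so the default 2780315 decodes to
 * roughly 42.42424 seconds. The helper names below are hypothetical. */
#include <stdint.h>
#include <stdio.h>

static double decode_16dot16_seconds(uint32_t v) {
  return v / 65536.0; /* 2780315 / 65536.0 ~= 42.42424 */
}

static uint32_t encode_16dot16_seconds(double seconds) {
  return (uint32_t)(seconds * 65536.0 + 0.5); /* 42.42424 -> ~2780317 */
}

int main(void) {
  printf("%.5f\n", decode_16dot16_seconds(2780315));
  printf("%u\n", encode_16dot16_seconds(42.42424));
  return 0;
}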

__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
                               uint64_t *pvalue) {
  int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(!pvalue))
    return MDBX_EINVAL;

  switch (option) {
  case MDBX_opt_sync_bytes:
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    *pvalue = pgno2bytes(
        env, atomic_load32(&env->lck->autosync_threshold, mo_Relaxed));
    break;

  case MDBX_opt_sync_period:
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    *pvalue = osal_monotime_to_16dot16(
        atomic_load64(&env->lck->autosync_period, mo_Relaxed));
    break;

  case MDBX_opt_max_db:
    *pvalue = env->max_dbi - CORE_DBS;
    break;

  case MDBX_opt_max_readers:
    *pvalue = env->max_readers;
    break;

  case MDBX_opt_dp_reserve_limit:
    *pvalue = env->options.dp_reserve_limit;
    break;

  case MDBX_opt_rp_augment_limit:
    *pvalue = env->options.rp_augment_limit;
    break;

  case MDBX_opt_gc_time_limit:
    *pvalue = osal_monotime_to_16dot16(env->options.gc_time_limit);
    break;

  case MDBX_opt_txn_dp_limit:
    *pvalue = env->options.dp_limit;
    break;
  case MDBX_opt_txn_dp_initial:
    *pvalue = env->options.dp_initial;
    break;

  case MDBX_opt_spill_max_denominator:
    *pvalue = env->options.spill_max_denominator;
    break;
  case MDBX_opt_spill_min_denominator:
    *pvalue = env->options.spill_min_denominator;
    break;
  case MDBX_opt_spill_parent4child_denominator:
    *pvalue = env->options.spill_parent4child_denominator;
    break;

  case MDBX_opt_loose_limit:
    *pvalue = env->options.dp_loose_limit;
    break;

  case MDBX_opt_merge_threshold_16dot16_percent:
    *pvalue = env->options.merge_threshold_16dot16_percent;
    break;

  case MDBX_opt_writethrough_threshold:
#if defined(_WIN32) || defined(_WIN64)
    *pvalue = (env->flags & MDBX_NOMETASYNC) ? 0 : INT_MAX;
#else
    *pvalue = env->options.writethrough_threshold;
#endif
    break;

  case MDBX_opt_prefault_write_enable:
    *pvalue = env->options.prefault_write;
    break;

  case MDBX_opt_prefer_waf_insteadof_balance:
    *pvalue = env->options.prefer_waf_insteadof_balance;
    break;

  default:
    return MDBX_EINVAL;
  }

  return MDBX_SUCCESS;
}
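/* A short usage sketch of the public option API implemented above. The exact
 * set of MDBX_opt_* values depends on the libmdbx version; error handling is
 * abbreviated. Note that, per the checks above, MDBX_opt_max_db may only be
 * changed before the environment is opened, while MDBX_opt_sync_bytes and
 * MDBX_opt_sync_period require an already-active environment. */
#include "mdbx.h"
#include <stdint.h>
#include <stdio.h>

int example_tune_env(void) {
  MDBX_env *env = NULL;
  int rc = mdbx_env_create(&env);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* Request up to 42 named sub-databases; UINT64_MAX would restore the
   * built-in default handled in mdbx_env_set_option() above. */
  rc = mdbx_env_set_option(env, MDBX_opt_max_db, 42);
  if (rc == MDBX_SUCCESS) {
    uint64_t value = 0;
    rc = mdbx_env_get_option(env, MDBX_opt_max_db, &value);
    if (rc == MDBX_SUCCESS)
      printf("max_db is now %llu\n", (unsigned long long)value);
  }
  mdbx_env_close(env);
  return rc;
}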
679  src/env.c  (new file)
@@ -0,0 +1,679 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#include "internals.h"

bool env_txn0_owned(const MDBX_env *env) {
  return (env->flags & MDBX_NOSTICKYTHREADS)
             ? (env->basal_txn->owner != 0)
             : (env->basal_txn->owner == osal_thread_self());
}

int env_page_auxbuffer(MDBX_env *env) {
  return env->page_auxbuf ? MDBX_SUCCESS
                          : osal_memalign_alloc(globals.sys_pagesize,
                                                env->ps * (size_t)NUM_METAS,
                                                &env->page_auxbuf);
}

__cold unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize) {
  STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE);
  STATIC_ASSERT(MDBX_MIN_PAGESIZE > sizeof(page_t) + sizeof(meta_t));
  ENSURE(env, is_powerof2(pagesize));
  ENSURE(env, pagesize >= MDBX_MIN_PAGESIZE);
  ENSURE(env, pagesize <= MDBX_MAX_PAGESIZE);
  env->ps = (unsigned)pagesize;
  if (env->page_auxbuf) {
    osal_memalign_free(env->page_auxbuf);
    env->page_auxbuf = nullptr;
  }

  STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MIN_PAGESIZE) > 4);
  STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MAX_PAGESIZE) < PAGELIST_LIMIT);
  const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
  ENSURE(env,
         maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)PAGELIST_LIMIT / 4);
  env->maxgc_large1page = (unsigned)maxgc_ov1page;
  env->maxgc_per_branch =
      (unsigned)((pagesize - PAGEHDRSZ) /
                 (sizeof(indx_t) + sizeof(node_t) + sizeof(txnid_t)));

  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) >
                sizeof(tree_t) + NODESIZE + 42);
  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX);
  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) >=
                BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE));
  STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) > NODESIZE + 42);
  STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX);
  const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize);
  const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize);
  ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) &&
                  branch_nodemax % 2 == 0 &&
                  leaf_nodemax > (intptr_t)(sizeof(tree_t) + NODESIZE + 42) &&
                  leaf_nodemax >= branch_nodemax &&
                  leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0);
  env->leaf_nodemax = (uint16_t)leaf_nodemax;
  env->branch_nodemax = (uint16_t)branch_nodemax;
  env->ps2ln = (uint8_t)log2n_powerof2(pagesize);
  eASSERT(env, pgno2bytes(env, 1) == pagesize);
  eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2);
  recalculate_merge_thresholds(env);

  /* TODO: recalculate me_subpage_xyz values from MDBX_opt_subpage_xyz. */
  env->subpage_limit = env->leaf_nodemax - NODESIZE;
  env->subpage_room_threshold = 0;
  env->subpage_reserve_prereq = env->leaf_nodemax;
  env->subpage_reserve_limit = env->subpage_limit / 42;
  eASSERT(env, env->subpage_reserve_prereq >
                   env->subpage_room_threshold + env->subpage_reserve_limit);
  eASSERT(env, env->leaf_nodemax >= env->subpage_limit + NODESIZE);

  const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE);
  if (!env->options.flags.non_auto.dp_limit) {
    /* auto-setup dp_limit by "The42" ;-) */
    intptr_t total_ram_pages, avail_ram_pages;
    int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages);
    if (unlikely(err != MDBX_SUCCESS))
      ERROR("mdbx_get_sysraminfo(), rc %d", err);
    else {
      size_t reasonable_dpl_limit =
          (size_t)(total_ram_pages + avail_ram_pages) / 42;
      if (pagesize > globals.sys_pagesize)
        reasonable_dpl_limit /= pagesize / globals.sys_pagesize;
      else if (pagesize < globals.sys_pagesize)
        reasonable_dpl_limit *= globals.sys_pagesize / pagesize;
      reasonable_dpl_limit = (reasonable_dpl_limit < PAGELIST_LIMIT)
                                 ? reasonable_dpl_limit
                                 : PAGELIST_LIMIT;
      reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK_SIZE * 4)
                                 ? reasonable_dpl_limit
                                 : CURSOR_STACK_SIZE * 4;
      env->options.dp_limit = (unsigned)reasonable_dpl_limit;
    }
  }
  if (env->options.dp_limit > max_pgno - NUM_METAS)
    env->options.dp_limit = max_pgno - NUM_METAS;
  if (env->options.dp_initial > env->options.dp_limit)
    env->options.dp_initial = env->options.dp_limit;
  return env->ps;
}
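/* An illustrative sketch (simplified, with hypothetical helper names, not
 * from the diff) of the power-of-two bookkeeping that env_setup_pagesize()
 * relies on: page/byte conversions reduce to shifts by ps2ln, and byte-sized
 * thresholds are rounded up to whole pages, as in the MDBX_opt_sync_bytes
 * handling above. */
#include <assert.h>
#include <stddef.h>

static unsigned log2_of_powerof2(size_t v) {
  unsigned n = 0;
  while ((v >>= 1) != 0)
    ++n;
  return n;
}

static void pagesize_bookkeeping_example(void) {
  const size_t pagesize = 4096;                       /* must be a power of two */
  const unsigned ps2ln = log2_of_powerof2(pagesize);  /* 12 for 4 KiB pages */
  /* pgno2bytes()/bytes2pgno() equivalents for this sketch */
  assert(((size_t)1 << ps2ln) == pagesize);
  assert(((2 * pagesize) >> ps2ln) == 2);
  /* Rounding a byte threshold up to whole pages, as the sync_bytes setter does */
  const size_t threshold_bytes = 1000000;
  const size_t threshold_pages = (threshold_bytes + pagesize - 1) >> ps2ln;
  assert(threshold_pages == 245);
}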

__cold int env_sync(MDBX_env *env, bool force, bool nonblock) {
  if (unlikely(env->flags & MDBX_RDONLY))
    return MDBX_EACCESS;

  const bool txn0_owned = env_txn0_owned(env);
  bool should_unlock = false;
  int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;

retry:;
  unsigned flags = env->flags & ~(MDBX_NOMETASYNC | txn_shrink_allowed);
  if (unlikely((flags & (ENV_FATAL_ERROR | ENV_ACTIVE)) != ENV_ACTIVE)) {
    rc = (flags & ENV_FATAL_ERROR) ? MDBX_PANIC : MDBX_EPERM;
    goto bailout;
  }

  const troika_t troika =
      (txn0_owned | should_unlock) ? env->basal_txn->tw.troika : meta_tap(env);
  const meta_ptr_t head = meta_recent(env, &troika);
  const uint64_t unsynced_pages =
      atomic_load64(&env->lck->unsynced_pages, mo_Relaxed);
  if (unsynced_pages == 0) {
    const uint32_t synched_meta_txnid_u32 =
        atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
    if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady)
      goto bailout;
  }

  if (should_unlock && (env->flags & MDBX_WRITEMAP) &&
      unlikely(head.ptr_c->geometry.first_unallocated >
               bytes2pgno(env, env->dxb_mmap.current))) {

    if (unlikely(env->stuck_meta >= 0) &&
        troika.recent != (uint8_t)env->stuck_meta) {
      NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent "
             "meta-page (%u)",
             "sync datafile", env->stuck_meta, troika.recent);
      rc = MDBX_RESULT_TRUE;
    } else {
      rc = dxb_resize(env, head.ptr_c->geometry.first_unallocated,
                      head.ptr_c->geometry.now, head.ptr_c->geometry.upper,
                      implicit_grow);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
    }
  }

  const size_t autosync_threshold =
      atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
  const uint64_t autosync_period =
      atomic_load64(&env->lck->autosync_period, mo_Relaxed);
  uint64_t eoos_timestamp;
  if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
      (autosync_period &&
       (eoos_timestamp =
            atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
       osal_monotime() - eoos_timestamp >= autosync_period))
    flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;

  if (!txn0_owned) {
    if (!should_unlock) {
#if MDBX_ENABLE_PGOP_STAT
      unsigned wops = 0;
#endif /* MDBX_ENABLE_PGOP_STAT */

      int err;
      /* pre-sync to avoid latency for writer */
      if (unsynced_pages > /* FIXME: define threshold */ 42 &&
          (flags & MDBX_SAFE_NOSYNC) == 0) {
        eASSERT(env, ((flags ^ env->flags) & MDBX_WRITEMAP) == 0);
        if (flags & MDBX_WRITEMAP) {
          /* Acquire guard to avoid collision with remap */
#if defined(_WIN32) || defined(_WIN64)
          imports.srwl_AcquireShared(&env->remap_guard);
#else
          err = osal_fastmutex_acquire(&env->remap_guard);
          if (unlikely(err != MDBX_SUCCESS))
            return err;
#endif
          const size_t usedbytes =
              pgno_align2os_bytes(env, head.ptr_c->geometry.first_unallocated);
          err = osal_msync(&env->dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA);
#if defined(_WIN32) || defined(_WIN64)
          imports.srwl_ReleaseShared(&env->remap_guard);
#else
          int unlock_err = osal_fastmutex_release(&env->remap_guard);
          if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS)
            err = unlock_err;
#endif
        } else
          err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA);

        if (unlikely(err != MDBX_SUCCESS))
          return err;

#if MDBX_ENABLE_PGOP_STAT
        wops = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
        /* pre-sync done */
        rc = MDBX_SUCCESS /* means "some data was synced" */;
      }

      err = lck_txn_lock(env, nonblock);
      if (unlikely(err != MDBX_SUCCESS))
        return err;

      should_unlock = true;
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.wops.weak += wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
      env->basal_txn->tw.troika = meta_tap(env);
      eASSERT(env, !env->txn && !env->basal_txn->nested);
      goto retry;
    }
    eASSERT(env, head.txnid == recent_committed_txnid(env));
    env->basal_txn->txnid = head.txnid;
    txn_snapshot_oldest(env->basal_txn);
    flags |= txn_shrink_allowed;
  }

  eASSERT(env, txn0_owned || should_unlock);
  eASSERT(env, !txn0_owned || (flags & txn_shrink_allowed) == 0);

  if (!head.is_steady && unlikely(env->stuck_meta >= 0) &&
      troika.recent != (uint8_t)env->stuck_meta) {
    NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent "
           "meta-page (%u)",
           "sync datafile", env->stuck_meta, troika.recent);
    rc = MDBX_RESULT_TRUE;
    goto bailout;
  }
  if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
    DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64,
          data_page(head.ptr_c)->pgno, durable_caption(head.ptr_c),
          unsynced_pages);
    meta_t meta = *head.ptr_c;
    rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->tw.troika);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
  }

  /* LY: sync meta-pages if MDBX_NOMETASYNC enabled
   * and someone was not synced above. */
  if (atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) !=
      (uint32_t)head.txnid)
    rc = meta_sync(env, head);

bailout:
  if (should_unlock)
    lck_txn_unlock(env);
  return rc;
}
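/* The logic above backs the public sync entry point; a hedged usage sketch,
 * assuming an already-open MDBX_env. MDBX_RESULT_TRUE from the core means
 * "there was nothing to flush". */
#include "mdbx.h"

static int flush_now(MDBX_env *env) {
  /* force=true requests a full steady sync; nonblock=false may wait for the
   * writer lock, mirroring the lck_txn_lock() path above. */
  int rc = mdbx_env_sync_ex(env, /* force */ true, /* nonblock */ false);
  return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
}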

__cold int env_open(MDBX_env *env, mdbx_mode_t mode) {
  /* Using O_DSYNC or FILE_FLAG_WRITE_THROUGH:
   *
   * 0) If the DB page size is smaller than the system RAM page, the OS kernel
   *    has to update pages in the unified page cache more often.
   *
   *    However, O_DSYNC does not imply bypassing the unified page cache, so
   *    such difficulties are treated as an OS problem and/or as the expected
   *    penalty for using small DB pages.
   *
   * 1) In MDBX_SYNC_DURABLE mode - O_DSYNC for writing both data and
   *    meta-pages. However, on Linux giving up O_DSYNC in favour of a
   *    subsequent fdatasync() may be preferable on HDDs, since it lets the
   *    io-scheduler reorder writes according to the actual layout of the DB
   *    file on the media.
   *
   * 2) In MDBX_NOMETASYNC mode - O_DSYNC could be used for data, but there
   *    may be little point in it, since fdatasync() is required anyway to
   *    guarantee that the meta of the previous transaction has been
   *    committed.
   *
   *    As a result, on normal (non-Windows) systems there are two options:
   *    - when O_DIRECT and/or io_ring are available for data, it most likely
   *      makes sense to call fdatasync() before writing the data and then
   *      use O_DSYNC;
   *    - do not use O_DSYNC and call fdatasync() after writing the data.
   *
   *    On Windows, however, the use of FlushFileBuffers() should be
   *    minimized because of its performance problems. Therefore on Windows
   *    in MDBX_NOMETASYNC mode:
   *    - meta is updated via a descriptor without FILE_FLAG_WRITE_THROUGH;
   *    - before data writing starts, FlushFileBuffers() is called if
   *      meta_sync_txnid does not match the last written meta;
   *    - data is written via a descriptor with FILE_FLAG_WRITE_THROUGH.
   *
   * 3) In MDBX_SAFE_NOSYNC mode there is no point in using O_DSYNC until a
   *    fully asynchronous "catch-up" write in a dedicated server process
   *    with internal io-ring queues is implemented.
   *
   * -----
   *
   * Using O_DIRECT or FILE_FLAG_NO_BUFFERING:
   *
   * The purpose of these flags is to detach the file descriptor from the
   * unified page cache, i.e. from the memory-mapped data in the case of
   * libmdbx.
   *
   * Therefore, using direct i/o in libmdbx without MDBX_WRITEMAP is
   * pointless and counter-productive, since this way we either provoke the
   * OS kernel into incoherence between the memory mapping and the file
   * contents on the media, or require extra checks and actions aimed at
   * effectively disabling O_DIRECT for the memory-mapped data.
   *
   * In MDBX_WRITEMAP mode the coherence of the mapped data is guaranteed
   * physically. Hence direct i/o may make sense if the OS kernel has any
   * problems with msync(), including performance ones:
   * - using io_ring or gather-write may be cheaper than the kernel walking
   *   the PTEs and writing out the modified/dirty pages;
   * - but the problem is that pages written from user mode either will not
   *   be marked clean (and therefore will be written by the kernel once
   *   again), or the kernel has to find and clean the PTEs when it receives
   *   the write request.
   *
   * Therefore O_DIRECT or FILE_FLAG_NO_BUFFERING is used:
   * - only in MDBX_SYNC_DURABLE mode together with MDBX_WRITEMAP;
   * - when ps >= me_os_psize;
   * - with the MDBX_AVOID_MSYNC != 0 build option, which is enabled by
   *   default only on Windows (see below).
   *
   * -----
   *
   * Using FILE_FLAG_OVERLAPPED on Windows:
   *
   * Windows is very bad at I/O (except for direct page-wise scatter/gather,
   * which bypasses the problematic unified page cache and is therefore
   * almost useless for libmdbx).
   *
   * It gets even worse with FlushFileBuffers(), which is also required after
   * FlushViewOfFile() in MDBX_WRITEMAP mode. So on Windows, instead of
   * FlushViewOfFile() and FlushFileBuffers(), writing through a descriptor
   * with FILE_FLAG_WRITE_THROUGH should be used.
   *
   * In turn, writing with FILE_FLAG_WRITE_THROUGH is cheaper/faster when
   * FILE_FLAG_OVERLAPPED is used. As a result, on Windows in durable modes
   * data is always written in overlapped mode, while a separate
   * non-overlapped descriptor is required for writing meta. */

  env->pid = osal_getpid();
  int rc = osal_openfile((env->flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ
                                                    : MDBX_OPEN_DXB_LAZY,
                         env, env->pathname.dxb, &env->lazy_fd, mode);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  env->me_sysv_ipc.key = ftok(env->pathname.dxb, 42);
  if (unlikely(env->me_sysv_ipc.key == -1))
    return errno;
#endif /* MDBX_LOCKING */

  /* Set the position in files outside of the data to avoid corruption
   * due to erroneous use of file descriptors in the application code. */
  const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000);
  osal_fseek(env->lazy_fd, safe_parking_lot_offset);

  env->fd4meta = env->lazy_fd;
#if defined(_WIN32) || defined(_WIN64)
  eASSERT(env, env->ioring.overlapped_fd == 0);
  bool ior_direct = false;
  if (!(env->flags &
        (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) {
    if (MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) {
      /* MDBX_SYNC_DURABLE | MDBX_WRITEMAP has been requested while the
       * MDBX_AVOID_MSYNC option is active.
       *
       * 1) In this combination it is most advantageous to use
       *    WriteFileGather(), but that requires opening the file with the
       *    FILE_FLAG_NO_BUFFERING flag and then keeping data addresses and
       *    sizes aligned to the system page boundary, which in turn is only
       *    possible when the DB page size is not smaller than the system RAM
       *    page. So the DB page size must be known in order to open the file
       *    in the required mode.
       *
       * 2) Besides that, on Windows writing into a locked region of a file
       *    is only possible through the same descriptor. Therefore the
       *    initial acquisition of locks via lck_seize(), acquiring/releasing
       *    locks during write transactions, and writing the data must all be
       *    performed through a single descriptor.
       *
       * Thus, the volatile DB header has to be read in order to learn the
       * page size, in order to open the file descriptor in the mode needed
       * for writing the data, and to use exactly this descriptor for the
       * initial acquisition of the locks. */
      meta_t header;
      uint64_t dxb_filesize;
      int err = dxb_read_header(env, &header, MDBX_SUCCESS, true);
      if ((err == MDBX_SUCCESS && header.pagesize >= globals.sys_pagesize) ||
          (err == MDBX_ENODATA && mode && env->ps >= globals.sys_pagesize &&
           osal_filesize(env->lazy_fd, &dxb_filesize) == MDBX_SUCCESS &&
           dxb_filesize == 0))
        /* A collision is possible if two processes simultaneously try to
         * create a DB with different page sizes, one smaller than the system
         * page and the other NOT smaller. This is an allowed but very
         * strange situation, so it is considered erroneous and no attempt is
         * made to resolve it. */
        ior_direct = true;
    }

    rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT
                                  : MDBX_OPEN_DXB_OVERLAPPED,
                       env, env->pathname.dxb, &env->ioring.overlapped_fd, 0);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    env->dxb_lock_event = CreateEventW(nullptr, true, false, nullptr);
    if (unlikely(!env->dxb_lock_event))
      return (int)GetLastError();
    osal_fseek(env->ioring.overlapped_fd, safe_parking_lot_offset);
  }
#else
  if (mode == 0) {
    /* pickup mode for lck-file */
    struct stat st;
    if (unlikely(fstat(env->lazy_fd, &st)))
      return errno;
    mode = st.st_mode;
  }
  mode = (/* inherit read permissions for group and others */ mode &
          (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
         /* always add read/write for owner */ S_IRUSR | S_IWUSR |
         ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) |
         ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0);
#endif /* !Windows */
  const int lck_rc = lck_setup(env, mode);
  if (unlikely(MDBX_IS_ERROR(lck_rc)))
    return lck_rc;
  if (env->lck_mmap.fd != INVALID_HANDLE_VALUE)
    osal_fseek(env->lck_mmap.fd, safe_parking_lot_offset);

  eASSERT(env, env->dsync_fd == INVALID_HANDLE_VALUE);
  if (!(env->flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | DEPRECATED_MAPASYNC
#if defined(_WIN32) || defined(_WIN64)
                      | MDBX_EXCLUSIVE
#endif /* !Windows */
                      ))) {
    rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->pathname.dxb,
                       &env->dsync_fd, 0);
    if (unlikely(MDBX_IS_ERROR(rc)))
      return rc;
    if (env->dsync_fd != INVALID_HANDLE_VALUE) {
      if ((env->flags & MDBX_NOMETASYNC) == 0)
        env->fd4meta = env->dsync_fd;
      osal_fseek(env->dsync_fd, safe_parking_lot_offset);
    }
  }

  const MDBX_env_flags_t lazy_flags =
      MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC;
  const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM |
                                      MDBX_NORDAHEAD | MDBX_RDONLY |
                                      MDBX_WRITEMAP;

  lck_t *const lck = env->lck_mmap.lck;
  if (lck && lck_rc != MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) {
    MDBX_env_flags_t snap_flags;
    while ((snap_flags = atomic_load32(&lck->envmode, mo_AcquireRelease)) ==
           MDBX_RDONLY) {
      if (atomic_cas32(&lck->envmode, MDBX_RDONLY,
                       (snap_flags = (env->flags & mode_flags)))) {
        /* The case:
         *  - let's assume that for some reason the DB file is smaller
         *    than it should be according to the geometry,
         *    but not smaller than the last page used;
         *  - the first process that opens the database (lck_rc == RESULT_TRUE)
         *    does this in readonly mode and therefore cannot bring
         *    the file size back to normal;
         *  - some next process (lck_rc != RESULT_TRUE) opens the DB in
         *    read-write mode and now is here.
         *
         * FIXME: Should we re-check and set the size of DB-file right here? */
        break;
      }
      atomic_yield();
    }

    if (env->flags & MDBX_ACCEDE) {
      /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */
      const MDBX_env_flags_t diff =
          (snap_flags ^ env->flags) &
          ((snap_flags & lazy_flags) ? mode_flags
                                     : mode_flags & ~MDBX_WRITEMAP);
      env->flags ^= diff;
      NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->flags ^ diff,
             env->flags);
    }

    /* A previously missed, non-obvious point: when the DB operates in
     * non-synchronous/lazy commit-to-disk modes, all writer processes must
     * have the same MDBX_WRITEMAP mode.
     *
     * Otherwise, flushing to disk would have to be performed twice: first
     * msync(), then fdatasync(). Moreover, msync() is not required to work
     * in processes without MDBX_WRITEMAP, since there the file is mapped
     * into memory read-only. Thus, in the general case, a mismatch in
     * MDBX_WRITEMAP makes it impossible to commit data to disk after it has
     * been modified in another process.
     *
     * In MDBX_UTTERLY_NOSYNC mode mixing with MDBX_WRITEMAP should not be
     * allowed either, because no process (including the last one) can
     * guarantee that the data is flushed to disk, and therefore must not
     * mark any transaction as steady.
     *
     * As a result, it is necessary either to forbid processes with different
     * MDBX_WRITEMAP settings from working together in lazy-write modes, or
     * to track such mixing and block steady marks, which would be
     * counter-productive. */
    const MDBX_env_flags_t rigorous_flags =
        (snap_flags & lazy_flags)
            ? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP
            : MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC;
    const MDBX_env_flags_t rigorous_diff =
        (snap_flags ^ env->flags) & rigorous_flags;
    if (rigorous_diff) {
      ERROR("current mode/flags 0x%X incompatible with requested 0x%X, "
            "rigorous diff 0x%X",
            env->flags, snap_flags, rigorous_diff);
      return MDBX_INCOMPATIBLE;
    }
  }

  mincore_clean_cache(env);
  const int dxb_rc = dxb_setup(env, lck_rc, mode);
  if (MDBX_IS_ERROR(dxb_rc))
    return dxb_rc;

  rc = osal_check_fs_incore(env->lazy_fd);
  env->incore = false;
  if (rc == MDBX_RESULT_TRUE) {
    env->incore = true;
    NOTICE("%s", "in-core database");
    rc = MDBX_SUCCESS;
  } else if (unlikely(rc != MDBX_SUCCESS)) {
    ERROR("check_fs_incore(), err %d", rc);
    return rc;
  }

  if (unlikely(/* recovery mode */ env->stuck_meta >= 0) &&
      (lck_rc != /* exclusive */ MDBX_RESULT_TRUE ||
       (env->flags & MDBX_EXCLUSIVE) == 0)) {
    ERROR("%s", "recovery requires exclusive mode");
    return MDBX_BUSY;
  }

  DEBUG("opened dbenv %p", (void *)env);
  env->flags |= ENV_ACTIVE;
  if (!lck || lck_rc == MDBX_RESULT_TRUE) {
    env->lck->envmode.weak = env->flags & mode_flags;
    env->lck->meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env);
    env->lck->readers_check_timestamp.weak = osal_monotime();
  }
  if (lck) {
    if (lck_rc == MDBX_RESULT_TRUE) {
      rc = lck_downgrade(env);
      DEBUG("lck-downgrade-%s: rc %i",
            (env->flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc);
      if (rc != MDBX_SUCCESS)
        return rc;
    } else {
      rc = mvcc_cleanup_dead(env, false, nullptr);
      if (MDBX_IS_ERROR(rc))
        return rc;
    }
  }

  rc = (env->flags & MDBX_RDONLY)
           ? MDBX_SUCCESS
           : osal_ioring_create(&env->ioring
#if defined(_WIN32) || defined(_WIN64)
                                ,
                                ior_direct, env->ioring.overlapped_fd
#endif /* Windows */
           );
  return rc;
}
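/* A hedged sketch of the corresponding public-API call path; the flag
 * combination shown (durable write-mapped mode with LIFO reclaiming) is just
 * one of the variants discussed in the commentary above, and error handling
 * is abbreviated. */
#include "mdbx.h"
#include <stddef.h>

static int open_db(const char *pathname, MDBX_env **out) {
  int rc = mdbx_env_create(out);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* MDBX_WRITEMAP selects the write-through-the-mapping mode; adding
   * MDBX_NOMETASYNC or MDBX_SAFE_NOSYNC would switch to the lazier sync
   * behaviours described above. */
  rc = mdbx_env_open(*out, pathname, MDBX_WRITEMAP | MDBX_LIFORECLAIM, 0640);
  if (rc != MDBX_SUCCESS) {
    mdbx_env_close(*out);
    *out = NULL;
  }
  return rc;
}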

__cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
  const unsigned flags = env->flags;
  env->flags &= ~ENV_INTERNAL_FLAGS;
  if (flags & ENV_TXKEY) {
    thread_key_delete(env->me_txkey);
    env->me_txkey = 0;
  }

  if (env->lck)
    munlock_all(env);

  rthc_lock();
  int rc = rthc_remove(env);
  rthc_unlock();

#if MDBX_ENABLE_DBI_LOCKFREE
  for (defer_free_item_t *next, *ptr = env->defer_free; ptr; ptr = next) {
    next = ptr->next;
    osal_free(ptr);
  }
  env->defer_free = nullptr;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */

  if (!(env->flags & MDBX_RDONLY))
    osal_ioring_destroy(&env->ioring);

  env->lck = nullptr;
  if (env->lck_mmap.lck)
    osal_munmap(&env->lck_mmap);

  if (env->dxb_mmap.base) {
    osal_munmap(&env->dxb_mmap);
#ifdef ENABLE_MEMCHECK
    VALGRIND_DISCARD(env->valgrind_handle);
    env->valgrind_handle = -1;
#endif /* ENABLE_MEMCHECK */
  }

#if defined(_WIN32) || defined(_WIN64)
  eASSERT(env, !env->ioring.overlapped_fd ||
                   env->ioring.overlapped_fd == INVALID_HANDLE_VALUE);
  if (env->dxb_lock_event != INVALID_HANDLE_VALUE) {
    CloseHandle(env->dxb_lock_event);
    env->dxb_lock_event = INVALID_HANDLE_VALUE;
  }
  eASSERT(env, !resurrect_after_fork);
  if (env->pathname_char) {
    osal_free(env->pathname_char);
    env->pathname_char = nullptr;
  }
#endif /* Windows */

  if (env->dsync_fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->dsync_fd);
    env->dsync_fd = INVALID_HANDLE_VALUE;
  }

  if (env->lazy_fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->lazy_fd);
    env->lazy_fd = INVALID_HANDLE_VALUE;
  }

  if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->lck_mmap.fd);
    env->lck_mmap.fd = INVALID_HANDLE_VALUE;
  }

  if (!resurrect_after_fork) {
    if (env->kvs) {
      for (size_t i = CORE_DBS; i < env->n_dbi; ++i)
        if (env->kvs[i].name.iov_len)
          osal_free(env->kvs[i].name.iov_base);
      osal_free(env->kvs);
      env->n_dbi = CORE_DBS;
      env->kvs = nullptr;
    }
    if (env->page_auxbuf) {
      osal_memalign_free(env->page_auxbuf);
      env->page_auxbuf = nullptr;
    }
    if (env->dbi_seqs) {
      osal_free(env->dbi_seqs);
      env->dbi_seqs = nullptr;
    }
    if (env->dbs_flags) {
      osal_free(env->dbs_flags);
      env->dbs_flags = nullptr;
    }
    if (env->pathname.buffer) {
      osal_free(env->pathname.buffer);
      env->pathname.buffer = nullptr;
    }
    if (env->basal_txn) {
      dpl_free(env->basal_txn);
      txl_free(env->basal_txn->tw.gc.reclaimed);
      pnl_free(env->basal_txn->tw.retired_pages);
      pnl_free(env->basal_txn->tw.spilled.list);
      pnl_free(env->basal_txn->tw.relist);
      osal_free(env->basal_txn);
      env->basal_txn = nullptr;
    }
  }
  env->stuck_meta = -1;
  return rc;
}
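/* Applications reach env_close() through the public teardown call; a hedged
 * sketch (dont_sync=false asks for a final durable sync where applicable). */
#include "mdbx.h"

static void shutdown_db(MDBX_env *env) {
  int rc = mdbx_env_close_ex(env, /* dont_sync */ false);
  (void)rc; /* MDBX_SUCCESS is expected; other codes indicate an unclean state */
}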
136  src/essentials.h  (new file)
@@ -0,0 +1,136 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

#define LIBMDBX_INTERNALS
#define MDBX_DEPRECATED

#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif

#include "preface.h"

#ifdef xMDBX_ALLOY
/* Amalgamated build */
#define MDBX_INTERNAL static
#else
/* Non-amalgamated build */
#define MDBX_INTERNAL
#endif /* xMDBX_ALLOY */

#include "../mdbx.h"

/*----------------------------------------------------------------------------*/
/* Basic constants and types */

typedef struct iov_ctx iov_ctx_t;
#include "osal.h"

#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
#endif /* MDBX_WORDBITS */

#include "options.h"

#include "atomics-types.h"

#include "layout-dxb.h"
#include "layout-lck.h"

#define MIN_MAPSIZE (MDBX_MIN_PAGESIZE * MIN_PAGENO)
#if defined(_WIN32) || defined(_WIN64)
#define MAX_MAPSIZE32 UINT32_C(0x38000000)
#else
#define MAX_MAPSIZE32 UINT32_C(0x7f000000)
#endif
#define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MDBX_MAX_PAGESIZE)

#if MDBX_WORDBITS >= 64
#define MAX_MAPSIZE MAX_MAPSIZE64
#define PAGELIST_LIMIT ((size_t)MAX_PAGENO)
#else
#define MAX_MAPSIZE MAX_MAPSIZE32
#define PAGELIST_LIMIT (MAX_MAPSIZE32 / MDBX_MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */

#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482
#define MEGABYTE ((size_t)1 << 20)

/*----------------------------------------------------------------------------*/

union logger_union {
  void *ptr;
  MDBX_debug_func *fmt;
  MDBX_debug_func_nofmt *nofmt;
};

struct libmdbx_globals {
  bin128_t bootid;
  unsigned sys_pagesize, sys_allocation_granularity;
  uint8_t sys_pagesize_ln2;
  uint8_t runtime_flags;
  uint8_t loglevel;
#if defined(_WIN32) || defined(_WIN64)
  bool running_under_Wine;
#elif defined(__linux__) || defined(__gnu_linux__)
  bool running_on_WSL1 /* Windows Subsystem 1 for Linux */;
  uint32_t linux_kernel_version;
#endif /* Linux */
  union logger_union logger;
  osal_fastmutex_t debug_lock;
  size_t logger_buffer_size;
  char *logger_buffer;
};

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

extern struct libmdbx_globals globals;
#if defined(_WIN32) || defined(_WIN64)
extern struct libmdbx_imports imports;
#endif /* Windows */

#include "logging_and_debug.h"

#include "utils.h"

#include "pnl.h"

#ifdef __cplusplus
}
#endif /* __cplusplus */

#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
#if defined(xMDBX_TOOLS)
extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif

#define MDBX_IS_ERROR(rc) \
  ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)

/*----------------------------------------------------------------------------*/

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t
int64pgno(int64_t i64) {
  if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1))
    return (pgno_t)i64;
  return (i64 < (int64_t)MIN_PAGENO) ? MIN_PAGENO : MAX_PAGENO;
}

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t
pgno_add(size_t base, size_t augend) {
  assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO);
  return int64pgno((int64_t)base + (int64_t)augend);
}

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t
pgno_sub(size_t base, size_t subtrahend) {
  assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 &&
         subtrahend < MAX_PAGENO);
  return int64pgno((int64_t)base - (int64_t)subtrahend);
}
1460  src/gc-get.c  (new file; diff suppressed because it is too large)
1094  src/gc-put.c  (new file; diff suppressed because it is too large)
39  src/gc.h  (new file)
@@ -0,0 +1,39 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

#include "essentials.h"

typedef struct gc_update_context {
  size_t loop, reserve_adj;
  size_t retired_stored;
  size_t amount, reserved, cleaned_slot, reused_slot, fill_idx;
  txnid_t cleaned_id, rid;
  bool lifo, dense;
#if MDBX_ENABLE_BIGFOOT
  txnid_t bigfoot;
#endif /* MDBX_ENABLE_BIGFOOT */
  union {
    MDBX_cursor cursor;
    cursor_couple_t couple;
  };
} gcu_t;

static inline int gc_update_init(MDBX_txn *txn, gcu_t *ctx) {
  memset(ctx, 0, offsetof(gcu_t, cursor));
  ctx->lifo = (txn->env->flags & MDBX_LIFORECLAIM) != 0;
#if MDBX_ENABLE_BIGFOOT
  ctx->bigfoot = txn->txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
  return cursor_init(&ctx->cursor, txn, FREE_DBI);
}

#define ALLOC_DEFAULT 0
#define ALLOC_RESERVE 1
#define ALLOC_UNIMPORTANT 2
MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num,
                                uint8_t flags);

MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc);
MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx);
476  src/global.c  (new file)
@@ -0,0 +1,476 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static void mdbx_init(void);
|
||||
static void mdbx_fini(void);
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* mdbx constructor/destructor */
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
#if MDBX_BUILD_SHARED_LIBRARY
|
||||
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
|
||||
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
|
||||
*
|
||||
* Define dll's entry point only for Release build when NDEBUG is defined and
|
||||
* MDBX_WITHOUT_MSVC_CRT=ON. if the entry point isn't defined then MSVC's will
|
||||
* automatically use DllMainCRTStartup() from CRT library, which also
|
||||
* automatically call DllMain() from our mdbx.dll */
|
||||
#pragma comment(linker, "/ENTRY:DllMain")
|
||||
#endif /* MDBX_WITHOUT_MSVC_CRT */
|
||||
|
||||
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
|
||||
#else
|
||||
#if !MDBX_MANUAL_MODULE_HANDLER
|
||||
static
|
||||
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
|
||||
void NTAPI
|
||||
mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
|
||||
#endif /* MDBX_BUILD_SHARED_LIBRARY */
|
||||
{
|
||||
(void)reserved;
|
||||
switch (reason) {
|
||||
case DLL_PROCESS_ATTACH:
|
||||
windows_import();
|
||||
mdbx_init();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
mdbx_fini();
|
||||
break;
|
||||
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
rthc_thread_dtor(module);
|
||||
break;
|
||||
}
|
||||
#if MDBX_BUILD_SHARED_LIBRARY
|
||||
return TRUE;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
|
||||
/* *INDENT-OFF* */
|
||||
/* clang-format off */
|
||||
#if defined(_MSC_VER)
|
||||
# pragma const_seg(push)
|
||||
# pragma data_seg(push)
|
||||
|
||||
# ifndef _M_IX86
|
||||
/* kick a linker to create the TLS directory if not already done */
|
||||
# pragma comment(linker, "/INCLUDE:_tls_used")
|
||||
/* Force some symbol references. */
|
||||
# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
|
||||
/* specific const-segment for WIN64 */
|
||||
# pragma const_seg(".CRT$XLB")
|
||||
const
|
||||
# else
|
||||
/* kick a linker to create the TLS directory if not already done */
|
||||
# pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
/* Force some symbol references. */
|
||||
# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
|
||||
/* specific data-segment for WIN32 */
|
||||
# pragma data_seg(".CRT$XLB")
|
||||
# endif
|
||||
|
||||
__declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
|
||||
# pragma data_seg(pop)
|
||||
# pragma const_seg(pop)
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
# ifndef _M_IX86
|
||||
const
|
||||
# endif
|
||||
PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
|
||||
#else
|
||||
# error FIXME
|
||||
#endif
|
||||
/* *INDENT-ON* */
|
||||
/* clang-format on */
|
||||
#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
|
||||
|
||||
#else
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
#include <sys/utsname.h>
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static uint8_t probe_for_WSL(const char *tag) {
|
||||
const char *const WSL = strstr(tag, "WSL");
|
||||
if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
|
||||
return WSL[3] - '0';
|
||||
const char *const wsl = strstr(tag, "wsl");
|
||||
if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
|
||||
return wsl[3] - '0';
|
||||
  if (WSL || wsl || strcasestr(tag, "Microsoft"))
    /* No new kernels are expected within WSL1; otherwise it will be
     * explicitly marked by an appropriate WSL-version hint. */
    return (globals.linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
|
||||
return 0;
|
||||
}
|
||||
#endif /* Linux */
|
||||
|
||||
#ifdef ENABLE_GPROF
|
||||
extern void _mcleanup(void);
|
||||
extern void monstartup(unsigned long, unsigned long);
|
||||
extern void _init(void);
|
||||
extern void _fini(void);
|
||||
extern void __gmon_start__(void) __attribute__((__weak__));
|
||||
#endif /* ENABLE_GPROF */
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static __attribute__((__constructor__)) void
|
||||
mdbx_global_constructor(void) {
|
||||
#ifdef ENABLE_GPROF
|
||||
if (!&__gmon_start__)
|
||||
monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
|
||||
#endif /* ENABLE_GPROF */
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
struct utsname buffer;
|
||||
if (uname(&buffer) == 0) {
|
||||
int i = 0;
|
||||
char *p = buffer.release;
|
||||
while (*p && i < 4) {
|
||||
if (*p >= '0' && *p <= '9') {
|
||||
long number = strtol(p, &p, 10);
|
||||
if (number > 0) {
|
||||
if (number > 255)
|
||||
number = 255;
|
||||
globals.linux_kernel_version += number << (24 - i * 8);
|
||||
}
|
||||
++i;
|
||||
} else {
|
||||
++p;
|
||||
}
|
||||
}
|
||||
/* "Official" way of detecting WSL1 but not WSL2
|
||||
* https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
|
||||
*
|
||||
* WARNING: False negative detection of WSL1 will result in DATA LOSS!
|
||||
* So, the REQUIREMENTS for this code:
|
||||
* 1. MUST detect WSL1 without false-negatives.
|
||||
* 2. DESIRABLE detect WSL2 but without the risk of violating the first. */
|
||||
globals.running_on_WSL1 = probe_for_WSL(buffer.version) == 1 ||
|
||||
probe_for_WSL(buffer.sysname) == 1 ||
|
||||
probe_for_WSL(buffer.release) == 1;
|
||||
}
|
||||
#endif /* Linux */
|
||||
|
||||
mdbx_init();
|
||||
}
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static __attribute__((__destructor__)) void
|
||||
mdbx_global_destructor(void) {
|
||||
mdbx_fini();
|
||||
#ifdef ENABLE_GPROF
|
||||
if (!&__gmon_start__)
|
||||
_mcleanup();
|
||||
#endif /* ENABLE_GPROF */
|
||||
}
|
||||
|
||||
#endif /* ! Windows */
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
struct libmdbx_globals globals;
|
||||
|
||||
__cold static void mdbx_init(void) {
|
||||
globals.runtime_flags = ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT +
|
||||
((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT;
|
||||
globals.loglevel = MDBX_LOG_FATAL;
|
||||
ENSURE(nullptr, osal_fastmutex_init(&globals.debug_lock) == 0);
|
||||
osal_ctor();
|
||||
assert(globals.sys_pagesize > 0 &&
|
||||
(globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0);
|
||||
rthc_ctor();
|
||||
#if MDBX_DEBUG
|
||||
ENSURE(nullptr, troika_verify_fsm());
|
||||
ENSURE(nullptr, pv2pages_verify());
|
||||
#endif /* MDBX_DEBUG*/
|
||||
}
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static void mdbx_fini(void) {
|
||||
const uint32_t current_pid = osal_getpid();
|
||||
TRACE(">> pid %d", current_pid);
|
||||
rthc_dtor(current_pid);
|
||||
osal_dtor();
|
||||
TRACE("<< pid %d\n", current_pid);
|
||||
ENSURE(nullptr, osal_fastmutex_destroy(&globals.debug_lock) == 0);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* *INDENT-OFF* */
|
||||
/* clang-format off */
|
||||
|
||||
__dll_export
|
||||
#ifdef __attribute_used__
|
||||
__attribute_used__
|
||||
#elif defined(__GNUC__) || __has_attribute(__used__)
|
||||
__attribute__((__used__))
|
||||
#endif
|
||||
#ifdef __attribute_externally_visible__
|
||||
__attribute_externally_visible__
|
||||
#elif (defined(__GNUC__) && !defined(__clang__)) || \
|
||||
__has_attribute(__externally_visible__)
|
||||
__attribute__((__externally_visible__))
|
||||
#endif
|
||||
const struct MDBX_build_info mdbx_build = {
|
||||
#ifdef MDBX_BUILD_TIMESTAMP
|
||||
MDBX_BUILD_TIMESTAMP
|
||||
#else
|
||||
"\"" __DATE__ " " __TIME__ "\""
|
||||
#endif /* MDBX_BUILD_TIMESTAMP */
|
||||
|
||||
,
|
||||
#ifdef MDBX_BUILD_TARGET
|
||||
MDBX_BUILD_TARGET
|
||||
#else
|
||||
#if defined(__ANDROID_API__)
|
||||
"Android" MDBX_STRINGIFY(__ANDROID_API__)
|
||||
#elif defined(__linux__) || defined(__gnu_linux__)
|
||||
"Linux"
|
||||
#elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__)
|
||||
"webassembly"
|
||||
#elif defined(__CYGWIN__)
|
||||
"CYGWIN"
|
||||
#elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \
|
||||
|| defined(__WINDOWS__)
|
||||
"Windows"
|
||||
#elif defined(__APPLE__)
|
||||
#if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \
|
||||
|| (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR)
|
||||
"iOS"
|
||||
#else
|
||||
"MacOS"
|
||||
#endif
|
||||
#elif defined(__FreeBSD__)
|
||||
"FreeBSD"
|
||||
#elif defined(__DragonFly__)
|
||||
"DragonFlyBSD"
|
||||
#elif defined(__NetBSD__)
|
||||
"NetBSD"
|
||||
#elif defined(__OpenBSD__)
|
||||
"OpenBSD"
|
||||
#elif defined(__bsdi__)
|
||||
"UnixBSDI"
|
||||
#elif defined(__MACH__)
|
||||
"MACH"
|
||||
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC))
|
||||
"HPUX"
|
||||
#elif defined(_AIX)
|
||||
"AIX"
|
||||
#elif defined(__sun) && defined(__SVR4)
|
||||
"Solaris"
|
||||
#elif defined(__BSD__) || defined(BSD)
|
||||
"UnixBSD"
|
||||
#elif defined(__unix__) || defined(UNIX) || defined(__unix) \
|
||||
|| defined(__UNIX) || defined(__UNIX__)
|
||||
"UNIX"
|
||||
#elif defined(_POSIX_VERSION)
|
||||
"POSIX" MDBX_STRINGIFY(_POSIX_VERSION)
|
||||
#else
|
||||
"UnknownOS"
|
||||
#endif /* Target OS */
|
||||
|
||||
"-"
|
||||
|
||||
#if defined(__amd64__)
|
||||
"AMD64"
|
||||
#elif defined(__ia32__)
|
||||
"IA32"
|
||||
#elif defined(__e2k__) || defined(__elbrus__)
|
||||
"Elbrus"
|
||||
#elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
|
||||
"Alpha"
|
||||
#elif defined(__aarch64__) || defined(_M_ARM64)
|
||||
"ARM64"
|
||||
#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \
|
||||
|| defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \
|
||||
|| defined(_M_ARMT) || defined(__arm)
|
||||
"ARM"
|
||||
#elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64))
|
||||
"MIPS64"
|
||||
#elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__)
|
||||
"MIPS"
|
||||
#elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64)
|
||||
"PARISC64"
|
||||
#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
|
||||
"PARISC"
|
||||
#elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \
|
||||
|| defined(__IA64__) || defined(_M_IA64) || defined(__itanium__)
|
||||
"Itanium"
|
||||
#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \
|
||||
|| defined(__powerpc64) || defined(_ARCH_PPC64)
|
||||
"PowerPC64"
|
||||
#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \
|
||||
|| defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__)
|
||||
"PowerPC"
|
||||
#elif defined(__sparc64__) || defined(__sparc64)
|
||||
"SPARC64"
|
||||
#elif defined(__sparc__) || defined(__sparc)
|
||||
"SPARC"
|
||||
#elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch)
|
||||
"S390"
|
||||
#else
|
||||
"UnknownARCH"
|
||||
#endif
|
||||
#endif /* MDBX_BUILD_TARGET */
|
||||
|
||||
#ifdef MDBX_BUILD_TYPE
|
||||
# if defined(_MSC_VER)
|
||||
# pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE)
|
||||
# endif
|
||||
"-" MDBX_BUILD_TYPE
|
||||
#endif /* MDBX_BUILD_TYPE */
|
||||
,
|
||||
"MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG)
|
||||
#ifdef ENABLE_GPROF
|
||||
" ENABLE_GPROF"
|
||||
#endif /* ENABLE_GPROF */
|
||||
" MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS)
|
||||
" BYTE_ORDER="
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
"LITTLE_ENDIAN"
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
"BIG_ENDIAN"
|
||||
#else
|
||||
#error "FIXME: Unsupported byte order"
|
||||
#endif /* __BYTE_ORDER__ */
|
||||
" MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT)
|
||||
" MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
|
||||
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
|
||||
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
|
||||
" MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG
|
||||
" MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG
|
||||
" MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC)
|
||||
" MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
|
||||
" MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
|
||||
" MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE)
|
||||
" MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT)
|
||||
" MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC)
|
||||
#if MDBX_DISABLE_VALIDATION
|
||||
" MDBX_DISABLE_VALIDATION=YES"
|
||||
#endif /* MDBX_DISABLE_VALIDATION */
|
||||
#ifdef __SANITIZE_ADDRESS__
|
||||
" SANITIZE_ADDRESS=YES"
|
||||
#endif /* __SANITIZE_ADDRESS__ */
|
||||
#ifdef ENABLE_MEMCHECK
|
||||
" ENABLE_MEMCHECK=YES"
|
||||
#endif /* ENABLE_MEMCHECK */
|
||||
#if MDBX_FORCE_ASSERTIONS
|
||||
" MDBX_FORCE_ASSERTIONS=YES"
|
||||
#endif /* MDBX_FORCE_ASSERTIONS */
|
||||
#ifdef _GNU_SOURCE
|
||||
" _GNU_SOURCE=YES"
|
||||
#else
|
||||
" _GNU_SOURCE=NO"
|
||||
#endif /* _GNU_SOURCE */
|
||||
#ifdef __APPLE__
|
||||
" MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY)
|
||||
#endif /* MacOS */
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
" MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT)
|
||||
" MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY)
|
||||
#if !MDBX_BUILD_SHARED_LIBRARY
|
||||
" MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER)
|
||||
#endif
|
||||
" WINVER=" MDBX_STRINGIFY(WINVER)
|
||||
#else /* Windows */
|
||||
" MDBX_LOCKING=" MDBX_LOCKING_CONFIG
|
||||
" MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG
|
||||
#endif /* !Windows */
|
||||
" MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE)
|
||||
" MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT)
|
||||
" MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE)
|
||||
" MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE)
|
||||
" MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK)
|
||||
" MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING)
|
||||
,
|
||||
#ifdef MDBX_BUILD_COMPILER
|
||||
MDBX_BUILD_COMPILER
|
||||
#else
|
||||
#ifdef __INTEL_COMPILER
|
||||
"Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER)
|
||||
#elif defined(__apple_build_version__)
|
||||
"Apple clang " MDBX_STRINGIFY(__apple_build_version__)
|
||||
#elif defined(__ibmxl__)
|
||||
"IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__)
|
||||
"." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__)
|
||||
#elif defined(__clang__)
|
||||
"clang " MDBX_STRINGIFY(__clang_version__)
|
||||
#elif defined(__MINGW64__)
|
||||
"MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION)
|
||||
#elif defined(__MINGW32__)
|
||||
"MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION)
|
||||
#elif defined(__MINGW__)
|
||||
"MINGW " MDBX_STRINGIFY(__MINGW_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW_MINOR_VERSION)
|
||||
#elif defined(__IBMC__)
|
||||
"IBM C " MDBX_STRINGIFY(__IBMC__)
|
||||
#elif defined(__GNUC__)
|
||||
"GNU C/C++ "
|
||||
#ifdef __VERSION__
|
||||
__VERSION__
|
||||
#else
|
||||
MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__)
|
||||
#endif
|
||||
#elif defined(_MSC_VER)
|
||||
"MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD)
|
||||
#else
|
||||
"Unknown compiler"
|
||||
#endif
|
||||
#endif /* MDBX_BUILD_COMPILER */
|
||||
,
|
||||
#ifdef MDBX_BUILD_FLAGS_CONFIG
|
||||
MDBX_BUILD_FLAGS_CONFIG
|
||||
#endif /* MDBX_BUILD_FLAGS_CONFIG */
|
||||
#ifdef MDBX_BUILD_FLAGS
|
||||
MDBX_BUILD_FLAGS
|
||||
#endif /* MDBX_BUILD_FLAGS */
|
||||
#if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS))
|
||||
"undefined (please use correct build script)"
|
||||
#ifdef _MSC_VER
|
||||
#pragma message("warning: Build flags undefined. Please use correct build script")
|
||||
#else
|
||||
#warning "Build flags undefined. Please use correct build script"
|
||||
#endif // _MSC_VER
|
||||
#endif
|
||||
};
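
/* Side note (not part of this diff): the strings assembled above surface through
 * the public `mdbx_build` global declared in mdbx.h. A minimal usage sketch,
 * assuming the field names target/options/compiler/flags from mdbx.h: */

#include "mdbx.h"
#include <stdio.h>

int main(void) {
  /* Print the build information assembled by the initializer above. */
  printf("target:   %s\n", mdbx_build.target);
  printf("options:  %s\n", mdbx_build.options);
  printf("compiler: %s\n", mdbx_build.compiler);
  printf("flags:    %s\n", mdbx_build.flags);
  return 0;
}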
|
||||
|
||||
#ifdef __SANITIZE_ADDRESS__
|
||||
#if !defined(_MSC_VER) || __has_attribute(weak)
|
||||
LIBMDBX_API __attribute__((__weak__))
|
||||
#endif
|
||||
const char *__asan_default_options(void) {
|
||||
return "symbolize=1:allow_addr2line=1:"
|
||||
#if MDBX_DEBUG
|
||||
"debug=1:"
|
||||
"verbosity=2:"
|
||||
#endif /* MDBX_DEBUG */
|
||||
"log_threads=1:"
|
||||
"report_globals=1:"
|
||||
"replace_str=1:replace_intrin=1:"
|
||||
"malloc_context_size=9:"
|
||||
#if !defined(__APPLE__)
|
||||
"detect_leaks=1:"
|
||||
#endif
|
||||
"check_printf=1:"
|
||||
"detect_deadlocks=1:"
|
||||
#ifndef LTO_ENABLED
|
||||
"check_initialization_order=1:"
|
||||
#endif
|
||||
"detect_stack_use_after_return=1:"
|
||||
"intercept_tls_get_addr=1:"
|
||||
"decorate_proc_maps=1:"
|
||||
"abort_on_error=1";
|
||||
}
|
||||
#endif /* __SANITIZE_ADDRESS__ */
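
/* Side note (not part of this diff): the weak __asan_default_options() hook above
 * is the standard AddressSanitizer mechanism; the runtime calls it once at startup,
 * and values given via the ASAN_OPTIONS environment variable still take precedence.
 * A minimal standalone sketch of the same mechanism: */

/* Build with: cc -fsanitize=address -g demo.c -o demo */
#include <stdio.h>

const char *__asan_default_options(void) {
  /* Picked up by the ASan runtime at startup unless overridden by ASAN_OPTIONS. */
  return "verbosity=1:abort_on_error=1";
}

int main(void) {
  puts("running with the default ASan options returned above");
  return 0;
}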
|
||||
|
||||
/* *INDENT-ON* */
|
||||
/* clang-format on */
|
src/internals.h | 2083 (file diff suppressed because it is too large)
src/layout-dxb.h | 306 (new file)
@ -0,0 +1,306 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
#pragma pack(push, 4)
|
||||
|
||||
/* A stamp that identifies a file as an MDBX file.
|
||||
* There's nothing special about this value other than that it is easily
|
||||
* recognizable, and it will reflect any byte order mismatches. */
|
||||
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
|
||||
|
||||
/* FROZEN: The version number for a database's datafile format. */
|
||||
#define MDBX_DATA_VERSION 3
|
||||
|
||||
#define MDBX_DATA_MAGIC \
|
||||
((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
|
||||
#define MDBX_DATA_MAGIC_LEGACY_COMPAT \
|
||||
((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)
|
||||
#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)
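
/* Worked example (a sketch, assuming the default MDBX_PNL_ASCENDING == 0):
 *   MDBX_DATA_MAGIC = (0x59659DBDEF4C11 << 8) + 0 * 64 + 3
 *                   = 0x59659DBDEF4C1103 */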
|
||||
|
||||
/* handle for the DB used to track free pages. */
|
||||
#define FREE_DBI 0
|
||||
/* handle for the default DB. */
|
||||
#define MAIN_DBI 1
|
||||
/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
|
||||
#define CORE_DBS 2
|
||||
|
||||
/* Number of meta pages - also hardcoded elsewhere */
|
||||
#define NUM_METAS 3
|
||||
|
||||
/* A page number in the database.
 *
 * MDBX uses 32-bit page numbers, which limits the database
 * size to 2^44 bytes with 4K pages. */
|
||||
typedef uint32_t pgno_t;
|
||||
typedef mdbx_atomic_uint32_t atomic_pgno_t;
|
||||
#define PRIaPGNO PRIu32
|
||||
#define MAX_PAGENO UINT32_C(0x7FFFffff)
|
||||
#define MIN_PAGENO NUM_METAS
|
||||
|
||||
/* An invalid page number.
|
||||
* Mainly used to denote an empty tree. */
|
||||
#define P_INVALID (~(pgno_t)0)
|
||||
|
||||
/* A transaction ID. */
|
||||
typedef uint64_t txnid_t;
|
||||
typedef mdbx_atomic_uint64_t atomic_txnid_t;
|
||||
#define PRIaTXN PRIi64
|
||||
#define MIN_TXNID UINT64_C(1)
|
||||
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
|
||||
#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
|
||||
#define INVALID_TXNID UINT64_MAX
|
||||
|
||||
/* Used for offsets within a single page. */
|
||||
typedef uint16_t indx_t;
|
||||
|
||||
typedef struct tree {
|
||||
uint16_t flags; /* see mdbx_dbi_open */
|
||||
uint16_t height; /* height of this tree */
|
||||
uint32_t dupfix_size; /* key-size for MDBX_DUPFIXED (DUPFIX pages) */
|
||||
pgno_t root; /* the root page of this tree */
|
||||
pgno_t branch_pages; /* number of internal pages */
|
||||
pgno_t leaf_pages; /* number of leaf pages */
|
||||
pgno_t large_pages; /* number of large pages */
|
||||
uint64_t sequence; /* table sequence counter */
|
||||
uint64_t items; /* number of data items */
|
||||
uint64_t mod_txnid; /* txnid of last committed modification */
|
||||
} tree_t;
|
||||
|
||||
/* database size-related parameters */
|
||||
typedef struct geo {
|
||||
uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential
|
||||
quantized) value */
|
||||
uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
|
||||
(exponential quantized) value */
|
||||
pgno_t lower; /* minimal size of datafile in pages */
|
||||
pgno_t upper; /* maximal size of datafile in pages */
|
||||
union {
|
||||
pgno_t now; /* current size of datafile in pages */
|
||||
pgno_t end_pgno;
|
||||
};
|
||||
union {
|
||||
pgno_t first_unallocated; /* first unused page in the datafile,
|
||||
but actually the file may be shorter. */
|
||||
pgno_t next_pgno;
|
||||
};
|
||||
} geo_t;
|
||||
|
||||
typedef union bin128 {
|
||||
__anonymous_struct_extension__ struct {
|
||||
uint64_t x, y;
|
||||
};
|
||||
__anonymous_struct_extension__ struct {
|
||||
uint32_t a, b, c, d;
|
||||
};
|
||||
} bin128_t;
|
||||
|
||||
/* Meta page content.
|
||||
* A meta page is the start point for accessing a database snapshot.
|
||||
* Pages 0-2 are meta pages. */
|
||||
typedef struct meta {
|
||||
/* Stamp identifying this as an MDBX file.
|
||||
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
|
||||
uint32_t magic_and_version[2];
|
||||
|
||||
/* txnid that committed this meta, the first of a two-phase-update pair */
|
||||
union {
|
||||
mdbx_atomic_uint32_t txnid_a[2];
|
||||
uint64_t unsafe_txnid;
|
||||
};
|
||||
|
||||
uint16_t reserve16; /* extra flags, zero (nothing) for now */
|
||||
uint8_t validator_id; /* ID of checksum and page validation method,
|
||||
* zero (nothing) for now */
|
||||
int8_t extra_pagehdr; /* extra bytes in the page header,
|
||||
* zero (nothing) for now */
|
||||
|
||||
geo_t geometry; /* database size-related parameters */
|
||||
|
||||
union {
|
||||
struct {
|
||||
tree_t gc, main;
|
||||
} trees;
|
||||
__anonymous_struct_extension__ struct {
|
||||
uint16_t gc_flags;
|
||||
uint16_t gc_height;
|
||||
uint32_t pagesize;
|
||||
};
|
||||
};
|
||||
|
||||
MDBX_canary canary;
|
||||
|
||||
#define DATASIGN_NONE 0u
|
||||
#define DATASIGN_WEAK 1u
|
||||
#define SIGN_IS_STEADY(sign) ((sign) > DATASIGN_WEAK)
|
||||
union {
|
||||
uint32_t sign[2];
|
||||
uint64_t unsafe_sign;
|
||||
};
|
||||
|
||||
/* txnid that committed this meta, the second of a two-phase-update pair */
|
||||
mdbx_atomic_uint32_t txnid_b[2];
|
||||
|
||||
  /* Number of non-meta pages which were put into the GC after COW. May be 0 if
   * the DB was previously handled by a libmdbx build without the corresponding
   * feature. This value, coupled with reader.snapshot_pages_retired, allows a
   * fast estimation of how much a reader is restraining GC recycling. */
|
||||
uint32_t pages_retired[2];
|
||||
|
||||
  /* An analogue of /proc/sys/kernel/random/boot_id (or similar) used to determine
   * whether the system was rebooted after the last use of the database files.
   * If there was no reboot, then there is no need to roll back to the last
   * steady sync point. Zeros mean that no relevant information is available
   * from the system. */
|
||||
bin128_t bootid;
|
||||
} meta_t;
|
||||
|
||||
#pragma pack(1)
|
||||
|
||||
typedef enum page_type {
|
||||
P_BRANCH = 0x01u /* branch page */,
|
||||
P_LEAF = 0x02u /* leaf page */,
|
||||
P_LARGE = 0x04u /* large/overflow page */,
|
||||
P_META = 0x08u /* meta page */,
|
||||
P_LEGACY_DIRTY = 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */,
|
||||
P_BAD = P_LEGACY_DIRTY /* explicit flag for invalid/bad page */,
|
||||
P_DUPFIX = 0x20u /* for MDBX_DUPFIXED records */,
|
||||
P_SUBP = 0x40u /* for MDBX_DUPSORT sub-pages */,
|
||||
P_SPILLED = 0x2000u /* spilled in parent txn */,
|
||||
P_LOOSE = 0x4000u /* page was dirtied then freed, can be reused */,
|
||||
P_FROZEN = 0x8000u /* used for retire page with known status */,
|
||||
P_ILL_BITS = (uint16_t)~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE | P_SPILLED),
|
||||
|
||||
page_broken = 0,
|
||||
page_large = P_LARGE,
|
||||
page_branch = P_BRANCH,
|
||||
page_leaf = P_LEAF,
|
||||
page_dupfix_leaf = P_DUPFIX,
|
||||
page_sub_leaf = P_SUBP | P_LEAF,
|
||||
page_sub_dupfix_leaf = P_SUBP | P_DUPFIX,
|
||||
page_sub_broken = P_SUBP,
|
||||
} page_type_t;
|
||||
|
||||
/* Common header for all page types. The page type depends on flags.
|
||||
*
|
||||
* P_BRANCH and P_LEAF pages have unsorted 'node_t's at the end, with
|
||||
* sorted entries[] entries referring to them. Exception: P_DUPFIX pages
|
||||
* omit entries and pack sorted MDBX_DUPFIXED values after the page header.
|
||||
*
|
||||
* P_LARGE records occupy one or more contiguous pages where only the
|
||||
* first has a page header. They hold the real data of N_BIGDATA nodes.
|
||||
*
|
||||
* P_SUBP sub-pages are small leaf "pages" with duplicate data.
|
||||
* A node with flag N_DUPDATA but not N_SUBDATA contains a sub-page.
|
||||
* (Duplicate data can also go in sub-databases, which use normal pages.)
|
||||
*
|
||||
* P_META pages contain meta_t, the start point of an MDBX snapshot.
|
||||
*
|
||||
* Each non-metapage up to meta_t.mm_last_pg is reachable exactly once
|
||||
* in the snapshot: Either used by a database or listed in a GC record. */
|
||||
typedef struct page {
|
||||
uint64_t txnid; /* txnid which created page, maybe zero in legacy DB */
|
||||
uint16_t dupfix_ksize; /* key size if this is a DUPFIX page */
|
||||
uint16_t flags;
|
||||
union {
|
||||
uint32_t pages; /* number of overflow pages */
|
||||
__anonymous_struct_extension__ struct {
|
||||
indx_t lower; /* lower bound of free space */
|
||||
indx_t upper; /* upper bound of free space */
|
||||
};
|
||||
};
|
||||
pgno_t pgno; /* page number */
|
||||
|
||||
#if FLEXIBLE_ARRAY_MEMBERS
|
||||
indx_t entries[] /* dynamic size */;
|
||||
#endif /* FLEXIBLE_ARRAY_MEMBERS */
|
||||
} page_t;
|
||||
|
||||
/* Size of the page header, excluding dynamic data at the end */
|
||||
#define PAGEHDRSZ 20u
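
/* Sanity sketch (not from this diff): under the pack(1) layout above the fixed
 * part of page_t adds up to 8 + 2 + 2 + 4 + 4 = 20 bytes, matching PAGEHDRSZ.
 * Assuming C11 and the definitions above, a build-time check could be: */
#include <stddef.h>
_Static_assert(offsetof(page_t, pgno) + sizeof(pgno_t) == PAGEHDRSZ,
               "page_t header layout must match PAGEHDRSZ");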
|
||||
|
||||
/* Header for a single key/data pair within a page.
|
||||
* Used in pages of type P_BRANCH and P_LEAF without P_DUPFIX.
|
||||
* We guarantee 2-byte alignment for 'node_t's.
|
||||
*
|
||||
* Leaf node flags describe node contents. N_BIGDATA says the node's
|
||||
* data part is the page number of an overflow page with actual data.
|
||||
* N_DUPDATA and N_SUBDATA can be combined giving duplicate data in
|
||||
* a sub-page/sub-database, and named databases (just N_SUBDATA). */
|
||||
typedef struct node {
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
union {
|
||||
uint32_t dsize;
|
||||
uint32_t child_pgno;
|
||||
};
|
||||
uint8_t flags; /* see node_flags */
|
||||
uint8_t extra;
|
||||
uint16_t ksize; /* key size */
|
||||
#else
|
||||
uint16_t ksize; /* key size */
|
||||
uint8_t extra;
|
||||
uint8_t flags; /* see node_flags */
|
||||
union {
|
||||
uint32_t child_pgno;
|
||||
uint32_t dsize;
|
||||
};
|
||||
#endif /* __BYTE_ORDER__ */
|
||||
|
||||
#if FLEXIBLE_ARRAY_MEMBERS
|
||||
uint8_t payload[] /* key and data are appended here */;
|
||||
#endif /* FLEXIBLE_ARRAY_MEMBERS */
|
||||
} node_t;
|
||||
|
||||
/* Size of the node header, excluding dynamic data at the end */
|
||||
#define NODESIZE 8u
|
||||
|
||||
typedef enum node_flags {
|
||||
N_BIGDATA = 0x01 /* data put on large page */,
|
||||
N_SUBDATA = 0x02 /* data is a sub-database */,
|
||||
N_DUPDATA = 0x04 /* data has duplicates */
|
||||
} node_flags_t;
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
|
||||
page_type(const page_t *mp) {
|
||||
return mp->flags;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
|
||||
page_type_compat(const page_t *mp) {
|
||||
  /* Drop the legacy P_DIRTY flag from sub-pages for compatibility;
   * used for assertions only. */
|
||||
return unlikely(mp->flags & P_SUBP) ? mp->flags & ~(P_SUBP | P_LEGACY_DIRTY)
|
||||
: mp->flags;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_leaf(const page_t *mp) {
|
||||
return (mp->flags & P_LEAF) != 0;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_dupfix_leaf(const page_t *mp) {
|
||||
return (mp->flags & P_DUPFIX) != 0;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_branch(const page_t *mp) {
|
||||
return (mp->flags & P_BRANCH) != 0;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_largepage(const page_t *mp) {
|
||||
return (mp->flags & P_LARGE) != 0;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_subpage(const page_t *mp) {
|
||||
return (mp->flags & P_SUBP) != 0;
|
||||
}
|
src/layout-lck.h | 285 (new file)
@ -0,0 +1,285 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* The version number for a database's lockfile format. */
|
||||
#define MDBX_LOCK_VERSION 5
|
||||
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
|
||||
|
||||
#define MDBX_LCK_SIGN UINT32_C(0xF10C)
|
||||
typedef void osal_ipclock_t;
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
|
||||
#define MDBX_LCK_SIGN UINT32_C(0xF18D)
|
||||
typedef mdbx_pid_t osal_ipclock_t;
|
||||
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \
|
||||
MDBX_LOCKING == MDBX_LOCKING_POSIX2008
|
||||
|
||||
#define MDBX_LCK_SIGN UINT32_C(0x8017)
|
||||
typedef pthread_mutex_t osal_ipclock_t;
|
||||
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
|
||||
|
||||
#define MDBX_LCK_SIGN UINT32_C(0xFC29)
|
||||
typedef sem_t osal_ipclock_t;
|
||||
|
||||
#else
|
||||
#error "FIXME"
|
||||
#endif /* MDBX_LOCKING */
|
||||
|
||||
/* GC profiling statistics */
typedef struct gc_prof_stat {
  /* Monotonic wall-clock time spent on reading and searching within the GC */
  uint64_t rtime_monotonic;
  /* User-mode CPU time spent preparing the pages extracted from the GC,
   * including paging them in from disk. */
  uint64_t xtime_cpu;
  /* Number of read-search iterations inside the GC while allocating pages */
  uint32_t rsteps;
  /* Number of requests to allocate sequences of pages,
   * i.e. when more than one page is requested at once */
  uint32_t xpages;
  /* Slow path execution count */
  uint32_t spe_counter;
  /* page faults (hard page faults) */
  uint32_t majflt;
} gc_prof_stat_t;
|
||||
|
||||
/* Statistics of page operations for all transactions,
 * including incomplete and aborted ones. */
typedef struct pgops {
  mdbx_atomic_uint64_t newly; /* Quantity of new pages added */
  mdbx_atomic_uint64_t cow;   /* Quantity of pages copied for update */
  mdbx_atomic_uint64_t clone; /* Quantity of parent's dirty page clones
                                 for nested transactions */
  mdbx_atomic_uint64_t split; /* Page splits */
  mdbx_atomic_uint64_t merge; /* Page merges */
  mdbx_atomic_uint64_t spill; /* Quantity of spilled dirty pages */
  mdbx_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
  mdbx_atomic_uint64_t
      wops; /* Number of explicit write operations (not pages) to disk */
  mdbx_atomic_uint64_t
      msync; /* Number of explicit msync/flush-to-disk operations */
  mdbx_atomic_uint64_t
      fsync; /* Number of explicit fsync/flush-to-disk operations */
|
||||
|
||||
mdbx_atomic_uint64_t prefault; /* Number of prefault write operations */
|
||||
mdbx_atomic_uint64_t mincore; /* Number of mincore() calls */
|
||||
|
||||
mdbx_atomic_uint32_t
|
||||
incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269
|
||||
caught */
|
||||
mdbx_atomic_uint32_t reserved;
|
||||
|
||||
  /* Statistics for GC profiling.
   * Logically this data perhaps belongs in a separate structure,
   * but the difference would be purely cosmetic. */
  struct {
    /* Costs of maintaining the user's data */
    gc_prof_stat_t work;
    /* Costs of maintaining and updating the GC itself */
    gc_prof_stat_t self;
    /* GC update iterations, greater than 1 if there were retries/restarts */
    uint32_t wloops;
    /* Iterations of merging GC records */
    uint32_t coalescences;
    /* Destructions of steady commit points in MDBX_UTTERLY_NOSYNC mode */
    uint32_t wipes;
    /* Flushes of data to disk outside of MDBX_UTTERLY_NOSYNC mode */
    uint32_t flushes;
    /* Attempts to kick lagging readers */
    uint32_t kicks;
  } gc_prof;
|
||||
} pgop_stat_t;
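
/* These counters surface through the public API when MDBX_ENABLE_PGOP_STAT is
 * enabled. The following usage sketch (not part of this diff) is an assumption
 * based on mdbx_env_info_ex() and the mi_pgop_stat member of MDBX_envinfo: */

#include "mdbx.h"
#include <stdio.h>

/* Prints a few page-operation counters for an already opened environment. */
static void print_pgop_stat(MDBX_env *env) {
  MDBX_envinfo info;
  int rc = mdbx_env_info_ex(env, /* txn */ NULL, &info, sizeof(info));
  if (rc != MDBX_SUCCESS) {
    fprintf(stderr, "mdbx_env_info_ex: %s\n", mdbx_strerror(rc));
    return;
  }
  printf("newly %llu, cow %llu, split %llu, merge %llu, spill %llu\n",
         (unsigned long long)info.mi_pgop_stat.newly,
         (unsigned long long)info.mi_pgop_stat.cow,
         (unsigned long long)info.mi_pgop_stat.split,
         (unsigned long long)info.mi_pgop_stat.merge,
         (unsigned long long)info.mi_pgop_stat.spill);
}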
|
||||
|
||||
/* Reader Lock Table
|
||||
*
|
||||
* Readers don't acquire any locks for their data access. Instead, they
|
||||
* simply record their transaction ID in the reader table. The reader
|
||||
* mutex is needed just to find an empty slot in the reader table. The
|
||||
* slot's address is saved in thread-specific data so that subsequent
|
||||
* read transactions started by the same thread need no further locking to
|
||||
* proceed.
|
||||
*
|
||||
* If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in
|
||||
* thread-specific data. No reader table is used if the database is on a
|
||||
* read-only filesystem.
|
||||
*
|
||||
* Since the database uses multi-version concurrency control, readers don't
|
||||
* actually need any locking. This table is used to keep track of which
|
||||
* readers are using data from which old transactions, so that we'll know
|
||||
* when a particular old transaction is no longer in use. Old transactions
|
||||
* that have discarded any data pages can then have those pages reclaimed
|
||||
* for use by a later write transaction.
|
||||
*
|
||||
* The lock table is constructed such that reader slots are aligned with the
|
||||
* processor's cache line size. Any slot is only ever used by one thread.
|
||||
* This alignment guarantees that there will be no contention or cache
|
||||
* thrashing as threads update their own slot info, and also eliminates
|
||||
* any need for locking when accessing a slot.
|
||||
*
|
||||
* A writer thread will scan every slot in the table to determine the oldest
|
||||
* outstanding reader transaction. Any freed pages older than this will be
|
||||
* reclaimed by the writer. The writer doesn't use any locks when scanning
|
||||
* this table. This means that there's no guarantee that the writer will
|
||||
* see the most up-to-date reader info, but that's not required for correct
|
||||
* operation - all we need is to know the upper bound on the oldest reader,
|
||||
* we don't care at all about the newest reader. So the only consequence of
|
||||
* reading stale information here is that old pages might hang around a
|
||||
* while longer before being reclaimed. That's actually good anyway, because
|
||||
* the longer we delay reclaiming old pages, the more likely it is that a
|
||||
* string of contiguous pages can be found after coalescing old pages from
|
||||
* many old transactions together. */
|
||||
|
||||
/* The actual reader record, with cacheline padding. */
|
||||
typedef struct reader_slot {
|
||||
/* Current Transaction ID when this transaction began, or INVALID_TXNID.
|
||||
* Multiple readers that start at the same time will probably have the
|
||||
* same ID here. Again, it's not important to exclude them from
|
||||
* anything; all we need to know is which version of the DB they
|
||||
* started from so we can avoid overwriting any data used in that
|
||||
* particular version. */
|
||||
atomic_txnid_t txnid;
|
||||
|
||||
/* The information we store in a single slot of the reader table.
|
||||
* In addition to a transaction ID, we also record the process and
|
||||
* thread ID that owns a slot, so that we can detect stale information,
|
||||
* e.g. threads or processes that went away without cleaning up.
|
||||
*
|
||||
* NOTE: We currently don't check for stale records.
|
||||
* We simply re-init the table when we know that we're the only process
|
||||
* opening the lock file. */
|
||||
|
||||
/* The thread ID of the thread owning this txn. */
|
||||
mdbx_atomic_uint64_t tid;
|
||||
|
||||
/* The process ID of the process owning this reader txn. */
|
||||
mdbx_atomic_uint32_t pid;
|
||||
|
||||
/* The number of pages used in the reader's MVCC snapshot,
|
||||
* i.e. the value of meta->geometry.first_unallocated and
|
||||
* txn->geo.first_unallocated */
|
||||
atomic_pgno_t snapshot_pages_used;
|
||||
  /* Number of retired pages at the time this reader starts its transaction. So,
   * at any time the difference meta.pages_retired -
   * reader.snapshot_pages_retired gives the number of pages which this
   * reader is restraining from reuse. */
|
||||
mdbx_atomic_uint64_t snapshot_pages_retired;
|
||||
} reader_slot_t;
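
/* To make the lock-free scan described in the Reader Lock Table comment above
 * concrete, here is an illustrative sketch (not from this diff) of how a writer
 * could derive the oldest live reader txnid. Access via the .weak members of the
 * atomic types is an assumption made for brevity; the real code uses proper
 * atomic loads. */

/* Returns the oldest txnid still referenced by any registered reader,
 * or `newest` if the table has no live readers. */
static txnid_t oldest_reader_sketch(const reader_slot_t *slots, size_t count,
                                    txnid_t newest) {
  txnid_t oldest = newest;
  for (size_t i = 0; i < count; ++i) {
    if (slots[i].pid.weak == 0)
      continue; /* vacant slot */
    const txnid_t txnid = slots[i].txnid.weak;
    if (txnid && txnid < oldest)
      oldest = txnid;
  }
  return oldest;
}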
|
||||
|
||||
/* The header for the reader table (a memory-mapped lock file). */
|
||||
typedef struct shared_lck {
|
||||
/* Stamp identifying this as an MDBX file.
|
||||
   * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
|
||||
uint64_t magic_and_version;
|
||||
|
||||
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
|
||||
uint32_t os_and_format;
|
||||
|
||||
  /* Flags with which the environment was opened. */
|
||||
mdbx_atomic_uint32_t envmode;
|
||||
|
||||
  /* Threshold of un-synced-with-disk pages for the auto-sync feature;
   * zero means no threshold, i.e. auto-sync is disabled. */
|
||||
atomic_pgno_t autosync_threshold;
|
||||
|
||||
  /* Low 32 bits of the txnid with which the meta-pages were synced,
   * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
|
||||
#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3)
|
||||
#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8)
|
||||
#define MDBX_NOMETASYNC_LAZY_WRITEMAP \
|
||||
(MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8)
|
||||
mdbx_atomic_uint32_t meta_sync_txnid;
|
||||
|
||||
  /* Period for the timed auto-sync feature, i.e. at every steady checkpoint
   * mti_unsynced_timeout is set to current_time + autosync_period.
|
||||
* The time value is represented in a suitable system-dependent form, for
|
||||
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
|
||||
* Zero means timed auto-sync is disabled. */
|
||||
mdbx_atomic_uint64_t autosync_period;
|
||||
|
||||
/* Marker to distinguish uniqueness of DB/CLK. */
|
||||
mdbx_atomic_uint64_t bait_uniqueness;
|
||||
|
||||
  /* Paired counter of processes that have mlock()ed part of the mmapped DB.
   * (mlcnt[0] - mlcnt[1]) > 0 means that at least one process has locked
   * at least one page, and therefore madvise() could return EINVAL. */
|
||||
mdbx_atomic_uint32_t mlcnt[2];
|
||||
|
||||
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
||||
|
||||
/* Statistics of costly ops of all (running, completed and aborted)
|
||||
* transactions */
|
||||
pgop_stat_t pgops;
|
||||
|
||||
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
||||
|
||||
#if MDBX_LOCKING > 0
|
||||
/* Write transaction lock. */
|
||||
osal_ipclock_t wrt_lock;
|
||||
#endif /* MDBX_LOCKING > 0 */
|
||||
|
||||
atomic_txnid_t cached_oldest;
|
||||
|
||||
/* Timestamp of entering an out-of-sync state. Value is represented in a
|
||||
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
|
||||
* or clock_gettime(CLOCK_MONOTONIC). */
|
||||
mdbx_atomic_uint64_t eoos_timestamp;
|
||||
|
||||
  /* Number of un-synced-with-disk pages for the auto-sync feature. */
|
||||
mdbx_atomic_uint64_t unsynced_pages;
|
||||
|
||||
/* Timestamp of the last readers check. */
|
||||
mdbx_atomic_uint64_t readers_check_timestamp;
|
||||
|
||||
  /* Number of the page which was discarded last time by madvise(DONTNEED). */
|
||||
atomic_pgno_t discarded_tail;
|
||||
|
||||
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
|
||||
pgno_t readahead_anchor;
|
||||
|
||||
/* Shared cache for mincore() results */
|
||||
struct {
|
||||
pgno_t begin[4];
|
||||
uint64_t mask[4];
|
||||
} mincore_cache;
|
||||
|
||||
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
||||
|
||||
#if MDBX_LOCKING > 0
|
||||
  /* Readers table lock. */
|
||||
osal_ipclock_t rdt_lock;
|
||||
#endif /* MDBX_LOCKING > 0 */
|
||||
|
||||
  /* The number of slots that have been used in the reader table.
   * This always records the maximum count; it is not decremented
   * when readers release their slots. */
|
||||
mdbx_atomic_uint32_t rdt_length;
|
||||
mdbx_atomic_uint32_t rdt_refresh_flag;
|
||||
|
||||
#if FLEXIBLE_ARRAY_MEMBERS
|
||||
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
||||
reader_slot_t rdt[] /* dynamic size */;
|
||||
|
||||
/* Lockfile format signature: version, features and field layout */
|
||||
#define MDBX_LOCK_FORMAT \
|
||||
(MDBX_LCK_SIGN * 27733 + (unsigned)sizeof(reader_slot_t) * 13 + \
|
||||
(unsigned)offsetof(reader_slot_t, snapshot_pages_used) * 251 + \
|
||||
(unsigned)offsetof(lck_t, cached_oldest) * 83 + \
|
||||
(unsigned)offsetof(lck_t, rdt_length) * 37 + \
|
||||
(unsigned)offsetof(lck_t, rdt) * 29)
|
||||
#endif /* FLEXIBLE_ARRAY_MEMBERS */
|
||||
} lck_t;
|
||||
|
||||
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
|
||||
|
||||
#define MDBX_READERS_LIMIT 32767
|
src/lck-posix.c | 409
@ -1,18 +1,9 @@
|
||||
/*
|
||||
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
|
||||
* and other libmdbx authors: please see AUTHORS file.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in the file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
/*----------------------------------------------------------------------------*
|
||||
* POSIX/non-Windows LCK-implementation */
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
@ -20,112 +11,21 @@
|
||||
#include <sys/sem.h>
|
||||
#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* global constructor/destructor */
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
|
||||
#include <sys/utsname.h>
|
||||
|
||||
MDBX_INTERNAL_VAR_INSTA uint32_t linux_kernel_version;
|
||||
MDBX_INTERNAL_VAR_INSTA bool
|
||||
mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static uint8_t probe_for_WSL(const char *tag) {
|
||||
const char *const WSL = strstr(tag, "WSL");
|
||||
if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
|
||||
return WSL[3] - '0';
|
||||
const char *const wsl = strstr(tag, "wsl");
|
||||
if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
|
||||
return wsl[3] - '0';
|
||||
if (WSL || wsl || strcasestr(tag, "Microsoft"))
|
||||
    /* We expect no new kernels within WSL1; otherwise it would be explicitly
     * marked by an appropriate WSL-version hint. */
|
||||
return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* Linux */
|
||||
|
||||
#ifdef ENABLE_GPROF
|
||||
extern void _mcleanup(void);
|
||||
extern void monstartup(unsigned long, unsigned long);
|
||||
extern void _init(void);
|
||||
extern void _fini(void);
|
||||
extern void __gmon_start__(void) __attribute__((__weak__));
|
||||
#endif /* ENABLE_GPROF */
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static __attribute__((__constructor__)) void
|
||||
mdbx_global_constructor(void) {
|
||||
#ifdef ENABLE_GPROF
|
||||
if (!&__gmon_start__)
|
||||
monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
|
||||
#endif /* ENABLE_GPROF */
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
struct utsname buffer;
|
||||
if (uname(&buffer) == 0) {
|
||||
int i = 0;
|
||||
char *p = buffer.release;
|
||||
while (*p && i < 4) {
|
||||
if (*p >= '0' && *p <= '9') {
|
||||
long number = strtol(p, &p, 10);
|
||||
if (number > 0) {
|
||||
if (number > 255)
|
||||
number = 255;
|
||||
linux_kernel_version += number << (24 - i * 8);
|
||||
}
|
||||
++i;
|
||||
} else {
|
||||
++p;
|
||||
}
|
||||
}
|
||||
/* "Official" way of detecting WSL1 but not WSL2
|
||||
* https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
|
||||
*
|
||||
* WARNING: False negative detection of WSL1 will result in DATA LOSS!
|
||||
* So, the REQUIREMENTS for this code:
|
||||
* 1. MUST detect WSL1 without false-negatives.
|
||||
* 2. DESIRABLE detect WSL2 but without the risk of violating the first. */
|
||||
mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 ||
|
||||
probe_for_WSL(buffer.sysname) == 1 ||
|
||||
probe_for_WSL(buffer.release) == 1;
|
||||
}
|
||||
#endif /* Linux */
|
||||
|
||||
global_ctor();
|
||||
}
|
||||
|
||||
MDBX_EXCLUDE_FOR_GPROF
|
||||
__cold static __attribute__((__destructor__)) void
|
||||
mdbx_global_destructor(void) {
|
||||
global_dtor();
|
||||
#ifdef ENABLE_GPROF
|
||||
if (!&__gmon_start__)
|
||||
_mcleanup();
|
||||
#endif /* ENABLE_GPROF */
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* lck */
|
||||
|
||||
/* Description of the locking implementation for POSIX & Linux:
 *
 * The lck-file is mapped into memory; it hosts the readers table and the
 * shared posix-mutexes (futexes). By means of
 * these mutexes (see struct MDBX_lockinfo) the following are implemented:
 * these mutexes (see struct lck_t) the following are implemented:
 * - Locking of the readers table for registration,
 *   i.e. the functions osal_rdt_lock() and osal_rdt_unlock().
 *   i.e. the functions lck_rdt_lock() and lck_rdt_unlock().
 * - Locking of the DB for write transactions,
 *   i.e. the functions osal_txn_lock() and osal_txn_unlock().
 *   i.e. the functions lck_txn_lock() and lck_txn_unlock().
 *
 * The remaining functionality is implemented separately via file locks:
 * - Initial seizure of the DB in exclusive/shared mode and the subsequent
 *   transition to operational mode, the functions osal_lck_seize() and osal_lck_downgrade().
 *   transition to operational mode, the functions lck_seize() and lck_downgrade().
 * - Checking for the presence of reader processes,
 *   i.e. the functions osal_rpid_set(), osal_rpid_clear() and osal_rpid_check().
 *   i.e. the functions lck_rpid_set(), lck_rpid_clear() and lck_rpid_check().
 *
 * fcntl(F_SETLK) is used for file locking because:
 * - lockf() operates only with exclusive locks and requires
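
/* For reference, a minimal sketch (not part of this diff) of the byte-range
 * fcntl locking pattern the description above relies on, preferring OFD locks
 * where the platform defines F_OFD_SETLK: */

#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Locks/unlocks [offset, offset+len) on fd; returns 0 or an errno value. */
static int lock_range_sketch(int fd, short type /* F_RDLCK/F_WRLCK/F_UNLCK */,
                             off_t offset, off_t len, int wait) {
  struct flock lk;
  memset(&lk, 0, sizeof(lk)); /* l_pid must be zero for OFD locks */
  lk.l_type = type;
  lk.l_whence = SEEK_SET;
  lk.l_start = offset;
  lk.l_len = len;
#ifdef F_OFD_SETLK
  const int cmd = wait ? F_OFD_SETLKW : F_OFD_SETLK;
#else
  const int cmd = wait ? F_SETLKW : F_SETLK;
#endif
  return fcntl(fd, cmd, &lk) == 0 ? 0 : errno;
}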
|
||||
@ -169,9 +69,9 @@ mdbx_global_destructor(void) {
|
||||
static int op_setlk, op_setlkw, op_getlk;
|
||||
__cold static void choice_fcntl(void) {
|
||||
assert(!op_setlk && !op_setlkw && !op_getlk);
|
||||
if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
|
||||
if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
&& linux_kernel_version >
|
||||
&& globals.linux_kernel_version >
|
||||
0x030f0000 /* OFD locks are available since 3.15, but engages here
|
||||
only for 3.16 and later kernels (i.e. LTS) because
|
||||
of reliability reasons */
|
||||
@ -201,7 +101,6 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
|
||||
"The bitness of system `off_t` type is mismatch. Please "
|
||||
"fix build and/or NDK configuration.");
|
||||
#endif /* Android */
|
||||
jitter4testing(true);
|
||||
assert(offset >= 0 && len > 0);
|
||||
assert((uint64_t)offset < (uint64_t)INT64_MAX &&
|
||||
(uint64_t)len < (uint64_t)INT64_MAX &&
|
||||
@ -213,6 +112,8 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
|
||||
|
||||
assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) ==
|
||||
((uint64_t)offset + (uint64_t)len));
|
||||
|
||||
jitter4testing(true);
|
||||
for (;;) {
|
||||
MDBX_STRUCT_FLOCK lock_op;
|
||||
STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) &&
|
||||
@ -262,7 +163,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
|
||||
}
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
|
||||
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
|
||||
#if MDBX_USE_OFDLOCKS
|
||||
if (unlikely(op_setlk == 0))
|
||||
choice_fcntl();
|
||||
@ -270,30 +171,30 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
|
||||
return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
assert(env->me_pid > 0);
|
||||
if (unlikely(osal_getpid() != env->me_pid))
|
||||
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env) {
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
assert(env->pid > 0);
|
||||
if (unlikely(osal_getpid() != env->pid))
|
||||
return MDBX_PANIC;
|
||||
return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
|
||||
return lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, env->pid, 1);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
assert(env->me_pid > 0);
|
||||
return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1);
|
||||
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env) {
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
assert(env->pid > 0);
|
||||
return lck_op(env->lck_mmap.fd, op_setlk, F_UNLCK, env->pid, 1);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid) {
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
assert(pid > 0);
|
||||
return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
|
||||
return lck_op(env->lck_mmap.fd, op_getlk, F_WRLCK, pid, 1);
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
|
||||
#if MDBX_LOCKING > MDBX_LOCKING_SYSV
|
||||
MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) {
|
||||
MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc) {
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
|
||||
return sem_init(ipc, false, 1) ? errno : 0;
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \
|
||||
@ -304,7 +205,7 @@ MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) {
|
||||
#endif
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) {
|
||||
MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc) {
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
|
||||
return sem_destroy(ipc) ? errno : 0;
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \
|
||||
@ -320,7 +221,7 @@ static int check_fstat(MDBX_env *env) {
|
||||
struct stat st;
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (fstat(env->me_lazy_fd, &st)) {
|
||||
if (fstat(env->lazy_fd, &st)) {
|
||||
rc = errno;
|
||||
ERROR("fstat(%s), err %d", "DXB", rc);
|
||||
return rc;
|
||||
@ -345,7 +246,7 @@ static int check_fstat(MDBX_env *env) {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
if (fstat(env->me_lfd, &st)) {
|
||||
if (fstat(env->lck_mmap.fd, &st)) {
|
||||
rc = errno;
|
||||
ERROR("fstat(%s), err %d", "LCK", rc);
|
||||
return rc;
|
||||
@ -363,8 +264,8 @@ static int check_fstat(MDBX_env *env) {
|
||||
}
|
||||
|
||||
/* Checking file size for detect the situation when we got the shared lock
|
||||
* immediately after osal_lck_destroy(). */
|
||||
if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) {
|
||||
* immediately after lck_destroy(). */
|
||||
if (st.st_size < (off_t)(sizeof(lck_t) + sizeof(reader_slot_t))) {
|
||||
VERBOSE("lck-file is too short (%u), exclusive-lock needed",
|
||||
(unsigned)st.st_size);
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
@ -373,18 +274,14 @@ static int check_fstat(MDBX_env *env) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
|
||||
if (unlikely(osal_getpid() != env->me_pid))
|
||||
__cold MDBX_INTERNAL int lck_seize(MDBX_env *env) {
|
||||
assert(env->lazy_fd != INVALID_HANDLE_VALUE);
|
||||
if (unlikely(osal_getpid() != env->pid))
|
||||
return MDBX_PANIC;
|
||||
#if MDBX_USE_OFDLOCKS
|
||||
if (unlikely(op_setlk == 0))
|
||||
choice_fcntl();
|
||||
#endif /* MDBX_USE_OFDLOCKS */
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
if (unlikely(mdbx_RunningOnWSL1)) {
|
||||
if (unlikely(globals.running_on_WSL1)) {
|
||||
rc = ENOLCK /* No record locks available */;
|
||||
ERROR("%s, err %u",
|
||||
"WSL1 (Windows Subsystem for Linux) is mad and trouble-full, "
|
||||
@ -394,11 +291,15 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
}
|
||||
#endif /* Linux */
|
||||
|
||||
if (env->me_lfd == INVALID_HANDLE_VALUE) {
|
||||
#if MDBX_USE_OFDLOCKS
|
||||
if (unlikely(op_setlk == 0))
|
||||
choice_fcntl();
|
||||
#endif /* MDBX_USE_OFDLOCKS */
|
||||
|
||||
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
|
||||
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
|
||||
rc =
|
||||
lck_op(env->me_lazy_fd, op_setlk,
|
||||
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
|
||||
rc = lck_op(env->lazy_fd, op_setlk,
|
||||
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
ERROR("%s, err %u", "without-lck", rc);
|
||||
eASSERT(env, MDBX_IS_ERROR(rc));
|
||||
@ -412,7 +313,7 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
|
||||
retry:
|
||||
if (rc == MDBX_RESULT_TRUE) {
|
||||
rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1);
|
||||
rc = lck_op(env->lck_mmap.fd, op_setlk, F_UNLCK, 0, 1);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
ERROR("%s, err %u", "unlock-before-retry", rc);
|
||||
eASSERT(env, MDBX_IS_ERROR(rc));
|
||||
@ -421,16 +322,15 @@ retry:
|
||||
}
|
||||
|
||||
/* Firstly try to get exclusive locking. */
|
||||
rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
|
||||
rc = lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, 1);
|
||||
if (rc == MDBX_SUCCESS) {
|
||||
rc = check_fstat(env);
|
||||
if (MDBX_IS_ERROR(rc))
|
||||
return rc;
|
||||
|
||||
continue_dxb_exclusive:
|
||||
rc =
|
||||
lck_op(env->me_lazy_fd, op_setlk,
|
||||
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
|
||||
rc = lck_op(env->lazy_fd, op_setlk,
|
||||
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
|
||||
|
||||
@ -455,16 +355,16 @@ retry:
|
||||
}
|
||||
|
||||
/* Here could be one of two:
|
||||
* - osal_lck_destroy() from the another process was hold the lock
|
||||
* - lck_destroy() from the another process was hold the lock
|
||||
* during a destruction.
|
||||
* - either osal_lck_seize() from the another process was got the exclusive
|
||||
* - either lck_seize() from the another process was got the exclusive
|
||||
* lock and doing initialization.
|
||||
* For distinguish these cases will use size of the lck-file later. */
|
||||
|
||||
/* Wait for lck-shared now. */
|
||||
/* Here may be await during transient processes, for instance until another
|
||||
* competing process doesn't call lck_downgrade(). */
|
||||
rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
|
||||
rc = lck_op(env->lck_mmap.fd, op_setlkw, F_RDLCK, 0, 1);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
ERROR("%s, err %u", "try-shared", rc);
|
||||
eASSERT(env, MDBX_IS_ERROR(rc));
|
||||
@ -480,7 +380,7 @@ retry:
|
||||
}
|
||||
|
||||
/* got shared, retry exclusive */
|
||||
rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
|
||||
rc = lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, 1);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
goto continue_dxb_exclusive;
|
||||
|
||||
@ -492,9 +392,8 @@ retry:
|
||||
}
|
||||
|
||||
/* Lock against another process operating in without-lck or exclusive mode. */
|
||||
rc =
|
||||
lck_op(env->me_lazy_fd, op_setlk,
|
||||
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
|
||||
rc = lck_op(env->lazy_fd, op_setlk,
|
||||
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->pid, 1);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
ERROR("%s, err %u", "lock-against-without-lck", rc);
|
||||
eASSERT(env, MDBX_IS_ERROR(rc));
|
||||
@ -505,20 +404,20 @@ retry:
|
||||
return MDBX_RESULT_FALSE;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
if (unlikely(osal_getpid() != env->me_pid))
|
||||
MDBX_INTERNAL int lck_downgrade(MDBX_env *env) {
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
if (unlikely(osal_getpid() != env->pid))
|
||||
return MDBX_PANIC;
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
|
||||
rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
|
||||
if ((env->flags & MDBX_EXCLUSIVE) == 0) {
|
||||
rc = lck_op(env->lazy_fd, op_setlk, F_UNLCK, 0, env->pid);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
|
||||
OFF_T_MAX - env->me_pid - 1);
|
||||
rc = lck_op(env->lazy_fd, op_setlk, F_UNLCK, env->pid + 1,
|
||||
OFF_T_MAX - env->pid - 1);
|
||||
}
|
||||
if (rc == MDBX_SUCCESS)
|
||||
rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
|
||||
rc = lck_op(env->lck_mmap.fd, op_setlk, F_RDLCK, 0, 1);
|
||||
if (unlikely(rc != 0)) {
|
||||
ERROR("%s, err %u", "lck", rc);
|
||||
assert(MDBX_IS_ERROR(rc));
|
||||
@ -526,25 +425,24 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
if (unlikely(osal_getpid() != env->me_pid))
|
||||
MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
if (unlikely(osal_getpid() != env->pid))
|
||||
return MDBX_PANIC;
|
||||
|
||||
const int cmd = dont_wait ? op_setlk : op_setlkw;
|
||||
int rc = lck_op(env->me_lfd, cmd, F_WRLCK, 0, 1);
|
||||
if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_EXCLUSIVE) == 0) {
|
||||
rc = (env->me_pid > 1)
|
||||
? lck_op(env->me_lazy_fd, cmd, F_WRLCK, 0, env->me_pid - 1)
|
||||
: MDBX_SUCCESS;
|
||||
int rc = lck_op(env->lck_mmap.fd, cmd, F_WRLCK, 0, 1);
|
||||
if (rc == MDBX_SUCCESS && (env->flags & MDBX_EXCLUSIVE) == 0) {
|
||||
rc = (env->pid > 1) ? lck_op(env->lazy_fd, cmd, F_WRLCK, 0, env->pid - 1)
|
||||
: MDBX_SUCCESS;
|
||||
if (rc == MDBX_SUCCESS) {
|
||||
rc = lck_op(env->me_lazy_fd, cmd, F_WRLCK, env->me_pid + 1,
|
||||
OFF_T_MAX - env->me_pid - 1);
|
||||
if (rc != MDBX_SUCCESS && env->me_pid > 1 &&
|
||||
lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid - 1))
|
||||
rc = lck_op(env->lazy_fd, cmd, F_WRLCK, env->pid + 1,
|
||||
OFF_T_MAX - env->pid - 1);
|
||||
if (rc != MDBX_SUCCESS && env->pid > 1 &&
|
||||
lck_op(env->lazy_fd, op_setlk, F_UNLCK, 0, env->pid - 1))
|
||||
rc = MDBX_PANIC;
|
||||
}
|
||||
if (rc != MDBX_SUCCESS && lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1))
|
||||
if (rc != MDBX_SUCCESS && lck_op(env->lck_mmap.fd, op_setlk, F_RDLCK, 0, 1))
|
||||
rc = MDBX_PANIC;
|
||||
}
|
||||
if (unlikely(rc != 0)) {
|
||||
@ -554,48 +452,48 @@ MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
const uint32_t current_pid) {
|
||||
__cold MDBX_INTERNAL int lck_destroy(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
const uint32_t current_pid) {
|
||||
eASSERT(env, osal_getpid() == current_pid);
|
||||
int rc = MDBX_SUCCESS;
|
||||
struct stat lck_info;
|
||||
MDBX_lockinfo *lck = env->me_lck;
|
||||
if (lck && lck == env->me_lck_mmap.lck && !inprocess_neighbor &&
|
||||
lck_t *lck = env->lck;
|
||||
if (lck && lck == env->lck_mmap.lck && !inprocess_neighbor &&
|
||||
/* try get exclusive access */
|
||||
lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
|
||||
lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
|
||||
/* if LCK was not removed */
|
||||
fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
|
||||
lck_op(env->me_lazy_fd, op_setlk,
|
||||
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
|
||||
fstat(env->lck_mmap.fd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
|
||||
lck_op(env->lazy_fd, op_setlk,
|
||||
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
|
||||
OFF_T_MAX) == 0) {
|
||||
|
||||
VERBOSE("%p got exclusive, drown ipc-locks", (void *)env);
|
||||
eASSERT(env, current_pid == env->me_pid);
|
||||
eASSERT(env, current_pid == env->pid);
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
if (env->me_sysv_ipc.semid != -1)
|
||||
rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0;
|
||||
#else
|
||||
rc = osal_ipclock_destroy(&lck->mti_rlock);
|
||||
rc = lck_ipclock_destroy(&lck->rdt_lock);
|
||||
if (rc == 0)
|
||||
rc = osal_ipclock_destroy(&lck->mti_wlock);
|
||||
rc = lck_ipclock_destroy(&lck->wrt_lock);
|
||||
#endif /* MDBX_LOCKING */
|
||||
|
||||
eASSERT(env, rc == 0);
|
||||
if (rc == 0) {
|
||||
const bool synced = lck->mti_unsynced_pages.weak == 0;
|
||||
osal_munmap(&env->me_lck_mmap);
|
||||
if (synced && env->me_lfd != INVALID_HANDLE_VALUE)
|
||||
rc = ftruncate(env->me_lfd, 0) ? errno : 0;
|
||||
const bool synced = lck->unsynced_pages.weak == 0;
|
||||
osal_munmap(&env->lck_mmap);
|
||||
if (synced && env->lck_mmap.fd != INVALID_HANDLE_VALUE)
|
||||
rc = ftruncate(env->lck_mmap.fd, 0) ? errno : 0;
|
||||
}
|
||||
|
||||
jitter4testing(false);
|
||||
}
|
||||
|
||||
if (current_pid != env->me_pid) {
|
||||
if (current_pid != env->pid) {
|
||||
eASSERT(env, !inprocess_neighbor);
|
||||
NOTICE("drown env %p after-fork pid %d -> %d",
|
||||
__Wpedantic_format_voidptr(env), env->me_pid, current_pid);
|
||||
__Wpedantic_format_voidptr(env), env->pid, current_pid);
|
||||
inprocess_neighbor = nullptr;
|
||||
}
|
||||
|
||||
@ -607,57 +505,55 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
|
||||
* locks should be released here explicitly with properly order. */
|
||||
|
||||
/* close dxb and restore lock */
|
||||
if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
|
||||
if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS)
|
||||
if (env->dsync_fd != INVALID_HANDLE_VALUE) {
|
||||
if (unlikely(close(env->dsync_fd) != 0) && rc == MDBX_SUCCESS)
|
||||
rc = errno;
|
||||
env->me_dsync_fd = INVALID_HANDLE_VALUE;
|
||||
env->dsync_fd = INVALID_HANDLE_VALUE;
|
||||
}
|
||||
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
|
||||
if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS)
|
||||
if (env->lazy_fd != INVALID_HANDLE_VALUE) {
|
||||
if (unlikely(close(env->lazy_fd) != 0) && rc == MDBX_SUCCESS)
|
||||
rc = errno;
|
||||
env->me_lazy_fd = INVALID_HANDLE_VALUE;
|
||||
env->lazy_fd = INVALID_HANDLE_VALUE;
|
||||
if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
|
||||
/* restore file-lock */
|
||||
rc = lck_op(
|
||||
inprocess_neighbor->me_lazy_fd, F_SETLKW,
|
||||
(inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
|
||||
(inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
|
||||
? 0
|
||||
: inprocess_neighbor->me_pid,
|
||||
(inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
|
||||
rc = lck_op(inprocess_neighbor->lazy_fd, F_SETLKW,
|
||||
(inprocess_neighbor->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
|
||||
(inprocess_neighbor->flags & MDBX_EXCLUSIVE)
|
||||
? 0
|
||||
: inprocess_neighbor->pid,
|
||||
(inprocess_neighbor->flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* close clk and restore locks */
|
||||
if (env->me_lfd != INVALID_HANDLE_VALUE) {
|
||||
if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS)
|
||||
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
|
||||
if (unlikely(close(env->lck_mmap.fd) != 0) && rc == MDBX_SUCCESS)
|
||||
rc = errno;
|
||||
env->me_lfd = INVALID_HANDLE_VALUE;
|
||||
env->lck_mmap.fd = INVALID_HANDLE_VALUE;
|
||||
if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
|
||||
/* restore file-locks */
|
||||
rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
|
||||
if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader)
|
||||
rc = osal_rpid_set(inprocess_neighbor);
|
||||
rc = lck_op(inprocess_neighbor->lck_mmap.fd, F_SETLKW, F_RDLCK, 0, 1);
|
||||
if (rc == MDBX_SUCCESS && inprocess_neighbor->registered_reader_pid)
|
||||
rc = lck_rpid_set(inprocess_neighbor);
|
||||
}
|
||||
}
|
||||
|
||||
if (inprocess_neighbor && rc != MDBX_SUCCESS)
|
||||
inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
|
||||
inprocess_neighbor->flags |= ENV_FATAL_ERROR;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
|
||||
__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
int global_uniqueness_flag) {
|
||||
__cold MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor,
|
||||
int global_uniqueness_flag) {
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
int semid = -1;
|
||||
/* don't initialize semaphores twice */
|
||||
(void)inprocess_neighbor;
|
||||
if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
|
||||
struct stat st;
|
||||
if (fstat(env->me_lazy_fd, &st))
|
||||
if (fstat(env->lazy_fd, &st))
|
||||
return errno;
|
||||
sysv_retry_create:
|
||||
semid = semget(env->me_sysv_ipc.key, 2,
|
||||
@ -711,9 +607,9 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
/* don't initialize semaphores twice */
|
||||
(void)inprocess_neighbor;
|
||||
if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
|
||||
if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1))
|
||||
if (sem_init(&env->lck_mmap.lck->rdt_lock, true, 1))
|
||||
return errno;
|
||||
if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1))
|
||||
if (sem_init(&env->lck_mmap.lck->wrt_lock, true, 1))
|
||||
return errno;
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
@ -782,10 +678,10 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
if (rc && rc != ENOTSUP)
|
||||
goto bailout;
|
||||
|
||||
rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma);
|
||||
rc = pthread_mutex_init(&env->lck_mmap.lck->rdt_lock, &ma);
|
||||
if (rc)
|
||||
goto bailout;
|
||||
rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma);
|
||||
rc = pthread_mutex_init(&env->lck_mmap.lck->wrt_lock, &ma);
|
||||
|
||||
bailout:
|
||||
pthread_mutexattr_destroy(&ma);
|
||||
@ -799,23 +695,27 @@ __cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc,
|
||||
const int err) {
|
||||
int rc = err;
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
|
||||
#ifndef EOWNERDEAD
|
||||
#define EOWNERDEAD MDBX_RESULT_TRUE
|
||||
#endif /* EOWNERDEAD */
|
||||
|
||||
if (err == EOWNERDEAD) {
|
||||
/* We own the mutex. Clean up after dead previous owner. */
|
||||
|
||||
const bool rlocked = ipc == &env->me_lck->mti_rlock;
|
||||
const bool rlocked = ipc == &env->lck->rdt_lock;
|
||||
rc = MDBX_SUCCESS;
|
||||
if (!rlocked) {
|
||||
if (unlikely(env->me_txn)) {
|
||||
if (unlikely(env->txn)) {
|
||||
/* env is hosed if the dead thread was ours */
|
||||
env->me_flags |= MDBX_FATAL_ERROR;
|
||||
env->me_txn = NULL;
|
||||
env->flags |= ENV_FATAL_ERROR;
|
||||
env->txn = nullptr;
|
||||
rc = MDBX_PANIC;
|
||||
}
|
||||
}
|
||||
WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'),
|
||||
(rc ? "this process' env is hosed" : "recovering"));
|
||||
|
||||
int check_rc = cleanup_dead_readers(env, rlocked, NULL);
|
||||
int check_rc = mvcc_cleanup_dead(env, rlocked, nullptr);
|
||||
check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
|
||||
|
||||
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
@ -858,12 +758,12 @@ __cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc,
|
||||
|
||||
ERROR("mutex (un)lock failed, %s", mdbx_strerror(err));
|
||||
if (rc != EDEADLK)
|
||||
env->me_flags |= MDBX_FATAL_ERROR;
|
||||
env->flags |= ENV_FATAL_ERROR;
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
|
||||
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) {
|
||||
MDBX_INTERNAL int osal_check_tid4bionic(void) {
|
||||
  /* avoid 32-bit Bionic bug/hang with 32-bit TID */
|
||||
if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) {
|
||||
pid_t tid = gettid();
|
||||
@ -900,7 +800,7 @@ static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc,
|
||||
} else if (sem_wait(ipc))
|
||||
rc = errno;
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
|
||||
struct sembuf op = {.sem_num = (ipc != &env->lck->wrt_lock),
|
||||
.sem_op = -1,
|
||||
.sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO};
|
||||
int rc;
|
||||
@ -910,7 +810,7 @@ static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc,
|
||||
rc = MDBX_BUSY;
|
||||
} else {
|
||||
rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS;
|
||||
*ipc = env->me_pid;
|
||||
*ipc = env->pid;
|
||||
}
|
||||
#else
|
||||
#error "FIXME"
|
||||
@ -929,11 +829,11 @@ int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) {
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
|
||||
err = sem_post(ipc) ? errno : MDBX_SUCCESS;
|
||||
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
|
||||
if (unlikely(*ipc != (pid_t)env->me_pid))
|
||||
if (unlikely(*ipc != (pid_t)env->pid))
|
||||
err = EPERM;
|
||||
else {
|
||||
*ipc = 0;
|
||||
struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
|
||||
struct sembuf op = {.sem_num = (ipc != &env->lck->wrt_lock),
|
||||
.sem_op = 1,
|
||||
.sem_flg = SEM_UNDO};
|
||||
err = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS;
|
||||
@ -944,66 +844,61 @@ int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) {
|
||||
int rc = err;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
const uint32_t current_pid = osal_getpid();
|
||||
if (current_pid == env->me_pid || LOG_ENABLED(MDBX_LOG_NOTICE))
|
||||
debug_log((current_pid == env->me_pid)
|
||||
if (current_pid == env->pid || LOG_ENABLED(MDBX_LOG_NOTICE))
|
||||
debug_log((current_pid == env->pid)
|
||||
? MDBX_LOG_FATAL
|
||||
: (rc = MDBX_SUCCESS, MDBX_LOG_NOTICE),
|
||||
"ipc-unlock()", __LINE__, "failed: env %p, lck-%s %p, err %d\n",
|
||||
__Wpedantic_format_voidptr(env),
|
||||
(env->me_lck == env->me_lck_mmap.lck) ? "mmap" : "stub",
|
||||
__Wpedantic_format_voidptr(env->me_lck), err);
|
||||
(env->lck == env->lck_mmap.lck) ? "mmap" : "stub",
|
||||
__Wpedantic_format_voidptr(env->lck), err);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
|
||||
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env) {
|
||||
TRACE("%s", ">>");
|
||||
jitter4testing(true);
|
||||
int rc = osal_ipclock_lock(env, &env->me_lck->mti_rlock, false);
|
||||
int rc = osal_ipclock_lock(env, &env->lck->rdt_lock, false);
|
||||
TRACE("<< rc %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
|
||||
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env) {
|
||||
TRACE("%s", ">>");
|
||||
int err = osal_ipclock_unlock(env, &env->me_lck->mti_rlock);
|
||||
int err = osal_ipclock_unlock(env, &env->lck->rdt_lock);
|
||||
TRACE("<< err %d", err);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
mdbx_panic("%s() failed: err %d\n", __func__, err);
|
||||
jitter4testing(true);
|
||||
}
|
||||
|
||||
int osal_txn_lock(MDBX_env *env, bool dont_wait) {
|
||||
int lck_txn_lock(MDBX_env *env, bool dont_wait) {
|
||||
TRACE("%swait %s", dont_wait ? "dont-" : "", ">>");
|
||||
jitter4testing(true);
|
||||
const int err = osal_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait);
|
||||
const int err = osal_ipclock_lock(env, &env->lck->wrt_lock, dont_wait);
|
||||
int rc = err;
|
||||
if (likely(!MDBX_IS_ERROR(err))) {
|
||||
eASSERT(env, !env->me_txn0->mt_owner ||
|
||||
eASSERT(env, !env->basal_txn->owner ||
|
||||
err == /* if another thread of this same process terminated
without releasing the lock */
|
||||
MDBX_RESULT_TRUE);
|
||||
env->me_txn0->mt_owner = osal_thread_self();
|
||||
env->basal_txn->owner = osal_thread_self();
|
||||
rc = MDBX_SUCCESS;
|
||||
}
|
||||
TRACE("<< err %d, rc %d", err, rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
void osal_txn_unlock(MDBX_env *env) {
|
||||
void lck_txn_unlock(MDBX_env *env) {
|
||||
TRACE("%s", ">>");
|
||||
eASSERT(env, env->me_txn0->mt_owner == osal_thread_self());
|
||||
env->me_txn0->mt_owner = 0;
|
||||
int err = osal_ipclock_unlock(env, &env->me_lck->mti_wlock);
|
||||
eASSERT(env, env->basal_txn->owner == osal_thread_self());
|
||||
env->basal_txn->owner = 0;
|
||||
int err = osal_ipclock_unlock(env, &env->lck->wrt_lock);
|
||||
TRACE("<< err %d", err);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
mdbx_panic("%s() failed: err %d\n", __func__, err);
|
||||
jitter4testing(true);
|
||||
}
|
||||
|
||||
#else
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable : 4206) /* nonstandard extension used: translation \
|
||||
unit is empty */
|
||||
#endif /* _MSC_VER (warnings) */
|
||||
#endif /* !Windows LCK-implementation */
|
||||
#endif /* !Windows LCK-implementation */
|
||||
|
@ -1,18 +1,7 @@
|
||||
/*
|
||||
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
|
||||
* and other libmdbx authors: please see AUTHORS file.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in the file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
/* PREAMBLE FOR WINDOWS:
|
||||
*
|
||||
@ -22,91 +11,6 @@
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static void mdbx_winnt_import(void);
|
||||
|
||||
#if MDBX_BUILD_SHARED_LIBRARY
|
||||
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
|
||||
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
|
||||
*
|
||||
* Define the dll's entry point only for Release builds, when NDEBUG is defined
* and MDBX_WITHOUT_MSVC_CRT=ON. If the entry point isn't defined, then MSVC
* will automatically use DllMainCRTStartup() from the CRT library, which also
* automatically calls DllMain() from our mdbx.dll */
|
||||
#pragma comment(linker, "/ENTRY:DllMain")
|
||||
#endif /* MDBX_WITHOUT_MSVC_CRT */
|
||||
|
||||
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
|
||||
#else
|
||||
#if !MDBX_MANUAL_MODULE_HANDLER
|
||||
static
|
||||
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
|
||||
void NTAPI
|
||||
mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
|
||||
#endif /* MDBX_BUILD_SHARED_LIBRARY */
|
||||
{
|
||||
(void)reserved;
|
||||
switch (reason) {
|
||||
case DLL_PROCESS_ATTACH:
|
||||
mdbx_winnt_import();
|
||||
global_ctor();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
global_dtor();
|
||||
break;
|
||||
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
thread_dtor(module);
|
||||
break;
|
||||
}
|
||||
#if MDBX_BUILD_SHARED_LIBRARY
|
||||
return TRUE;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
|
||||
/* *INDENT-OFF* */
|
||||
/* clang-format off */
|
||||
#if defined(_MSC_VER)
|
||||
# pragma const_seg(push)
|
||||
# pragma data_seg(push)
|
||||
|
||||
# ifndef _M_IX86
|
||||
/* kick a linker to create the TLS directory if not already done */
|
||||
# pragma comment(linker, "/INCLUDE:_tls_used")
|
||||
/* Force some symbol references. */
|
||||
# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
|
||||
/* specific const-segment for WIN64 */
|
||||
# pragma const_seg(".CRT$XLB")
|
||||
const
|
||||
# else
|
||||
/* kick a linker to create the TLS directory if not already done */
|
||||
# pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
/* Force some symbol references. */
|
||||
# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
|
||||
/* specific data-segment for WIN32 */
|
||||
# pragma data_seg(".CRT$XLB")
|
||||
# endif
|
||||
|
||||
__declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
|
||||
# pragma data_seg(pop)
|
||||
# pragma const_seg(pop)
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
# ifndef _M_IX86
|
||||
const
|
||||
# endif
|
||||
PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
|
||||
#else
|
||||
# error FIXME
|
||||
#endif
|
||||
/* *INDENT-ON* */
|
||||
/* clang-format on */
|
||||
#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#define LCK_SHARED 0
|
||||
#define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK
|
||||
#define LCK_WAITFOR 0
|
||||
@ -145,17 +49,16 @@ static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags,
|
||||
return (int)rc;
|
||||
}
|
||||
|
||||
static __inline int flock(HANDLE fd, unsigned flags, size_t offset,
|
||||
size_t bytes) {
|
||||
static inline int flock(HANDLE fd, unsigned flags, size_t offset,
|
||||
size_t bytes) {
|
||||
return flock_with_event(fd, 0, flags, offset, bytes);
|
||||
}
|
||||
|
||||
static __inline int flock_data(const MDBX_env *env, unsigned flags,
|
||||
size_t offset, size_t bytes) {
|
||||
static inline int flock_data(const MDBX_env *env, unsigned flags, size_t offset,
|
||||
size_t bytes) {
|
||||
const HANDLE fd4data =
|
||||
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
|
||||
return flock_with_event(fd4data, env->me_data_lock_event, flags, offset,
|
||||
bytes);
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
return flock_with_event(fd4data, env->dxb_lock_event, flags, offset, bytes);
|
||||
}
|
||||
|
||||
static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
|
||||
@ -175,16 +78,16 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
|
||||
#else
|
||||
#define DXB_MAXLEN UINT32_C(0x7ff00000)
|
||||
#endif
|
||||
#define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN
|
||||
#define DXB_BODY (env->ps * (size_t)NUM_METAS), DXB_MAXLEN
|
||||
#define DXB_WHOLE 0, DXB_MAXLEN
|
||||
|
||||
int osal_txn_lock(MDBX_env *env, bool dontwait) {
|
||||
int lck_txn_lock(MDBX_env *env, bool dontwait) {
|
||||
if (dontwait) {
|
||||
if (!TryEnterCriticalSection(&env->me_windowsbug_lock))
|
||||
if (!TryEnterCriticalSection(&env->windowsbug_lock))
|
||||
return MDBX_BUSY;
|
||||
} else {
|
||||
__try {
|
||||
EnterCriticalSection(&env->me_windowsbug_lock);
|
||||
EnterCriticalSection(&env->windowsbug_lock);
|
||||
}
|
||||
__except ((GetExceptionCode() ==
|
||||
0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
|
||||
@ -194,93 +97,93 @@ int osal_txn_lock(MDBX_env *env, bool dontwait) {
|
||||
}
|
||||
}
|
||||
|
||||
eASSERT(env, !env->me_txn0->mt_owner);
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
eASSERT(env, !env->basal_txn->owner);
|
||||
if (env->flags & MDBX_EXCLUSIVE)
|
||||
goto done;
|
||||
|
||||
const HANDLE fd4data =
|
||||
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
|
||||
int rc = flock_with_event(fd4data, env->me_data_lock_event,
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
int rc = flock_with_event(fd4data, env->dxb_lock_event,
|
||||
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
|
||||
: (LCK_EXCLUSIVE | LCK_WAITFOR),
|
||||
DXB_BODY);
|
||||
if (rc == ERROR_LOCK_VIOLATION && dontwait) {
|
||||
SleepEx(0, true);
|
||||
rc = flock_with_event(fd4data, env->me_data_lock_event,
|
||||
rc = flock_with_event(fd4data, env->dxb_lock_event,
|
||||
LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY);
|
||||
if (rc == ERROR_LOCK_VIOLATION) {
|
||||
SleepEx(0, true);
|
||||
rc = flock_with_event(fd4data, env->me_data_lock_event,
|
||||
rc = flock_with_event(fd4data, env->dxb_lock_event,
|
||||
LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY);
|
||||
}
|
||||
}
|
||||
if (rc == MDBX_SUCCESS) {
|
||||
done:
|
||||
/* Zap: Failing to release lock 'env->me_windowsbug_lock'
|
||||
/* Zap: Failing to release lock 'env->windowsbug_lock'
|
||||
* in function 'mdbx_txn_lock' */
|
||||
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115);
|
||||
env->me_txn0->mt_owner = osal_thread_self();
|
||||
env->basal_txn->owner = osal_thread_self();
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
LeaveCriticalSection(&env->me_windowsbug_lock);
|
||||
LeaveCriticalSection(&env->windowsbug_lock);
|
||||
return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
|
||||
}
|
||||
|
||||
void osal_txn_unlock(MDBX_env *env) {
|
||||
eASSERT(env, env->me_txn0->mt_owner == osal_thread_self());
|
||||
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
|
||||
void lck_txn_unlock(MDBX_env *env) {
|
||||
eASSERT(env, env->basal_txn->owner == osal_thread_self());
|
||||
if ((env->flags & MDBX_EXCLUSIVE) == 0) {
|
||||
const HANDLE fd4data =
|
||||
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
int err = funlock(fd4data, DXB_BODY);
|
||||
if (err != MDBX_SUCCESS)
|
||||
mdbx_panic("%s failed: err %u", __func__, err);
|
||||
}
|
||||
env->me_txn0->mt_owner = 0;
|
||||
LeaveCriticalSection(&env->me_windowsbug_lock);
|
||||
env->basal_txn->owner = 0;
|
||||
LeaveCriticalSection(&env->windowsbug_lock);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* global `read` lock for readers registration,
|
||||
* exclusive locking `mti_numreaders` (second) cacheline */
|
||||
* exclusive locking `rdt_length` (second) cacheline */
|
||||
|
||||
#define LCK_LO_OFFSET 0
|
||||
#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
|
||||
#define LCK_LO_LEN offsetof(lck_t, rdt_length)
|
||||
#define LCK_UP_OFFSET LCK_LO_LEN
|
||||
#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET)
|
||||
#define LCK_UP_LEN (sizeof(lck_t) - LCK_UP_OFFSET)
|
||||
#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
|
||||
#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
|
||||
osal_srwlock_AcquireShared(&env->me_remap_guard);
|
||||
if (env->me_lfd == INVALID_HANDLE_VALUE)
|
||||
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env) {
|
||||
imports.srwl_AcquireShared(&env->remap_guard);
|
||||
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE)
|
||||
return MDBX_SUCCESS; /* readonly database in readonly filesystem */
|
||||
|
||||
/* transition from S-? (used) to S-E (locked),
|
||||
* e.g. exclusive lock upper-part */
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
if (env->flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
|
||||
int rc = flock(env->lck_mmap.fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
osal_srwlock_ReleaseShared(&env->me_remap_guard);
|
||||
imports.srwl_ReleaseShared(&env->remap_guard);
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
|
||||
if (env->me_lfd != INVALID_HANDLE_VALUE &&
|
||||
(env->me_flags & MDBX_EXCLUSIVE) == 0) {
|
||||
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env) {
|
||||
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE &&
|
||||
(env->flags & MDBX_EXCLUSIVE) == 0) {
|
||||
/* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
|
||||
int err = funlock(env->me_lfd, LCK_UPPER);
|
||||
int err = funlock(env->lck_mmap.fd, LCK_UPPER);
|
||||
if (err != MDBX_SUCCESS)
|
||||
mdbx_panic("%s failed: err %u", __func__, err);
|
||||
}
|
||||
osal_srwlock_ReleaseShared(&env->me_remap_guard);
|
||||
imports.srwl_ReleaseShared(&env->remap_guard);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
|
||||
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
|
||||
return flock(
|
||||
fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0,
|
||||
DXB_MAXLEN);
|
||||
@ -293,7 +196,7 @@ static int suspend_and_append(mdbx_handle_array_t **array,
|
||||
mdbx_handle_array_t *const ptr =
|
||||
osal_realloc((limit > ARRAY_LENGTH((*array)->handles))
|
||||
? *array
|
||||
: /* don't free initial array on the stack */ NULL,
|
||||
: /* don't free initial array on the stack */ nullptr,
|
||||
sizeof(mdbx_handle_array_t) +
|
||||
sizeof(HANDLE) * (limit * (size_t)2 -
|
||||
ARRAY_LENGTH((*array)->handles)));
|
||||
@ -307,7 +210,7 @@ static int suspend_and_append(mdbx_handle_array_t **array,
|
||||
|
||||
HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION,
|
||||
FALSE, ThreadId);
|
||||
if (hThread == NULL)
|
||||
if (hThread == nullptr)
|
||||
return (int)GetLastError();
|
||||
|
||||
if (SuspendThread(hThread) == (DWORD)-1) {
|
||||
@ -324,28 +227,27 @@ static int suspend_and_append(mdbx_handle_array_t **array,
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int
|
||||
MDBX_INTERNAL int
|
||||
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
|
||||
eASSERT(env, (env->me_flags & MDBX_NOSTICKYTHREADS) == 0);
|
||||
eASSERT(env, (env->flags & MDBX_NOSTICKYTHREADS) == 0);
|
||||
const uintptr_t CurrentTid = GetCurrentThreadId();
|
||||
int rc;
|
||||
if (env->me_lck_mmap.lck) {
|
||||
if (env->lck_mmap.lck) {
|
||||
/* Scan LCK for threads of the current process */
|
||||
const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
|
||||
const MDBX_reader *const end =
|
||||
const reader_slot_t *const begin = env->lck_mmap.lck->rdt;
|
||||
const reader_slot_t *const end =
|
||||
begin +
|
||||
atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease);
|
||||
const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0;
|
||||
for (const MDBX_reader *reader = begin; reader < end; ++reader) {
|
||||
if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) {
|
||||
atomic_load32(&env->lck_mmap.lck->rdt_length, mo_AcquireRelease);
|
||||
const uintptr_t WriteTxnOwner = env->basal_txn ? env->basal_txn->owner : 0;
|
||||
for (const reader_slot_t *reader = begin; reader < end; ++reader) {
|
||||
if (reader->pid.weak != env->pid || !reader->tid.weak) {
|
||||
skip_lck:
|
||||
continue;
|
||||
}
|
||||
if (reader->mr_tid.weak == CurrentTid ||
|
||||
reader->mr_tid.weak == WriteTxnOwner)
|
||||
if (reader->tid.weak == CurrentTid || reader->tid.weak == WriteTxnOwner)
|
||||
goto skip_lck;
|
||||
|
||||
rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
|
||||
rc = suspend_and_append(array, (mdbx_tid_t)reader->tid.weak);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
bailout_lck:
|
||||
(void)osal_resume_threads_after_remap(*array);
|
||||
@ -360,7 +262,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
|
||||
} else {
|
||||
/* Without LCK (i.e. read-only mode).
|
||||
* Walk through a snapshot of all running threads */
|
||||
eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
|
||||
eASSERT(env, env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
|
||||
const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
|
||||
if (hSnapshot == INVALID_HANDLE_VALUE)
|
||||
return (int)GetLastError();
|
||||
@ -377,7 +279,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
|
||||
}
|
||||
|
||||
do {
|
||||
if (entry.th32OwnerProcessID != env->me_pid ||
|
||||
if (entry.th32OwnerProcessID != env->pid ||
|
||||
entry.th32ThreadID == CurrentTid)
|
||||
continue;
|
||||
|
||||
@ -396,8 +298,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int
|
||||
osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
|
||||
MDBX_INTERNAL int osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
|
||||
int rc = MDBX_SUCCESS;
|
||||
for (unsigned i = 0; i < array->count; ++i) {
|
||||
const HANDLE hThread = array->handles[i];
|
||||
@ -426,6 +327,7 @@ osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
|
||||
* Only 6 states of the FSM are used, 2 of which are transitive.
*
* States:
*     LO HI
*   ?-? = free, i.e. unlocked
*   S-? = used, i.e. shared lock
*   E-? = exclusive-read, i.e. operational exclusive
@ -436,39 +338,39 @@ osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
* E-S
* E-E = exclusive-write, i.e. exclusive due to (re)initialization
*
* The osal_lck_seize() moves the locking-FSM from the initial free/unlocked
* The lck_seize() moves the locking-FSM from the initial free/unlocked
* state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible,
* or to the "used" (and returns MDBX_RESULT_FALSE).
*
* The osal_lck_downgrade() moves the locking-FSM from "exclusive write"
* The lck_downgrade() moves the locking-FSM from "exclusive write"
* state to the "used" (i.e. shared) state.
*
* The osal_lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
* The lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
* state to the "exclusive write" state.
*/
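As a clarifying aside, below is a tiny standalone toy model (not libmdbx code) of the two-level LO/HI flock FSM described above. It only mirrors the state names and the seize/downgrade/upgrade transitions implemented by lck_seize(), lck_downgrade() and lck_upgrade(); the intermediate ?-E and S-E steps of the real code are collapsed into single assignments.

#include <stdio.h>

typedef enum { UNLOCKED = '?', SHARED = 'S', EXCLUSIVE = 'E' } half_t;
typedef struct { half_t lo, hi; } lck_fsm_t;

/* seize: ?-? -> E-E (exclusive-write) when possible, otherwise S-? (used) */
static int fsm_seize(lck_fsm_t *s, int exclusive_available) {
  if (exclusive_available) {
    s->lo = s->hi = EXCLUSIVE;
    return -1; /* mimics MDBX_RESULT_TRUE */
  }
  s->lo = SHARED;
  s->hi = UNLOCKED;
  return 0; /* mimics MDBX_RESULT_FALSE */
}

/* downgrade: E-E -> ?-E -> S-E -> S-? (used, i.e. shared) */
static void fsm_downgrade(lck_fsm_t *s) { s->lo = SHARED; s->hi = UNLOCKED; }

/* upgrade: S-? -> S-E -> ?-E -> E-E (exclusive-write) */
static void fsm_upgrade(lck_fsm_t *s) { s->lo = s->hi = EXCLUSIVE; }

int main(void) {
  lck_fsm_t s = {UNLOCKED, UNLOCKED};
  printf("seize -> %d, state %c-%c\n", fsm_seize(&s, 1), s.lo, s.hi);
  fsm_downgrade(&s);
  printf("downgrade -> state %c-%c\n", s.lo, s.hi);
  fsm_upgrade(&s);
  printf("upgrade -> state %c-%c\n", s.lo, s.hi);
  return 0;
}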
|
||||
|
||||
static void lck_unlock(MDBX_env *env) {
|
||||
int err;
|
||||
|
||||
if (env->me_lfd != INVALID_HANDLE_VALUE) {
|
||||
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
|
||||
/* double `unlock` for robustly remove overlapped shared/exclusive locks */
|
||||
do
|
||||
err = funlock(env->me_lfd, LCK_LOWER);
|
||||
err = funlock(env->lck_mmap.fd, LCK_LOWER);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
|
||||
do
|
||||
err = funlock(env->me_lfd, LCK_UPPER);
|
||||
err = funlock(env->lck_mmap.fd, LCK_UPPER);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
}
|
||||
|
||||
const HANDLE fd4data =
|
||||
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
if (fd4data != INVALID_HANDLE_VALUE) {
|
||||
/* explicitly unlock to avoid latency for other processes (windows kernel
|
||||
* releases such locks via deferred queues) */
|
||||
@ -476,14 +378,14 @@ static void lck_unlock(MDBX_env *env) {
|
||||
err = funlock(fd4data, DXB_BODY);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
|
||||
do
|
||||
err = funlock(fd4data, DXB_WHOLE);
|
||||
while (err == MDBX_SUCCESS);
|
||||
assert(err == ERROR_NOT_LOCKED ||
|
||||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
|
||||
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
|
||||
SetLastError(ERROR_SUCCESS);
|
||||
}
|
||||
}
|
||||
@ -539,16 +441,16 @@ static int internal_seize_lck(HANDLE lfd) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
MDBX_INTERNAL int lck_seize(MDBX_env *env) {
|
||||
const HANDLE fd4data =
|
||||
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
assert(fd4data != INVALID_HANDLE_VALUE);
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
if (env->flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_RESULT_TRUE /* nope since files must be opened
non-shareable */
;
|
||||
|
||||
if (env->me_lfd == INVALID_HANDLE_VALUE) {
|
||||
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
|
||||
/* LY: without-lck mode (e.g. on read-only filesystem) */
|
||||
jitter4testing(false);
|
||||
int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
|
||||
@ -557,9 +459,9 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
int rc = internal_seize_lck(env->me_lfd);
|
||||
int rc = internal_seize_lck(env->lck_mmap.fd);
|
||||
jitter4testing(false);
|
||||
if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
|
||||
if (rc == MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) {
|
||||
/* Check that no other process operates in without-lck mode.
* Such a check is done by exclusively locking the body-part of the db. It
* should be noted:
|
||||
@ -583,24 +485,24 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
|
||||
MDBX_INTERNAL int lck_downgrade(MDBX_env *env) {
|
||||
const HANDLE fd4data =
|
||||
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
|
||||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||||
/* Transition from exclusive-write state (E-E) to used (S-?) */
|
||||
assert(fd4data != INVALID_HANDLE_VALUE);
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
if (env->flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_SUCCESS /* nope since files must be opened non-shareable */
|
||||
;
|
||||
/* 1) now at E-E (exclusive-write), transition to ?_E (middle) */
|
||||
int rc = funlock(env->me_lfd, LCK_LOWER);
|
||||
int rc = funlock(env->lck_mmap.fd, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__,
|
||||
"E-E(exclusive-write) >> ?-E(middle)", rc);
|
||||
|
||||
/* 2) now at ?-E (middle), transition to S-E (locked) */
|
||||
rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
|
||||
rc = flock(env->lck_mmap.fd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
/* 3) something went wrong, give up */;
|
||||
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
|
||||
@ -608,7 +510,7 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
|
||||
}
|
||||
|
||||
/* 4) got S-E (locked), continue transition to S-? (used) */
|
||||
rc = funlock(env->me_lfd, LCK_UPPER);
|
||||
rc = funlock(env->lck_mmap.fd, LCK_UPPER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
|
||||
rc);
|
||||
@ -616,17 +518,17 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
|
||||
return MDBX_SUCCESS /* 5) now at S-? (used), done */;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
/* Transition from used state (S-?) to exclusive-write (E-E) */
|
||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
|
||||
|
||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||
if (env->flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_SUCCESS /* nope since files must be opened non-shareable */
|
||||
;
|
||||
|
||||
/* 1) now on S-? (used), try S-E (locked) */
|
||||
jitter4testing(false);
|
||||
int rc = flock(env->me_lfd,
|
||||
int rc = flock(env->lck_mmap.fd,
|
||||
dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE,
|
||||
LCK_UPPER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
@ -636,14 +538,14 @@ MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
}
|
||||
|
||||
/* 3) now on S-E (locked), transition to ?-E (middle) */
|
||||
rc = funlock(env->me_lfd, LCK_LOWER);
|
||||
rc = funlock(env->lck_mmap.fd, LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS)
|
||||
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
|
||||
rc);
|
||||
|
||||
/* 4) now on ?-E (middle), try E-E (exclusive-write) */
|
||||
jitter4testing(false);
|
||||
rc = flock(env->me_lfd,
|
||||
rc = flock(env->lck_mmap.fd,
|
||||
dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE,
|
||||
LCK_LOWER);
|
||||
if (rc != MDBX_SUCCESS) {
|
||||
@ -655,25 +557,24 @@ MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
|
||||
return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
int global_uniqueness_flag) {
|
||||
MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor,
|
||||
int global_uniqueness_flag) {
|
||||
(void)env;
|
||||
(void)inprocess_neighbor;
|
||||
(void)global_uniqueness_flag;
|
||||
if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) {
|
||||
if (imports.SetFileIoOverlappedRange && !(env->flags & MDBX_RDONLY)) {
|
||||
HANDLE token = INVALID_HANDLE_VALUE;
|
||||
TOKEN_PRIVILEGES privileges;
|
||||
privileges.PrivilegeCount = 1;
|
||||
privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
|
||||
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES,
|
||||
&token) ||
|
||||
!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
|
||||
!LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME,
|
||||
&privileges.Privileges[0].Luid) ||
|
||||
!AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges),
|
||||
nullptr, nullptr) ||
|
||||
GetLastError() != ERROR_SUCCESS)
|
||||
mdbx_SetFileIoOverlappedRange = NULL;
|
||||
imports.SetFileIoOverlappedRange = nullptr;
|
||||
|
||||
if (token != INVALID_HANDLE_VALUE)
|
||||
CloseHandle(token);
|
||||
@ -681,21 +582,21 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
const uint32_t current_pid) {
|
||||
MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor,
|
||||
const uint32_t current_pid) {
|
||||
(void)current_pid;
|
||||
/* LY: should unmap before releasing the locks to avoid race condition and
|
||||
* STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */
|
||||
if (env->me_map)
|
||||
osal_munmap(&env->me_dxb_mmap);
|
||||
if (env->me_lck_mmap.lck) {
|
||||
const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0;
|
||||
osal_munmap(&env->me_lck_mmap);
|
||||
if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE &&
|
||||
osal_lck_upgrade(env, true) == MDBX_SUCCESS)
|
||||
if (env->dxb_mmap.base)
|
||||
osal_munmap(&env->dxb_mmap);
|
||||
if (env->lck_mmap.lck) {
|
||||
const bool synced = env->lck_mmap.lck->unsynced_pages.weak == 0;
|
||||
osal_munmap(&env->lck_mmap);
|
||||
if (synced && !inprocess_neighbor &&
|
||||
env->lck_mmap.fd != INVALID_HANDLE_VALUE &&
|
||||
lck_upgrade(env, true) == MDBX_SUCCESS)
|
||||
/* this will fail if LCK is used/mmapped by other process(es) */
|
||||
osal_ftruncate(env->me_lfd, 0);
|
||||
osal_ftruncate(env->lck_mmap.fd, 0);
|
||||
}
|
||||
lck_unlock(env);
|
||||
return MDBX_SUCCESS;
|
||||
@ -704,12 +605,12 @@ MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* reader checking (by pid) */
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
|
||||
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env) {
|
||||
(void)env;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
|
||||
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env) {
|
||||
(void)env;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
@ -720,7 +621,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
|
||||
* MDBX_RESULT_TRUE, if pid is live (unable to acquire lock)
|
||||
* MDBX_RESULT_FALSE, if pid is dead (lock acquired)
|
||||
* or otherwise the errcode. */
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
|
||||
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid) {
|
||||
(void)env;
|
||||
HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid);
|
||||
int rc;
|
||||
@ -753,169 +654,4 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
|
||||
}
|
||||
}
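The liveness probe above relies on the fact that a Windows process handle becomes signaled when the process terminates. A hedged standalone sketch of that check (the helper name is illustrative, not libmdbx code):

#include <stdio.h>
#include <windows.h>

static int pid_is_alive(DWORD pid) {
  HANDLE h = OpenProcess(SYNCHRONIZE, FALSE, pid);
  if (!h)
    /* ACCESS_DENIED means the process exists but we lack rights; any other
     * failure is treated here, simplistically, as "gone". */
    return GetLastError() == ERROR_ACCESS_DENIED;
  const DWORD waitrc = WaitForSingleObject(h, 0);
  CloseHandle(h);
  return waitrc == WAIT_TIMEOUT; /* still running; WAIT_OBJECT_0 = terminated */
}

int main(void) {
  printf("self alive: %d\n", pid_is_alive(GetCurrentProcessId()));
  return 0;
}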
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Stub for slim read-write lock
|
||||
// Copyright (C) 1995-2002 Brad Wilson
|
||||
|
||||
static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) {
|
||||
srwl->readerCount = srwl->writerCount = 0;
|
||||
}
|
||||
|
||||
static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) {
|
||||
while (true) {
|
||||
assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
|
||||
|
||||
// If there's a writer already, spin without unnecessarily
|
||||
// interlocking the CPUs
|
||||
if (srwl->writerCount != 0) {
|
||||
SwitchToThread();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add to the readers list
|
||||
_InterlockedIncrement(&srwl->readerCount);
|
||||
|
||||
// Check for writers again (we may have been preempted). If
|
||||
// there are no writers writing or waiting, then we're done.
|
||||
if (srwl->writerCount == 0)
|
||||
break;
|
||||
|
||||
// Remove from the readers list, spin, try again
|
||||
_InterlockedDecrement(&srwl->readerCount);
|
||||
SwitchToThread();
|
||||
}
|
||||
}
|
||||
|
||||
static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) {
|
||||
assert(srwl->readerCount > 0);
|
||||
_InterlockedDecrement(&srwl->readerCount);
|
||||
}
|
||||
|
||||
static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) {
|
||||
while (true) {
|
||||
assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
|
||||
|
||||
// If there's a writer already, spin without unnecessarily
|
||||
// interlocking the CPUs
|
||||
if (srwl->writerCount != 0) {
|
||||
SwitchToThread();
|
||||
continue;
|
||||
}
|
||||
|
||||
// See if we can become the writer (expensive, because it inter-
|
||||
// locks the CPUs, so writing should be an infrequent process)
|
||||
if (_InterlockedExchange(&srwl->writerCount, 1) == 0)
|
||||
break;
|
||||
}
|
||||
|
||||
// Now we're the writer, but there may be outstanding readers.
|
||||
// Spin until there aren't any more; new readers will wait now
|
||||
// that we're the writer.
|
||||
while (srwl->readerCount != 0) {
|
||||
assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
|
||||
SwitchToThread();
|
||||
}
|
||||
}
|
||||
|
||||
static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) {
|
||||
assert(srwl->writerCount == 1 && srwl->readerCount >= 0);
|
||||
srwl->writerCount = 0;
|
||||
}
|
||||
|
||||
static uint64_t WINAPI stub_GetTickCount64(void) {
|
||||
LARGE_INTEGER Counter, Frequency;
|
||||
return (QueryPerformanceFrequency(&Frequency) &&
|
||||
QueryPerformanceCounter(&Counter))
|
||||
? Counter.QuadPart * 1000ul / Frequency.QuadPart
|
||||
: 0;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#ifndef xMDBX_ALLOY
|
||||
osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared,
|
||||
osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive,
|
||||
osal_srwlock_ReleaseExclusive;
|
||||
|
||||
MDBX_NtExtendSection mdbx_NtExtendSection;
|
||||
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
|
||||
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
|
||||
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
|
||||
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
|
||||
MDBX_NtFsControlFile mdbx_NtFsControlFile;
|
||||
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
|
||||
MDBX_GetTickCount64 mdbx_GetTickCount64;
|
||||
MDBX_RegGetValueA mdbx_RegGetValueA;
|
||||
MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
|
||||
#endif /* xMDBX_ALLOY */
|
||||
|
||||
#if __GNUC_PREREQ(8, 0)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wcast-function-type"
|
||||
#endif /* GCC/MINGW */
|
||||
|
||||
static void mdbx_winnt_import(void) {
|
||||
#define GET_PROC_ADDR(dll, ENTRY) \
|
||||
mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY)
|
||||
|
||||
const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");
|
||||
if (hNtdll) {
|
||||
if (GetProcAddress(hNtdll, "wine_get_version")) {
|
||||
assert(mdbx_RunningUnderWine());
|
||||
} else {
|
||||
GET_PROC_ADDR(hNtdll, NtFsControlFile);
|
||||
GET_PROC_ADDR(hNtdll, NtExtendSection);
|
||||
assert(!mdbx_RunningUnderWine());
|
||||
}
|
||||
}
|
||||
|
||||
const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll");
|
||||
if (hKernel32dll) {
|
||||
GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx);
|
||||
GET_PROC_ADDR(hKernel32dll, GetTickCount64);
|
||||
if (!mdbx_GetTickCount64)
|
||||
mdbx_GetTickCount64 = stub_GetTickCount64;
|
||||
if (!mdbx_RunningUnderWine()) {
|
||||
GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle);
|
||||
GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
|
||||
GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
|
||||
GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
|
||||
GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange);
|
||||
}
|
||||
}
|
||||
|
||||
const osal_srwlock_t_function init =
|
||||
(osal_srwlock_t_function)(hKernel32dll
|
||||
? GetProcAddress(hKernel32dll,
|
||||
"InitializeSRWLock")
|
||||
: nullptr);
|
||||
if (init != NULL) {
|
||||
osal_srwlock_Init = init;
|
||||
osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress(
|
||||
hKernel32dll, "AcquireSRWLockShared");
|
||||
osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress(
|
||||
hKernel32dll, "ReleaseSRWLockShared");
|
||||
osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress(
|
||||
hKernel32dll, "AcquireSRWLockExclusive");
|
||||
osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress(
|
||||
hKernel32dll, "ReleaseSRWLockExclusive");
|
||||
} else {
|
||||
osal_srwlock_Init = stub_srwlock_Init;
|
||||
osal_srwlock_AcquireShared = stub_srwlock_AcquireShared;
|
||||
osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared;
|
||||
osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive;
|
||||
osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
|
||||
}
|
||||
|
||||
const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
|
||||
if (hAdvapi32dll) {
|
||||
GET_PROC_ADDR(hAdvapi32dll, RegGetValueA);
|
||||
}
|
||||
#undef GET_PROC_ADDR
|
||||
}
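mdbx_winnt_import() above follows the usual "GetProcAddress or fall back to a stub" pattern for APIs that may be absent on older Windows or under Wine. A standalone sketch of the same pattern for a single entry point (illustrative names; the function-pointer cast is what triggers the -Wcast-function-type warning suppressed by the pragma above):

#include <stdio.h>
#include <windows.h>

typedef ULONGLONG(WINAPI *GetTickCount64_t)(void);

static ULONGLONG WINAPI stub_tick64(void) {
  return GetTickCount(); /* 32-bit fallback, wraps after ~49.7 days */
}

int main(void) {
  GetTickCount64_t tick64 = (GetTickCount64_t)GetProcAddress(
      GetModuleHandleA("kernel32.dll"), "GetTickCount64");
  if (!tick64)
    tick64 = stub_tick64; /* same calling convention, degraded precision */
  printf("uptime: %llu ms\n", (unsigned long long)tick64());
  return 0;
}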
|
||||
|
||||
#if __GNUC_PREREQ(8, 0)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif /* GCC/MINGW */
|
||||
|
||||
#endif /* Windows LCK-implementation */
|
||||
#endif /* Windows */
|
||||
|
193
src/lck.c
Normal file
193
src/lck.c
Normal file
@ -0,0 +1,193 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold static int lck_setup_locked(MDBX_env *env) {
|
||||
int err = rthc_register(env);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
int lck_seize_rc = lck_seize(env);
|
||||
if (unlikely(MDBX_IS_ERROR(lck_seize_rc)))
|
||||
return lck_seize_rc;
|
||||
|
||||
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
|
||||
env->lck = lckless_stub(env);
|
||||
env->max_readers = UINT_MAX;
|
||||
DEBUG("lck-setup:%s%s%s", " lck-less",
|
||||
(env->flags & MDBX_RDONLY) ? " readonly" : "",
|
||||
(lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
|
||||
return lck_seize_rc;
|
||||
}
|
||||
|
||||
DEBUG("lck-setup:%s%s%s", " with-lck",
|
||||
(env->flags & MDBX_RDONLY) ? " readonly" : "",
|
||||
(lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
|
||||
|
||||
MDBX_env *inprocess_neighbor = nullptr;
|
||||
err = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
if (inprocess_neighbor) {
|
||||
if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 ||
|
||||
(inprocess_neighbor->flags & MDBX_EXCLUSIVE) != 0)
|
||||
return MDBX_BUSY;
|
||||
if (lck_seize_rc == MDBX_RESULT_TRUE) {
|
||||
err = lck_downgrade(env);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
lck_seize_rc = MDBX_RESULT_FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t size = 0;
|
||||
err = osal_filesize(env->lck_mmap.fd, &size);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
if (lck_seize_rc == MDBX_RESULT_TRUE) {
|
||||
size =
|
||||
ceil_powerof2(env->max_readers * sizeof(reader_slot_t) + sizeof(lck_t),
|
||||
globals.sys_pagesize);
|
||||
jitter4testing(false);
|
||||
} else {
|
||||
if (env->flags & MDBX_EXCLUSIVE)
|
||||
return MDBX_BUSY;
|
||||
if (size > INT_MAX || (size & (globals.sys_pagesize - 1)) != 0 ||
|
||||
size < globals.sys_pagesize) {
|
||||
ERROR("lck-file has invalid size %" PRIu64 " bytes", size);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t maxreaders =
|
||||
((size_t)size - sizeof(lck_t)) / sizeof(reader_slot_t);
|
||||
if (maxreaders < 4) {
|
||||
ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
env->max_readers = (maxreaders <= MDBX_READERS_LIMIT)
|
||||
? (unsigned)maxreaders
|
||||
: (unsigned)MDBX_READERS_LIMIT;
|
||||
|
||||
err = osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap,
|
||||
(size_t)size, (size_t)size,
|
||||
lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE
|
||||
: MMAP_OPTION_SEMAPHORE);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
#if MDBX_ENABLE_MADVISE
|
||||
#ifdef MADV_DODUMP
|
||||
err = madvise(env->lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#endif /* MADV_DODUMP */
|
||||
|
||||
#ifdef MADV_WILLNEED
|
||||
err = madvise(env->lck_mmap.lck, size, MADV_WILLNEED) ? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#elif defined(POSIX_MADV_WILLNEED)
|
||||
err = ignore_enosys(
|
||||
posix_madvise(env->lck_mmap.lck, size, POSIX_MADV_WILLNEED));
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#endif /* MADV_WILLNEED */
|
||||
#endif /* MDBX_ENABLE_MADVISE */
|
||||
|
||||
lck_t *lck = env->lck_mmap.lck;
|
||||
if (lck_seize_rc == MDBX_RESULT_TRUE) {
|
||||
/* If we succeeded in getting the exclusive lock, then nobody is using
* the lock region and we should initialize it. */
|
||||
memset(lck, 0, (size_t)size);
|
||||
jitter4testing(false);
|
||||
lck->magic_and_version = MDBX_LOCK_MAGIC;
|
||||
lck->os_and_format = MDBX_LOCK_FORMAT;
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
lck->pgops.wops.weak = 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
err = osal_msync(&env->lck_mmap, 0, (size_t)size,
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err);
|
||||
eASSERT(env, MDBX_IS_ERROR(err));
|
||||
return err;
|
||||
}
|
||||
} else {
|
||||
if (lck->magic_and_version != MDBX_LOCK_MAGIC) {
|
||||
const bool invalid = (lck->magic_and_version >> 8) != MDBX_MAGIC;
|
||||
ERROR("lock region has %s",
|
||||
invalid
|
||||
? "invalid magic"
|
||||
: "incompatible version (only applications with nearly or the "
|
||||
"same versions of libmdbx can share the same database)");
|
||||
return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH;
|
||||
}
|
||||
if (lck->os_and_format != MDBX_LOCK_FORMAT) {
|
||||
ERROR("lock region has os/format signature 0x%" PRIx32
|
||||
", expected 0x%" PRIx32,
|
||||
lck->os_and_format, MDBX_LOCK_FORMAT);
|
||||
return MDBX_VERSION_MISMATCH;
|
||||
}
|
||||
}
|
||||
|
||||
err = lck_init(env, inprocess_neighbor, lck_seize_rc);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
eASSERT(env, MDBX_IS_ERROR(err));
|
||||
return err;
|
||||
}
|
||||
|
||||
env->lck = lck;
|
||||
eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc));
|
||||
return lck_seize_rc;
|
||||
}
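The sizing logic in lck_setup_locked() is reversible: the first (exclusive) process derives the LCK-file size from max_readers, and later (cooperative) processes derive the reader limit back from the observed file size. A standalone arithmetic sketch with hypothetical structure sizes (the real sizeof(lck_t) and sizeof(reader_slot_t) differ):

#include <stddef.h>
#include <stdio.h>

static size_t ceil_pow2(size_t n, size_t align) {
  return (n + align - 1) & ~(align - 1); /* align must be a power of two */
}

int main(void) {
  const size_t sizeof_lck = 4096;  /* hypothetical sizeof(lck_t) */
  const size_t sizeof_slot = 32;   /* hypothetical sizeof(reader_slot_t) */
  const size_t pagesize = 4096, max_readers = 1000;

  /* first process (exclusive lock): choose the LCK-file size */
  const size_t size = ceil_pow2(max_readers * sizeof_slot + sizeof_lck, pagesize);
  printf("lck-file size: %zu bytes\n", size);    /* 36864 */

  /* later processes (cooperative): recover the limit from the file size */
  const size_t derived = (size - sizeof_lck) / sizeof_slot;
  printf("derived max readers: %zu\n", derived); /* 1024 */
  return 0;
}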
|
||||
|
||||
__cold int lck_setup(MDBX_env *env, mdbx_mode_t mode) {
|
||||
eASSERT(env, env->lazy_fd != INVALID_HANDLE_VALUE);
|
||||
eASSERT(env, env->lck_mmap.fd == INVALID_HANDLE_VALUE);
|
||||
|
||||
int err = osal_openfile(MDBX_OPEN_LCK, env, env->pathname.lck,
|
||||
&env->lck_mmap.fd, mode);
|
||||
if (err != MDBX_SUCCESS) {
|
||||
switch (err) {
|
||||
default:
|
||||
return err;
|
||||
case MDBX_ENOFILE:
|
||||
case MDBX_EACCESS:
|
||||
case MDBX_EPERM:
|
||||
if (!F_ISSET(env->flags, MDBX_RDONLY | MDBX_EXCLUSIVE))
|
||||
return err;
|
||||
break;
|
||||
case MDBX_EROFS:
|
||||
if ((env->flags & MDBX_RDONLY) == 0)
|
||||
return err;
|
||||
break;
|
||||
}
|
||||
|
||||
if (err != MDBX_ENOFILE) {
|
||||
/* ENSURE the file system is read-only */
|
||||
err = osal_check_fs_rdonly(env->lazy_fd, env->pathname.lck, err);
|
||||
if (err != MDBX_SUCCESS &&
|
||||
/* ignore ERROR_NOT_SUPPORTED for exclusive mode */
|
||||
!(err == MDBX_ENOSYS && (env->flags & MDBX_EXCLUSIVE)))
|
||||
return err;
|
||||
}
|
||||
|
||||
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
|
||||
env->lck_mmap.fd = INVALID_HANDLE_VALUE;
|
||||
}
|
||||
|
||||
rthc_lock();
|
||||
err = lck_setup_locked(env);
|
||||
rthc_unlock();
|
||||
return err;
|
||||
}
|
||||
|
||||
void mincore_clean_cache(const MDBX_env *const env) {
|
||||
memset(env->lck->mincore_cache.begin, -1,
|
||||
sizeof(env->lck->mincore_cache.begin));
|
||||
}
|
112
src/lck.h
Normal file
112
src/lck.h
Normal file
@ -0,0 +1,112 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
MDBX_INTERNAL int lck_setup(MDBX_env *env, mdbx_mode_t mode);
|
||||
#if MDBX_LOCKING > MDBX_LOCKING_SYSV
|
||||
MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc);
|
||||
MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc);
|
||||
#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */
|
||||
|
||||
/// \brief Initialization of synchronization primitives linked with MDBX_env
|
||||
/// instance both in LCK-file and within the current process.
|
||||
/// \param
|
||||
/// global_uniqueness_flag = true - denotes that there are no other processes
|
||||
/// working with DB and LCK-file. Thus the function MUST initialize
|
||||
/// shared synchronization objects in memory-mapped LCK-file.
|
||||
/// global_uniqueness_flag = false - denotes that at least one process is
|
||||
/// already working with DB and LCK-file, including the case when DB
|
||||
/// has already been opened in the current process. Thus the function
|
||||
/// MUST NOT initialize shared synchronization objects in memory-mapped
|
||||
/// LCK-file that are already in use.
|
||||
/// \return Error code or zero on success.
|
||||
MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor,
|
||||
int global_uniqueness_flag);
|
||||
|
||||
/// \brief Disconnects from shared interprocess objects and destructs
|
||||
/// synchronization objects linked with MDBX_env instance
|
||||
/// within the current process.
|
||||
/// \param
|
||||
/// inprocess_neighbor = nullptr - if the current process does not have other
|
||||
/// instances of MDBX_env linked with the DB being closed.
|
||||
/// Thus the function MUST check for other processes working with DB or
|
||||
/// LCK-file, and keep or destroy shared synchronization objects in
|
||||
/// memory-mapped LCK-file depending on the result.
|
||||
/// inprocess_neighbor = not-nullptr - pointer to another instance of MDBX_env
/// (any one of them, if there are several) working with DB or LCK-file within
/// the current process. Thus the function MUST NOT try to acquire exclusive
|
||||
/// lock and/or try to destruct shared synchronization objects linked with
|
||||
/// DB or LCK-file. Moreover, the implementation MUST ensure correct work
|
||||
/// of other instances of MDBX_env within the current process, e.g.
|
||||
/// restore POSIX-fcntl locks after the closing of file descriptors.
|
||||
/// \return Error code (MDBX_PANIC) or zero on success.
|
||||
MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor,
|
||||
const uint32_t current_pid);
|
||||
|
||||
/// \brief Connects to shared interprocess locking objects and tries to acquire
|
||||
/// the maximum lock level (shared if exclusive is not available)
|
||||
/// Depending on the implementation and/or platform (Windows), this function may
|
||||
/// acquire the non-OS super-level lock (e.g. for shared synchronization
|
||||
/// objects initialization), which will be downgraded to OS-exclusive or
|
||||
/// shared via explicit calling of lck_downgrade().
|
||||
/// \return
|
||||
/// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
|
||||
/// the current process is the first and only after the last use of DB.
|
||||
/// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
|
||||
/// DB has already been opened and now is used by other processes.
|
||||
/// Otherwise (not 0 and not -1) - error code.
|
||||
MDBX_INTERNAL int lck_seize(MDBX_env *env);
|
||||
|
||||
/// \brief Downgrades the level of initially acquired lock to
|
||||
/// operational level specified by argument. The reason for such downgrade:
|
||||
/// - unblocking of other processes that are waiting for access, i.e.
|
||||
/// if (env->flags & MDBX_EXCLUSIVE) != 0, then other processes
|
||||
/// should be made aware that access is unavailable rather than
|
||||
/// wait for it.
|
||||
/// - freeing locks that interfere with file operations (especially on Windows)
|
||||
/// (env->flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
|
||||
/// (env->flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
|
||||
/// operational lock.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL int lck_downgrade(MDBX_env *env);
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait);
|
||||
|
||||
/// \brief Locks LCK-file or/and table of readers for (de)registering.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env);
|
||||
|
||||
/// \brief Unlocks LCK-file or/and table of readers after (de)registering.
|
||||
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env);
|
||||
|
||||
/// \brief Acquires write-transaction lock.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL int lck_txn_lock(MDBX_env *env, bool dont_wait);
|
||||
|
||||
/// \brief Releases the write-transaction lock.
|
||||
MDBX_INTERNAL void lck_txn_unlock(MDBX_env *env);
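For illustration, a compile-only sketch (assuming the internal declarations above; the wrapper name is hypothetical, not libmdbx code) of how the write-transaction lock is expected to be paired by its callers:

#include "internals.h"

static int sketch_write_txn(MDBX_env *env) {
  int rc = lck_txn_lock(env, /* dont_wait */ false);
  if (rc != MDBX_SUCCESS)
    return rc; /* with dont_wait=true this may be MDBX_BUSY instead of blocking */
  /* ... prepare and commit the write transaction under the writer lock ... */
  lck_txn_unlock(env);
  return MDBX_SUCCESS;
}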
|
||||
|
||||
/// \brief Sets alive-flag of reader presence (indicative lock) for PID of
|
||||
/// the current process. The function does no more than needed for
|
||||
/// the correct working of lck_rpid_check() in other processes.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env);
|
||||
|
||||
/// \brief Resets alive-flag of reader presence (indicative lock)
|
||||
/// for PID of the current process. The function does no more than needed
|
||||
/// for the correct working of lck_rpid_check() in other processes.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env);
|
||||
|
||||
/// \brief Checks for reading process status with the given pid with help of
|
||||
/// alive-flag of presence (indicative lock) or using another way.
|
||||
/// \return
|
||||
/// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
|
||||
/// and working with DB (indicative lock is present).
|
||||
/// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
|
||||
/// or not working with DB (indicative lock is not present).
|
||||
/// Otherwise (not 0 and not -1) - error code.
|
||||
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid);
|
261
src/logging_and_debug.c
Normal file
261
src/logging_and_debug.c
Normal file
@ -0,0 +1,261 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold void debug_log_va(int level, const char *function, int line,
|
||||
const char *fmt, va_list args) {
|
||||
ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0);
|
||||
if (globals.logger.ptr) {
|
||||
if (globals.logger_buffer == nullptr)
|
||||
globals.logger.fmt(level, function, line, fmt, args);
|
||||
else {
|
||||
const int len = vsnprintf(globals.logger_buffer,
|
||||
globals.logger_buffer_size, fmt, args);
|
||||
if (len > 0)
|
||||
globals.logger.nofmt(level, function, line, globals.logger_buffer, len);
|
||||
}
|
||||
} else {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (IsDebuggerPresent()) {
|
||||
int prefix_len = 0;
|
||||
char *prefix = nullptr;
|
||||
if (function && line > 0)
|
||||
prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line);
|
||||
else if (function)
|
||||
prefix_len = osal_asprintf(&prefix, "%s: ", function);
|
||||
else if (line > 0)
|
||||
prefix_len = osal_asprintf(&prefix, "%d: ", line);
|
||||
if (prefix_len > 0 && prefix) {
|
||||
OutputDebugStringA(prefix);
|
||||
osal_free(prefix);
|
||||
}
|
||||
char *msg = nullptr;
|
||||
int msg_len = osal_vasprintf(&msg, fmt, args);
|
||||
if (msg_len > 0 && msg) {
|
||||
OutputDebugStringA(msg);
|
||||
osal_free(msg);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (function && line > 0)
|
||||
fprintf(stderr, "%s:%d ", function, line);
|
||||
else if (function)
|
||||
fprintf(stderr, "%s: ", function);
|
||||
else if (line > 0)
|
||||
fprintf(stderr, "%d: ", line);
|
||||
vfprintf(stderr, fmt, args);
|
||||
fflush(stderr);
|
||||
#endif
|
||||
}
|
||||
ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0);
|
||||
}
|
||||
|
||||
__cold void debug_log(int level, const char *function, int line,
|
||||
const char *fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
debug_log_va(level, function, line, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
/* Dump a val in ascii or hexadecimal. */
|
||||
__cold const char *mdbx_dump_val(const MDBX_val *val, char *const buf,
|
||||
const size_t bufsize) {
|
||||
if (!val)
|
||||
return "<null>";
|
||||
if (!val->iov_len)
|
||||
return "<empty>";
|
||||
if (!buf || bufsize < 4)
|
||||
return nullptr;
|
||||
|
||||
if (!val->iov_base) {
|
||||
int len = snprintf(buf, bufsize, "<nullptr.%zu>", val->iov_len);
|
||||
assert(len > 0 && (size_t)len < bufsize);
|
||||
(void)len;
|
||||
return buf;
|
||||
}
|
||||
|
||||
bool is_ascii = true;
|
||||
const uint8_t *const data = val->iov_base;
|
||||
for (size_t i = 0; i < val->iov_len; i++)
|
||||
if (data[i] < ' ' || data[i] > '~') {
|
||||
is_ascii = false;
|
||||
break;
|
||||
}
|
||||
|
||||
if (is_ascii) {
|
||||
int len =
|
||||
snprintf(buf, bufsize, "%.*s",
|
||||
(val->iov_len > INT_MAX) ? INT_MAX : (int)val->iov_len, data);
|
||||
assert(len > 0 && (size_t)len < bufsize);
|
||||
(void)len;
|
||||
} else {
|
||||
char *const detent = buf + bufsize - 2;
|
||||
char *ptr = buf;
|
||||
*ptr++ = '<';
|
||||
for (size_t i = 0; i < val->iov_len && ptr < detent; i++) {
|
||||
const char hex[16] = {'0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
|
||||
*ptr++ = hex[data[i] >> 4];
|
||||
*ptr++ = hex[data[i] & 15];
|
||||
}
|
||||
if (ptr < detent)
|
||||
*ptr++ = '>';
|
||||
*ptr = '\0';
|
||||
}
|
||||
return buf;
|
||||
}
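A hedged usage sketch, assuming the mdbx_dump_val() declaration exported via mdbx.h in current libmdbx: printable values are rendered verbatim into the caller's buffer, everything else as an angle-bracketed hex dump.

#include "mdbx.h"
#include <stdio.h>

int main(void) {
  char buf[128];
  char ascii_data[] = "hello";
  char binary_data[] = {0x01, (char)0xff, 0x7f};
  const MDBX_val ascii = {ascii_data, 5};
  const MDBX_val binary = {binary_data, 3};
  printf("%s\n", mdbx_dump_val(&ascii, buf, sizeof(buf)));  /* hello */
  printf("%s\n", mdbx_dump_val(&binary, buf, sizeof(buf))); /* <01ff7f> */
  return 0;
}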
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
LY: debug stuff */
|
||||
|
||||
__cold const char *pagetype_caption(const uint8_t type, char buf4unknown[16]) {
|
||||
switch (type) {
|
||||
case P_BRANCH:
|
||||
return "branch";
|
||||
case P_LEAF:
|
||||
return "leaf";
|
||||
case P_LEAF | P_SUBP:
|
||||
return "subleaf";
|
||||
case P_LEAF | P_DUPFIX:
|
||||
return "dupfix-leaf";
|
||||
case P_LEAF | P_DUPFIX | P_SUBP:
|
||||
return "dupfix-subleaf";
|
||||
case P_LEAF | P_DUPFIX | P_SUBP | P_LEGACY_DIRTY:
|
||||
return "dupfix-subleaf.legacy-dirty";
|
||||
case P_LARGE:
|
||||
return "large";
|
||||
default:
|
||||
snprintf(buf4unknown, 16, "unknown_0x%x", type);
|
||||
return buf4unknown;
|
||||
}
|
||||
}
|
||||
|
||||
__cold static const char *leafnode_type(node_t *n) {
|
||||
static const char *const tp[2][2] = {{"", ": DB"},
|
||||
{": sub-page", ": sub-DB"}};
|
||||
return (node_flags(n) & N_BIGDATA)
|
||||
? ": large page"
|
||||
: tp[!!(node_flags(n) & N_DUPDATA)][!!(node_flags(n) & N_SUBDATA)];
|
||||
}
|
||||
|
||||
/* Display all the keys in the page. */
|
||||
__cold void page_list(page_t *mp) {
|
||||
pgno_t pgno = mp->pgno;
|
||||
const char *type;
|
||||
node_t *node;
|
||||
size_t i, nkeys, nsize, total = 0;
|
||||
MDBX_val key;
|
||||
DKBUF;
|
||||
|
||||
switch (page_type(mp)) {
|
||||
case P_BRANCH:
|
||||
type = "Branch page";
|
||||
break;
|
||||
case P_LEAF:
|
||||
type = "Leaf page";
|
||||
break;
|
||||
case P_LEAF | P_SUBP:
|
||||
type = "Leaf sub-page";
|
||||
break;
|
||||
case P_LEAF | P_DUPFIX:
|
||||
type = "Leaf2 page";
|
||||
break;
|
||||
case P_LEAF | P_DUPFIX | P_SUBP:
|
||||
type = "Leaf2 sub-page";
|
||||
break;
|
||||
case P_LARGE:
|
||||
VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->pages);
|
||||
return;
|
||||
case P_META:
|
||||
VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno,
|
||||
unaligned_peek_u64(4, page_meta(mp)->txnid_a));
|
||||
return;
|
||||
default:
|
||||
VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->flags);
|
||||
return;
|
||||
}
|
||||
|
||||
nkeys = page_numkeys(mp);
|
||||
VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys);
|
||||
|
||||
for (i = 0; i < nkeys; i++) {
|
||||
if (is_dupfix_leaf(
|
||||
mp)) { /* DUPFIX pages have no entries[] or node headers */
|
||||
key = page_dupfix_key(mp, i, nsize = mp->dupfix_ksize);
|
||||
total += nsize;
|
||||
VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key));
|
||||
continue;
|
||||
}
|
||||
node = page_node(mp, i);
|
||||
key.iov_len = node_ks(node);
|
||||
key.iov_base = node->payload;
|
||||
nsize = NODESIZE + key.iov_len;
|
||||
if (is_branch(mp)) {
|
||||
VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node),
|
||||
DKEY(&key));
|
||||
total += nsize;
|
||||
} else {
|
||||
if (node_flags(node) & N_BIGDATA)
|
||||
nsize += sizeof(pgno_t);
|
||||
else
|
||||
nsize += node_ds(node);
|
||||
total += nsize;
|
||||
nsize += sizeof(indx_t);
|
||||
VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key),
|
||||
leafnode_type(node));
|
||||
}
|
||||
total = EVEN_CEIL(total);
|
||||
}
|
||||
VERBOSE("Total: header %u + contents %zu + unused %zu\n",
|
||||
is_dupfix_leaf(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->lower, total,
|
||||
page_room(mp));
|
||||
}
|
||||
|
||||
__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags,
|
||||
union logger_union logger, char *buffer,
|
||||
size_t buffer_size) {
|
||||
ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0);
|
||||
|
||||
const int rc = globals.runtime_flags | (globals.loglevel << 16);
|
||||
if (level != MDBX_LOG_DONTCHANGE)
|
||||
globals.loglevel = (uint8_t)level;
|
||||
|
||||
if (flags != MDBX_DBG_DONTCHANGE) {
|
||||
flags &=
|
||||
#if MDBX_DEBUG
|
||||
MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER |
|
||||
#endif
|
||||
MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP |
|
||||
MDBX_DBG_DONT_UPGRADE;
|
||||
globals.runtime_flags = (uint8_t)flags;
|
||||
}
|
||||
|
||||
assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1));
|
||||
if (logger.ptr != (void *)((intptr_t)-1)) {
|
||||
globals.logger.ptr = logger.ptr;
|
||||
globals.logger_buffer = buffer;
|
||||
globals.logger_buffer_size = buffer_size;
|
||||
}
|
||||
|
||||
ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0);
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level,
|
||||
MDBX_debug_flags_t flags,
|
||||
MDBX_debug_func_nofmt *logger, char *buffer,
|
||||
size_t buffer_size) {
|
||||
union logger_union thunk;
|
||||
thunk.nofmt =
|
||||
(logger && buffer && buffer_size) ? logger : MDBX_LOGGER_NOFMT_DONTCHANGE;
|
||||
return setup_debug(level, flags, thunk, buffer, buffer_size);
|
||||
}
|
||||
|
||||
__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags,
|
||||
MDBX_debug_func *logger) {
|
||||
union logger_union thunk;
|
||||
thunk.fmt = logger;
|
||||
return setup_debug(level, flags, thunk, nullptr, 0);
|
||||
}
|
160
src/logging_and_debug.h
Normal file
160
src/logging_and_debug.h
Normal file
@ -0,0 +1,160 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
#ifndef __Wpedantic_format_voidptr
|
||||
MDBX_MAYBE_UNUSED static inline const void *
|
||||
__Wpedantic_format_voidptr(const void *ptr) {
|
||||
return ptr;
|
||||
}
|
||||
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
|
||||
#endif /* __Wpedantic_format_voidptr */
|
||||
|
||||
MDBX_INTERNAL void MDBX_PRINTF_ARGS(4, 5)
|
||||
debug_log(int level, const char *function, int line, const char *fmt, ...)
|
||||
MDBX_PRINTF_ARGS(4, 5);
|
||||
MDBX_INTERNAL void debug_log_va(int level, const char *function, int line,
|
||||
const char *fmt, va_list args);
|
||||
|
||||
#if MDBX_DEBUG
|
||||
#define LOG_ENABLED(LVL) unlikely(LVL <= globals.loglevel)
|
||||
#define AUDIT_ENABLED() \
|
||||
unlikely((globals.runtime_flags & (unsigned)MDBX_DBG_AUDIT))
|
||||
#else /* MDBX_DEBUG */
|
||||
#define LOG_ENABLED(LVL) (LVL < MDBX_LOG_VERBOSE && LVL <= globals.loglevel)
|
||||
#define AUDIT_ENABLED() (0)
|
||||
#endif /* LOG_ENABLED() & AUDIT_ENABLED() */
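/* Note: in non-debug builds LOG_ENABLED() additionally requires
* LVL < MDBX_LOG_VERBOSE, so verbose and noisier levels are compiled out,
* and AUDIT_ENABLED() is always false. */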
|
||||
|
||||
#if MDBX_FORCE_ASSERTIONS
|
||||
#define ASSERT_ENABLED() (1)
|
||||
#elif MDBX_DEBUG
|
||||
#define ASSERT_ENABLED() \
|
||||
likely((globals.runtime_flags & (unsigned)MDBX_DBG_ASSERT))
|
||||
#else
|
||||
#define ASSERT_ENABLED() (0)
|
||||
#endif /* ASSERT_ENABLED() */
|
||||
|
||||
#define DEBUG_EXTRA(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_EXTRA)) \
|
||||
debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define DEBUG_EXTRA_PRINT(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_EXTRA)) \
|
||||
debug_log(MDBX_LOG_EXTRA, nullptr, 0, fmt, __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define TRACE(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_TRACE)) \
|
||||
debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define DEBUG(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_DEBUG)) \
|
||||
debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define VERBOSE(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \
|
||||
debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define NOTICE(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_NOTICE)) \
|
||||
debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define WARNING(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_WARN)) \
|
||||
debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#undef ERROR /* wingdi.h \
|
||||
Yeah, morons from M$ put such a definition into a public header. */
|
||||
|
||||
#define ERROR(fmt, ...) \
|
||||
do { \
|
||||
if (LOG_ENABLED(MDBX_LOG_ERROR)) \
|
||||
debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define FATAL(fmt, ...) \
|
||||
debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
|
||||
|
||||
#if MDBX_DEBUG
|
||||
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
|
||||
#else /* MDBX_DEBUG */
|
||||
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
|
||||
unsigned line);
|
||||
#define ASSERT_FAIL(env, msg, func, line) \
|
||||
do { \
|
||||
(void)(env); \
|
||||
assert_fail(msg, func, line); \
|
||||
} while (0)
|
||||
#endif /* MDBX_DEBUG */
|
||||
|
||||
#define ENSURE_MSG(env, expr, msg) \
|
||||
do { \
|
||||
if (unlikely(!(expr))) \
|
||||
ASSERT_FAIL(env, msg, __func__, __LINE__); \
|
||||
} while (0)
|
||||
|
||||
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
|
||||
|
||||
/* assert(3) variant in environment context */
|
||||
#define eASSERT(env, expr) \
|
||||
do { \
|
||||
if (ASSERT_ENABLED()) \
|
||||
ENSURE(env, expr); \
|
||||
} while (0)
|
||||
|
||||
/* assert(3) variant in cursor context */
|
||||
#define cASSERT(mc, expr) eASSERT((mc)->txn->env, expr)
|
||||
|
||||
/* assert(3) variant in transaction context */
|
||||
#define tASSERT(txn, expr) eASSERT((txn)->env, expr)
|
||||
|
||||
#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */
|
||||
#undef assert
|
||||
#define assert(expr) eASSERT(nullptr, expr)
|
||||
#endif
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void jitter4testing(bool tiny) {
#if MDBX_DEBUG
if (globals.runtime_flags & (unsigned)MDBX_DBG_JITTER)
osal_jitter(tiny);
#else
(void)tiny;
#endif
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void page_list(page_t *mp);
|
||||
|
||||
MDBX_INTERNAL const char *pagetype_caption(const uint8_t type,
|
||||
char buf4unknown[16]);
|
||||
/* Key size which fits in a DKBUF (debug key buffer). */
|
||||
#define DKBUF_MAX 127
|
||||
#define DKBUF char dbg_kbuf[DKBUF_MAX * 4 + 2]
|
||||
#define DKEY(x) mdbx_dump_val(x, dbg_kbuf, DKBUF_MAX * 2 + 1)
|
||||
#define DVAL(x) \
|
||||
mdbx_dump_val(x, dbg_kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)
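/* Note: dbg_kbuf is split into two halves so that DKEY() and DVAL() can be
* used within a single log statement without overwriting each other. */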
|
||||
|
||||
#if MDBX_DEBUG
|
||||
#define DKBUF_DEBUG DKBUF
|
||||
#define DKEY_DEBUG(x) DKEY(x)
|
||||
#define DVAL_DEBUG(x) DVAL(x)
|
||||
#else
|
||||
#define DKBUF_DEBUG ((void)(0))
|
||||
#define DKEY_DEBUG(x) ("-")
|
||||
#define DVAL_DEBUG(x) ("-")
|
||||
#endif
|
56
src/mdbx.c++
56
src/mdbx.c++
@ -1,18 +1,14 @@
|
||||
//
|
||||
// Copyright (c) 2020-2024, Leonid Yuriev <leo@yuriev.ru>.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Non-inline part of the libmdbx C++ API
|
||||
//
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2020-2024
|
||||
///
|
||||
/// \brief Non-inline part of the libmdbx C++ API
|
||||
///
|
||||
|
||||
#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS)
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif /* _CRT_SECURE_NO_WARNINGS */
|
||||
#include "essentials.h"
|
||||
|
||||
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
|
||||
!defined(__USE_MINGW_ANSI_STDIO)
|
||||
#define __USE_MINGW_ANSI_STDIO 1
|
||||
#endif /* MinGW */
|
||||
#if !defined(MDBX_BUILD_CXX) || MDBX_BUILD_CXX != 1
|
||||
#error "Build is misconfigured! Expecting MDBX_BUILD_CXX=1 for C++ API."
|
||||
#endif /* MDBX_BUILD_CXX*/
|
||||
|
||||
/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */
|
||||
#if defined(_MSC_VER) && defined(__SANITIZE_ADDRESS__) && \
|
||||
@ -22,8 +18,6 @@
|
||||
|
||||
#include "../mdbx.h++"
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cctype> // for isxdigit(), etc
|
||||
@ -402,6 +396,7 @@ __cold void error::throw_exception() const {
|
||||
CASE_EXCEPTION(incompatible_operation, MDBX_INCOMPATIBLE);
|
||||
CASE_EXCEPTION(internal_page_full, MDBX_PAGE_FULL);
|
||||
CASE_EXCEPTION(internal_problem, MDBX_PROBLEM);
|
||||
CASE_EXCEPTION(key_exists, MDBX_KEYEXIST);
|
||||
CASE_EXCEPTION(key_mismatch, MDBX_EKEYMISMATCH);
|
||||
CASE_EXCEPTION(max_maps_reached, MDBX_DBS_FULL);
|
||||
CASE_EXCEPTION(max_readers_reached, MDBX_READERS_FULL);
|
||||
@ -1227,7 +1222,7 @@ env::operate_parameters::make_flags(bool accede, bool use_subdirectory) const {
|
||||
if (options.nested_write_transactions)
|
||||
flags &= ~MDBX_WRITEMAP;
|
||||
if (reclaiming.coalesce)
|
||||
flags |= MDBX_env_flags_t(MDBX_DEPRECATED_COALESCE);
|
||||
flags |= MDBX_COALESCE;
|
||||
if (reclaiming.lifo)
|
||||
flags |= MDBX_LIFORECLAIM;
|
||||
switch (durability) {
|
||||
@ -1272,7 +1267,7 @@ env::durability env::operate_parameters::durability_from_flags(
|
||||
|
||||
env::reclaiming_options::reclaiming_options(MDBX_env_flags_t flags) noexcept
|
||||
: lifo((flags & MDBX_LIFORECLAIM) ? true : false),
|
||||
coalesce((flags & MDBX_DEPRECATED_COALESCE) ? true : false) {}
|
||||
coalesce((flags & MDBX_COALESCE) ? true : false) {}
|
||||
|
||||
env::operate_options::operate_options(MDBX_env_flags_t flags) noexcept
|
||||
: no_sticky_threads(((flags & (MDBX_NOSTICKYTHREADS | MDBX_EXCLUSIVE)) ==
|
||||
@ -1742,21 +1737,20 @@ __cold ::std::ostream &operator<<(::std::ostream &out,
|
||||
const char *suffix;
|
||||
} static const scales[] = {
|
||||
#if MDBX_WORDBITS > 32
|
||||
{env_managed::geometry::EiB, "EiB"},
|
||||
{env_managed::geometry::EB, "EB"},
|
||||
{env_managed::geometry::PiB, "PiB"},
|
||||
{env_managed::geometry::PB, "PB"},
|
||||
{env_managed::geometry::TiB, "TiB"},
|
||||
{env_managed::geometry::TB, "TB"},
|
||||
{env_managed::geometry::EiB, "EiB"},
|
||||
{env_managed::geometry::EB, "EB"},
|
||||
{env_managed::geometry::PiB, "PiB"},
|
||||
{env_managed::geometry::PB, "PB"},
|
||||
{env_managed::geometry::TiB, "TiB"},
|
||||
{env_managed::geometry::TB, "TB"},
|
||||
#endif
|
||||
{env_managed::geometry::GiB, "GiB"},
|
||||
{env_managed::geometry::GB, "GB"},
|
||||
{env_managed::geometry::MiB, "MiB"},
|
||||
{env_managed::geometry::MB, "MB"},
|
||||
{env_managed::geometry::KiB, "KiB"},
|
||||
{env_managed::geometry::kB, "kB"},
|
||||
{1, " bytes"}
|
||||
};
|
||||
{env_managed::geometry::GiB, "GiB"},
|
||||
{env_managed::geometry::GB, "GB"},
|
||||
{env_managed::geometry::MiB, "MiB"},
|
||||
{env_managed::geometry::MB, "MB"},
|
||||
{env_managed::geometry::KiB, "KiB"},
|
||||
{env_managed::geometry::kB, "kB"},
|
||||
{1, " bytes"}};
|
||||
|
||||
for (const auto i : scales)
|
||||
if (bytes % i.one == 0)
|
||||
|
746
src/meta.c
Normal file
746
src/meta.c
Normal file
@ -0,0 +1,746 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
typedef struct meta_snap {
|
||||
uint64_t txnid;
|
||||
size_t is_steady;
|
||||
} meta_snap_t;
|
||||
|
||||
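/* Reads a 64-bit txnid that may be updated concurrently: a single atomic
* 64-bit load is used where unaligned 8-byte access is known to be safe,
* otherwise the two 32-bit halves are read separately in endian-aware order. */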
static inline txnid_t fetch_txnid(const volatile mdbx_atomic_uint32_t *ptr) {
|
||||
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \
|
||||
MDBX_UNALIGNED_OK >= 8
|
||||
return atomic_load64((const volatile mdbx_atomic_uint64_t *)ptr,
|
||||
mo_AcquireRelease);
|
||||
#else
|
||||
const uint32_t l = atomic_load32(
|
||||
&ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
|
||||
const uint32_t h = atomic_load32(
|
||||
&ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
|
||||
return (uint64_t)h << 32 | l;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline meta_snap_t meta_snap(const volatile meta_t *meta) {
|
||||
txnid_t txnid = fetch_txnid(meta->txnid_a);
|
||||
jitter4testing(true);
|
||||
size_t is_steady = meta_is_steady(meta) && txnid >= MIN_TXNID;
|
||||
jitter4testing(true);
|
||||
if (unlikely(txnid != fetch_txnid(meta->txnid_b)))
|
||||
txnid = is_steady = 0;
|
||||
meta_snap_t r = {txnid, is_steady};
|
||||
return r;
|
||||
}
|
||||
|
||||
txnid_t meta_txnid(const volatile meta_t *meta) {
return meta_snap(meta).txnid;
}

meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) {
eASSERT(env, n < NUM_METAS);
meta_ptr_t r;
meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n));
r.txnid = snap.txnid;
r.is_steady = snap.is_steady;
return r;
}
|
||||
|
||||
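/* Packs the choice of meta-pages into a single byte: bits 0-1 keep the tail
* index, bits 2-3 the most recent index, bits 4-5 the prefer-steady index,
* bit 6 the "strict" flag and bit 7 the "valid" flag (see the return below). */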
static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, uint8_t c12, bool s0,
|
||||
bool s1, bool s2) {
|
||||
assert(c01 < 3 && c02 < 3 && c12 < 3);
|
||||
/* assert(s0 < 2 && s1 < 2 && s2 < 2); */
|
||||
const uint8_t recent = meta_cmp2recent(c01, s0, s1)
|
||||
? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
|
||||
: (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
|
||||
const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1)
|
||||
? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
|
||||
: (meta_cmp2steady(c12, s1, s2) ? 1 : 2);
|
||||
|
||||
uint8_t tail;
|
||||
if (recent == 0)
|
||||
tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
|
||||
else if (recent == 1)
|
||||
tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
|
||||
else
|
||||
tail = meta_cmp2steady(c01, s0, s1) ? 1 : 0;
|
||||
|
||||
const bool valid =
|
||||
c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2;
|
||||
const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) &&
|
||||
(c12 != 1 || s1 != s2);
|
||||
return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7;
|
||||
}
|
||||
|
||||
static inline void meta_troika_unpack(troika_t *troika, const uint8_t packed) {
|
||||
troika->recent = (packed >> 2) & 3;
|
||||
troika->prefer_steady = (packed >> 4) & 3;
|
||||
troika->tail_and_flags = packed & 0xC3;
|
||||
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
|
||||
troika->unused_pad = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
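/* Precomputed lookup table: for each combination of the three steady flags and
* the three pairwise txnid comparisons (2*2*2 * 3*3*3 = 216 states) it stores
* the byte produced by meta_cmp2pack(); troika_verify_fsm() below re-derives
* every entry and checks the table for consistency. */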
static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = {
|
||||
232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232,
|
||||
168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169,
|
||||
232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233,
|
||||
169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152,
|
||||
168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212,
|
||||
214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137,
|
||||
216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40,
|
||||
129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169,
|
||||
168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168,
|
||||
168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228,
|
||||
212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233,
|
||||
233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150,
|
||||
164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214,
|
||||
214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194,
|
||||
210, 194, 225, 193, 210, 194};
|
||||
|
||||
__cold bool troika_verify_fsm(void) {
|
||||
bool ok = true;
|
||||
for (size_t i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) {
|
||||
const bool s0 = (i >> 0) & 1;
|
||||
const bool s1 = (i >> 1) & 1;
|
||||
const bool s2 = (i >> 2) & 1;
|
||||
const uint8_t c01 = (i / (8 * 1)) % 3;
|
||||
const uint8_t c02 = (i / (8 * 3)) % 3;
|
||||
const uint8_t c12 = (i / (8 * 9)) % 3;
|
||||
|
||||
const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2);
|
||||
troika_t troika;
|
||||
troika.fsm = (uint8_t)i;
|
||||
meta_troika_unpack(&troika, packed);
|
||||
|
||||
const uint8_t tail = TROIKA_TAIL(&troika);
|
||||
const bool strict = TROIKA_STRICT_VALID(&troika);
|
||||
const bool valid = TROIKA_VALID(&troika);
|
||||
|
||||
const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1)
|
||||
? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
|
||||
: (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
|
||||
const uint8_t prefer_steady_chk =
|
||||
meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
|
||||
: (meta_cmp2steady(c12, s1, s2) ? 1 : 2);
|
||||
|
||||
uint8_t tail_chk;
|
||||
if (recent_chk == 0)
|
||||
tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
|
||||
else if (recent_chk == 1)
|
||||
tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
|
||||
else
|
||||
tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0;
|
||||
|
||||
const bool valid_chk =
|
||||
c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2;
|
||||
const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) &&
|
||||
(c12 != 1 || s1 != s2);
|
||||
assert(troika.recent == recent_chk);
|
||||
assert(troika.prefer_steady == prefer_steady_chk);
|
||||
assert(tail == tail_chk);
|
||||
assert(valid == valid_chk);
|
||||
assert(strict == strict_chk);
|
||||
assert(troika_fsm_map[troika.fsm] == packed);
|
||||
if (troika.recent != recent_chk ||
|
||||
troika.prefer_steady != prefer_steady_chk || tail != tail_chk ||
|
||||
valid != valid_chk || strict != strict_chk ||
|
||||
troika_fsm_map[troika.fsm] != packed) {
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
__hot troika_t meta_tap(const MDBX_env *env) {
|
||||
meta_snap_t snap;
|
||||
troika_t troika;
|
||||
snap = meta_snap(METAPAGE(env, 0));
|
||||
troika.txnid[0] = snap.txnid;
|
||||
troika.fsm = (uint8_t)snap.is_steady << 0;
|
||||
snap = meta_snap(METAPAGE(env, 1));
|
||||
troika.txnid[1] = snap.txnid;
|
||||
troika.fsm += (uint8_t)snap.is_steady << 1;
|
||||
troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8);
|
||||
snap = meta_snap(METAPAGE(env, 2));
|
||||
troika.txnid[2] = snap.txnid;
|
||||
troika.fsm += (uint8_t)snap.is_steady << 2;
|
||||
troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3);
|
||||
troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3);
|
||||
|
||||
meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]);
|
||||
return troika;
|
||||
}
|
||||
|
||||
txnid_t recent_committed_txnid(const MDBX_env *env) {
const txnid_t m0 = meta_txnid(METAPAGE(env, 0));
const txnid_t m1 = meta_txnid(METAPAGE(env, 1));
const txnid_t m2 = meta_txnid(METAPAGE(env, 2));
return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? m1 : m2);
}
|
||||
|
||||
static inline bool meta_eq(const troika_t *troika, size_t a, size_t b) {
|
||||
assert(a < NUM_METAS && b < NUM_METAS);
|
||||
return troika->txnid[a] == troika->txnid[b] &&
|
||||
(((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 &&
|
||||
troika->txnid[a];
|
||||
}
|
||||
|
||||
unsigned meta_eq_mask(const troika_t *troika) {
|
||||
return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 |
|
||||
meta_eq(troika, 2, 0) << 2;
|
||||
}
|
||||
|
||||
__hot bool meta_should_retry(const MDBX_env *env, troika_t *troika) {
|
||||
const troika_t prev = *troika;
|
||||
*troika = meta_tap(env);
|
||||
return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] ||
|
||||
prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2];
|
||||
}
|
||||
|
||||
const char *durable_caption(const meta_t *const meta) {
|
||||
if (meta_is_steady(meta))
|
||||
return (meta_sign_get(meta) == meta_sign_calculate(meta)) ? "Steady"
|
||||
: "Tainted";
|
||||
return "Weak";
|
||||
}
|
||||
|
||||
__cold void meta_troika_dump(const MDBX_env *env, const troika_t *troika) {
|
||||
const meta_ptr_t recent = meta_recent(env, troika);
|
||||
const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika);
|
||||
const meta_ptr_t tail = meta_tail(env, troika);
|
||||
NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, "
|
||||
"head=%d-%" PRIaTXN ".%c, "
|
||||
"base=%d-%" PRIaTXN ".%c, "
|
||||
"tail=%d-%" PRIaTXN ".%c, "
|
||||
"valid %c, strict %c",
|
||||
troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1],
|
||||
(troika->fsm & 2) ? 's' : 'w', troika->txnid[2],
|
||||
(troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent,
|
||||
recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady,
|
||||
prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w',
|
||||
troika->tail_and_flags % NUM_METAS, tail.txnid,
|
||||
tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N',
|
||||
TROIKA_STRICT_VALID(troika) ? 'Y' : 'N');
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
static int meta_unsteady(MDBX_env *env, const txnid_t inclusive_upto,
|
||||
const pgno_t pgno) {
|
||||
meta_t *const meta = METAPAGE(env, pgno);
|
||||
const txnid_t txnid = constmeta_txnid(meta);
|
||||
if (!meta_is_steady(meta) || txnid > inclusive_upto)
|
||||
return MDBX_RESULT_FALSE;
|
||||
|
||||
WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno);
|
||||
const uint64_t wipe = DATASIGN_NONE;
|
||||
const void *ptr = &wipe;
|
||||
size_t bytes = sizeof(meta->sign),
|
||||
offset = ptr_dist(&meta->sign, env->dxb_mmap.base);
|
||||
if (env->flags & MDBX_WRITEMAP) {
|
||||
unaligned_poke_u64(4, meta->sign, wipe);
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
if (!MDBX_AVOID_MSYNC)
|
||||
return MDBX_RESULT_TRUE;
|
||||
ptr = data_page(meta);
|
||||
offset = ptr_dist(ptr, env->dxb_mmap.base);
|
||||
bytes = env->ps;
|
||||
}
|
||||
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
int err = osal_pwrite(env->fd4meta, ptr, bytes, offset);
|
||||
return likely(err == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : err;
|
||||
}
|
||||
|
||||
__cold int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto) {
|
||||
int err = meta_unsteady(env, inclusive_upto, 0);
|
||||
if (likely(!MDBX_IS_ERROR(err)))
|
||||
err = meta_unsteady(env, inclusive_upto, 1);
|
||||
if (likely(!MDBX_IS_ERROR(err)))
|
||||
err = meta_unsteady(env, inclusive_upto, 2);
|
||||
|
||||
if (err == MDBX_RESULT_TRUE) {
|
||||
err = MDBX_SUCCESS;
|
||||
if (!MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) {
|
||||
err = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
} else if (env->fd4meta == env->lazy_fd) {
|
||||
err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
}
|
||||
|
||||
osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS),
|
||||
globals.sys_pagesize);
|
||||
|
||||
/* force oldest refresh */
|
||||
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
|
||||
|
||||
env->basal_txn->tw.troika = meta_tap(env);
|
||||
for (MDBX_txn *scan = env->basal_txn->nested; scan; scan = scan->nested)
|
||||
scan->tw.troika = env->basal_txn->tw.troika;
|
||||
return err;
|
||||
}
|
||||
|
||||
int meta_sync(const MDBX_env *env, const meta_ptr_t head) {
|
||||
eASSERT(env, atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) !=
|
||||
(uint32_t)head.txnid);
|
||||
/* This function may be called (among other cases) when (env->flags &
* MDBX_NOMETASYNC) == 0 and env->fd4meta == env->dsync_fd, for example if
* the previous transaction was performed with the MDBX_NOMETASYNC flag. */
|
||||
|
||||
int rc = MDBX_RESULT_TRUE;
|
||||
if (env->flags & MDBX_WRITEMAP) {
|
||||
if (!MDBX_AVOID_MSYNC) {
|
||||
rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
} else {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
const page_t *page = data_page(head.ptr_c);
|
||||
rc = osal_pwrite(env->fd4meta, page, env->ps,
|
||||
ptr_dist(page, env->dxb_mmap.base));
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS) && env->fd4meta == env->lazy_fd) {
|
||||
rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
|
||||
if (likely(rc == MDBX_SUCCESS))
|
||||
env->lck->meta_sync_txnid.weak = (uint32_t)head.txnid;
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold static page_t *meta_model(const MDBX_env *env, page_t *model,
|
||||
size_t num) {
|
||||
ENSURE(env, is_powerof2(env->ps));
|
||||
ENSURE(env, env->ps >= MDBX_MIN_PAGESIZE);
|
||||
ENSURE(env, env->ps <= MDBX_MAX_PAGESIZE);
|
||||
ENSURE(env, env->geo_in_bytes.lower >= MIN_MAPSIZE);
|
||||
ENSURE(env, env->geo_in_bytes.upper <= MAX_MAPSIZE);
|
||||
ENSURE(env, env->geo_in_bytes.now >= env->geo_in_bytes.lower);
|
||||
ENSURE(env, env->geo_in_bytes.now <= env->geo_in_bytes.upper);
|
||||
|
||||
memset(model, 0, env->ps);
|
||||
model->pgno = (pgno_t)num;
|
||||
model->flags = P_META;
|
||||
meta_t *const model_meta = page_meta(model);
|
||||
unaligned_poke_u64(4, model_meta->magic_and_version, MDBX_DATA_MAGIC);
|
||||
|
||||
model_meta->geometry.lower = bytes2pgno(env, env->geo_in_bytes.lower);
|
||||
model_meta->geometry.upper = bytes2pgno(env, env->geo_in_bytes.upper);
|
||||
model_meta->geometry.grow_pv =
|
||||
pages2pv(bytes2pgno(env, env->geo_in_bytes.grow));
|
||||
model_meta->geometry.shrink_pv =
|
||||
pages2pv(bytes2pgno(env, env->geo_in_bytes.shrink));
|
||||
model_meta->geometry.now = bytes2pgno(env, env->geo_in_bytes.now);
|
||||
model_meta->geometry.first_unallocated = NUM_METAS;
|
||||
|
||||
ENSURE(env, model_meta->geometry.lower >= MIN_PAGENO);
|
||||
ENSURE(env, model_meta->geometry.upper <= MAX_PAGENO + 1);
|
||||
ENSURE(env, model_meta->geometry.now >= model_meta->geometry.lower);
|
||||
ENSURE(env, model_meta->geometry.now <= model_meta->geometry.upper);
|
||||
ENSURE(env, model_meta->geometry.first_unallocated >= MIN_PAGENO);
|
||||
ENSURE(env,
|
||||
model_meta->geometry.first_unallocated <= model_meta->geometry.now);
|
||||
ENSURE(env, model_meta->geometry.grow_pv ==
|
||||
pages2pv(pv2pages(model_meta->geometry.grow_pv)));
|
||||
ENSURE(env, model_meta->geometry.shrink_pv ==
|
||||
pages2pv(pv2pages(model_meta->geometry.shrink_pv)));
|
||||
|
||||
model_meta->pagesize = env->ps;
|
||||
model_meta->trees.gc.flags = MDBX_INTEGERKEY;
|
||||
model_meta->trees.gc.root = P_INVALID;
|
||||
model_meta->trees.main.root = P_INVALID;
|
||||
meta_set_txnid(env, model_meta, MIN_TXNID + num);
|
||||
unaligned_poke_u64(4, model_meta->sign, meta_sign_calculate(model_meta));
|
||||
eASSERT(env, coherency_check_meta(env, model_meta, true));
|
||||
return ptr_disp(model, env->ps);
|
||||
}
|
||||
|
||||
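/* Fills the buffer with the initial triplet of meta-pages; meta-page N gets
* txnid MIN_TXNID + N, so the third one is the most recent and its meta_t is
* returned to the caller. */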
__cold meta_t *meta_init_triplet(const MDBX_env *env, void *buffer) {
|
||||
page_t *page0 = (page_t *)buffer;
|
||||
page_t *page1 = meta_model(env, page0, 0);
|
||||
page_t *page2 = meta_model(env, page1, 1);
|
||||
meta_model(env, page2, 2);
|
||||
return page_meta(page2);
|
||||
}
|
||||
|
||||
__cold int __must_check_result meta_override(MDBX_env *env, size_t target,
|
||||
txnid_t txnid,
|
||||
const meta_t *shape) {
|
||||
int rc = env_page_auxbuffer(env);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
page_t *const page = env->page_auxbuf;
|
||||
meta_model(env, page, target);
|
||||
meta_t *const model = page_meta(page);
|
||||
meta_set_txnid(env, model, txnid);
|
||||
if (txnid)
|
||||
eASSERT(env, coherency_check_meta(env, model, true));
|
||||
if (shape) {
|
||||
if (txnid && unlikely(!coherency_check_meta(env, shape, false))) {
|
||||
ERROR("bailout overriding meta-%zu since model failed "
|
||||
"FreeDB/MainDB %s-check for txnid #%" PRIaTXN,
|
||||
target, "pre", constmeta_txnid(shape));
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
if (globals.runtime_flags & MDBX_DBG_DONT_UPGRADE)
|
||||
memcpy(&model->magic_and_version, &shape->magic_and_version,
|
||||
sizeof(model->magic_and_version));
|
||||
model->reserve16 = shape->reserve16;
|
||||
model->validator_id = shape->validator_id;
|
||||
model->extra_pagehdr = shape->extra_pagehdr;
|
||||
memcpy(&model->geometry, &shape->geometry, sizeof(model->geometry));
|
||||
memcpy(&model->trees, &shape->trees, sizeof(model->trees));
|
||||
memcpy(&model->canary, &shape->canary, sizeof(model->canary));
|
||||
memcpy(&model->pages_retired, &shape->pages_retired,
|
||||
sizeof(model->pages_retired));
|
||||
if (txnid) {
|
||||
if ((!model->trees.gc.mod_txnid && model->trees.gc.root != P_INVALID) ||
|
||||
(!model->trees.main.mod_txnid && model->trees.main.root != P_INVALID))
|
||||
memcpy(&model->magic_and_version, &shape->magic_and_version,
|
||||
sizeof(model->magic_and_version));
|
||||
if (unlikely(!coherency_check_meta(env, model, false))) {
|
||||
ERROR("bailout overriding meta-%zu since model failed "
|
||||
"FreeDB/MainDB %s-check for txnid #%" PRIaTXN,
|
||||
target, "post", txnid);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
}
|
||||
}
|
||||
meta_sign_as_steady(model);
|
||||
rc = meta_validate(env, model, page, (pgno_t)target, nullptr);
|
||||
if (unlikely(MDBX_IS_ERROR(rc)))
|
||||
return MDBX_PROBLEM;
|
||||
|
||||
if (shape && memcmp(model, shape, sizeof(meta_t)) == 0) {
|
||||
NOTICE("skip overriding meta-%zu since no changes "
|
||||
"for txnid #%" PRIaTXN,
|
||||
target, txnid);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
if (env->flags & MDBX_WRITEMAP) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_msync(&env->dxb_mmap, 0,
|
||||
pgno_align2os_bytes(env, model->geometry.first_unallocated),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
/* meta_override() called only while the current process has exclusive
|
||||
* lock on the DB file. So the meta-page can be updated directly without
|
||||
* clearing consistency flag by mdbx_meta_update_begin() */
|
||||
memcpy(pgno2page(env, target), page, env->ps);
|
||||
osal_flush_incoherent_cpu_writeback();
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.msync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
|
||||
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
} else {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.wops.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_pwrite(env->fd4meta, page, env->ps, pgno2bytes(env, target));
|
||||
if (rc == MDBX_SUCCESS && env->fd4meta == env->lazy_fd) {
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->lck->pgops.fsync.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
|
||||
}
|
||||
osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS),
|
||||
globals.sys_pagesize);
|
||||
}
|
||||
eASSERT(env, (!env->txn && !env->basal_txn) ||
|
||||
(env->stuck_meta == (int)target &&
|
||||
(env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) ==
|
||||
MDBX_EXCLUSIVE));
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold int meta_validate(MDBX_env *env, meta_t *const meta,
|
||||
const page_t *const page, const unsigned meta_number,
|
||||
unsigned *guess_pagesize) {
|
||||
const uint64_t magic_and_version =
|
||||
unaligned_peek_u64(4, &meta->magic_and_version);
|
||||
if (unlikely(magic_and_version != MDBX_DATA_MAGIC &&
|
||||
magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT &&
|
||||
magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) {
|
||||
ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number,
|
||||
magic_and_version);
|
||||
return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID
|
||||
: MDBX_VERSION_MISMATCH;
|
||||
}
|
||||
|
||||
if (unlikely(page->pgno != meta_number)) {
|
||||
ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->pgno);
|
||||
return MDBX_INVALID;
|
||||
}
|
||||
|
||||
if (unlikely(page->flags != P_META)) {
|
||||
ERROR("page #%u not a meta-page", meta_number);
|
||||
return MDBX_INVALID;
|
||||
}
|
||||
|
||||
if (unlikely(!is_powerof2(meta->pagesize) ||
|
||||
meta->pagesize < MDBX_MIN_PAGESIZE ||
|
||||
meta->pagesize > MDBX_MAX_PAGESIZE)) {
|
||||
WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number,
|
||||
meta->pagesize);
|
||||
return is_powerof2(meta->pagesize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID;
|
||||
}
|
||||
|
||||
if (guess_pagesize && *guess_pagesize != meta->pagesize) {
|
||||
*guess_pagesize = meta->pagesize;
|
||||
VERBOSE("meta[%u] took pagesize %u", meta_number, meta->pagesize);
|
||||
}
|
||||
|
||||
const txnid_t txnid = unaligned_peek_u64(4, &meta->txnid_a);
|
||||
if (unlikely(txnid != unaligned_peek_u64(4, &meta->txnid_b))) {
|
||||
WARNING("meta[%u] not completely updated, skip it", meta_number);
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
/* LY: check signature as a checksum */
|
||||
const uint64_t sign = meta_sign_get(meta);
|
||||
const uint64_t sign_steady = meta_sign_calculate(meta);
|
||||
if (SIGN_IS_STEADY(sign) && unlikely(sign != sign_steady)) {
|
||||
WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64
|
||||
"), skip it",
|
||||
meta_number, sign, sign_steady);
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
if (unlikely(meta->trees.gc.flags != MDBX_INTEGERKEY)) {
|
||||
WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number,
|
||||
"GC/FreeDB", meta->trees.gc.flags);
|
||||
return MDBX_INCOMPATIBLE;
|
||||
}
|
||||
|
||||
if (unlikely(!check_sdb_flags(meta->trees.main.flags))) {
|
||||
WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number,
|
||||
"MainDB", meta->trees.main.flags);
|
||||
return MDBX_INCOMPATIBLE;
|
||||
}
|
||||
|
||||
DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
|
||||
", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
|
||||
" +%u -%u, txn_id %" PRIaTXN ", %s",
|
||||
page->pgno, meta->trees.main.root, meta->trees.gc.root,
|
||||
meta->geometry.lower, meta->geometry.first_unallocated,
|
||||
meta->geometry.now, meta->geometry.upper,
|
||||
pv2pages(meta->geometry.grow_pv), pv2pages(meta->geometry.shrink_pv),
|
||||
txnid, durable_caption(meta));
|
||||
|
||||
if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) {
|
||||
WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number,
|
||||
txnid);
|
||||
return MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
if (unlikely(meta->geometry.lower < MIN_PAGENO ||
|
||||
meta->geometry.lower > MAX_PAGENO + 1)) {
|
||||
WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it",
|
||||
meta_number, meta->geometry.lower);
|
||||
return MDBX_INVALID;
|
||||
}
|
||||
|
||||
if (unlikely(meta->geometry.upper < MIN_PAGENO ||
|
||||
meta->geometry.upper > MAX_PAGENO + 1 ||
|
||||
meta->geometry.upper < meta->geometry.lower)) {
|
||||
WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it",
|
||||
meta_number, meta->geometry.upper);
|
||||
return MDBX_INVALID;
|
||||
}
|
||||
|
||||
if (unlikely(meta->geometry.first_unallocated < MIN_PAGENO ||
|
||||
meta->geometry.first_unallocated - 1 > MAX_PAGENO)) {
|
||||
WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it",
|
||||
meta_number, meta->geometry.first_unallocated);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
const uint64_t used_bytes =
|
||||
meta->geometry.first_unallocated * (uint64_t)meta->pagesize;
|
||||
if (unlikely(used_bytes > env->dxb_mmap.filesize)) {
|
||||
/* Here could be a race with DB-shrinking performed by another process */
|
||||
int err = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
if (unlikely(used_bytes > env->dxb_mmap.filesize)) {
|
||||
WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64
|
||||
"), skip it",
|
||||
meta_number, used_bytes, env->dxb_mmap.filesize);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
}
|
||||
if (unlikely(meta->geometry.first_unallocated - 1 > MAX_PAGENO ||
|
||||
used_bytes > MAX_MAPSIZE)) {
|
||||
WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it",
|
||||
meta_number, used_bytes);
|
||||
return MDBX_TOO_LARGE;
|
||||
}
|
||||
|
||||
pgno_t geo_lower = meta->geometry.lower;
|
||||
uint64_t mapsize_min = geo_lower * (uint64_t)meta->pagesize;
|
||||
STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MDBX_MAX_PAGESIZE);
|
||||
STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
|
||||
STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MDBX_MIN_PAGESIZE % (4ul << 20) ==
|
||||
0);
|
||||
if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) {
|
||||
if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE &&
|
||||
mapsize_min <= MAX_MAPSIZE64) {
|
||||
eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO &&
|
||||
used_bytes <= MAX_MAPSIZE);
|
||||
WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), "
|
||||
"but size of used space still acceptable (%" PRIu64 ")",
|
||||
meta_number, mapsize_min, used_bytes);
|
||||
geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->pagesize);
|
||||
if (geo_lower > MAX_PAGENO + 1) {
|
||||
geo_lower = MAX_PAGENO + 1;
|
||||
mapsize_min = geo_lower * (uint64_t)meta->pagesize;
|
||||
}
|
||||
WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO
|
||||
" instead of wrong %" PRIaPGNO
|
||||
", will be corrected on next commit(s)",
|
||||
meta_number, "lower", geo_lower, meta->geometry.lower);
|
||||
meta->geometry.lower = geo_lower;
|
||||
} else {
|
||||
WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it",
|
||||
meta_number, mapsize_min);
|
||||
return MDBX_VERSION_MISMATCH;
|
||||
}
|
||||
}
|
||||
|
||||
pgno_t geo_upper = meta->geometry.upper;
|
||||
uint64_t mapsize_max = geo_upper * (uint64_t)meta->pagesize;
|
||||
STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
|
||||
if (unlikely(mapsize_max > MAX_MAPSIZE ||
|
||||
(MAX_PAGENO + 1) <
|
||||
ceil_powerof2((size_t)mapsize_max, globals.sys_pagesize) /
|
||||
(size_t)meta->pagesize)) {
|
||||
if (mapsize_max > MAX_MAPSIZE64) {
|
||||
WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it",
|
||||
meta_number, mapsize_max);
|
||||
return MDBX_VERSION_MISMATCH;
|
||||
}
|
||||
/* allow opening a large DB from a 32-bit environment */
|
||||
eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO &&
|
||||
used_bytes <= MAX_MAPSIZE);
|
||||
WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), "
|
||||
"but size of used space still acceptable (%" PRIu64 ")",
|
||||
meta_number, mapsize_max, used_bytes);
|
||||
geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->pagesize);
|
||||
if (geo_upper > MAX_PAGENO + 1) {
|
||||
geo_upper = MAX_PAGENO + 1;
|
||||
mapsize_max = geo_upper * (uint64_t)meta->pagesize;
|
||||
}
|
||||
WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO
|
||||
" instead of wrong %" PRIaPGNO
|
||||
", will be corrected on next commit(s)",
|
||||
meta_number, "upper", geo_upper, meta->geometry.upper);
|
||||
meta->geometry.upper = geo_upper;
|
||||
}
|
||||
|
||||
/* LY: check and silently put geometry.now into [geo.lower...geo.upper].
*
* Copy-with-compaction by an old version of libmdbx could produce a DB file
* smaller than the meta.geo.lower bound, in case the actual filling is low
* or there is no data at all. This is not a problem, as there is no damage
* or loss of data. Therefore it is better not to treat such a situation as
* an error, but to correct it silently. */
|
||||
pgno_t geo_now = meta->geometry.now;
|
||||
if (geo_now < geo_lower)
|
||||
geo_now = geo_lower;
|
||||
if (geo_now > geo_upper && meta->geometry.first_unallocated <= geo_upper)
|
||||
geo_now = geo_upper;
|
||||
|
||||
if (unlikely(meta->geometry.first_unallocated > geo_now)) {
|
||||
WARNING("meta[%u] next-pageno (%" PRIaPGNO
|
||||
") is beyond end-pgno (%" PRIaPGNO "), skip it",
|
||||
meta_number, meta->geometry.first_unallocated, geo_now);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
if (meta->geometry.now != geo_now) {
|
||||
WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO
|
||||
" instead of wrong %" PRIaPGNO
|
||||
", will be corrected on next commit(s)",
|
||||
meta_number, "now", geo_now, meta->geometry.now);
|
||||
meta->geometry.now = geo_now;
|
||||
}
|
||||
|
||||
/* GC */
|
||||
if (meta->trees.gc.root == P_INVALID) {
|
||||
if (unlikely(meta->trees.gc.branch_pages || meta->trees.gc.height ||
|
||||
meta->trees.gc.items || meta->trees.gc.leaf_pages ||
|
||||
meta->trees.gc.large_pages)) {
|
||||
WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC");
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
} else if (unlikely(meta->trees.gc.root >=
|
||||
meta->geometry.first_unallocated)) {
|
||||
WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
|
||||
"GC", meta->trees.gc.root);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
/* MainDB */
|
||||
if (meta->trees.main.root == P_INVALID) {
|
||||
if (unlikely(meta->trees.main.branch_pages || meta->trees.main.height ||
|
||||
meta->trees.main.items || meta->trees.main.leaf_pages ||
|
||||
meta->trees.main.large_pages)) {
|
||||
WARNING("meta[%u] has false-empty %s", meta_number, "MainDB");
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
} else if (unlikely(meta->trees.main.root >=
|
||||
meta->geometry.first_unallocated)) {
|
||||
WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
|
||||
"MainDB", meta->trees.main.root);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
if (unlikely(meta->trees.gc.mod_txnid > txnid)) {
|
||||
WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it",
|
||||
meta_number, meta->trees.gc.mod_txnid, "GC");
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
if (unlikely(meta->trees.main.mod_txnid > txnid)) {
|
||||
WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it",
|
||||
meta_number, meta->trees.main.mod_txnid, "MainDB");
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold int meta_validate_copy(MDBX_env *env, const meta_t *meta, meta_t *dest) {
|
||||
*dest = *meta;
|
||||
return meta_validate(env, dest, data_page(meta),
|
||||
bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base)),
|
||||
nullptr);
|
||||
}
|
203
src/meta.h
Normal file
203
src/meta.h
Normal file
@ -0,0 +1,203 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
static inline uint64_t meta_sign_calculate(const meta_t *meta) {
|
||||
uint64_t sign = DATASIGN_NONE;
|
||||
#if 0 /* TODO */
|
||||
sign = hippeus_hash64(...);
|
||||
#else
|
||||
(void)meta;
|
||||
#endif
|
||||
/* LY: never returns DATASIGN_NONE or DATASIGN_WEAK */
|
||||
return (sign > DATASIGN_WEAK) ? sign : ~sign;
|
||||
}
|
||||
|
||||
static inline uint64_t meta_sign_get(const volatile meta_t *meta) {
|
||||
return unaligned_peek_u64_volatile(4, meta->sign);
|
||||
}
|
||||
|
||||
static inline void meta_sign_as_steady(meta_t *meta) {
|
||||
unaligned_poke_u64(4, meta->sign, meta_sign_calculate(meta));
|
||||
}
|
||||
|
||||
static inline bool meta_is_steady(const volatile meta_t *meta) {
|
||||
return SIGN_IS_STEADY(meta_sign_get(meta));
|
||||
}
|
||||
|
||||
MDBX_INTERNAL troika_t meta_tap(const MDBX_env *env);
|
||||
MDBX_INTERNAL unsigned meta_eq_mask(const troika_t *troika);
|
||||
MDBX_INTERNAL bool meta_should_retry(const MDBX_env *env, troika_t *troika);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool troika_verify_fsm(void);
|
||||
|
||||
struct meta_ptr {
|
||||
txnid_t txnid;
|
||||
union {
|
||||
const volatile meta_t *ptr_v;
|
||||
const meta_t *ptr_c;
|
||||
};
|
||||
size_t is_steady;
|
||||
};
|
||||
|
||||
MDBX_INTERNAL meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n);
|
||||
MDBX_INTERNAL txnid_t meta_txnid(const volatile meta_t *meta);
|
||||
MDBX_INTERNAL txnid_t recent_committed_txnid(const MDBX_env *env);
|
||||
MDBX_INTERNAL int meta_sync(const MDBX_env *env, const meta_ptr_t head);
|
||||
|
||||
MDBX_INTERNAL const char *durable_caption(const meta_t *const meta);
|
||||
MDBX_INTERNAL void meta_troika_dump(const MDBX_env *env,
|
||||
const troika_t *troika);
|
||||
|
||||
#define METAPAGE(env, n) page_meta(pgno2page(env, n))
|
||||
#define METAPAGE_END(env) METAPAGE(env, NUM_METAS)
|
||||
|
||||
static inline meta_ptr_t meta_recent(const MDBX_env *env,
|
||||
const troika_t *troika) {
|
||||
meta_ptr_t r;
|
||||
r.txnid = troika->txnid[troika->recent];
|
||||
r.ptr_v = METAPAGE(env, troika->recent);
|
||||
r.is_steady = (troika->fsm >> troika->recent) & 1;
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline meta_ptr_t meta_prefer_steady(const MDBX_env *env,
|
||||
const troika_t *troika) {
|
||||
meta_ptr_t r;
|
||||
r.txnid = troika->txnid[troika->prefer_steady];
|
||||
r.ptr_v = METAPAGE(env, troika->prefer_steady);
|
||||
r.is_steady = (troika->fsm >> troika->prefer_steady) & 1;
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline meta_ptr_t meta_tail(const MDBX_env *env,
|
||||
const troika_t *troika) {
|
||||
const uint8_t tail = troika->tail_and_flags & 3;
|
||||
MDBX_ANALYSIS_ASSUME(tail < NUM_METAS);
|
||||
meta_ptr_t r;
|
||||
r.txnid = troika->txnid[tail];
|
||||
r.ptr_v = METAPAGE(env, tail);
|
||||
r.is_steady = (troika->fsm >> tail) & 1;
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline bool meta_bootid_match(const meta_t *meta) {
|
||||
return memcmp(&meta->bootid, &globals.bootid, 16) == 0 &&
|
||||
(globals.bootid.x | globals.bootid.y) != 0;
|
||||
}
|
||||
|
||||
static inline bool meta_weak_acceptable(const MDBX_env *env, const meta_t *meta,
|
||||
const int lck_exclusive) {
|
||||
return lck_exclusive
|
||||
? /* exclusive lock */ meta_bootid_match(meta)
|
||||
: /* db already opened */ env->lck_mmap.lck &&
|
||||
(env->lck_mmap.lck->envmode.weak & MDBX_RDONLY) == 0;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline txnid_t
|
||||
constmeta_txnid(const meta_t *meta) {
|
||||
const txnid_t a = unaligned_peek_u64(4, &meta->txnid_a);
|
||||
const txnid_t b = unaligned_peek_u64(4, &meta->txnid_b);
|
||||
return likely(a == b) ? a : 0;
|
||||
}
|
||||
|
||||
static inline void meta_update_begin(const MDBX_env *env, meta_t *meta,
|
||||
txnid_t txnid) {
|
||||
eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
|
||||
eASSERT(env, unaligned_peek_u64(4, meta->txnid_a) < txnid &&
|
||||
unaligned_peek_u64(4, meta->txnid_b) < txnid);
|
||||
(void)env;
|
||||
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \
|
||||
MDBX_UNALIGNED_OK >= 8
|
||||
atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_b, 0, mo_AcquireRelease);
|
||||
atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_a, txnid,
|
||||
mo_AcquireRelease);
|
||||
#else
|
||||
atomic_store32(&meta->txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], 0,
|
||||
mo_AcquireRelease);
|
||||
atomic_store32(&meta->txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], 0,
|
||||
mo_AcquireRelease);
|
||||
atomic_store32(&meta->txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
|
||||
(uint32_t)txnid, mo_AcquireRelease);
|
||||
atomic_store32(&meta->txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
|
||||
(uint32_t)(txnid >> 32), mo_AcquireRelease);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void meta_update_end(const MDBX_env *env, meta_t *meta,
|
||||
txnid_t txnid) {
|
||||
eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
|
||||
eASSERT(env, unaligned_peek_u64(4, meta->txnid_a) == txnid);
|
||||
eASSERT(env, unaligned_peek_u64(4, meta->txnid_b) < txnid);
|
||||
(void)env;
|
||||
jitter4testing(true);
|
||||
memcpy(&meta->bootid, &globals.bootid, 16);
|
||||
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \
|
||||
MDBX_UNALIGNED_OK >= 8
|
||||
atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_b, txnid,
|
||||
mo_AcquireRelease);
|
||||
#else
|
||||
atomic_store32(&meta->txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
|
||||
(uint32_t)txnid, mo_AcquireRelease);
|
||||
atomic_store32(&meta->txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
|
||||
(uint32_t)(txnid >> 32), mo_AcquireRelease);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void meta_set_txnid(const MDBX_env *env, meta_t *meta,
|
||||
const txnid_t txnid) {
|
||||
eASSERT(env, !env->dxb_mmap.base || meta < METAPAGE(env, 0) ||
|
||||
meta >= METAPAGE_END(env));
|
||||
(void)env;
|
||||
/* update non-atomically, since this function is used ONLY for filling a meta-image
|
||||
* for writing, but not the actual meta-page */
|
||||
memcpy(&meta->bootid, &globals.bootid, 16);
|
||||
unaligned_poke_u64(4, meta->txnid_a, txnid);
|
||||
unaligned_poke_u64(4, meta->txnid_b, txnid);
|
||||
}
|
||||
|
||||
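/* Ternary comparison of two txnids scaled by the radix 's': yields 0*s when
* a < b, 1*s when they are equal and 2*s when a > b, which lets meta_tap()
* accumulate the three pairwise comparisons into the troika FSM index. */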
static inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) {
|
||||
return unlikely(a == b) ? 1 * s : (a > b) ? 2 * s : 0 * s;
|
||||
}
|
||||
|
||||
static inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, bool a_steady,
|
||||
bool b_steady) {
|
||||
assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */);
|
||||
return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady);
|
||||
}
|
||||
|
||||
static inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, bool a_steady,
|
||||
bool b_steady) {
|
||||
assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */);
|
||||
return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1);
|
||||
}
|
||||
|
||||
static inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady,
|
||||
txnid_t b_txnid, bool b_steady) {
|
||||
return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady);
|
||||
}
|
||||
|
||||
static inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady,
|
||||
txnid_t b_txnid, bool b_steady) {
|
||||
return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL meta_t *meta_init_triplet(const MDBX_env *env, void *buffer);
|
||||
|
||||
MDBX_INTERNAL int meta_validate(MDBX_env *env, meta_t *const meta,
|
||||
const page_t *const page,
|
||||
const unsigned meta_number,
|
||||
unsigned *guess_pagesize);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result meta_validate_copy(MDBX_env *env,
|
||||
const meta_t *meta,
|
||||
meta_t *dest);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result meta_override(MDBX_env *env,
|
||||
size_t target,
|
||||
txnid_t txnid,
|
||||
const meta_t *shape);
|
||||
|
||||
MDBX_INTERNAL int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto);
|
252
src/misc.c
Normal file
252
src/misc.c
Normal file
@ -0,0 +1,252 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) {
|
||||
if (volume <= 1024 * 1024 * 4ul)
|
||||
return MDBX_RESULT_TRUE;
|
||||
|
||||
intptr_t pagesize, total_ram_pages;
|
||||
int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
const int log2page = log2n_powerof2(pagesize);
|
||||
const intptr_t volume_pages = (volume + pagesize - 1) >> log2page;
|
||||
const intptr_t redundancy_pages =
|
||||
(redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page)
|
||||
: (intptr_t)(redundancy + pagesize - 1) >> log2page;
|
||||
if (volume_pages >= total_ram_pages ||
|
||||
volume_pages + redundancy_pages >= total_ram_pages)
|
||||
return MDBX_RESULT_FALSE;
|
||||
|
||||
intptr_t avail_ram_pages;
|
||||
err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
return (volume_pages + redundancy_pages >= avail_ram_pages)
|
||||
? MDBX_RESULT_FALSE
|
||||
: MDBX_RESULT_TRUE;
|
||||
}
|
||||
|
||||
int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
|
||||
uint64_t increment) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
rc = dbi_check(txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
|
||||
rc = sdb_fetch(txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
tree_t *dbs = &txn->dbs[dbi];
|
||||
if (likely(result))
|
||||
*result = dbs->sequence;
|
||||
|
||||
if (likely(increment > 0)) {
|
||||
if (unlikely(dbi == FREE_DBI || (txn->flags & MDBX_TXN_RDONLY) != 0))
|
||||
return MDBX_EACCESS;
|
||||
|
||||
uint64_t new = dbs->sequence + increment;
|
||||
if (unlikely(new < increment))
|
||||
return MDBX_RESULT_TRUE;
|
||||
|
||||
tASSERT(txn, new > dbs->sequence);
|
||||
dbs->sequence = new;
|
||||
txn->flags |= MDBX_TXN_DIRTY;
|
||||
txn->dbi_state[dbi] |= DBI_DIRTY;
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
eASSERT(nullptr, txn->signature == txn_signature);
|
||||
tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi));
|
||||
tASSERT(txn,
|
||||
dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID) != 0);
|
||||
return txn->env->kvs[dbi].clc.k.cmp(a, b);
|
||||
}
|
||||
|
||||
int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
|
||||
const MDBX_val *b) {
|
||||
eASSERT(nullptr, txn->signature == txn_signature);
|
||||
tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi));
|
||||
tASSERT(txn, dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID));
|
||||
return txn->env->kvs[dbi].clc.v.cmp(a, b);
|
||||
}
|
||||
|
||||
__cold MDBX_cmp_func *mdbx_get_keycmp(MDBX_db_flags_t flags) {
|
||||
return builtin_keycmp(flags);
|
||||
}
|
||||
|
||||
__cold MDBX_cmp_func *mdbx_get_datacmp(MDBX_db_flags_t flags) {
|
||||
return builtin_datacmp(flags);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__cold const char *mdbx_liberr2str(int errnum) {
|
||||
/* Table of descriptions for MDBX errors */
|
||||
static const char *const tbl[] = {
|
||||
"MDBX_KEYEXIST: Key/data pair already exists",
|
||||
"MDBX_NOTFOUND: No matching key/data pair found",
|
||||
"MDBX_PAGE_NOTFOUND: Requested page not found",
|
||||
"MDBX_CORRUPTED: Database is corrupted",
|
||||
"MDBX_PANIC: Environment had fatal error",
|
||||
"MDBX_VERSION_MISMATCH: DB version mismatch libmdbx",
|
||||
"MDBX_INVALID: File is not an MDBX file",
|
||||
"MDBX_MAP_FULL: Environment mapsize limit reached",
|
||||
"MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)",
|
||||
"MDBX_READERS_FULL: Too many readers (maxreaders reached)",
|
||||
nullptr /* MDBX_TLS_FULL (-30789): unused in MDBX */,
|
||||
"MDBX_TXN_FULL: Transaction has too many dirty pages,"
|
||||
" i.e transaction is too big",
|
||||
"MDBX_CURSOR_FULL: Cursor stack limit reachedn - this usually indicates"
|
||||
" corruption, i.e branch-pages loop",
|
||||
"MDBX_PAGE_FULL: Internal error - Page has no more space",
|
||||
"MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend"
|
||||
" mapping, e.g. since address space is unavailable or busy,"
|
||||
" or Operation system not supported such operations",
|
||||
"MDBX_INCOMPATIBLE: Environment or database is not compatible"
|
||||
" with the requested operation or the specified flags",
|
||||
"MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
|
||||
" e.g. read-transaction already run for current thread",
|
||||
"MDBX_BAD_TXN: Transaction is not valid for requested operation,"
|
||||
" e.g. had errored and be must aborted, has a child, or is invalid",
|
||||
"MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"
|
||||
" for target database, either invalid subDB name",
|
||||
"MDBX_BAD_DBI: The specified DBI-handle is invalid"
|
||||
" or changed by another thread/transaction",
|
||||
"MDBX_PROBLEM: Unexpected internal error, transaction should be aborted",
|
||||
"MDBX_BUSY: Another write transaction is running,"
|
||||
" or environment is already used while opening with MDBX_EXCLUSIVE flag",
|
||||
};
|
||||
|
||||
if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) {
|
||||
int i = errnum - MDBX_KEYEXIST;
|
||||
return tbl[i];
|
||||
}
|
||||
|
||||
switch (errnum) {
|
||||
case MDBX_SUCCESS:
|
||||
return "MDBX_SUCCESS: Successful";
|
||||
case MDBX_EMULTIVAL:
|
||||
return "MDBX_EMULTIVAL: The specified key has"
|
||||
" more than one associated value";
|
||||
case MDBX_EBADSIGN:
|
||||
return "MDBX_EBADSIGN: Wrong signature of a runtime object(s),"
|
||||
" e.g. memory corruption or double-free";
|
||||
case MDBX_WANNA_RECOVERY:
|
||||
return "MDBX_WANNA_RECOVERY: Database should be recovered,"
|
||||
" but this could NOT be done automatically for now"
|
||||
" since it opened in read-only mode";
|
||||
case MDBX_EKEYMISMATCH:
|
||||
return "MDBX_EKEYMISMATCH: The given key value is mismatched to the"
|
||||
" current cursor position";
|
||||
case MDBX_TOO_LARGE:
|
||||
return "MDBX_TOO_LARGE: Database is too large for current system,"
|
||||
" e.g. could NOT be mapped into RAM";
|
||||
case MDBX_THREAD_MISMATCH:
|
||||
return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not"
|
||||
" owned object, e.g. a transaction that started by another thread";
|
||||
case MDBX_TXN_OVERLAPPING:
|
||||
return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for"
|
||||
" the current thread";
|
||||
case MDBX_DUPLICATED_CLK:
|
||||
return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists,"
|
||||
" please keep one and remove unused other";
|
||||
case MDBX_DANGLING_DBI:
|
||||
return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be"
|
||||
" closed before subDb or corresponding DBI-handle could be (re)used";
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) {
|
||||
const char *msg = mdbx_liberr2str(errnum);
|
||||
if (!msg && buflen > 0 && buflen < INT_MAX) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
const DWORD size = FormatMessageA(
|
||||
FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
|
||||
errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
|
||||
nullptr);
|
||||
return size ? buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
|
||||
#elif defined(_GNU_SOURCE) && defined(__GLIBC__)
|
||||
/* GNU-specific */
|
||||
if (errnum > 0)
|
||||
msg = strerror_r(errnum, buf, buflen);
|
||||
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600)
|
||||
/* XSI-compliant */
|
||||
if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0)
|
||||
msg = buf;
|
||||
#else
|
||||
if (errnum > 0) {
|
||||
msg = strerror(errnum);
|
||||
if (msg) {
|
||||
strncpy(buf, msg, buflen);
|
||||
msg = buf;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!msg) {
|
||||
(void)snprintf(buf, buflen, "error %d", errnum);
|
||||
msg = buf;
|
||||
}
|
||||
buf[buflen - 1] = '\0';
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
|
||||
__cold const char *mdbx_strerror(int errnum) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
static char buf[1024];
|
||||
return mdbx_strerror_r(errnum, buf, sizeof(buf));
|
||||
#else
|
||||
const char *msg = mdbx_liberr2str(errnum);
|
||||
if (!msg) {
|
||||
if (errnum > 0)
|
||||
msg = strerror(errnum);
|
||||
if (!msg) {
|
||||
static char buf[32];
|
||||
(void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum);
|
||||
msg = buf;
|
||||
}
|
||||
}
|
||||
return msg;
|
||||
#endif
|
||||
}
|
||||
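/* Illustrative usage sketch (not part of this commit): reporting an error via
 * the thread-safe variant with a caller-provided buffer; assumes <stdio.h>. */
#if 0 /* example, not compiled */
static void report_example(int err) {
  char msgbuf[256];
  fprintf(stderr, "mdbx: %s\n", mdbx_strerror_r(err, msgbuf, sizeof(msgbuf)));
}
#endif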
|
||||
#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */
|
||||
const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) {
|
||||
const char *msg = mdbx_liberr2str(errnum);
|
||||
if (!msg && buflen > 0 && buflen < INT_MAX) {
|
||||
const DWORD size = FormatMessageA(
|
||||
FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
|
||||
errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
|
||||
nullptr);
|
||||
if (!size)
|
||||
msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
|
||||
else if (!CharToOemBuffA(buf, buf, size))
|
||||
msg = "CharToOemBuffA() failed";
|
||||
else
|
||||
msg = buf;
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
|
||||
const char *mdbx_strerror_ANSI2OEM(int errnum) {
|
||||
static char buf[1024];
|
||||
return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf));
|
||||
}
|
||||
#endif /* Bit of madness for Windows */
|
477
src/mvcc-readers.c
Normal file
477
src/mvcc-readers.c
Normal file
@ -0,0 +1,477 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
bsr_t mvcc_bind_slot(MDBX_env *env, const uintptr_t tid) {
|
||||
eASSERT(env, env->lck_mmap.lck);
|
||||
eASSERT(env, env->lck->magic_and_version == MDBX_LOCK_MAGIC);
|
||||
eASSERT(env, env->lck->os_and_format == MDBX_LOCK_FORMAT);
|
||||
|
||||
bsr_t result = {lck_rdt_lock(env), nullptr};
|
||||
if (unlikely(MDBX_IS_ERROR(result.err)))
|
||||
return result;
|
||||
if (unlikely(env->flags & ENV_FATAL_ERROR)) {
|
||||
lck_rdt_unlock(env);
|
||||
result.err = MDBX_PANIC;
|
||||
return result;
|
||||
}
|
||||
if (unlikely(!env->dxb_mmap.base)) {
|
||||
lck_rdt_unlock(env);
|
||||
result.err = MDBX_EPERM;
|
||||
return result;
|
||||
}
|
||||
|
||||
if (unlikely(env->registered_reader_pid != env->pid)) {
|
||||
result.err = lck_rpid_set(env);
|
||||
if (unlikely(result.err != MDBX_SUCCESS)) {
|
||||
lck_rdt_unlock(env);
|
||||
return result;
|
||||
}
|
||||
env->registered_reader_pid = env->pid;
|
||||
}
|
||||
|
||||
result.err = MDBX_SUCCESS;
|
||||
size_t slot, nreaders;
|
||||
while (1) {
|
||||
nreaders = env->lck->rdt_length.weak;
|
||||
for (slot = 0; slot < nreaders; slot++)
|
||||
if (!atomic_load32(&env->lck->rdt[slot].pid, mo_AcquireRelease))
|
||||
break;
|
||||
|
||||
if (likely(slot < env->max_readers))
|
||||
break;
|
||||
|
||||
result.err = mvcc_cleanup_dead(env, true, nullptr);
|
||||
if (result.err != MDBX_RESULT_TRUE) {
|
||||
lck_rdt_unlock(env);
|
||||
result.err =
|
||||
(result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
result.rslot = &env->lck->rdt[slot];
|
||||
/* Claim the reader slot, carefully since other code
|
||||
* uses the reader table un-mutexed: First reset the
|
||||
* slot, next publish it in lck->rdt_length. After
|
||||
* that, it is safe for mdbx_env_close() to touch it.
|
||||
* When it is closed, we can finally claim it. */
|
||||
atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
|
||||
safe64_reset(&result.rslot->txnid, true);
|
||||
if (slot == nreaders)
|
||||
env->lck->rdt_length.weak = (uint32_t)++nreaders;
|
||||
result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : tid;
|
||||
atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
|
||||
lck_rdt_unlock(env);
|
||||
|
||||
if (likely(env->flags & ENV_TXKEY)) {
|
||||
eASSERT(env, env->registered_reader_pid == env->pid);
|
||||
thread_rthc_set(env->me_txkey, result.rslot);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
__hot txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady) {
|
||||
const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
|
||||
eASSERT(env, steady <= env->basal_txn->txnid);
|
||||
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (unlikely(lck == nullptr /* exclusive without-lck mode */)) {
|
||||
eASSERT(env, env->lck == lckless_stub(env));
|
||||
env->lck->rdt_refresh_flag.weak = nothing_changed;
|
||||
return env->lck->cached_oldest.weak = steady;
|
||||
}
|
||||
|
||||
const txnid_t prev_oldest =
|
||||
atomic_load64(&lck->cached_oldest, mo_AcquireRelease);
|
||||
eASSERT(env, steady >= prev_oldest);
|
||||
|
||||
txnid_t new_oldest = prev_oldest;
|
||||
while (nothing_changed !=
|
||||
atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) {
|
||||
lck->rdt_refresh_flag.weak = nothing_changed;
|
||||
jitter4testing(false);
|
||||
const size_t snap_nreaders =
|
||||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||||
new_oldest = steady;
|
||||
|
||||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||||
const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
|
||||
if (!pid)
|
||||
continue;
|
||||
jitter4testing(true);
|
||||
|
||||
const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid);
|
||||
if (unlikely(rtxn < prev_oldest)) {
|
||||
if (unlikely(nothing_changed == atomic_load32(&lck->rdt_refresh_flag,
|
||||
mo_AcquireRelease)) &&
|
||||
safe64_reset_compare(&lck->rdt[i].txnid, rtxn)) {
|
||||
NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN
|
||||
" < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN,
|
||||
i, snap_nreaders, pid, rtxn, prev_oldest, steady);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (rtxn < new_oldest) {
|
||||
new_oldest = rtxn;
|
||||
if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (new_oldest != prev_oldest) {
|
||||
VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest);
|
||||
eASSERT(env, new_oldest >= lck->cached_oldest.weak);
|
||||
atomic_store64(&lck->cached_oldest, new_oldest, mo_Relaxed);
|
||||
}
|
||||
return new_oldest;
|
||||
}
|
||||
|
||||
pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page) {
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (likely(lck != nullptr /* check for exclusive without-lck mode */)) {
|
||||
retry:;
|
||||
const size_t snap_nreaders =
|
||||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||||
if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
|
||||
/* jitter4testing(true); */
|
||||
const pgno_t snap_pages =
|
||||
atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
|
||||
const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
|
||||
if (unlikely(snap_pages !=
|
||||
atomic_load32(&lck->rdt[i].snapshot_pages_used,
|
||||
mo_AcquireRelease) ||
|
||||
snap_txnid != safe64_read(&lck->rdt[i].txnid)))
|
||||
goto retry;
|
||||
if (last_used_page < snap_pages && snap_txnid <= env->basal_txn->txnid)
|
||||
last_used_page = snap_pages;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return last_used_page;
|
||||
}
|
||||
|
||||
/* Find largest mvcc-snapshot still referenced by this process. */
|
||||
pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest) {
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (likely(lck != nullptr /* exclusive mode */)) {
|
||||
const size_t snap_nreaders =
|
||||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||||
retry:
|
||||
if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease) == env->pid) {
|
||||
/* jitter4testing(true); */
|
||||
const pgno_t snap_pages =
|
||||
atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
|
||||
const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
|
||||
if (unlikely(snap_pages !=
|
||||
atomic_load32(&lck->rdt[i].snapshot_pages_used,
|
||||
mo_AcquireRelease) ||
|
||||
snap_txnid != safe64_read(&lck->rdt[i].txnid)))
|
||||
goto retry;
|
||||
if (largest < snap_pages &&
|
||||
atomic_load64(&lck->cached_oldest, mo_AcquireRelease) <=
|
||||
/* ignore pending updates */ snap_txnid &&
|
||||
snap_txnid <= MAX_TXNID)
|
||||
largest = snap_pages;
|
||||
}
|
||||
}
|
||||
}
|
||||
return largest;
|
||||
}
|
||||
|
||||
static bool pid_insert(uint32_t *list, uint32_t pid) {
|
||||
/* binary search of pid in list */
|
||||
size_t base = 0;
|
||||
size_t cursor = 1;
|
||||
int32_t val = 0;
|
||||
size_t n = /* length */ list[0];
|
||||
|
||||
while (n > 0) {
|
||||
size_t pivot = n >> 1;
|
||||
cursor = base + pivot + 1;
|
||||
val = pid - list[cursor];
|
||||
|
||||
if (val < 0) {
|
||||
n = pivot;
|
||||
} else if (val > 0) {
|
||||
base = cursor;
|
||||
n -= pivot + 1;
|
||||
} else {
|
||||
/* found, so it's a duplicate */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (val > 0)
|
||||
++cursor;
|
||||
|
||||
list[0]++;
|
||||
for (n = list[0]; n > cursor; n--)
|
||||
list[n] = list[n - 1];
|
||||
list[n] = pid;
|
||||
return true;
|
||||
}
|
||||
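/* Illustrative sketch (not part of this commit) of the list layout assumed by
 * pid_insert(): element [0] holds the current length, elements [1..length]
 * hold the pids in ascending order, and a duplicate pid is rejected. */
#if 0 /* example, not compiled */
static void pid_insert_example(void) {
  uint32_t pids[8] = {0};         /* [0] is the length; room for 7 pids */
  bool ok = pid_insert(pids, 42); /* pids: {1, 42}, ok == true */
  ok = pid_insert(pids, 7);       /* pids: {2, 7, 42}, ok == true */
  ok = pid_insert(pids, 42);      /* duplicate: ok == false, list unchanged */
  (void)ok;
}
#endif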
|
||||
__cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked,
|
||||
int *dead) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
eASSERT(env, rdt_locked >= 0);
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (unlikely(lck == nullptr)) {
|
||||
/* exclusive mode */
|
||||
if (dead)
|
||||
*dead = 0;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
const size_t snap_nreaders =
|
||||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||||
uint32_t pidsbuf_onstask[142];
|
||||
uint32_t *const pids =
|
||||
(snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask))
|
||||
? pidsbuf_onstask
|
||||
: osal_malloc((snap_nreaders + 1) * sizeof(uint32_t));
|
||||
if (unlikely(!pids))
|
||||
return MDBX_ENOMEM;
|
||||
|
||||
pids[0] = 0;
|
||||
int count = 0;
|
||||
for (size_t i = 0; i < snap_nreaders; i++) {
|
||||
const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
|
||||
if (pid == 0)
|
||||
continue /* skip empty */;
|
||||
if (pid == env->pid)
|
||||
continue /* skip self */;
|
||||
if (!pid_insert(pids, pid))
|
||||
continue /* such pid already processed */;
|
||||
|
||||
int err = lck_rpid_check(env, pid);
|
||||
if (err == MDBX_RESULT_TRUE)
|
||||
continue /* reader is live */;
|
||||
|
||||
if (err != MDBX_SUCCESS) {
|
||||
rc = err;
|
||||
break /* lck_rpid_check() failed */;
|
||||
}
|
||||
|
||||
/* stale reader found */
|
||||
if (!rdt_locked) {
|
||||
err = lck_rdt_lock(env);
|
||||
if (MDBX_IS_ERROR(err)) {
|
||||
rc = err;
|
||||
break;
|
||||
}
|
||||
|
||||
rdt_locked = -1;
|
||||
if (err == MDBX_RESULT_TRUE) {
|
||||
/* mutex recovered, mdbx_ipclock_failed() has checked all readers */
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
break;
|
||||
}
|
||||
|
||||
/* another process may have cleaned and reused the slot, recheck */
|
||||
if (lck->rdt[i].pid.weak != pid)
|
||||
continue;
|
||||
|
||||
err = lck_rpid_check(env, pid);
|
||||
if (MDBX_IS_ERROR(err)) {
|
||||
rc = err;
|
||||
break;
|
||||
}
|
||||
|
||||
if (err != MDBX_SUCCESS)
|
||||
continue /* a race with another process, the slot was reused */;
|
||||
}
|
||||
|
||||
/* clean it */
|
||||
for (size_t ii = i; ii < snap_nreaders; ii++) {
|
||||
if (lck->rdt[ii].pid.weak == pid) {
|
||||
DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid,
|
||||
lck->rdt[ii].txnid.weak);
|
||||
atomic_store32(&lck->rdt[ii].pid, 0, mo_Relaxed);
|
||||
atomic_store32(&lck->rdt_refresh_flag, true, mo_AcquireRelease);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(!MDBX_IS_ERROR(rc)))
|
||||
atomic_store64(&lck->readers_check_timestamp, osal_monotime(), mo_Relaxed);
|
||||
|
||||
if (rdt_locked < 0)
|
||||
lck_rdt_unlock(env);
|
||||
|
||||
if (pids != pidsbuf_onstask)
|
||||
osal_free(pids);
|
||||
|
||||
if (dead)
|
||||
*dead = count;
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
|
||||
DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
|
||||
osal_memory_fence(mo_AcquireRelease, false);
|
||||
MDBX_hsr_func *const callback = env->hsr_callback;
|
||||
txnid_t oldest = 0;
|
||||
bool notify_eof_of_loop = false;
|
||||
int retry = 0;
|
||||
do {
|
||||
const txnid_t steady =
|
||||
env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
|
||||
env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
|
||||
oldest = mvcc_shapshot_oldest(env, steady);
|
||||
eASSERT(env, oldest < env->basal_txn->txnid);
|
||||
eASSERT(env, oldest >= straggler);
|
||||
eASSERT(env, oldest >= env->lck->cached_oldest.weak);
|
||||
|
||||
lck_t *const lck = env->lck_mmap.lck;
|
||||
if (oldest == steady || oldest > straggler || /* without-LCK mode */ !lck)
|
||||
break;
|
||||
|
||||
if (MDBX_IS_ERROR(mvcc_cleanup_dead(env, false, nullptr)))
|
||||
break;
|
||||
|
||||
if (!callback)
|
||||
break;
|
||||
|
||||
reader_slot_t *stucked = nullptr;
|
||||
uint64_t hold_retired = 0;
|
||||
for (size_t i = 0; i < lck->rdt_length.weak; ++i) {
|
||||
const uint64_t snap_retired =
|
||||
atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed);
|
||||
const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid);
|
||||
if (rtxn == straggler &&
|
||||
atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
|
||||
hold_retired = snap_retired;
|
||||
stucked = &lck->rdt[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (!stucked)
|
||||
break;
|
||||
|
||||
uint32_t pid = atomic_load32(&stucked->pid, mo_AcquireRelease);
|
||||
uint64_t tid = atomic_load64(&stucked->tid, mo_AcquireRelease);
|
||||
if (safe64_read(&stucked->txnid) != straggler || !pid ||
|
||||
stucked->snapshot_pages_retired.weak != hold_retired)
|
||||
continue;
|
||||
|
||||
const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
|
||||
const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
|
||||
const uint64_t head_retired =
|
||||
unaligned_peek_u64(4, head.ptr_c->pages_retired);
|
||||
const size_t space =
|
||||
(head_retired > hold_retired)
|
||||
? pgno2bytes(env, (pgno_t)(head_retired - hold_retired))
|
||||
: 0;
|
||||
int rc =
|
||||
callback(env, env->txn, pid, (mdbx_tid_t)((intptr_t)tid), straggler,
|
||||
(gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
|
||||
if (rc < 0)
|
||||
/* hsr returned an error and/or agreed to the MDBX_MAP_FULL error */
|
||||
break;
|
||||
|
||||
if (rc > 0) {
|
||||
if (rc == 1) {
|
||||
/* hsr reported the transaction is (or will be) aborted asynchronously */
|
||||
safe64_reset_compare(&stucked->txnid, straggler);
|
||||
} else {
|
||||
/* hsr reported reader process was killed and slot should be cleared */
|
||||
safe64_reset(&stucked->txnid, true);
|
||||
atomic_store64(&stucked->tid, 0, mo_Relaxed);
|
||||
atomic_store32(&stucked->pid, 0, mo_AcquireRelease);
|
||||
}
|
||||
} else if (!notify_eof_of_loop) {
|
||||
#if MDBX_ENABLE_PROFGC
|
||||
env->lck->pgops.gc_prof.kicks += 1;
|
||||
#endif /* MDBX_ENABLE_PROFGC */
|
||||
notify_eof_of_loop = true;
|
||||
}
|
||||
|
||||
} while (++retry < INT_MAX);
|
||||
|
||||
if (notify_eof_of_loop) {
|
||||
/* notify end of hsr-loop */
|
||||
const txnid_t turn = oldest - straggler;
|
||||
if (turn)
|
||||
NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN,
|
||||
straggler, oldest, turn);
|
||||
callback(env, env->txn, 0, 0, straggler,
|
||||
(turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
|
||||
}
|
||||
return oldest;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
__cold int mdbx_thread_register(const MDBX_env *env) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!env->lck_mmap.lck))
|
||||
return (env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM;
|
||||
|
||||
if (unlikely((env->flags & ENV_TXKEY) == 0)) {
|
||||
eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
|
||||
return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */;
|
||||
}
|
||||
|
||||
eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
|
||||
reader_slot_t *r = thread_rthc_get(env->me_txkey);
|
||||
if (unlikely(r != nullptr)) {
|
||||
eASSERT(env, r->pid.weak == env->pid);
|
||||
eASSERT(env, r->tid.weak == osal_thread_self());
|
||||
if (unlikely(r->pid.weak != env->pid))
|
||||
return MDBX_BAD_RSLOT;
|
||||
return MDBX_RESULT_TRUE /* already registered */;
|
||||
}
|
||||
|
||||
const uintptr_t tid = osal_thread_self();
|
||||
if (env->txn && unlikely(env->basal_txn->owner == tid))
|
||||
return MDBX_TXN_OVERLAPPING;
|
||||
return mvcc_bind_slot((MDBX_env *)env, tid).err;
|
||||
}
|
||||
|
||||
__cold int mdbx_thread_unregister(const MDBX_env *env) {
|
||||
int rc = check_env(env, true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!env->lck_mmap.lck))
|
||||
return MDBX_RESULT_TRUE;
|
||||
|
||||
if (unlikely((env->flags & ENV_TXKEY) == 0)) {
|
||||
eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
|
||||
return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */;
|
||||
}
|
||||
|
||||
eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
|
||||
reader_slot_t *r = thread_rthc_get(env->me_txkey);
|
||||
if (unlikely(r == nullptr))
|
||||
return MDBX_RESULT_TRUE /* not registered */;
|
||||
|
||||
eASSERT(env, r->pid.weak == env->pid);
|
||||
eASSERT(env, r->tid.weak == osal_thread_self());
|
||||
if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self()))
|
||||
return MDBX_BAD_RSLOT;
|
||||
|
||||
eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
|
||||
if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD))
|
||||
return MDBX_BUSY /* transaction is still active */;
|
||||
|
||||
atomic_store32(&r->pid, 0, mo_Relaxed);
|
||||
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
|
||||
thread_rthc_set(env->me_txkey, nullptr);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
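/* Illustrative usage sketch (not part of this commit): a worker thread that
 * pre-registers its reader slot once, runs a read transaction, and releases
 * the slot before exiting; the lookup body is elided. */
#if 0 /* example, not compiled */
static void reader_thread_example(MDBX_env *env) {
  int rc = mdbx_thread_register(env);
  if (rc != MDBX_SUCCESS && rc != MDBX_RESULT_TRUE /* already registered */)
    return;
  MDBX_txn *txn;
  if (mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn) == MDBX_SUCCESS) {
    /* ... perform lookups within the snapshot ... */
    mdbx_txn_abort(txn); /* for a read-only txn, abort just releases the snapshot */
  }
  mdbx_thread_unregister(env);
}
#endif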
395
src/node.c
Normal file
395
src/node.c
Normal file
@ -0,0 +1,395 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanation of the license change,
|
||||
/// credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__hot int __must_check_result node_add_dupfix(MDBX_cursor *mc, size_t indx,
|
||||
const MDBX_val *key) {
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
MDBX_ANALYSIS_ASSUME(key != nullptr);
|
||||
DKBUF_DEBUG;
|
||||
DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, "
|
||||
" key size %" PRIuPTR " [%s]",
|
||||
is_subpage(mp) ? "sub-" : "", mp->pgno, indx, key ? key->iov_len : 0,
|
||||
DKEY_DEBUG(key));
|
||||
|
||||
cASSERT(mc, key);
|
||||
cASSERT(mc, page_type_compat(mp) == (P_LEAF | P_DUPFIX));
|
||||
const size_t ksize = mc->tree->dupfix_size;
|
||||
cASSERT(mc, ksize == key->iov_len);
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);
|
||||
|
||||
/* Just using these for counting */
|
||||
const intptr_t lower = mp->lower + sizeof(indx_t);
|
||||
const intptr_t upper = mp->upper - (ksize - sizeof(indx_t));
|
||||
if (unlikely(lower > upper)) {
|
||||
mc->txn->flags |= MDBX_TXN_ERROR;
|
||||
return MDBX_PAGE_FULL;
|
||||
}
|
||||
mp->lower = (indx_t)lower;
|
||||
mp->upper = (indx_t)upper;
|
||||
|
||||
void *const ptr = page_dupfix_ptr(mp, indx, ksize);
|
||||
cASSERT(mc, nkeys >= indx);
|
||||
const size_t diff = nkeys - indx;
|
||||
if (likely(diff > 0))
|
||||
/* Move higher keys up one slot. */
|
||||
memmove(ptr_disp(ptr, ksize), ptr, diff * ksize);
|
||||
/* insert new key */
|
||||
memcpy(ptr, key->iov_base, ksize);
|
||||
|
||||
cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx,
|
||||
const MDBX_val *key, pgno_t pgno) {
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
DKBUF_DEBUG;
|
||||
DEBUG("add to branch-%spage %" PRIaPGNO " index %zi, node-pgno %" PRIaPGNO
|
||||
" key size %" PRIuPTR " [%s]",
|
||||
is_subpage(mp) ? "sub-" : "", mp->pgno, indx, pgno,
|
||||
key ? key->iov_len : 0, DKEY_DEBUG(key));
|
||||
|
||||
cASSERT(mc, page_type(mp) == P_BRANCH);
|
||||
STATIC_ASSERT(NODESIZE % 2 == 0);
|
||||
|
||||
/* Move higher pointers up one slot. */
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
cASSERT(mc, nkeys >= indx);
|
||||
for (size_t i = nkeys; i > indx; --i)
|
||||
mp->entries[i] = mp->entries[i - 1];
|
||||
|
||||
/* Adjust free space offsets. */
|
||||
const size_t branch_bytes = branch_size(mc->txn->env, key);
|
||||
const intptr_t lower = mp->lower + sizeof(indx_t);
|
||||
const intptr_t upper = mp->upper - (branch_bytes - sizeof(indx_t));
|
||||
if (unlikely(lower > upper)) {
|
||||
mc->txn->flags |= MDBX_TXN_ERROR;
|
||||
return MDBX_PAGE_FULL;
|
||||
}
|
||||
mp->lower = (indx_t)lower;
|
||||
mp->entries[indx] = mp->upper = (indx_t)upper;
|
||||
|
||||
/* Write the node data. */
|
||||
node_t *node = page_node(mp, indx);
|
||||
node_set_pgno(node, pgno);
|
||||
node_set_flags(node, 0);
|
||||
UNALIGNED_POKE_8(node, node_t, extra, 0);
|
||||
node_set_ks(node, 0);
|
||||
if (likely(key != nullptr)) {
|
||||
node_set_ks(node, key->iov_len);
|
||||
memcpy(node_key(node), key->iov_base, key->iov_len);
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx,
|
||||
const MDBX_val *key, MDBX_val *data,
|
||||
unsigned flags) {
|
||||
MDBX_ANALYSIS_ASSUME(key != nullptr);
|
||||
MDBX_ANALYSIS_ASSUME(data != nullptr);
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
DKBUF_DEBUG;
|
||||
DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR
|
||||
" key size %" PRIuPTR " [%s]",
|
||||
is_subpage(mp) ? "sub-" : "", mp->pgno, indx, data ? data->iov_len : 0,
|
||||
key ? key->iov_len : 0, DKEY_DEBUG(key));
|
||||
cASSERT(mc, key != nullptr && data != nullptr);
|
||||
cASSERT(mc, page_type_compat(mp) == P_LEAF);
|
||||
page_t *largepage = nullptr;
|
||||
|
||||
size_t node_bytes;
|
||||
if (unlikely(flags & N_BIGDATA)) {
|
||||
/* Data already on large/overflow page. */
|
||||
STATIC_ASSERT(sizeof(pgno_t) % 2 == 0);
|
||||
node_bytes =
|
||||
node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
|
||||
cASSERT(mc, page_room(mp) >= node_bytes);
|
||||
} else if (unlikely(node_size(key, data) > mc->txn->env->leaf_nodemax)) {
|
||||
/* Put data on large/overflow page. */
|
||||
if (unlikely(mc->tree->flags & MDBX_DUPSORT)) {
|
||||
ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db",
|
||||
mc->tree->flags);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
if (unlikely(flags & (N_DUPDATA | N_SUBDATA))) {
|
||||
ERROR("Unexpected target %s flags 0x%x for large data-item", "node",
|
||||
flags);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
cASSERT(mc, page_room(mp) >= leaf_size(mc->txn->env, key, data));
|
||||
const pgno_t ovpages = largechunk_npages(mc->txn->env, data->iov_len);
|
||||
const pgr_t npr = page_new_large(mc, ovpages);
|
||||
if (unlikely(npr.err != MDBX_SUCCESS))
|
||||
return npr.err;
|
||||
largepage = npr.page;
|
||||
DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR
|
||||
" data bytes",
|
||||
largepage->pages, largepage->pgno, data->iov_len);
|
||||
flags |= N_BIGDATA;
|
||||
node_bytes =
|
||||
node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
|
||||
cASSERT(mc, node_bytes == leaf_size(mc->txn->env, key, data));
|
||||
} else {
|
||||
cASSERT(mc, page_room(mp) >= leaf_size(mc->txn->env, key, data));
|
||||
node_bytes = node_size(key, data) + sizeof(indx_t);
|
||||
cASSERT(mc, node_bytes == leaf_size(mc->txn->env, key, data));
|
||||
}
|
||||
|
||||
/* Move higher pointers up one slot. */
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
cASSERT(mc, nkeys >= indx);
|
||||
for (size_t i = nkeys; i > indx; --i)
|
||||
mp->entries[i] = mp->entries[i - 1];
|
||||
|
||||
/* Adjust free space offsets. */
|
||||
const intptr_t lower = mp->lower + sizeof(indx_t);
|
||||
const intptr_t upper = mp->upper - (node_bytes - sizeof(indx_t));
|
||||
if (unlikely(lower > upper)) {
|
||||
mc->txn->flags |= MDBX_TXN_ERROR;
|
||||
return MDBX_PAGE_FULL;
|
||||
}
|
||||
mp->lower = (indx_t)lower;
|
||||
mp->entries[indx] = mp->upper = (indx_t)upper;
|
||||
|
||||
/* Write the node data. */
|
||||
node_t *node = page_node(mp, indx);
|
||||
node_set_ks(node, key->iov_len);
|
||||
node_set_flags(node, (uint8_t)flags);
|
||||
UNALIGNED_POKE_8(node, node_t, extra, 0);
|
||||
node_set_ds(node, data->iov_len);
|
||||
memcpy(node_key(node), key->iov_base, key->iov_len);
|
||||
|
||||
void *nodedata = node_data(node);
|
||||
if (likely(largepage == nullptr)) {
|
||||
if (unlikely(flags & N_BIGDATA)) {
|
||||
memcpy(nodedata, data->iov_base, sizeof(pgno_t));
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
} else {
|
||||
poke_pgno(nodedata, largepage->pgno);
|
||||
nodedata = page_data(largepage);
|
||||
}
|
||||
if (unlikely(flags & MDBX_RESERVE))
|
||||
data->iov_base = nodedata;
|
||||
else if (likely(data->iov_len /* to avoid UBSAN traps */))
|
||||
memcpy(nodedata, data->iov_base, data->iov_len);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot void node_del(MDBX_cursor *mc, size_t ksize) {
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
const size_t hole = mc->ki[mc->top];
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
|
||||
DEBUG("delete node %zu on %s page %" PRIaPGNO, hole,
|
||||
is_leaf(mp) ? "leaf" : "branch", mp->pgno);
|
||||
cASSERT(mc, hole < nkeys);
|
||||
|
||||
if (is_dupfix_leaf(mp)) {
|
||||
cASSERT(mc, ksize >= sizeof(indx_t));
|
||||
size_t diff = nkeys - 1 - hole;
|
||||
void *const base = page_dupfix_ptr(mp, hole, ksize);
|
||||
if (diff)
|
||||
memmove(base, ptr_disp(base, ksize), diff * ksize);
|
||||
cASSERT(mc, mp->lower >= sizeof(indx_t));
|
||||
mp->lower -= sizeof(indx_t);
|
||||
cASSERT(mc, (size_t)UINT16_MAX - mp->upper >= ksize - sizeof(indx_t));
|
||||
mp->upper += (indx_t)(ksize - sizeof(indx_t));
|
||||
cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);
|
||||
return;
|
||||
}
|
||||
|
||||
node_t *node = page_node(mp, hole);
|
||||
cASSERT(mc, !is_branch(mp) || hole || node_ks(node) == 0);
|
||||
size_t hole_size = NODESIZE + node_ks(node);
|
||||
if (is_leaf(mp))
|
||||
hole_size +=
|
||||
(node_flags(node) & N_BIGDATA) ? sizeof(pgno_t) : node_ds(node);
|
||||
hole_size = EVEN_CEIL(hole_size);
|
||||
|
||||
const indx_t hole_offset = mp->entries[hole];
|
||||
size_t r, w;
|
||||
for (r = w = 0; r < nkeys; r++)
|
||||
if (r != hole)
|
||||
mp->entries[w++] = (mp->entries[r] < hole_offset)
|
||||
? mp->entries[r] + (indx_t)hole_size
|
||||
: mp->entries[r];
|
||||
|
||||
void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ);
|
||||
memmove(ptr_disp(base, hole_size), base, hole_offset - mp->upper);
|
||||
|
||||
cASSERT(mc, mp->lower >= sizeof(indx_t));
|
||||
mp->lower -= sizeof(indx_t);
|
||||
cASSERT(mc, (size_t)UINT16_MAX - mp->upper >= hole_size);
|
||||
mp->upper += (indx_t)hole_size;
|
||||
|
||||
if (AUDIT_ENABLED()) {
|
||||
const uint8_t checking = mc->checking;
|
||||
mc->checking |= z_updating;
|
||||
const int page_check_err = page_check(mc, mp);
|
||||
mc->checking = checking;
|
||||
cASSERT(mc, page_check_err == MDBX_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
__noinline int node_read_bigdata(MDBX_cursor *mc, const node_t *node,
|
||||
MDBX_val *data, const page_t *mp) {
|
||||
cASSERT(mc, node_flags(node) == N_BIGDATA && data->iov_len == node_ds(node));
|
||||
|
||||
pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->txnid);
|
||||
if (unlikely(lp.err != MDBX_SUCCESS)) {
|
||||
DEBUG("read large/overflow page %" PRIaPGNO " failed",
|
||||
node_largedata_pgno(node));
|
||||
return lp.err;
|
||||
}
|
||||
|
||||
cASSERT(mc, page_type(lp.page) == P_LARGE);
|
||||
data->iov_base = page_data(lp.page);
|
||||
if (!MDBX_DISABLE_VALIDATION) {
|
||||
const MDBX_env *env = mc->txn->env;
|
||||
const size_t dsize = data->iov_len;
|
||||
const unsigned npages = largechunk_npages(env, dsize);
|
||||
if (unlikely(lp.page->pages < npages))
|
||||
return bad_page(lp.page,
|
||||
"too less n-pages %u for bigdata-node (%zu bytes)",
|
||||
lp.page->pages, dsize);
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
node_t *node_shrink(page_t *mp, size_t indx, node_t *node) {
|
||||
assert(node == page_node(mp, indx));
|
||||
page_t *sp = (page_t *)node_data(node);
|
||||
assert(is_subpage(sp) && page_numkeys(sp) > 0);
|
||||
const size_t delta =
|
||||
EVEN_FLOOR(page_room(sp) /* avoid the node uneven-sized */);
|
||||
if (unlikely(delta == 0))
|
||||
return node;
|
||||
|
||||
/* Prepare to shift upward, set len = length(subpage part to shift) */
|
||||
size_t nsize = node_ds(node) - delta, len = nsize;
|
||||
assert(nsize % 1 == 0);
|
||||
if (!is_dupfix_leaf(sp)) {
|
||||
len = PAGEHDRSZ;
|
||||
page_t *xp = ptr_disp(sp, delta); /* destination subpage */
|
||||
for (intptr_t i = page_numkeys(sp); --i >= 0;) {
|
||||
assert(sp->entries[i] >= delta);
|
||||
xp->entries[i] = (indx_t)(sp->entries[i] - delta);
|
||||
}
|
||||
}
|
||||
assert(sp->upper >= sp->lower + delta);
|
||||
sp->upper -= (indx_t)delta;
|
||||
sp->pgno = mp->pgno;
|
||||
node_set_ds(node, nsize);
|
||||
|
||||
/* Shift <lower nodes...initial part of subpage> upward */
|
||||
void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ);
|
||||
memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len);
|
||||
|
||||
const size_t pivot = mp->entries[indx];
|
||||
for (intptr_t i = page_numkeys(mp); --i >= 0;) {
|
||||
if (mp->entries[i] <= pivot) {
|
||||
assert((size_t)UINT16_MAX - mp->entries[i] >= delta);
|
||||
mp->entries[i] += (indx_t)delta;
|
||||
}
|
||||
}
|
||||
assert((size_t)UINT16_MAX - mp->upper >= delta);
|
||||
mp->upper += (indx_t)delta;
|
||||
|
||||
return ptr_disp(node, delta);
|
||||
}
|
||||
|
||||
__hot struct node_search_result node_search(MDBX_cursor *mc,
|
||||
const MDBX_val *key) {
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
const intptr_t nkeys = page_numkeys(mp);
|
||||
DKBUF_DEBUG;
|
||||
|
||||
DEBUG("searching %zu keys in %s %spage %" PRIaPGNO, nkeys,
|
||||
is_leaf(mp) ? "leaf" : "branch", is_subpage(mp) ? "sub-" : "",
|
||||
mp->pgno);
|
||||
|
||||
struct node_search_result ret;
|
||||
ret.exact = false;
|
||||
STATIC_ASSERT(P_BRANCH == 1);
|
||||
intptr_t low = mp->flags & P_BRANCH;
|
||||
intptr_t high = nkeys - 1;
|
||||
if (unlikely(high < low)) {
|
||||
mc->ki[mc->top] = 0;
|
||||
ret.node = nullptr;
|
||||
return ret;
|
||||
}
|
||||
|
||||
intptr_t i;
|
||||
MDBX_cmp_func *cmp = mc->clc->k.cmp;
|
||||
MDBX_val nodekey;
|
||||
if (unlikely(is_dupfix_leaf(mp))) {
|
||||
cASSERT(mc, mp->dupfix_ksize == mc->tree->dupfix_size);
|
||||
nodekey.iov_len = mp->dupfix_ksize;
|
||||
do {
|
||||
i = (low + high) >> 1;
|
||||
nodekey.iov_base = page_dupfix_ptr(mp, i, nodekey.iov_len);
|
||||
cASSERT(mc, ptr_disp(mp, mc->txn->env->ps) >=
|
||||
ptr_disp(nodekey.iov_base, nodekey.iov_len));
|
||||
int cr = cmp(key, &nodekey);
|
||||
DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr);
|
||||
if (cr > 0)
|
||||
low = ++i;
|
||||
else if (cr < 0)
|
||||
high = i - 1;
|
||||
else {
|
||||
ret.exact = true;
|
||||
break;
|
||||
}
|
||||
} while (likely(low <= high));
|
||||
|
||||
/* store the key index */
|
||||
mc->ki[mc->top] = (indx_t)i;
|
||||
ret.node =
|
||||
(i < nkeys)
|
||||
? /* fake for DUPFIX */ (node_t *)(intptr_t)-1
|
||||
: /* There is no entry larger or equal to the key. */ nullptr;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (MDBX_UNALIGNED_OK < 4 && is_branch(mp) && cmp == cmp_int_align2)
|
||||
/* Branch pages have no data, so if using integer keys,
|
||||
* alignment is guaranteed. Use faster cmp_int_align4(). */
|
||||
cmp = cmp_int_align4;
|
||||
|
||||
node_t *node;
|
||||
do {
|
||||
i = (low + high) >> 1;
|
||||
node = page_node(mp, i);
|
||||
nodekey.iov_len = node_ks(node);
|
||||
nodekey.iov_base = node_key(node);
|
||||
cASSERT(mc, ptr_disp(mp, mc->txn->env->ps) >=
|
||||
ptr_disp(nodekey.iov_base, nodekey.iov_len));
|
||||
int cr = cmp(key, &nodekey);
|
||||
if (is_leaf(mp))
|
||||
DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr);
|
||||
else
|
||||
DEBUG("found branch index %zu [%s -> %" PRIaPGNO "], rc = %i", i,
|
||||
DKEY_DEBUG(&nodekey), node_pgno(node), cr);
|
||||
if (cr > 0)
|
||||
low = ++i;
|
||||
else if (cr < 0)
|
||||
high = i - 1;
|
||||
else {
|
||||
ret.exact = true;
|
||||
break;
|
||||
}
|
||||
} while (likely(low <= high));
|
||||
|
||||
/* store the key index */
|
||||
mc->ki[mc->top] = (indx_t)i;
|
||||
ret.node = (i < nkeys)
|
||||
? page_node(mp, i)
|
||||
: /* There is no entry larger or equal to the key. */ nullptr;
|
||||
return ret;
|
||||
}
|
125
src/node.h
Normal file
125
src/node.h
Normal file
@ -0,0 +1,125 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* valid flags for mdbx_node_add() */
|
||||
#define NODE_ADD_FLAGS (N_DUPDATA | N_SUBDATA | MDBX_RESERVE | MDBX_APPEND)
|
||||
|
||||
/* Get the page number pointed to by a branch node */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
|
||||
node_pgno(const node_t *const __restrict node) {
|
||||
pgno_t pgno = UNALIGNED_PEEK_32(node, node_t, child_pgno);
|
||||
return pgno;
|
||||
}
|
||||
|
||||
/* Set the page number in a branch node */
|
||||
static inline void node_set_pgno(node_t *const __restrict node, pgno_t pgno) {
|
||||
assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO);
|
||||
|
||||
UNALIGNED_POKE_32(node, node_t, child_pgno, (uint32_t)pgno);
|
||||
}
|
||||
|
||||
/* Get the size of the data in a leaf node */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
node_ds(const node_t *const __restrict node) {
|
||||
return UNALIGNED_PEEK_32(node, node_t, dsize);
|
||||
}
|
||||
|
||||
/* Set the size of the data for a leaf node */
|
||||
static inline void node_set_ds(node_t *const __restrict node, size_t size) {
|
||||
assert(size < INT_MAX);
|
||||
UNALIGNED_POKE_32(node, node_t, dsize, (uint32_t)size);
|
||||
}
|
||||
|
||||
/* The size of a key in a node */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
node_ks(const node_t *const __restrict node) {
|
||||
return UNALIGNED_PEEK_16(node, node_t, ksize);
|
||||
}
|
||||
|
||||
/* Set the size of the key for a leaf node */
|
||||
static inline void node_set_ks(node_t *const __restrict node, size_t size) {
|
||||
assert(size < INT16_MAX);
|
||||
UNALIGNED_POKE_16(node, node_t, ksize, (uint16_t)size);
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
|
||||
node_flags(const node_t *const __restrict node) {
|
||||
return UNALIGNED_PEEK_8(node, node_t, flags);
|
||||
}
|
||||
|
||||
static inline void node_set_flags(node_t *const __restrict node,
|
||||
uint8_t flags) {
|
||||
UNALIGNED_POKE_8(node, node_t, flags, flags);
|
||||
}
|
||||
|
||||
/* Address of the key for the node */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline void *
|
||||
node_key(const node_t *const __restrict node) {
|
||||
return ptr_disp(node, NODESIZE);
|
||||
}
|
||||
|
||||
/* Address of the data for a node */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline void *
|
||||
node_data(const node_t *const __restrict node) {
|
||||
return ptr_disp(node_key(node), node_ks(node));
|
||||
}
|
||||
|
||||
/* Size of a node in a leaf page with a given key and data.
|
||||
* This is node header plus key plus data size. */
|
||||
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
|
||||
node_size_len(const size_t key_len, const size_t value_len) {
|
||||
return NODESIZE + EVEN_CEIL(key_len + value_len);
|
||||
}
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
|
||||
node_size(const MDBX_val *key, const MDBX_val *value) {
|
||||
return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0);
|
||||
}
|
||||
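/* Worked example (not part of this commit), assuming the usual 8-byte node
 * header: a 5-byte key with a 10-byte value needs
 *   node_size_len(5, 10) == NODESIZE + EVEN_CEIL(15) == NODESIZE + 16
 * bytes of payload space, plus one indx_t slot in the page's entry array. */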
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
|
||||
node_largedata_pgno(const node_t *const __restrict node) {
|
||||
assert(node_flags(node) & N_BIGDATA);
|
||||
return peek_pgno(node_data(node));
|
||||
}
|
||||
|
||||
MDBX_INTERNAL int __must_check_result node_read_bigdata(MDBX_cursor *mc,
|
||||
const node_t *node,
|
||||
MDBX_val *data,
|
||||
const page_t *mp);
|
||||
|
||||
static inline int __must_check_result node_read(MDBX_cursor *mc,
|
||||
const node_t *node,
|
||||
MDBX_val *data,
|
||||
const page_t *mp) {
|
||||
data->iov_len = node_ds(node);
|
||||
data->iov_base = node_data(node);
|
||||
if (likely(node_flags(node) != N_BIGDATA))
|
||||
return MDBX_SUCCESS;
|
||||
return node_read_bigdata(mc, node, data, mp);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL nsr_t node_search(MDBX_cursor *mc, const MDBX_val *key);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result node_add_branch(MDBX_cursor *mc,
|
||||
size_t indx,
|
||||
const MDBX_val *key,
|
||||
pgno_t pgno);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result node_add_leaf(MDBX_cursor *mc,
|
||||
size_t indx,
|
||||
const MDBX_val *key,
|
||||
MDBX_val *data,
|
||||
unsigned flags);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result node_add_dupfix(MDBX_cursor *mc,
|
||||
size_t indx,
|
||||
const MDBX_val *key);
|
||||
|
||||
MDBX_INTERNAL void node_del(MDBX_cursor *mc, size_t ksize);
|
||||
|
||||
MDBX_INTERNAL node_t *node_shrink(page_t *mp, size_t indx, node_t *node);
|
@ -1,7 +1,10 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
/*******************************************************************************
|
||||
*******************************************************************************
|
||||
*******************************************************************************
|
||||
*
|
||||
* BUILD TIME
|
||||
*
|
||||
* #### ##### ##### # #### # # ####
|
||||
* # # # # # # # # ## # #
|
||||
@ -13,6 +16,10 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/** \defgroup build_option Build options
|
||||
* The libmdbx build options.
|
||||
@{ */
|
||||
@ -192,7 +199,11 @@
|
||||
|
||||
/** Avoid dependence from MSVC CRT and use ntdll.dll instead. */
|
||||
#ifndef MDBX_WITHOUT_MSVC_CRT
|
||||
#if !defined(MDBX_BUILD_CXX) || !MDBX_BUILD_CXX
|
||||
#define MDBX_WITHOUT_MSVC_CRT 1
|
||||
#else
|
||||
#define MDBX_WITHOUT_MSVC_CRT 0
|
||||
#endif
|
||||
#elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1)
|
||||
#error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
|
||||
#endif /* MDBX_WITHOUT_MSVC_CRT */
|
||||
@ -499,6 +510,13 @@
|
||||
#endif
|
||||
#endif /* MDBX_CACHELINE_SIZE */
|
||||
|
||||
/* Max length of iov-vector passed to writev() call, used for auxiliary writes */
|
||||
#define MDBX_AUXILARY_IOV_MAX 64
|
||||
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
|
||||
#undef MDBX_AUXILARY_IOV_MAX
|
||||
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
|
||||
#endif /* MDBX_AUXILARY_IOV_MAX */
|
||||
|
||||
/** @} end of build options */
|
||||
/*******************************************************************************
|
||||
*******************************************************************************
|
||||
@ -513,6 +531,9 @@
|
||||
#else
|
||||
#define MDBX_DEBUG 1
|
||||
#endif
|
||||
#endif
|
||||
#if MDBX_DEBUG < 0 || MDBX_DEBUG > 2
|
||||
#error "The MDBX_DEBUG must be defined to 0, 1 or 2"
|
||||
#endif /* MDBX_DEBUG */
|
||||
|
||||
#else
|
||||
@ -532,7 +553,7 @@
|
||||
* Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
|
||||
*
|
||||
* \ingroup build_option */
|
||||
#define MDBX_DEBUG 0...7
|
||||
#define MDBX_DEBUG 0...2
|
||||
|
||||
/** Disables using of GNU libc extensions. */
|
||||
#define MDBX_DISABLE_GNU_SOURCE 0 or 1
|
||||
|
494
src/osal.c
494
src/osal.c
File diff suppressed because it is too large
Load Diff
539
src/osal.h
539
src/osal.h
@ -1,50 +1,11 @@
|
||||
/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
|
||||
|
||||
/*
|
||||
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
|
||||
* and other libmdbx authors: please see AUTHORS file.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in the file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
///
|
||||
/// https://en.wikipedia.org/wiki/Operating_system_abstraction_layer
|
||||
|
||||
#pragma once
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* C11 Atomics */
|
||||
|
||||
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
|
||||
#include <cstdatomic>
|
||||
#define MDBX_HAVE_C11ATOMICS
|
||||
#elif !defined(__cplusplus) && \
|
||||
(__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
|
||||
!defined(__STDC_NO_ATOMICS__) && \
|
||||
(__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
|
||||
!(defined(__GNUC__) || defined(__clang__)))
|
||||
#include <stdatomic.h>
|
||||
#define MDBX_HAVE_C11ATOMICS
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
|
||||
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
|
||||
'size_t' to 'LONGLONG' */
|
||||
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
|
||||
'std::size_t', possible loss of data */
|
||||
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
|
||||
'long', possible loss of data */
|
||||
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
|
||||
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
|
||||
#elif defined(__APPLE__)
|
||||
#include <libkern/OSAtomic.h>
|
||||
#else
|
||||
#error FIXME atomic-ops
|
||||
#endif
|
||||
#include "essentials.h"
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* Memory/Compiler barriers, cache coherence */
|
||||
@ -58,7 +19,7 @@
|
||||
#include <sys/cachectl.h>
|
||||
#endif
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) {
|
||||
MDBX_MAYBE_UNUSED static inline void osal_compiler_barrier(void) {
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
__asm__ __volatile__("" ::: "memory");
|
||||
#elif defined(_MSC_VER)
|
||||
@ -78,7 +39,7 @@ MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) {
|
||||
MDBX_MAYBE_UNUSED static inline void osal_memory_barrier(void) {
|
||||
#ifdef MDBX_HAVE_C11ATOMICS
|
||||
atomic_thread_fence(memory_order_seq_cst);
|
||||
#elif defined(__ATOMIC_SEQ_CST)
|
||||
@ -118,7 +79,7 @@ MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) {
|
||||
#define HAVE_SYS_TYPES_H
|
||||
typedef HANDLE osal_thread_t;
|
||||
typedef unsigned osal_thread_key_t;
|
||||
#define MAP_FAILED NULL
|
||||
#define MAP_FAILED nullptr
|
||||
#define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
|
||||
#define THREAD_CALL WINAPI
|
||||
#define THREAD_RESULT DWORD
|
||||
@ -210,19 +171,6 @@ typedef pthread_mutex_t osal_fastmutex_t;
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* OS abstraction layer stuff */
|
||||
|
||||
MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize;
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2,
|
||||
sys_allocation_granularity;
|
||||
|
||||
/* Get the size of a memory page for the system.
|
||||
* This is the basic size that the platform's memory manager uses, and is
|
||||
* fundamental to the use of memory-mapped files. */
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
|
||||
osal_syspagesize(void) {
|
||||
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
|
||||
return sys_pagesize;
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
typedef wchar_t pathchar_t;
|
||||
#define MDBX_PRIsPATH "ls"
|
||||
@ -234,7 +182,7 @@ typedef char pathchar_t;
|
||||
typedef struct osal_mmap {
|
||||
union {
|
||||
void *base;
|
||||
struct MDBX_lockinfo *lck;
|
||||
struct shared_lck *lck;
|
||||
};
|
||||
mdbx_filehandle_t fd;
|
||||
size_t limit; /* mapping length, but NOT a size of file nor DB */
|
||||
@ -245,25 +193,6 @@ typedef struct osal_mmap {
|
||||
#endif
|
||||
} osal_mmap_t;
|
||||
|
||||
typedef union bin128 {
|
||||
__anonymous_struct_extension__ struct {
|
||||
uint64_t x, y;
|
||||
};
|
||||
__anonymous_struct_extension__ struct {
|
||||
uint32_t a, b, c, d;
|
||||
};
|
||||
} bin128_t;
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
typedef union osal_srwlock {
|
||||
__anonymous_struct_extension__ struct {
|
||||
long volatile readerCount;
|
||||
long volatile writerCount;
|
||||
};
|
||||
RTL_SRWLOCK native;
|
||||
} osal_srwlock_t;
|
||||
#endif /* Windows */
|
||||
|
||||
#ifndef MDBX_HAVE_PWRITEV
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
@ -346,32 +275,30 @@ typedef struct osal_ioring {
|
||||
char *boundary;
|
||||
} osal_ioring_t;
|
||||
|
||||
#ifndef __cplusplus
|
||||
|
||||
/* Actually this is not ioring for now, but on the way. */
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *
|
||||
MDBX_INTERNAL int osal_ioring_create(osal_ioring_t *
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
,
|
||||
bool enable_direct,
|
||||
mdbx_filehandle_t overlapped_fd
|
||||
,
|
||||
bool enable_direct,
|
||||
mdbx_filehandle_t overlapped_fd
|
||||
#endif /* Windows */
|
||||
);
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
|
||||
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
|
||||
void *data, const size_t bytes);
|
||||
MDBX_INTERNAL int osal_ioring_resize(osal_ioring_t *, size_t items);
|
||||
MDBX_INTERNAL void osal_ioring_destroy(osal_ioring_t *);
|
||||
MDBX_INTERNAL void osal_ioring_reset(osal_ioring_t *);
|
||||
MDBX_INTERNAL int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
|
||||
void *data, const size_t bytes);
|
||||
typedef struct osal_ioring_write_result {
|
||||
int err;
|
||||
unsigned wops;
|
||||
} osal_ioring_write_result_t;
|
||||
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
|
||||
MDBX_INTERNAL osal_ioring_write_result_t
|
||||
osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd);
|
||||
|
||||
typedef struct iov_ctx iov_ctx_t;
|
||||
MDBX_INTERNAL_FUNC void osal_ioring_walk(
|
||||
osal_ioring_t *ior, iov_ctx_t *ctx,
|
||||
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
|
||||
MDBX_INTERNAL void osal_ioring_walk(osal_ioring_t *ior, iov_ctx_t *ctx,
|
||||
void (*callback)(iov_ctx_t *ctx,
|
||||
size_t offset, void *data,
|
||||
size_t bytes));
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline unsigned
|
||||
osal_ioring_left(const osal_ioring_t *ior) {
|
||||
@ -408,9 +335,9 @@ osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) {
|
||||
#define osal_asprintf asprintf
|
||||
#define osal_vasprintf vasprintf
|
||||
#else
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL
|
||||
MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...);
|
||||
MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap);
|
||||
MDBX_INTERNAL int osal_vasprintf(char **strp, const char *fmt, va_list ap);
|
||||
#endif
|
||||
|
||||
#if !defined(MADV_DODUMP) && defined(MADV_CORE)
|
||||
@ -421,8 +348,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap);
|
||||
#define MADV_DONTDUMP MADV_NOCORE
|
||||
#endif /* MADV_NOCORE -> MADV_DONTDUMP */
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
|
||||
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL void osal_jitter(bool tiny);
|
||||
|
||||
/* max bytes to write in one call */
|
||||
#if defined(_WIN64)
|
||||
@ -472,19 +398,13 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
|
||||
#endif /* OFF_T_MAX */
|
||||
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version;
|
||||
MDBX_INTERNAL_VAR_PROTO bool
|
||||
mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
|
||||
#endif /* Linux */
|
||||
#endif /* !Windows */
|
||||
|
||||
#ifndef osal_strdup
|
||||
LIBMDBX_API char *osal_strdup(const char *str);
|
||||
#endif
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) {
|
||||
MDBX_MAYBE_UNUSED static inline int osal_get_errno(void) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
DWORD rc = GetLastError();
|
||||
#else
|
||||
@ -494,40 +414,39 @@ MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) {
|
||||
}
|
||||
|
||||
#ifndef osal_memalign_alloc
|
||||
MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes,
|
||||
void **result);
|
||||
MDBX_INTERNAL int osal_memalign_alloc(size_t alignment, size_t bytes,
|
||||
void **result);
|
||||
#endif
|
||||
#ifndef osal_memalign_free
|
||||
MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr);
|
||||
MDBX_INTERNAL void osal_memalign_free(void *ptr);
|
||||
#endif
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair,
|
||||
bool part);
|
||||
MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part);
|
||||
MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL int osal_condpair_init(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL int osal_condpair_lock(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL int osal_condpair_unlock(osal_condpair_t *condpair);
|
||||
MDBX_INTERNAL int osal_condpair_signal(osal_condpair_t *condpair, bool part);
|
||||
MDBX_INTERNAL int osal_condpair_wait(osal_condpair_t *condpair, bool part);
|
||||
MDBX_INTERNAL int osal_condpair_destroy(osal_condpair_t *condpair);
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL int osal_fastmutex_init(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
|
||||
MDBX_INTERNAL int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
|
||||
size_t sgvcnt, uint64_t offset);
|
||||
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
|
||||
uint64_t offset);
|
||||
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
|
||||
size_t count, uint64_t offset);
|
||||
MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
|
||||
size_t count);
|
||||
MDBX_INTERNAL int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
|
||||
size_t sgvcnt, uint64_t offset);
|
||||
MDBX_INTERNAL int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
|
||||
uint64_t offset);
|
||||
MDBX_INTERNAL int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
|
||||
size_t count, uint64_t offset);
|
||||
MDBX_INTERNAL int osal_write(mdbx_filehandle_t fd, const void *buf,
|
||||
size_t count);
|
||||
|
||||
MDBX_INTERNAL_FUNC int
|
||||
MDBX_INTERNAL int
|
||||
osal_thread_create(osal_thread_t *thread,
|
||||
THREAD_RESULT(THREAD_CALL *start_routine)(void *),
|
||||
void *arg);
|
||||
MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread);
|
||||
MDBX_INTERNAL int osal_thread_join(osal_thread_t thread);
|
||||
|
||||
enum osal_syncmode_bits {
|
||||
MDBX_SYNC_NONE = 0,
|
||||
@@ -537,11 +456,11 @@ enum osal_syncmode_bits {
|
||||
MDBX_SYNC_IODQ = 8
|
||||
};
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd,
|
||||
const enum osal_syncmode_bits mode_bits);
|
||||
MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
|
||||
MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
|
||||
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
|
||||
MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd,
|
||||
const enum osal_syncmode_bits mode_bits);
|
||||
MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
|
||||
MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
|
||||
MDBX_INTERNAL int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
|
||||
|
||||
enum osal_openfile_purpose {
|
||||
MDBX_OPEN_DXB_READ,
|
||||
@@ -556,7 +475,7 @@ enum osal_openfile_purpose {
|
||||
MDBX_OPEN_DELETE
|
||||
};
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) {
|
||||
MDBX_MAYBE_UNUSED static inline bool osal_isdirsep(pathchar_t c) {
|
||||
return
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
c == '\\' ||
|
||||
@@ -564,50 +483,45 @@ MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) {
|
||||
c == '/';
|
||||
}
|
||||
|
||||
MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r,
|
||||
size_t len);
|
||||
MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname,
|
||||
size_t len);
|
||||
MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname);
|
||||
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
|
||||
const MDBX_env *env,
|
||||
const pathchar_t *pathname,
|
||||
mdbx_filehandle_t *fd,
|
||||
mdbx_mode_t unix_mode_bits);
|
||||
MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd);
|
||||
MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname);
|
||||
MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname);
|
||||
MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd);
|
||||
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait);
|
||||
MDBX_INTERNAL bool osal_pathequal(const pathchar_t *l, const pathchar_t *r,
|
||||
size_t len);
|
||||
MDBX_INTERNAL pathchar_t *osal_fileext(const pathchar_t *pathname, size_t len);
|
||||
MDBX_INTERNAL int osal_fileexists(const pathchar_t *pathname);
|
||||
MDBX_INTERNAL int osal_openfile(const enum osal_openfile_purpose purpose,
|
||||
const MDBX_env *env, const pathchar_t *pathname,
|
||||
mdbx_filehandle_t *fd,
|
||||
mdbx_mode_t unix_mode_bits);
|
||||
MDBX_INTERNAL int osal_closefile(mdbx_filehandle_t fd);
|
||||
MDBX_INTERNAL int osal_removefile(const pathchar_t *pathname);
|
||||
MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname);
|
||||
MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd);
|
||||
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait);
|
||||
|
||||
#define MMAP_OPTION_TRUNCATE 1
|
||||
#define MMAP_OPTION_SEMAPHORE 2
|
||||
MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size,
|
||||
const size_t limit, const unsigned options);
|
||||
MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map);
|
||||
MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size,
|
||||
const size_t limit, const unsigned options);
|
||||
MDBX_INTERNAL int osal_munmap(osal_mmap_t *map);
|
||||
#define MDBX_MRESIZE_MAY_MOVE 0x00000100
|
||||
#define MDBX_MRESIZE_MAY_UNMAP 0x00000200
|
||||
MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map,
|
||||
size_t size, size_t limit);
|
||||
MDBX_INTERNAL int osal_mresize(const int flags, osal_mmap_t *map, size_t size,
|
||||
size_t limit);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
typedef struct {
|
||||
unsigned limit, count;
|
||||
HANDLE handles[31];
|
||||
} mdbx_handle_array_t;
|
||||
MDBX_INTERNAL_FUNC int
|
||||
MDBX_INTERNAL int
|
||||
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
|
||||
MDBX_INTERNAL_FUNC int
|
||||
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
|
||||
MDBX_INTERNAL int osal_resume_threads_after_remap(mdbx_handle_array_t *array);
|
||||
#endif /* Windows */
|
||||
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
|
||||
size_t length,
|
||||
enum osal_syncmode_bits mode_bits);
|
||||
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
|
||||
const pathchar_t *pathname,
|
||||
int err);
|
||||
MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle);
|
||||
MDBX_INTERNAL int osal_msync(const osal_mmap_t *map, size_t offset,
|
||||
size_t length, enum osal_syncmode_bits mode_bits);
|
||||
MDBX_INTERNAL int osal_check_fs_rdonly(mdbx_filehandle_t handle,
|
||||
const pathchar_t *pathname, int err);
|
||||
MDBX_INTERNAL int osal_check_fs_incore(mdbx_filehandle_t handle);
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) {
|
||||
MDBX_MAYBE_UNUSED static inline uint32_t osal_getpid(void) {
|
||||
STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return GetCurrentProcessId();
|
||||
@@ -617,7 +531,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) {
|
||||
MDBX_MAYBE_UNUSED static inline uintptr_t osal_thread_self(void) {
|
||||
mdbx_tid_t thunk;
|
||||
STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
@@ -630,22 +544,22 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) {
|
||||
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
|
||||
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void);
|
||||
MDBX_INTERNAL int osal_check_tid4bionic(void);
|
||||
#else
|
||||
static __inline int osal_check_tid4bionic(void) { return 0; }
|
||||
static inline int osal_check_tid4bionic(void) { return 0; }
|
||||
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline int
|
||||
MDBX_MAYBE_UNUSED static inline int
|
||||
osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
|
||||
int err = osal_check_tid4bionic();
|
||||
return unlikely(err) ? err : pthread_mutex_lock(mutex);
|
||||
}
|
||||
#endif /* !Windows */
|
||||
|
||||
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
|
||||
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
|
||||
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
|
||||
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
|
||||
MDBX_INTERNAL uint64_t osal_monotime(void);
|
||||
MDBX_INTERNAL uint64_t osal_cputime(size_t *optional_page_faults);
|
||||
MDBX_INTERNAL uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
|
||||
MDBX_INTERNAL uint32_t osal_monotime_to_16dot16(uint64_t monotime);
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline uint32_t
|
||||
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
|
||||
@@ -653,249 +567,18 @@ osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
|
||||
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
|
||||
}
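The "16dot16" suffix in these helpers denotes a 16.16 fixed-point format: the upper 16 bits carry whole seconds and the lower 16 bits the fraction. A minimal, hedged illustration (the exact rounding behaviour is an assumption, not taken from this diff):

/* Illustrative sketch only: 1.5 seconds expressed as 16.16 fixed point. */
uint32_t sec_16dot16 = (1u << 16) | (1u << 15); /* 0x00018000 == 1.5 s */
uint64_t mono = osal_16dot16_to_monotime(sec_16dot16);
/* Converting back should yield the same value up to rounding. */
uint32_t roundtrip = osal_monotime_to_16dot16(mono);
(void)roundtrip;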
|
||||
|
||||
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* lck stuff */
|
||||
|
||||
/// \brief Initialization of synchronization primitives linked with MDBX_env
|
||||
/// instance both in LCK-file and within the current process.
|
||||
/// \param
|
||||
/// global_uniqueness_flag = true - denotes that there are no other processes
|
||||
/// working with DB and LCK-file. Thus the function MUST initialize
|
||||
/// shared synchronization objects in memory-mapped LCK-file.
|
||||
/// global_uniqueness_flag = false - denotes that at least one process is
|
||||
/// already working with DB and LCK-file, including the case when DB
|
||||
/// has already been opened in the current process. Thus the function
|
||||
/// MUST NOT initialize shared synchronization objects in memory-mapped
|
||||
/// LCK-file that are already in use.
|
||||
/// \return Error code or zero on success.
|
||||
MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
int global_uniqueness_flag);
|
||||
|
||||
/// \brief Disconnects from shared interprocess objects and destructs
|
||||
/// synchronization objects linked with MDBX_env instance
|
||||
/// within the current process.
|
||||
/// \param
|
||||
/// inprocess_neighbor = NULL - if the current process does not have other
|
||||
/// instances of MDBX_env linked with the DB being closed.
|
||||
/// Thus the function MUST check for other processes working with DB or
|
||||
/// LCK-file, and keep or destroy shared synchronization objects in
|
||||
/// memory-mapped LCK-file depending on the result.
|
||||
/// inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
|
||||
/// (any one of them, if there are several) working with DB or LCK-file within the
|
||||
/// current process. Thus the function MUST NOT try to acquire exclusive
|
||||
/// lock and/or try to destruct shared synchronization objects linked with
|
||||
/// DB or LCK-file. Moreover, the implementation MUST ensure correct work
|
||||
/// of other instances of MDBX_env within the current process, e.g.
|
||||
/// restore POSIX-fcntl locks after the closing of file descriptors.
|
||||
/// \return Error code (MDBX_PANIC) or zero on success.
|
||||
MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
|
||||
MDBX_env *inprocess_neighbor,
|
||||
const uint32_t current_pid);
|
||||
|
||||
/// \brief Connects to shared interprocess locking objects and tries to acquire
|
||||
/// the maximum lock level (shared if exclusive is not available)
|
||||
/// Depending on the implementation and/or platform (Windows) this function may
|
||||
/// acquire the non-OS super-level lock (e.g. for shared synchronization
|
||||
/// objects initialization), which will be downgraded to OS-exclusive or
|
||||
/// shared via explicit calling of osal_lck_downgrade().
|
||||
/// \return
|
||||
/// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
|
||||
/// the current process is the first and only after the last use of DB.
|
||||
/// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
|
||||
/// DB has already been opened and now is used by other processes.
|
||||
/// Otherwise (not 0 and not -1) - error code.
|
||||
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env);
|
||||
|
||||
/// \brief Downgrades the level of initially acquired lock to
|
||||
/// operational level specified by argument. The reason for such downgrade:
|
||||
/// - unblocking of other processes that are waiting for access, i.e.
|
||||
/// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
|
||||
/// should be made aware that access is unavailable rather than
|
||||
/// wait for it.
|
||||
/// - freeing locks that interfere with file operations (especially on Windows)
|
||||
/// (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
|
||||
/// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
|
||||
/// operational lock.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env,
|
||||
bool dont_wait);
|
||||
|
||||
/// \brief Locks LCK-file or/and table of readers for (de)registering.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env);
|
||||
|
||||
/// \brief Unlocks LCK-file or/and table of readers after (de)registering.
|
||||
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env);
|
||||
|
||||
/// \brief Acquires write-transaction lock.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait);
|
||||
|
||||
/// \brief Releases the write-transaction lock.
|
||||
MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env);
|
||||
|
||||
/// \brief Sets alive-flag of reader presence (indicative lock) for PID of
|
||||
/// the current process. The function does no more than needed for
|
||||
/// the correct working of osal_rpid_check() in other processes.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env);
|
||||
|
||||
/// \brief Resets alive-flag of reader presence (indicative lock)
|
||||
/// for PID of the current process. The function does no more than needed
|
||||
/// for the correct working of osal_rpid_check() in other processes.
|
||||
/// \return Error code or zero on success
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env);
|
||||
|
||||
/// \brief Checks the status of the reader process with the given pid with the
/// help of the alive-flag of presence (indicative lock) or by other means.
|
||||
/// \return
|
||||
/// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
|
||||
/// and working with DB (indicative lock is present).
|
||||
/// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
|
||||
/// or not working with DB (indicative lock is not present).
|
||||
/// Otherwise (not 0 and not -1) - error code.
|
||||
MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
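Taken together, the lck primitives above form the environment's locking bring-up. A minimal, hedged sketch of the typical ordering (simplified error handling; the helper name and exact sequence are illustrative, not the actual env-open code):

static int example_lck_bringup(MDBX_env *env, MDBX_env *inprocess_neighbor) {
  int rc = osal_lck_seize(env); /* acquire the strongest available lock */
  if (rc != MDBX_RESULT_TRUE && rc != MDBX_RESULT_FALSE)
    return rc;                  /* a real error code */
  const bool exclusive = (rc == MDBX_RESULT_TRUE); /* first/only user? */
  rc = osal_lck_init(env, inprocess_neighbor, exclusive);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = osal_lck_downgrade(env); /* drop to the operational lock level */
  if (rc != MDBX_SUCCESS)
    return rc;
  return osal_rpid_set(env);    /* mark this PID as a live reader */
}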
|
||||
MDBX_INTERNAL void osal_ctor(void);
|
||||
MDBX_INTERNAL void osal_dtor(void);
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst);
|
||||
|
||||
typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *);
|
||||
MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init,
|
||||
osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared,
|
||||
osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive;
|
||||
|
||||
#if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */
|
||||
typedef enum _FILE_INFO_BY_HANDLE_CLASS {
|
||||
FileBasicInfo,
|
||||
FileStandardInfo,
|
||||
FileNameInfo,
|
||||
FileRenameInfo,
|
||||
FileDispositionInfo,
|
||||
FileAllocationInfo,
|
||||
FileEndOfFileInfo,
|
||||
FileStreamInfo,
|
||||
FileCompressionInfo,
|
||||
FileAttributeTagInfo,
|
||||
FileIdBothDirectoryInfo,
|
||||
FileIdBothDirectoryRestartInfo,
|
||||
FileIoPriorityHintInfo,
|
||||
FileRemoteProtocolInfo,
|
||||
MaximumFileInfoByHandleClass
|
||||
} FILE_INFO_BY_HANDLE_CLASS,
|
||||
*PFILE_INFO_BY_HANDLE_CLASS;
|
||||
|
||||
typedef struct _FILE_END_OF_FILE_INFO {
|
||||
LARGE_INTEGER EndOfFile;
|
||||
} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO;
|
||||
|
||||
#define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001
|
||||
#define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002
|
||||
|
||||
typedef struct _FILE_REMOTE_PROTOCOL_INFO {
|
||||
USHORT StructureVersion;
|
||||
USHORT StructureSize;
|
||||
DWORD Protocol;
|
||||
USHORT ProtocolMajorVersion;
|
||||
USHORT ProtocolMinorVersion;
|
||||
USHORT ProtocolRevision;
|
||||
USHORT Reserved;
|
||||
DWORD Flags;
|
||||
struct {
|
||||
DWORD Reserved[8];
|
||||
} GenericReserved;
|
||||
struct {
|
||||
DWORD Reserved[16];
|
||||
} ProtocolSpecificReserved;
|
||||
} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO;
|
||||
|
||||
#endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */
|
||||
|
||||
typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)(
|
||||
_In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
|
||||
_Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx
|
||||
mdbx_GetFileInformationByHandleEx;
|
||||
|
||||
typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
|
||||
_In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer,
|
||||
_In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber,
|
||||
_Out_opt_ LPDWORD lpMaximumComponentLength,
|
||||
_Out_opt_ LPDWORD lpFileSystemFlags,
|
||||
_Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW
|
||||
mdbx_GetVolumeInformationByHandleW;
|
||||
|
||||
typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
|
||||
_Out_ LPWSTR lpszFilePath,
|
||||
_In_ DWORD cchFilePath,
|
||||
_In_ DWORD dwFlags);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW
|
||||
mdbx_GetFinalPathNameByHandleW;
|
||||
|
||||
typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
|
||||
_In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
|
||||
_Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle
|
||||
mdbx_SetFileInformationByHandle;
|
||||
|
||||
typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
|
||||
IN HANDLE FileHandle, IN OUT HANDLE Event,
|
||||
IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext,
|
||||
OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
|
||||
IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
|
||||
OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile;
|
||||
|
||||
typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64;
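These typedef-plus-MDBX_INTERNAL_VAR_PROTO pairs describe Windows APIs that are resolved at run time rather than linked directly, so older systems that lack them can still load the library. A hedged sketch of how such a pointer is typically filled in (illustrative only, not the actual osal.c code):

/* Illustrative only: resolve the optional API from kernel32 at startup. */
mdbx_GetTickCount64 = (MDBX_GetTickCount64)GetProcAddress(
    GetModuleHandleW(L"kernel32.dll"), "GetTickCount64");
if (!mdbx_GetTickCount64) {
  /* fall back to another time source on systems without GetTickCount64 */
}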
|
||||
|
||||
#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8
|
||||
typedef struct _WIN32_MEMORY_RANGE_ENTRY {
|
||||
PVOID VirtualAddress;
|
||||
SIZE_T NumberOfBytes;
|
||||
} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
|
||||
#endif /* Windows 8.x */
|
||||
|
||||
typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
|
||||
HANDLE hProcess, ULONG_PTR NumberOfEntries,
|
||||
PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
|
||||
|
||||
typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;
|
||||
|
||||
typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
|
||||
IN PLARGE_INTEGER NewSectionSize);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_NtExtendSection mdbx_NtExtendSection;
|
||||
|
||||
static __inline bool mdbx_RunningUnderWine(void) {
|
||||
return !mdbx_NtExtendSection;
|
||||
}
|
||||
|
||||
typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
|
||||
LPCSTR lpValue, DWORD dwFlags,
|
||||
LPDWORD pdwType, PVOID pvData,
|
||||
LPDWORD pcbData);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA;
|
||||
|
||||
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
|
||||
|
||||
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
|
||||
PUCHAR OverlappedRangeStart,
|
||||
ULONG Length);
|
||||
MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange
|
||||
mdbx_SetFileIoOverlappedRange;
|
||||
|
||||
MDBX_INTERNAL int osal_mb2w(const char *const src, wchar_t **const pdst);
|
||||
#endif /* Windows */
|
||||
|
||||
#endif /* !__cplusplus */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint64_t
|
||||
osal_bswap64(uint64_t v) {
|
||||
#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \
|
||||
__has_builtin(__builtin_bswap64)
|
||||
@@ -916,7 +599,7 @@ osal_bswap64(uint64_t v) {
|
||||
#endif
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t
|
||||
osal_bswap32(uint32_t v) {
|
||||
#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \
|
||||
__has_builtin(__builtin_bswap32)
|
||||
@@ -932,33 +615,3 @@ osal_bswap32(uint32_t v) {
|
||||
((v >> 8) & UINT32_C(0x0000ff00));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1900
|
||||
/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
|
||||
* for internal format-args checker. */
|
||||
#undef PRIuPTR
|
||||
#undef PRIiPTR
|
||||
#undef PRIdPTR
|
||||
#undef PRIxPTR
|
||||
#define PRIuPTR "Iu"
|
||||
#define PRIiPTR "Ii"
|
||||
#define PRIdPTR "Id"
|
||||
#define PRIxPTR "Ix"
|
||||
#define PRIuSIZE "zu"
|
||||
#define PRIiSIZE "zi"
|
||||
#define PRIdSIZE "zd"
|
||||
#define PRIxSIZE "zx"
|
||||
#endif /* fix PRI*PTR for _MSC_VER */
|
||||
|
||||
#ifndef PRIuSIZE
|
||||
#define PRIuSIZE PRIuPTR
|
||||
#define PRIiSIZE PRIiPTR
|
||||
#define PRIdSIZE PRIdPTR
|
||||
#define PRIxSIZE PRIxPTR
|
||||
#endif /* PRI*SIZE macros for MSVC */
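A tiny, hedged example of what these remapped macros buy: size_t values can be printed portably while MSVC's internal format-args checker stays satisfied (illustrative only):

size_t used = 42;
printf("used %" PRIuSIZE " bytes\n", used); /* accepted by MSVC's format checker with the remappings above */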
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
579
src/page-get.c
Normal file
@@ -0,0 +1,579 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
__cold int MDBX_PRINTF_ARGS(2, 3)
|
||||
bad_page(const page_t *mp, const char *fmt, ...) {
|
||||
if (LOG_ENABLED(MDBX_LOG_ERROR)) {
|
||||
static const page_t *prev;
|
||||
if (prev != mp) {
|
||||
char buf4unknown[16];
|
||||
prev = mp;
|
||||
debug_log(MDBX_LOG_ERROR, "badpage", 0,
|
||||
"corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n",
|
||||
pagetype_caption(page_type(mp), buf4unknown), mp->pgno,
|
||||
mp->txnid);
|
||||
}
|
||||
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
__cold void MDBX_PRINTF_ARGS(2, 3)
|
||||
poor_page(const page_t *mp, const char *fmt, ...) {
|
||||
if (LOG_ENABLED(MDBX_LOG_NOTICE)) {
|
||||
static const page_t *prev;
|
||||
if (prev != mp) {
|
||||
char buf4unknown[16];
|
||||
prev = mp;
|
||||
debug_log(MDBX_LOG_NOTICE, "poorpage", 0,
|
||||
"suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n",
|
||||
pagetype_caption(page_type(mp), buf4unknown), mp->pgno,
|
||||
mp->txnid);
|
||||
}
|
||||
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
}
|
||||
|
||||
MDBX_CONST_FUNCTION static clc_t value_clc(const MDBX_cursor *mc) {
|
||||
if (likely((mc->flags & z_inner) == 0))
|
||||
return mc->clc->v;
|
||||
else {
|
||||
clc_t stub = {.cmp = cmp_equal_or_wrong, .lmin = 0, .lmax = 0};
|
||||
return stub;
|
||||
}
|
||||
}
|
||||
|
||||
__cold int page_check(const MDBX_cursor *const mc, const page_t *const mp) {
|
||||
DKBUF;
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (unlikely(mp->pgno < MIN_PAGENO || mp->pgno > MAX_PAGENO))
|
||||
rc = bad_page(mp, "invalid pgno (%u)\n", mp->pgno);
|
||||
|
||||
MDBX_env *const env = mc->txn->env;
|
||||
const ptrdiff_t offset = ptr_dist(mp, env->dxb_mmap.base);
|
||||
unsigned flags_mask = P_ILL_BITS;
|
||||
unsigned flags_expected = 0;
|
||||
if (offset < 0 ||
|
||||
offset > (ptrdiff_t)(pgno2bytes(env, mc->txn->geo.first_unallocated) -
|
||||
((mp->flags & P_SUBP) ? PAGEHDRSZ + 1 : env->ps))) {
|
||||
/* should be dirty page without MDBX_WRITEMAP, or a subpage of. */
|
||||
flags_mask -= P_SUBP;
|
||||
if ((env->flags & MDBX_WRITEMAP) != 0 ||
|
||||
(!is_shadowed(mc->txn, mp) && !(mp->flags & P_SUBP)))
|
||||
rc = bad_page(mp, "invalid page-address %p, offset %zi\n",
|
||||
__Wpedantic_format_voidptr(mp), offset);
|
||||
} else if (offset & (env->ps - 1))
|
||||
flags_expected = P_SUBP;
|
||||
|
||||
if (unlikely((mp->flags & flags_mask) != flags_expected))
|
||||
rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n",
|
||||
mp->flags & flags_mask, flags_expected);
|
||||
|
||||
cASSERT(mc, (mc->checking & z_dupfix) == 0 || (mc->flags & z_inner) != 0);
|
||||
const uint8_t type = page_type(mp);
|
||||
switch (type) {
|
||||
default:
|
||||
return bad_page(mp, "invalid type (%u)\n", type);
|
||||
case P_LARGE:
|
||||
if (unlikely(mc->flags & z_inner))
|
||||
rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large",
|
||||
"nested dupsort tree", mc->tree->flags);
|
||||
const pgno_t npages = mp->pages;
|
||||
if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2))
|
||||
rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages);
|
||||
if (unlikely(mp->pgno + npages > mc->txn->geo.first_unallocated))
|
||||
rc = bad_page(
|
||||
mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n",
|
||||
mp->pgno + npages, mc->txn->geo.first_unallocated);
|
||||
return rc; //-------------------------- end of large/overflow page handling
|
||||
case P_LEAF | P_SUBP:
|
||||
if (unlikely(mc->tree->height != 1))
|
||||
rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n",
|
||||
"leaf-sub", "nested dupsort db", mc->tree->flags);
|
||||
/* fall through */
|
||||
__fallthrough;
|
||||
case P_LEAF:
|
||||
if (unlikely((mc->checking & z_dupfix) != 0))
|
||||
rc = bad_page(mp,
|
||||
"unexpected leaf-page for dupfix subtree (db-lags 0x%x)\n",
|
||||
mc->tree->flags);
|
||||
break;
|
||||
case P_LEAF | P_DUPFIX | P_SUBP:
|
||||
if (unlikely(mc->tree->height != 1))
|
||||
rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n",
|
||||
"leaf2-sub", "nested dupsort db", mc->tree->flags);
|
||||
/* fall through */
|
||||
__fallthrough;
|
||||
case P_LEAF | P_DUPFIX:
|
||||
if (unlikely((mc->checking & z_dupfix) == 0))
|
||||
rc = bad_page(
|
||||
mp,
|
||||
"unexpected leaf2-page for non-dupfix (sub)tree (db-flags 0x%x)\n",
|
||||
mc->tree->flags);
|
||||
break;
|
||||
case P_BRANCH:
|
||||
break;
|
||||
}
|
||||
|
||||
if (unlikely(mp->upper < mp->lower || (mp->lower & 1) ||
|
||||
PAGEHDRSZ + mp->upper > env->ps))
|
||||
rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n",
|
||||
mp->lower, mp->upper, page_space(env));
|
||||
|
||||
const char *const end_of_page = ptr_disp(mp, env->ps);
|
||||
const size_t nkeys = page_numkeys(mp);
|
||||
STATIC_ASSERT(P_BRANCH == 1);
|
||||
if (unlikely(nkeys <= (uint8_t)(mp->flags & P_BRANCH))) {
|
||||
if ((!(mc->flags & z_inner) || mc->tree->items) &&
|
||||
(!(mc->checking & z_updating) ||
|
||||
!(is_modifable(mc->txn, mp) || (mp->flags & P_SUBP))))
|
||||
rc =
|
||||
bad_page(mp, "%s-page nkeys (%zu) < %u\n",
|
||||
is_branch(mp) ? "branch" : "leaf", nkeys, 1 + is_branch(mp));
|
||||
}
|
||||
|
||||
const size_t ksize_max = keysize_max(env->ps, 0);
|
||||
const size_t leaf2_ksize = mp->dupfix_ksize;
|
||||
if (is_dupfix_leaf(mp)) {
|
||||
if (unlikely((mc->flags & z_inner) == 0 ||
|
||||
(mc->tree->flags & MDBX_DUPFIXED) == 0))
|
||||
rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n",
|
||||
mc->tree->flags);
|
||||
else if (unlikely(leaf2_ksize != mc->tree->dupfix_size))
|
||||
rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize);
|
||||
else if (unlikely(((leaf2_ksize & nkeys) ^ mp->upper) & 1))
|
||||
rc = bad_page(
|
||||
mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n",
|
||||
mp->upper, nkeys, leaf2_ksize);
|
||||
} else {
|
||||
if (unlikely((mp->upper & 1) ||
|
||||
PAGEHDRSZ + mp->upper + nkeys * sizeof(node_t) + nkeys - 1 >
|
||||
env->ps))
|
||||
rc =
|
||||
bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n",
|
||||
mp->upper, nkeys, page_space(env));
|
||||
}
|
||||
|
||||
MDBX_val here, prev = {0, 0};
|
||||
clc_t v_clc = value_clc(mc);
|
||||
for (size_t i = 0; i < nkeys; ++i) {
|
||||
if (is_dupfix_leaf(mp)) {
|
||||
const char *const key = page_dupfix_ptr(mp, i, mc->tree->dupfix_size);
|
||||
if (unlikely(end_of_page < key + leaf2_ksize)) {
|
||||
rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n",
|
||||
key + leaf2_ksize - end_of_page);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (unlikely(leaf2_ksize != mc->clc->k.lmin)) {
|
||||
if (unlikely(leaf2_ksize < mc->clc->k.lmin ||
|
||||
leaf2_ksize > mc->clc->k.lmax))
|
||||
rc = bad_page(mp,
|
||||
"leaf2-item size (%zu) <> min/max length (%zu/%zu)\n",
|
||||
leaf2_ksize, mc->clc->k.lmin, mc->clc->k.lmax);
|
||||
else
|
||||
mc->clc->k.lmin = mc->clc->k.lmax = leaf2_ksize;
|
||||
}
|
||||
if ((mc->checking & z_ignord) == 0) {
|
||||
here.iov_base = (void *)key;
|
||||
here.iov_len = leaf2_ksize;
|
||||
if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0))
|
||||
rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i,
|
||||
DKEY(&prev), DVAL(&here));
|
||||
prev = here;
|
||||
}
|
||||
} else {
|
||||
const node_t *const node = page_node(mp, i);
|
||||
const char *const node_end = ptr_disp(node, NODESIZE);
|
||||
if (unlikely(node_end > end_of_page)) {
|
||||
rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i,
|
||||
node_end - end_of_page);
|
||||
continue;
|
||||
}
|
||||
const size_t ksize = node_ks(node);
|
||||
if (unlikely(ksize > ksize_max))
|
||||
rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize);
|
||||
const char *const key = node_key(node);
|
||||
if (unlikely(end_of_page < key + ksize)) {
|
||||
rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i,
|
||||
key + ksize - end_of_page);
|
||||
continue;
|
||||
}
|
||||
if ((is_leaf(mp) || i > 0)) {
|
||||
if (unlikely(ksize < mc->clc->k.lmin || ksize > mc->clc->k.lmax))
|
||||
rc = bad_page(
|
||||
mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n",
|
||||
i, ksize, mc->clc->k.lmin, mc->clc->k.lmax);
|
||||
if ((mc->checking & z_ignord) == 0) {
|
||||
here.iov_base = (void *)key;
|
||||
here.iov_len = ksize;
|
||||
if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0))
|
||||
rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i,
|
||||
DKEY(&prev), DVAL(&here));
|
||||
prev = here;
|
||||
}
|
||||
}
|
||||
if (is_branch(mp)) {
|
||||
if ((mc->checking & z_updating) == 0 && i == 0 && unlikely(ksize != 0))
|
||||
rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n",
|
||||
i, ksize);
|
||||
const pgno_t ref = node_pgno(node);
|
||||
if (unlikely(ref < MIN_PAGENO) ||
|
||||
(unlikely(ref >= mc->txn->geo.first_unallocated) &&
|
||||
(unlikely(ref >= mc->txn->geo.now) ||
|
||||
!(mc->checking & z_retiring))))
|
||||
rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref);
|
||||
if (unlikely(node_flags(node)))
|
||||
rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i,
|
||||
node_flags(node));
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (node_flags(node)) {
|
||||
default:
|
||||
rc =
|
||||
bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node));
|
||||
break;
|
||||
case N_BIGDATA /* data on large-page */:
|
||||
case 0 /* usual */:
|
||||
case N_SUBDATA /* sub-db */:
|
||||
case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
|
||||
case N_DUPDATA /* short sub-page */:
|
||||
break;
|
||||
}
|
||||
|
||||
const size_t dsize = node_ds(node);
|
||||
const char *const data = node_data(node);
|
||||
if (node_flags(node) & N_BIGDATA) {
|
||||
if (unlikely(end_of_page < data + sizeof(pgno_t))) {
|
||||
rc = bad_page(
|
||||
mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n",
|
||||
"bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page);
|
||||
continue;
|
||||
}
|
||||
if (unlikely(dsize <= v_clc.lmin || dsize > v_clc.lmax))
|
||||
rc = bad_page(
|
||||
mp,
|
||||
"big-node data size (%zu) <> min/max value-length (%zu/%zu)\n",
|
||||
dsize, v_clc.lmin, v_clc.lmax);
|
||||
if (unlikely(node_size_len(node_ks(node), dsize) <=
|
||||
mc->txn->env->leaf_nodemax) &&
|
||||
mc->tree != &mc->txn->dbs[FREE_DBI])
|
||||
poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
|
||||
|
||||
if ((mc->checking & z_retiring) == 0) {
|
||||
const pgr_t lp =
|
||||
page_get_large(mc, node_largedata_pgno(node), mp->txnid);
|
||||
if (unlikely(lp.err != MDBX_SUCCESS))
|
||||
return lp.err;
|
||||
cASSERT(mc, page_type(lp.page) == P_LARGE);
|
||||
const unsigned npages = largechunk_npages(env, dsize);
|
||||
if (unlikely(lp.page->pages != npages)) {
|
||||
if (lp.page->pages < npages)
|
||||
rc = bad_page(lp.page,
|
||||
"too less n-pages %u for bigdata-node (%zu bytes)",
|
||||
lp.page->pages, dsize);
|
||||
else if (mc->tree != &mc->txn->dbs[FREE_DBI])
|
||||
poor_page(lp.page,
|
||||
"extra n-pages %u for bigdata-node (%zu bytes)",
|
||||
lp.page->pages, dsize);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (unlikely(end_of_page < data + dsize)) {
|
||||
rc = bad_page(mp,
|
||||
"node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n",
|
||||
"data", i, nkeys, dsize, data + dsize - end_of_page);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (node_flags(node)) {
|
||||
default:
|
||||
/* wrong, but already handled */
|
||||
continue;
|
||||
case 0 /* usual */:
|
||||
if (unlikely(dsize < v_clc.lmin || dsize > v_clc.lmax)) {
|
||||
rc = bad_page(
|
||||
mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n",
|
||||
dsize, v_clc.lmin, v_clc.lmax);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case N_SUBDATA /* sub-db */:
|
||||
if (unlikely(dsize != sizeof(tree_t))) {
|
||||
rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
|
||||
if (unlikely(dsize != sizeof(tree_t))) {
|
||||
rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n",
|
||||
dsize, sizeof(tree_t));
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case N_DUPDATA /* short sub-page */:
|
||||
if (unlikely(dsize <= PAGEHDRSZ)) {
|
||||
rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n",
|
||||
dsize);
|
||||
continue;
|
||||
} else {
|
||||
const page_t *const sp = (page_t *)data;
|
||||
switch (sp->flags &
|
||||
/* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
|
||||
case P_LEAF | P_SUBP:
|
||||
case P_LEAF | P_DUPFIX | P_SUBP:
|
||||
break;
|
||||
default:
|
||||
rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n",
|
||||
sp->flags);
|
||||
continue;
|
||||
}
|
||||
|
||||
const char *const end_of_subpage = data + dsize;
|
||||
const intptr_t nsubkeys = page_numkeys(sp);
|
||||
if (unlikely(nsubkeys == 0) && !(mc->checking & z_updating) &&
|
||||
mc->tree->items)
|
||||
rc = bad_page(mp, "no keys on a %s-page\n",
|
||||
is_dupfix_leaf(sp) ? "leaf2-sub" : "leaf-sub");
|
||||
|
||||
MDBX_val sub_here, sub_prev = {0, 0};
|
||||
for (int ii = 0; ii < nsubkeys; ii++) {
|
||||
if (is_dupfix_leaf(sp)) {
|
||||
/* DUPFIX pages have no entries[] or node headers */
|
||||
const size_t sub_ksize = sp->dupfix_ksize;
|
||||
const char *const sub_key =
|
||||
page_dupfix_ptr(sp, ii, mc->tree->dupfix_size);
|
||||
if (unlikely(end_of_subpage < sub_key + sub_ksize)) {
|
||||
rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n",
|
||||
sub_key + sub_ksize - end_of_subpage);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (unlikely(sub_ksize != v_clc.lmin)) {
|
||||
if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax))
|
||||
rc = bad_page(mp,
|
||||
"nested-leaf2-key size (%zu) <> min/max "
|
||||
"value-length (%zu/%zu)\n",
|
||||
sub_ksize, v_clc.lmin, v_clc.lmax);
|
||||
else
|
||||
v_clc.lmin = v_clc.lmax = sub_ksize;
|
||||
}
|
||||
if ((mc->checking & z_ignord) == 0) {
|
||||
sub_here.iov_base = (void *)sub_key;
|
||||
sub_here.iov_len = sub_ksize;
|
||||
if (sub_prev.iov_base &&
|
||||
unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0))
|
||||
rc = bad_page(mp,
|
||||
"nested-leaf2-key #%u wrong order (%s >= %s)\n",
|
||||
ii, DKEY(&sub_prev), DVAL(&sub_here));
|
||||
sub_prev = sub_here;
|
||||
}
|
||||
} else {
|
||||
const node_t *const sub_node = page_node(sp, ii);
|
||||
const char *const sub_node_end = ptr_disp(sub_node, NODESIZE);
|
||||
if (unlikely(sub_node_end > end_of_subpage)) {
|
||||
rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n",
|
||||
end_of_subpage - sub_node_end);
|
||||
continue;
|
||||
}
|
||||
if (unlikely(node_flags(sub_node) != 0))
|
||||
rc = bad_page(mp, "nested-node invalid flags (%u)\n",
|
||||
node_flags(sub_node));
|
||||
|
||||
const size_t sub_ksize = node_ks(sub_node);
|
||||
const char *const sub_key = node_key(sub_node);
|
||||
const size_t sub_dsize = node_ds(sub_node);
|
||||
/* char *sub_data = node_data(sub_node); */
|
||||
|
||||
if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax))
|
||||
rc = bad_page(mp,
|
||||
"nested-node-key size (%zu) <> min/max "
|
||||
"value-length (%zu/%zu)\n",
|
||||
sub_ksize, v_clc.lmin, v_clc.lmax);
|
||||
if ((mc->checking & z_ignord) == 0) {
|
||||
sub_here.iov_base = (void *)sub_key;
|
||||
sub_here.iov_len = sub_ksize;
|
||||
if (sub_prev.iov_base &&
|
||||
unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0))
|
||||
rc = bad_page(mp,
|
||||
"nested-node-key #%u wrong order (%s >= %s)\n",
|
||||
ii, DKEY(&sub_prev), DVAL(&sub_here));
|
||||
sub_prev = sub_here;
|
||||
}
|
||||
if (unlikely(sub_dsize != 0))
|
||||
rc = bad_page(mp, "nested-node non-empty data size (%zu)\n",
|
||||
sub_dsize);
|
||||
if (unlikely(end_of_subpage < sub_key + sub_ksize))
|
||||
rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n",
|
||||
sub_key + sub_ksize - end_of_subpage);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static __always_inline int check_page_header(const uint16_t ILL,
|
||||
const page_t *page,
|
||||
MDBX_txn *const txn,
|
||||
const txnid_t front) {
|
||||
if (unlikely(page->flags & ILL)) {
|
||||
if (ILL == P_ILL_BITS || (page->flags & P_ILL_BITS))
|
||||
return bad_page(page, "invalid page's flags (%u)\n", page->flags);
|
||||
else if (ILL & P_LARGE) {
|
||||
assert((ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0);
|
||||
assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX));
|
||||
return bad_page(page, "unexpected %s instead of %s (%u)\n",
|
||||
"large/overflow", "branch/leaf/leaf2", page->flags);
|
||||
} else if (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) {
|
||||
assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_DUPFIX));
|
||||
assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX));
|
||||
return bad_page(page, "unexpected %s instead of %s (%u)\n",
|
||||
"branch/leaf/leaf2", "large/overflow", page->flags);
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(page->txnid > front) &&
|
||||
unlikely(page->txnid > txn->front_txnid || front < txn->txnid))
|
||||
return bad_page(
|
||||
page,
|
||||
"invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n",
|
||||
page->txnid,
|
||||
(front == txn->front_txnid && front != txn->txnid) ? "front-txn"
|
||||
: "parent-page",
|
||||
front);
|
||||
|
||||
if (((ILL & P_LARGE) || !is_largepage(page)) &&
|
||||
(ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0) {
|
||||
/* Checking the parity of page->upper here either produces false errors or is
 * too costly in the number of operations. The catch is that upper may be odd
 * on DUPFIX pages when there is an odd number of elements of odd length.
 * Therefore the parity of page->upper is not checked here, but the
 * corresponding full checks are done in page_check(). */
|
||||
if (unlikely(page->upper < page->lower || (page->lower & 1) ||
|
||||
PAGEHDRSZ + page->upper > txn->env->ps))
|
||||
return bad_page(page,
|
||||
"invalid page' lower(%u)/upper(%u) with limit %zu\n",
|
||||
page->lower, page->upper, page_space(txn->env));
|
||||
|
||||
} else if ((ILL & P_LARGE) == 0) {
|
||||
const pgno_t npages = page->pages;
|
||||
if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2))
|
||||
return bad_page(page, "invalid n-pages (%u) for large-page\n", npages);
|
||||
if (unlikely(page->pgno + npages > txn->geo.first_unallocated))
|
||||
return bad_page(
|
||||
page,
|
||||
"end of large-page beyond (%u) allocated space (%u next-pgno)\n",
|
||||
page->pgno + npages, txn->geo.first_unallocated);
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold static __noinline pgr_t check_page_complete(const uint16_t ILL,
|
||||
page_t *page,
|
||||
const MDBX_cursor *const mc,
|
||||
const txnid_t front) {
|
||||
pgr_t r = {page, check_page_header(ILL, page, mc->txn, front)};
|
||||
if (likely(r.err == MDBX_SUCCESS))
|
||||
r.err = page_check(mc, page);
|
||||
if (unlikely(r.err != MDBX_SUCCESS))
|
||||
mc->txn->flags |= MDBX_TXN_ERROR;
|
||||
return r;
|
||||
}
|
||||
|
||||
static __always_inline pgr_t page_get_inline(const uint16_t ILL,
|
||||
const MDBX_cursor *const mc,
|
||||
const pgno_t pgno,
|
||||
const txnid_t front) {
|
||||
MDBX_txn *const txn = mc->txn;
|
||||
tASSERT(txn, front <= txn->front_txnid);
|
||||
|
||||
pgr_t r;
|
||||
if (unlikely(pgno >= txn->geo.first_unallocated)) {
|
||||
ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno);
|
||||
r.page = nullptr;
|
||||
r.err = MDBX_PAGE_NOTFOUND;
|
||||
bailout:
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return r;
|
||||
}
|
||||
|
||||
eASSERT(txn->env, ((txn->flags ^ txn->env->flags) & MDBX_WRITEMAP) == 0);
|
||||
r.page = pgno2page(txn->env, pgno);
|
||||
if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) {
|
||||
const MDBX_txn *spiller = txn;
|
||||
do {
|
||||
/* Spilled pages were dirtied in this txn and flushed
|
||||
* because the dirty list got full. Bring this page
|
||||
* back in from the map (but don't unspill it here,
|
||||
* leave that unless page_touch happens again). */
|
||||
if (unlikely(spiller->flags & MDBX_TXN_SPILLS) &&
|
||||
spill_search(spiller, pgno))
|
||||
break;
|
||||
|
||||
const size_t i = dpl_search(spiller, pgno);
|
||||
tASSERT(txn, (intptr_t)i > 0);
|
||||
if (spiller->tw.dirtylist->items[i].pgno == pgno) {
|
||||
r.page = spiller->tw.dirtylist->items[i].ptr;
|
||||
break;
|
||||
}
|
||||
|
||||
spiller = spiller->parent;
|
||||
} while (unlikely(spiller));
|
||||
}
|
||||
|
||||
if (unlikely(r.page->pgno != pgno)) {
|
||||
r.err = bad_page(
|
||||
r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n",
|
||||
r.page->pgno, pgno);
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
if (unlikely(mc->checking & z_pagecheck))
|
||||
return check_page_complete(ILL, r.page, mc, front);
|
||||
|
||||
#if MDBX_DISABLE_VALIDATION
|
||||
r.err = MDBX_SUCCESS;
|
||||
#else
|
||||
r.err = check_page_header(ILL, r.page, txn, front);
|
||||
if (unlikely(r.err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
#endif /* MDBX_DISABLE_VALIDATION */
|
||||
return r;
|
||||
}
|
||||
|
||||
pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno,
|
||||
const txnid_t front) {
|
||||
return page_get_inline(P_ILL_BITS, mc, pgno, front);
|
||||
}
|
||||
|
||||
__hot pgr_t page_get_three(const MDBX_cursor *const mc, const pgno_t pgno,
|
||||
const txnid_t front) {
|
||||
return page_get_inline(P_ILL_BITS | P_LARGE, mc, pgno, front);
|
||||
}
|
||||
|
||||
pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno,
|
||||
const txnid_t front) {
|
||||
return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_DUPFIX, mc, pgno,
|
||||
front);
|
||||
}
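The three wrappers above differ only in the ILL mask handed to page_get_inline(), i.e. in which page-flag bits are treated as illegal for the caller. A minimal, hedged usage sketch (the surrounding cursor/txn setup is assumed, not shown in this diff):

/* Illustrative only: fetch a page that must NOT be a large/overflow page. */
pgr_t r = page_get_three(mc, pgno, mc->txn->front_txnid);
if (likely(r.err == MDBX_SUCCESS)) {
  /* r.page is a branch/leaf/leaf2 page; a P_LARGE page would have been
   * rejected via bad_page(), since P_LARGE is part of the ILL mask here. */
}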
|
198
src/page-iov.c
Normal file
@@ -0,0 +1,198 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages,
|
||||
mdbx_filehandle_t fd, bool check_coherence) {
|
||||
ctx->env = txn->env;
|
||||
ctx->ior = &txn->env->ioring;
|
||||
ctx->fd = fd;
|
||||
ctx->coherency_timestamp =
|
||||
(check_coherence || txn->env->lck->pgops.incoherence.weak)
|
||||
? 0
|
||||
: UINT64_MAX /* skip the comparison */;
|
||||
ctx->err = osal_ioring_prepare(ctx->ior, items,
|
||||
pgno_align2os_bytes(txn->env, npages));
|
||||
if (likely(ctx->err == MDBX_SUCCESS)) {
|
||||
#if MDBX_NEED_WRITTEN_RANGE
|
||||
ctx->flush_begin = MAX_PAGENO;
|
||||
ctx->flush_end = MIN_PAGENO;
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
osal_ioring_reset(ctx->ior);
|
||||
}
|
||||
return ctx->err;
|
||||
}
|
||||
|
||||
static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data,
|
||||
size_t bytes) {
|
||||
MDBX_env *const env = ctx->env;
|
||||
eASSERT(env, (env->flags & MDBX_WRITEMAP) == 0);
|
||||
|
||||
page_t *wp = (page_t *)data;
|
||||
eASSERT(env, wp->pgno == bytes2pgno(env, offset));
|
||||
eASSERT(env, bytes2pgno(env, bytes) >= (is_largepage(wp) ? wp->pages : 1u));
|
||||
eASSERT(env, (wp->flags & P_ILL_BITS) == 0);
|
||||
|
||||
if (likely(ctx->err == MDBX_SUCCESS)) {
|
||||
const page_t *const rp = ptr_disp(env->dxb_mmap.base, offset);
|
||||
VALGRIND_MAKE_MEM_DEFINED(rp, bytes);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes);
|
||||
osal_flush_incoherent_mmap(rp, bytes, globals.sys_pagesize);
|
||||
/* Check with a timeout as the workaround
 * for https://libmdbx.dqdkfa.ru/dead-github/issues/269
 *
 * The problem shows up only with reordering: when the meta-page written
 * last "overtakes" the pages written before it, i.e. when data written to
 * the file later becomes visible in the mapping earlier than data written
 * before it.
 *
 * Originally a full comparison was always performed here. That gave a full
 * guarantee against the problem, but introduced overhead. In some scenarios
 * a performance drop of up to 10-15% was observed, and up to 30% in
 * synthetic tests. Of course nobody dug into the causes and simply settled
 * on the opinion "libmdbx is no faster than LMDB",
 * for example: https://clck.ru/3386er
 *
 * Therefore, after a series of experiments and tests, the following has
 * been implemented:
 * 0. The build option MDBX_FORCE_CHECK_MMAP_COHERENCY=1 enables a full
 *    comparison after writing.
 *    The remaining items are a weighted compromise between a full guarantee
 *    of detecting the problem and useless costs on systems free of this
 *    flaw.
 * 1. At transaction start the chosen meta-page is checked for consistency
 *    with the b-tree root pages. This check has proven sufficient without a
 *    post-write comparison. Detected "incoherence" cases are counted, and
 *    while that counter is non-zero a full comparison is performed. Thus the
 *    code switches to full-comparison mode as soon as the otherwise
 *    sufficient check notices the problem at least once.
 * 2. No comparison is performed at transaction commit, because:
 *    - if the "non-coherence" flaw is present (with deferred copying or PTE
 *      updates after returning from the write syscall), a check in this
 *      process does not guarantee data freshness in another process, which
 *      may start a transaction right after the commit;
 *    - comparing only the last block nearly restores performance in large
 *      transactions, but at the same time dilutes confidence in the absence
 *      of failures, which devalues the whole idea;
 *    - after the data, a meta-page will be written whose consistency with
 *      the b-tree root pages is checked at transaction start, and only that
 *      check has proven sufficient.
 * 3. During spilling a full comparison of the written pages is performed.
 *    There was a temptation to compare only partially, e.g. the beginning
 *    and end of each block. But spilling may evict pages repeatedly,
 *    including large/overflow pages, with the risk of reading the old
 *    version of a page in the current transaction before it is re-written,
 *    which could cause extremely rare, non-reproducible errors. Given that
 *    spilling happens very rarely, it was decided to forgo the saving in
 *    favor of reliability. */
|
||||
#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY
|
||||
#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0
|
||||
#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */
|
||||
if ((MDBX_FORCE_CHECK_MMAP_COHERENCY ||
|
||||
ctx->coherency_timestamp != UINT64_MAX) &&
|
||||
unlikely(memcmp(wp, rp, bytes))) {
|
||||
ctx->coherency_timestamp = 0;
|
||||
env->lck->pgops.incoherence.weak =
|
||||
(env->lck->pgops.incoherence.weak >= INT32_MAX)
|
||||
? INT32_MAX
|
||||
: env->lck->pgops.incoherence.weak + 1;
|
||||
WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->pgno,
|
||||
"(workaround for incoherent flaw of unified page/buffer cache)");
|
||||
do
|
||||
if (coherency_timeout(&ctx->coherency_timestamp, wp->pgno, env) !=
|
||||
MDBX_RESULT_TRUE) {
|
||||
ctx->err = MDBX_PROBLEM;
|
||||
break;
|
||||
}
|
||||
while (unlikely(memcmp(wp, rp, bytes)));
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(bytes == env->ps))
|
||||
page_shadow_release(env, wp, 1);
|
||||
else {
|
||||
do {
|
||||
eASSERT(env, wp->pgno == bytes2pgno(env, offset));
|
||||
eASSERT(env, (wp->flags & P_ILL_BITS) == 0);
|
||||
size_t npages = is_largepage(wp) ? wp->pages : 1u;
|
||||
size_t chunk = pgno2bytes(env, npages);
|
||||
eASSERT(env, bytes >= chunk);
|
||||
page_t *next = ptr_disp(wp, chunk);
|
||||
page_shadow_release(env, wp, npages);
|
||||
wp = next;
|
||||
offset += chunk;
|
||||
bytes -= chunk;
|
||||
} while (bytes);
|
||||
}
|
||||
}
|
||||
|
||||
static void iov_complete(iov_ctx_t *ctx) {
|
||||
if ((ctx->env->flags & MDBX_WRITEMAP) == 0)
|
||||
osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages);
|
||||
osal_ioring_reset(ctx->ior);
|
||||
}
|
||||
|
||||
int iov_write(iov_ctx_t *ctx) {
|
||||
eASSERT(ctx->env, !iov_empty(ctx));
|
||||
osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
ctx->env->lck->pgops.wops.weak += r.wops;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
ctx->err = r.err;
|
||||
if (unlikely(ctx->err != MDBX_SUCCESS))
|
||||
ERROR("Write error: %s", mdbx_strerror(ctx->err));
|
||||
iov_complete(ctx);
|
||||
return ctx->err;
|
||||
}
|
||||
|
||||
int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, size_t npages) {
|
||||
MDBX_env *const env = txn->env;
|
||||
tASSERT(txn, ctx->err == MDBX_SUCCESS);
|
||||
tASSERT(txn, dp->pgno >= MIN_PAGENO && dp->pgno < txn->geo.first_unallocated);
|
||||
tASSERT(txn, is_modifable(txn, dp));
|
||||
tASSERT(txn, !(dp->flags & ~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE)));
|
||||
|
||||
if (is_shadowed(txn, dp)) {
|
||||
tASSERT(txn, !(txn->flags & MDBX_WRITEMAP));
|
||||
dp->txnid = txn->txnid;
|
||||
tASSERT(txn, is_spilled(txn, dp));
|
||||
#if MDBX_AVOID_MSYNC
|
||||
doit:;
|
||||
#endif /* MDBX_AVOID_MSYNC */
|
||||
int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp,
|
||||
pgno2bytes(env, npages));
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
ctx->err = err;
|
||||
if (unlikely(err != MDBX_RESULT_TRUE)) {
|
||||
iov_complete(ctx);
|
||||
return err;
|
||||
}
|
||||
err = iov_write(ctx);
|
||||
tASSERT(txn, iov_empty(ctx));
|
||||
if (likely(err == MDBX_SUCCESS)) {
|
||||
err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp,
|
||||
pgno2bytes(env, npages));
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
iov_complete(ctx);
|
||||
return ctx->err = err;
|
||||
}
|
||||
}
|
||||
tASSERT(txn, ctx->err == MDBX_SUCCESS);
|
||||
}
|
||||
} else {
|
||||
tASSERT(txn, txn->flags & MDBX_WRITEMAP);
|
||||
#if MDBX_AVOID_MSYNC
|
||||
goto doit;
|
||||
#endif /* MDBX_AVOID_MSYNC */
|
||||
}
|
||||
|
||||
#if MDBX_NEED_WRITTEN_RANGE
|
||||
ctx->flush_begin =
|
||||
(ctx->flush_begin < dp->pgno) ? ctx->flush_begin : dp->pgno;
|
||||
ctx->flush_end = (ctx->flush_end > dp->pgno + (pgno_t)npages)
|
||||
? ctx->flush_end
|
||||
: dp->pgno + (pgno_t)npages;
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
return MDBX_SUCCESS;
|
||||
}
|
38
src/page-iov.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2
|
||||
#endif
|
||||
|
||||
struct iov_ctx {
|
||||
MDBX_env *env;
|
||||
osal_ioring_t *ior;
|
||||
mdbx_filehandle_t fd;
|
||||
int err;
|
||||
#ifndef MDBX_NEED_WRITTEN_RANGE
|
||||
#define MDBX_NEED_WRITTEN_RANGE 1
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
#if MDBX_NEED_WRITTEN_RANGE
|
||||
pgno_t flush_begin;
|
||||
pgno_t flush_end;
|
||||
#endif /* MDBX_NEED_WRITTEN_RANGE */
|
||||
uint64_t coherency_timestamp;
|
||||
};
|
||||
|
||||
MDBX_INTERNAL __must_check_result int
|
||||
iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages,
|
||||
mdbx_filehandle_t fd, bool check_coherence);
|
||||
|
||||
static inline bool iov_empty(const iov_ctx_t *ctx) {
|
||||
return osal_ioring_used(ctx->ior) == 0;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL __must_check_result int iov_page(MDBX_txn *txn, iov_ctx_t *ctx,
|
||||
page_t *dp, size_t npages);
|
||||
|
||||
MDBX_INTERNAL __must_check_result int iov_write(iov_ctx_t *ctx);
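A minimal, hedged sketch of how this iov API is meant to be driven (the fd, dirty-page array and counts are placeholders, not taken from this diff):

iov_ctx_t ctx;
int rc = iov_init(txn, &ctx, /*items*/ 64, /*npages*/ 64, fd, true);
for (size_t n = 0; rc == MDBX_SUCCESS && n < ndirty; ++n)
  rc = iov_page(txn, &ctx, dirty_pages[n], /*npages*/ 1);
if (rc == MDBX_SUCCESS && !iov_empty(&ctx))
  rc = iov_write(&ctx); /* flush whatever is still queued in the ring */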
|
772
src/page-ops.c
Normal file
@@ -0,0 +1,772 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static inline tree_t *outer_tree(MDBX_cursor *mc) {
|
||||
cASSERT(mc, (mc->flags & z_inner) != 0);
|
||||
subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree);
|
||||
cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner);
|
||||
cASSERT(mc, mc->tree == &couple->outer.subcur->nested_tree);
|
||||
cASSERT(mc, &mc->clc->k == &couple->outer.clc->v);
|
||||
return couple->outer.tree;
|
||||
}
|
||||
|
||||
pgr_t page_new(MDBX_cursor *mc, const unsigned flags) {
|
||||
cASSERT(mc, (flags & P_LARGE) == 0);
|
||||
pgr_t ret = gc_alloc_single(mc);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
return ret;
|
||||
|
||||
DEBUG("db %zu allocated new page %" PRIaPGNO, cursor_dbi(mc), ret.page->pgno);
|
||||
ret.page->flags = (uint16_t)flags;
|
||||
cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY);
|
||||
cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
mc->txn->env->lck->pgops.newly.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
|
||||
STATIC_ASSERT(P_BRANCH == 1);
|
||||
const unsigned is_branch = flags & P_BRANCH;
|
||||
|
||||
ret.page->lower = 0;
|
||||
ret.page->upper = (indx_t)(mc->txn->env->ps - PAGEHDRSZ);
|
||||
mc->tree->branch_pages += is_branch;
|
||||
mc->tree->leaf_pages += 1 - is_branch;
|
||||
if (unlikely(mc->flags & z_inner)) {
|
||||
tree_t *outer = outer_tree(mc);
|
||||
outer->branch_pages += is_branch;
|
||||
outer->leaf_pages += 1 - is_branch;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) {
|
||||
pgr_t ret = likely(npages == 1) ? gc_alloc_single(mc)
|
||||
: gc_alloc_ex(mc, npages, ALLOC_DEFAULT);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
return ret;
|
||||
|
||||
DEBUG("dbi %zu allocated new large-page %" PRIaPGNO ", num %zu",
|
||||
cursor_dbi(mc), ret.page->pgno, npages);
|
||||
ret.page->flags = P_LARGE;
|
||||
cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY);
|
||||
cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
mc->txn->env->lck->pgops.newly.weak += npages;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
|
||||
mc->tree->large_pages += (pgno_t)npages;
|
||||
ret.page->pages = (pgno_t)npages;
|
||||
cASSERT(mc, !(mc->flags & z_inner));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__hot void page_copy(page_t *const dst, const page_t *const src,
|
||||
const size_t size) {
|
||||
STATIC_ASSERT(UINT16_MAX > MDBX_MAX_PAGESIZE - PAGEHDRSZ);
|
||||
STATIC_ASSERT(MDBX_MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4);
|
||||
void *copy_dst = dst;
|
||||
const void *copy_src = src;
|
||||
size_t copy_len = size;
|
||||
if (src->flags & P_DUPFIX) {
|
||||
copy_len = PAGEHDRSZ + src->dupfix_ksize * page_numkeys(src);
|
||||
if (unlikely(copy_len > size))
|
||||
goto bailout;
|
||||
} else if ((src->flags & P_LARGE) == 0) {
|
||||
size_t upper = src->upper, lower = src->lower;
|
||||
intptr_t unused = upper - lower;
|
||||
/* If page isn't full, just copy the used portion. Adjust
|
||||
* alignment so memcpy may copy words instead of bytes. */
|
||||
if (unused > MDBX_CACHELINE_SIZE * 3) {
|
||||
lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *));
|
||||
upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *));
|
||||
if (unlikely(upper > copy_len))
|
||||
goto bailout;
|
||||
memcpy(copy_dst, copy_src, lower);
|
||||
copy_dst = ptr_disp(copy_dst, upper);
|
||||
copy_src = ptr_disp(copy_src, upper);
|
||||
copy_len -= upper;
|
||||
}
|
||||
}
|
||||
memcpy(copy_dst, copy_src, copy_len);
|
||||
return;
|
||||
|
||||
bailout:
|
||||
if (src->flags & P_DUPFIX)
|
||||
bad_page(src, "%s addr %p, n-keys %zu, ksize %u",
|
||||
"invalid/corrupted source page", __Wpedantic_format_voidptr(src),
|
||||
page_numkeys(src), src->dupfix_ksize);
|
||||
else
|
||||
bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page",
|
||||
__Wpedantic_format_voidptr(src), src->upper);
|
||||
memset(dst, -1, size);
|
||||
}
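
A standalone sketch of the alignment trick used by page_copy() above: when a page is far from full, copy the header plus the slot array and the upper data area separately, rounding the split points to sizeof(void *) so memcpy can move whole words and skip the unused middle. ceil_pow2()/floor_pow2() below are simplified stand-ins for libmdbx's ceil_powerof2()/floor_powerof2(); the sizes in main() are illustrative.

#include <assert.h>
#include <stddef.h>
#include <string.h>

static size_t ceil_pow2(size_t v, size_t g) { return (v + g - 1) & ~(g - 1); }
static size_t floor_pow2(size_t v, size_t g) { return v & ~(g - 1); }

static void copy_sparse(void *dst, const void *src, size_t size,
                        size_t lower /* end of the used slot area */,
                        size_t upper /* start of the used data area */) {
  lower = ceil_pow2(lower, sizeof(void *));
  upper = floor_pow2(upper, sizeof(void *));
  assert(lower <= upper && upper <= size);
  memcpy(dst, src, lower);                                    /* header + slots */
  memcpy((char *)dst + upper, (const char *)src + upper, size - upper); /* data */
}

int main(void) {
  char src[64], dst[64] = {0};
  memset(src, 'x', sizeof(src));
  copy_sparse(dst, src, sizeof(src), 16, 48); /* skip the unused middle */
  return 0;
}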
|
||||
|
||||
__cold pgr_t __must_check_result page_unspill(MDBX_txn *const txn,
|
||||
const page_t *const mp) {
|
||||
VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0);
|
||||
tASSERT(txn, is_spilled(txn, mp));
|
||||
const MDBX_txn *scan = txn;
|
||||
pgr_t ret;
|
||||
do {
|
||||
tASSERT(txn, (scan->flags & MDBX_TXN_SPILLS) != 0);
|
||||
const size_t si = spill_search(scan, mp->pgno);
|
||||
if (!si)
|
||||
continue;
|
||||
const unsigned npages = is_largepage(mp) ? mp->pages : 1;
|
||||
ret.page = page_shadow_alloc(txn, npages);
|
||||
if (unlikely(!ret.page)) {
|
||||
ret.err = MDBX_ENOMEM;
|
||||
return ret;
|
||||
}
|
||||
page_copy(ret.page, mp, pgno2bytes(txn->env, npages));
|
||||
if (scan == txn) {
|
||||
/* If in current txn, this page is no longer spilled.
|
||||
* If it happens to be the last page, truncate the spill list.
|
||||
* Otherwise mark it as deleted by setting the LSB. */
|
||||
spill_remove(txn, si, npages);
|
||||
} /* otherwise, if belonging to a parent txn, the
|
||||
* page remains spilled until child commits */
|
||||
|
||||
ret.err = page_dirty(txn, ret.page, npages);
|
||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||
return ret;
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->env->lck->pgops.unspill.weak += npages;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
ret.page->flags |= (scan == txn) ? 0 : P_SPILLED;
|
||||
ret.err = MDBX_SUCCESS;
|
||||
return ret;
|
||||
} while (likely((scan = scan->parent) != nullptr &&
|
||||
(scan->flags & MDBX_TXN_SPILLS) != 0));
|
||||
ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN
|
||||
" not found in the spill-list(s), current txn %" PRIaTXN
|
||||
" front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN,
|
||||
mp->pgno, mp->txnid, txn->txnid, txn->front_txnid,
|
||||
txn->env->basal_txn->txnid, txn->env->basal_txn->front_txnid);
|
||||
ret.err = MDBX_PROBLEM;
|
||||
ret.page = nullptr;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
|
||||
tASSERT(txn, is_modifable(txn, mp) && txn->tw.dirtylist);
|
||||
tASSERT(txn, !is_largepage(mp) && !is_subpage(mp));
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
const size_t n = dpl_search(txn, mp->pgno);
|
||||
if (MDBX_AVOID_MSYNC &&
|
||||
unlikely(txn->tw.dirtylist->items[n].pgno != mp->pgno)) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP));
|
||||
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1);
|
||||
VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->env->lck->pgops.unspill.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
return page_dirty(txn, (page_t *)mp, 1);
|
||||
}
|
||||
|
||||
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length);
|
||||
tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->pgno &&
|
||||
txn->tw.dirtylist->items[n].ptr == mp);
|
||||
if (!MDBX_AVOID_MSYNC || (txn->flags & MDBX_WRITEMAP) == 0) {
|
||||
size_t *const ptr =
|
||||
ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->tw.dirtylru;
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc,
|
||||
const page_t *const mp) {
|
||||
tASSERT(txn, !is_modifable(txn, mp) && !is_largepage(mp));
|
||||
if (is_subpage(mp)) {
|
||||
((page_t *)mp)->txnid = txn->front_txnid;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int rc;
|
||||
page_t *np;
|
||||
if (is_frozen(txn, mp)) {
|
||||
/* CoW the page */
|
||||
rc = pnl_need(&txn->tw.retired_pages, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
const pgr_t par = gc_alloc_single(mc);
|
||||
rc = par.err;
|
||||
np = par.page;
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
const pgno_t pgno = np->pgno;
|
||||
DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, cursor_dbi_dbg(mc),
|
||||
mp->pgno, pgno);
|
||||
tASSERT(txn, mp->pgno != pgno);
|
||||
pnl_append_prereserved(txn->tw.retired_pages, mp->pgno);
|
||||
/* Update the parent page, if any, to point to the new page */
|
||||
if (likely(mc->top)) {
|
||||
page_t *parent = mc->pg[mc->top - 1];
|
||||
node_t *node = page_node(parent, mc->ki[mc->top - 1]);
|
||||
node_set_pgno(node, pgno);
|
||||
} else {
|
||||
mc->tree->root = pgno;
|
||||
}
|
||||
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->env->lck->pgops.cow.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
page_copy(np, mp, txn->env->ps);
|
||||
np->pgno = pgno;
|
||||
np->txnid = txn->front_txnid;
|
||||
} else if (is_spilled(txn, mp)) {
|
||||
pgr_t pur = page_unspill(txn, mp);
|
||||
np = pur.page;
|
||||
rc = pur.err;
|
||||
if (likely(rc == MDBX_SUCCESS)) {
|
||||
tASSERT(txn, np != nullptr);
|
||||
goto done;
|
||||
}
|
||||
goto fail;
|
||||
} else {
|
||||
if (unlikely(!txn->parent)) {
|
||||
ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s "
|
||||
"page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
|
||||
" without parent transaction, current txn %" PRIaTXN
|
||||
" front %" PRIaTXN,
|
||||
is_branch(mp) ? "branch" : "leaf", mp->pgno, mp->txnid,
|
||||
mc->txn->txnid, mc->txn->front_txnid);
|
||||
rc = MDBX_PROBLEM;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
DEBUG("clone db %d page %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno);
|
||||
tASSERT(txn,
|
||||
txn->tw.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
|
||||
/* No - copy it */
|
||||
np = page_shadow_alloc(txn, 1);
|
||||
if (unlikely(!np)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
page_copy(np, mp, txn->env->ps);
|
||||
|
||||
/* insert a clone of parent's dirty page, so don't touch dirtyroom */
|
||||
rc = page_dirty(txn, np, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->env->lck->pgops.clone.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
|
||||
done:
|
||||
/* Adjust cursors pointing to mp */
|
||||
mc->pg[mc->top] = np;
|
||||
MDBX_cursor *m2 = txn->cursors[cursor_dbi(mc)];
|
||||
if (mc->flags & z_inner) {
|
||||
for (; m2; m2 = m2->next) {
|
||||
MDBX_cursor *m3 = &m2->subcur->cursor;
|
||||
if (m3->top < mc->top)
|
||||
continue;
|
||||
if (m3->pg[mc->top] == mp)
|
||||
m3->pg[mc->top] = np;
|
||||
}
|
||||
} else {
|
||||
for (; m2; m2 = m2->next) {
|
||||
if (m2->top < mc->top)
|
||||
continue;
|
||||
if (m2->pg[mc->top] == mp) {
|
||||
m2->pg[mc->top] = np;
|
||||
if (is_leaf(np) && inner_pointed(m2))
|
||||
cursor_inner_refresh(m2, np, m2->ki[mc->top]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
fail:
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return rc;
|
||||
}
|
||||
|
||||
page_t *page_shadow_alloc(MDBX_txn *txn, size_t num) {
|
||||
MDBX_env *env = txn->env;
|
||||
page_t *np = env->shadow_reserve;
|
||||
size_t size = env->ps;
|
||||
if (likely(num == 1 && np)) {
|
||||
eASSERT(env, env->shadow_reserve_len > 0);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size);
|
||||
VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)),
|
||||
size + sizeof(size_t));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(np), sizeof(page_t *));
|
||||
env->shadow_reserve = page_next(np);
|
||||
env->shadow_reserve_len -= 1;
|
||||
} else {
|
||||
size = pgno2bytes(env, num);
|
||||
void *const ptr = osal_malloc(size + sizeof(size_t));
|
||||
if (unlikely(!ptr)) {
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t));
|
||||
np = ptr_disp(ptr, sizeof(size_t));
|
||||
}
|
||||
|
||||
if ((env->flags & MDBX_NOMEMINIT) == 0) {
|
||||
/* For a single page alloc, we init everything after the page header.
|
||||
* For multi-page, we init the final page; if the caller needed that
|
||||
* many pages they will be filling in at least up to the last page. */
|
||||
size_t skip = PAGEHDRSZ;
|
||||
if (num > 1)
|
||||
skip += pgno2bytes(env, num - 1);
|
||||
memset(ptr_disp(np, skip), 0, size - skip);
|
||||
}
|
||||
#if MDBX_DEBUG
|
||||
np->pgno = 0;
|
||||
#endif
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(np, size);
|
||||
np->flags = 0;
|
||||
np->pages = (pgno_t)num;
|
||||
return np;
|
||||
}
|
||||
|
||||
void page_shadow_release(MDBX_env *env, page_t *dp, size_t npages) {
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages));
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages));
|
||||
if (unlikely(env->flags & MDBX_PAGEPERTURB))
|
||||
memset(dp, -1, pgno2bytes(env, npages));
|
||||
if (likely(npages == 1 &&
|
||||
env->shadow_reserve_len < env->options.dp_reserve_limit)) {
|
||||
MDBX_ASAN_POISON_MEMORY_REGION(dp, env->ps);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
|
||||
page_next(dp) = env->shadow_reserve;
|
||||
VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)));
|
||||
env->shadow_reserve = dp;
|
||||
env->shadow_reserve_len += 1;
|
||||
} else {
|
||||
/* large pages just get freed directly */
|
||||
void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
|
||||
VALGRIND_MEMPOOL_FREE(env, ptr);
|
||||
osal_free(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
__cold static void page_kill(MDBX_txn *txn, page_t *mp, pgno_t pgno,
|
||||
size_t npages) {
|
||||
MDBX_env *const env = txn->env;
|
||||
DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno);
|
||||
eASSERT(env, pgno >= NUM_METAS && npages);
|
||||
if (!is_frozen(txn, mp)) {
|
||||
const size_t bytes = pgno2bytes(env, npages);
|
||||
memset(mp, -1, bytes);
|
||||
mp->pgno = pgno;
|
||||
if ((txn->flags & MDBX_WRITEMAP) == 0)
|
||||
osal_pwrite(env->lazy_fd, mp, bytes, pgno2bytes(env, pgno));
|
||||
} else {
|
||||
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
|
||||
iov[0].iov_len = env->ps;
|
||||
iov[0].iov_base = ptr_disp(env->page_auxbuf, env->ps);
|
||||
size_t iov_off = pgno2bytes(env, pgno), n = 1;
|
||||
while (--npages) {
|
||||
iov[n] = iov[0];
|
||||
if (++n == MDBX_AUXILARY_IOV_MAX) {
|
||||
osal_pwritev(env->lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off);
|
||||
iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
|
||||
n = 0;
|
||||
}
|
||||
}
|
||||
osal_pwritev(env->lazy_fd, iov, n, iov_off);
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) {
|
||||
  /* TODO:
   * 1) with "sequence economy" enabled, check that the page is not adjacent
   *    to any of the pages already present in the reclaimed list;
   * 2) consider dropping half of a large loose-list into the reclaimed list. */
|
||||
return txn->tw.loose_count < txn->env->options.dp_loose_limit &&
|
||||
(!MDBX_ENABLE_REFUND ||
|
||||
/* skip pages near to the end in favor of compactification */
|
||||
txn->geo.first_unallocated >
|
||||
pgno + txn->env->options.dp_loose_limit ||
|
||||
txn->geo.first_unallocated <= txn->env->options.dp_loose_limit);
|
||||
}
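
A standalone illustration of the criterion above with made-up numbers (not libmdbx defaults): a freed dirty page is kept on the loose list only while the list is shorter than dp_loose_limit and, with refunding enabled, the page is not within dp_loose_limit pages of the unallocated tail, since such pages are better candidates for refund/compactification.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t pgno_t;

static bool suitable4loose_sketch(size_t loose_count, size_t dp_loose_limit,
                                  pgno_t first_unallocated, pgno_t pgno) {
  return loose_count < dp_loose_limit &&
         (first_unallocated > pgno + dp_loose_limit ||
          first_unallocated <= dp_loose_limit);
}

int main(void) {
  /* DB tail at page 1000: page 10 may go loose, page 990 may not. */
  printf("%d %d\n", suitable4loose_sketch(5, 64, 1000, 10),
         suitable4loose_sketch(5, 64, 1000, 990)); /* prints: 1 0 */
  return 0;
}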
|
||||
|
||||
/* Retire, loosen or free a single page.
|
||||
*
|
||||
* For dirty pages, saves single pages to a list for future reuse in this same
|
||||
* txn. It has been pulled from the GC and already resides on the dirty list,
|
||||
* but has been deleted. Use these pages first before pulling again from the GC.
|
||||
*
|
||||
* If the page wasn't dirtied in this txn, just add it
|
||||
* to this txn's free list. */
|
||||
int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
|
||||
page_t *mp /* maybe null */,
|
||||
unsigned pageflags /* maybe unknown/zero */) {
|
||||
int rc;
|
||||
MDBX_txn *const txn = mc->txn;
|
||||
tASSERT(txn, !mp || (mp->pgno == pgno && mp->flags == pageflags));
|
||||
|
||||
/* During deleting entire subtrees, it is reasonable and possible to avoid
|
||||
* reading leaf pages, i.e. significantly reduce hard page-faults & IOPs:
|
||||
* - mp is null, i.e. the page has not yet been read;
|
||||
* - pagetype is known and the P_LEAF bit is set;
|
||||
* - we can determine the page status via scanning the lists
|
||||
* of dirty and spilled pages.
|
||||
*
|
||||
* On the other hand, this could be suboptimal for WRITEMAP mode, since it
* requires maintaining the list of dirty pages and avoiding explicit spilling.
* So for flexibility, and to avoid extra internal dependencies, we just
* fall back to reading if the dirty list was not allocated yet. */
|
||||
size_t di = 0, si = 0, npages = 1;
|
||||
enum page_status {
|
||||
unknown,
|
||||
frozen,
|
||||
spilled,
|
||||
shadowed,
|
||||
modifable
|
||||
} status = unknown;
|
||||
|
||||
if (unlikely(!mp)) {
|
||||
if (ASSERT_ENABLED() && pageflags) {
|
||||
pgr_t check;
|
||||
check = page_get_any(mc, pgno, txn->front_txnid);
|
||||
if (unlikely(check.err != MDBX_SUCCESS))
|
||||
return check.err;
|
||||
tASSERT(txn, ((unsigned)check.page->flags & ~P_SPILLED) ==
|
||||
(pageflags & ~P_FROZEN));
|
||||
tASSERT(txn, !(pageflags & P_FROZEN) || is_frozen(txn, check.page));
|
||||
}
|
||||
if (pageflags & P_FROZEN) {
|
||||
status = frozen;
|
||||
if (ASSERT_ENABLED()) {
|
||||
for (MDBX_txn *scan = txn; scan; scan = scan->parent) {
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(scan, pgno));
|
||||
tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
|
||||
}
|
||||
}
|
||||
goto status_done;
|
||||
} else if (pageflags && txn->tw.dirtylist) {
|
||||
if ((di = dpl_exist(txn, pgno)) != 0) {
|
||||
mp = txn->tw.dirtylist->items[di].ptr;
|
||||
tASSERT(txn, is_modifable(txn, mp));
|
||||
status = modifable;
|
||||
goto status_done;
|
||||
}
|
||||
if ((si = spill_search(txn, pgno)) != 0) {
|
||||
status = spilled;
|
||||
goto status_done;
|
||||
}
|
||||
for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
|
||||
if (dpl_exist(parent, pgno)) {
|
||||
status = shadowed;
|
||||
goto status_done;
|
||||
}
|
||||
if (spill_search(parent, pgno)) {
|
||||
status = spilled;
|
||||
goto status_done;
|
||||
}
|
||||
}
|
||||
status = frozen;
|
||||
goto status_done;
|
||||
}
|
||||
|
||||
pgr_t pg = page_get_any(mc, pgno, txn->front_txnid);
|
||||
if (unlikely(pg.err != MDBX_SUCCESS))
|
||||
return pg.err;
|
||||
mp = pg.page;
|
||||
tASSERT(txn, !pageflags || mp->flags == pageflags);
|
||||
pageflags = mp->flags;
|
||||
}
|
||||
|
||||
if (is_frozen(txn, mp)) {
|
||||
status = frozen;
|
||||
tASSERT(txn, !is_modifable(txn, mp));
|
||||
tASSERT(txn, !is_spilled(txn, mp));
|
||||
tASSERT(txn, !is_shadowed(txn, mp));
|
||||
tASSERT(txn, !debug_dpl_find(txn, pgno));
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
|
||||
} else if (is_modifable(txn, mp)) {
|
||||
status = modifable;
|
||||
if (txn->tw.dirtylist)
|
||||
di = dpl_exist(txn, pgno);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) || !is_spilled(txn, mp));
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
|
||||
} else if (is_shadowed(txn, mp)) {
|
||||
status = shadowed;
|
||||
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
|
||||
tASSERT(txn, !debug_dpl_find(txn, pgno));
|
||||
} else {
|
||||
tASSERT(txn, is_spilled(txn, mp));
|
||||
status = spilled;
|
||||
si = spill_search(txn, pgno);
|
||||
tASSERT(txn, !debug_dpl_find(txn, pgno));
|
||||
}
|
||||
|
||||
status_done:
|
||||
if (likely((pageflags & P_LARGE) == 0)) {
|
||||
STATIC_ASSERT(P_BRANCH == 1);
|
||||
const bool is_branch = pageflags & P_BRANCH;
|
||||
cASSERT(mc, ((pageflags & P_LEAF) == 0) == is_branch);
|
||||
if (unlikely(mc->flags & z_inner)) {
|
||||
tree_t *outer = outer_tree(mc);
|
||||
cASSERT(mc, !is_branch || outer->branch_pages > 0);
|
||||
outer->branch_pages -= is_branch;
|
||||
cASSERT(mc, is_branch || outer->leaf_pages > 0);
|
||||
outer->leaf_pages -= 1 - is_branch;
|
||||
}
|
||||
cASSERT(mc, !is_branch || mc->tree->branch_pages > 0);
|
||||
mc->tree->branch_pages -= is_branch;
|
||||
cASSERT(mc, is_branch || mc->tree->leaf_pages > 0);
|
||||
mc->tree->leaf_pages -= 1 - is_branch;
|
||||
} else {
|
||||
npages = mp->pages;
|
||||
cASSERT(mc, mc->tree->large_pages >= npages);
|
||||
mc->tree->large_pages -= (pgno_t)npages;
|
||||
}
|
||||
|
||||
if (status == frozen) {
|
||||
retire:
|
||||
DEBUG("retire %zu page %" PRIaPGNO, npages, pgno);
|
||||
rc = pnl_append_span(&txn->tw.retired_pages, pgno, npages);
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Return pages to the unallocated "tail" of the DB.
* The page content is not destroyed, and for nested transactions the boundary
* of the unallocated "tail" of the DB is moved only when they commit. */
|
||||
if (MDBX_ENABLE_REFUND &&
|
||||
unlikely(pgno + npages == txn->geo.first_unallocated)) {
|
||||
const char *kind = nullptr;
|
||||
if (status == modifable) {
|
||||
/* The page is dirtied in this transaction, but earlier it could have been
* allocated, dirtied and spilled in one of the parent transactions.
* It CAN be pushed out into the unallocated tail. */
|
||||
kind = "dirty";
|
||||
/* Remove from dirty list */
|
||||
page_wash(txn, di, mp, npages);
|
||||
} else if (si) {
|
||||
/* The page is spilled in this transaction, i.e. it is allocated
* and dirtied in this or one of the parent transactions.
* It CAN be pushed out into the unallocated tail. */
|
||||
kind = "spilled";
|
||||
tASSERT(txn, status == spilled);
|
||||
spill_remove(txn, si, npages);
|
||||
} else {
|
||||
/* The page is allocated, dirtied and possibly spilled in one
* of the parent transactions.
* It CAN be pushed out into the unallocated tail. */
|
||||
kind = "parent's";
|
||||
if (ASSERT_ENABLED() && mp) {
|
||||
kind = nullptr;
|
||||
for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
|
||||
if (spill_search(parent, pgno)) {
|
||||
kind = "parent-spilled";
|
||||
tASSERT(txn, status == spilled);
|
||||
break;
|
||||
}
|
||||
if (mp == debug_dpl_find(parent, pgno)) {
|
||||
kind = "parent-dirty";
|
||||
tASSERT(txn, status == shadowed);
|
||||
break;
|
||||
}
|
||||
}
|
||||
tASSERT(txn, kind != nullptr);
|
||||
}
|
||||
tASSERT(txn, status == spilled || status == shadowed);
|
||||
}
|
||||
DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno);
|
||||
txn->geo.first_unallocated = pgno;
|
||||
txn_refund(txn);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
if (status == modifable) {
|
||||
/* Dirty page from this transaction */
|
||||
/* If suitable we can reuse it through loose list */
|
||||
if (likely(npages == 1 && suitable4loose(txn, pgno)) &&
|
||||
(di || !txn->tw.dirtylist)) {
|
||||
DEBUG("loosen dirty page %" PRIaPGNO, pgno);
|
||||
if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
|
||||
memset(page_data(mp), -1, txn->env->ps - PAGEHDRSZ);
|
||||
mp->txnid = INVALID_TXNID;
|
||||
mp->flags = P_LOOSE;
|
||||
page_next(mp) = txn->tw.loose_pages;
|
||||
txn->tw.loose_pages = mp;
|
||||
txn->tw.loose_count++;
|
||||
#if MDBX_ENABLE_REFUND
|
||||
txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
|
||||
? pgno + 2
|
||||
: txn->tw.loose_refund_wl;
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
||||
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->env->ps - PAGEHDRSZ);
|
||||
MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), txn->env->ps - PAGEHDRSZ);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
|
||||
if (unlikely(txn->env->flags & MDBX_PAGEPERTURB))
|
||||
#endif
|
||||
{
|
||||
/* The page could have been modified in one of the parent transactions,
* including being spilled out later and then loaded and modified again.
* In both cases it must not be wiped on disk nor marked as inaccessible
* in asan and/or valgrind. */
|
||||
for (MDBX_txn *parent = txn->parent;
|
||||
parent && (parent->flags & MDBX_TXN_SPILLS);
|
||||
parent = parent->parent) {
|
||||
if (spill_intersect(parent, pgno, npages))
|
||||
goto skip_invalidate;
|
||||
if (dpl_intersect(parent, pgno, npages))
|
||||
goto skip_invalidate;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)
|
||||
if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
|
||||
#endif
|
||||
page_kill(txn, mp, pgno, npages);
|
||||
if ((txn->flags & MDBX_WRITEMAP) == 0) {
|
||||
VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->env, pgno)),
|
||||
pgno2bytes(txn->env, npages) - PAGEHDRSZ);
|
||||
MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->env, pgno)),
|
||||
pgno2bytes(txn->env, npages) -
|
||||
PAGEHDRSZ);
|
||||
}
|
||||
}
|
||||
skip_invalidate:
|
||||
|
||||
/* wash dirty page */
|
||||
page_wash(txn, di, mp, npages);
|
||||
|
||||
reclaim:
|
||||
DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno);
|
||||
rc = pnl_insert_span(&txn->tw.relist, pgno, npages);
|
||||
tASSERT(txn,
|
||||
pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
|
||||
MDBX_ENABLE_REFUND));
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (si) {
|
||||
/* Page was spilled in this txn */
|
||||
spill_remove(txn, si, npages);
|
||||
/* The page could have been allocated and then spilled in this transaction,
* in which case it must be placed into the reclaimed list.
* Or it could have been allocated in one of the parent transactions and then
* spilled in this one, in which case it must be placed into the retired list
* for subsequent filtering at commit time. */
|
||||
for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
|
||||
if (dpl_exist(parent, pgno))
|
||||
goto retire;
|
||||
}
|
||||
/* The page was definitely allocated in this transaction
* and now can be reused. */
|
||||
goto reclaim;
|
||||
}
|
||||
|
||||
if (status == shadowed) {
|
||||
/* Dirty page MUST BE a clone from (one of) parent transaction(s). */
|
||||
if (ASSERT_ENABLED()) {
|
||||
const page_t *parent_dp = nullptr;
|
||||
/* Check parent(s)'s dirty lists. */
|
||||
for (MDBX_txn *parent = txn->parent; parent && !parent_dp;
|
||||
parent = parent->parent) {
|
||||
tASSERT(txn, !spill_search(parent, pgno));
|
||||
parent_dp = debug_dpl_find(parent, pgno);
|
||||
}
|
||||
tASSERT(txn, parent_dp && (!mp || parent_dp == mp));
|
||||
}
|
||||
/* The page was allocated in a parent transaction and now can be reused,
* but only within this transaction or its children. */
|
||||
goto reclaim;
|
||||
}
|
||||
|
||||
/* The page may be part of an MVCC snapshot visible to readers, or it could
* have been allocated and then spilled in one of the parent transactions.
* So for now we place it into the retired list, which will be filtered
* against the dirty- and spilled-lists of the parent transactions when the
* child transactions commit, or else will be written to the GC unchanged. */
|
||||
goto retire;
|
||||
}
|
||||
|
||||
__hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp,
|
||||
size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
mp->txnid = txn->front_txnid;
|
||||
if (!txn->tw.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
txn->tw.writemap_dirty_npages += npages;
|
||||
tASSERT(txn, txn->tw.spilled.list == nullptr);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
|
||||
#if xMDBX_DEBUG_SPILLING == 2
|
||||
txn->env->debug_dirtied_act += 1;
|
||||
ENSURE(txn->env, txn->env->debug_dirtied_act < txn->env->debug_dirtied_est);
|
||||
ENSURE(txn->env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
|
||||
#endif /* xMDBX_DEBUG_SPILLING == 2 */
|
||||
|
||||
int rc;
|
||||
if (unlikely(txn->tw.dirtyroom == 0)) {
|
||||
if (txn->tw.loose_count) {
|
||||
page_t *lp = txn->tw.loose_pages;
|
||||
DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->pgno);
|
||||
rc = pnl_insert_span(&txn->tw.relist, lp->pgno, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
size_t di = dpl_search(txn, lp->pgno);
|
||||
tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp);
|
||||
dpl_remove(txn, di);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
txn->tw.loose_pages = page_next(lp);
|
||||
txn->tw.loose_count--;
|
||||
txn->tw.dirtyroom++;
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
|
||||
page_shadow_release(txn->env, lp, 1);
|
||||
} else {
|
||||
ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length);
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
|
||||
page_shadow_release(txn->env, mp, npages);
|
||||
return MDBX_TXN_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
rc = dpl_append(txn, mp->pgno, mp, npages);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
bailout:
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return rc;
|
||||
}
|
||||
txn->tw.dirtyroom--;
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
size_t page_subleaf2_reserve(const MDBX_env *const env, size_t host_page_room,
|
||||
size_t subpage_len, size_t item_len) {
|
||||
eASSERT(env, (subpage_len & 1) == 0);
|
||||
eASSERT(env, env->subpage_reserve_prereq > env->subpage_room_threshold +
|
||||
env->subpage_reserve_limit &&
|
||||
env->leaf_nodemax >= env->subpage_limit + NODESIZE);
|
||||
size_t reserve = 0;
|
||||
for (size_t n = 0;
|
||||
n < 5 && reserve + item_len <= env->subpage_reserve_limit &&
|
||||
EVEN_CEIL(subpage_len + item_len) <= env->subpage_limit &&
|
||||
host_page_room >=
|
||||
env->subpage_reserve_prereq + EVEN_CEIL(subpage_len + item_len);
|
||||
++n) {
|
||||
subpage_len += item_len;
|
||||
reserve += item_len;
|
||||
}
|
||||
return reserve + (subpage_len & 1);
|
||||
}
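
A standalone walk-through of the reserve loop in page_subleaf2_reserve() above, using purely illustrative parameter values (they are not libmdbx defaults): the loop grows the reserve by item_len at most five times while the reserve limit, the subpage size limit and the host-page room requirement all keep holding, then pads the result so the subpage length stays even.

#include <stdio.h>

#define EVEN_CEIL(n) (((n) + 1u) & ~(size_t)1)

static size_t subleaf2_reserve_sketch(size_t host_page_room, size_t subpage_len,
                                      size_t item_len, size_t reserve_limit,
                                      size_t subpage_limit, size_t prereq) {
  size_t reserve = 0;
  for (size_t n = 0;
       n < 5 && reserve + item_len <= reserve_limit &&
       EVEN_CEIL(subpage_len + item_len) <= subpage_limit &&
       host_page_room >= prereq + EVEN_CEIL(subpage_len + item_len);
       ++n) {
    subpage_len += item_len;
    reserve += item_len;
  }
  return reserve + (subpage_len & 1);
}

int main(void) {
  /* illustrative values only: room 3000, subpage 100, items of 8 bytes */
  printf("%zu\n", subleaf2_reserve_sketch(3000, 100, 8, 64, 1000, 2000));
  return 0; /* prints 40: five items of 8 bytes fit, no odd-length padding */
}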
|
179
src/page-ops.h
Normal file
@ -0,0 +1,179 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
MDBX_INTERNAL int __must_check_result tree_search_finalize(MDBX_cursor *mc,
|
||||
const MDBX_val *key,
|
||||
int flags);
|
||||
MDBX_INTERNAL int tree_search_lowest(MDBX_cursor *mc);
|
||||
|
||||
enum page_search_flags {
|
||||
Z_MODIFY = 1,
|
||||
Z_ROOTONLY = 2,
|
||||
Z_FIRST = 4,
|
||||
Z_LAST = 8,
|
||||
};
|
||||
MDBX_INTERNAL int __must_check_result tree_search(MDBX_cursor *mc,
|
||||
const MDBX_val *key,
|
||||
int flags);
|
||||
|
||||
#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */
|
||||
MDBX_INTERNAL int __must_check_result page_split(MDBX_cursor *mc,
|
||||
const MDBX_val *const newkey,
|
||||
MDBX_val *const newdata,
|
||||
pgno_t newpgno,
|
||||
const unsigned naf);
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL int MDBX_PRINTF_ARGS(2, 3)
|
||||
bad_page(const page_t *mp, const char *fmt, ...);
|
||||
|
||||
MDBX_INTERNAL void MDBX_PRINTF_ARGS(2, 3)
|
||||
poor_page(const page_t *mp, const char *fmt, ...);
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_frozen(const MDBX_txn *txn,
|
||||
const page_t *mp) {
|
||||
return mp->txnid < txn->txnid;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_spilled(const MDBX_txn *txn,
|
||||
const page_t *mp) {
|
||||
return mp->txnid == txn->txnid;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_shadowed(const MDBX_txn *txn,
|
||||
const page_t *mp) {
|
||||
return mp->txnid > txn->txnid;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
|
||||
is_correct(const MDBX_txn *txn, const page_t *mp) {
|
||||
return mp->txnid <= txn->front_txnid;
|
||||
}
|
||||
|
||||
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_modifable(const MDBX_txn *txn,
|
||||
const page_t *mp) {
|
||||
return mp->txnid == txn->front_txnid;
|
||||
}
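
A standalone illustration (not libmdbx code) of the raw txnid comparisons behind the predicates above: a page's header txnid is compared against the transaction's txnid and against front_txnid, the value that freshly dirtied pages receive in this transaction. The predicates are not mutually exclusive on their own; callers such as page_retire_ex() test them in a particular order and additionally consult the dirty- and spill-lists to disambiguate.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t txnid_t;

static void show(txnid_t page_txnid, txnid_t txnid, txnid_t front_txnid) {
  printf("frozen=%d spilled=%d shadowed=%d modifiable=%d\n",
         page_txnid < txnid, page_txnid == txnid, page_txnid > txnid,
         page_txnid == front_txnid);
}

int main(void) {
  show(41, 42, 42); /* page from an older committed snapshot: frozen */
  show(42, 42, 42); /* page carrying this transaction's txnid */
  show(43, 42, 43); /* page dirtied via the front txnid of a nested chain */
  return 0;
}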
|
||||
|
||||
MDBX_INTERNAL int __must_check_result page_check(const MDBX_cursor *const mc,
|
||||
const page_t *const mp);
|
||||
|
||||
MDBX_INTERNAL pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno,
|
||||
const txnid_t front);
|
||||
|
||||
MDBX_INTERNAL pgr_t page_get_three(const MDBX_cursor *const mc,
|
||||
const pgno_t pgno, const txnid_t front);
|
||||
|
||||
MDBX_INTERNAL pgr_t page_get_large(const MDBX_cursor *const mc,
|
||||
const pgno_t pgno, const txnid_t front);
|
||||
|
||||
static inline int __must_check_result page_get(const MDBX_cursor *mc,
|
||||
const pgno_t pgno, page_t **mp,
|
||||
const txnid_t front) {
|
||||
pgr_t ret = page_get_three(mc, pgno, front);
|
||||
*mp = ret.page;
|
||||
return ret.err;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
MDBX_INTERNAL int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp,
|
||||
size_t npages);
|
||||
MDBX_INTERNAL pgr_t page_new(MDBX_cursor *mc, const unsigned flags);
|
||||
MDBX_INTERNAL pgr_t page_new_large(MDBX_cursor *mc, const size_t npages);
|
||||
MDBX_INTERNAL int page_touch_modifable(MDBX_txn *txn, const page_t *const mp);
|
||||
MDBX_INTERNAL int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc,
|
||||
const page_t *const mp);
|
||||
|
||||
static inline int page_touch(MDBX_cursor *mc) {
|
||||
page_t *const mp = mc->pg[mc->top];
|
||||
MDBX_txn *txn = mc->txn;
|
||||
|
||||
tASSERT(txn, mc->txn->flags & MDBX_TXN_DIRTY);
|
||||
tASSERT(txn,
|
||||
F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY));
|
||||
tASSERT(txn, !is_largepage(mp));
|
||||
if (ASSERT_ENABLED()) {
|
||||
if (mc->flags & z_inner) {
|
||||
subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree);
|
||||
cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner);
|
||||
tASSERT(txn, mc->tree == &couple->outer.subcur->nested_tree);
|
||||
tASSERT(txn, &mc->clc->k == &couple->outer.clc->v);
|
||||
tASSERT(txn, *couple->outer.dbi_state & DBI_DIRTY);
|
||||
}
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
}
|
||||
|
||||
if (is_modifable(txn, mp)) {
|
||||
if (!txn->tw.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
return is_subpage(mp) ? MDBX_SUCCESS : page_touch_modifable(txn, mp);
|
||||
}
|
||||
return page_touch_unmodifable(txn, mc, mp);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void page_copy(page_t *const dst, const page_t *const src,
|
||||
const size_t size);
|
||||
MDBX_INTERNAL pgr_t __must_check_result page_unspill(MDBX_txn *const txn,
|
||||
const page_t *const mp);
|
||||
|
||||
MDBX_INTERNAL page_t *page_shadow_alloc(MDBX_txn *txn, size_t num);
|
||||
|
||||
MDBX_INTERNAL void page_shadow_release(MDBX_env *env, page_t *dp,
|
||||
size_t npages);
|
||||
|
||||
MDBX_INTERNAL int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
|
||||
page_t *mp /* maybe null */,
|
||||
unsigned pageflags /* maybe unknown/zero */);
|
||||
|
||||
static inline int page_retire(MDBX_cursor *mc, page_t *mp) {
|
||||
return page_retire_ex(mc, mp->pgno, mp, mp->flags);
|
||||
}
|
||||
|
||||
static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp,
|
||||
const size_t npages) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
mp->txnid = INVALID_TXNID;
|
||||
mp->flags = P_BAD;
|
||||
|
||||
if (txn->tw.dirtylist) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
tASSERT(txn,
|
||||
MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp));
|
||||
if (!MDBX_AVOID_MSYNC || di) {
|
||||
dpl_remove_ex(txn, di, npages);
|
||||
txn->tw.dirtyroom++;
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom
|
||||
: txn->env->options.dp_limit));
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) {
|
||||
page_shadow_release(txn->env, mp, npages);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di);
|
||||
txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages)
|
||||
? npages
|
||||
: txn->tw.writemap_dirty_npages;
|
||||
}
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
|
||||
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
|
||||
pgno2bytes(txn->env, npages) - PAGEHDRSZ);
|
||||
MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
|
||||
pgno2bytes(txn->env, npages) - PAGEHDRSZ);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL size_t page_subleaf2_reserve(const MDBX_env *const env,
|
||||
size_t host_page_room,
|
||||
size_t subpage_len, size_t item_len);
|
||||
|
||||
#define page_next(mp) \
|
||||
(*(page_t **)ptr_disp((mp)->entries, sizeof(void *) - sizeof(uint32_t)))
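
A standalone sketch of the intrusive chaining that the page_next() macro above enables: a freed page's data area is reused to hold the next-pointer, so loose pages (and the shadow reserve) form singly-linked lists without any extra allocation. The struct below is a simplified stand-in for page_t, and the offset adjustment done by the real macro is omitted for clarity.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct demo_page {
  uint64_t header;  /* stands in for the real page header */
  char entries[64]; /* page payload; its first word is reused as a link */
} demo_page_t;

#define DEMO_PAGE_NEXT(mp) (*(demo_page_t **)((mp)->entries))

int main(void) {
  demo_page_t a = {0}, b = {0};
  demo_page_t *loose_head = NULL;
  /* push a, then b, onto the loose list */
  DEMO_PAGE_NEXT(&a) = loose_head; loose_head = &a;
  DEMO_PAGE_NEXT(&b) = loose_head; loose_head = &b;
  for (demo_page_t *p = loose_head; p; p = DEMO_PAGE_NEXT(p))
    printf("loose page at %p\n", (void *)p);
  return 0;
}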
|
147
src/page-search.c
Normal file
@ -0,0 +1,147 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanations of the license
/// change, credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/* Search for the lowest key under the current branch page.
|
||||
* This just bypasses a numkeys check in the current page
|
||||
* before calling tree_search_finalize(), because the callers
|
||||
* are all in situations where the current page is known to
|
||||
* be underfilled. */
|
||||
__hot int tree_search_lowest(MDBX_cursor *mc) {
|
||||
cASSERT(mc, mc->top >= 0);
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
cASSERT(mc, is_branch(mp));
|
||||
|
||||
node_t *node = page_node(mp, 0);
|
||||
int err = page_get(mc, node_pgno(node), &mp, mp->txnid);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
|
||||
mc->ki[mc->top] = 0;
|
||||
err = cursor_push(mc, mp, 0);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
return tree_search_finalize(mc, nullptr, Z_FIRST);
|
||||
}
|
||||
|
||||
__hot int tree_search(MDBX_cursor *mc, const MDBX_val *key, int flags) {
|
||||
int err;
|
||||
if (unlikely(mc->txn->flags & MDBX_TXN_BLOCKED)) {
|
||||
DEBUG("%s", "transaction has failed, must abort");
|
||||
err = MDBX_BAD_TXN;
|
||||
bailout:
|
||||
be_poor(mc);
|
||||
return err;
|
||||
}
|
||||
|
||||
const size_t dbi = cursor_dbi(mc);
|
||||
if (unlikely(*cursor_dbi_state(mc) & DBI_STALE)) {
|
||||
err = sdb_fetch(mc->txn, dbi);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
const pgno_t root = mc->tree->root;
|
||||
if (unlikely(root == P_INVALID)) {
|
||||
DEBUG("%s", "tree is empty");
|
||||
cASSERT(mc, is_poor(mc));
|
||||
return MDBX_NOTFOUND;
|
||||
}
|
||||
|
||||
cASSERT(mc, root >= NUM_METAS && root < mc->txn->geo.first_unallocated);
|
||||
if (mc->top < 0 || mc->pg[0]->pgno != root) {
|
||||
txnid_t pp_txnid = mc->tree->mod_txnid;
|
||||
pp_txnid = /* tree->mod_txnid may be zero in a legacy DB */ pp_txnid
|
||||
? pp_txnid
|
||||
: mc->txn->txnid;
|
||||
if ((mc->txn->flags & MDBX_TXN_RDONLY) == 0) {
|
||||
MDBX_txn *scan = mc->txn;
|
||||
do
|
||||
if ((scan->flags & MDBX_TXN_DIRTY) &&
|
||||
(dbi == MAIN_DBI || (scan->dbi_state[dbi] & DBI_DIRTY))) {
|
||||
/* After committing nested transactions, mod_txnid may be > front */
|
||||
pp_txnid = scan->front_txnid;
|
||||
break;
|
||||
}
|
||||
while (unlikely((scan = scan->parent) != nullptr));
|
||||
}
|
||||
err = page_get(mc, root, &mc->pg[0], pp_txnid);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
mc->top = 0;
|
||||
mc->ki[0] = (flags & Z_LAST) ? page_numkeys(mc->pg[0]) - 1 : 0;
|
||||
DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", cursor_dbi_dbg(mc),
|
||||
root, mc->pg[0]->flags);
|
||||
|
||||
if (flags & Z_MODIFY) {
|
||||
err = page_touch(mc);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
if (flags & Z_ROOTONLY)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
return tree_search_finalize(mc, key, flags);
|
||||
}
|
||||
|
||||
__hot __noinline int tree_search_finalize(MDBX_cursor *mc, const MDBX_val *key,
|
||||
int flags) {
|
||||
cASSERT(mc, !is_poor(mc));
|
||||
DKBUF_DEBUG;
|
||||
int err;
|
||||
page_t *mp = mc->pg[mc->top];
|
||||
intptr_t ki = (flags & Z_FIRST) ? 0 : page_numkeys(mp) - 1;
|
||||
while (is_branch(mp)) {
|
||||
DEBUG("branch page %" PRIaPGNO " has %zu keys", mp->pgno, page_numkeys(mp));
|
||||
cASSERT(mc, page_numkeys(mp) > 1);
|
||||
DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0)));
|
||||
|
||||
if ((flags & (Z_FIRST | Z_LAST)) == 0) {
|
||||
const struct node_search_result nsr = node_search(mc, key);
|
||||
if (likely(nsr.node))
|
||||
ki = mc->ki[mc->top] + (intptr_t)nsr.exact - 1;
|
||||
DEBUG("following index %zu for key [%s]", ki, DKEY_DEBUG(key));
|
||||
}
|
||||
|
||||
err = page_get(mc, node_pgno(page_node(mp, ki)), &mp, mp->txnid);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
mc->ki[mc->top] = (indx_t)ki;
|
||||
ki = (flags & Z_FIRST) ? 0 : page_numkeys(mp) - 1;
|
||||
err = cursor_push(mc, mp, ki);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
if (flags & Z_MODIFY) {
|
||||
err = page_touch(mc);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
mp = mc->pg[mc->top];
|
||||
}
|
||||
}
|
||||
|
||||
if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) {
|
||||
ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
|
||||
mp->pgno, mp->flags);
|
||||
err = MDBX_CORRUPTED;
|
||||
bailout:
|
||||
be_poor(mc);
|
||||
return err;
|
||||
}
|
||||
|
||||
DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->pgno,
|
||||
DKEY_DEBUG(key));
|
||||
  /* Logically correct, but (as currently understood) not necessary.
     However, it is worth additional checking/testing.
     There may be a scenario in which clearing the flags is still required.

     be_filled(mc); */
|
||||
return MDBX_SUCCESS;
|
||||
}
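
A compact standalone sketch (not libmdbx code) of the descent pattern that tree_search()/tree_search_finalize() above implement: starting from the root, keep choosing a child while the current node is a branch, taking the leftmost child for a first-key search, the rightmost for a last-key search, or the child whose separator covers the requested key, until a leaf is reached. The in-memory node model below is purely illustrative.

#include <stdio.h>

enum { DEMO_FIRST = 1, DEMO_LAST = 2 };

typedef struct demo_node {
  int is_branch;
  int nkeys;
  int keys[4];                /* separator keys (branch) or leaf keys */
  struct demo_node *child[4]; /* children, used only for branches */
} demo_node_t;

static demo_node_t *descend(demo_node_t *node, int key, int flags) {
  while (node->is_branch) {
    int ki = 0;
    if (flags & DEMO_LAST)
      ki = node->nkeys - 1;
    else if (!(flags & DEMO_FIRST))
      while (ki + 1 < node->nkeys && key >= node->keys[ki + 1])
        ++ki; /* follow the child whose separator covers `key` */
    node = node->child[ki];
  }
  return node; /* a leaf */
}

int main(void) {
  demo_node_t leaf1 = {0, 2, {1, 5}, {0}};
  demo_node_t leaf2 = {0, 2, {10, 20}, {0}};
  demo_node_t root = {1, 2, {1, 10}, {&leaf1, &leaf2}};
  printf("%d\n", descend(&root, 15, 0)->keys[0]); /* prints 10 */
  return 0;
}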
|
254
src/pnl.c
Normal file
@ -0,0 +1,254 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
MDBX_INTERNAL pnl_t pnl_alloc(size_t size) {
|
||||
size_t bytes = pnl_size2bytes(size);
|
||||
pnl_t pnl = osal_malloc(bytes);
|
||||
if (likely(pnl)) {
|
||||
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
|
||||
bytes = malloc_usable_size(pnl);
|
||||
#endif /* malloc_usable_size */
|
||||
pnl[0] = pnl_bytes2size(bytes);
|
||||
assert(pnl[0] >= size);
|
||||
pnl += 1;
|
||||
*pnl = 0;
|
||||
}
|
||||
return pnl;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void pnl_free(pnl_t pnl) {
|
||||
if (likely(pnl))
|
||||
osal_free(pnl - 1);
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void pnl_shrink(pnl_t __restrict *__restrict ppnl) {
|
||||
assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
|
||||
pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) <
|
||||
MDBX_PNL_INITIAL * 3 / 2);
|
||||
assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT &&
|
||||
MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl));
|
||||
MDBX_PNL_SETSIZE(*ppnl, 0);
|
||||
if (unlikely(MDBX_PNL_ALLOCLEN(*ppnl) >
|
||||
MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 8 : 4) -
|
||||
MDBX_CACHELINE_SIZE / sizeof(pgno_t))) {
|
||||
size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2);
|
||||
pnl_t pnl = osal_realloc(*ppnl - 1, bytes);
|
||||
if (likely(pnl)) {
|
||||
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
|
||||
bytes = malloc_usable_size(pnl);
|
||||
#endif /* malloc_usable_size */
|
||||
*pnl = pnl_bytes2size(bytes);
|
||||
*ppnl = pnl + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl,
|
||||
const size_t wanna) {
|
||||
const size_t allocated = MDBX_PNL_ALLOCLEN(*ppnl);
|
||||
assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT &&
|
||||
MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl));
|
||||
if (likely(allocated >= wanna))
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
if (unlikely(wanna > /* paranoia */ PAGELIST_LIMIT)) {
|
||||
ERROR("PNL too long (%zu > %zu)", wanna, (size_t)PAGELIST_LIMIT);
|
||||
return MDBX_TXN_FULL;
|
||||
}
|
||||
|
||||
const size_t size = (wanna + wanna - allocated < PAGELIST_LIMIT)
|
||||
? wanna + wanna - allocated
|
||||
: PAGELIST_LIMIT;
|
||||
size_t bytes = pnl_size2bytes(size);
|
||||
pnl_t pnl = osal_realloc(*ppnl - 1, bytes);
|
||||
if (likely(pnl)) {
|
||||
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
|
||||
bytes = malloc_usable_size(pnl);
|
||||
#endif /* malloc_usable_size */
|
||||
*pnl = pnl_bytes2size(bytes);
|
||||
assert(*pnl >= wanna);
|
||||
*ppnl = pnl + 1;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
return MDBX_ENOMEM;
|
||||
}
|
||||
|
||||
static __always_inline int __must_check_result pnl_append_stepped(
|
||||
unsigned step, __restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
|
||||
assert(n > 0);
|
||||
int rc = pnl_need(ppnl, n);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
const pnl_t pnl = *ppnl;
|
||||
if (likely(n == 1)) {
|
||||
pnl_append_prereserved(pnl, pgno);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
#if MDBX_PNL_ASCENDING
|
||||
size_t w = MDBX_PNL_GETSIZE(pnl);
|
||||
do {
|
||||
pnl[++w] = pgno;
|
||||
pgno += step;
|
||||
} while (--n);
|
||||
MDBX_PNL_SETSIZE(pnl, w);
|
||||
#else
|
||||
size_t w = MDBX_PNL_GETSIZE(pnl) + n;
|
||||
MDBX_PNL_SETSIZE(pnl, w);
|
||||
do {
|
||||
pnl[w--] = pgno;
|
||||
pgno += step;
|
||||
} while (--n);
|
||||
#endif
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot MDBX_INTERNAL int __must_check_result
|
||||
spill_append_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
|
||||
return pnl_append_stepped(2, ppnl, pgno << 1, n);
|
||||
}
|
||||
|
||||
__hot MDBX_INTERNAL int __must_check_result
|
||||
pnl_append_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
|
||||
return pnl_append_stepped(1, ppnl, pgno, n);
|
||||
}
|
||||
|
||||
__hot MDBX_INTERNAL int __must_check_result
|
||||
pnl_insert_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
|
||||
assert(n > 0);
|
||||
int rc = pnl_need(ppnl, n);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
const pnl_t pnl = *ppnl;
|
||||
size_t r = MDBX_PNL_GETSIZE(pnl), w = r + n;
|
||||
MDBX_PNL_SETSIZE(pnl, w);
|
||||
while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
|
||||
pnl[w--] = pnl[r--];
|
||||
|
||||
for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w)
|
||||
pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot __noinline MDBX_INTERNAL bool pnl_check(const const_pnl_t pnl,
|
||||
const size_t limit) {
|
||||
assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND);
|
||||
if (likely(MDBX_PNL_GETSIZE(pnl))) {
|
||||
if (unlikely(MDBX_PNL_GETSIZE(pnl) > PAGELIST_LIMIT))
|
||||
return false;
|
||||
if (unlikely(MDBX_PNL_LEAST(pnl) < MIN_PAGENO))
|
||||
return false;
|
||||
if (unlikely(MDBX_PNL_MOST(pnl) >= limit))
|
||||
return false;
|
||||
|
||||
if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) &&
|
||||
likely(MDBX_PNL_GETSIZE(pnl) > 1)) {
|
||||
const pgno_t *scan = MDBX_PNL_BEGIN(pnl);
|
||||
const pgno_t *const end = MDBX_PNL_END(pnl);
|
||||
pgno_t prev = *scan++;
|
||||
do {
|
||||
if (unlikely(!MDBX_PNL_ORDERED(prev, *scan)))
|
||||
return false;
|
||||
prev = *scan;
|
||||
} while (likely(++scan != end));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a,
|
||||
const pgno_t *__restrict src_b,
|
||||
const pgno_t *__restrict const src_b_detent) {
|
||||
do {
|
||||
#if MDBX_HAVE_CMOV
|
||||
const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a);
|
||||
#if defined(__LCC__) || __CLANG_PREREQ(13, 0)
|
||||
    // lcc 1.26: 13 wide-instructions (setup and first iteration) + 7 wide-instructions (loop), WITHOUT loop-mode
    // gcc>=7: cmp+jmp with a jump back into the loop body (WTF?)
    // gcc<=6: cmov×3
    // clang<=12: cmov×3
    // clang>=13: cmov, set+add/sub
|
||||
*dst = flag ? *src_a-- : *src_b--;
|
||||
#else
|
||||
// gcc: cmov, cmp+set+add/sub
|
||||
// clang<=5: cmov×2, set+add/sub
|
||||
// clang>=6: cmov, set+add/sub
|
||||
*dst = flag ? *src_a : *src_b;
|
||||
src_b += (ptrdiff_t)flag - 1;
|
||||
src_a -= flag;
|
||||
#endif
|
||||
--dst;
|
||||
#else /* MDBX_HAVE_CMOV */
|
||||
while (MDBX_PNL_ORDERED(*src_b, *src_a))
|
||||
*dst-- = *src_a--;
|
||||
*dst-- = *src_b--;
|
||||
#endif /* !MDBX_HAVE_CMOV */
|
||||
} while (likely(src_b > src_b_detent));
|
||||
}
|
||||
|
||||
__hot MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src) {
|
||||
assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
|
||||
assert(pnl_check(src, MAX_PAGENO + 1));
|
||||
const size_t src_len = MDBX_PNL_GETSIZE(src);
|
||||
const size_t dst_len = MDBX_PNL_GETSIZE(dst);
|
||||
size_t total = dst_len;
|
||||
assert(MDBX_PNL_ALLOCLEN(dst) >= total);
|
||||
if (likely(src_len > 0)) {
|
||||
total += src_len;
|
||||
if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12))
|
||||
goto avoid_call_libc_for_short_cases;
|
||||
if (dst_len == 0 ||
|
||||
MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src)))
|
||||
memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t));
|
||||
else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) {
|
||||
memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst),
|
||||
dst_len * sizeof(pgno_t));
|
||||
memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src),
|
||||
src_len * sizeof(pgno_t));
|
||||
} else {
|
||||
avoid_call_libc_for_short_cases:
|
||||
dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID);
|
||||
pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src);
|
||||
}
|
||||
MDBX_PNL_SETSIZE(dst, total);
|
||||
}
|
||||
assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
|
||||
return total;
|
||||
}
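
A standalone worked example (not libmdbx code) of the sentinel-based backward merge that pnl_merge()/pnl_merge_inner() above perform for descending lists: slot [0] of the destination is set to a "detent" value that always loses the comparison, so the inner loop needs no explicit bounds check for the destination run while the source run is drained. Slot [0] normally holds the length, as in a real PNL; UINT32_MAX plays the role of P_INVALID here.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t pgno_t;
#define DETENT UINT32_MAX /* stands in for P_INVALID with descending order */

static void merge_desc(pgno_t *pnl_dst, size_t dst_len, const pgno_t *pnl_src,
                       size_t src_len) {
  /* pnl_*[1..len] hold page numbers in descending order; [0] is spare here. */
  pgno_t *w = pnl_dst + dst_len + src_len; /* write position, moves downward */
  pgno_t *a = pnl_dst + dst_len;           /* tail (smallest) of the dst run */
  const pgno_t *b = pnl_src + src_len;     /* tail (smallest) of the src run */
  pnl_dst[0] = DETENT;                     /* sentinel: dst side never wins once drained */
  while (b > pnl_src) {
    if (*b > *a)      /* the dst-side element is smaller, emit it first */
      *w-- = *a--;
    else
      *w-- = *b--;
  }
  /* any leftover dst-side elements are already in their final positions */
}

int main(void) {
  pgno_t dst[1 + 3 + 2] = {0, 9, 5, 2};  /* descending run plus room for two */
  const pgno_t src[1 + 2] = {0, 7, 3};
  merge_desc(dst, 3, src, 2);
  for (size_t i = 1; i <= 5; ++i)
    printf("%u ", (unsigned)dst[i]); /* prints: 9 7 5 3 2 */
  printf("\n");
  return 0;
}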
|
||||
|
||||
#if MDBX_PNL_ASCENDING
|
||||
#define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr))
|
||||
#else
|
||||
#define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr))
|
||||
#endif
|
||||
RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY,
|
||||
MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0)
|
||||
|
||||
SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED)
|
||||
|
||||
__hot __noinline MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl) {
|
||||
if (likely(MDBX_PNL_GETSIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) ||
|
||||
unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_GETSIZE(pnl))))
|
||||
pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl));
|
||||
}
|
||||
|
||||
SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED)
|
||||
|
||||
__hot __noinline MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl,
|
||||
pgno_t pgno) {
|
||||
const pgno_t *begin = MDBX_PNL_BEGIN(pnl);
|
||||
const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_GETSIZE(pnl), pgno);
|
||||
const pgno_t *end = begin + MDBX_PNL_GETSIZE(pnl);
|
||||
assert(it >= begin && it <= end);
|
||||
if (it != begin)
|
||||
assert(MDBX_PNL_ORDERED(it[-1], pgno));
|
||||
if (it != end)
|
||||
assert(!MDBX_PNL_ORDERED(it[0], pgno));
|
||||
return it - begin + 1;
|
||||
}
|
161
src/pnl.h
Normal file
@ -0,0 +1,161 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanations of the license
/// change, credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* A PNL is a Page Number List, a sorted array of IDs.
*
* The first element of the array is a counter for how many actual page-numbers
* are in the list. By default PNLs are sorted in descending order, which allows
* cutting off the page with the lowest pgno (at the tail) by just truncating
* the list. The sort order of PNLs is controlled by the MDBX_PNL_ASCENDING
* build option. */
|
||||
typedef pgno_t *pnl_t;
|
||||
typedef const pgno_t *const_pnl_t;
|
||||
|
||||
#if MDBX_PNL_ASCENDING
|
||||
#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
|
||||
#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
|
||||
#else
|
||||
#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
|
||||
#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
|
||||
#endif
|
||||
|
||||
#define MDBX_PNL_GRANULATE_LOG2 10
|
||||
#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2)
|
||||
#define MDBX_PNL_INITIAL \
|
||||
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
|
||||
|
||||
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
|
||||
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
|
||||
#define MDBX_PNL_SETSIZE(pl, size) \
|
||||
do { \
|
||||
const size_t __size = size; \
|
||||
assert(__size < INT_MAX); \
|
||||
(pl)[0] = (pgno_t)__size; \
|
||||
} while (0)
|
||||
#define MDBX_PNL_FIRST(pl) ((pl)[1])
|
||||
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
|
||||
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
|
||||
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
|
||||
|
||||
#if MDBX_PNL_ASCENDING
|
||||
#define MDBX_PNL_EDGE(pl) ((pl) + 1)
|
||||
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
|
||||
#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
|
||||
#else
|
||||
#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl))
|
||||
#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
|
||||
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
|
||||
#endif
|
||||
|
||||
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
|
||||
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
|
||||
assert(size > 0 && size <= PAGELIST_LIMIT);
|
||||
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
|
||||
|
||||
size += size;
|
||||
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
|
||||
STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD +
|
||||
(PAGELIST_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) +
|
||||
MDBX_PNL_GRANULATE + 3) *
|
||||
sizeof(pgno_t) <
|
||||
SIZE_MAX / 4 * 3);
|
||||
size_t bytes =
|
||||
ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3),
|
||||
MDBX_PNL_GRANULATE * sizeof(pgno_t)) -
|
||||
MDBX_ASSUME_MALLOC_OVERHEAD;
|
||||
return bytes;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
|
||||
size_t size = bytes / sizeof(pgno_t);
|
||||
assert(size > 3 && size <= PAGELIST_LIMIT + /* alignment gap */ 65536);
|
||||
size -= 3;
|
||||
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
|
||||
size >>= 1;
|
||||
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
|
||||
return (pgno_t)size;
|
||||
}
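
A standalone arithmetic check (not libmdbx code) of the pnl_size2bytes()/pnl_bytes2size() round trip above, using illustrative constants: a 4-byte pgno_t, a 1024-entry granule, an assumed 32-byte malloc overhead and radix-sort preallocation enabled. The capacity reported back is always at least the requested size.

#include <stdint.h>
#include <stdio.h>

#define OVERHEAD 32u  /* assumed malloc bookkeeping overhead */
#define GRANULE 1024u /* entries per allocation granule */
#define PREALLOC 1    /* double the space for radix-sort scratch */

static size_t size2bytes(size_t size) {
  size += PREALLOC ? size : 0;
  size_t want = OVERHEAD + sizeof(uint32_t) * (size + 3);
  size_t granule_bytes = GRANULE * sizeof(uint32_t);
  size_t rounded = (want + granule_bytes - 1) / granule_bytes * granule_bytes;
  return rounded - OVERHEAD;
}

static size_t bytes2size(size_t bytes) {
  size_t size = bytes / sizeof(uint32_t) - 3;
  return PREALLOC ? size / 2 : size;
}

int main(void) {
  size_t size = 1000;
  size_t bytes = size2bytes(size);
  printf("%zu entries -> %zu bytes -> capacity %zu\n", size, bytes,
         bytes2size(bytes)); /* 1000 -> 8160 -> 1018 */
  return 0;
}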
|
||||
|
||||
MDBX_INTERNAL pnl_t pnl_alloc(size_t size);
|
||||
|
||||
MDBX_INTERNAL void pnl_free(pnl_t pnl);
|
||||
|
||||
MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl,
|
||||
const size_t wanna);
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline int __must_check_result
|
||||
pnl_need(pnl_t __restrict *__restrict ppnl, size_t num) {
|
||||
assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT &&
|
||||
MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl));
|
||||
assert(num <= PAGELIST_LIMIT);
|
||||
const size_t wanna = MDBX_PNL_GETSIZE(*ppnl) + num;
|
||||
return likely(MDBX_PNL_ALLOCLEN(*ppnl) >= wanna) ? MDBX_SUCCESS
|
||||
: pnl_reserve(ppnl, wanna);
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void
|
||||
pnl_append_prereserved(__restrict pnl_t pnl, pgno_t pgno) {
|
||||
assert(MDBX_PNL_GETSIZE(pnl) < MDBX_PNL_ALLOCLEN(pnl));
|
||||
if (AUDIT_ENABLED()) {
|
||||
for (size_t i = MDBX_PNL_GETSIZE(pnl); i > 0; --i)
|
||||
assert(pgno != pnl[i]);
|
||||
}
|
||||
*pnl += 1;
|
||||
MDBX_PNL_LAST(pnl) = pgno;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL void pnl_shrink(pnl_t __restrict *__restrict ppnl);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result spill_append_span(__restrict pnl_t *ppnl,
|
||||
pgno_t pgno, size_t n);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result pnl_append_span(__restrict pnl_t *ppnl,
|
||||
pgno_t pgno, size_t n);
|
||||
|
||||
MDBX_INTERNAL int __must_check_result pnl_insert_span(__restrict pnl_t *ppnl,
|
||||
pgno_t pgno, size_t n);
|
||||
|
||||
MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
|
||||
|
||||
MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl);
|
||||
|
||||
MDBX_INTERNAL bool pnl_check(const const_pnl_t pnl, const size_t limit);
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline bool pnl_check_allocated(const const_pnl_t pnl,
|
||||
const size_t limit) {
|
||||
return pnl == nullptr || (MDBX_PNL_ALLOCLEN(pnl) >= MDBX_PNL_GETSIZE(pnl) &&
|
||||
pnl_check(pnl, limit));
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline void pnl_sort(pnl_t pnl, size_t limit4check) {
|
||||
pnl_sort_nochk(pnl);
|
||||
assert(pnl_check(pnl, limit4check));
|
||||
(void)limit4check;
|
||||
}
|
||||
|
||||
MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno,
|
||||
size_t limit) {
|
||||
assert(pnl_check_allocated(pnl, limit));
|
||||
if (MDBX_HAVE_CMOV) {
|
||||
    /* The cmov-accelerated binary search may read (but not use) one element
     * past the end of the data; that element lies within the allocated memory
     * region but is not initialized. */
|
||||
VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t));
|
||||
}
|
||||
assert(pgno < limit);
|
||||
(void)limit;
|
||||
size_t n = pnl_search_nochk(pnl, pgno);
|
||||
if (MDBX_HAVE_CMOV) {
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t));
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src);
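The helpers above split growth from appending: pnl_need() reserves capacity up front, after which pnl_append_prereserved() runs without any allocation or error path. A minimal calling-pattern sketch (illustrative only, not part of this commit; example_collect is a made-up name):

static int example_collect(pnl_t *ppnl, const pgno_t *pages, size_t n) {
  int rc = pnl_need(ppnl, n); /* ensure room for n more entries */
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  for (size_t i = 0; i < n; ++i)
    pnl_append_prereserved(*ppnl, pages[i]); /* cannot fail or reallocate */
  pnl_sort(*ppnl, MAX_PAGENO + 1); /* keep the list ordered for pnl_search() */
  return MDBX_SUCCESS;
}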
|
@ -1,19 +1,162 @@
|
||||
/*
|
||||
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
|
||||
* and other libmdbx authors: please see AUTHORS file.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in the file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */
|
||||
#if (defined(MDBX_DEBUG) && MDBX_DEBUG > 0) || \
|
||||
(defined(MDBX_FORCE_ASSERTIONS) && MDBX_FORCE_ASSERTIONS)
|
||||
#undef NDEBUG
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
/** Disables using GNU/Linux libc extensions.
|
||||
* \ingroup build_option
|
||||
* \note This option couldn't be moved to the options.h since dependent
|
||||
* control macros/defines must be prepared before including the options.h */
|
||||
#ifndef MDBX_DISABLE_GNU_SOURCE
|
||||
#define MDBX_DISABLE_GNU_SOURCE 0
|
||||
#endif
|
||||
#if MDBX_DISABLE_GNU_SOURCE
|
||||
#undef _GNU_SOURCE
|
||||
#elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
|
||||
#define _GNU_SOURCE
|
||||
#endif /* MDBX_DISABLE_GNU_SOURCE */
|
||||
|
||||
/* Should be defined before any includes */
|
||||
#if !defined(_FILE_OFFSET_BITS) && !defined(__ANDROID_API__) && \
|
||||
!defined(ANDROID)
|
||||
#define _FILE_OFFSET_BITS 64
|
||||
#endif /* _FILE_OFFSET_BITS */
|
||||
|
||||
#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE)
|
||||
#define _DARWIN_C_SOURCE
|
||||
#endif /* _DARWIN_C_SOURCE */
|
||||
|
||||
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
|
||||
!defined(__USE_MINGW_ANSI_STDIO)
|
||||
#define __USE_MINGW_ANSI_STDIO 1
|
||||
#endif /* MinGW */
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS)
|
||||
|
||||
#ifndef _WIN32_WINNT
|
||||
#define _WIN32_WINNT 0x0601 /* Windows 7 */
|
||||
#endif /* _WIN32_WINNT */
|
||||
|
||||
#if !defined(_CRT_SECURE_NO_WARNINGS)
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif /* _CRT_SECURE_NO_WARNINGS */
|
||||
#if !defined(UNICODE)
|
||||
#define UNICODE
|
||||
#endif /* UNICODE */
|
||||
|
||||
#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \
|
||||
!defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
|
||||
#define _NO_CRT_STDIO_INLINE
|
||||
#endif /* _NO_CRT_STDIO_INLINE */
|
||||
|
||||
#elif !defined(_POSIX_C_SOURCE)
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#endif /* Windows */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif /* NOMINMAX */
|
||||
|
||||
/* Workaround for modern libstdc++ with CLANG < 4.x */
|
||||
#if defined(__SIZEOF_INT128__) && !defined(__GLIBCXX_TYPE_INT_N_0) && \
|
||||
defined(__clang__) && __clang_major__ < 4
|
||||
#define __GLIBCXX_BITSIZE_INT_N_0 128
|
||||
#define __GLIBCXX_TYPE_INT_N_0 __int128
|
||||
#endif /* Workaround for modern libstdc++ with CLANG < 4.x */
|
||||
|
||||
#ifdef _MSC_VER
|
||||
/* Workaround for MSVC's header `extern "C"` vs `std::` redefinition bug */
|
||||
#if defined(__SANITIZE_ADDRESS__) && !defined(_DISABLE_VECTOR_ANNOTATION)
|
||||
#define _DISABLE_VECTOR_ANNOTATION
|
||||
#endif /* _DISABLE_VECTOR_ANNOTATION */
|
||||
#endif /* _MSC_VER */
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#if _MSC_FULL_VER < 190024234
|
||||
/* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual
|
||||
* Studio 2015 Update 3). But you could remove this #error and try to continue
|
||||
* at your own risk. In such case please don't raise issues related ONLY to
|
||||
* old compilers.
|
||||
*
|
||||
* NOTE:
|
||||
* Unfortunately, there are several different builds of "Visual Studio" that
|
||||
* are called "Visual Studio 2015 Update 3".
|
||||
*
|
||||
* The 190024234 is used here because it is the minimal version of Visual Studio
|
||||
* that was used for building and testing libmdbx in recent years. Soon this
|
||||
* value will be increased to 19.0.24241.7, since building and testing using
|
||||
* "Visual Studio 2015" will be performed only at https://ci.appveyor.com.
|
||||
*
|
||||
* Please ask Microsoft (but not us) for information about version differences
|
||||
* and how and where you can obtain the latest "Visual Studio 2015" build
|
||||
* with all fixes.
|
||||
*/
|
||||
#error \
|
||||
"At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
|
||||
#endif
|
||||
#if _MSC_VER > 1800
|
||||
#pragma warning(disable : 4464) /* relative include path contains '..' */
|
||||
#endif
|
||||
#if _MSC_VER > 1913
|
||||
#pragma warning(disable : 5045) /* will insert Spectre mitigation... */
|
||||
#endif
|
||||
#if _MSC_VER > 1914
|
||||
#pragma warning( \
|
||||
disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \
|
||||
producing 'defined' has undefined behavior */
|
||||
#endif
|
||||
#if _MSC_VER < 1920
|
||||
/* avoid "error C2219: syntax error: type qualifier must be after '*'" */
|
||||
#define __restrict
|
||||
#endif
|
||||
#if _MSC_VER > 1930
|
||||
#pragma warning(disable : 6235) /* <expression> is always a constant */
|
||||
#pragma warning(disable : 6237) /* <expression> is never evaluated and might \
|
||||
have side effects */
|
||||
#endif
|
||||
#pragma warning(disable : 4710) /* 'xyz': function not inlined */
|
||||
#pragma warning(disable : 4711) /* function 'xyz' selected for automatic \
|
||||
inline expansion */
|
||||
#pragma warning(disable : 4201) /* nonstandard extension used: nameless \
|
||||
struct/union */
|
||||
#pragma warning(disable : 4702) /* unreachable code */
|
||||
#pragma warning(disable : 4706) /* assignment within conditional expression */
|
||||
#pragma warning(disable : 4127) /* conditional expression is constant */
|
||||
#pragma warning(disable : 4324) /* 'xyz': structure was padded due to \
|
||||
alignment specifier */
|
||||
#pragma warning(disable : 4310) /* cast truncates constant value */
|
||||
#pragma warning(disable : 4820) /* bytes padding added after data member for \
|
||||
alignment */
|
||||
#pragma warning(disable : 4548) /* expression before comma has no effect; \
|
||||
expected expression with side - effect */
|
||||
#pragma warning(disable : 4366) /* the result of the unary '&' operator may be \
|
||||
unaligned */
|
||||
#pragma warning(disable : 4200) /* nonstandard extension used: zero-sized \
|
||||
array in struct/union */
|
||||
#pragma warning(disable : 4204) /* nonstandard extension used: non-constant \
|
||||
aggregate initializer */
|
||||
#pragma warning( \
|
||||
disable : 4505) /* unreferenced local function has been removed */
|
||||
#endif /* _MSC_VER (warnings) */
|
||||
|
||||
#if defined(__GNUC__) && __GNUC__ < 9
|
||||
#pragma GCC diagnostic ignored "-Wattributes"
|
||||
#endif /* GCC < 9 */
|
||||
|
||||
#include "../mdbx.h"
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* Microsoft compiler generates a lot of warning for self includes... */
|
||||
|
||||
@ -28,20 +171,9 @@
|
||||
* not guaranteed. Specify /EHsc */
|
||||
#endif /* _MSC_VER (warnings) */
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#if !defined(_CRT_SECURE_NO_WARNINGS)
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif /* _CRT_SECURE_NO_WARNINGS */
|
||||
#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \
|
||||
!defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
|
||||
#define _NO_CRT_STDIO_INLINE
|
||||
#endif
|
||||
#elif !defined(_POSIX_C_SOURCE)
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#endif /* Windows */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* basic C99 includes */
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
@ -55,21 +187,6 @@
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
|
||||
#error \
|
||||
"Sanity checking failed: Two's complement, reasonably sized integer types"
|
||||
#endif
|
||||
|
||||
#ifndef SSIZE_MAX
|
||||
#define SSIZE_MAX INTPTR_MAX
|
||||
#endif
|
||||
|
||||
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
|
||||
#define MDBX_WORDBITS 64
|
||||
#else
|
||||
#define MDBX_WORDBITS 32
|
||||
#endif /* MDBX_WORDBITS */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* feature testing */
|
||||
|
||||
@ -81,6 +198,14 @@
|
||||
#define __has_include(x) (0)
|
||||
#endif
|
||||
|
||||
#ifndef __has_attribute
|
||||
#define __has_attribute(x) (0)
|
||||
#endif
|
||||
|
||||
#ifndef __has_cpp_attribute
|
||||
#define __has_cpp_attribute(x) 0
|
||||
#endif
|
||||
|
||||
#ifndef __has_feature
|
||||
#define __has_feature(x) (0)
|
||||
#endif
|
||||
@ -89,6 +214,10 @@
|
||||
#define __has_extension(x) (0)
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) (0)
|
||||
#endif
|
||||
|
||||
#if __has_feature(thread_sanitizer)
|
||||
#define __SANITIZE_THREAD__ 1
|
||||
#endif
|
||||
@ -124,6 +253,47 @@
|
||||
#endif
|
||||
#endif /* __GLIBC_PREREQ */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* pre-requirements */
|
||||
|
||||
#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
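/* Explanatory note (illustrative, not from the original sources): on a
 * two's-complement machine -6 is ...11111010, so (-6 & 5) == 0, while on
 * one's-complement or sign-magnitude representations the result is non-zero
 * and the build is rejected. Likewise ULONG_MAX % 0xFFFF is zero only when
 * ULONG_MAX has the form 2^(16*k) - 1. */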
|
||||
#error \
|
||||
"Sanity checking failed: Two's complement, reasonably sized integer types"
|
||||
#endif
|
||||
|
||||
#ifndef SSIZE_MAX
|
||||
#define SSIZE_MAX INTPTR_MAX
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
|
||||
/* Actually libmdbx was not tested with compilers older than GCC 4.2.
|
||||
* But you could ignore this warning at your own risk.
|
||||
* In such case please don't raise issues related ONLY to old compilers.
|
||||
*/
|
||||
#warning "libmdbx requires GCC >= 4.2"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__) && !__CLANG_PREREQ(3, 8)
|
||||
/* Actually libmdbx was not tested with CLANG older than 3.8.
|
||||
* But you could ignore this warning at your own risk.
|
||||
* In such case please don't raise issues related ONLY to old compilers.
|
||||
*/
|
||||
#warning "libmdbx requires CLANG >= 3.8"
|
||||
#endif
|
||||
|
||||
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
|
||||
/* Actually libmdbx was not tested with something older than glibc 2.12.
|
||||
* But you could ignore this warning at your own risk.
|
||||
* In such case please don't raise issues related ONLY to old systems.
|
||||
*/
|
||||
#warning "libmdbx was only tested with GLIBC >= 2.12."
|
||||
#endif
|
||||
|
||||
#ifdef __SANITIZE_THREAD__
|
||||
#warning \
    "libmdbx is not compatible with ThreadSanitizer, you will get a lot of false-positive issues."
|
||||
#endif /* __SANITIZE_THREAD__ */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* C11' alignas() */
|
||||
|
||||
@ -240,12 +410,14 @@ __extern_C key_t ftok(const char *, int);
|
||||
#ifndef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#endif /* WIN32_LEAN_AND_MEAN */
|
||||
#include <excpt.h>
|
||||
#include <tlhelp32.h>
|
||||
#include <windows.h>
|
||||
#include <winnt.h>
|
||||
#include <winternl.h>
|
||||
|
||||
/* Included after windows.h to avoid build problems with MinGW and the like. */
|
||||
#include <excpt.h>
|
||||
#include <tlhelp32.h>
|
||||
|
||||
#else /*----------------------------------------------------------------------*/
|
||||
|
||||
#include <unistd.h>
|
||||
@ -502,10 +674,11 @@ __extern_C key_t ftok(const char *, int);
|
||||
|
||||
#ifndef container_of
|
||||
#define container_of(ptr, type, member) \
|
||||
((type *)((char *)(ptr)-offsetof(type, member)))
|
||||
((type *)((char *)(ptr) - offsetof(type, member)))
|
||||
#endif /* container_of */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* useful attributes */
|
||||
|
||||
#ifndef __always_inline
|
||||
#if defined(__GNUC__) || __has_attribute(__always_inline__)
|
||||
@ -513,7 +686,7 @@ __extern_C key_t ftok(const char *, int);
|
||||
#elif defined(_MSC_VER)
|
||||
#define __always_inline __forceinline
|
||||
#else
|
||||
#define __always_inline
|
||||
#define __always_inline __inline
|
||||
#endif
|
||||
#endif /* __always_inline */
|
||||
|
||||
@ -639,16 +812,6 @@ __extern_C key_t ftok(const char *, int);
|
||||
#endif
|
||||
#endif /* __anonymous_struct_extension__ */
|
||||
|
||||
#ifndef expect_with_probability
|
||||
#if defined(__builtin_expect_with_probability) || \
|
||||
__has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0)
|
||||
#define expect_with_probability(expr, value, prob) \
|
||||
__builtin_expect_with_probability(expr, value, prob)
|
||||
#else
|
||||
#define expect_with_probability(expr, value, prob) (expr)
|
||||
#endif
|
||||
#endif /* expect_with_probability */
|
||||
|
||||
#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE
|
||||
#ifdef WEAK_IMPORT_ATTRIBUTE
|
||||
#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE
|
||||
@ -662,6 +825,32 @@ __extern_C key_t ftok(const char *, int);
|
||||
#endif
|
||||
#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */
|
||||
|
||||
#if !defined(__thread) && (defined(_MSC_VER) || defined(__DMC__))
|
||||
#define __thread __declspec(thread)
|
||||
#endif /* __thread */
|
||||
|
||||
#ifndef MDBX_EXCLUDE_FOR_GPROF
|
||||
#ifdef ENABLE_GPROF
|
||||
#define MDBX_EXCLUDE_FOR_GPROF \
|
||||
__attribute__((__no_instrument_function__, \
|
||||
__no_profile_instrument_function__))
|
||||
#else
|
||||
#define MDBX_EXCLUDE_FOR_GPROF
|
||||
#endif /* ENABLE_GPROF */
|
||||
#endif /* MDBX_EXCLUDE_FOR_GPROF */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#ifndef expect_with_probability
|
||||
#if defined(__builtin_expect_with_probability) || \
|
||||
__has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0)
|
||||
#define expect_with_probability(expr, value, prob) \
|
||||
__builtin_expect_with_probability(expr, value, prob)
|
||||
#else
|
||||
#define expect_with_probability(expr, value, prob) (expr)
|
||||
#endif
|
||||
#endif /* expect_with_probability */
|
||||
|
||||
#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER
|
||||
#ifdef _PREFAST_
|
||||
#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1
|
||||
@ -684,7 +873,17 @@ __extern_C key_t ftok(const char *, int);
|
||||
#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id)
|
||||
#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */
|
||||
|
||||
#ifndef FLEXIBLE_ARRAY_MEMBERS
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
|
||||
(!defined(__cplusplus) && defined(_MSC_VER))
|
||||
#define FLEXIBLE_ARRAY_MEMBERS 1
|
||||
#else
|
||||
#define FLEXIBLE_ARRAY_MEMBERS 0
|
||||
#endif
|
||||
#endif /* FLEXIBLE_ARRAY_MEMBERS */
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* Valgrind and Address Sanitizer */
|
||||
|
||||
#if defined(ENABLE_MEMCHECK)
|
||||
#include <valgrind/memcheck.h>
|
||||
@ -766,10 +965,69 @@ template <typename T, size_t N> char (&__ArraySizeHelper(T (&array)[N]))[N];
|
||||
#define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
|
||||
#endif
|
||||
|
||||
#ifndef __Wpedantic_format_voidptr
|
||||
MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void *
|
||||
__Wpedantic_format_voidptr(const void *ptr) {
|
||||
return ptr;
|
||||
}
|
||||
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
|
||||
#endif /* __Wpedantic_format_voidptr */
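/* Usage sketch (illustrative only, not part of this commit): the shadowing
 * macro above lets a typed pointer be passed to a "%p" conversion without
 * tripping -Wpedantic, e.g.
 *   DEBUG("dirty page %p", __Wpedantic_format_voidptr(dp));
 * since the argument is laundered through `const void *`. */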
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1900
|
||||
/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
|
||||
* for internal format-args checker. */
|
||||
#undef PRIuPTR
|
||||
#undef PRIiPTR
|
||||
#undef PRIdPTR
|
||||
#undef PRIxPTR
|
||||
#define PRIuPTR "Iu"
|
||||
#define PRIiPTR "Ii"
|
||||
#define PRIdPTR "Id"
|
||||
#define PRIxPTR "Ix"
|
||||
#define PRIuSIZE "zu"
|
||||
#define PRIiSIZE "zi"
|
||||
#define PRIdSIZE "zd"
|
||||
#define PRIxSIZE "zx"
|
||||
#endif /* fix PRI*PTR for _MSC_VER */
|
||||
|
||||
#ifndef PRIuSIZE
|
||||
#define PRIuSIZE PRIuPTR
|
||||
#define PRIiSIZE PRIiPTR
|
||||
#define PRIdSIZE PRIdPTR
|
||||
#define PRIxSIZE PRIxPTR
|
||||
#endif /* PRI*SIZE macros for MSVC */
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
#if __has_warning("-Wnested-anon-types")
|
||||
#if defined(__clang__)
|
||||
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wnested-anon-types"
|
||||
#else
|
||||
#pragma warning disable "nested-anon-types"
|
||||
#endif
|
||||
#endif /* -Wnested-anon-types */
|
||||
|
||||
#if __has_warning("-Wconstant-logical-operand")
|
||||
#if defined(__clang__)
|
||||
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wconstant-logical-operand"
|
||||
#else
|
||||
#pragma warning disable "constant-logical-operand"
|
||||
#endif
|
||||
#endif /* -Wconstant-logical-operand */
|
||||
|
||||
#if defined(__LCC__) && (__LCC__ <= 121)
|
||||
/* bug #2798 */
|
||||
#pragma diag_suppress alignment_reduction_ignored
|
||||
#elif defined(__ICC)
|
||||
#pragma warning(disable : 3453 1366)
|
||||
#elif __has_warning("-Walignment-reduction-ignored")
|
||||
#if defined(__clang__)
|
||||
#pragma clang diagnostic ignored "-Walignment-reduction-ignored"
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
|
||||
#else
|
||||
#pragma warning disable "alignment-reduction-ignored"
|
||||
#endif
|
||||
#endif /* -Walignment-reduction-ignored */
|
119
src/proto.h
Normal file
119
src/proto.h
Normal file
@ -0,0 +1,119 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
/* Internal prototypes */
|
||||
|
||||
/* audit.c */
|
||||
MDBX_INTERNAL int audit_ex(MDBX_txn *txn, size_t retired_stored,
|
||||
bool dont_filter_gc);
|
||||
|
||||
/* mvcc-readers.c */
|
||||
MDBX_INTERNAL bsr_t mvcc_bind_slot(MDBX_env *env, const uintptr_t tid);
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL pgno_t mvcc_largest_this(MDBX_env *env,
|
||||
pgno_t largest);
|
||||
MDBX_INTERNAL txnid_t mvcc_shapshot_oldest(MDBX_env *const env,
|
||||
const txnid_t steady);
|
||||
MDBX_INTERNAL pgno_t mvcc_snapshot_largest(const MDBX_env *env,
|
||||
pgno_t last_used_page);
|
||||
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env,
|
||||
const txnid_t straggler);
|
||||
MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rlocked, int *dead);
|
||||
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
|
||||
|
||||
/* dxb.c */
|
||||
MDBX_INTERNAL int dxb_setup(MDBX_env *env, const int lck_rc,
|
||||
const mdbx_mode_t mode_bits);
|
||||
MDBX_INTERNAL int __must_check_result
|
||||
dxb_read_header(MDBX_env *env, meta_t *meta, const int lck_exclusive,
|
||||
const mdbx_mode_t mode_bits);
|
||||
enum resize_mode { implicit_grow, impilict_shrink, explicit_resize };
|
||||
MDBX_INTERNAL int __must_check_result dxb_resize(MDBX_env *const env,
|
||||
const pgno_t used_pgno,
|
||||
const pgno_t size_pgno,
|
||||
pgno_t limit_pgno,
|
||||
const enum resize_mode mode);
|
||||
MDBX_INTERNAL int dxb_set_readahead(const MDBX_env *env, const pgno_t edge,
|
||||
const bool enable, const bool force_whole);
|
||||
MDBX_INTERNAL int __must_check_result dxb_sync_locked(MDBX_env *env,
|
||||
unsigned flags,
|
||||
meta_t *const pending,
|
||||
troika_t *const troika);
|
||||
#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)
|
||||
MDBX_INTERNAL void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn);
|
||||
#else
|
||||
static inline void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) {
|
||||
(void)env;
|
||||
(void)txn;
|
||||
}
|
||||
#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */
|
||||
|
||||
/* txn.c */
|
||||
MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
|
||||
MDBX_INTERNAL txnid_t txn_snapshot_oldest(const MDBX_txn *const txn);
|
||||
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags);
|
||||
|
||||
#define TXN_END_NAMES \
|
||||
{"committed", "empty-commit", "abort", "reset", \
|
||||
"reset-tmp", "fail-begin", "fail-beginchild"}
|
||||
enum {
|
||||
/* txn_end operation number, for logging */
|
||||
TXN_END_COMMITTED,
|
||||
TXN_END_PURE_COMMIT,
|
||||
TXN_END_ABORT,
|
||||
TXN_END_RESET,
|
||||
TXN_END_RESET_TMP,
|
||||
TXN_END_FAIL_BEGIN,
|
||||
TXN_END_FAIL_BEGINCHILD,
|
||||
|
||||
TXN_END_OPMASK = 0x0F /* mask for txn_end() operation number */,
|
||||
TXN_END_UPDATE = 0x10 /* update env state (DBIs) */,
|
||||
TXN_END_FREE = 0x20 /* free txn unless it is env.basal_txn */,
|
||||
TXN_END_EOTDONE = 0x40 /* txn's cursors already closed */,
|
||||
TXN_END_SLOT = 0x80 /* release any reader slot if NOSTICKYTHREADS */
|
||||
};
|
||||
MDBX_INTERNAL int txn_end(MDBX_txn *txn, const unsigned mode);
|
||||
MDBX_INTERNAL int txn_write(MDBX_txn *txn, iov_ctx_t *ctx);
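The TXN_END_* enum above packs the operation code into the low nibble (TXN_END_OPMASK) and keeps the high bits for modifier flags. A minimal sketch of composing and decoding such a mode word (illustrative only, not part of this commit; example_txn_end_mode is a made-up name):

static inline unsigned example_txn_end_mode(void) {
  /* operation code in the low nibble, modifiers in the high bits */
  const unsigned mode = TXN_END_ABORT | TXN_END_SLOT | TXN_END_FREE;
  assert((mode & TXN_END_OPMASK) == TXN_END_ABORT);
  return mode;
}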
|
||||
|
||||
/* env.c */
|
||||
MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode);
|
||||
MDBX_INTERNAL int env_info(const MDBX_env *env, const MDBX_txn *txn,
|
||||
MDBX_envinfo *out, size_t bytes, troika_t *troika);
|
||||
MDBX_INTERNAL int env_sync(MDBX_env *env, bool force, bool nonblock);
|
||||
MDBX_INTERNAL int env_close(MDBX_env *env, bool resurrect_after_fork);
|
||||
MDBX_INTERNAL bool env_txn0_owned(const MDBX_env *env);
|
||||
MDBX_INTERNAL void env_options_init(MDBX_env *env);
|
||||
MDBX_INTERNAL void env_options_adjust_defaults(MDBX_env *env);
|
||||
MDBX_INTERNAL int __must_check_result env_page_auxbuffer(MDBX_env *env);
|
||||
MDBX_INTERNAL unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize);
|
||||
|
||||
/* tree.c */
|
||||
MDBX_INTERNAL int tree_drop(MDBX_cursor *mc, const bool may_have_subDBs);
|
||||
MDBX_INTERNAL int __must_check_result tree_rebalance(MDBX_cursor *mc);
|
||||
MDBX_INTERNAL int __must_check_result tree_propagate_key(MDBX_cursor *mc,
|
||||
const MDBX_val *key);
|
||||
MDBX_INTERNAL void recalculate_merge_thresholds(MDBX_env *env);
|
||||
|
||||
/* subdb.c */
|
||||
MDBX_INTERNAL int __must_check_result sdb_fetch(MDBX_txn *txn, size_t dbi);
|
||||
MDBX_INTERNAL int __must_check_result sdb_setup(const MDBX_env *env,
|
||||
kvx_t *const kvx,
|
||||
const tree_t *const db);
|
||||
|
||||
/* coherency.c */
|
||||
MDBX_INTERNAL bool coherency_check_meta(const MDBX_env *env,
|
||||
const volatile meta_t *meta,
|
||||
bool report);
|
||||
MDBX_INTERNAL int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head,
|
||||
uint64_t *timestamp);
|
||||
MDBX_INTERNAL int coherency_check_written(const MDBX_env *env,
|
||||
const txnid_t txnid,
|
||||
const volatile meta_t *meta,
|
||||
const intptr_t pgno,
|
||||
uint64_t *timestamp);
|
||||
MDBX_INTERNAL int coherency_timeout(uint64_t *timestamp, intptr_t pgno,
|
||||
const MDBX_env *env);
|
394
src/range-estimate.c
Normal file
394
src/range-estimate.c
Normal file
@ -0,0 +1,394 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
typedef struct diff_result {
|
||||
ptrdiff_t diff;
|
||||
intptr_t level;
|
||||
ptrdiff_t root_nkeys;
|
||||
} diff_t;
|
||||
|
||||
/* calculates: r = x - y */
|
||||
__hot static int cursor_diff(const MDBX_cursor *const __restrict x,
|
||||
const MDBX_cursor *const __restrict y,
|
||||
diff_t *const __restrict r) {
|
||||
r->diff = 0;
|
||||
r->level = 0;
|
||||
r->root_nkeys = 0;
|
||||
|
||||
if (unlikely(x->signature != cur_signature_live))
|
||||
return (x->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
if (unlikely(y->signature != cur_signature_live))
|
||||
return (y->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn(x->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(x->txn != y->txn))
|
||||
return MDBX_BAD_TXN;
|
||||
|
||||
if (unlikely(y->dbi_state != x->dbi_state))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
const intptr_t depth = (x->top < y->top) ? x->top : y->top;
|
||||
if (unlikely(depth < 0))
|
||||
return MDBX_ENODATA;
|
||||
|
||||
r->root_nkeys = page_numkeys(x->pg[0]);
|
||||
intptr_t nkeys = r->root_nkeys;
|
||||
for (;;) {
|
||||
if (unlikely(y->pg[r->level] != x->pg[r->level])) {
|
||||
ERROR("Mismatched cursors' pages at %zu level", r->level);
|
||||
return MDBX_PROBLEM;
|
||||
}
|
||||
r->diff = x->ki[r->level] - y->ki[r->level];
|
||||
if (r->diff)
|
||||
break;
|
||||
r->level += 1;
|
||||
if (r->level > depth) {
|
||||
r->diff = CMP2INT(x->flags & z_eof_hard, y->flags & z_eof_hard);
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
nkeys = page_numkeys(x->pg[r->level]);
|
||||
}
|
||||
|
||||
while (unlikely(r->diff == 1) && likely(r->level < depth)) {
|
||||
r->level += 1;
|
||||
/* DB'PAGEs: 0------------------>MAX
|
||||
*
|
||||
* CURSORs: y < x
|
||||
* STACK[i ]: |
|
||||
* STACK[+1]: ...y++N|0++x...
|
||||
*/
|
||||
nkeys = page_numkeys(y->pg[r->level]);
|
||||
r->diff = (nkeys - y->ki[r->level]) + x->ki[r->level];
|
||||
assert(r->diff > 0);
|
||||
}
|
||||
|
||||
while (unlikely(r->diff == -1) && likely(r->level < depth)) {
|
||||
r->level += 1;
|
||||
/* DB'PAGEs: 0------------------>MAX
|
||||
*
|
||||
* CURSORs: x < y
|
||||
* STACK[i ]: |
|
||||
* STACK[+1]: ...x--N|0--y...
|
||||
*/
|
||||
nkeys = page_numkeys(x->pg[r->level]);
|
||||
r->diff = -(nkeys - x->ki[r->level]) - y->ki[r->level];
|
||||
assert(r->diff < 0);
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot static ptrdiff_t estimate(const tree_t *tree,
|
||||
diff_t *const __restrict dr) {
|
||||
/* root: branch-page => scale = leaf-factor * branch-factor^(N-1)
|
||||
* level-1: branch-page(s) => scale = leaf-factor * branch-factor^2
|
||||
* level-2: branch-page(s) => scale = leaf-factor * branch-factor
|
||||
* level-N: branch-page(s) => scale = leaf-factor
|
||||
* leaf-level: leaf-page(s) => scale = 1
|
||||
*/
|
||||
ptrdiff_t btree_power = (ptrdiff_t)tree->height - 2 - (ptrdiff_t)dr->level;
|
||||
if (btree_power < 0)
|
||||
return dr->diff;
|
||||
|
||||
ptrdiff_t estimated =
|
||||
(ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)tree->leaf_pages;
|
||||
if (btree_power == 0)
|
||||
return estimated;
|
||||
|
||||
if (tree->height < 4) {
|
||||
assert(dr->level == 0 && btree_power == 1);
|
||||
return (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)dr->root_nkeys;
|
||||
}
|
||||
|
||||
/* average_branchpage_fillfactor = total(branch_entries) / branch_pages
|
||||
total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */
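/* Worked example (illustrative note, not from the original sources): with
 * leaf_pages = 1000 and branch_pages = 11 the average branch-page fill
 * factor is (1000 + 11 - 1) / 11, i.e. about 91.8 entries per branch page;
 * `factor` below stores that ratio scaled by 2^log2_fixedpoint, while `half`
 * provides round-to-nearest when the scaled products are shifted back down. */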
|
||||
const size_t log2_fixedpoint = sizeof(size_t) - 1;
|
||||
const size_t half = UINT64_C(1) << (log2_fixedpoint - 1);
|
||||
const size_t factor =
|
||||
((tree->leaf_pages + tree->branch_pages - 1) << log2_fixedpoint) /
|
||||
tree->branch_pages;
|
||||
while (1) {
|
||||
switch ((size_t)btree_power) {
|
||||
default: {
|
||||
const size_t square = (factor * factor + half) >> log2_fixedpoint;
|
||||
const size_t quad = (square * square + half) >> log2_fixedpoint;
|
||||
do {
|
||||
estimated = estimated * quad + half;
|
||||
estimated >>= log2_fixedpoint;
|
||||
btree_power -= 4;
|
||||
} while (btree_power >= 4);
|
||||
continue;
|
||||
}
|
||||
case 3:
|
||||
estimated = estimated * factor + half;
|
||||
estimated >>= log2_fixedpoint;
|
||||
__fallthrough /* fall through */;
|
||||
case 2:
|
||||
estimated = estimated * factor + half;
|
||||
estimated >>= log2_fixedpoint;
|
||||
__fallthrough /* fall through */;
|
||||
case 1:
|
||||
estimated = estimated * factor + half;
|
||||
estimated >>= log2_fixedpoint;
|
||||
__fallthrough /* fall through */;
|
||||
case 0:
|
||||
if (unlikely(estimated > (ptrdiff_t)tree->items))
|
||||
return (ptrdiff_t)tree->items;
|
||||
if (unlikely(estimated < -(ptrdiff_t)tree->items))
|
||||
return -(ptrdiff_t)tree->items;
|
||||
return estimated;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__hot int mdbx_estimate_distance(const MDBX_cursor *first,
|
||||
const MDBX_cursor *last,
|
||||
ptrdiff_t *distance_items) {
|
||||
if (unlikely(first == nullptr || last == nullptr ||
|
||||
distance_items == nullptr))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
*distance_items = 0;
|
||||
diff_t dr;
|
||||
int rc = cursor_diff(last, first, &dr);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
cASSERT(first, dr.diff || inner_pointed(first) == inner_pointed(last));
|
||||
if (unlikely(dr.diff == 0) && inner_pointed(first)) {
|
||||
first = &first->subcur->cursor;
|
||||
last = &last->subcur->cursor;
|
||||
rc = cursor_diff(first, last, &dr);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (likely(dr.diff != 0))
|
||||
*distance_items = estimate(first->tree, &dr);
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__hot int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key,
|
||||
MDBX_val *data, MDBX_cursor_op move_op,
|
||||
ptrdiff_t *distance_items) {
|
||||
if (unlikely(cursor == nullptr || distance_items == nullptr ||
|
||||
move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(cursor->signature != cur_signature_live))
|
||||
return (cursor->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
|
||||
: MDBX_EBADSIGN;
|
||||
|
||||
int rc = check_txn(cursor->txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!is_pointed(cursor)))
|
||||
return MDBX_ENODATA;
|
||||
|
||||
cursor_couple_t next;
|
||||
rc = cursor_init(&next.outer, cursor->txn, cursor_dbi(cursor));
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
cursor_cpstk(cursor, &next.outer);
|
||||
if (cursor->tree->flags & MDBX_DUPSORT) {
|
||||
subcur_t *mx = &container_of(cursor, cursor_couple_t, outer)->inner;
|
||||
cursor_cpstk(&mx->cursor, &next.inner.cursor);
|
||||
}
|
||||
|
||||
MDBX_val stub_data;
|
||||
if (data == nullptr) {
|
||||
const unsigned mask =
|
||||
1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY;
|
||||
if (unlikely(mask & (1 << move_op)))
|
||||
return MDBX_EINVAL;
|
||||
stub_data.iov_base = nullptr;
|
||||
stub_data.iov_len = 0;
|
||||
data = &stub_data;
|
||||
}
|
||||
|
||||
MDBX_val stub_key;
|
||||
if (key == nullptr) {
|
||||
const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE |
|
||||
1 << MDBX_SET_KEY | 1 << MDBX_SET |
|
||||
1 << MDBX_SET_RANGE;
|
||||
if (unlikely(mask & (1 << move_op)))
|
||||
return MDBX_EINVAL;
|
||||
stub_key.iov_base = nullptr;
|
||||
stub_key.iov_len = 0;
|
||||
key = &stub_key;
|
||||
}
|
||||
|
||||
next.outer.signature = cur_signature_live;
|
||||
rc = cursor_ops(&next.outer, key, data, move_op);
|
||||
if (unlikely(rc != MDBX_SUCCESS &&
|
||||
(rc != MDBX_NOTFOUND || !is_pointed(&next.outer))))
|
||||
return rc;
|
||||
|
||||
if (move_op == MDBX_LAST) {
|
||||
next.outer.flags |= z_eof_hard;
|
||||
next.inner.cursor.flags |= z_eof_hard;
|
||||
}
|
||||
return mdbx_estimate_distance(cursor, &next.outer, distance_items);
|
||||
}
|
||||
|
||||
__hot int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi,
|
||||
const MDBX_val *begin_key,
|
||||
const MDBX_val *begin_data,
|
||||
const MDBX_val *end_key, const MDBX_val *end_data,
|
||||
ptrdiff_t *size_items) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(!size_items))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(begin_data &&
|
||||
(begin_key == nullptr || begin_key == MDBX_EPSILON)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(end_data && (end_key == nullptr || end_key == MDBX_EPSILON)))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON))
|
||||
return MDBX_EINVAL;
|
||||
|
||||
cursor_couple_t begin;
|
||||
/* LY: first, initialize cursor to refresh a DB in case it has DB_STALE */
|
||||
rc = cursor_init(&begin.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(begin.outer.tree->items == 0)) {
|
||||
*size_items = 0;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
if (!begin_key) {
|
||||
if (unlikely(!end_key)) {
|
||||
/* LY: FIRST..LAST case */
|
||||
*size_items = (ptrdiff_t)begin.outer.tree->items;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
rc = outer_first(&begin.outer, nullptr, nullptr);
|
||||
if (unlikely(end_key == MDBX_EPSILON)) {
|
||||
/* LY: FIRST..+epsilon case */
|
||||
return (rc == MDBX_SUCCESS)
|
||||
? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
|
||||
: rc;
|
||||
}
|
||||
} else {
|
||||
if (unlikely(begin_key == MDBX_EPSILON)) {
|
||||
if (end_key == nullptr) {
|
||||
/* LY: -epsilon..LAST case */
|
||||
rc = outer_last(&begin.outer, nullptr, nullptr);
|
||||
return (rc == MDBX_SUCCESS)
|
||||
? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
|
||||
: rc;
|
||||
}
|
||||
/* LY: -epsilon..value case */
|
||||
assert(end_key != MDBX_EPSILON);
|
||||
begin_key = end_key;
|
||||
} else if (unlikely(end_key == MDBX_EPSILON)) {
|
||||
/* LY: value..+epsilon case */
|
||||
assert(begin_key != MDBX_EPSILON);
|
||||
end_key = begin_key;
|
||||
}
|
||||
if (end_key && !begin_data && !end_data &&
|
||||
(begin_key == end_key ||
|
||||
begin.outer.clc->k.cmp(begin_key, end_key) == 0)) {
|
||||
/* LY: single key case */
|
||||
rc = cursor_seek(&begin.outer, (MDBX_val *)begin_key, nullptr, MDBX_SET)
|
||||
.err;
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
*size_items = 0;
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
|
||||
}
|
||||
*size_items = 1;
|
||||
if (inner_pointed(&begin.outer))
|
||||
*size_items =
|
||||
(sizeof(*size_items) >= sizeof(begin.inner.nested_tree.items) ||
|
||||
begin.inner.nested_tree.items <= PTRDIFF_MAX)
|
||||
? (size_t)begin.inner.nested_tree.items
|
||||
: PTRDIFF_MAX;
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
} else {
|
||||
MDBX_val proxy_key = *begin_key;
|
||||
MDBX_val proxy_data = {nullptr, 0};
|
||||
if (begin_data)
|
||||
proxy_data = *begin_data;
|
||||
rc = cursor_seek(&begin.outer, &proxy_key, &proxy_data,
|
||||
MDBX_SET_LOWERBOUND)
|
||||
.err;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (rc != MDBX_NOTFOUND || !is_pointed(&begin.outer))
|
||||
return rc;
|
||||
}
|
||||
|
||||
cursor_couple_t end;
|
||||
rc = cursor_init(&end.outer, txn, dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
if (!end_key) {
|
||||
rc = outer_last(&end.outer, nullptr, nullptr);
|
||||
end.outer.flags |= z_eof_hard;
|
||||
end.inner.cursor.flags |= z_eof_hard;
|
||||
} else {
|
||||
MDBX_val proxy_key = *end_key;
|
||||
MDBX_val proxy_data = {nullptr, 0};
|
||||
if (end_data)
|
||||
proxy_data = *end_data;
|
||||
rc = cursor_seek(&end.outer, &proxy_key, &proxy_data, MDBX_SET_LOWERBOUND)
|
||||
.err;
|
||||
}
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (rc != MDBX_NOTFOUND || !is_pointed(&end.outer))
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
assert(*size_items >= -(ptrdiff_t)begin.outer.tree->items &&
|
||||
*size_items <= (ptrdiff_t)begin.outer.tree->items);
|
||||
|
||||
#if 0 /* LY: It was decided to return the estimation results as-is \
       * (i.e. negative) for inverted ranges. */
|
||||
|
||||
/* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
|
||||
Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */
|
||||
|
||||
if (*size_items < 0) {
|
||||
/* LY: inverted range case */
|
||||
*size_items += (ptrdiff_t)begin.outer.tree->items;
|
||||
} else if (*size_items == 0 && begin_key && end_key) {
|
||||
int cmp = begin.outer.kvx->cmp(&origin_begin_key, &origin_end_key);
|
||||
if (cmp == 0 && cursor_pointed(begin.inner.cursor.flags) &&
|
||||
begin_data && end_data)
|
||||
cmp = begin.outer.kvx->v.cmp(&origin_begin_data, &origin_end_data);
|
||||
if (cmp > 0) {
|
||||
/* LY: inverted range case with empty scope */
|
||||
*size_items = (ptrdiff_t)begin.outer.tree->items;
|
||||
}
|
||||
}
|
||||
assert(*size_items >= 0 &&
|
||||
*size_items <= (ptrdiff_t)begin.outer.tree->items);
|
||||
#endif
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
229
src/refund.c
Normal file
229
src/refund.c
Normal file
@ -0,0 +1,229 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
#if MDBX_ENABLE_REFUND
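/* Illustrative note (not from the original sources): refunding shrinks the
 * used space instead of recycling pages through the GC. For example, with
 * geo.first_unallocated == 1000 and the tail of tw.relist holding the pages
 * 999, 998, 997, refund_reclaimed() below drops those three entries and
 * rewinds first_unallocated to 997, so these pages are simply no longer
 * counted as allocated. */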
|
||||
static void refund_reclaimed(MDBX_txn *txn) {
|
||||
/* Scanning in descending order */
|
||||
pgno_t first_unallocated = txn->geo.first_unallocated;
|
||||
const pnl_t pnl = txn->tw.relist;
|
||||
tASSERT(txn,
|
||||
MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == first_unallocated - 1);
|
||||
#if MDBX_PNL_ASCENDING
|
||||
size_t i = MDBX_PNL_GETSIZE(pnl);
|
||||
tASSERT(txn, pnl[i] == first_unallocated - 1);
|
||||
while (--first_unallocated, --i > 0 && pnl[i] == first_unallocated - 1)
|
||||
;
|
||||
MDBX_PNL_SETSIZE(pnl, i);
|
||||
#else
|
||||
size_t i = 1;
|
||||
tASSERT(txn, pnl[i] == first_unallocated - 1);
|
||||
size_t len = MDBX_PNL_GETSIZE(pnl);
|
||||
while (--first_unallocated, ++i <= len && pnl[i] == first_unallocated - 1)
|
||||
;
|
||||
MDBX_PNL_SETSIZE(pnl, len -= i - 1);
|
||||
for (size_t move = 0; move < len; ++move)
|
||||
pnl[1 + move] = pnl[i + move];
|
||||
#endif
|
||||
VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
|
||||
txn->geo.first_unallocated - first_unallocated,
|
||||
txn->geo.first_unallocated, first_unallocated);
|
||||
txn->geo.first_unallocated = first_unallocated;
|
||||
tASSERT(txn,
|
||||
pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - 1));
|
||||
}
|
||||
|
||||
static void refund_loose(MDBX_txn *txn) {
|
||||
tASSERT(txn, txn->tw.loose_pages != nullptr);
|
||||
tASSERT(txn, txn->tw.loose_count > 0);
|
||||
|
||||
dpl_t *const dl = txn->tw.dirtylist;
|
||||
if (dl) {
|
||||
tASSERT(txn, dl->length >= txn->tw.loose_count);
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
} else {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||||
}
|
||||
|
||||
pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
|
||||
pnl_t suitable = onstack;
|
||||
|
||||
if (!dl || dl->length - dl->sorted > txn->tw.loose_count) {
|
||||
/* Dirty list is useless since unsorted. */
|
||||
if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) {
|
||||
suitable = pnl_alloc(txn->tw.loose_count);
|
||||
if (unlikely(!suitable))
|
||||
return /* this is not a reason for transaction fail */;
|
||||
}
|
||||
|
||||
/* Collect loose-pages which may be refunded. */
|
||||
tASSERT(txn,
|
||||
txn->geo.first_unallocated >= MIN_PAGENO + txn->tw.loose_count);
|
||||
pgno_t most = MIN_PAGENO;
|
||||
size_t w = 0;
|
||||
for (const page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) {
|
||||
tASSERT(txn, lp->flags == P_LOOSE);
|
||||
tASSERT(txn, txn->geo.first_unallocated > lp->pgno);
|
||||
if (likely(txn->geo.first_unallocated - txn->tw.loose_count <=
|
||||
lp->pgno)) {
|
||||
tASSERT(txn,
|
||||
w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack))
|
||||
: MDBX_PNL_ALLOCLEN(suitable)));
|
||||
suitable[++w] = lp->pgno;
|
||||
most = (lp->pgno > most) ? lp->pgno : most;
|
||||
}
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
}
|
||||
|
||||
if (most + 1 == txn->geo.first_unallocated) {
|
||||
/* Sort suitable list and refund pages at the tail. */
|
||||
MDBX_PNL_SETSIZE(suitable, w);
|
||||
pnl_sort(suitable, MAX_PAGENO + 1);
|
||||
|
||||
/* Scanning in descending order */
|
||||
const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1;
|
||||
const intptr_t begin =
|
||||
MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(suitable) : 1;
|
||||
const intptr_t end =
|
||||
MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_GETSIZE(suitable) + 1;
|
||||
tASSERT(txn, suitable[begin] >= suitable[end - step]);
|
||||
tASSERT(txn, most == suitable[begin]);
|
||||
|
||||
for (intptr_t i = begin + step; i != end; i += step) {
|
||||
if (suitable[i] != most - 1)
|
||||
break;
|
||||
most -= 1;
|
||||
}
|
||||
const size_t refunded = txn->geo.first_unallocated - most;
|
||||
DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded,
|
||||
most, txn->geo.first_unallocated);
|
||||
txn->geo.first_unallocated = most;
|
||||
txn->tw.loose_count -= refunded;
|
||||
if (dl) {
|
||||
txn->tw.dirtyroom += refunded;
|
||||
dl->pages_including_loose -= refunded;
|
||||
assert(txn->tw.dirtyroom <= txn->env->options.dp_limit);
|
||||
|
||||
/* Filter-out dirty list */
|
||||
size_t r = 0;
|
||||
w = 0;
|
||||
if (dl->sorted) {
|
||||
do {
|
||||
if (dl->items[++r].pgno < most) {
|
||||
if (++w != r)
|
||||
dl->items[w] = dl->items[r];
|
||||
}
|
||||
} while (r < dl->sorted);
|
||||
dl->sorted = w;
|
||||
}
|
||||
while (r < dl->length) {
|
||||
if (dl->items[++r].pgno < most) {
|
||||
if (++w != r)
|
||||
dl->items[w] = dl->items[r];
|
||||
}
|
||||
}
|
||||
dpl_setlen(dl, w);
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom
|
||||
: txn->env->options.dp_limit));
|
||||
}
|
||||
goto unlink_loose;
|
||||
}
|
||||
} else {
|
||||
/* Dirtylist is mostly sorted, just refund loose pages at the end. */
|
||||
dpl_sort(txn);
|
||||
tASSERT(txn,
|
||||
dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno);
|
||||
tASSERT(txn, dl->sorted == dl->length);
|
||||
|
||||
/* Scan dirtylist tail-forward and cutoff suitable pages. */
|
||||
size_t n;
|
||||
for (n = dl->length; dl->items[n].pgno == txn->geo.first_unallocated - 1 &&
|
||||
dl->items[n].ptr->flags == P_LOOSE;
|
||||
--n) {
|
||||
tASSERT(txn, n > 0);
|
||||
page_t *dp = dl->items[n].ptr;
|
||||
DEBUG("refund-sorted page %" PRIaPGNO, dp->pgno);
|
||||
tASSERT(txn, dp->pgno == dl->items[n].pgno);
|
||||
txn->geo.first_unallocated -= 1;
|
||||
}
|
||||
dpl_setlen(dl, n);
|
||||
|
||||
if (dl->sorted != dl->length) {
|
||||
const size_t refunded = dl->sorted - dl->length;
|
||||
dl->sorted = dl->length;
|
||||
txn->tw.loose_count -= refunded;
|
||||
txn->tw.dirtyroom += refunded;
|
||||
dl->pages_including_loose -= refunded;
|
||||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||||
(txn->parent ? txn->parent->tw.dirtyroom
|
||||
: txn->env->options.dp_limit));
|
||||
|
||||
/* Filter-out loose chain & dispose refunded pages. */
|
||||
unlink_loose:
|
||||
for (page_t *__restrict *__restrict link = &txn->tw.loose_pages; *link;) {
|
||||
page_t *dp = *link;
|
||||
tASSERT(txn, dp->flags == P_LOOSE);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *));
|
||||
if (txn->geo.first_unallocated > dp->pgno) {
|
||||
link = &page_next(dp);
|
||||
} else {
|
||||
*link = page_next(dp);
|
||||
if ((txn->flags & MDBX_WRITEMAP) == 0)
|
||||
page_shadow_release(txn->env, dp, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
if (suitable != onstack)
|
||||
pnl_free(suitable);
|
||||
txn->tw.loose_refund_wl = txn->geo.first_unallocated;
|
||||
}
|
||||
|
||||
bool txn_refund(MDBX_txn *txn) {
|
||||
const pgno_t before = txn->geo.first_unallocated;
|
||||
|
||||
if (txn->tw.loose_pages &&
|
||||
txn->tw.loose_refund_wl > txn->geo.first_unallocated)
|
||||
refund_loose(txn);
|
||||
|
||||
while (true) {
|
||||
if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 ||
|
||||
MDBX_PNL_MOST(txn->tw.relist) != txn->geo.first_unallocated - 1)
|
||||
break;
|
||||
|
||||
refund_reclaimed(txn);
|
||||
if (!txn->tw.loose_pages ||
|
||||
txn->tw.loose_refund_wl <= txn->geo.first_unallocated)
|
||||
break;
|
||||
|
||||
const pgno_t memo = txn->geo.first_unallocated;
|
||||
refund_loose(txn);
|
||||
if (memo == txn->geo.first_unallocated)
|
||||
break;
|
||||
}
|
||||
|
||||
if (before == txn->geo.first_unallocated)
|
||||
return false;
|
||||
|
||||
if (txn->tw.spilled.list)
|
||||
/* Squash deleted pagenums if we refunded any */
|
||||
spill_purge(txn);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#else /* MDBX_ENABLE_REFUND */
|
||||
|
||||
bool txn_refund(MDBX_txn *txn) {
|
||||
(void)txn;
|
||||
/* No online auto-compactification. */
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif /* MDBX_ENABLE_REFUND */
|
485
src/sort.h
Normal file
485
src/sort.h
Normal file
@ -0,0 +1,485 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
///
|
||||
/// \file sort.h
|
||||
/// \brief Macros implementing sorting and binary search
|
||||
|
||||
#pragma once
|
||||
|
||||
#define MDBX_RADIXSORT_THRESHOLD 142
|
||||
|
||||
/* ---------------------------------------------------------------------------
|
||||
* LY: State of the art quicksort-based sorting, with internal stack
|
||||
* and network-sort for small chunks.
|
||||
* Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */
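/* Usage sketch (illustrative only, not part of this commit):
 *
 *   #define EXAMPLE_PGNO_LESS(a, b) ((a) < (b))
 *   SORT_IMPL(example_pgno_sort, false, pgno_t, EXAMPLE_PGNO_LESS)
 *   ...
 *   example_pgno_sort(array, array + length);
 *
 * SORT_IMPL() below generates a static quicksort with an explicit stack that
 * hands chunks of up to 8 elements to the SORT_NETWORK_* comparator networks;
 * the comparison macro must implement a strict "less than". */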
|
||||
|
||||
#if MDBX_HAVE_CMOV
|
||||
#define SORT_CMP_SWAP(TYPE, CMP, a, b) \
|
||||
do { \
|
||||
const TYPE swap_tmp = (a); \
|
||||
const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \
|
||||
(a) = swap_cmp ? swap_tmp : b; \
|
||||
(b) = swap_cmp ? b : swap_tmp; \
|
||||
} while (0)
|
||||
#else
|
||||
#define SORT_CMP_SWAP(TYPE, CMP, a, b) \
|
||||
do \
|
||||
if (expect_with_probability(!CMP(a, b), 0, .5)) { \
|
||||
const TYPE swap_tmp = (a); \
|
||||
(a) = (b); \
|
||||
(b) = swap_tmp; \
|
||||
} \
|
||||
while (0)
|
||||
#endif
|
||||
|
||||
// 3 comparators, 3 parallel operations
|
||||
// o-----^--^--o
|
||||
// | |
|
||||
// o--^--|--v--o
|
||||
// | |
|
||||
// o--v--v-----o
|
||||
//
|
||||
// [[1,2]]
|
||||
// [[0,2]]
|
||||
// [[0,1]]
|
||||
#define SORT_NETWORK_3(TYPE, CMP, begin) \
|
||||
do { \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
} while (0)
|
||||
|
||||
// 5 comparators, 3 parallel operations
|
||||
// o--^--^--------o
|
||||
// | |
|
||||
// o--v--|--^--^--o
|
||||
// | | |
|
||||
// o--^--v--|--v--o
|
||||
// | |
|
||||
// o--v-----v-----o
|
||||
//
|
||||
// [[0,1],[2,3]]
|
||||
// [[0,2],[1,3]]
|
||||
// [[1,2]]
|
||||
#define SORT_NETWORK_4(TYPE, CMP, begin) \
|
||||
do { \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
|
||||
} while (0)
|
||||
|
||||
// 9 comparators, 5 parallel operations
|
||||
// o--^--^-----^-----------o
|
||||
// | | |
|
||||
// o--|--|--^--v-----^--^--o
|
||||
// | | | | |
|
||||
// o--|--v--|--^--^--|--v--o
|
||||
// | | | | |
|
||||
// o--|-----v--|--v--|--^--o
|
||||
// | | | |
|
||||
// o--v--------v-----v--v--o
|
||||
//
|
||||
// [[0,4],[1,3]]
|
||||
// [[0,2]]
|
||||
// [[2,4],[0,1]]
|
||||
// [[2,3],[1,4]]
|
||||
// [[1,2],[3,4]]
|
||||
#define SORT_NETWORK_5(TYPE, CMP, begin) \
|
||||
do { \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
|
||||
} while (0)
|
||||
|
||||
// 12 comparators, 6 parallel operations
|
||||
// o-----^--^--^-----------------o
|
||||
// | | |
|
||||
// o--^--|--v--|--^--------^-----o
|
||||
// | | | | |
|
||||
// o--v--v-----|--|--^--^--|--^--o
|
||||
// | | | | | |
|
||||
// o-----^--^--v--|--|--|--v--v--o
|
||||
// | | | | |
|
||||
// o--^--|--v-----v--|--v--------o
|
||||
// | | |
|
||||
// o--v--v-----------v-----------o
|
||||
//
|
||||
// [[1,2],[4,5]]
|
||||
// [[0,2],[3,5]]
|
||||
// [[0,1],[3,4],[2,5]]
|
||||
// [[0,3],[1,4]]
|
||||
// [[2,4],[1,3]]
|
||||
// [[2,3]]
|
||||
#define SORT_NETWORK_6(TYPE, CMP, begin) \
|
||||
do { \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
|
||||
} while (0)
|
||||
|
||||
// 16 comparators, 6 parallel operations
|
||||
// o--^--------^-----^-----------------o
|
||||
// | | |
|
||||
// o--|--^-----|--^--v--------^--^-----o
|
||||
// | | | | | |
|
||||
// o--|--|--^--v--|--^-----^--|--v-----o
|
||||
// | | | | | | |
|
||||
// o--|--|--|-----v--|--^--v--|--^--^--o
|
||||
// | | | | | | | |
|
||||
// o--v--|--|--^-----v--|--^--v--|--v--o
|
||||
// | | | | | |
|
||||
// o-----v--|--|--------v--v-----|--^--o
|
||||
// | | | |
|
||||
// o--------v--v-----------------v--v--o
|
||||
//
|
||||
// [[0,4],[1,5],[2,6]]
|
||||
// [[0,2],[1,3],[4,6]]
|
||||
// [[2,4],[3,5],[0,1]]
|
||||
// [[2,3],[4,5]]
|
||||
// [[1,4],[3,6]]
|
||||
// [[1,2],[3,4],[5,6]]
|
||||
#define SORT_NETWORK_7(TYPE, CMP, begin) \
|
||||
do { \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \
|
||||
} while (0)
|
||||
|
||||
// 19 comparators, 6 parallel operations
|
||||
// o--^--------^-----^-----------------o
|
||||
// | | |
|
||||
// o--|--^-----|--^--v--------^--^-----o
|
||||
// | | | | | |
|
||||
// o--|--|--^--v--|--^-----^--|--v-----o
|
||||
// | | | | | | |
|
||||
// o--|--|--|--^--v--|--^--v--|--^--^--o
|
||||
// | | | | | | | | |
|
||||
// o--v--|--|--|--^--v--|--^--v--|--v--o
|
||||
// | | | | | | |
|
||||
// o-----v--|--|--|--^--v--v-----|--^--o
|
||||
// | | | | | |
|
||||
// o--------v--|--v--|--^--------v--v--o
|
||||
// | | |
|
||||
// o-----------v-----v--v--------------o
|
||||
//
|
||||
// [[0,4],[1,5],[2,6],[3,7]]
|
||||
// [[0,2],[1,3],[4,6],[5,7]]
|
||||
// [[2,4],[3,5],[0,1],[6,7]]
|
||||
// [[2,3],[4,5]]
|
||||
// [[1,4],[3,6]]
|
||||
// [[1,2],[3,4],[5,6]]
|
||||
#define SORT_NETWORK_8(TYPE, CMP, begin) \
|
||||
do { \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \
|
||||
} while (0)
|
||||
|
||||
#define SORT_INNER(TYPE, CMP, begin, end, len) \
|
||||
switch (len) { \
|
||||
default: \
|
||||
assert(false); \
|
||||
__unreachable(); \
|
||||
case 0: \
|
||||
case 1: \
|
||||
break; \
|
||||
case 2: \
|
||||
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
|
||||
break; \
|
||||
case 3: \
|
||||
SORT_NETWORK_3(TYPE, CMP, begin); \
|
||||
break; \
|
||||
case 4: \
|
||||
SORT_NETWORK_4(TYPE, CMP, begin); \
|
||||
break; \
|
||||
case 5: \
|
||||
SORT_NETWORK_5(TYPE, CMP, begin); \
|
||||
break; \
|
||||
case 6: \
|
||||
SORT_NETWORK_6(TYPE, CMP, begin); \
|
||||
break; \
|
||||
case 7: \
|
||||
SORT_NETWORK_7(TYPE, CMP, begin); \
|
||||
break; \
|
||||
case 8: \
|
||||
SORT_NETWORK_8(TYPE, CMP, begin); \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define SORT_SWAP(TYPE, a, b) \
|
||||
do { \
|
||||
const TYPE swap_tmp = (a); \
|
||||
(a) = (b); \
|
||||
(b) = swap_tmp; \
|
||||
} while (0)
|
||||
|
||||
#define SORT_PUSH(low, high) \
|
||||
do { \
|
||||
top->lo = (low); \
|
||||
top->hi = (high); \
|
||||
++top; \
|
||||
} while (0)
|
||||
|
||||
#define SORT_POP(low, high) \
|
||||
do { \
|
||||
--top; \
|
||||
low = top->lo; \
|
||||
high = top->hi; \
|
||||
} while (0)
|
||||
|
||||
#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP) \
|
||||
\
|
||||
static inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \
|
||||
while (++first <= last) \
|
||||
if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \
|
||||
return false; \
|
||||
return true; \
|
||||
} \
|
||||
\
|
||||
typedef struct { \
|
||||
TYPE *lo, *hi; \
|
||||
} NAME##_stack; \
|
||||
\
|
||||
__hot static void NAME(TYPE *const __restrict begin, \
|
||||
TYPE *const __restrict end) { \
|
||||
NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack; \
|
||||
\
|
||||
TYPE *__restrict hi = end - 1; \
|
||||
TYPE *__restrict lo = begin; \
|
||||
while (true) { \
|
||||
const ptrdiff_t len = hi - lo; \
|
||||
if (len < 8) { \
|
||||
SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \
|
||||
if (unlikely(top == stack)) \
|
||||
break; \
|
||||
SORT_POP(lo, hi); \
|
||||
continue; \
|
||||
} \
|
||||
\
|
||||
TYPE *__restrict mid = lo + (len >> 1); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \
|
||||
SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \
|
||||
\
|
||||
TYPE *right = hi - 1; \
|
||||
TYPE *left = lo + 1; \
|
||||
while (1) { \
|
||||
while (expect_with_probability(CMP(*left, *mid), 0, .5)) \
|
||||
++left; \
|
||||
while (expect_with_probability(CMP(*mid, *right), 0, .5)) \
|
||||
--right; \
|
||||
if (unlikely(left > right)) { \
|
||||
if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \
|
||||
if (NAME##_is_sorted(lo, right)) \
|
||||
lo = right + 1; \
|
||||
if (NAME##_is_sorted(left, hi)) \
|
||||
hi = left; \
|
||||
} \
|
||||
break; \
|
||||
} \
|
||||
SORT_SWAP(TYPE, *left, *right); \
|
||||
mid = (mid == left) ? right : (mid == right) ? left : mid; \
|
||||
++left; \
|
||||
--right; \
|
||||
} \
|
||||
\
|
||||
if (right - lo > hi - left) { \
|
||||
SORT_PUSH(lo, right); \
|
||||
lo = left; \
|
||||
} else { \
|
||||
SORT_PUSH(left, hi); \
|
||||
hi = right; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (AUDIT_ENABLED()) { \
|
||||
for (TYPE *scan = begin + 1; scan < end; ++scan) \
|
||||
assert(CMP(scan[-1], scan[0])); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* LY: radix sort for large chunks */
|
||||
|
||||
#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \
|
||||
\
|
||||
__hot static bool NAME##_radixsort(TYPE *const begin, const size_t length) { \
|
||||
TYPE *tmp; \
|
||||
if (BUFFER_PREALLOCATED) { \
|
||||
tmp = begin + length + END_GAP; \
|
||||
/* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \
|
||||
} else { \
|
||||
tmp = osal_malloc(sizeof(TYPE) * length); \
|
||||
if (unlikely(!tmp)) \
|
||||
return false; \
|
||||
} \
|
||||
\
|
||||
size_t key_shift = 0, key_diff_mask; \
|
||||
do { \
|
||||
struct { \
|
||||
pgno_t a[256], b[256]; \
|
||||
} counters; \
|
||||
memset(&counters, 0, sizeof(counters)); \
|
||||
\
|
||||
key_diff_mask = 0; \
|
||||
size_t prev_key = EXTRACT_KEY(begin) >> key_shift; \
|
||||
TYPE *r = begin, *end = begin + length; \
|
||||
do { \
|
||||
const size_t key = EXTRACT_KEY(r) >> key_shift; \
|
||||
counters.a[key & 255]++; \
|
||||
counters.b[(key >> 8) & 255]++; \
|
||||
key_diff_mask |= prev_key ^ key; \
|
||||
prev_key = key; \
|
||||
} while (++r != end); \
|
||||
\
|
||||
pgno_t ta = 0, tb = 0; \
|
||||
for (size_t i = 0; i < 256; ++i) { \
|
||||
const pgno_t ia = counters.a[i]; \
|
||||
counters.a[i] = ta; \
|
||||
ta += ia; \
|
||||
const pgno_t ib = counters.b[i]; \
|
||||
counters.b[i] = tb; \
|
||||
tb += ib; \
|
||||
} \
|
||||
\
|
||||
r = begin; \
|
||||
do { \
|
||||
const size_t key = EXTRACT_KEY(r) >> key_shift; \
|
||||
tmp[counters.a[key & 255]++] = *r; \
|
||||
} while (++r != end); \
|
||||
\
|
||||
if (unlikely(key_diff_mask < 256)) { \
|
||||
memcpy(begin, tmp, ptr_dist(end, begin)); \
|
||||
break; \
|
||||
} \
|
||||
end = (r = tmp) + length; \
|
||||
do { \
|
||||
const size_t key = EXTRACT_KEY(r) >> key_shift; \
|
||||
begin[counters.b[(key >> 8) & 255]++] = *r; \
|
||||
} while (++r != end); \
|
||||
\
|
||||
key_shift += 16; \
|
||||
} while (key_diff_mask >> 16); \
|
||||
\
|
||||
if (!(BUFFER_PREALLOCATED)) \
|
||||
osal_free(tmp); \
|
||||
return true; \
|
||||
}
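
The RADIXSORT_IMPL macro above fills both byte-wide histograms in a single scan and accumulates the XOR of consecutive keys (`key_diff_mask`) so it can skip the second scatter and any further rounds once the remaining high bits are all equal. Below is a minimal standalone sketch of the same idea, assuming plain uint32_t keys and malloc/free instead of the preallocated buffer and the osal_* wrappers; all names are illustrative and not part of libmdbx.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* LSD radix sort, 16 bits per round: one scan fills two byte-wide
 * histograms plus a mask of key bits that actually differ. */
static bool radixsort_u32(uint32_t *const begin, const size_t length) {
  if (length < 2)
    return true;
  uint32_t *const tmp = malloc(sizeof(uint32_t) * length);
  if (!tmp)
    return false;

  size_t shift = 0, diff_mask;
  do {
    size_t lo[256] = {0}, hi[256] = {0};

    diff_mask = 0;
    size_t prev = begin[0] >> shift;
    for (size_t i = 0; i < length; ++i) {
      const size_t key = begin[i] >> shift;
      lo[key & 255]++;
      hi[(key >> 8) & 255]++;
      diff_mask |= prev ^ key;
      prev = key;
    }

    /* exclusive prefix sums: counters become output offsets */
    size_t ta = 0, tb = 0;
    for (size_t i = 0; i < 256; ++i) {
      const size_t a = lo[i], b = hi[i];
      lo[i] = ta;
      ta += a;
      hi[i] = tb;
      tb += b;
    }

    /* scatter by the low byte of this round into tmp */
    for (size_t i = 0; i < length; ++i)
      tmp[lo[(begin[i] >> shift) & 255]++] = begin[i];

    if (diff_mask < 256) {
      /* all remaining differences were within the low byte,
       * so tmp is already fully sorted: copy back and stop */
      memcpy(begin, tmp, sizeof(uint32_t) * length);
      break;
    }

    /* stable scatter by the next byte back into begin */
    for (size_t i = 0; i < length; ++i)
      begin[hi[(tmp[i] >> shift >> 8) & 255]++] = tmp[i];

    shift += 16;
  } while (diff_mask >> 16);

  free(tmp);
  return true;
}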
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
* LY: Binary search */
|
||||
|
||||
#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__)
|
||||
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \
|
||||
do \
|
||||
__asm __volatile("" \
|
||||
: "+r"(size) \
|
||||
: "r" /* the `b` constraint is more suitable here, but \
|
||||
cause CLANG to allocate and push/pop an one more \
|
||||
register, so using the `r` which avoids this. */ \
|
||||
(flag)); \
|
||||
while (0)
|
||||
#else
|
||||
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \
|
||||
do { \
|
||||
/* no-op for non-clang or non-x86 */; \
|
||||
} while (0)
|
||||
#endif /* Workaround for CLANG */
|
||||
|
||||
/* *INDENT-OFF* */
|
||||
/* clang-format off */
|
||||
#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \
|
||||
static __always_inline const TYPE_LIST *NAME( \
|
||||
const TYPE_LIST *it, size_t length, const TYPE_ARG item) { \
|
||||
const TYPE_LIST *const begin = it, *const end = begin + length; \
|
||||
\
|
||||
if (MDBX_HAVE_CMOV) \
|
||||
do { \
|
||||
/* Adaptive, simplified binary-search step: \
* - branchless when cmov (or an equivalent) is available; \
* - may perform some extra iterations; \
* - but only narrows the range while size > 2, so the search must be \
*   finished among the remaining 0-1-2 elements. */ \
|
||||
const TYPE_LIST *const middle = it + (length >> 1); \
|
||||
length = (length + 1) >> 1; \
|
||||
const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \
|
||||
WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \
|
||||
it = flag ? middle : it; \
|
||||
} while (length > 2); \
|
||||
else \
|
||||
while (length > 2) { \
|
||||
/* Variant using a conditional branch. The main difference is that \
* on "not equal" (true from the comparator) the step moves one element \
* closer to the end of the array. This is algorithmically correct and \
* converges slightly faster, but requires more computation when the \
* comparator returns true. Also IMPORTANT(!): speculative execution \
* must not be allowed when size == 0. */ \
|
||||
const TYPE_LIST *const middle = it + (length >> 1); \
|
||||
length = (length + 1) >> 1; \
|
||||
const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \
|
||||
if (flag) { \
|
||||
it = middle + 1; \
|
||||
length -= 1; \
|
||||
} \
|
||||
} \
|
||||
it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \
|
||||
it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \
|
||||
\
|
||||
if (AUDIT_ENABLED()) { \
|
||||
for (const TYPE_LIST *scan = begin; scan < it; ++scan) \
|
||||
assert(CMP(*scan, item)); \
|
||||
for (const TYPE_LIST *scan = it; scan < end; ++scan) \
|
||||
assert(!CMP(*scan, item)); \
|
||||
(void)begin, (void)end; \
|
||||
} \
|
||||
\
|
||||
return it; \
|
||||
}
|
||||
/* *INDENT-ON* */
|
||||
/* clang-format on */
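
The comments inside SEARCH_IMPL describe a lower-bound step that keeps halving while more than two candidates remain and then resolves the last 0-1-2 elements with two conditional increments. A minimal standalone sketch of the branchless variant, assuming a plain int array and a `<` comparison; the function name and types are illustrative only, not the library's API.

#include <stddef.h>

/* Branchless-style lower_bound: while more than two elements remain,
 * either keep the current position or jump to the middle; the compiler
 * can lower the ternary below to a cmov. The last candidates are then
 * resolved by two conditional increments. */
static const int *lower_bound_int(const int *it, size_t length, int item) {
  while (length > 2) {
    const int *const middle = it + (length >> 1);
    length = (length + 1) >> 1;          /* size of the remaining half */
    it = (*middle < item) ? middle : it; /* branch-free selection */
  }
  it += length > 1 && *it < item;
  it += length > 0 && *it < item;
  return it; /* first element that is not less than item */
}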
|
484
src/spill.c
Normal file
@ -0,0 +1,484 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) {
|
||||
tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) &&
|
||||
txn->tw.spilled.least_removed > 0);
|
||||
txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed)
|
||||
? idx
|
||||
: txn->tw.spilled.least_removed;
|
||||
txn->tw.spilled.list[idx] |= 1;
|
||||
MDBX_PNL_SETSIZE(txn->tw.spilled.list,
|
||||
MDBX_PNL_GETSIZE(txn->tw.spilled.list) -
|
||||
(idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
|
||||
|
||||
while (unlikely(npages > 1)) {
|
||||
const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1;
|
||||
if (MDBX_PNL_ASCENDING) {
|
||||
if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) ||
|
||||
(txn->tw.spilled.list[idx] >> 1) != pgno)
|
||||
return;
|
||||
} else {
|
||||
if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno)
|
||||
return;
|
||||
txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed)
|
||||
? idx
|
||||
: txn->tw.spilled.least_removed;
|
||||
}
|
||||
txn->tw.spilled.list[idx] |= 1;
|
||||
MDBX_PNL_SETSIZE(txn->tw.spilled.list,
|
||||
MDBX_PNL_GETSIZE(txn->tw.spilled.list) -
|
||||
(idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
|
||||
--npages;
|
||||
}
|
||||
}
|
||||
|
||||
pnl_t spill_purge(MDBX_txn *txn) {
|
||||
tASSERT(txn, txn->tw.spilled.least_removed > 0);
|
||||
const pnl_t sl = txn->tw.spilled.list;
|
||||
if (txn->tw.spilled.least_removed != INT_MAX) {
|
||||
size_t len = MDBX_PNL_GETSIZE(sl), r, w;
|
||||
for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) {
|
||||
sl[w] = sl[r];
|
||||
w += 1 - (sl[r] & 1);
|
||||
}
|
||||
for (size_t i = 1; i < w; ++i)
|
||||
tASSERT(txn, (sl[i] & 1) == 0);
|
||||
MDBX_PNL_SETSIZE(sl, w - 1);
|
||||
txn->tw.spilled.least_removed = INT_MAX;
|
||||
} else {
|
||||
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i)
|
||||
tASSERT(txn, (sl[i] & 1) == 0);
|
||||
}
|
||||
return sl;
|
||||
}
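
spill_remove() and spill_purge() rely on spilled page numbers being stored shifted left by one bit, so the least-significant bit can act as a tombstone: spill_remove() sets the bit, spill_purge() compacts the list by skipping odd entries. A hedged sketch of this encoding and the compaction loop on a plain zero-based array; the library's pnl_t appears to be 1-based with the length kept separately, and the helpers below are illustrative, not its API.

#include <stddef.h>
#include <stdint.h>

typedef uint32_t pgno_example_t;

/* Encode: the page number is kept shifted left by one bit. */
static inline pgno_example_t spill_encode(pgno_example_t pgno) {
  return pgno << 1;
}

/* Mark an entry as removed by setting the low bit (a tombstone). */
static inline void spill_mark_removed(pgno_example_t *list, size_t idx) {
  list[idx] |= 1;
}

/* Compact in place: keep even (live) entries, drop odd (removed) ones.
 * Returns the new length. */
static size_t spill_compact(pgno_example_t *list, size_t len) {
  size_t w = 0;
  for (size_t r = 0; r < len; ++r) {
    list[w] = list[r];
    w += 1 - (list[r] & 1); /* advance the write index only for live entries */
  }
  return w;
}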
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp,
|
||||
const size_t npages) {
|
||||
tASSERT(txn, !(txn->flags & MDBX_WRITEMAP));
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
txn->env->lck->pgops.spill.weak += npages;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
const pgno_t pgno = dp->pgno;
|
||||
int err = iov_page(txn, ctx, dp, npages);
|
||||
if (likely(err == MDBX_SUCCESS))
|
||||
err = spill_append_span(&txn->tw.spilled.list, pgno, npages);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Set unspillable LRU-label for dirty pages watched by txn.
|
||||
* Returns the number of pages marked as unspillable. */
|
||||
static size_t spill_cursor_keep(const MDBX_txn *const txn,
|
||||
const MDBX_cursor *mc) {
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
size_t keep = 0;
|
||||
while (!is_poor(mc)) {
|
||||
tASSERT(txn, mc->top >= 0);
|
||||
const page_t *mp;
|
||||
intptr_t i = 0;
|
||||
do {
|
||||
mp = mc->pg[i];
|
||||
tASSERT(txn, !is_subpage(mp));
|
||||
if (is_modifable(txn, mp)) {
|
||||
size_t const n = dpl_search(txn, mp->pgno);
|
||||
if (txn->tw.dirtylist->items[n].pgno == mp->pgno &&
|
||||
/* do not count twice */ dpl_age(txn, n)) {
|
||||
size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr,
|
||||
-(ptrdiff_t)sizeof(size_t));
|
||||
*ptr = txn->tw.dirtylru;
|
||||
tASSERT(txn, dpl_age(txn, n) == 0);
|
||||
++keep;
|
||||
}
|
||||
}
|
||||
} while (++i <= mc->top);
|
||||
|
||||
tASSERT(txn, is_leaf(mp));
|
||||
if (!mc->subcur || mc->ki[mc->top] >= page_numkeys(mp))
|
||||
break;
|
||||
if (!(node_flags(page_node(mp, mc->ki[mc->top])) & N_SUBDATA))
|
||||
break;
|
||||
mc = &mc->subcur->cursor;
|
||||
}
|
||||
return keep;
|
||||
}
|
||||
|
||||
static size_t spill_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
|
||||
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
|
||||
dpl_lru_turn(txn);
|
||||
size_t keep = m0 ? spill_cursor_keep(txn, m0) : 0;
|
||||
|
||||
TXN_FOREACH_DBI_ALL(txn, dbi) {
|
||||
if (F_ISSET(txn->dbi_state[dbi], DBI_DIRTY | DBI_VALID) &&
|
||||
txn->dbs[dbi].root != P_INVALID)
|
||||
for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next)
|
||||
if (mc != m0)
|
||||
keep += spill_cursor_keep(txn, mc);
|
||||
}
|
||||
|
||||
return keep;
|
||||
}
|
||||
|
||||
/* Returns the spilling priority for a dirty page:
*     0 = should be spilled first;
*   ...
*   255 = lowest spillable priority;
*   256 = must not be spilled. */
|
||||
MDBX_NOTHROW_PURE_FUNCTION static unsigned
|
||||
spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) {
|
||||
dpl_t *const dl = txn->tw.dirtylist;
|
||||
const uint32_t age = dpl_age(txn, i);
|
||||
const size_t npages = dpl_npages(dl, i);
|
||||
const pgno_t pgno = dl->items[i].pgno;
|
||||
if (age == 0) {
|
||||
DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno);
|
||||
return 256;
|
||||
}
|
||||
|
||||
page_t *const dp = dl->items[i].ptr;
|
||||
if (dp->flags & (P_LOOSE | P_SPILLED)) {
|
||||
DEBUG("skip %s %zu page %" PRIaPGNO,
|
||||
(dp->flags & P_LOOSE) ? "loose" : "parent-spilled", npages, pgno);
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Can't spill twice,
|
||||
* make sure it's not already in a parent's spill list(s). */
|
||||
MDBX_txn *parent = txn->parent;
|
||||
if (parent && (parent->flags & MDBX_TXN_SPILLS)) {
|
||||
do
|
||||
if (spill_intersect(parent, pgno, npages)) {
|
||||
DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno);
|
||||
dp->flags |= P_SPILLED;
|
||||
return 256;
|
||||
}
|
||||
while ((parent = parent->parent) != nullptr);
|
||||
}
|
||||
|
||||
tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX);
|
||||
unsigned prio = age * reciprocal >> 24;
|
||||
tASSERT(txn, prio < 256);
|
||||
if (likely(npages == 1))
|
||||
return prio = 256 - prio;
|
||||
|
||||
/* make large/overflow pages more likely to be spilled */
|
||||
size_t factor = npages | npages >> 1;
|
||||
factor |= factor >> 2;
|
||||
factor |= factor >> 4;
|
||||
factor |= factor >> 8;
|
||||
factor |= factor >> 16;
|
||||
factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
|
||||
factor = (factor < 256) ? 255 - factor : 0;
|
||||
tASSERT(txn, factor < 256 && factor < (256 - prio));
|
||||
return prio = (unsigned)factor;
|
||||
}
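
spill_prio() avoids a per-page division by precomputing a reciprocal of (age_max + 1) scaled by 2^24, then quantizing each page's age into 0..255 with a multiply and a shift; larger multi-page chunks are afterwards biased toward spilling. A small sketch of just the quantization step, with standalone illustrative names rather than the library's API, and assuming age_max + 1 does not overflow.

#include <assert.h>
#include <stdint.h>

/* Precompute once per pass: a reciprocal scaled by 2^24 so that
 * (age * reciprocal) >> 24 lands in 0..255 for any age in 0..age_max. */
static inline uint32_t make_reciprocal(uint32_t age_max) {
  return (UINT32_C(255) << 24) / (age_max + 1);
}

/* Quantize an age to 0..255 without a per-page division:
 * small result = fresh page, large result = old page. */
static inline unsigned quantize_age(uint32_t age, uint32_t reciprocal) {
  const unsigned prio = (unsigned)(((uint64_t)age * reciprocal) >> 24);
  assert(prio < 256);
  return prio;
}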
|
||||
|
||||
static size_t spill_gate(const MDBX_env *env, intptr_t part,
|
||||
const size_t total) {
|
||||
const intptr_t spill_min =
|
||||
env->options.spill_min_denominator
|
||||
? (total + env->options.spill_min_denominator - 1) /
|
||||
env->options.spill_min_denominator
|
||||
: 1;
|
||||
const intptr_t spill_max =
|
||||
total - (env->options.spill_max_denominator
|
||||
? total / env->options.spill_max_denominator
|
||||
: 0);
|
||||
part = (part < spill_max) ? part : spill_max;
|
||||
part = (part > spill_min) ? part : spill_min;
|
||||
eASSERT(env, part >= 0 && (size_t)part <= total);
|
||||
return (size_t)part;
|
||||
}
|
||||
|
||||
__cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const intptr_t wanna_spill_entries,
|
||||
const intptr_t wanna_spill_npages,
|
||||
const size_t need) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (unlikely(txn->tw.loose_count >=
|
||||
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
|
||||
: txn->tw.writemap_dirty_npages)))
|
||||
goto done;
|
||||
|
||||
const size_t dirty_entries =
|
||||
txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1;
|
||||
const size_t dirty_npages =
|
||||
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
|
||||
: txn->tw.writemap_dirty_npages) -
|
||||
txn->tw.loose_count;
|
||||
const size_t need_spill_entries =
|
||||
spill_gate(txn->env, wanna_spill_entries, dirty_entries);
|
||||
const size_t need_spill_npages =
|
||||
spill_gate(txn->env, wanna_spill_npages, dirty_npages);
|
||||
|
||||
const size_t need_spill = (need_spill_entries > need_spill_npages)
|
||||
? need_spill_entries
|
||||
: need_spill_npages;
|
||||
if (!need_spill)
|
||||
goto done;
|
||||
|
||||
if (txn->flags & MDBX_WRITEMAP) {
|
||||
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync",
|
||||
dirty_entries, dirty_npages);
|
||||
const MDBX_env *env = txn->env;
|
||||
tASSERT(txn, txn->tw.spilled.list == nullptr);
|
||||
rc = osal_msync(&txn->env->dxb_mmap, 0,
|
||||
pgno_align2os_bytes(env, txn->geo.first_unallocated),
|
||||
MDBX_SYNC_KICK);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
#if MDBX_AVOID_MSYNC
|
||||
MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
env->lck->unsynced_pages.weak +=
|
||||
txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
|
||||
dpl_clear(txn->tw.dirtylist);
|
||||
txn->tw.dirtyroom = env->options.dp_limit - txn->tw.loose_count;
|
||||
for (page_t *lp = txn->tw.loose_pages; lp != nullptr; lp = page_next(lp)) {
|
||||
tASSERT(txn, lp->flags == P_LOOSE);
|
||||
rc = dpl_append(txn, lp->pgno, lp, 1);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||||
}
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
#else
|
||||
tASSERT(txn, txn->tw.dirtylist == nullptr);
|
||||
env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
|
||||
txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages;
|
||||
txn->tw.writemap_dirty_npages = 0;
|
||||
#endif /* MDBX_AVOID_MSYNC */
|
||||
goto done;
|
||||
}
|
||||
|
||||
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write",
|
||||
need_spill_entries, need_spill_npages);
|
||||
MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
|
||||
tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1);
|
||||
tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >=
|
||||
need_spill_npages);
|
||||
if (!txn->tw.spilled.list) {
|
||||
txn->tw.spilled.least_removed = INT_MAX;
|
||||
txn->tw.spilled.list = pnl_alloc(need_spill);
|
||||
if (unlikely(!txn->tw.spilled.list)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
bailout:
|
||||
txn->flags |= MDBX_TXN_ERROR;
|
||||
return rc;
|
||||
}
|
||||
} else {
|
||||
/* purge deleted slots */
|
||||
spill_purge(txn);
|
||||
rc = pnl_reserve(&txn->tw.spilled.list, need_spill);
|
||||
(void)rc /* ignore since the resulting list may be shorter
|
||||
and pnl_append() will increase pnl on demand */
|
||||
;
|
||||
}
|
||||
|
||||
/* Sort so that writing to disk is more sequential */
|
||||
dpl_t *const dl = dpl_sort(txn);
|
||||
|
||||
/* Preserve pages which may soon be dirtied again */
|
||||
const size_t unspillable = spill_txn_keep(txn, m0);
|
||||
if (unspillable + txn->tw.loose_count >= dl->length) {
|
||||
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
|
||||
if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
|
||||
return MDBX_SUCCESS;
|
||||
#endif /* xMDBX_DEBUG_SPILLING */
|
||||
ERROR("all %zu dirty pages are unspillable since referenced "
|
||||
"by a cursor(s), use fewer cursors or increase "
|
||||
"MDBX_opt_txn_dp_limit",
|
||||
unspillable);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Subtask: spill some pages to disk in accordance with LRU,
* while taking several important corrections into account:
*  - it is better to spill old large/overflow pages, since this frees
*    more memory, and also because (as currently understood) they are
*    modified again far less often;
*  - all else being equal, it is better to spill adjacent pages, since
*    this results in fewer I/O operations;
*  - preferably this should take less time than std::partial_sort_copy;
*
* Solution:
*  - Quantize the whole range of LRU labels down to 256 values and use a
*    single pass of an 8-bit radix sort. The result is 256 levels of
*    "freshness", including the LRU-label value older than which pages
*    must be spilled;
*  - Walk sequentially towards increasing page numbers and spill the
*    pages whose LRU label is older than the cut-off value, until
*    enough has been spilled;
*  - To reduce the number of I/O operations, pages adjacent to the ones
*    being spilled are spilled as well, provided they fall into the
*    first half between the spilled and the freshest LRU labels;
*  - additionally, large/overflow pages are deliberately aged during
*    sorting, thereby increasing their chances of being spilled. */
|
||||
|
||||
/* get min/max of LRU-labels */
|
||||
uint32_t age_max = 0;
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
const uint32_t age = dpl_age(txn, i);
|
||||
age_max = (age_max >= age) ? age_max : age;
|
||||
}
|
||||
|
||||
VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
|
||||
|
||||
/* the counting half of an 8-bit radix-sort */
|
||||
pgno_t radix_entries[256], radix_npages[256];
|
||||
memset(&radix_entries, 0, sizeof(radix_entries));
|
||||
memset(&radix_npages, 0, sizeof(radix_npages));
|
||||
size_t spillable_entries = 0, spillable_npages = 0;
|
||||
const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
const unsigned prio = spill_prio(txn, i, reciprocal);
|
||||
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
|
||||
TRACE("page %" PRIaPGNO
|
||||
", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u",
|
||||
dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 'Y' : 'N',
|
||||
dpl_npages(dl, i), dpl_age(txn, i), age_max, prio);
|
||||
if (prio < 256) {
|
||||
radix_entries[prio] += 1;
|
||||
spillable_entries += 1;
|
||||
const pgno_t npages = dpl_npages(dl, i);
|
||||
radix_npages[prio] += npages;
|
||||
spillable_npages += npages;
|
||||
}
|
||||
}
|
||||
|
||||
tASSERT(txn, spillable_npages >= spillable_entries);
|
||||
pgno_t spilled_entries = 0, spilled_npages = 0;
|
||||
if (likely(spillable_entries > 0)) {
|
||||
size_t prio2spill = 0, prio2adjacent = 128,
|
||||
amount_entries = radix_entries[0], amount_npages = radix_npages[0];
|
||||
for (size_t i = 1; i < 256; i++) {
|
||||
if (amount_entries < need_spill_entries ||
|
||||
amount_npages < need_spill_npages) {
|
||||
prio2spill = i;
|
||||
prio2adjacent = i + (257 - i) / 2;
|
||||
amount_entries += radix_entries[i];
|
||||
amount_npages += radix_npages[i];
|
||||
} else if (amount_entries + amount_entries <
|
||||
spillable_entries + need_spill_entries
|
||||
/* EQUIVALENT: amount - need_spill < spillable - amount */
|
||||
|| amount_npages + amount_npages <
|
||||
spillable_npages + need_spill_npages) {
|
||||
prio2adjacent = i;
|
||||
amount_entries += radix_entries[i];
|
||||
amount_npages += radix_npages[i];
|
||||
} else
|
||||
break;
|
||||
}
|
||||
|
||||
VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu,"
|
||||
" wanna-spill %zu/%zu, amount %zu/%zu",
|
||||
prio2spill, prio2adjacent, spillable_entries, spillable_npages,
|
||||
need_spill_entries, need_spill_npages, amount_entries,
|
||||
amount_npages);
|
||||
tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);
|
||||
|
||||
iov_ctx_t ctx;
|
||||
rc = iov_init(
|
||||
txn, &ctx, amount_entries, amount_npages,
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
txn->env->ioring.overlapped_fd ? txn->env->ioring.overlapped_fd :
|
||||
#endif
|
||||
txn->env->lazy_fd,
|
||||
true);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
size_t r = 0, w = 0;
|
||||
pgno_t last = 0;
|
||||
while (r < dl->length && (spilled_entries < need_spill_entries ||
|
||||
spilled_npages < need_spill_npages)) {
|
||||
dl->items[++w] = dl->items[++r];
|
||||
unsigned prio = spill_prio(txn, w, reciprocal);
|
||||
if (prio > prio2spill &&
|
||||
(prio >= prio2adjacent || last != dl->items[w].pgno))
|
||||
continue;
|
||||
|
||||
const size_t e = w;
|
||||
last = dpl_endpgno(dl, w);
|
||||
while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno &&
|
||||
spill_prio(txn, w, reciprocal) < prio2adjacent)
|
||||
;
|
||||
|
||||
for (size_t i = w; ++i <= e;) {
|
||||
const unsigned npages = dpl_npages(dl, i);
|
||||
prio = spill_prio(txn, i, reciprocal);
|
||||
DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)",
|
||||
(prio > prio2spill) ? "co-" : "", i, npages, dl->items[i].pgno,
|
||||
dpl_age(txn, i), prio);
|
||||
tASSERT(txn, prio < 256);
|
||||
++spilled_entries;
|
||||
spilled_npages += npages;
|
||||
rc = spill_page(txn, &ctx, dl->items[i].ptr, npages);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto failed;
|
||||
}
|
||||
}
|
||||
|
||||
VERBOSE("spilled entries %u, spilled npages %u", spilled_entries,
|
||||
spilled_npages);
|
||||
tASSERT(txn, spillable_entries == 0 || spilled_entries > 0);
|
||||
tASSERT(txn, spilled_npages >= spilled_entries);
|
||||
|
||||
failed:
|
||||
while (r < dl->length)
|
||||
dl->items[++w] = dl->items[++r];
|
||||
tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS);
|
||||
|
||||
dl->sorted = dpl_setlen(dl, w);
|
||||
txn->tw.dirtyroom += spilled_entries;
|
||||
txn->tw.dirtylist->pages_including_loose -= spilled_npages;
|
||||
tASSERT(txn, dpl_check(txn));
|
||||
|
||||
if (!iov_empty(&ctx)) {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
rc = iov_write(&ctx);
|
||||
}
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
txn->env->lck->unsynced_pages.weak += spilled_npages;
|
||||
pnl_sort(txn->tw.spilled.list, (size_t)txn->geo.first_unallocated << 1);
|
||||
txn->flags |= MDBX_TXN_SPILLS;
|
||||
NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room",
|
||||
spilled_entries, spilled_npages, txn->tw.dirtyroom);
|
||||
} else {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
page_t *dp = dl->items[i].ptr;
|
||||
VERBOSE(
|
||||
"unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
|
||||
i, dp->pgno, dpl_npages(dl, i), dp->flags, dpl_age(txn, i),
|
||||
spill_prio(txn, i, reciprocal));
|
||||
}
|
||||
}
|
||||
|
||||
#if xMDBX_DEBUG_SPILLING == 2
|
||||
if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
|
||||
ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; "
|
||||
"needed %zu, spillable %zu; "
|
||||
"spilled %u dirty-entries, now have %zu dirty-room",
|
||||
dl->length + spilled_entries, dl->length,
|
||||
(txn->parent && txn->parent->tw.dirtylist)
|
||||
? (intptr_t)txn->parent->tw.dirtylist->length
|
||||
: -1,
|
||||
txn->tw.loose_count, need, spillable_entries, spilled_entries,
|
||||
txn->tw.dirtyroom);
|
||||
ENSURE(txn->env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
|
||||
#endif /* xMDBX_DEBUG_SPILLING */
|
||||
|
||||
done:
|
||||
return likely(txn->tw.dirtyroom + txn->tw.loose_count >
|
||||
((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
|
||||
? MDBX_SUCCESS
|
||||
: MDBX_TXN_FULL;
|
||||
}
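
Per the strategy comment inside spill_slowpath() above, the quantized ages are tallied into 256-entry histograms, from which two cut-off levels are chosen: prio2spill (everything at or below it is spilled) and prio2adjacent (pages merely adjacent to spilled ones are co-spilled if they are no fresher than this level). A simplified, hedged sketch of picking a single cut-off from one histogram; the real code tracks entry and page counts simultaneously, and the names below are illustrative.

#include <stddef.h>
#include <stdint.h>

/* Given a histogram of how many dirty pages fall into each of the 256
 * quantized priority levels (0 = highest spill priority), return the
 * smallest level whose cumulative count covers the amount we want to
 * spill. Pages at or below the returned level get spilled. */
static unsigned pick_spill_threshold(const uint32_t histogram[256],
                                     size_t need_to_spill) {
  size_t accumulated = histogram[0];
  unsigned level = 0;
  while (level < 255 && accumulated < need_to_spill) {
    ++level;
    accumulated += histogram[level];
  }
  return level;
}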
|
86
src/spill.h
Normal file
@ -0,0 +1,86 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "essentials.h"
|
||||
|
||||
MDBX_INTERNAL void spill_remove(MDBX_txn *txn, size_t idx, size_t npages);
|
||||
MDBX_INTERNAL pnl_t spill_purge(MDBX_txn *txn);
|
||||
MDBX_INTERNAL int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const intptr_t wanna_spill_entries,
|
||||
const intptr_t wanna_spill_npages,
|
||||
const size_t need);
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) {
|
||||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||||
const pnl_t pnl = txn->tw.spilled.list;
|
||||
if (likely(!pnl))
|
||||
return 0;
|
||||
pgno <<= 1;
|
||||
size_t n = pnl_search(pnl, pgno, (size_t)MAX_PAGENO + MAX_PAGENO + 1);
|
||||
return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? n : 0;
|
||||
}
|
||||
|
||||
static inline bool spill_intersect(const MDBX_txn *txn, pgno_t pgno,
|
||||
size_t npages) {
|
||||
const pnl_t pnl = txn->tw.spilled.list;
|
||||
if (likely(!pnl))
|
||||
return false;
|
||||
const size_t len = MDBX_PNL_GETSIZE(pnl);
|
||||
if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
|
||||
DEBUG_EXTRA("PNL len %zu [", len);
|
||||
for (size_t i = 1; i <= len; ++i)
|
||||
DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1)
|
||||
: (long)(pnl[i] >> 1));
|
||||
DEBUG_EXTRA_PRINT("%s\n", "]");
|
||||
}
|
||||
const pgno_t spilled_range_begin = pgno << 1;
|
||||
const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1;
|
||||
#if MDBX_PNL_ASCENDING
|
||||
const size_t n =
|
||||
pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1);
|
||||
tASSERT(txn, n && (n == MDBX_PNL_GETSIZE(pnl) + 1 ||
|
||||
spilled_range_begin <= pnl[n]));
|
||||
const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last;
|
||||
#else
|
||||
const size_t n =
|
||||
pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1);
|
||||
tASSERT(txn, n && (n == MDBX_PNL_GETSIZE(pnl) + 1 ||
|
||||
spilled_range_last >= pnl[n]));
|
||||
const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin;
|
||||
#endif
|
||||
if (ASSERT_ENABLED()) {
|
||||
bool check = false;
|
||||
for (size_t i = 0; i < npages; ++i)
|
||||
check |= spill_search(txn, (pgno_t)(pgno + i)) != 0;
|
||||
tASSERT(txn, check == rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const size_t need) {
|
||||
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
|
||||
tASSERT(txn, !m0 || cursor_is_tracked(m0));
|
||||
|
||||
const intptr_t wanna_spill_entries =
|
||||
txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0;
|
||||
const intptr_t wanna_spill_npages =
|
||||
need +
|
||||
(txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
|
||||
: txn->tw.writemap_dirty_npages) -
|
||||
txn->tw.loose_count - txn->env->options.dp_limit;
|
||||
|
||||
/* production mode */
|
||||
if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1)
|
||||
#if xMDBX_DEBUG_SPILLING == 1
|
||||
/* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */
|
||||
&& txn->txnid % 23 > 11
|
||||
#endif
|
||||
)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
return spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages, need);
|
||||
}
|
104
src/subdb.c
Normal file
@ -0,0 +1,104 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
int sdb_setup(const MDBX_env *env, kvx_t *const kvx, const tree_t *const db) {
|
||||
if (unlikely(!check_sdb_flags(db->flags))) {
|
||||
ERROR("incompatible or invalid db.flags (0x%x) ", db->flags);
|
||||
return MDBX_INCOMPATIBLE;
|
||||
}
|
||||
if (unlikely(!kvx->clc.k.cmp)) {
|
||||
kvx->clc.k.cmp = builtin_keycmp(db->flags);
|
||||
kvx->clc.v.cmp = builtin_datacmp(db->flags);
|
||||
}
|
||||
|
||||
kvx->clc.k.lmin = keysize_min(db->flags);
|
||||
kvx->clc.k.lmax = env_keysize_max(env, db->flags);
|
||||
kvx->clc.v.lmin = valsize_min(db->flags);
|
||||
kvx->clc.v.lmax = env_valsize_max(env, db->flags);
|
||||
|
||||
if ((db->flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->dupfix_size) {
|
||||
if (!MDBX_DISABLE_VALIDATION &&
|
||||
unlikely(db->dupfix_size < kvx->clc.v.lmin ||
|
||||
db->dupfix_size > kvx->clc.v.lmax)) {
|
||||
ERROR("db.dupfix_size (%u) <> min/max value-length (%zu/%zu)",
|
||||
db->dupfix_size, kvx->clc.v.lmin, kvx->clc.v.lmax);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
kvx->clc.v.lmin = kvx->clc.v.lmax = db->dupfix_size;
|
||||
}
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
int sdb_fetch(MDBX_txn *txn, size_t dbi) {
|
||||
cursor_couple_t couple;
|
||||
int rc = cursor_init(&couple.outer, txn, MAIN_DBI);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
kvx_t *const kvx = &txn->env->kvs[dbi];
|
||||
rc = tree_search(&couple.outer, &kvx->name, 0);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
bailout:
|
||||
NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN
|
||||
" (err %d)",
|
||||
dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
|
||||
txn->txnid, rc);
|
||||
return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc;
|
||||
}
|
||||
|
||||
MDBX_val data;
|
||||
struct node_search_result nsr = node_search(&couple.outer, &kvx->name);
|
||||
if (unlikely(!nsr.exact)) {
|
||||
rc = MDBX_NOTFOUND;
|
||||
goto bailout;
|
||||
}
|
||||
if (unlikely((node_flags(nsr.node) & (N_DUPDATA | N_SUBDATA)) != N_SUBDATA)) {
|
||||
NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)",
|
||||
dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
|
||||
txn->txnid, "wrong flags");
|
||||
return MDBX_INCOMPATIBLE; /* not a named DB */
|
||||
}
|
||||
|
||||
rc = node_read(&couple.outer, nsr.node, &data,
|
||||
couple.outer.pg[couple.outer.top]);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
if (unlikely(data.iov_len != sizeof(tree_t))) {
|
||||
NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)",
|
||||
dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
|
||||
txn->txnid, "wrong rec-size");
|
||||
return MDBX_INCOMPATIBLE; /* not a named DB */
|
||||
}
|
||||
|
||||
uint16_t flags = UNALIGNED_PEEK_16(data.iov_base, tree_t, flags);
|
||||
/* The txn may not know this DBI, or another process may
|
||||
* have dropped and recreated the DB with other flags. */
|
||||
tree_t *const db = &txn->dbs[dbi];
|
||||
if (unlikely((db->flags & DB_PERSISTENT_FLAGS) != flags)) {
|
||||
NOTICE("dbi %zu refs to the re-created subDB `%*s` for txn %" PRIaTXN
|
||||
" with different flags (present 0x%X != wanna 0x%X)",
|
||||
dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
|
||||
txn->txnid, db->flags & DB_PERSISTENT_FLAGS, flags);
|
||||
return MDBX_INCOMPATIBLE;
|
||||
}
|
||||
|
||||
memcpy(db, data.iov_base, sizeof(tree_t));
|
||||
#if !MDBX_DISABLE_VALIDATION
|
||||
const txnid_t pp_txnid = couple.outer.pg[couple.outer.top]->txnid;
|
||||
tASSERT(txn, txn->front_txnid >= pp_txnid);
|
||||
if (unlikely(db->mod_txnid > pp_txnid)) {
|
||||
ERROR("db.mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")",
|
||||
db->mod_txnid, pp_txnid);
|
||||
return MDBX_CORRUPTED;
|
||||
}
|
||||
#endif /* !MDBX_DISABLE_VALIDATION */
|
||||
rc = sdb_setup(txn->env, kvx, db);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
txn->dbi_state[dbi] &= ~DBI_STALE;
|
||||
return MDBX_SUCCESS;
|
||||
}
|
610
src/tls.c
Normal file
@ -0,0 +1,610 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
typedef struct rthc_entry {
|
||||
MDBX_env *env;
|
||||
} rthc_entry_t;
|
||||
|
||||
#if MDBX_DEBUG
|
||||
#define RTHC_INITIAL_LIMIT 1
|
||||
#else
|
||||
#define RTHC_INITIAL_LIMIT 16
|
||||
#endif
|
||||
|
||||
static unsigned rthc_count, rthc_limit = RTHC_INITIAL_LIMIT;
|
||||
static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT];
|
||||
static rthc_entry_t *rthc_table = rthc_table_static;
|
||||
|
||||
static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) {
|
||||
int rc;
|
||||
uint64_t bait;
|
||||
lck_t *const pending_lck = pending->lck;
|
||||
lck_t *const scan_lck = scan->lck;
|
||||
if (pending_lck) {
|
||||
bait = atomic_load64(&pending_lck->bait_uniqueness, mo_AcquireRelease);
|
||||
rc = MDBX_SUCCESS;
|
||||
} else {
|
||||
bait = 0 /* hush MSVC warning */;
|
||||
rc = osal_msync(scan, 0, sizeof(lck_t), MDBX_SYNC_DATA);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->bait_uniqueness),
|
||||
offsetof(lck_t, bait_uniqueness));
|
||||
}
|
||||
if (likely(rc == MDBX_SUCCESS) &&
|
||||
bait == atomic_load64(&scan_lck->bait_uniqueness, mo_AcquireRelease))
|
||||
rc = MDBX_RESULT_TRUE;
|
||||
|
||||
TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d",
|
||||
pending_lck ? "mem" : "file", bait,
|
||||
(rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan,
|
||||
uint64_t *abra) {
|
||||
if (*abra == 0) {
|
||||
const uintptr_t tid = osal_thread_self();
|
||||
uintptr_t uit = 0;
|
||||
memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit));
|
||||
*abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit);
|
||||
}
|
||||
const uint64_t cadabra =
|
||||
rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid())
|
||||
<< 24 |
|
||||
*abra >> 40;
|
||||
lck_t *const scan_lck = scan->lck;
|
||||
atomic_store64(&scan_lck->bait_uniqueness, cadabra, mo_AcquireRelease);
|
||||
*abra = *abra * UINT64_C(6364136223846793005) + 1;
|
||||
return uniq_peek(pending, scan);
|
||||
}
|
||||
|
||||
__cold int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found) {
|
||||
*found = nullptr;
|
||||
uint64_t salt = 0;
|
||||
for (size_t i = 0; i < rthc_count; ++i) {
|
||||
MDBX_env *const scan = rthc_table[i].env;
|
||||
if (!scan->lck_mmap.lck || &scan->lck_mmap == pending)
|
||||
continue;
|
||||
int err =
|
||||
atomic_load64(&scan->lck_mmap.lck->bait_uniqueness, mo_AcquireRelease)
|
||||
? uniq_peek(pending, &scan->lck_mmap)
|
||||
: uniq_poke(pending, &scan->lck_mmap, &salt);
|
||||
if (err == MDBX_ENODATA) {
|
||||
uint64_t length = 0;
|
||||
if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS &&
|
||||
length == 0)) {
|
||||
/* LY: skip checking since LCK-file is empty, i.e. just created. */
|
||||
DEBUG("%s", "unique (new/empty lck)");
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
}
|
||||
if (err == MDBX_RESULT_TRUE)
|
||||
err = uniq_poke(pending, &scan->lck_mmap, &salt);
|
||||
if (err == MDBX_RESULT_TRUE) {
|
||||
(void)osal_msync(&scan->lck_mmap, 0, sizeof(lck_t), MDBX_SYNC_KICK);
|
||||
err = uniq_poke(pending, &scan->lck_mmap, &salt);
|
||||
}
|
||||
if (err == MDBX_RESULT_TRUE) {
|
||||
err = uniq_poke(pending, &scan->lck_mmap, &salt);
|
||||
*found = scan;
|
||||
DEBUG("found %p", __Wpedantic_format_voidptr(*found));
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
DEBUG("failed rc %d", err);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG("%s", "unique");
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
static CRITICAL_SECTION rthc_critical_section;
|
||||
#else
|
||||
|
||||
static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER;
|
||||
static osal_thread_key_t rthc_key;
|
||||
static mdbx_atomic_uint32_t rthc_pending;
|
||||
|
||||
static inline uint64_t rthc_signature(const void *addr, uint8_t kind) {
|
||||
uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^
|
||||
UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr);
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
return salt << 8 | kind;
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
return (uint64_t)kind << 56 | salt >> 8;
|
||||
#else
|
||||
#error "FIXME: Unsupported byte order"
|
||||
#endif /* __BYTE_ORDER__ */
|
||||
}
|
||||
|
||||
#define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D)
|
||||
#define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0)
|
||||
static __thread uint64_t rthc_thread_state
|
||||
#if __has_attribute(tls_model) && \
|
||||
(defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY)
|
||||
__attribute__((tls_model("local-dynamic")))
|
||||
#endif
|
||||
;
|
||||
|
||||
#if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \
|
||||
!defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS)
|
||||
/* Avoid an ASAN trap when the target TLS variable is freed by Darwin's tlv_free() */
|
||||
#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS \
|
||||
__attribute__((__no_sanitize_address__, __noinline__))
|
||||
#else
|
||||
#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS inline
|
||||
#endif
|
||||
|
||||
MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t rthc_read(const void *rthc) {
|
||||
return *(volatile uint64_t *)rthc;
|
||||
}
|
||||
|
||||
MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t
|
||||
rthc_compare_and_clean(const void *rthc, const uint64_t signature) {
|
||||
#if MDBX_64BIT_CAS
|
||||
return atomic_cas64((mdbx_atomic_uint64_t *)rthc, signature, 0);
|
||||
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
return atomic_cas32((mdbx_atomic_uint32_t *)rthc, (uint32_t)signature, 0);
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
return atomic_cas32((mdbx_atomic_uint32_t *)rthc, (uint32_t)(signature >> 32),
|
||||
0);
|
||||
#else
|
||||
#error "FIXME: Unsupported byte order"
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int rthc_atexit(void (*dtor)(void *), void *obj,
|
||||
void *dso_symbol) {
|
||||
#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL
|
||||
#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \
|
||||
defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \
|
||||
defined(BIONIC)
|
||||
#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1
|
||||
#else
|
||||
#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0
|
||||
#endif
|
||||
#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */
|
||||
|
||||
#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT
|
||||
#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) || \
|
||||
defined(HAVE___CXA_THREAD_ATEXIT)
|
||||
#define MDBX_HAVE_CXA_THREAD_ATEXIT 1
|
||||
#elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && \
|
||||
(defined(__linux__) || defined(__gnu_linux__))
|
||||
#define MDBX_HAVE_CXA_THREAD_ATEXIT 1
|
||||
#else
|
||||
#define MDBX_HAVE_CXA_THREAD_ATEXIT 0
|
||||
#endif
|
||||
#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */
|
||||
|
||||
int rc = MDBX_ENOSYS;
|
||||
#if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && !MDBX_HAVE_CXA_THREAD_ATEXIT
|
||||
#define __cxa_thread_atexit __cxa_thread_atexit_impl
|
||||
#endif
|
||||
#if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit)
|
||||
extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj,
|
||||
void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE;
|
||||
if (&__cxa_thread_atexit)
|
||||
rc = __cxa_thread_atexit(dtor, obj, dso_symbol);
|
||||
#elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE)
|
||||
extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr)
|
||||
MDBX_WEAK_IMPORT_ATTRIBUTE;
|
||||
if (&_tlv_atexit) {
|
||||
(void)dso_symbol;
|
||||
_tlv_atexit(dtor, obj);
|
||||
rc = 0;
|
||||
}
|
||||
#else
|
||||
(void)dtor;
|
||||
(void)obj;
|
||||
(void)dso_symbol;
|
||||
#endif
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold void workaround_glibc_bug21031(void) {
|
||||
/* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031
*
* Due to a race between pthread_key_delete() and __nptl_deallocate_tsd(),
* the destructor(s) of thread-local-storage object(s) may still be running
* in other thread(s) and be blocked or not finished yet.
* In such a case we get a SEGFAULT after this library's DSO is unloaded.
*
* So just by yielding a few timeslices we give such destructor(s)
* a chance to complete, which avoids the segfault. */
|
||||
sched_yield();
|
||||
sched_yield();
|
||||
sched_yield();
|
||||
}
|
||||
#endif /* !Windows */
|
||||
|
||||
void rthc_lock(void) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
EnterCriticalSection(&rthc_critical_section);
|
||||
#else
|
||||
ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
void rthc_unlock(void) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
LeaveCriticalSection(&rthc_critical_section);
|
||||
#else
|
||||
ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int thread_key_create(osal_thread_key_t *key) {
|
||||
int rc;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
*key = TlsAlloc();
|
||||
rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError();
|
||||
#else
|
||||
rc = pthread_key_create(key, nullptr);
|
||||
#endif
|
||||
TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key),
|
||||
(uintptr_t)*key, rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
void thread_rthc_set(osal_thread_key_t key, const void *value) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
ENSURE(nullptr, TlsSetValue(key, (void *)value));
|
||||
#else
|
||||
const uint64_t sign_registered =
|
||||
MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state);
|
||||
const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(&rthc_thread_state);
|
||||
if (value && unlikely(rthc_thread_state != sign_registered &&
|
||||
rthc_thread_state != sign_counted)) {
|
||||
rthc_thread_state = sign_registered;
|
||||
TRACE("thread registered 0x%" PRIxPTR, osal_thread_self());
|
||||
if (rthc_atexit(rthc_thread_dtor, &rthc_thread_state,
|
||||
(void *)&mdbx_version /* dso_anchor */)) {
|
||||
ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0);
|
||||
rthc_thread_state = sign_counted;
|
||||
const unsigned count_before = atomic_add32(&rthc_pending, 1);
|
||||
ENSURE(nullptr, count_before < INT_MAX);
|
||||
NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u",
|
||||
(uintptr_t)rthc_key, count_before);
|
||||
(void)count_before;
|
||||
}
|
||||
}
|
||||
ENSURE(nullptr, pthread_setspecific(key, value) == 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* dtor called for thread, i.e. for all mdbx's environment objects */
|
||||
__cold void rthc_thread_dtor(void *rthc) {
|
||||
rthc_lock();
|
||||
const uint32_t current_pid = osal_getpid();
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
TRACE(">> pid %d, thread 0x%" PRIxPTR ", module %p", current_pid,
|
||||
osal_thread_self(), rthc);
|
||||
#else
|
||||
TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", current_pid,
|
||||
osal_thread_self(), rthc);
|
||||
#endif
|
||||
|
||||
for (size_t i = 0; i < rthc_count; ++i) {
|
||||
MDBX_env *const env = rthc_table[i].env;
|
||||
if (env->pid != current_pid)
|
||||
continue;
|
||||
if (!(env->flags & ENV_TXKEY))
|
||||
continue;
|
||||
reader_slot_t *const reader = thread_rthc_get(env->me_txkey);
|
||||
reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
|
||||
reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
|
||||
if (reader < begin || reader >= end)
|
||||
continue;
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
if (pthread_setspecific(env->me_txkey, nullptr) != 0) {
|
||||
TRACE("== thread 0x%" PRIxPTR
|
||||
", rthc %p: ignore race with tsd-key deletion",
|
||||
osal_thread_self(), __Wpedantic_format_voidptr(reader));
|
||||
continue /* ignore race with tsd-key deletion by mdbx_env_close() */;
|
||||
}
|
||||
#endif
|
||||
|
||||
TRACE("== thread 0x%" PRIxPTR
|
||||
", rthc %p, [%zi], %p ... %p (%+i), rtch-pid %i, "
|
||||
"current-pid %i",
|
||||
osal_thread_self(), __Wpedantic_format_voidptr(reader), i,
|
||||
__Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end),
|
||||
(int)(reader - begin), reader->pid.weak, current_pid);
|
||||
if (atomic_load32(&reader->pid, mo_Relaxed) == current_pid) {
|
||||
TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(),
|
||||
__Wpedantic_format_voidptr(reader));
|
||||
(void)atomic_cas32(&reader->pid, current_pid, 0);
|
||||
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
TRACE("<< thread 0x%" PRIxPTR ", module %p", osal_thread_self(), rthc);
|
||||
rthc_unlock();
|
||||
#else
|
||||
const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc);
|
||||
const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc);
|
||||
const uint64_t state = rthc_read(rthc);
|
||||
if (state == sign_registered &&
|
||||
rthc_compare_and_clean(rthc, sign_registered)) {
|
||||
TRACE("== thread 0x%" PRIxPTR
|
||||
", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
|
||||
osal_thread_self(), rthc, osal_getpid(), "registered", state);
|
||||
} else if (state == sign_counted &&
|
||||
rthc_compare_and_clean(rthc, sign_counted)) {
|
||||
TRACE("== thread 0x%" PRIxPTR
|
||||
", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
|
||||
osal_thread_self(), rthc, osal_getpid(), "counted", state);
|
||||
ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
|
||||
} else {
|
||||
WARNING("thread 0x%" PRIxPTR
|
||||
", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
|
||||
osal_thread_self(), rthc, osal_getpid(), "wrong", state);
|
||||
}
|
||||
|
||||
if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) {
|
||||
TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(),
|
||||
rthc, osal_getpid());
|
||||
ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0);
|
||||
}
|
||||
|
||||
TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc);
|
||||
/* Allow tail-call optimization, i.e. gcc should generate a jmp instruction
* instead of a call for pthread_mutex_unlock(), so the CPU does not
* return into the current DSO's code section, which may be unloaded
* immediately after the mutex is released. */
|
||||
pthread_mutex_unlock(&rthc_mutex);
|
||||
#endif
|
||||
}
|
||||
|
||||
__cold int rthc_register(MDBX_env *const env) {
|
||||
TRACE(">> env %p, rthc_count %u, rthc_limit %u",
|
||||
__Wpedantic_format_voidptr(env), rthc_count, rthc_limit);
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
for (size_t i = 0; i < rthc_count; ++i)
|
||||
if (unlikely(rthc_table[i].env == env)) {
|
||||
rc = MDBX_PANIC;
|
||||
goto bailout;
|
||||
}
|
||||
|
||||
env->me_txkey = 0;
|
||||
if (unlikely(rthc_count == rthc_limit)) {
|
||||
rthc_entry_t *new_table =
|
||||
osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table,
|
||||
sizeof(rthc_entry_t) * rthc_limit * 2);
|
||||
if (unlikely(new_table == nullptr)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
goto bailout;
|
||||
}
|
||||
if (rthc_table == rthc_table_static)
|
||||
memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit);
|
||||
rthc_table = new_table;
|
||||
rthc_limit *= 2;
|
||||
}
|
||||
|
||||
if ((env->flags & MDBX_NOSTICKYTHREADS) == 0) {
|
||||
rc = thread_key_create(&env->me_txkey);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
env->flags |= ENV_TXKEY;
|
||||
}
|
||||
|
||||
rthc_table[rthc_count].env = env;
|
||||
TRACE("== [%i] = env %p, key %" PRIuPTR, rthc_count,
|
||||
__Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey);
|
||||
++rthc_count;
|
||||
|
||||
bailout:
|
||||
TRACE("<< env %p, key %" PRIuPTR ", rthc_count %u, rthc_limit %u, rc %d",
|
||||
__Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count,
|
||||
rthc_limit, rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold static int rthc_drown(MDBX_env *const env) {
|
||||
const uint32_t current_pid = osal_getpid();
|
||||
int rc = MDBX_SUCCESS;
|
||||
MDBX_env *inprocess_neighbor = nullptr;
|
||||
if (likely(env->lck_mmap.lck && current_pid == env->pid)) {
|
||||
reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
|
||||
reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
|
||||
TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d",
|
||||
(current_pid == env->pid) ? "cleanup" : "skip",
|
||||
__Wpedantic_format_voidptr(env), env->pid,
|
||||
__Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end),
|
||||
current_pid);
|
||||
bool cleaned = false;
|
||||
for (reader_slot_t *r = begin; r < end; ++r) {
|
||||
if (atomic_load32(&r->pid, mo_Relaxed) == current_pid) {
|
||||
atomic_store32(&r->pid, 0, mo_AcquireRelease);
|
||||
TRACE("== cleanup %p", __Wpedantic_format_voidptr(r));
|
||||
cleaned = true;
|
||||
}
|
||||
}
|
||||
if (cleaned)
|
||||
atomic_store32(&env->lck_mmap.lck->rdt_refresh_flag, true, mo_Relaxed);
|
||||
rc = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
|
||||
if (!inprocess_neighbor && env->registered_reader_pid &&
|
||||
env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
|
||||
int err = lck_rpid_clear(env);
|
||||
rc = rc ? rc : err;
|
||||
}
|
||||
}
|
||||
int err = lck_destroy(env, inprocess_neighbor, current_pid);
|
||||
env->pid = 0;
|
||||
return rc ? rc : err;
|
||||
}
|
||||
|
||||
__cold int rthc_remove(MDBX_env *const env) {
|
||||
TRACE(">>> env %p, key %zu, rthc_count %u, rthc_limit %u",
|
||||
__Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count,
|
||||
rthc_limit);
|
||||
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (likely(env->pid))
|
||||
rc = rthc_drown(env);
|
||||
|
||||
for (size_t i = 0; i < rthc_count; ++i) {
|
||||
if (rthc_table[i].env == env) {
|
||||
if (--rthc_count > 0)
|
||||
rthc_table[i] = rthc_table[rthc_count];
|
||||
else if (rthc_table != rthc_table_static) {
|
||||
void *tmp = rthc_table;
|
||||
rthc_table = rthc_table_static;
|
||||
rthc_limit = RTHC_INITIAL_LIMIT;
|
||||
osal_memory_barrier();
|
||||
osal_free(tmp);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
TRACE("<<< %p, key %zu, rthc_count %u, rthc_limit %u",
|
||||
__Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count,
|
||||
rthc_limit);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
__cold void rthc_afterfork(void) {
|
||||
NOTICE("drown %d rthc entries", rthc_count);
|
||||
for (size_t i = 0; i < rthc_count; ++i) {
|
||||
MDBX_env *const env = rthc_table[i].env;
|
||||
NOTICE("drown env %p", __Wpedantic_format_voidptr(env));
|
||||
if (env->lck_mmap.lck)
|
||||
osal_munmap(&env->lck_mmap);
|
||||
if (env->dxb_mmap.base) {
|
||||
osal_munmap(&env->dxb_mmap);
|
||||
#ifdef ENABLE_MEMCHECK
|
||||
VALGRIND_DISCARD(env->valgrind_handle);
|
||||
env->valgrind_handle = -1;
|
||||
#endif /* ENABLE_MEMCHECK */
|
||||
}
|
||||
env->lck = lckless_stub(env);
|
||||
rthc_drown(env);
|
||||
}
|
||||
if (rthc_table != rthc_table_static)
|
||||
osal_free(rthc_table);
|
||||
rthc_count = 0;
|
||||
rthc_table = rthc_table_static;
|
||||
rthc_limit = RTHC_INITIAL_LIMIT;
|
||||
rthc_pending.weak = 0;
|
||||
}
|
||||
#endif /* ! Windows */
|
||||
|
||||
__cold void rthc_ctor(void) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
InitializeCriticalSection(&rthc_critical_section);
|
||||
#else
|
||||
ENSURE(nullptr, pthread_atfork(nullptr, nullptr, rthc_afterfork) == 0);
|
||||
ENSURE(nullptr, pthread_key_create(&rthc_key, rthc_thread_dtor) == 0);
|
||||
TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(),
|
||||
__Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key);
|
||||
#endif
|
||||
}
|
||||
|
||||
__cold void rthc_dtor(const uint32_t current_pid) {
|
||||
  rthc_lock();
#if !defined(_WIN32) && !defined(_WIN64)
  uint64_t *rthc = pthread_getspecific(rthc_key);
  TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64
        ", left %d",
        osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
        rthc ? rthc_read(rthc) : ~UINT64_C(0),
        atomic_load32(&rthc_pending, mo_Relaxed));
  if (rthc) {
    const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc);
    const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc);
    const uint64_t state = rthc_read(rthc);
    if (state == sign_registered &&
        rthc_compare_and_clean(rthc, sign_registered)) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
            "registered", state);
    } else if (state == sign_counted &&
               rthc_compare_and_clean(rthc, sign_counted)) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
            "counted", state);
      ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
    } else {
      WARNING("thread 0x%" PRIxPTR
              ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
              osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
              "wrong", state);
    }
  }

  struct timespec abstime;
  ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0);
  abstime.tv_nsec += 1000000000l / 10;
  if (abstime.tv_nsec >= 1000000000l) {
    abstime.tv_nsec -= 1000000000l;
    abstime.tv_sec += 1;
  }
#if MDBX_DEBUG > 0
  abstime.tv_sec += 600;
#endif

  for (unsigned left;
       (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) {
    NOTICE("tls-cleanup: pid %d, pending %u, wait for...", current_pid, left);
    const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime);
    if (rc && rc != EINTR)
      break;
  }
  thread_key_delete(rthc_key);
#endif

  for (size_t i = 0; i < rthc_count; ++i) {
    MDBX_env *const env = rthc_table[i].env;
    if (env->pid != current_pid)
      continue;
    if (!(env->flags & ENV_TXKEY))
      continue;
    reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
    reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
    thread_key_delete(env->me_txkey);
    bool cleaned = false;
    for (reader_slot_t *reader = begin; reader < end; ++reader) {
      TRACE("== [%zi] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), "
            "rthc-pid %i, current-pid %i",
            i, (uintptr_t)env->me_txkey, __Wpedantic_format_voidptr(begin),
            __Wpedantic_format_voidptr(end), __Wpedantic_format_voidptr(reader),
            (int)(reader - begin), reader->pid.weak, current_pid);
      if (atomic_load32(&reader->pid, mo_Relaxed) == current_pid) {
        (void)atomic_cas32(&reader->pid, current_pid, 0);
        TRACE("== cleanup %p", __Wpedantic_format_voidptr(reader));
        cleaned = true;
      }
    }
    if (cleaned)
      atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
  }

  rthc_limit = rthc_count = 0;
  if (rthc_table != rthc_table_static)
    osal_free(rthc_table);
  rthc_table = nullptr;
  rthc_unlock();

#if defined(_WIN32) || defined(_WIN64)
  DeleteCriticalSection(&rthc_critical_section);
#else
  /* LY: yielding a few timeslices to give a more chance
   * to racing destructor(s) for completion. */
  workaround_glibc_bug21031();
#endif
}
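The destructor above leans on two POSIX mechanisms: a thread-specific key whose destructor callback fires as each thread exits, and a pthread_cond_timedwait() with an absolute deadline so that process-wide cleanup cannot hang forever on a racing thread. The sketch below shows that pattern in isolation; it is illustrative only, uses plain pthreads, and none of its names come from libmdbx.

/* tls_dtor_sketch.c: illustrative only, plain POSIX, no libmdbx names.
 * Build with: cc tls_dtor_sketch.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static pthread_key_t key;
static unsigned pending;

/* runs automatically on thread exit for every thread that set a non-NULL value */
static void thread_dtor(void *arg) {
  (void)arg;
  pthread_mutex_lock(&lock);
  pending -= 1;
  pthread_cond_broadcast(&cond);
  pthread_mutex_unlock(&lock);
}

static void *worker(void *arg) {
  pthread_setspecific(key, arg); /* a non-NULL value arms the destructor */
  return NULL;
}

int main(void) {
  pthread_key_create(&key, thread_dtor);
  pending = 1;

  pthread_t t;
  pthread_create(&t, NULL, worker, &key);
  pthread_join(t, NULL);

  /* absolute deadline ~100ms from now, like the tv_nsec bump above */
  struct timespec deadline;
  clock_gettime(CLOCK_REALTIME, &deadline);
  deadline.tv_nsec += 1000000000L / 10;
  if (deadline.tv_nsec >= 1000000000L) {
    deadline.tv_nsec -= 1000000000L;
    deadline.tv_sec += 1;
  }

  pthread_mutex_lock(&lock);
  while (pending > 0 &&
         pthread_cond_timedwait(&cond, &lock, &deadline) == 0)
    ;
  printf("destructors still pending: %u\n", pending);
  pthread_mutex_unlock(&lock);

  pthread_key_delete(key);
  return 0;
}

A real shutdown path would also tolerate EINTR from the timed wait, exactly as the loop above does.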
src/tls.h (new file, 43 lines)
@@ -0,0 +1,43 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

#include "essentials.h"

MDBX_INTERNAL void rthc_ctor(void);
MDBX_INTERNAL void rthc_dtor(const uint32_t current_pid);
MDBX_INTERNAL void rthc_lock(void);
MDBX_INTERNAL void rthc_unlock(void);

MDBX_INTERNAL int rthc_register(MDBX_env *const env);
MDBX_INTERNAL int rthc_remove(MDBX_env *const env);
MDBX_INTERNAL int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found);

/* dtor called for thread, i.e. for all mdbx's environment objects */
MDBX_INTERNAL void rthc_thread_dtor(void *rthc);

static inline void *thread_rthc_get(osal_thread_key_t key) {
#if defined(_WIN32) || defined(_WIN64)
  return TlsGetValue(key);
#else
  return pthread_getspecific(key);
#endif
}

MDBX_INTERNAL void thread_rthc_set(osal_thread_key_t key, const void *value);

#if !defined(_WIN32) && !defined(_WIN64)
MDBX_INTERNAL void rthc_afterfork(void);
MDBX_INTERNAL void workaround_glibc_bug21031(void);
#endif /* !Windows */

static inline void thread_key_delete(osal_thread_key_t key) {
  TRACE("key = %" PRIuPTR, (uintptr_t)key);
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsFree(key));
#else
  ENSURE(nullptr, pthread_key_delete(key) == 0);
  workaround_glibc_bug21031();
#endif
}
@@ -1,17 +1,8 @@
/* mdbx_chk.c - memory-mapped database check tool */

/*
 * Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_chk.c - memory-mapped database check tool
///

#ifdef _MSC_VER
#if _MSC_VER > 1800
@@ -21,7 +12,7 @@
#endif /* _MSC_VER (warnings) */

#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"

#include <ctype.h>

@@ -59,8 +50,7 @@ static void signal_handler(int sig) {
#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1)
#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE

enum MDBX_env_flags_t env_flags =
    MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION;
MDBX_env_flags_t env_flags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION;
MDBX_env *env;
MDBX_txn *txn;
unsigned verbose = 0;
@@ -70,8 +60,8 @@ int stuck_meta = -1;
MDBX_chk_context_t chk;
bool turn_meta = false;
bool force_turn_meta = false;
enum MDBX_chk_flags_t chk_flags = MDBX_CHK_DEFAULTS;
enum MDBX_chk_stage chk_stage = MDBX_chk_none;
MDBX_chk_flags_t chk_flags = MDBX_CHK_DEFAULTS;
MDBX_chk_stage_t chk_stage = MDBX_chk_none;

static MDBX_chk_line_t line_struct;
static size_t anchor_lineno;
@@ -105,7 +95,7 @@ static bool silently(enum MDBX_chk_severity severity) {
      chk.scope ? chk.scope->verbosity >> MDBX_chk_severity_prio_shift
                : verbose + (MDBX_chk_result >> MDBX_chk_severity_prio_shift);
  int prio = (severity >> MDBX_chk_severity_prio_shift);
  if (chk.scope && chk.scope->stage == MDBX_chk_traversal_subdbs && verbose < 2)
  if (chk.scope && chk.scope->stage == MDBX_chk_subdbs && verbose < 2)
    prio += 1;
  return quiet || cutoff < ((prio > 0) ? prio : 0);
}
@@ -398,7 +388,7 @@ static int conclude(MDBX_chk_context_t *ctx) {
            " at txn-id #%" PRIi64 "...",
            ctx->result.recent_txnid);
    flush();
    err = error_fn("mdbx_env_pgwalk", mdbx_env_sync_ex(ctx->env, true, false));
    err = error_fn("walk_pages", mdbx_env_sync_ex(ctx->env, true, false));
    if (err == MDBX_SUCCESS) {
      ctx->result.problems_meta -= 1;
      ctx->result.total_problems -= 1;
@@ -1,17 +1,10 @@
/* mdbx_copy.c - memory-mapped database backup tool */

/*
 * Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_copy.c - memory-mapped database backup tool
///

#ifdef _MSC_VER
#if _MSC_VER > 1800
@@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */

#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"

#if defined(_WIN32) || defined(_WIN64)
#include "wingetopt.h"
@@ -60,7 +53,7 @@ static void usage(const char *prog) {

int main(int argc, char *argv[]) {
  int rc;
  MDBX_env *env = NULL;
  MDBX_env *env = nullptr;
  const char *progname = argv[0], *act;
  unsigned flags = MDBX_RDONLY;
  unsigned cpflags = 0;
@@ -123,7 +116,7 @@ int main(int argc, char *argv[]) {
            "mdbx_copy %s (%s, T-%s)\nRunning for copy %s to %s...\n",
            mdbx_version.git.describe, mdbx_version.git.datetime,
            mdbx_version.git.tree, argv[1], (argc == 2) ? "stdout" : argv[2]);
    fflush(NULL);
    fflush(nullptr);
  }

  act = "opening environment";
@@ -1,19 +1,10 @@
/* mdbx_drop.c - memory-mapped database delete tool */

/*
 * Copyright 2021-2024 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 *
 * Copyright 2016-2021 Howard Chu, Symas Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2021-2024
///
/// mdbx_drop.c - memory-mapped database delete tool
///

#ifdef _MSC_VER
#if _MSC_VER > 1800
@@ -23,7 +14,7 @@
#endif /* _MSC_VER (warnings) */

#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"

#include <ctype.h>

@@ -162,7 +153,7 @@ int main(int argc, char *argv[]) {
    goto env_close;
  }

  rc = mdbx_txn_begin(env, NULL, 0, &txn);
  rc = mdbx_txn_begin(env, nullptr, 0, &txn);
  if (unlikely(rc != MDBX_SUCCESS)) {
    error("mdbx_txn_begin", rc);
    goto env_close;
@ -1,17 +1,10 @@
|
||||
/* mdbx_dump.c - memory-mapped database dump tool */
|
||||
|
||||
/*
|
||||
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
|
||||
* and other libmdbx authors: please see AUTHORS file.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in the file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>. */
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \note Please refer to the COPYRIGHT file for explanations license change,
|
||||
/// credits and acknowledgments.
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
///
|
||||
/// mdbx_dump.c - memory-mapped database dump tool
|
||||
///
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#if _MSC_VER > 1800
|
||||
@ -21,7 +14,7 @@
|
||||
#endif /* _MSC_VER (warnings) */
|
||||
|
||||
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
|
||||
#include "internals.h"
|
||||
#include "essentials.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
@ -37,7 +30,7 @@ typedef struct flagbit {
|
||||
flagbit dbflags[] = {{MDBX_REVERSEKEY, "reversekey"},
|
||||
{MDBX_DUPSORT, "dupsort"},
|
||||
{MDBX_INTEGERKEY, "integerkey"},
|
||||
{MDBX_DUPFIXED, "dupfixed"},
|
||||
{MDBX_DUPFIXED, "dupfix"},
|
||||
{MDBX_INTEGERDUP, "integerdup"},
|
||||
{MDBX_REVERSEDUP, "reversedup"},
|
||||
{0, nullptr}};
|
||||
@ -108,7 +101,7 @@ static void error(const char *func, int rc) {
|
||||
|
||||
/* Dump in BDB-compatible format */
|
||||
static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
|
||||
unsigned int flags;
|
||||
unsigned flags;
|
||||
int rc = mdbx_dbi_flags(txn, dbi, &flags);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_dbi_flags", rc);
|
||||
@ -187,9 +180,11 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
|
||||
return rc;
|
||||
}
|
||||
if (rescue) {
|
||||
cursor->mc_checking |= CC_SKIPORD;
|
||||
if (cursor->mc_xcursor)
|
||||
cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD;
|
||||
rc = mdbx_cursor_ignord(cursor);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_cursor_ignord", rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) ==
|
||||
@ -245,7 +240,7 @@ static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int i, rc;
|
||||
int i, err;
|
||||
MDBX_env *env;
|
||||
MDBX_txn *txn;
|
||||
MDBX_dbi dbi;
|
||||
@ -355,47 +350,47 @@ int main(int argc, char *argv[]) {
|
||||
fflush(nullptr);
|
||||
}
|
||||
|
||||
rc = mdbx_env_create(&env);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_env_create", rc);
|
||||
err = mdbx_env_create(&env);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_env_create", err);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (alldbs || subname) {
|
||||
rc = mdbx_env_set_maxdbs(env, 2);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_env_set_maxdbs", rc);
|
||||
err = mdbx_env_set_maxdbs(env, 2);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_env_set_maxdbs", err);
|
||||
goto env_close;
|
||||
}
|
||||
}
|
||||
|
||||
rc = mdbx_env_open(
|
||||
err = mdbx_env_open(
|
||||
env, envname,
|
||||
envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION
|
||||
: MDBX_RDONLY),
|
||||
0);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_env_open", rc);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_env_open", err);
|
||||
goto env_close;
|
||||
}
|
||||
|
||||
if (warmup) {
|
||||
rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
|
||||
if (MDBX_IS_ERROR(rc)) {
|
||||
error("mdbx_env_warmup", rc);
|
||||
err = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
|
||||
if (MDBX_IS_ERROR(err)) {
|
||||
error("mdbx_env_warmup", err);
|
||||
goto env_close;
|
||||
}
|
||||
}
|
||||
|
||||
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_txn_begin", rc);
|
||||
err = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_txn_begin", err);
|
||||
goto env_close;
|
||||
}
|
||||
|
||||
rc = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_dbi_open", rc);
|
||||
err = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &dbi);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_dbi_open", err);
|
||||
goto txn_abort;
|
||||
}
|
||||
|
||||
@ -403,24 +398,26 @@ int main(int argc, char *argv[]) {
|
||||
assert(dbi == MAIN_DBI);
|
||||
|
||||
MDBX_cursor *cursor;
|
||||
rc = mdbx_cursor_open(txn, MAIN_DBI, &cursor);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_cursor_open", rc);
|
||||
err = mdbx_cursor_open(txn, MAIN_DBI, &cursor);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_cursor_open", err);
|
||||
goto txn_abort;
|
||||
}
|
||||
if (rescue) {
|
||||
cursor->mc_checking |= CC_SKIPORD;
|
||||
if (cursor->mc_xcursor)
|
||||
cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD;
|
||||
err = mdbx_cursor_ignord(cursor);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_cursor_ignord", err);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
bool have_raw = false;
|
||||
int count = 0;
|
||||
MDBX_val key;
|
||||
while (MDBX_SUCCESS ==
|
||||
(rc = mdbx_cursor_get(cursor, &key, nullptr, MDBX_NEXT_NODUP))) {
|
||||
(err = mdbx_cursor_get(cursor, &key, nullptr, MDBX_NEXT_NODUP))) {
|
||||
if (user_break) {
|
||||
rc = MDBX_EINTR;
|
||||
err = MDBX_EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -428,7 +425,7 @@ int main(int argc, char *argv[]) {
|
||||
continue;
|
||||
subname = osal_realloc(buf4free, key.iov_len + 1);
|
||||
if (!subname) {
|
||||
rc = MDBX_ENOMEM;
|
||||
err = MDBX_ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -437,15 +434,15 @@ int main(int argc, char *argv[]) {
|
||||
subname[key.iov_len] = '\0';
|
||||
|
||||
MDBX_dbi sub_dbi;
|
||||
rc = mdbx_dbi_open_ex(txn, subname, MDBX_DB_ACCEDE, &sub_dbi,
|
||||
rescue ? equal_or_greater : nullptr,
|
||||
rescue ? equal_or_greater : nullptr);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
if (rc == MDBX_INCOMPATIBLE) {
|
||||
err = mdbx_dbi_open_ex(txn, subname, MDBX_DB_ACCEDE, &sub_dbi,
|
||||
rescue ? equal_or_greater : nullptr,
|
||||
rescue ? equal_or_greater : nullptr);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
if (err == MDBX_INCOMPATIBLE) {
|
||||
have_raw = true;
|
||||
continue;
|
||||
}
|
||||
error("mdbx_dbi_open", rc);
|
||||
error("mdbx_dbi_open", err);
|
||||
if (!rescue)
|
||||
break;
|
||||
} else {
|
||||
@ -453,13 +450,13 @@ int main(int argc, char *argv[]) {
|
||||
if (list) {
|
||||
printf("%s\n", subname);
|
||||
} else {
|
||||
rc = dump_sdb(txn, sub_dbi, subname);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
err = dump_sdb(txn, sub_dbi, subname);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
if (!rescue)
|
||||
break;
|
||||
if (!quiet)
|
||||
fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog,
|
||||
envname, mdbx_strerror(rc), subname);
|
||||
envname, mdbx_strerror(err), subname);
|
||||
/* Here is a hack for rescue mode, don't do that:
|
||||
* - we should restart transaction in case error due
|
||||
* database corruption;
|
||||
@ -468,21 +465,21 @@ int main(int argc, char *argv[]) {
|
||||
* - this is possible since DB is opened in read-only exclusive
|
||||
* mode and transaction is the same, i.e. has the same address
|
||||
* and so on. */
|
||||
rc = mdbx_txn_reset(txn);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_txn_reset", rc);
|
||||
err = mdbx_txn_reset(txn);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_txn_reset", err);
|
||||
goto env_close;
|
||||
}
|
||||
rc = mdbx_txn_renew(txn);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_txn_renew", rc);
|
||||
err = mdbx_txn_renew(txn);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_txn_renew", err);
|
||||
goto env_close;
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = mdbx_dbi_close(env, sub_dbi);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
error("mdbx_dbi_close", rc);
|
||||
err = mdbx_dbi_close(env, sub_dbi);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
error("mdbx_dbi_close", err);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -491,20 +488,20 @@ int main(int argc, char *argv[]) {
|
||||
cursor = nullptr;
|
||||
|
||||
if (have_raw && (!count /* || rescue */))
|
||||
rc = dump_sdb(txn, MAIN_DBI, nullptr);
|
||||
err = dump_sdb(txn, MAIN_DBI, nullptr);
|
||||
else if (!count) {
|
||||
if (!quiet)
|
||||
fprintf(stderr, "%s: %s does not contain multiple databases\n", prog,
|
||||
envname);
|
||||
rc = MDBX_NOTFOUND;
|
||||
err = MDBX_NOTFOUND;
|
||||
}
|
||||
} else {
|
||||
rc = dump_sdb(txn, dbi, subname);
|
||||
err = dump_sdb(txn, dbi, subname);
|
||||
}
|
||||
|
||||
switch (rc) {
|
||||
switch (err) {
|
||||
case MDBX_NOTFOUND:
|
||||
rc = MDBX_SUCCESS;
|
||||
err = MDBX_SUCCESS;
|
||||
case MDBX_SUCCESS:
|
||||
break;
|
||||
case MDBX_EINTR:
|
||||
@ -512,8 +509,8 @@ int main(int argc, char *argv[]) {
|
||||
fprintf(stderr, "Interrupted by signal/user\n");
|
||||
break;
|
||||
default:
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
error("mdbx_cursor_get", rc);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
error("mdbx_cursor_get", err);
|
||||
}
|
||||
|
||||
mdbx_dbi_close(env, dbi);
|
||||
@ -523,5 +520,5 @@ env_close:
|
||||
mdbx_env_close(env);
|
||||
free(buf4free);
|
||||
|
||||
return rc ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
return err ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
}
|
@@ -1,17 +1,10 @@
/* mdbx_load.c - memory-mapped database load tool */

/*
 * Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_load.c - memory-mapped database load tool
///

#ifdef _MSC_VER
#if _MSC_VER > 1800
@@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */

#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"

#include <ctype.h>

@@ -139,7 +132,7 @@ typedef struct flagbit {
flagbit dbflags[] = {
    {MDBX_REVERSEKEY, S("reversekey")}, {MDBX_DUPSORT, S("duplicates")},
    {MDBX_DUPSORT, S("dupsort")}, {MDBX_INTEGERKEY, S("integerkey")},
    {MDBX_DUPFIXED, S("dupfixed")}, {MDBX_INTEGERDUP, S("integerdup")},
    {MDBX_DUPFIXED, S("dupfix")}, {MDBX_INTEGERDUP, S("integerdup")},
    {MDBX_REVERSEDUP, S("reversedup")}, {0, 0, nullptr}};

static int readhdr(void) {
@@ -375,7 +368,7 @@ static int badend(void) {
  return errno ? errno : MDBX_ENODATA;
}

static __inline int unhex(unsigned char *c2) {
static inline int unhex(unsigned char *c2) {
  int x, c;
  x = *c2++ & 0x4f;
  if (x & 0x40)
@@ -1,17 +1,10 @@
/* mdbx_stat.c - memory-mapped database status tool */

/*
 * Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_stat.c - memory-mapped database status tool
///

#ifdef _MSC_VER
#if _MSC_VER > 1800
@@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */

#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"

#if defined(_WIN32) || defined(_WIN64)
#include "wingetopt.h"
src/tree.c (new file, 1645 lines): diff suppressed because it is too large
src/txl.c (new file, 102 lines)
@@ -0,0 +1,102 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#include "internals.h"

static inline size_t txl_size2bytes(const size_t size) {
  assert(size > 0 && size <= txl_max * 2);
  size_t bytes =
      ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2),
                    txl_granulate * sizeof(txnid_t)) -
      MDBX_ASSUME_MALLOC_OVERHEAD;
  return bytes;
}

static inline size_t txl_bytes2size(const size_t bytes) {
  size_t size = bytes / sizeof(txnid_t);
  assert(size > 2 && size <= txl_max * 2);
  return size - 2;
}

MDBX_INTERNAL txl_t txl_alloc(void) {
  size_t bytes = txl_size2bytes(txl_initial);
  txl_t txl = osal_malloc(bytes);
  if (likely(txl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(txl);
#endif /* malloc_usable_size */
    txl[0] = txl_bytes2size(bytes);
    assert(txl[0] >= txl_initial);
    txl += 1;
    *txl = 0;
  }
  return txl;
}

MDBX_INTERNAL void txl_free(txl_t txl) {
  if (likely(txl))
    osal_free(txl - 1);
}

MDBX_INTERNAL int txl_reserve(txl_t __restrict *__restrict ptxl,
                              const size_t wanna) {
  const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptxl);
  assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max &&
         MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
  if (likely(allocated >= wanna))
    return MDBX_SUCCESS;

  if (unlikely(wanna > /* paranoia */ txl_max)) {
    ERROR("TXL too long (%zu > %zu)", wanna, (size_t)txl_max);
    return MDBX_TXN_FULL;
  }

  const size_t size = (wanna + wanna - allocated < txl_max)
                          ? wanna + wanna - allocated
                          : txl_max;
  size_t bytes = txl_size2bytes(size);
  txl_t txl = osal_realloc(*ptxl - 1, bytes);
  if (likely(txl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(txl);
#endif /* malloc_usable_size */
    *txl = txl_bytes2size(bytes);
    assert(*txl >= wanna);
    *ptxl = txl + 1;
    return MDBX_SUCCESS;
  }
  return MDBX_ENOMEM;
}

static __always_inline int __must_check_result
txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
  assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max &&
         MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
  assert(num <= PAGELIST_LIMIT);
  const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptxl) + num;
  return likely(MDBX_PNL_ALLOCLEN(*ptxl) >= wanna) ? MDBX_SUCCESS
                                                   : txl_reserve(ptxl, wanna);
}

static __always_inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
  assert(MDBX_PNL_GETSIZE(txl) < MDBX_PNL_ALLOCLEN(txl));
  txl[0] += 1;
  MDBX_PNL_LAST(txl) = id;
}

#define TXNID_SORT_CMP(first, last) ((first) > (last))
SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP)
MDBX_INTERNAL void txl_sort(txl_t txl) {
  txnid_sort(MDBX_PNL_BEGIN(txl), MDBX_PNL_END(txl));
}

MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl,
                                                 txnid_t id) {
  if (unlikely(MDBX_PNL_GETSIZE(*ptxl) == MDBX_PNL_ALLOCLEN(*ptxl))) {
    int rc = txl_need(ptxl, txl_granulate);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  txl_xappend(*ptxl, id);
  return MDBX_SUCCESS;
}
src/txl.h (new file, 26 lines)
@@ -0,0 +1,26 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

#include "essentials.h"

/* List of txnid */
typedef txnid_t *txl_t;
typedef const txnid_t *const_txl_t;

enum txl_rules {
  txl_granulate = 32,
  txl_initial =
      txl_granulate - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t),
  txl_max = (1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)
};

MDBX_INTERNAL txl_t txl_alloc(void);

MDBX_INTERNAL void txl_free(txl_t txl);

MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl,
                                                 txnid_t id);

MDBX_INTERNAL void txl_sort(txl_t txl);
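Together, txl.c and txl.h implement a growable list of transaction ids that keeps its allocated capacity in the hidden slot just before the user-visible array, which is why txl_alloc() returns the pointer advanced by one element and txl_free() steps back before releasing it. A hypothetical usage sketch, assuming it were compiled inside the libmdbx tree where these internal declarations and error codes are visible:

/* Hypothetical usage sketch, not part of the tree: assumes it is built
 * alongside the other internal units so that internals.h is available. */
#include "internals.h"

static int collect_recent_txnids(void) {
  txl_t list = txl_alloc();
  if (unlikely(!list))
    return MDBX_ENOMEM;

  int rc = MDBX_SUCCESS;
  for (txnid_t id = 42; id > 30 && rc == MDBX_SUCCESS; --id)
    rc = txl_append(&list, id); /* may reallocate, hence pointer-to-list */

  if (rc == MDBX_SUCCESS)
    txl_sort(list); /* ordering is defined by TXNID_SORT_CMP above */

  txl_free(list);
  return rc;
}

Passing the list by address to txl_append() matters because a growth step goes through osal_realloc() and may move the array.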
src/unaligned.h (new file, 242 lines)
@@ -0,0 +1,242 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

/*------------------------------------------------------------------------------
 * Unaligned access */

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
field_alignment(size_t alignment_baseline, size_t field_offset) {
  size_t merge = alignment_baseline | (size_t)field_offset;
  return merge & -(int)merge;
}

/* read-thunk for UB-sanitizer */
MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
peek_u8(const uint8_t *__restrict ptr) {
  return *ptr;
}

/* write-thunk for UB-sanitizer */
static inline void poke_u8(uint8_t *__restrict ptr, const uint8_t v) {
  *ptr = v;
}

static inline void *bcopy_2(void *__restrict dst, const void *__restrict src) {
  uint8_t *__restrict d = (uint8_t *)dst;
  const uint8_t *__restrict s = (uint8_t *)src;
  d[0] = s[0];
  d[1] = s[1];
  return d;
}

static inline void *bcopy_4(void *const __restrict dst,
                            const void *const __restrict src) {
  uint8_t *__restrict d = (uint8_t *)dst;
  const uint8_t *__restrict s = (uint8_t *)src;
  d[0] = s[0];
  d[1] = s[1];
  d[2] = s[2];
  d[3] = s[3];
  return d;
}

static inline void *bcopy_8(void *const __restrict dst,
                            const void *const __restrict src) {
  uint8_t *__restrict d = (uint8_t *)dst;
  const uint8_t *__restrict s = (uint8_t *)src;
  d[0] = s[0];
  d[1] = s[1];
  d[2] = s[2];
  d[3] = s[3];
  d[4] = s[4];
  d[5] = s[5];
  d[6] = s[6];
  d[7] = s[7];
  return d;
}

MDBX_NOTHROW_PURE_FUNCTION static inline uint16_t
unaligned_peek_u16(const size_t expected_alignment, const void *const ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0)
    return *(const uint16_t *)ptr;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint16_t *)ptr;
#else
    uint16_t v;
    bcopy_2((uint8_t *)&v, (const uint8_t *)ptr);
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}

static inline void unaligned_poke_u16(const size_t expected_alignment,
                                      void *const __restrict ptr,
                                      const uint16_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0)
    *(uint16_t *)ptr = v;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    *((uint16_t __unaligned *)ptr) = v;
#else
    bcopy_2((uint8_t *)ptr, (const uint8_t *)&v);
#endif /* _MSC_VER || __unaligned */
  }
}

MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t
unaligned_peek_u32(const size_t expected_alignment,
                   const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0)
    return *(const uint32_t *)ptr;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    const uint16_t lo =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint16_t hi =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint32_t)hi << 16;
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint32_t *)ptr;
#else
    uint32_t v;
    bcopy_4((uint8_t *)&v, (const uint8_t *)ptr);
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}

static inline void unaligned_poke_u32(const size_t expected_alignment,
                                      void *const __restrict ptr,
                                      const uint32_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0)
    *(uint32_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v;
    ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint16_t)(v >> 16);
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    *((uint32_t __unaligned *)ptr) = v;
#else
    bcopy_4((uint8_t *)ptr, (const uint8_t *)&v);
#endif /* _MSC_VER || __unaligned */
  }
}

MDBX_NOTHROW_PURE_FUNCTION static inline uint64_t
unaligned_peek_u64(const size_t expected_alignment,
                   const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const uint64_t *)ptr;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    const uint32_t lo =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint64_t *)ptr;
#else
    uint64_t v;
    bcopy_8((uint8_t *)&v, (const uint8_t *)ptr);
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}

static inline uint64_t
unaligned_peek_u64_volatile(const size_t expected_alignment,
                            const volatile void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  assert(expected_alignment % sizeof(uint32_t) == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const volatile uint64_t *)ptr;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    return *(const volatile __unaligned uint64_t *)ptr;
#else
    const uint32_t lo = ((const volatile uint32_t *)
                             ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi = ((const volatile uint32_t *)
                             ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
#endif /* _MSC_VER || __unaligned */
  }
}

static inline void unaligned_poke_u64(const size_t expected_alignment,
                                      void *const __restrict ptr,
                                      const uint64_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0)
    *(uint64_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v;
    ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint32_t)(v >> 32);
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||           \
    defined(_M_X64) || defined(_M_IA64)
    *((uint64_t __unaligned *)ptr) = v;
#else
    bcopy_8((uint8_t *)ptr, (const uint8_t *)&v);
#endif /* _MSC_VER || __unaligned */
  }
}

#define UNALIGNED_PEEK_8(ptr, struct, field)                                   \
  peek_u8(ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_8(ptr, struct, field, value)                            \
  poke_u8(ptr_disp(ptr, offsetof(struct, field)), value)

#define UNALIGNED_PEEK_16(ptr, struct, field)                                  \
  unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_16(ptr, struct, field, value)                           \
  unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value)

#define UNALIGNED_PEEK_32(ptr, struct, field)                                  \
  unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_32(ptr, struct, field, value)                           \
  unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value)

#define UNALIGNED_PEEK_64(ptr, struct, field)                                  \
  unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_64(ptr, struct, field, value)                           \
  unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value)

MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
peek_pgno(const void *const __restrict ptr) {
  if (sizeof(pgno_t) == sizeof(uint32_t))
    return (pgno_t)unaligned_peek_u32(1, ptr);
  else if (sizeof(pgno_t) == sizeof(uint64_t))
    return (pgno_t)unaligned_peek_u64(1, ptr);
  else {
    pgno_t pgno;
    memcpy(&pgno, ptr, sizeof(pgno));
    return pgno;
  }
}

static inline void poke_pgno(void *const __restrict ptr, const pgno_t pgno) {
  if (sizeof(pgno) == sizeof(uint32_t))
    unaligned_poke_u32(1, ptr, pgno);
  else if (sizeof(pgno) == sizeof(uint64_t))
    unaligned_poke_u64(1, ptr, pgno);
  else
    memcpy(ptr, &pgno, sizeof(pgno));
}
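All of the helpers above boil down to one portable rule: when the platform does not guarantee cheap misaligned access, never dereference a misaligned pointer (undefined behaviour in C); copy bytes instead, or split the access into smaller naturally-aligned halves. A self-contained illustration of the memcpy flavour of that technique, with no libmdbx types involved:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* portable unaligned 32-bit read: byte-wise copy instead of a misaligned load */
static uint32_t load_u32_unaligned(const void *ptr) {
  uint32_t v;
  memcpy(&v, ptr, sizeof(v)); /* compilers emit a single mov where that is legal */
  return v;
}

static void store_u32_unaligned(void *ptr, uint32_t v) {
  memcpy(ptr, &v, sizeof(v));
}

int main(void) {
  uint8_t buf[7] = {0};
  store_u32_unaligned(buf + 1, 0xA1B2C3D4u); /* deliberately misaligned offset */
  assert(load_u32_unaligned(buf + 1) == 0xA1B2C3D4u);
  printf("0x%08X\n", load_u32_unaligned(buf + 1));
  return 0;
}

On tolerant targets the copies collapse into single loads and stores, which is also why the header can take its direct-dereference fast path whenever MDBX_UNALIGNED_OK permits it.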
src/utils.c (new file, 35 lines)
@@ -0,0 +1,35 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#include "internals.h"

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned
log2n_powerof2(size_t value_uintptr) {
  assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
         is_powerof2(value_uintptr));
  assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
  const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
  STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
  return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
  unsigned long index;
  STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
  _BitScanForward(&index, value_uint32);
  return index;
#else
  static const uint8_t debruijn_ctz32[32] = {
      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}

MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v) {
  /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */
  v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50);
  v *= UINT64_C(0xA24BAED4963EE407);
  v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49);
  v *= UINT64_C(0x9FB21C651E98DF25);
  return v ^ v >> 28;
}
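The portable branch of log2n_powerof2() uses the classic De Bruijn trick: multiplying a 32-bit power of two by 0x077CB531 shifts a De Bruijn sequence so that the top five bits become a unique index into a 32-entry table. A standalone check of that fallback (illustrative; the table is the same one as above):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* De Bruijn-based count-trailing-zeros for 32-bit powers of two */
static unsigned ctz32_powerof2(uint32_t x) {
  static const uint8_t debruijn_ctz32[32] = {
      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  return debruijn_ctz32[(uint32_t)(x * 0x077CB531u) >> 27];
}

int main(void) {
  for (unsigned n = 0; n < 32; ++n)
    assert(ctz32_powerof2(UINT32_C(1) << n) == n);
  puts("De Bruijn ctz table verified for all 32 powers of two");
  return 0;
}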
src/utils.h (new file, 87 lines)
@@ -0,0 +1,87 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

/* Test if the flags f are set in a flag word w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))

/* Round n up to an even number. */
#define EVEN_CEIL(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */

/* Round n down to an even number. */
#define EVEN_FLOOR(n) ((n) & ~(size_t)1)

/*
 *                /
 *                | -1, a < b
 * CMP2INT(a,b) = <  0, a == b
 *                |  1, a > b
 *                \
 */
#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0)

/* Pointer displacement without casting to char* to avoid pointer-aliasing */
#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp))))

/* Pointer distance as signed number of bytes */
#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less)))

#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size)                             \
  do {                                                                         \
    TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),               \
          (size_t)(size), __LINE__);                                           \
    ASAN_POISON_MEMORY_REGION(addr, size);                                     \
  } while (0)

#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size)                           \
  do {                                                                         \
    TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),             \
          (size_t)(size), __LINE__);                                           \
    ASAN_UNPOISON_MEMORY_REGION(addr, size);                                   \
  } while (0)

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
branchless_abs(intptr_t value) {
  assert(value > INT_MIN);
  const size_t expanded_sign =
      (size_t)(value >> (sizeof(value) * CHAR_BIT - 1));
  return ((size_t)value + expanded_sign) ^ expanded_sign;
}

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline bool
is_powerof2(size_t x) {
  return (x & (x - 1)) == 0;
}

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
floor_powerof2(size_t value, size_t granularity) {
  assert(is_powerof2(granularity));
  return value & ~(granularity - 1);
}

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
ceil_powerof2(size_t value, size_t granularity) {
  return floor_powerof2(value + granularity - 1, granularity);
}

MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned
log2n_powerof2(size_t value_uintptr);

MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v);

struct monotime_cache {
  uint64_t value;
  int expire_countdown;
};

MDBX_MAYBE_UNUSED static inline uint64_t
monotime_since_cached(uint64_t begin_timestamp, struct monotime_cache *cache) {
  if (cache->expire_countdown)
    cache->expire_countdown -= 1;
  else {
    cache->value = osal_monotime();
    cache->expire_countdown = 42 / 3;
  }
  return cache->value - begin_timestamp;
}
314
src/walk.c
Normal file
314
src/walk.c
Normal file
@ -0,0 +1,314 @@
|
||||
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
typedef struct walk_ctx {
|
||||
void *userctx;
|
||||
walk_options_t options;
|
||||
int deep;
|
||||
walk_func *visitor;
|
||||
MDBX_txn *txn;
|
||||
MDBX_cursor *cursor;
|
||||
} walk_ctx_t;
|
||||
|
||||
__cold static int walk_sdb(walk_ctx_t *ctx, walk_sdb_t *sdb);
|
||||
|
||||
static page_type_t walk_page_type(const page_t *mp) {
|
||||
if (mp)
|
||||
switch (mp->flags & ~P_SPILLED) {
|
||||
case P_BRANCH:
|
||||
return page_branch;
|
||||
case P_LEAF:
|
||||
return page_leaf;
|
||||
case P_LEAF | P_DUPFIX:
|
||||
return page_dupfix_leaf;
|
||||
case P_LARGE:
|
||||
return page_large;
|
||||
}
|
||||
return page_broken;
|
||||
}
|
||||
|
||||
static page_type_t walk_subpage_type(const page_t *sp) {
|
||||
switch (sp->flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
|
||||
case P_LEAF | P_SUBP:
|
||||
return page_sub_leaf;
|
||||
case P_LEAF | P_DUPFIX | P_SUBP:
|
||||
return page_sub_dupfix_leaf;
|
||||
default:
|
||||
return page_sub_broken;
|
||||
}
|
||||
}
|
||||
|
||||
/* Depth-first tree traversal. */
|
||||
__cold static int walk_pgno(walk_ctx_t *ctx, walk_sdb_t *sdb, const pgno_t pgno,
|
||||
txnid_t parent_txnid) {
|
||||
assert(pgno != P_INVALID);
|
||||
page_t *mp = nullptr;
|
||||
int err = page_get(ctx->cursor, pgno, &mp, parent_txnid);
|
||||
|
||||
const page_type_t type = walk_page_type(mp);
|
||||
const size_t nentries = mp ? page_numkeys(mp) : 0;
|
||||
size_t header_size =
|
||||
(mp && !is_dupfix_leaf(mp)) ? PAGEHDRSZ + mp->lower : PAGEHDRSZ;
|
||||
size_t payload_size = 0;
|
||||
size_t unused_size =
|
||||
(mp ? page_room(mp) : ctx->txn->env->ps - header_size) - payload_size;
|
||||
size_t align_bytes = 0;
|
||||
|
||||
for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) {
|
||||
if (type == page_dupfix_leaf) {
|
||||
/* DUPFIX pages have no entries[] or node headers */
|
||||
payload_size += mp->dupfix_ksize;
|
||||
continue;
|
||||
}
|
||||
|
||||
const node_t *node = page_node(mp, i);
|
||||
header_size += NODESIZE;
|
||||
const size_t node_key_size = node_ks(node);
|
||||
payload_size += node_key_size;
|
||||
|
||||
if (type == page_branch) {
|
||||
assert(i > 0 || node_ks(node) == 0);
|
||||
align_bytes += node_key_size & 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t node_data_size = node_ds(node);
|
||||
assert(type == page_leaf);
|
||||
switch (node_flags(node)) {
|
||||
case 0 /* usual node */:
|
||||
payload_size += node_data_size;
|
||||
align_bytes += (node_key_size + node_data_size) & 1;
|
||||
break;
|
||||
|
||||
case N_BIGDATA /* long data on the large/overflow page */: {
|
||||
const pgno_t large_pgno = node_largedata_pgno(node);
|
||||
const size_t over_payload = node_data_size;
|
||||
const size_t over_header = PAGEHDRSZ;
|
||||
|
||||
assert(err == MDBX_SUCCESS);
|
||||
pgr_t lp = page_get_large(ctx->cursor, large_pgno, mp->txnid);
|
||||
const size_t npages =
|
||||
((err = lp.err) == MDBX_SUCCESS) ? lp.page->pages : 1;
|
||||
const size_t pagesize = pgno2bytes(ctx->txn->env, npages);
|
||||
const size_t over_unused = pagesize - over_payload - over_header;
|
||||
const int rc = ctx->visitor(large_pgno, npages, ctx->userctx, ctx->deep,
|
||||
sdb, pagesize, page_large, err, 1,
|
||||
over_payload, over_header, over_unused);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
|
||||
payload_size += sizeof(pgno_t);
|
||||
align_bytes += node_key_size & 1;
|
||||
} break;
|
||||
|
||||
case N_SUBDATA /* sub-db */: {
|
||||
if (unlikely(node_data_size != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid subDb node size", (unsigned)node_data_size);
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
}
|
||||
header_size += node_data_size;
|
||||
align_bytes += (node_key_size + node_data_size) & 1;
|
||||
} break;
|
||||
|
||||
case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
|
||||
if (unlikely(node_data_size != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid sub-tree node size", (unsigned)node_data_size);
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
}
|
||||
header_size += node_data_size;
|
||||
align_bytes += (node_key_size + node_data_size) & 1;
|
||||
break;
|
||||
|
||||
case N_DUPDATA /* short sub-page */: {
|
||||
if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid sub-page node size", (unsigned)node_data_size);
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
break;
|
||||
}
|
||||
|
||||
const page_t *const sp = node_data(node);
|
||||
const page_type_t subtype = walk_subpage_type(sp);
|
||||
const size_t nsubkeys = page_numkeys(sp);
|
||||
if (unlikely(subtype == page_sub_broken)) {
|
||||
ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid sub-page flags", sp->flags);
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
}
|
||||
|
||||
size_t subheader_size =
|
||||
is_dupfix_leaf(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->lower;
|
||||
size_t subunused_size = page_room(sp);
|
||||
size_t subpayload_size = 0;
|
||||
size_t subalign_bytes = 0;
|
||||
|
||||
for (size_t ii = 0; err == MDBX_SUCCESS && ii < nsubkeys; ++ii) {
|
||||
if (subtype == page_sub_dupfix_leaf) {
|
||||
/* DUPFIX pages have no entries[] or node headers */
|
||||
subpayload_size += sp->dupfix_ksize;
|
||||
} else {
|
||||
assert(subtype == page_sub_leaf);
|
||||
const node_t *subnode = page_node(sp, ii);
|
||||
const size_t subnode_size = node_ks(subnode) + node_ds(subnode);
|
||||
subheader_size += NODESIZE;
|
||||
subpayload_size += subnode_size;
|
||||
subalign_bytes += subnode_size & 1;
|
||||
if (unlikely(node_flags(subnode) != 0)) {
|
||||
ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"unexpected sub-node flags", node_flags(subnode));
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int rc =
|
||||
ctx->visitor(pgno, 0, ctx->userctx, ctx->deep + 1, sdb,
|
||||
node_data_size, subtype, err, nsubkeys, subpayload_size,
|
||||
subheader_size, subunused_size + subalign_bytes);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
|
||||
header_size += subheader_size;
|
||||
unused_size += subunused_size;
|
||||
payload_size += subpayload_size;
|
||||
align_bytes += subalign_bytes + (node_key_size & 1);
|
||||
} break;
|
||||
|
||||
default:
|
||||
ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid node flags", node_flags(node));
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
}
|
||||
}
|
||||
|
||||
const int rc = ctx->visitor(
|
||||
pgno, 1, ctx->userctx, ctx->deep, sdb, ctx->txn->env->ps, type, err,
|
||||
nentries, payload_size, header_size, unused_size + align_bytes);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
|
||||
|
||||
for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) {
|
||||
if (type == page_dupfix_leaf)
|
||||
continue;
|
||||
|
||||
node_t *node = page_node(mp, i);
|
||||
if (type == page_branch) {
|
||||
assert(err == MDBX_SUCCESS);
|
||||
ctx->deep += 1;
|
||||
err = walk_pgno(ctx, sdb, node_pgno(node), mp->txnid);
|
||||
ctx->deep -= 1;
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
if (err == MDBX_RESULT_TRUE)
|
||||
break;
|
||||
return err;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(type == page_leaf);
|
||||
switch (node_flags(node)) {
|
||||
default:
|
||||
continue;
|
||||
|
||||
case N_SUBDATA /* sub-db */:
|
||||
if (unlikely(node_ds(node) != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid sub-tree node size", (unsigned)node_ds(node));
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
} else {
|
||||
tree_t aligned_db;
|
||||
memcpy(&aligned_db, node_data(node), sizeof(aligned_db));
|
||||
walk_sdb_t subdb = {{node_key(node), node_ks(node)}, nullptr, nullptr};
|
||||
subdb.internal = &aligned_db;
|
||||
assert(err == MDBX_SUCCESS);
|
||||
ctx->deep += 1;
|
||||
err = walk_sdb(ctx, &subdb);
|
||||
ctx->deep -= 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
|
||||
if (unlikely(node_ds(node) != sizeof(tree_t))) {
|
||||
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
|
||||
"invalid dupsort sub-tree node size", (unsigned)node_ds(node));
|
||||
assert(err == MDBX_CORRUPTED);
|
||||
err = MDBX_CORRUPTED;
|
||||
} else {
|
||||
tree_t aligned_db;
|
||||
memcpy(&aligned_db, node_data(node), sizeof(aligned_db));
|
||||
assert(err == MDBX_SUCCESS);
|
||||
err = cursor_dupsort_setup(ctx->cursor, node, mp);
|
||||
if (likely(err == MDBX_SUCCESS)) {
|
||||
assert(ctx->cursor->subcur ==
|
||||
&container_of(ctx->cursor, cursor_couple_t, outer)->inner);
|
||||
ctx->cursor = &ctx->cursor->subcur->cursor;
|
||||
ctx->deep += 1;
|
||||
sdb->nested = &aligned_db;
|
||||
err = walk_pgno(ctx, sdb, aligned_db.root, mp->txnid);
|
||||
sdb->nested = nullptr;
|
||||
ctx->deep -= 1;
|
||||
subcur_t *inner_xcursor = container_of(ctx->cursor, subcur_t, cursor);
|
||||
cursor_couple_t *couple =
|
||||
container_of(inner_xcursor, cursor_couple_t, inner);
|
||||
ctx->cursor = &couple->outer;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
__cold static int walk_sdb(walk_ctx_t *ctx, walk_sdb_t *sdb) {
|
||||
tree_t *const db = sdb->internal;
|
||||
if (unlikely(db->root == P_INVALID))
|
||||
return MDBX_SUCCESS; /* empty db */
|
||||
|
||||
kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}};
|
||||
cursor_couple_t couple;
|
||||
int rc = cursor_init4walk(&couple, ctx->txn, db, &kvx);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
const uint8_t cursor_checking = (ctx->options & dont_check_keys_ordering)
|
||||
? z_pagecheck | z_ignord
|
||||
: z_pagecheck;
|
||||
couple.outer.checking |= cursor_checking;
|
||||
couple.inner.cursor.checking |= cursor_checking;
|
||||
couple.outer.next = ctx->cursor;
|
||||
couple.outer.top_and_flags = z_disable_tree_search_fastpath;
|
||||
ctx->cursor = &couple.outer;
|
||||
rc = walk_pgno(ctx, sdb, db->root,
|
||||
db->mod_txnid ? db->mod_txnid : ctx->txn->txnid);
|
||||
ctx->cursor = couple.outer.next;
|
||||
return rc;
|
||||
}
|
||||
|
||||
__cold int walk_pages(MDBX_txn *txn, walk_func *visitor, void *user,
|
||||
walk_options_t options) {
|
||||
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
|
||||
walk_ctx_t ctx = {
|
||||
.txn = txn, .userctx = user, .visitor = visitor, .options = options};
|
||||
walk_sdb_t sdb = {.name = {.iov_base = MDBX_CHK_GC},
|
||||
.internal = &txn->dbs[FREE_DBI]};
|
||||
rc = walk_sdb(&ctx, &sdb);
|
||||
if (!MDBX_IS_ERROR(rc)) {
|
||||
sdb.name.iov_base = MDBX_CHK_MAIN;
|
||||
sdb.internal = &txn->dbs[MAIN_DBI];
|
||||
rc = walk_sdb(&ctx, &sdb);
|
||||
}
|
||||
return rc;
|
||||
}
|
Some files were not shown because too many files have changed in this diff.