New reader for LMDB databases (#9950)

* Add LMDBReader op and test case

* Add testcase to load LMDB from a folder

* Add tensorflow/core/lib/lmdb/testdata/data.mdb

* Add EOF test

* Add license export

* Blacklist the test data in pip_smoke_test.py

* Address issues with respect to review

* Add LICENSE to BUILD rules

* Remove the prefix of LICENSE

* Wrap key with compat.as_bytes()

* Fixed a compilation flag

* Improve BUILD rules

* Support LMDB build in cmake

* Fix BUILD file format with buildifier

* Add fake unistd.h for lmdb to build on Windows

* Avoid building lmdb tools which depends on unistd.h

* Fix the string encoding issue in Python3

* Update lmdb library name in CMakeLists.txt
This commit is contained in:
Bo Wang 2017-06-05 11:41:32 -07:00 committed by Rasmus Munk Larsen
parent 8a46d7a299
commit e6f5818636
20 changed files with 397 additions and 1 deletions

View File

@ -113,6 +113,7 @@ include(zlib)
include(gif)
include(png)
include(jpeg)
include(lmdb)
include(eigen)
include(gemmlowp)
include(jsoncpp)
@ -129,6 +130,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
${gif_STATIC_LIBRARIES}
${png_STATIC_LIBRARIES}
${jpeg_STATIC_LIBRARIES}
${lmdb_STATIC_LIBRARIES}
${jsoncpp_STATIC_LIBRARIES}
${farmhash_STATIC_LIBRARIES}
${fft2d_STATIC_LIBRARIES}
@ -140,6 +142,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
gif_copy_headers_to_destination
png_copy_headers_to_destination
jpeg_copy_headers_to_destination
lmdb_copy_headers_to_destination
jsoncpp
farmhash_copy_headers_to_destination
highwayhash_copy_headers_to_destination
@ -158,6 +161,7 @@ include_directories(
${gif_INCLUDE_DIR}
${png_INCLUDE_DIR}
${jpeg_INCLUDE_DIR}
${lmdb_INCLUDE_DIR}
${eigen_INCLUDE_DIRS}
${gemmlowp_INCLUDE_DIR}
${jsoncpp_INCLUDE_DIR}

View File

@ -0,0 +1,60 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
include(ExternalProject)

# Source archive for LMDB and the checksum it must match.
set(lmdb_URL http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz)
set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326)

# Directories used by the external build, and the directory the headers are
# finally copied into.
set(lmdb_BUILD ${CMAKE_BINARY_DIR}/lmdb/src/lmdb)
set(lmdb_INSTALL ${CMAKE_BINARY_DIR}/lmdb/install)
set(lmdb_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/lmdb)

# Download LMDB, copy our own CMakeLists.txt into its source directory, then
# configure, build and install it under ${lmdb_INSTALL}.
ExternalProject_Add(lmdb
    PREFIX lmdb
    URL ${lmdb_URL}
    URL_HASH ${lmdb_HASH}
    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt ${lmdb_BUILD}
    INSTALL_DIR ${lmdb_INSTALL}
    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
    CMAKE_CACHE_ARGS
        -DCMAKE_BUILD_TYPE:STRING=Release
        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
        -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)

# Path of the installed static library; only the file name is
# platform-specific.
if(NOT WIN32)
  set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
else()
  set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
endif()

# Installed headers that get copied into ${lmdb_INCLUDE_DIR}.
set(lmdb_HEADERS
    "${lmdb_INSTALL}/include/lmdb.h"
    "${lmdb_INSTALL}/include/midl.h"
)

## put lmdb includes in the directory where they are expected
add_custom_target(lmdb_create_destination_dir
    COMMAND ${CMAKE_COMMAND} -E make_directory ${lmdb_INCLUDE_DIR}
    DEPENDS lmdb)

add_custom_target(lmdb_copy_headers_to_destination
    DEPENDS lmdb_create_destination_dir)

# One copy command per header, attached to the copy target.
foreach(header_file IN LISTS lmdb_HEADERS)
  add_custom_command(TARGET lmdb_copy_headers_to_destination PRE_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${lmdb_INCLUDE_DIR}/)
endforeach()

View File

@ -0,0 +1,26 @@
# Stand-alone build script for the LMDB library.
# NOTE(review): the source paths below are relative to the root of the LMDB
# source archive, so this file is presumably copied there before configuring.
cmake_minimum_required(VERSION 2.8.3)

# Every source listed below is C, so only the C language is enabled; this
# avoids probing for a C++ compiler that the library never uses.
project(liblmdb C)

set(LIBLMDB_SRCS
    "libraries/liblmdb/mdb.c"
    "libraries/liblmdb/midl.c"
)

set(LIBLMDB_INCLUDES
    "libraries/liblmdb/lmdb.h"
    "libraries/liblmdb/midl.h"
)

include_directories("${CMAKE_CURRENT_SOURCE_DIR}")

# Always build a static archive, independent of BUILD_SHARED_LIBS, so the
# installed artifact is lib/liblmdb.a (lib/lmdb.lib on Windows).
add_library(lmdb STATIC ${LIBLMDB_SRCS})

install(TARGETS lmdb
    RUNTIME DESTINATION bin COMPONENT RuntimeLibraries
    LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
    ARCHIVE DESTINATION lib COMPONENT Development)

# Install the public headers next to the library.
install(FILES ${LIBLMDB_INCLUDES} DESTINATION include COMPONENT Development)

View File

@ -2886,6 +2886,20 @@ filegroup(
visibility = ["//visibility:public"],
)
# Test-only LMDB database file; testonly = 1 restricts it to test targets.
filegroup(
name = "lmdb_testdata",
testonly = 1,
srcs = [
# A simple key-value store:
# 0 : 'a'
# 1 : 'b'
# ...
# 9 : 'j'
"lib/lmdb/testdata/data.mdb",
],
visibility = ["//visibility:public"],
)
filegroup(
name = "example_parser_configuration_testdata",
srcs = [

View File

@ -481,7 +481,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
"MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable",
"WholeFileReader", "TextLineReader", "FixedLengthRecordReader",
"TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter",
"RefNextIteration", "RefMerge", "RefIdentity",
"RefNextIteration", "RefMerge", "RefIdentity", "LMDBReader",
// To be removed after 2017/04/24.
"ConditionalAccumulator", "SparseConditionalAccumulator", "Table",
};

View File

@ -1904,6 +1904,7 @@ cc_library(
deps = [
":fixed_length_record_reader_op",
":identity_reader_op",
":lmdb_reader_op",
":matching_files_op",
":reader_ops",
":restore_op",
@ -1938,6 +1939,14 @@ tf_kernel_library(
deps = IO_DEPS,
)
# Kernel library for the LMDBReader op: the shared IO_DEPS plus the external
# @lmdb repository.
tf_kernel_library(
    name = "lmdb_reader_op",
    prefix = "lmdb_reader_op",
    deps = IO_DEPS + ["@lmdb"],
)
tf_kernel_library(
name = "matching_files_op",
prefix = "matching_files_op",
@ -4313,6 +4322,7 @@ filegroup(
# not used on Android. Those ops also do not compile if included,
# unless we add the additional deps they need.
"tf_record_reader_op.*",
"lmdb_reader_op.*",
"string_to_hash_bucket_op.*",
"sdca_ops.*",
"sdca_internal.*",

View File

@ -0,0 +1,134 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "lmdb.h"
#include "tensorflow/core/framework/reader_op_kernel.h"
#include "tensorflow/core/framework/reader_base.h"
#include "tensorflow/core/lib/core/errors.h"
#include <sys/stat.h>
namespace tensorflow {
// CHECK-fails unless `mdb_status` equals MDB_SUCCESS; the text returned by
// mdb_strerror(mdb_status) is streamed into the failure message.
inline void MDB_CHECK(int mdb_status) {
CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
}
class LMDBReader : public ReaderBase {
public:
LMDBReader(const string& node_name, Env* env)
: ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
env_(env),
mdb_env_(nullptr),
mdb_dbi_(0),
mdb_txn_(nullptr),
mdb_cursor_(nullptr) {}
Status OnWorkStartedLocked() override {
MDB_CHECK(mdb_env_create(&mdb_env_));
int flags = MDB_RDONLY | MDB_NOTLS;
// Check if the LMDB filename is actually a file instead of a directory.
// If so, set appropriate flags so we can open it.
struct stat source_stat;
if (stat(current_work().c_str(), &source_stat) == 0 &&
(source_stat.st_mode & S_IFREG)) {
flags |= MDB_NOSUBDIR;
}
MDB_CHECK(mdb_env_open(mdb_env_, current_work().c_str(), flags, 0664));
MDB_CHECK(mdb_txn_begin(mdb_env_, nullptr, MDB_RDONLY, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, nullptr, 0, &mdb_dbi_));
return Status::OK();
}
Status OnWorkFinishedLocked() override {
if (mdb_env_ != nullptr) {
if (mdb_cursor_) {
mdb_cursor_close(mdb_cursor_);
}
mdb_txn_abort(mdb_txn_);
mdb_dbi_close(mdb_env_, mdb_dbi_);
mdb_env_close(mdb_env_);
mdb_env_ = nullptr;
}
return Status::OK();
}
Status ReadLocked(string* key, string* value, bool* produced,
bool* at_end) override {
if (mdb_cursor_ == nullptr) {
MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
if (Seek(MDB_FIRST) == false) {
*at_end = true;
return Status::OK();
}
}
else {
if (Seek(MDB_NEXT) == false) {
*at_end = true;
return Status::OK();
}
}
*key = string(static_cast<const char*>(mdb_key_.mv_data),
mdb_key_.mv_size);
*value = string(static_cast<const char*>(mdb_value_.mv_data),
mdb_value_.mv_size);
*produced = true;
return Status::OK();
}
Status ResetLocked() override {
CHECK_EQ(Seek(MDB_FIRST), true);
return ReaderBase::ResetLocked();
}
private:
bool Seek(MDB_cursor_op op) {
CHECK_NOTNULL(mdb_cursor_);
int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
if (mdb_status == MDB_NOTFOUND) {
return false;
} else {
MDB_CHECK(mdb_status);
return true;
}
}
Env* const env_;
MDB_env* mdb_env_;
MDB_dbi mdb_dbi_;
MDB_txn* mdb_txn_;
MDB_cursor* mdb_cursor_;
MDB_val mdb_key_, mdb_value_;
};
// Op kernel that installs a factory creating an LMDBReader named after this
// kernel and sharing the construction context's Env.
class LMDBReaderOp : public ReaderOpKernel {
 public:
  explicit LMDBReaderOp(OpKernelConstruction* context)
      : ReaderOpKernel(context) {
    Env* const kernel_env = context->env();
    auto make_reader = [this, kernel_env]() {
      return new LMDBReader(name(), kernel_env);
    };
    SetReaderFactory(make_reader);
  }
};
// Registers LMDBReaderOp as the CPU kernel for the "LMDBReader" op.
REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
LMDBReaderOp);
}

Binary file not shown.

View File

@ -520,6 +520,21 @@ shared_name: If non-empty, this reader is named in the given bucket
with this shared_name. Otherwise, the node name is used instead.
)doc");
// Stateful reader op with a single ref-typed string output (the reader
// handle). The summary says "database" rather than "file" so that it matches
// the published op description.
REGISTER_OP("LMDBReader")
    .Output("reader_handle: Ref(string)")
    .Attr("container: string = ''")
    .Attr("shared_name: string = ''")
    .SetIsStateful()
    .SetShapeFn(TwoElementOutput)
    .Doc(R"doc(
A Reader that outputs the records from a LMDB database.

reader_handle: The handle to reference the Reader.
container: If non-empty, this reader is placed in the given container.
Otherwise, a default container is used.
shared_name: If non-empty, this reader is named in the given bucket
with this shared_name. Otherwise, the node name is used instead.
)doc");
// TODO(cwhipkey): mark this deprecated in favor of V2.
REGISTER_OP("IdentityReader")
.Output("reader_handle: Ref(string)")

View File

@ -26049,6 +26049,33 @@ op {
summary: "A Reader that outputs the records from a TensorFlow Records file."
is_stateful: true
}
op {
name: "LMDBReader"
output_arg {
name: "reader_handle"
description: "The handle to reference the Reader."
type: DT_STRING
is_ref: true
}
attr {
name: "container"
type: "string"
default_value {
s: ""
}
description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
}
attr {
name: "shared_name"
type: "string"
default_value {
s: ""
}
description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
}
summary: "A Reader that outputs the records from a LMDB database."
is_stateful: true
}
op {
name: "TakeDataset"
input_arg {

View File

@ -437,6 +437,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
'WholeFileReader', 'TextLineReader',
'FixedLengthRecordReader',
'TFRecordReader', 'IdentityReader',
'LMDBReader',
'RefSwitch', 'RefEnter', 'RefNextIteration',
'RefMerge', 'RefIdentity']:
pass

View File

@ -979,6 +979,7 @@ tf_py_test(
"//tensorflow/python:util",
"//tensorflow/python:variables",
],
data = ["//tensorflow/core:lmdb_testdata"],
)
cuda_py_test(

View File

@ -858,5 +858,48 @@ class AsyncReaderTest(test.TestCase):
output.append(sess.run(args))
class LMDBReaderTest(test.TestCase):
  """Reads the checked-in LMDB test database through io_ops.LMDBReader.

  The database holds ten records whose keys are "0".."9" and whose values are
  "a".."j".
  """

  def _assertReadsTestDatabase(self, reader_name, *extra_path_parts):
    """Reads every record from the test database and checks EOF handling.

    Args:
      reader_name: Name given to the LMDBReader op.
      *extra_path_parts: Path components appended to the testdata directory;
        empty to open the directory itself, or a file name to open that file.
    """
    with self.test_session() as sess:
      reader = io_ops.LMDBReader(name=reader_name)
      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata",
                          *extra_path_parts)
      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
      key, value = reader.read(queue)

      queue.enqueue([path]).run()
      queue.close().run()
      for i in range(10):
        k, v = sess.run([key, value])
        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
        self.assertAllEqual(compat.as_bytes(v),
                            compat.as_bytes(chr(ord("a") + i)))

      # The queue is closed and drained, so one more read must fail.
      with self.assertRaisesOpError("is closed and has insufficient elements "
                                    "\\(requested 1, current size 0\\)"):
        sess.run([key, value])

  def testReadFromFile(self):
    self._assertReadsTestDatabase("test_read_from_file", "data.mdb")

  def testReadFromFolder(self):
    self._assertReadsTestDatabase("test_read_from_folder")
if __name__ == "__main__":
test.main()

View File

@ -191,6 +191,7 @@ WholeFileReader
TextLineReaderV2
TFRecordReaderV2
WholeFileReaderV2
LMDBReader
# linalg_ops
BatchCholesky

View File

@ -26,6 +26,7 @@ See the @{$python/io_ops} guide.
@@WholeFileReader
@@IdentityReader
@@TFRecordReader
@@LMDBReader
@@FixedLengthRecordReader
@@decode_csv
@@decode_raw
@ -443,6 +444,25 @@ class TFRecordReader(ReaderBase):
ops.NotDifferentiable("TFRecordReader")
class LMDBReader(ReaderBase):
  """A Reader that outputs the records from a LMDB database.

  See ReaderBase for supported methods.
  """

  def __init__(self, name=None, options=None):
    """Create a LMDBReader.

    Args:
      name: A name for the operation (optional).
      options: Currently unused; it is accepted but not passed on to the
        underlying op.
    """
    rr = gen_io_ops._lmdb_reader(name=name)
    super(LMDBReader, self).__init__(rr)


# Register the "LMDBReader" op as not differentiable.
ops.NotDifferentiable("LMDBReader")
class IdentityReader(ReaderBase):
"""A Reader that outputs the queued work as both the key and value.

View File

@ -97,6 +97,7 @@ genrule(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
"@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
"@png_archive//:LICENSE",
"@protobuf//:LICENSE",
@ -126,6 +127,7 @@ genrule(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
"@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
"@png_archive//:LICENSE",
"@protobuf//:LICENSE",

View File

@ -110,6 +110,7 @@ filegroup(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
"@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
"@nanopb_git//:LICENSE.txt",
"@org_html5lib//:LICENSE",

View File

@ -46,6 +46,7 @@ BLACKLIST = [
"//tensorflow/python:tf_optimizer",
"//tensorflow/python:compare_test_proto_py",
"//tensorflow/core:image_testdata",
"//tensorflow/core:lmdb_testdata",
"//tensorflow/core/kernels/cloud:bigquery_reader_ops",
"//tensorflow/python/feature_column:vocabulary_testdata",
"//tensorflow/python:framework/test_file_system.so",

View File

@ -507,6 +507,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
repository = tf_repo_name,
)
# External repository "lmdb": the LMDB 0.9.19 source archive (mirror first,
# then GitHub), verified by sha256. strip_prefix makes libraries/liblmdb the
# repository root, and //third_party:lmdb.BUILD is used as its BUILD file.
native.new_http_archive(
name = "lmdb",
urls = [
"http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
"https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
],
sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
build_file = str(Label("//third_party:lmdb.BUILD")),
)
native.new_http_archive(
name = "jsoncpp_git",
urls = [

25
third_party/lmdb.BUILD vendored Normal file
View File

@ -0,0 +1,25 @@
# Description:
#   LMDB is the Lightning Memory-mapped Database.

licenses(["notice"])  # OpenLDAP Public License

exports_files(["LICENSE"])

# The LMDB library: two C sources and their two headers, compiled with
# warnings disabled (-w) and linked against pthread.
cc_library(
    name = "lmdb",
    srcs = [
        "mdb.c",
        "midl.c",
    ],
    hdrs = [
        "lmdb.h",
        "midl.h",
    ],
    copts = ["-w"],
    linkopts = ["-lpthread"],
    visibility = ["//visibility:public"],
)