New reader for LMDB databases (#9950)

* Add LMDBReader op and test case * Add testcase to load LMDB from a folder * Add tensorflow/core/lib/lmdb/testdata/data.mdb * Add EOF test * Add license export * Blacklist the test data in pip_smoke_test.py * Address issues with respect to review * Add LICENSE to BUILD rules * Remove the prefix of LICENSE * Wrap key with compat.as_bytes() * Fixed a compilation flag * Improve BUILD rules * Support LMDB build in cmake * Fix BUILD file format with buildifier * Add fake unistd.h for lmdb to build on Windows * Avoid building lmdb tools which depend on unistd.h * Fix the string encoding issue in Python3 * Update lmdb library name in CMakeLists.txt
2017-06-05 11:41:32 -07:00 · 2017-06-05 11:41:32 -07:00 · e6f5818636
commit e6f5818636
parent 8a46d7a299
20 changed files with 397 additions and 1 deletions
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@ -113,6 +113,7 @@ include(zlib)
 include(gif)
 include(png)
 include(jpeg)
+include(lmdb)
 include(eigen)
 include(gemmlowp)
 include(jsoncpp)
@ -129,6 +130,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
    ${gif_STATIC_LIBRARIES}
    ${png_STATIC_LIBRARIES}
    ${jpeg_STATIC_LIBRARIES}
+    ${lmdb_STATIC_LIBRARIES}
    ${jsoncpp_STATIC_LIBRARIES}
    ${farmhash_STATIC_LIBRARIES}
    ${fft2d_STATIC_LIBRARIES}
@ -140,6 +142,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
    gif_copy_headers_to_destination
    png_copy_headers_to_destination
    jpeg_copy_headers_to_destination
+    lmdb_copy_headers_to_destination
    jsoncpp
    farmhash_copy_headers_to_destination
    highwayhash_copy_headers_to_destination
@ -158,6 +161,7 @@ include_directories(
    ${gif_INCLUDE_DIR}
    ${png_INCLUDE_DIR}
    ${jpeg_INCLUDE_DIR}
+    ${lmdb_INCLUDE_DIR}
    ${eigen_INCLUDE_DIRS}
    ${gemmlowp_INCLUDE_DIR}
    ${jsoncpp_INCLUDE_DIR}
--- a/tensorflow/contrib/cmake/external/lmdb.cmake
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@ -0,0 +1,60 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include(ExternalProject)
+
+# Directory the LMDB headers are copied into for the rest of the build.
+set(lmdb_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/lmdb")
+set(lmdb_URL http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz)
+set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326)
+# Source and install trees used by the external project below.
+set(lmdb_BUILD "${CMAKE_BINARY_DIR}/lmdb/src/lmdb")
+set(lmdb_INSTALL "${CMAKE_BINARY_DIR}/lmdb/install")
+
+# Download LMDB, copy the CMakeLists.txt from patches/lmdb into its source
+# tree, then configure, build and install it under ${lmdb_INSTALL}.
+ExternalProject_Add(lmdb
+    PREFIX lmdb
+    URL ${lmdb_URL}
+    URL_HASH ${lmdb_HASH}
+    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt"
+        "${lmdb_BUILD}"
+    INSTALL_DIR "${lmdb_INSTALL}"
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
+
+# Path of the installed static library; the file name differs on Windows.
+if(WIN32)
+    set(lmdb_STATIC_LIBRARIES "${lmdb_INSTALL}/lib/lmdb.lib")
+else()
+    set(lmdb_STATIC_LIBRARIES "${lmdb_INSTALL}/lib/liblmdb.a")
+endif()
+
+# Headers installed by the external project that must be republished.
+set(lmdb_HEADERS
+    "${lmdb_INSTALL}/include/lmdb.h"
+    "${lmdb_INSTALL}/include/midl.h"
+)
+
+## put lmdb includes in the directory where they are expected
+add_custom_target(lmdb_create_destination_dir
+    COMMAND ${CMAKE_COMMAND} -E make_directory "${lmdb_INCLUDE_DIR}"
+    DEPENDS lmdb
+    VERBATIM)
+
+add_custom_target(lmdb_copy_headers_to_destination
+    DEPENDS lmdb_create_destination_dir)
+
+# One copy command per header. VERBATIM keeps argument escaping identical
+# across generators and platforms.
+foreach(header_file ${lmdb_HEADERS})
+  add_custom_command(TARGET lmdb_copy_headers_to_destination PRE_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different
+          "${header_file}" "${lmdb_INCLUDE_DIR}/"
+      VERBATIM)
+endforeach()
--- a/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt
@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+# Every source listed below is C, so only the C language is enabled; this
+# avoids requiring a C++ compiler just to configure the project.
+project(liblmdb C)
+
+# Sources of the library itself. No other LMDB targets are defined here.
+set(LIBLMDB_SRCS
+    "libraries/liblmdb/mdb.c"
+    "libraries/liblmdb/midl.c"
+)
+
+# Public headers installed next to the library.
+set(LIBLMDB_INCLUDES
+    "libraries/liblmdb/lmdb.h"
+    "libraries/liblmdb/midl.h"
+)
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+
+add_library(lmdb ${LIBLMDB_SRCS})
+
+install(TARGETS lmdb
+  RUNTIME DESTINATION bin COMPONENT RuntimeLibraries
+  LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
+  ARCHIVE DESTINATION lib COMPONENT Development)
+
+# install(FILES) accepts the whole list, so no per-file loop is needed.
+install(FILES ${LIBLMDB_INCLUDES} DESTINATION include COMPONENT Development)
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -2886,6 +2886,20 @@ filegroup(
    visibility = ["//visibility:public"],
 )

+# LMDB fixture used by tests. data.mdb is a simple key-value store:
+#   0 : 'a'
+#   1 : 'b'
+#    ...
+#   9 : 'j'
+filegroup(
+    name = "lmdb_testdata",
+    testonly = 1,
+    srcs = ["lib/lmdb/testdata/data.mdb"],
+    visibility = ["//visibility:public"],
+)
+
 filegroup(
    name = "example_parser_configuration_testdata",
    srcs = [
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@ -481,7 +481,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
          "MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable",
          "WholeFileReader", "TextLineReader", "FixedLengthRecordReader",
          "TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter",
-          "RefNextIteration", "RefMerge", "RefIdentity",
+          "RefNextIteration", "RefMerge", "RefIdentity", "LMDBReader",
          // To be removed after 2017/04/24.
          "ConditionalAccumulator", "SparseConditionalAccumulator", "Table",
      };
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -1904,6 +1904,7 @@ cc_library(
    deps = [
        ":fixed_length_record_reader_op",
        ":identity_reader_op",
+        ":lmdb_reader_op",
        ":matching_files_op",
        ":reader_ops",
        ":restore_op",
@ -1938,6 +1939,14 @@ tf_kernel_library(
    deps = IO_DEPS,
 )

+# Kernel library for the LMDB reader; depends on the external @lmdb
+# repository in addition to the shared IO_DEPS.
+tf_kernel_library(
+    name = "lmdb_reader_op",
+    prefix = "lmdb_reader_op",
+    deps = IO_DEPS + ["@lmdb"],
+)
+
 tf_kernel_library(
    name = "matching_files_op",
    prefix = "matching_files_op",
@ -4313,6 +4322,7 @@ filegroup(
            # not used on Android. Those ops also do not compile if included,
            # unless we add the additional deps they need.
            "tf_record_reader_op.*",
+            "lmdb_reader_op.*",
            "string_to_hash_bucket_op.*",
            "sdca_ops.*",
            "sdca_internal.*",
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "lmdb.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+#include <sys/stat.h>
+
+namespace tensorflow {
+
+// Fails a CHECK unless `mdb_status` equals MDB_SUCCESS, attaching the text
+// returned by mdb_strerror() for that status to the failure message.
+inline void MDB_CHECK(int mdb_status) {
+  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+}
+
+// Reader that produces the key/value records of an LMDB database, one record
+// per read. Each work item is a path to either a single LMDB data file or a
+// directory holding one; the database is opened read-only and records are
+// returned in cursor order (MDB_FIRST, then MDB_NEXT).
+//
+// LMDB failures are reported through the returned Status instead of
+// aborting the process, and every handle is released (and its pointer
+// cleared) when a work item finishes, when the reader is reset, and when the
+// reader is destroyed.
+class LMDBReader : public ReaderBase {
+ public:
+  // `node_name` is used only to label the reader; `env` is stored as-is.
+  LMDBReader(const string& node_name, Env* env)
+      : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
+        env_(env),
+        mdb_env_(nullptr),
+        mdb_dbi_(0),
+        mdb_txn_(nullptr),
+        mdb_cursor_(nullptr) {}
+
+  // Releases any database that is still open for an unfinished work item.
+  ~LMDBReader() override { ReleaseResources(); }
+
+  // Opens the database named by current_work() and begins a read-only
+  // transaction on it. Returns an error, with all handles released, if any
+  // LMDB call fails.
+  Status OnWorkStartedLocked() override {
+    int mdb_status = mdb_env_create(&mdb_env_);
+    if (mdb_status != MDB_SUCCESS) {
+      mdb_env_ = nullptr;
+      return MdbError("create an environment for", mdb_status);
+    }
+
+    int flags = MDB_RDONLY | MDB_NOTLS;
+
+    // Check if the LMDB filename is actually a file instead of a directory.
+    // If so, set appropriate flags so we can open it. The file type is
+    // compared under the S_IFMT mask because other file types share bits
+    // with S_IFREG.
+    struct stat source_stat;
+    if (stat(current_work().c_str(), &source_stat) == 0 &&
+        (source_stat.st_mode & S_IFMT) == S_IFREG) {
+      flags |= MDB_NOSUBDIR;
+    }
+
+    mdb_status = mdb_env_open(mdb_env_, current_work().c_str(), flags, 0664);
+    if (mdb_status != MDB_SUCCESS) {
+      return FailAndRelease("open", mdb_status);
+    }
+
+    mdb_status = mdb_txn_begin(mdb_env_, nullptr, MDB_RDONLY, &mdb_txn_);
+    if (mdb_status != MDB_SUCCESS) {
+      mdb_txn_ = nullptr;
+      return FailAndRelease("begin a read transaction on", mdb_status);
+    }
+
+    mdb_status = mdb_dbi_open(mdb_txn_, nullptr, 0, &mdb_dbi_);
+    if (mdb_status != MDB_SUCCESS) {
+      return FailAndRelease("open the main database of", mdb_status);
+    }
+
+    return Status::OK();
+  }
+
+  // Closes everything opened for the current work item so that the next
+  // work item starts from a clean state.
+  Status OnWorkFinishedLocked() override {
+    ReleaseResources();
+    return Status::OK();
+  }
+
+  // Reads the next record into `*key` / `*value` and sets `*produced`, or
+  // sets `*at_end` once the cursor has moved past the last record.
+  Status ReadLocked(string* key, string* value, bool* produced,
+                    bool* at_end) override {
+    // The first read of a work item opens the cursor and positions it on the
+    // first record; later reads simply advance it.
+    MDB_cursor_op op = MDB_NEXT;
+    if (mdb_cursor_ == nullptr) {
+      const int open_status =
+          mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_);
+      if (open_status != MDB_SUCCESS) {
+        mdb_cursor_ = nullptr;
+        return MdbError("open a cursor on", open_status);
+      }
+      op = MDB_FIRST;
+    }
+
+    const int get_status =
+        mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
+    if (get_status == MDB_NOTFOUND) {
+      *at_end = true;
+      return Status::OK();
+    }
+    if (get_status != MDB_SUCCESS) {
+      return MdbError("read a record from", get_status);
+    }
+
+    *key = string(static_cast<const char*>(mdb_key_.mv_data),
+                  mdb_key_.mv_size);
+    *value = string(static_cast<const char*>(mdb_value_.mv_data),
+                    mdb_value_.mv_size);
+    *produced = true;
+    return Status::OK();
+  }
+
+  // Drops the open database, if any, before resetting the base class state.
+  // Safe to call before any record has been read and on an empty database.
+  // NOTE(review): relies on ReaderBase::ResetLocked() discarding the current
+  // work item so the next read reopens a database - confirm against
+  // reader_base.cc.
+  Status ResetLocked() override {
+    ReleaseResources();
+    return ReaderBase::ResetLocked();
+  }
+
+ private:
+  // Builds the error returned when an LMDB call fails for the current work.
+  Status MdbError(const char* action, int mdb_status) {
+    return errors::InvalidArgument("LMDBReader failed to ", action, " '",
+                                   current_work(), "': ",
+                                   mdb_strerror(mdb_status));
+  }
+
+  // Like MdbError(), but also releases every handle opened so far.
+  Status FailAndRelease(const char* action, int mdb_status) {
+    const Status error = MdbError(action, mdb_status);
+    ReleaseResources();
+    return error;
+  }
+
+  // Closes the cursor, transaction, database handle and environment (in that
+  // order) and clears the pointers so a stale handle is never reused.
+  void ReleaseResources() {
+    if (mdb_cursor_ != nullptr) {
+      mdb_cursor_close(mdb_cursor_);
+      mdb_cursor_ = nullptr;
+    }
+    if (mdb_txn_ != nullptr) {
+      mdb_txn_abort(mdb_txn_);
+      mdb_txn_ = nullptr;
+    }
+    if (mdb_env_ != nullptr) {
+      mdb_dbi_close(mdb_env_, mdb_dbi_);
+      mdb_env_close(mdb_env_);
+      mdb_env_ = nullptr;
+    }
+    mdb_dbi_ = 0;
+  }
+
+  Env* const env_;
+  MDB_env* mdb_env_;
+  MDB_dbi mdb_dbi_;
+
+  MDB_txn* mdb_txn_;
+  MDB_cursor* mdb_cursor_;
+  MDB_val mdb_key_, mdb_value_;
+};
+
+// Op kernel that owns an LMDBReader; the reader itself is built on demand by
+// the factory installed in the constructor.
+class LMDBReaderOp : public ReaderOpKernel {
+ public:
+  explicit LMDBReaderOp(OpKernelConstruction* context)
+      : ReaderOpKernel(context) {
+    Env* const kernel_env = context->env();
+    // The factory captures the kernel so the reader is labelled with its name.
+    auto make_reader = [this, kernel_env]() {
+      return new LMDBReader(name(), kernel_env);
+    };
+    SetReaderFactory(make_reader);
+  }
+};
+
+// The LMDBReader kernel is registered for CPU only.
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp);
+
+}
--- a/tensorflow/core/lib/lmdb/testdata/data.mdb
+++ b/tensorflow/core/lib/lmdb/testdata/data.mdb
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@ -520,6 +520,21 @@ shared_name: If non-empty, this reader is named in the given bucket
             with this shared_name. Otherwise, the node name is used instead.
 )doc");

+// Stateful reader op whose single output is the reader handle. The summary
+// line matches the one recorded for this op in ops.pbtxt.
+REGISTER_OP("LMDBReader")
+    .Output("reader_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+A Reader that outputs the records from a LMDB database.
+
+reader_handle: The handle to reference the Reader.
+container: If non-empty, this reader is placed in the given container.
+        Otherwise, a default container is used.
+shared_name: If non-empty, this reader is named in the given bucket
+             with this shared_name. Otherwise, the node name is used instead.
+)doc");
+
 // TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("IdentityReader")
    .Output("reader_handle: Ref(string)")
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@ -26049,6 +26049,33 @@ op {
  summary: "A Reader that outputs the records from a TensorFlow Records file."
  is_stateful: true
 }
+op {
+  name: "LMDBReader"
+  output_arg {
+    name: "reader_handle"
+    description: "The handle to reference the Reader."
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
+  }
+  summary: "A Reader that outputs the records from a LMDB database."
+  is_stateful: true
+}
 op {
  name: "TakeDataset"
  input_arg {
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@ -437,6 +437,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
                           'WholeFileReader', 'TextLineReader',
                           'FixedLengthRecordReader',
                           'TFRecordReader', 'IdentityReader',
+                           'LMDBReader',
                           'RefSwitch', 'RefEnter', 'RefNextIteration',
                           'RefMerge', 'RefIdentity']:
              pass
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@ -979,6 +979,7 @@ tf_py_test(
        "//tensorflow/python:util",
        "//tensorflow/python:variables",
    ],
+    data = ["//tensorflow/core:lmdb_testdata"],
 )

 cuda_py_test(
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@ -858,5 +858,48 @@ class AsyncReaderTest(test.TestCase):
    output.append(sess.run(args))


+class LMDBReaderTest(test.TestCase):
+
+  def setUp(self):
+    super(LMDBReaderTest, self).setUp()
+
+  def testReadFromFile(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_file")
+      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata",
+                          "data.mdb")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue([path]).run()
+      queue.close().run()
+      for i in range(10):
+        k, v = sess.run([key, value])
+        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+        self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
+
+  def testReadFromFolder(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_folder")
+      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue([path]).run()
+      queue.close().run()
+      for i in range(10):
+        k, v = sess.run([key, value])
+        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+        self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
+
+
 if __name__ == "__main__":
  test.main()
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@ -191,6 +191,7 @@ WholeFileReader
 TextLineReaderV2
 TFRecordReaderV2
 WholeFileReaderV2
+LMDBReader

 # linalg_ops
 BatchCholesky
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@ -26,6 +26,7 @@ See the @{$python/io_ops} guide.
@@WholeFileReader
@@IdentityReader
@@TFRecordReader
+@@LMDBReader
@@FixedLengthRecordReader
@@decode_csv
@@decode_raw
@ -443,6 +444,25 @@ class TFRecordReader(ReaderBase):
 ops.NotDifferentiable("TFRecordReader")


+class LMDBReader(ReaderBase):
+  """A Reader that outputs the records from a LMDB file.
+
+  See ReaderBase for supported methods.
+  """
+  def __init__(self, name=None, options=None):
+    """Create a LMDBReader.
+
+    Args:
+      name: A name for the operation (optional).
+      options: A LMDBRecordOptions object (optional).
+    """
+    rr = gen_io_ops._lmdb_reader(name=name)
+    super(LMDBReader, self).__init__(rr)
+
+
+ops.NotDifferentiable("LMDBReader")
+
+
 class IdentityReader(ReaderBase):
  """A Reader that outputs the queued work as both the key and value.

--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@ -97,6 +97,7 @@ genrule(
        "@jemalloc//:COPYING",
        "@jpeg//:LICENSE.md",
        "@libxsmm_archive//:LICENSE",
+        "@lmdb//:LICENSE",
        "@local_config_sycl//sycl:LICENSE.text",
        "@png_archive//:LICENSE",
        "@protobuf//:LICENSE",
@ -126,6 +127,7 @@ genrule(
        "@jemalloc//:COPYING",
        "@jpeg//:LICENSE.md",
        "@libxsmm_archive//:LICENSE",
+        "@lmdb//:LICENSE",
        "@local_config_sycl//sycl:LICENSE.text",
        "@png_archive//:LICENSE",
        "@protobuf//:LICENSE",
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@ -110,6 +110,7 @@ filegroup(
        "@jemalloc//:COPYING",
        "@jpeg//:LICENSE.md",
        "@libxsmm_archive//:LICENSE",
+        "@lmdb//:LICENSE",
        "@local_config_sycl//sycl:LICENSE.text",
        "@nanopb_git//:LICENSE.txt",
        "@org_html5lib//:LICENSE",
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@ -46,6 +46,7 @@ BLACKLIST = [
    "//tensorflow/python:tf_optimizer",
    "//tensorflow/python:compare_test_proto_py",
    "//tensorflow/core:image_testdata",
+    "//tensorflow/core:lmdb_testdata",
    "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
    "//tensorflow/python/feature_column:vocabulary_testdata",
    "//tensorflow/python:framework/test_file_system.so",
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@ -507,6 +507,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
      repository = tf_repo_name,
  )

+  native.new_http_archive(
+    name = "lmdb",
+    urls = [
+      "http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+      "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+    ],
+    sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
+    strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
+    build_file = str(Label("//third_party:lmdb.BUILD")),
+  )
+
  native.new_http_archive(
      name = "jsoncpp_git",
      urls = [
--- a/third_party/lmdb.BUILD
+++ b/third_party/lmdb.BUILD
@ -0,0 +1,25 @@
+# Description:
+#   LMDB is the Lightning Memory-mapped Database.
+
+licenses(["notice"])  # OpenLDAP Public License
+
+# Exported so that other packages can reference the license file by label.
+exports_files(["LICENSE"])
+
+# The LMDB library, built from its two C sources and linked with pthread.
+cc_library(
+    name = "lmdb",
+    srcs = ["mdb.c", "midl.c"],
+    hdrs = ["lmdb.h", "midl.h"],
+    copts = ["-w"],
+    linkopts = ["-lpthread"],
+    visibility = ["//visibility:public"],
+)