Merge pull request #1909 from coqui-ai/kenlm-dynamic

Dynamically link KenLM and distribute with packages
This commit is contained in:
Reuben Morais 2021-07-27 00:36:15 +02:00 committed by GitHub
commit 4b2af9ce6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 196 additions and 49 deletions

View File

@ -26,8 +26,14 @@ package_native_client()
win_lib="-C ${tensorflow_dir}/bazel-bin/native_client/ libstt.so.if.lib" win_lib="-C ${tensorflow_dir}/bazel-bin/native_client/ libstt.so.if.lib"
fi; fi;
if [ -f "${tensorflow_dir}/bazel-bin/native_client/libkenlm.so.if.lib" ]; then
win_lib="$win_lib -C ${tensorflow_dir}/bazel-bin/native_client/ libkenlm.so.if.lib"
fi;
${TAR} --verbose -cf - \ ${TAR} --verbose -cf - \
--transform='flags=r;s|README.coqui|KenLM_License_Info.txt|' \
-C ${tensorflow_dir}/bazel-bin/native_client/ libstt.so \ -C ${tensorflow_dir}/bazel-bin/native_client/ libstt.so \
-C ${tensorflow_dir}/bazel-bin/native_client/ libkenlm.so \
${win_lib} \ ${win_lib} \
-C ${tensorflow_dir}/bazel-bin/native_client/ generate_scorer_package \ -C ${tensorflow_dir}/bazel-bin/native_client/ generate_scorer_package \
-C ${stt_dir}/ LICENSE \ -C ${stt_dir}/ LICENSE \

View File

@ -52,6 +52,31 @@ OPENFST_INCLUDES_PLATFORM = select({
"//conditions:default": ["ctcdecode/third_party/openfst-1.6.7/src/include"], "//conditions:default": ["ctcdecode/third_party/openfst-1.6.7/src/include"],
}) })
DECODER_SOURCES = [
"alphabet.cc",
"alphabet.h",
"ctcdecode/ctc_beam_search_decoder.cpp",
"ctcdecode/ctc_beam_search_decoder.h",
"ctcdecode/decoder_utils.cpp",
"ctcdecode/decoder_utils.h",
"ctcdecode/path_trie.cpp",
"ctcdecode/path_trie.h",
"ctcdecode/scorer.cpp",
"ctcdecode/scorer.h",
] + OPENFST_SOURCES_PLATFORM
DECODER_INCLUDES = [
".",
"ctcdecode/third_party/ThreadPool",
"ctcdecode/third_party/object_pool",
] + OPENFST_INCLUDES_PLATFORM
DECODER_LINKOPTS = [
"-lm",
"-ldl",
"-pthread",
]
LINUX_LINKOPTS = [ LINUX_LINKOPTS = [
"-ldl", "-ldl",
"-pthread", "-pthread",
@ -60,10 +85,12 @@ LINUX_LINKOPTS = [
"-Wl,-export-dynamic", "-Wl,-export-dynamic",
] ]
cc_library( tf_cc_shared_object(
name = "kenlm", name = "libkenlm.so",
srcs = glob([ srcs = glob([
"kenlm/lm/*.hh",
"kenlm/lm/*.cc", "kenlm/lm/*.cc",
"kenlm/util/*.hh",
"kenlm/util/*.cc", "kenlm/util/*.cc",
"kenlm/util/double-conversion/*.cc", "kenlm/util/double-conversion/*.cc",
"kenlm/util/double-conversion/*.h", "kenlm/util/double-conversion/*.h",
@ -72,10 +99,25 @@ cc_library(
"kenlm/*/*test.cc", "kenlm/*/*test.cc",
"kenlm/*/*main.cc", "kenlm/*/*main.cc",
],), ],),
copts = [
"-std=c++11"
] + select({
"//tensorflow:windows": [],
"//conditions:default": ["-fvisibility=hidden"],
}),
defines = ["KENLM_MAX_ORDER=6"],
includes = ["kenlm"],
framework_so = [],
linkopts = [],
)
cc_library(
name="kenlm",
hdrs = glob([ hdrs = glob([
"kenlm/lm/*.hh", "kenlm/lm/*.hh",
"kenlm/util/*.hh", "kenlm/util/*.hh",
]), ]),
srcs = ["libkenlm.so"],
copts = ["-std=c++11"], copts = ["-std=c++11"],
defines = ["KENLM_MAX_ORDER=6"], defines = ["KENLM_MAX_ORDER=6"],
includes = ["kenlm"], includes = ["kenlm"],
@ -83,32 +125,11 @@ cc_library(
cc_library( cc_library(
name = "decoder", name = "decoder",
srcs = [ srcs = DECODER_SOURCES,
"ctcdecode/ctc_beam_search_decoder.cpp", includes = DECODER_INCLUDES,
"ctcdecode/decoder_utils.cpp",
"ctcdecode/decoder_utils.h",
"ctcdecode/scorer.cpp",
"ctcdecode/path_trie.cpp",
"ctcdecode/path_trie.h",
"alphabet.cc",
] + OPENFST_SOURCES_PLATFORM,
hdrs = [
"ctcdecode/ctc_beam_search_decoder.h",
"ctcdecode/scorer.h",
"ctcdecode/decoder_utils.h",
"alphabet.h",
],
includes = [
".",
"ctcdecode/third_party/ThreadPool",
"ctcdecode/third_party/object_pool",
] + OPENFST_INCLUDES_PLATFORM,
deps = [":kenlm"], deps = [":kenlm"],
linkopts = [ linkopts = DECODER_LINKOPTS,
"-lm", copts = ["-fexceptions"],
"-ldl",
"-pthread",
],
) )
cc_library( cc_library(
@ -130,8 +151,8 @@ cc_library(
"tfmodelstate.h", "tfmodelstate.h",
"tfmodelstate.cc", "tfmodelstate.cc",
], ],
}), }) + DECODER_SOURCES,
copts = tf_copts() + select({ copts = tf_copts(allow_exceptions=True) + select({
# -fvisibility=hidden is not required on Windows, MSVC hides all declarations by default # -fvisibility=hidden is not required on Windows, MSVC hides all declarations by default
"//tensorflow:windows": ["/w"], "//tensorflow:windows": ["/w"],
# -Wno-sign-compare to silence a lot of warnings from tensorflow itself, # -Wno-sign-compare to silence a lot of warnings from tensorflow itself,
@ -143,16 +164,20 @@ cc_library(
}) + select({ }) + select({
"//native_client:tflite": ["-DUSE_TFLITE"], "//native_client:tflite": ["-DUSE_TFLITE"],
"//conditions:default": ["-UUSE_TFLITE"], "//conditions:default": ["-UUSE_TFLITE"],
}) + tflite_copts(), }),
linkopts = lrt_if_needed() + select({ linkopts = lrt_if_needed() + select({
"//tensorflow:macos": [], "//tensorflow:macos": [],
"//tensorflow:ios": ["-fembed-bitcode"], "//tensorflow:ios": ["-fembed-bitcode"],
"//tensorflow:linux_x86_64": LINUX_LINKOPTS, "//tensorflow:linux_x86_64": LINUX_LINKOPTS,
"//native_client:rpi3": LINUX_LINKOPTS, "//native_client:rpi3": LINUX_LINKOPTS,
"//native_client:rpi3-armv8": LINUX_LINKOPTS, "//native_client:rpi3-armv8": LINUX_LINKOPTS,
"//tensorflow:windows": [], # Bazel has overly strong opinions about static linking, so it's
# near impossible to get it to link a DLL against another DLL on Windows.
# We simply force the linker option manually here as a hacky fix.
"//tensorflow:windows": ["bazel-out/x64_windows-opt/bin/native_client/libkenlm.so.if.lib"],
"//conditions:default": [], "//conditions:default": [],
}) + tflite_linkopts(), }) + tflite_linkopts() + DECODER_LINKOPTS,
includes = DECODER_INCLUDES,
deps = select({ deps = select({
"//native_client:tflite": [ "//native_client:tflite": [
"//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/kernels:builtin_ops",
@ -201,7 +226,7 @@ cc_library(
], ],
}) + if_cuda([ }) + if_cuda([
"//tensorflow/core:core", "//tensorflow/core:core",
]) + [":decoder"], ]) + [":kenlm"],
) )
tf_cc_shared_object( tf_cc_shared_object(
@ -231,9 +256,13 @@ cc_binary(
"generate_scorer_package.cpp", "generate_scorer_package.cpp",
"stt_errors.cc", "stt_errors.cc",
], ],
copts = ["-std=c++11"], copts = select({
"//tensorflow:windows": [],
"//conditions:default": ["-std=c++11"],
}),
deps = [ deps = [
":decoder", ":decoder",
":kenlm",
"@com_google_absl//absl/flags:flag", "@com_google_absl//absl/flags:flag",
"@com_google_absl//absl/flags:parse", "@com_google_absl//absl/flags:parse",
"@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:optional",
@ -247,6 +276,10 @@ cc_binary(
] + select({ ] + select({
# ARMv7: error: Android 5.0 and later only support position-independent executables (-fPIE). # ARMv7: error: Android 5.0 and later only support position-independent executables (-fPIE).
"//tensorflow:android": ["-fPIE -pie"], "//tensorflow:android": ["-fPIE -pie"],
# Bazel has overly strong opinions about static linking, so it's
# near impossible to get it to link a DLL against another DLL on Windows.
# We simply force the linker option manually here as a hacky fix.
"//tensorflow:windows": ["bazel-out/x64_windows-opt/bin/native_client/libkenlm.so.if.lib"],
"//conditions:default": [], "//conditions:default": [],
}), }),
) )
@ -263,9 +296,8 @@ cc_binary(
cc_binary( cc_binary(
name = "trie_load", name = "trie_load",
srcs = [ srcs = [
"alphabet.h",
"trie_load.cc", "trie_load.cc",
], ] + DECODER_SOURCES,
copts = ["-std=c++11"], copts = ["-std=c++11"],
deps = [":decoder"], linkopts = DECODER_LINKOPTS,
) )

View File

@ -20,7 +20,7 @@ endif
STT_BIN := stt$(PLATFORM_EXE_SUFFIX) STT_BIN := stt$(PLATFORM_EXE_SUFFIX)
CFLAGS_STT := -std=c++11 -o $(STT_BIN) CFLAGS_STT := -std=c++11 -o $(STT_BIN)
LINK_STT := -lstt LINK_STT := -lstt -lkenlm
LINK_PATH_STT := -L${TFDIR}/bazel-bin/native_client LINK_PATH_STT := -L${TFDIR}/bazel-bin/native_client
ifeq ($(TARGET),host) ifeq ($(TARGET),host)
@ -61,7 +61,7 @@ TOOL_CC := cl.exe
TOOL_CXX := cl.exe TOOL_CXX := cl.exe
TOOL_LD := link.exe TOOL_LD := link.exe
TOOL_LIBEXE := lib.exe TOOL_LIBEXE := lib.exe
LINK_STT := $(TFDIR)\bazel-bin\native_client\libstt.so.if.lib LINK_STT := $(shell cygpath "$(TFDIR)/bazel-bin/native_client/libstt.so.if.lib") $(shell cygpath "$(TFDIR)/bazel-bin/native_client/libkenlm.so.if.lib")
LINK_PATH_STT := LINK_PATH_STT :=
CFLAGS_STT := -nologo -Fe$(STT_BIN) CFLAGS_STT := -nologo -Fe$(STT_BIN)
SOX_CFLAGS := SOX_CFLAGS :=
@ -185,7 +185,7 @@ define copy_missing_libs
new_missing="$$( (for f in $$(otool -L $$lib 2>/dev/null | tail -n +2 | awk '{ print $$1 }' | grep -v '$$lib'); do ls -hal $$f; done;) 2>&1 | grep 'No such' | cut -d':' -f2 | xargs basename -a)"; \ new_missing="$$( (for f in $$(otool -L $$lib 2>/dev/null | tail -n +2 | awk '{ print $$1 }' | grep -v '$$lib'); do ls -hal $$f; done;) 2>&1 | grep 'No such' | cut -d':' -f2 | xargs basename -a)"; \
missing_libs="$$missing_libs $$new_missing"; \ missing_libs="$$missing_libs $$new_missing"; \
elif [ "$(OS)" = "${CI_MSYS_VERSION}" ]; then \ elif [ "$(OS)" = "${CI_MSYS_VERSION}" ]; then \
missing_libs="libstt.so"; \ missing_libs="libstt.so libkenlm.so"; \
else \ else \
missing_libs="$$missing_libs $$($(LDD) $$lib | grep 'not found' | awk '{ print $$1 }')"; \ missing_libs="$$missing_libs $$($(LDD) $$lib | grep 'not found' | awk '{ print $$1 }')"; \
fi; \ fi; \

View File

@ -50,7 +50,7 @@ configure: stt_wrap.cxx package.json npm-dev
PATH="$(NODE_MODULES_BIN):${PATH}" $(NODE_BUILD_TOOL) configure $(NODE_BUILD_VERBOSE) PATH="$(NODE_MODULES_BIN):${PATH}" $(NODE_BUILD_TOOL) configure $(NODE_BUILD_VERBOSE)
build: configure stt_wrap.cxx build: configure stt_wrap.cxx
PATH="$(NODE_MODULES_BIN):${PATH}" NODE_PRE_GYP_ABI_CROSSWALK=$(NODE_PRE_GYP_ABI_CROSSWALK_FILE) AS=$(AS) CC=$(CC) CXX=$(CXX) LD=$(LD) CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(RPATH_NODEJS) $(LDFLAGS)" LIBS=$(LIBS) $(NODE_BUILD_TOOL) $(NODE_PLATFORM_TARGET) $(NODE_RUNTIME) $(NODE_ABI_TARGET) $(NODE_DEVDIR) $(NODE_DIST_URL) --no-color rebuild $(NODE_BUILD_VERBOSE) PATH="$(NODE_MODULES_BIN):${PATH}" NODE_PRE_GYP_ABI_CROSSWALK=$(NODE_PRE_GYP_ABI_CROSSWALK_FILE) AS=$(AS) CC=$(CC) CXX=$(CXX) LD=$(LD) CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(RPATH_NODEJS) $(LDFLAGS)" LIBS="$(LIBS)" $(NODE_BUILD_TOOL) $(NODE_PLATFORM_TARGET) $(NODE_RUNTIME) $(NODE_ABI_TARGET) $(NODE_DEVDIR) $(NODE_DIST_URL) --no-color rebuild $(NODE_BUILD_VERBOSE)
copy-deps: build copy-deps: build
$(call copy_missing_libs,lib/binding/*/*/*/stt.node,lib/binding/*/*/) $(call copy_missing_libs,lib/binding/*/*/*/stt.node,lib/binding/*/*/)

View File

@ -3,7 +3,7 @@
{ {
"target_name": "stt", "target_name": "stt",
"sources": ["stt_wrap.cxx"], "sources": ["stt_wrap.cxx"],
"libraries": ["$(LIBS)"], "libraries": [],
"include_dirs": ["../"], "include_dirs": ["../"],
"conditions": [ "conditions": [
[ [
@ -20,7 +20,22 @@
], ],
} }
}, },
] ],
[
"OS=='win'",
{
"libraries": [
"../../../tensorflow/bazel-bin/native_client/libstt.so.if.lib",
"../../../tensorflow/bazel-bin/native_client/libkenlm.so.if.lib",
],
},
{
"libraries": [
"../../../tensorflow/bazel-bin/native_client/libstt.so",
"../../../tensorflow/bazel-bin/native_client/libkenlm.so",
],
},
],
], ],
}, },
{ {

View File

@ -13,3 +13,84 @@ git grep 'double_conversion' | cut -d':' -f1 | sort | uniq | xargs sed -ri 's/do
Cherry-pick fix for MSVC: Cherry-pick fix for MSVC:
curl -vsSL https://github.com/kpu/kenlm/commit/d70e28403f07e88b276c6bd9f162d2a428530f2e.patch | git am -p1 --directory=native_client/kenlm curl -vsSL https://github.com/kpu/kenlm/commit/d70e28403f07e88b276c6bd9f162d2a428530f2e.patch | git am -p1 --directory=native_client/kenlm
Most of the KenLM code is licensed under the LGPL. There are exceptions that
have their own licenses, listed below. See comments in those files for more
details.
util/getopt.* is getopt for Windows
util/murmur_hash.cc
util/string_piece.hh and util/string_piece.cc
util/double-conversion/LICENSE covers util/double-conversion except the build files
util/file.cc contains a modified implementation of mkstemp under the LGPL
util/integer_to_string.* is BSD
For the rest:
KenLM is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 2.1 of the License, or
(at your option) any later version.
KenLM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License 2.1
along with KenLM code. If not, see <http://www.gnu.org/licenses/lgpl-2.1.html>.
util/double-conversion:
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
util/integer_to_string.*:
Copyright (C) 2014 Milo Yip
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@ -23,7 +23,7 @@ extern const char *kModelNames[6];
* If so, return true and set recognized to the type. This is the only API in * If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors. * this header designed for use by decoder authors.
*/ */
bool RecognizeBinary(const char *file, ModelType &recognized); KENLM_EXPORT bool RecognizeBinary(const char *file, ModelType &recognized);
struct FixedWidthParameters { struct FixedWidthParameters {
unsigned char order; unsigned char order;

View File

@ -10,13 +10,19 @@
/* Configuration for ngram model. Separate header to reduce pollution. */ /* Configuration for ngram model. Separate header to reduce pollution. */
#if defined _MSC_VER
#define KENLM_EXPORT __declspec(dllexport)
#else
#define KENLM_EXPORT __attribute__ ((visibility("default")))
#endif /* _MSC_VER */
namespace lm { namespace lm {
class EnumerateVocab; class EnumerateVocab;
namespace ngram { namespace ngram {
struct Config { struct KENLM_EXPORT Config {
// EFFECTIVE FOR BOTH ARPA AND BINARY READS // EFFECTIVE FOR BOTH ARPA AND BINARY READS
// (default true) print progress bar to messages // (default true) print progress bar to messages

View File

@ -149,7 +149,7 @@ typedef ProbingModel Model;
/* Autorecognize the file type, load, and return the virtual base class. Don't /* Autorecognize the file type, load, and return the virtual base class. Don't
* use the virtual base class if you can avoid it. Instead, use the above * use the virtual base class if you can avoid it. Instead, use the above
* classes as template arguments to your own virtual feature function.*/ * classes as template arguments to your own virtual feature function.*/
base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING); KENLM_EXPORT base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
} // namespace ngram } // namespace ngram
} // namespace lm } // namespace lm

View File

@ -10,9 +10,16 @@
#include <string> #include <string>
#include <stdint.h> #include <stdint.h>
#if defined _MSC_VER
#define KENLM_EXPORT __declspec(dllexport)
#else
#define KENLM_EXPORT __attribute__ ((visibility("default")))
#endif /* _MSC_VER */
namespace util { namespace util {
class scoped_fd { class KENLM_EXPORT scoped_fd {
public: public:
scoped_fd() : fd_(-1) {} scoped_fd() : fd_(-1) {}
@ -82,7 +89,7 @@ class EndOfFileException : public Exception {
class UnsupportedOSException : public Exception {}; class UnsupportedOSException : public Exception {};
// Open for read only. // Open for read only.
int OpenReadOrThrow(const char *name); KENLM_EXPORT int OpenReadOrThrow(const char *name);
// Create file if it doesn't exist, truncate if it does. Opened for write. // Create file if it doesn't exist, truncate if it does. Opened for write.
int CreateOrThrow(const char *name); int CreateOrThrow(const char *name);
@ -110,7 +117,7 @@ bool OutputPathIsStdout(StringPiece path);
// Return value for SizeFile when it can't size properly. // Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1; const uint64_t kBadSize = (uint64_t)-1;
uint64_t SizeFile(int fd); KENLM_EXPORT uint64_t SizeFile(int fd);
uint64_t SizeOrThrow(int fd); uint64_t SizeOrThrow(int fd);
void ResizeOrThrow(int fd, uint64_t to); void ResizeOrThrow(int fd, uint64_t to);