Add tool to extract vocabulary from the old LM binary format

2020-01-20 17:25:20 +01:00 · 2020-01-20 17:25:20 +01:00 · 708b21a63e
commit 708b21a63e
parent a156d28504
2 changed files with 82 additions and 17 deletions
--- a/native_client/BUILD
+++ b/native_client/BUILD
@ -27,20 +27,6 @@ genrule(
    tools = [":gen_workspace_status.sh"],
 )

-KENLM_SOURCES = glob(
-    [
-        "kenlm/lm/*.cc",
-        "kenlm/util/*.cc",
-        "kenlm/util/double-conversion/*.cc",
-        "kenlm/lm/*.hh",
-        "kenlm/util/*.hh",
-        "kenlm/util/double-conversion/*.h",
-    ],
-    exclude = [
-        "kenlm/*/*test.cc",
-        "kenlm/*/*main.cc",
-    ],
-)

 OPENFST_SOURCES_PLATFORM = select({
    "//tensorflow:windows": glob(["ctcdecode/third_party/openfst-1.6.9-win/src/lib/*.cc"]),
@ -60,6 +46,27 @@ LINUX_LINKOPTS = [
    "-Wl,-export-dynamic",
 ]

+cc_library(
+    name = "kenlm",
+    srcs = glob([
+        "kenlm/lm/*.cc",
+        "kenlm/util/*.cc",
+        "kenlm/util/double-conversion/*.cc",
+        "kenlm/util/double-conversion/*.h",
+    ],
+    exclude = [
+        "kenlm/*/*test.cc",
+        "kenlm/*/*main.cc",
+    ],),
+    hdrs = glob([
+        "kenlm/lm/*.hh",
+        "kenlm/util/*.hh",
+    ]),
+    copts = ["-std=c++11"],
+    defines = ["KENLM_MAX_ORDER=6"],
+    includes = ["kenlm"],
+)
+
 cc_library(
    name = "decoder",
    srcs = [
@ -69,17 +76,16 @@ cc_library(
        "ctcdecode/scorer.cpp",
        "ctcdecode/path_trie.cpp",
        "ctcdecode/path_trie.h",
-    ] + KENLM_SOURCES + OPENFST_SOURCES_PLATFORM,
+    ] + OPENFST_SOURCES_PLATFORM,
    hdrs = [
        "ctcdecode/ctc_beam_search_decoder.h",
        "ctcdecode/scorer.h",
    ],
-    defines = ["KENLM_MAX_ORDER=6"],
    includes = [
        ".",
        "ctcdecode/third_party/ThreadPool",
-        "kenlm",
    ] + OPENFST_INCLUDES_PLATFORM,
+    deps = [":kenlm"]
 )

 tf_cc_shared_object(
@ -181,6 +187,15 @@ genrule(
    cmd = "dsymutil $(location :libdeepspeech.so) -o $@"
 )

+cc_binary(
+    name = "enumerate_kenlm_vocabulary",
+    srcs = [
+        "enumerate_kenlm_vocabulary.cpp",
+    ],
+    deps = [":kenlm"],
+    copts = ["-std=c++11"],
+)
+
 cc_binary(
    name = "trie_load",
    srcs = [
--- a/native_client/enumerate_kenlm_vocabulary.cpp
+++ b/native_client/enumerate_kenlm_vocabulary.cpp
@ -0,0 +1,50 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+#include "lm/enumerate_vocab.hh"
+#include "lm/virtual_interface.hh"
+#include "lm/word_index.hh"
+#include "lm/model.hh"
+
+const std::string START_TOKEN = "<s>";
+const std::string UNK_TOKEN = "<unk>";
+const std::string END_TOKEN = "</s>";
+
+// Implement a callback to retrieve the dictionary of language model.
+class RetrieveStrEnumerateVocab : public lm::EnumerateVocab
+{
+public:
+  RetrieveStrEnumerateVocab() {}
+
+  void Add(lm::WordIndex index, const StringPiece &str) {
+    vocabulary.push_back(std::string(str.data(), str.length()));
+  }
+
+  std::vector<std::string> vocabulary;
+};
+
+int main(int argc, char** argv)
+{
+  if (argc != 3) {
+    std::cerr << "Usage: " << argv[0] << " <kenlm_model> <output_path>" << std::endl;
+    return -1;
+  }
+
+  const char* kenlm_model = argv[1];
+  const char* output_path = argv[2];
+
+  std::unique_ptr<lm::base::Model> language_model_;
+  lm::ngram::Config config;
+  RetrieveStrEnumerateVocab enumerate;
+  config.enumerate_vocab = &enumerate;
+  language_model_.reset(lm::ngram::LoadVirtual(kenlm_model, config));
+
+  std::ofstream fout(output_path);
+  for (const std::string& word : enumerate.vocabulary) {
+    fout << word << "\n";
+  }
+
+  return 0;
+}