Stop including vocabulary data in lm.binary.

2020-01-15 23:28:35 +01:00 · 2020-01-15 23:28:35 +01:00 · 7c0354483e
commit 7c0354483e
parent d65422c8ab
4 changed files with 32 additions and 54 deletions
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -1,5 +1,5 @@

-lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). KenLM's built binaries must be in your PATH (lmplz, build_binary, filter).
+lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).

 The trie was then generated from the vocabulary of the language model:

--- a/data/lm/generate_lm.py
+++ b/data/lm/generate_lm.py
@@ -50,6 +50,7 @@ def main():
    subprocess.check_call([
      'build_binary', '-a', '255',
                      '-q', '8',
+                      '-v',
                      'trie',
                      filtered_path,
                      'lm.binary'
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
-size 945699324
+oid sha256:cc8d9e5f49e2fa05c56cc928520c6c79cb78ff95226ec9a07785b3a28d1a680b
+size 941235601
--- a/native_client/ctcdecode/scorer.cpp
+++ b/native_client/ctcdecode/scorer.cpp
@@ -77,31 +77,9 @@ void Scorer::setup(const std::string& lm_path, const std::string& trie_path)
  VALID_CHECK_EQ(access(filename, R_OK), 0, "Invalid language model path");

  bool has_trie = trie_path.size() && access(trie_path.c_str(), R_OK) == 0;
+  VALID_CHECK(has_trie, "Invalid trie path");

  lm::ngram::Config config;
-
-  if (!has_trie) { // no trie was specified, build it now
-    RetrieveStrEnumerateVocab enumerate;
-    config.enumerate_vocab = &enumerate;
-    language_model_.reset(lm::ngram::LoadVirtual(filename, config));
-    auto vocab = enumerate.vocabulary;
-    for (size_t i = 0; i < vocab.size(); ++i) {
-      if (vocab[i] != UNK_TOKEN &&
-          vocab[i] != START_TOKEN &&
-          vocab[i] != END_TOKEN &&
-          get_utf8_str_len(vocab[i]) > 1) {
-        is_utf8_mode_ = false;
-        break;
-      }
-    }
-
-    if (alphabet_.GetSize() != 255) {
-      is_utf8_mode_ = false;
-    }
-
-    // Add spaces only in word-based scoring
-    fill_dictionary(vocab);
-  } else {
  config.load_method = util::LoadMethod::LAZY;
  language_model_.reset(lm::ngram::LoadVirtual(filename, config));

@@ -131,8 +109,7 @@ void Scorer::setup(const std::string& lm_path, const std::string& trie_path)
  fst::FstReadOptions opt;
  opt.mode = fst::FstReadOptions::MAP;
  opt.source = trie_path;
-    dictionary.reset(FstType::Read(fin, opt));
-  }
+  dictionary_.reset(FstType::Read(fin, opt));

  max_order_ = language_model_->Order();
 }