Stop including vocabulary data in LM.binary.

This commit is contained in:
Reuben Morais 2020-01-15 23:28:35 +01:00
parent d65422c8ab
commit 7c0354483e
4 changed files with 32 additions and 54 deletions

View File

@ -1,5 +1,5 @@
lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). KenLM's built binaries must be in your PATH (lmplz, build_binary, filter).
lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
The trie was then generated from the vocabulary of the language model:

View File

@ -50,6 +50,7 @@ def main():
subprocess.check_call([
'build_binary', '-a', '255',
'-q', '8',
'-v',
'trie',
filtered_path,
'lm.binary'

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
size 945699324
oid sha256:cc8d9e5f49e2fa05c56cc928520c6c79cb78ff95226ec9a07785b3a28d1a680b
size 941235601

View File

@ -77,31 +77,9 @@ void Scorer::setup(const std::string& lm_path, const std::string& trie_path)
VALID_CHECK_EQ(access(filename, R_OK), 0, "Invalid language model path");
bool has_trie = trie_path.size() && access(trie_path.c_str(), R_OK) == 0;
VALID_CHECK(has_trie, "Invalid trie path");
lm::ngram::Config config;
if (!has_trie) { // no trie was specified, build it now
RetrieveStrEnumerateVocab enumerate;
config.enumerate_vocab = &enumerate;
language_model_.reset(lm::ngram::LoadVirtual(filename, config));
auto vocab = enumerate.vocabulary;
for (size_t i = 0; i < vocab.size(); ++i) {
if (vocab[i] != UNK_TOKEN &&
vocab[i] != START_TOKEN &&
vocab[i] != END_TOKEN &&
get_utf8_str_len(vocab[i]) > 1) {
is_utf8_mode_ = false;
break;
}
}
if (alphabet_.GetSize() != 255) {
is_utf8_mode_ = false;
}
// Add spaces only in word-based scoring
fill_dictionary(vocab);
} else {
config.load_method = util::LoadMethod::LAZY;
language_model_.reset(lm::ngram::LoadVirtual(filename, config));
@ -131,8 +109,7 @@ void Scorer::setup(const std::string& lm_path, const std::string& trie_path)
fst::FstReadOptions opt;
opt.mode = fst::FstReadOptions::MAP;
opt.source = trie_path;
dictionary.reset(FstType::Read(fin, opt));
}
dictionary_.reset(FstType::Read(fin, opt));
max_order_ = language_model_->Order();
}