Stop including vocabulary data in LM.binary.
This commit is contained in:
parent
d65422c8ab
commit
7c0354483e
@ -1,5 +1,5 @@
|
||||
|
||||
lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). KenLM's built binaries must be in your PATH (lmplz, build_binary, filter).
|
||||
``lm.binary`` was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the ``generate_lm.py`` script (which writes ``lm.binary`` to the folder it is run from). The built binaries of `KenLM <https://github.com/kpu/kenlm>`_ (``lmplz``, ``build_binary``, ``filter``) must be in your PATH.
|
||||
|
||||
The trie was then generated from the vocabulary of the language model:
|
||||
|
||||
|
@ -50,6 +50,7 @@ def main():
|
||||
subprocess.check_call([
|
||||
'build_binary', '-a', '255',
|
||||
'-q', '8',
|
||||
'-v',
|
||||
'trie',
|
||||
filtered_path,
|
||||
'lm.binary'
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
|
||||
size 945699324
|
||||
oid sha256:cc8d9e5f49e2fa05c56cc928520c6c79cb78ff95226ec9a07785b3a28d1a680b
|
||||
size 941235601
|
||||
|
@ -77,63 +77,40 @@ void Scorer::setup(const std::string& lm_path, const std::string& trie_path)
|
||||
VALID_CHECK_EQ(access(filename, R_OK), 0, "Invalid language model path");
|
||||
|
||||
bool has_trie = trie_path.size() && access(trie_path.c_str(), R_OK) == 0;
|
||||
VALID_CHECK(has_trie, "Invalid trie path");
|
||||
|
||||
lm::ngram::Config config;
|
||||
config.load_method = util::LoadMethod::LAZY;
|
||||
language_model_.reset(lm::ngram::LoadVirtual(filename, config));
|
||||
|
||||
if (!has_trie) { // no trie was specified, build it now
|
||||
RetrieveStrEnumerateVocab enumerate;
|
||||
config.enumerate_vocab = &enumerate;
|
||||
language_model_.reset(lm::ngram::LoadVirtual(filename, config));
|
||||
auto vocab = enumerate.vocabulary;
|
||||
for (size_t i = 0; i < vocab.size(); ++i) {
|
||||
if (vocab[i] != UNK_TOKEN &&
|
||||
vocab[i] != START_TOKEN &&
|
||||
vocab[i] != END_TOKEN &&
|
||||
get_utf8_str_len(vocab[i]) > 1) {
|
||||
is_utf8_mode_ = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Read metadata and trie from file
|
||||
std::ifstream fin(trie_path, std::ios::binary);
|
||||
|
||||
if (alphabet_.GetSize() != 255) {
|
||||
is_utf8_mode_ = false;
|
||||
}
|
||||
|
||||
// Add spaces only in word-based scoring
|
||||
fill_dictionary(vocab);
|
||||
} else {
|
||||
config.load_method = util::LoadMethod::LAZY;
|
||||
language_model_.reset(lm::ngram::LoadVirtual(filename, config));
|
||||
|
||||
// Read metadata and trie from file
|
||||
std::ifstream fin(trie_path, std::ios::binary);
|
||||
|
||||
int magic;
|
||||
fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
|
||||
if (magic != MAGIC) {
|
||||
std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
|
||||
"your trie file." << std::endl;
|
||||
throw 1;
|
||||
}
|
||||
|
||||
int version;
|
||||
fin.read(reinterpret_cast<char*>(&version), sizeof(version));
|
||||
if (version != FILE_VERSION) {
|
||||
std::cerr << "Error: Trie file version mismatch (" << version
|
||||
<< " instead of expected " << FILE_VERSION
|
||||
<< "). Update your trie file."
|
||||
<< std::endl;
|
||||
throw 1;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char*>(&is_utf8_mode_), sizeof(is_utf8_mode_));
|
||||
|
||||
fst::FstReadOptions opt;
|
||||
opt.mode = fst::FstReadOptions::MAP;
|
||||
opt.source = trie_path;
|
||||
dictionary.reset(FstType::Read(fin, opt));
|
||||
int magic;
|
||||
fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
|
||||
if (magic != MAGIC) {
|
||||
std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
|
||||
"your trie file." << std::endl;
|
||||
throw 1;
|
||||
}
|
||||
|
||||
int version;
|
||||
fin.read(reinterpret_cast<char*>(&version), sizeof(version));
|
||||
if (version != FILE_VERSION) {
|
||||
std::cerr << "Error: Trie file version mismatch (" << version
|
||||
<< " instead of expected " << FILE_VERSION
|
||||
<< "). Update your trie file."
|
||||
<< std::endl;
|
||||
throw 1;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char*>(&is_utf8_mode_), sizeof(is_utf8_mode_));
|
||||
|
||||
fst::FstReadOptions opt;
|
||||
opt.mode = fst::FstReadOptions::MAP;
|
||||
opt.source = trie_path;
|
||||
dictionary_.reset(FstType::Read(fin, opt));
|
||||
|
||||
max_order_ = language_model_->Order();
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user