Merge pull request #1543 from mozilla/update_lm

Update language model to a trie-based LM created from the LibriSpeech LM corpus
2018-09-17 12:42:03 -03:00 · 2018-09-17 12:42:03 -03:00 · b9946fddab
commit b9946fddab
parent bb299dc265 cb86e7e191
9 changed files with 311681 additions and 312757 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,3 +1,3 @@
 *.binary filter=lfs diff=lfs merge=lfs -crlf
 data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
 data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
--- a/data/lm/README.md
+++ b/data/lm/README.md
@ -0,0 +1,46 @@
 lm.binary was generated from the LibriSpeech normalized LM training text, available [here](http://www.openslr.org/11), following this recipe (Jupyter notebook code):
 ```python
 # Recipe for building lm.binary, written as Jupyter notebook code: lines that
 # start with `!` are shell commands, and {name} inside them refers to the
 # Python variable of that name.
 import gzip
 import io
 import os
 from urllib import request
 # Download the gzip-compressed LibriSpeech LM training text to /tmp.
 url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
 data_upper = '/tmp/upper.txt.gz'
 request.urlretrieve(url, data_upper)
 # Decompress the corpus and write a lowercased UTF-8 copy, one line at a time.
 # Lowercasing is the only transformation applied to the text here.
 data_lower = '/tmp/lower.txt'
 with open(data_lower, 'w', encoding='utf-8') as lower:
    with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
        for line in upper:
            lower.write(line.lower())
 # Estimate an order-5 LM from the lowercased text and write it as an ARPA file.
 # NOTE(review): per the KenLM docs, `--prune 0 0 0 1` should drop singleton
 # n-grams of order 4 and above (the last threshold is extended to the
 # remaining orders) -- confirm against the KenLM version in use.
 lm_path = '/tmp/lm.arpa'
 !lmplz --order 5 \
       --temp_prefix /tmp/ \
       --memory 50% \
       --text {data_lower} \
       --arpa {lm_path} \
       --prune 0 0 0 1
 # Convert the ARPA file into a binary model using the `trie` data structure.
 # NOTE(review): `-q 8` presumably selects 8-bit quantization and `-a 255` is
 # the array pointer-compression setting -- confirm against the KenLM
 # build_binary documentation.
 binary_path = '/tmp/lm.binary'
 !build_binary -a 255 \
              -q 8 \
              trie \
              {lm_path} \
              {binary_path} 
 # The intermediate ARPA file is deleted once build_binary has run.
 os.remove(lm_path)
 ```
 The trie was then generated from the alphabet, lm.binary, and the list of unique words in the corpus (data/lm/vocab.txt):
 ```bash
 # Split the lowercased corpus into one word per line and de-duplicate it.
 # `[:space:]` is written bare: tr takes character sets, not bracket
 # expressions, so '[[:space:]]' would also turn literal '[' and ']' into
 # newlines.
 tr -s '[:space:]' '\n' < /tmp/lower.txt | sort -u > /tmp/vocab.txt
 # Build the trie from the alphabet, the LM binary and the vocabulary list.
 ./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/vocab.txt /tmp/trie
 ```
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9da0224ae9baf92a32fa85dafd24c99b3837a67b880ade0a5a730449d15297b
+oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
-size 327902357
+size 1800894585
--- a/data/lm/trie
+++ b/data/lm/trie
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2f8f1d721eed0ae621160626e803925efa481c8156bb97e72013c0fbf879b75
+oid sha256:00779e53cfcc1e170525b6cc6113096d3984036ddf84dd8f07ce275f7027c47d
-size 43550345
+size 227692934
--- a/data/lm/vocab.txt
+++ b/data/lm/vocab.txt
--- a/data/smoke_test/vocab.pruned.lm
+++ b/data/smoke_test/vocab.pruned.lm
--- a/data/smoke_test/vocab.trie
+++ b/data/smoke_test/vocab.trie
--- a/native_client/beam_search.h
+++ b/native_client/beam_search.h
@ -8,7 +8,7 @@
 #include "kenlm/lm/model.hh"
-typedef lm::ngram::ProbingModel Model;
+typedef lm::ngram::QuantArrayTrieModel Model;
 struct KenLMBeamState {
  float language_model_score;
--- a/native_client/generate_trie.cpp
+++ b/native_client/generate_trie.cpp
@ -7,7 +7,7 @@ using namespace std;
 #include "trie_node.h"
 #include "alphabet.h"
-typedef lm::ngram::ProbingModel Model;
+typedef lm::ngram::QuantArrayTrieModel Model;
 lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
  return model.GetVocabulary().Index(word);