Merge pull request #1543 from mozilla/update_lm

Update language model to a trie-based LM created from the LibriSpeech LM corpus
2018-09-17 12:42:03 -03:00 · 2018-09-17 12:42:03 -03:00 · b9946fddab
commit b9946fddab
parent bb299dc265 cb86e7e191
9 changed files with 311681 additions and 312757 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,3 +1,3 @@
-
 *.binary filter=lfs diff=lfs merge=lfs -crlf
 data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
+data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
--- a/data/lm/README.md
+++ b/data/lm/README.md
@ -0,0 +1,46 @@
+`lm.binary` was generated from the LibriSpeech normalized LM training text, available [here](http://www.openslr.org/11), following this recipe (Jupyter notebook code; `lmplz` and `build_binary` are KenLM tools and must be on the `PATH`):
+
+```python
+import gzip
+import io
+import os
+
+from urllib import request
+
+# Grab corpus.
+url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
+data_upper = '/tmp/upper.txt.gz'
+request.urlretrieve(url, data_upper)
+
+# Convert to lowercase and cleanup.
+data_lower = '/tmp/lower.txt'
+with open(data_lower, 'w', encoding='utf-8') as lower:
+    with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
+        for line in upper:
+            lower.write(line.lower())
+
+# Build pruned LM.
+lm_path = '/tmp/lm.arpa'
+!lmplz --order 5 \
+       --temp_prefix /tmp/ \
+       --memory 50% \
+       --text {data_lower} \
+       --arpa {lm_path} \
+       --prune 0 0 0 1
+
+# Quantize and produce trie binary.
+binary_path = '/tmp/lm.binary'
+!build_binary -a 255 \
+              -q 8 \
+              trie \
+              {lm_path} \
+              {binary_path} 
+os.remove(lm_path)
+```
+
+The trie was then generated from the list of unique words in the corpus (data/lm/vocab.txt, one word per line), using the `generate_trie` tool built from native_client/generate_trie.cpp:
+
+```bash
+tr -s '[:space:]' '\n' < /tmp/lower.txt | sort -u > /tmp/vocab.txt
+./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/vocab.txt /tmp/trie
+```
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9da0224ae9baf92a32fa85dafd24c99b3837a67b880ade0a5a730449d15297b
-size 327902357
+oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
+size 1800894585
--- a/data/lm/trie
+++ b/data/lm/trie
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2f8f1d721eed0ae621160626e803925efa481c8156bb97e72013c0fbf879b75
-size 43550345
+oid sha256:00779e53cfcc1e170525b6cc6113096d3984036ddf84dd8f07ce275f7027c47d
+size 227692934
--- a/data/lm/vocab.txt
+++ b/data/lm/vocab.txt
--- a/data/smoke_test/vocab.pruned.lm
+++ b/data/smoke_test/vocab.pruned.lm
--- a/data/smoke_test/vocab.trie
+++ b/data/smoke_test/vocab.trie
--- a/native_client/beam_search.h
+++ b/native_client/beam_search.h
@ -8,7 +8,7 @@

 #include "kenlm/lm/model.hh"

-typedef lm::ngram::ProbingModel Model;
+typedef lm::ngram::QuantArrayTrieModel Model;

 struct KenLMBeamState {
  float language_model_score;
--- a/native_client/generate_trie.cpp
+++ b/native_client/generate_trie.cpp
@ -7,7 +7,7 @@ using namespace std;
 #include "trie_node.h"
 #include "alphabet.h"

-typedef lm::ngram::ProbingModel Model;
+typedef lm::ngram::QuantArrayTrieModel Model;

 lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
  return model.GetVocabulary().Index(word);