From ad2769f479d0ddafc44e5a9053e4202bbf8e47b1 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Wed, 13 Nov 2019 17:38:40 +0100
Subject: [PATCH 1/2] Filter LM by removing very rare words

---
 data/lm/README.rst | 23 ++++++++++++++++++-----
 data/lm/lm.binary  |  4 ++--
 data/lm/trie       |  4 ++--
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/data/lm/README.rst b/data/lm/README.rst
index 38a2b74d..d6f8a5c4 100644
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -8,18 +8,27 @@ lm.binary was generated from the LibriSpeech normalized LM training text, availa
    import os

    from urllib import request
+   from collections import Counter

    # Grab corpus.
    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
    data_upper = '/tmp/upper.txt.gz'
    request.urlretrieve(url, data_upper)

-   # Convert to lowercase and cleanup.
-   data_lower = '/tmp/lower.txt'
-   with open(data_lower, 'w', encoding='utf-8') as lower:
+   # Convert to lowercase and count word occurrences.
+   counter = Counter()
+   data_lower = '/tmp/lower.txt.gz'
+   with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
        with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
            for line in upper:
-               lower.write(line.lower())
+               line_lower = line.lower()
+               counter.update(line_lower.split())
+               lower.write(line_lower)
+
+   # Create vocabulary file with top 500k words.
+   vocab_path = '/tmp/vocab-500k.txt'
+   with open(vocab_path, 'w') as fout:
+       fout.write('\n'.join(word for word, count in counter.most_common(500000)))

    # Build pruned LM.
    lm_path = '/tmp/lm.arpa'
@@ -26,15 +35,19 @@ lm.binary was generated from the LibriSpeech normalized LM training text, availa
    !lmplz --order 5 \
           --temp_prefix /tmp/ \
           --memory 50% \
           --text {data_lower} \
           --arpa {lm_path} \
           --prune 0 0 0 1

+   # Filter LM using vocabulary.
+   filtered_path = '/tmp/lm_filtered.arpa'
+   !filter single model:{lm_path} {filtered_path} < {vocab_path}
+
    # Quantize and produce trie binary.
    binary_path = '/tmp/lm.binary'
    !build_binary -a 255 \
                  -q 8 \
                  trie \
-                 {lm_path} \
+                 {filtered_path} \
                  {binary_path}
    os.remove(lm_path)
diff --git a/data/lm/lm.binary b/data/lm/lm.binary
index 0c53ca35..b73d4965 100644
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
-size 1800894585
+oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b
+size 1728766378
diff --git a/data/lm/trie b/data/lm/trie
index 342c6a9e..8edb4157 100644
--- a/data/lm/trie
+++ b/data/lm/trie
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2024f2e83b252df33b5d6c5cced8186abd2764ce7124970c15c0174034c3f2e
-size 24480560
+oid sha256:0281e5e784ffccb4aeae5e7d64099058a0c22e42dbb7aa2d3ef2fbbff53db3ab
+size 12200736
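The recipe above derives its 500k-word vocabulary from a word-frequency Counter over the lowercased corpus. A minimal sketch (not part of the patches) to gauge how much of the running text that vocabulary covers, assuming /tmp/lower.txt.gz produced by the recipe above:

.. code-block:: python

   import gzip
   import io
   from collections import Counter

   # Recount word frequencies from the lowercased corpus built by the recipe.
   counter = Counter()
   with io.TextIOWrapper(io.BufferedReader(gzip.open('/tmp/lower.txt.gz')), encoding='utf-8') as corpus:
       for line in corpus:
           counter.update(line.split())

   # Fraction of all tokens covered by the 500k most frequent words.
   total = sum(counter.values())
   covered = sum(count for _, count in counter.most_common(500000))
   print('Top 500k words cover {:.2%} of tokens'.format(covered / total))

Under a Zipf-like word distribution, the top 500k words should cover nearly all tokens, which is why dropping the rare tail shrinks the LM substantially at little cost in coverage.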
From 381faaf6b600c5e9704462cfd9544daa90a8977a Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Fri, 15 Nov 2019 13:28:45 +0100
Subject: [PATCH 2/2] Switch to --prune 0 0 1 model and move generation code to a script

---
 data/lm/README.rst     | 55 ++-------------------------------------
 data/lm/generate_lm.py | 59 ++++++++++++++++++++++++++++++++++++++++++
 data/lm/lm.binary      |  4 +--
 3 files changed, 63 insertions(+), 55 deletions(-)
 create mode 100644 data/lm/generate_lm.py

diff --git a/data/lm/README.rst b/data/lm/README.rst
index d6f8a5c4..add2b195 100644
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -1,59 +1,8 @@
-lm.binary was generated from the LibriSpeech normalized LM training text, available
-`here <http://www.openslr.org/11>`_\ , following this recipe (Jupyter notebook code):
-
-.. code-block:: python
-
-   import gzip
-   import io
-   import os
-
-   from urllib import request
-   from collections import Counter
-
-   # Grab corpus.
-   url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
-   data_upper = '/tmp/upper.txt.gz'
-   request.urlretrieve(url, data_upper)
-
-   # Convert to lowercase and count word occurrences.
-   counter = Counter()
-   data_lower = '/tmp/lower.txt.gz'
-   with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
-       with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
-           for line in upper:
-               line_lower = line.lower()
-               counter.update(line_lower.split())
-               lower.write(line_lower)
-
-   # Create vocabulary file with top 500k words.
-   vocab_path = '/tmp/vocab-500k.txt'
-   with open(vocab_path, 'w') as fout:
-       fout.write('\n'.join(word for word, count in counter.most_common(500000)))
-
-   # Build pruned LM.
-   lm_path = '/tmp/lm.arpa'
-   !lmplz --order 5 \
-          --temp_prefix /tmp/ \
-          --memory 50% \
-          --text {data_lower} \
-          --arpa {lm_path} \
-          --prune 0 0 0 1
-
-   # Filter LM using vocabulary.
-   filtered_path = '/tmp/lm_filtered.arpa'
-   !filter single model:{lm_path} {filtered_path} < {vocab_path}
-
-   # Quantize and produce trie binary.
-   binary_path = '/tmp/lm.binary'
-   !build_binary -a 255 \
-                 -q 8 \
-                 trie \
-                 {filtered_path} \
-                 {binary_path}
-   os.remove(lm_path)
+lm.binary was generated from the LibriSpeech normalized LM training text, available
+`here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script, which writes lm.binary to the folder it is run from. The KenLM binaries (lmplz, build_binary, filter) must be on your PATH.

 The trie was then generated from the vocabulary of the language model:

 .. code-block:: bash

-   ./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie
+   ./generate_trie ../data/alphabet.txt lm.binary trie
diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py
new file mode 100644
index 00000000..82fe6468
--- /dev/null
+++ b/data/lm/generate_lm.py
@@ -0,0 +1,59 @@
+import gzip
+import io
+import os
+import subprocess
+import tempfile
+
+from collections import Counter
+from urllib import request
+
+def main():
+    # Grab corpus.
+    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
+
+    with tempfile.TemporaryDirectory() as tmp:
+        data_upper = os.path.join(tmp, 'upper.txt.gz')
+        print('Downloading {} into {}...'.format(url, data_upper))
+        request.urlretrieve(url, data_upper)
+
+        # Convert to lowercase and count word occurrences.
+        counter = Counter()
+        data_lower = os.path.join(tmp, 'lower.txt.gz')
+        print('Converting to lower case and counting word frequencies...')
+        with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
+            with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
+                for line in upper:
+                    line_lower = line.lower()
+                    counter.update(line_lower.split())
+                    lower.write(line_lower)
+
+        # Build pruned LM.
+        lm_path = os.path.join(tmp, 'lm.arpa')
+        print('Creating ARPA file...')
+        subprocess.check_call([
+            'lmplz', '--order', '5',
+            '--temp_prefix', tmp,
+            '--memory', '50%',
+            '--text', data_lower,
+            '--arpa', lm_path,
+            '--prune', '0', '0', '1'
+        ])
+
+        # Filter LM using vocabulary of top 500k words.
+        filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
+        vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
+        print('Filtering ARPA file...')
+        subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True)
+
+        # Quantize and produce trie binary.
+        print('Building lm.binary...')
+        subprocess.check_call([
+            'build_binary', '-a', '255',
+            '-q', '8',
+            'trie',
+            filtered_path,
+            'lm.binary'
+        ])
+
+if __name__ == '__main__':
+    main()
diff --git a/data/lm/lm.binary b/data/lm/lm.binary
index b73d4965..16e7d6d9 100644
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b
-size 1728766378
+oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
+size 945699324
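Once generated, lm.binary can be smoke-tested with the KenLM Python bindings. A minimal sketch (not part of the patches), assuming the `kenlm` module is installed, e.g. via `pip install kenlm`:

.. code-block:: python

   import kenlm

   # Load the quantized trie binary and score a sample sentence.
   model = kenlm.Model('lm.binary')
   print('Order:', model.order)  # expect 5 for this model
   print('Log10 prob:', model.score('the quick brown fox', bos=True, eos=True))

A successful load plus a finite log-probability is a quick sanity check that quantization and trie building produced a usable model.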