From 381faaf6b600c5e9704462cfd9544daa90a8977a Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 15 Nov 2019 13:28:45 +0100 Subject: [PATCH] Switch to --prune 0 0 1 model and move generation code to a script --- data/lm/README.rst | 55 ++------------------------------------- data/lm/generate_lm.py | 59 ++++++++++++++++++++++++++++++++++++++++++ data/lm/lm.binary | 4 +-- 3 files changed, 63 insertions(+), 55 deletions(-) create mode 100644 data/lm/generate_lm.py diff --git a/data/lm/README.rst b/data/lm/README.rst index d6f8a5c4..add2b195 100644 --- a/data/lm/README.rst +++ b/data/lm/README.rst @@ -1,59 +1,8 @@ -lm.binary was generated from the LibriSpeech normalized LM training text, available `here `_\ , following this recipe (Jupyter notebook code): - -.. code-block:: python - - import gzip - import io - import os - - from urllib import request - from collections import Counter - - # Grab corpus. - url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz' - data_upper = '/tmp/upper.txt.gz' - request.urlretrieve(url, data_upper) - - # Convert to lowercase and count word occurences. - counter = Counter() - data_lower = '/tmp/lower.txt.gz' - with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower: - with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper: - for line in upper: - line_lower = line.lower() - counter.update(line_lower.split()) - lower.write(line_lower) - - # Create vocabulary file with top 500k words - vocab_path = '/tmp/vocab-500k.txt' - with open(vocab_path, 'w') as fout: - fout.write('\n'.join(word for word, count in counter.most_common(500000))) - - # Build pruned LM. - lm_path = '/tmp/lm.arpa' - !lmplz --order 5 \ - --temp_prefix /tmp/ \ - --memory 50% \ - --text {data_lower} \ - --arpa {lm_path} \ - --prune 0 0 0 1 - - # Filter LM using vocabulary. 
- filtered_path = '/tmp/lm_filtered.arpa' - !filter single model:{lm_path} {filtered_path} < {vocab_path} - - # Quantize and produce trie binary. - binary_path = '/tmp/lm.binary' - !build_binary -a 255 \ - -q 8 \ - trie \ - {filtered_path} \ - {binary_path} - os.remove(lm_path) +lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). KenLM's built binaries must be in your PATH (lmplz, build_binary, filter). The trie was then generated from the vocabulary of the language model: .. code-block:: bash - ./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie + ./generate_trie ../data/alphabet.txt lm.binary trie diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py new file mode 100644 index 00000000..82fe6468 --- /dev/null +++ b/data/lm/generate_lm.py @@ -0,0 +1,59 @@ +import gzip +import io +import os +import subprocess +import tempfile + +from collections import Counter +from urllib import request + +def main(): + # Grab corpus. + url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz' + + with tempfile.TemporaryDirectory() as tmp: + data_upper = os.path.join(tmp, 'upper.txt.gz') + print('Downloading {} into {}...'.format(url, data_upper)) + request.urlretrieve(url, data_upper) + + # Convert to lowercase and count word occurrences. + counter = Counter() + data_lower = os.path.join(tmp, 'lower.txt.gz') + print('Converting to lower case and counting word frequencies...') + with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower: + with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper: + for line in upper: + line_lower = line.lower() + counter.update(line_lower.split()) + lower.write(line_lower) + + # Build pruned LM. 
+ lm_path = os.path.join(tmp, 'lm.arpa') + print('Creating ARPA file...') + subprocess.check_call([ + 'lmplz', '--order', '5', + '--temp_prefix', tmp, + '--memory', '50%', + '--text', data_lower, + '--arpa', lm_path, + '--prune', '0', '0', '1' + ]) + + # Filter LM using vocabulary of top 500k words + filtered_path = os.path.join(tmp, 'lm_filtered.arpa') + vocab_str = '\n'.join(word for word, count in counter.most_common(500000)) + print('Filtering ARPA file...') + subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True) + + # Quantize and produce trie binary. + print('Building lm.binary...') + subprocess.check_call([ + 'build_binary', '-a', '255', + '-q', '8', + 'trie', + filtered_path, + 'lm.binary' + ]) + +if __name__ == '__main__': + main() diff --git a/data/lm/lm.binary b/data/lm/lm.binary index b73d4965..16e7d6d9 100644 --- a/data/lm/lm.binary +++ b/data/lm/lm.binary @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b -size 1728766378 +oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56 +size 945699324