From ad2769f479d0ddafc44e5a9053e4202bbf8e47b1 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Wed, 13 Nov 2019 17:38:40 +0100
Subject: [PATCH 1/2] Filter LM by removing very rare words

---
 data/lm/README.rst | 23 ++++++++++++++++++-----
 data/lm/lm.binary  |  4 ++--
 data/lm/trie       |  4 ++--
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/data/lm/README.rst b/data/lm/README.rst
index 38a2b74d..d6f8a5c4 100644
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -8,18 +8,27 @@ lm.binary was generated from the LibriSpeech normalized LM training text, availa
    import os

    from urllib import request
+   from collections import Counter

    # Grab corpus.
    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
    data_upper = '/tmp/upper.txt.gz'
    request.urlretrieve(url, data_upper)

-   # Convert to lowercase and cleanup.
-   data_lower = '/tmp/lower.txt'
-   with open(data_lower, 'w', encoding='utf-8') as lower:
+   # Convert to lowercase and count word occurrences.
+   counter = Counter()
+   data_lower = '/tmp/lower.txt.gz'
+   with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
        with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
            for line in upper:
-               lower.write(line.lower())
+               line_lower = line.lower()
+               counter.update(line_lower.split())
+               lower.write(line_lower)
+
+   # Create vocabulary file with top 500k words.
+   vocab_path = '/tmp/vocab-500k.txt'
+   with open(vocab_path, 'w') as fout:
+       fout.write('\n'.join(word for word, count in counter.most_common(500000)))

    # Build pruned LM.
    lm_path = '/tmp/lm.arpa'
@@ -26,15 +35,19 @@ lm.binary was generated from the LibriSpeech normalized LM training text, availa
    !lmplz --order 5 \
           --temp_prefix /tmp/ \
           --memory 50% \
           --text {data_lower} \
           --arpa {lm_path} \
           --prune 0 0 0 1

+   # Filter LM using vocabulary.
+   filtered_path = '/tmp/lm_filtered.arpa'
+   !filter single model:{lm_path} {filtered_path} < {vocab_path}
+
    # Quantize and produce trie binary.
    binary_path = '/tmp/lm.binary'
    !build_binary -a 255 \
                  -q 8 \
                  trie \
-                 {lm_path} \
+                 {filtered_path} \
                  {binary_path}
    os.remove(lm_path)
diff --git a/data/lm/lm.binary b/data/lm/lm.binary
index 0c53ca35..b73d4965 100644
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
-size 1800894585
+oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b
+size 1728766378
diff --git a/data/lm/trie b/data/lm/trie
index 342c6a9e..8edb4157 100644
--- a/data/lm/trie
+++ b/data/lm/trie
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2024f2e83b252df33b5d6c5cced8186abd2764ce7124970c15c0174034c3f2e
-size 24480560
+oid sha256:0281e5e784ffccb4aeae5e7d64099058a0c22e42dbb7aa2d3ef2fbbff53db3ab
+size 12200736
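The recipe above derives its 500k-word vocabulary from a word-frequency Counter over the lowercased corpus. A minimal sketch (not part of the patches) to gauge how much of the running text that vocabulary covers, assuming /tmp/lower.txt.gz produced by the recipe above:

.. code-block:: python

   import gzip
   import io
   from collections import Counter

   # Recount word frequencies from the lowercased corpus built by the recipe.
   counter = Counter()
   with io.TextIOWrapper(io.BufferedReader(gzip.open('/tmp/lower.txt.gz')), encoding='utf-8') as corpus:
       for line in corpus:
           counter.update(line.split())

   # Fraction of all tokens covered by the 500k most frequent words.
   total = sum(counter.values())
   covered = sum(count for _, count in counter.most_common(500000))
   print('Top 500k words cover {:.2%} of tokens'.format(covered / total))

Under a Zipf-like word distribution, the top 500k words should cover nearly all tokens, which is why dropping the rare tail shrinks the LM substantially at little cost in coverage.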
From 381faaf6b600c5e9704462cfd9544daa90a8977a Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Fri, 15 Nov 2019 13:28:45 +0100
Subject: [PATCH 2/2] Switch to --prune 0 0 1 model and move generation code to a script

---
 data/lm/README.rst     | 55 ++-------------------------------------
 data/lm/generate_lm.py | 59 ++++++++++++++++++++++++++++++++++++++++++
 data/lm/lm.binary      |  4 +--
 3 files changed, 63 insertions(+), 55 deletions(-)
 create mode 100644 data/lm/generate_lm.py

diff --git a/data/lm/README.rst b/data/lm/README.rst
index d6f8a5c4..add2b195 100644
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -1,59 +1,8 @@
-lm.binary was generated from the LibriSpeech normalized LM training text, available
-`here <http://www.openslr.org/11>`_\ , following this recipe (Jupyter notebook code):
-
-.. code-block:: python
-
-   import gzip
-   import io
-   import os
-
-   from urllib import request
-   from collections import Counter
-
-   # Grab corpus.
-   url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
-   data_upper = '/tmp/upper.txt.gz'
-   request.urlretrieve(url, data_upper)
-
-   # Convert to lowercase and count word occurrences.
-   counter = Counter()
-   data_lower = '/tmp/lower.txt.gz'
-   with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
-       with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
-           for line in upper:
-               line_lower = line.lower()
-               counter.update(line_lower.split())
-               lower.write(line_lower)
-
-   # Create vocabulary file with top 500k words.
-   vocab_path = '/tmp/vocab-500k.txt'
-   with open(vocab_path, 'w') as fout:
-       fout.write('\n'.join(word for word, count in counter.most_common(500000)))
-
-   # Build pruned LM.
-   lm_path = '/tmp/lm.arpa'
-   !lmplz --order 5 \
-          --temp_prefix /tmp/ \
-          --memory 50% \
-          --text {data_lower} \
-          --arpa {lm_path} \
-          --prune 0 0 0 1
-
-   # Filter LM using vocabulary.
-   filtered_path = '/tmp/lm_filtered.arpa'
-   !filter single model:{lm_path} {filtered_path} < {vocab_path}
-
-   # Quantize and produce trie binary.
-   binary_path = '/tmp/lm.binary'
-   !build_binary -a 255 \
-                 -q 8 \
-                 trie \
-                 {filtered_path} \
-                 {binary_path}
-   os.remove(lm_path)
+lm.binary was generated from the LibriSpeech normalized LM training text, available
+`here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script, which writes lm.binary to the folder it is run from. The KenLM binaries (lmplz, build_binary, filter) must be on your PATH.

 The trie was then generated from the vocabulary of the language model:

 .. code-block:: bash

-   ./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie
+   ./generate_trie ../data/alphabet.txt lm.binary trie
diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py
new file mode 100644
index 00000000..82fe6468
--- /dev/null
+++ b/data/lm/generate_lm.py
@@ -0,0 +1,59 @@
+import gzip
+import io
+import os
+import subprocess
+import tempfile
+
+from collections import Counter
+from urllib import request
+
+def main():
+    # Grab corpus.
+    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
+
+    with tempfile.TemporaryDirectory() as tmp:
+        data_upper = os.path.join(tmp, 'upper.txt.gz')
+        print('Downloading {} into {}...'.format(url, data_upper))
+        request.urlretrieve(url, data_upper)
+
+        # Convert to lowercase and count word occurrences.
+        counter = Counter()
+        data_lower = os.path.join(tmp, 'lower.txt.gz')
+        print('Converting to lower case and counting word frequencies...')
+        with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
+            with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
+                for line in upper:
+                    line_lower = line.lower()
+                    counter.update(line_lower.split())
+                    lower.write(line_lower)
+
+        # Build pruned LM.
+        lm_path = os.path.join(tmp, 'lm.arpa')
+        print('Creating ARPA file...')
+        subprocess.check_call([
+            'lmplz', '--order', '5',
+            '--temp_prefix', tmp,
+            '--memory', '50%',
+            '--text', data_lower,
+            '--arpa', lm_path,
+            '--prune', '0', '0', '1'
+        ])
+
+        # Filter LM using vocabulary of top 500k words.
+        filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
+        vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
+        print('Filtering ARPA file...')
+        subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True)
+
+        # Quantize and produce trie binary.
+        print('Building lm.binary...')
+        subprocess.check_call([
+            'build_binary', '-a', '255',
+            '-q', '8',
+            'trie',
+            filtered_path,
+            'lm.binary'
+        ])
+
+if __name__ == '__main__':
+    main()
diff --git a/data/lm/lm.binary b/data/lm/lm.binary
index b73d4965..16e7d6d9 100644
--- a/data/lm/lm.binary
+++ b/data/lm/lm.binary
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b
-size 1728766378
+oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
+size 945699324
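Once generated, lm.binary can be smoke-tested with the KenLM Python bindings. A minimal sketch (not part of the patches), assuming the `kenlm` module is installed, e.g. via `pip install kenlm`:

.. code-block:: python

   import kenlm

   # Load the quantized trie binary and score a sample sentence.
   model = kenlm.Model('lm.binary')
   print('Order:', model.order)  # expect 5 for this model
   print('Log10 prob:', model.score('the quick brown fox', bos=True, eos=True))

A successful load plus a finite log-probability is a quick sanity check that quantization and trie building produced a usable model.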