Switch to --prune 0 0 1 model and move generation code to a script

Reuben Morais 2019-11-15 13:28:45 +01:00
parent ad2769f479
commit 381faaf6b6
3 changed files with 63 additions and 55 deletions


@@ -1,59 +1,8 @@
-lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , following this recipe (Jupyter notebook code):
+lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script, which generates lm.binary in the folder it is run from. KenLM's binaries (lmplz, build_binary and filter) must be in your PATH.
 
-.. code-block:: python
-
-    import gzip
-    import io
-    import os
-
-    from urllib import request
-    from collections import Counter
-
-    # Grab corpus.
-    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
-    data_upper = '/tmp/upper.txt.gz'
-    request.urlretrieve(url, data_upper)
-
-    # Convert to lowercase and count word occurrences.
-    counter = Counter()
-    data_lower = '/tmp/lower.txt.gz'
-    with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
-        with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
-            for line in upper:
-                line_lower = line.lower()
-                counter.update(line_lower.split())
-                lower.write(line_lower)
-
-    # Create vocabulary file with top 500k words.
-    vocab_path = '/tmp/vocab-500k.txt'
-    with open(vocab_path, 'w') as fout:
-        fout.write('\n'.join(word for word, count in counter.most_common(500000)))
-
-    # Build pruned LM.
-    lm_path = '/tmp/lm.arpa'
-    !lmplz --order 5 \
-           --temp_prefix /tmp/ \
-           --memory 50% \
-           --text {data_lower} \
-           --arpa {lm_path} \
-           --prune 0 0 0 1
-
-    # Filter LM using vocabulary.
-    filtered_path = '/tmp/lm_filtered.arpa'
-    !filter single model:{lm_path} {filtered_path} < {vocab_path}
-
-    # Quantize and produce trie binary.
-    binary_path = '/tmp/lm.binary'
-    !build_binary -a 255 \
-                  -q 8 \
-                  trie \
-                  {filtered_path} \
-                  {binary_path}
-
-    os.remove(lm_path)
-
 The trie was then generated from the vocabulary of the language model:
 
 .. code-block:: bash
 
-    ./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie
+    ./generate_trie ../data/alphabet.txt lm.binary trie
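A note on the pruning change named in the commit title: lmplz applies the last --prune threshold to all remaining orders, so the new --prune 0 0 1 drops singleton n-grams from trigrams upward, whereas the old --prune 0 0 0 1 pruned only singleton 4- and 5-grams. This is why the regenerated lm.binary (see the LFS pointer at the end of this commit) is roughly half its previous size.

Not part of the commit, but for context, a minimal sketch of driving both steps end to end. It assumes the working directory is data/lm/, KenLM's binaries are on PATH, and generate_trie has been built by the native client (its path below is illustrative):

    import subprocess
    import sys

    # Step 1: build lm.binary in the current working directory.
    # This downloads the multi-gigabyte LibriSpeech corpus and runs
    # lmplz, filter and build_binary, so expect a long runtime.
    subprocess.check_call([sys.executable, 'generate_lm.py'])

    # Step 2: derive the decoder trie from the alphabet and the new lm.binary.
    subprocess.check_call(['../../native_client/generate_trie',
                           '../alphabet.txt', 'lm.binary', 'trie'])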

data/lm/generate_lm.py Normal file

@@ -0,0 +1,59 @@
+import gzip
+import io
+import os
+import subprocess
+import tempfile
+
+from collections import Counter
+from urllib import request
+
+def main():
+    # Grab corpus.
+    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
+    with tempfile.TemporaryDirectory() as tmp:
+        data_upper = os.path.join(tmp, 'upper.txt.gz')
+        print('Downloading {} into {}...'.format(url, data_upper))
+        request.urlretrieve(url, data_upper)
+
+        # Convert to lowercase and count word occurrences.
+        counter = Counter()
+        data_lower = os.path.join(tmp, 'lower.txt.gz')
+        print('Converting to lower case and counting word frequencies...')
+        with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
+            with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
+                for line in upper:
+                    line_lower = line.lower()
+                    counter.update(line_lower.split())
+                    lower.write(line_lower)
+
+        # Build pruned LM.
+        lm_path = os.path.join(tmp, 'lm.arpa')
+        print('Creating ARPA file...')
+        subprocess.check_call([
+            'lmplz', '--order', '5',
+            '--temp_prefix', tmp,
+            '--memory', '50%',
+            '--text', data_lower,
+            '--arpa', lm_path,
+            '--prune', '0', '0', '1'
+        ])
+
+        # Filter LM using vocabulary of top 500k words.
+        filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
+        vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
+        print('Filtering ARPA file...')
+        subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True)
+
+        # Quantize and produce trie binary.
+        print('Building lm.binary...')
+        subprocess.check_call([
+            'build_binary', '-a', '255',
+            '-q', '8',
+            'trie',
+            filtered_path,
+            'lm.binary'
+        ])
+
+
+if __name__ == '__main__':
+    main()
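Not part of the commit: once the script has run, the kenlm Python bindings (assuming they are installed, e.g. via pip) offer a quick sanity check, since they can load the quantized trie binary directly:

    import kenlm

    # Load the quantized trie binary produced by generate_lm.py.
    model = kenlm.Model('lm.binary')
    print(model.order)  # expect 5, matching the --order 5 passed to lmplz

    # score() returns a log10 probability; bos/eos add sentence-boundary context.
    sentence = 'the quick brown fox jumped over the lazy dog'
    print(model.score(sentence, bos=True, eos=True))
    print(model.perplexity(sentence))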


@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b
-size 1728766378
+oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
+size 945699324
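This last hunk is the Git LFS pointer for the regenerated binary: the repository stores only the object's oid and size, and the drop from 1,728,766,378 to 945,699,324 bytes reflects the stronger pruning. For illustration, a minimal sketch of reading such a pointer, with a hypothetical parse_lfs_pointer helper (the format is one key-value pair per line):

    def parse_lfs_pointer(text):
        """Parse Git LFS pointer text into a dict of its key/value fields."""
        fields = {}
        for line in text.strip().splitlines():
            key, _, value = line.partition(' ')
            fields[key] = value
        return fields

    pointer = (
        'version https://git-lfs.github.com/spec/v1\n'
        'oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56\n'
        'size 945699324\n'
    )
    info = parse_lfs_pointer(pointer)
    print(info['oid'], int(info['size']))  # checksum and byte size of the real file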