Merge pull request #2528 from mozilla/filter-lm
Filter LM by removing very rare words
This commit is contained in:
commit
d925e6b5fc
@ -1,46 +1,8 @@
|
||||
|
||||
lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , following this recipe (Jupyter notebook code):
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import gzip
|
||||
import io
|
||||
import os
|
||||
|
||||
from urllib import request
|
||||
|
||||
# Grab corpus.
|
||||
url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
|
||||
data_upper = '/tmp/upper.txt.gz'
|
||||
request.urlretrieve(url, data_upper)
|
||||
|
||||
# Convert to lowercase and cleanup.
|
||||
data_lower = '/tmp/lower.txt'
|
||||
with open(data_lower, 'w', encoding='utf-8') as lower:
|
||||
with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
|
||||
for line in upper:
|
||||
lower.write(line.lower())
|
||||
|
||||
# Build pruned LM.
|
||||
lm_path = '/tmp/lm.arpa'
|
||||
!lmplz --order 5 \
|
||||
--temp_prefix /tmp/ \
|
||||
--memory 50% \
|
||||
--text {data_lower} \
|
||||
--arpa {lm_path} \
|
||||
--prune 0 0 0 1
|
||||
|
||||
# Quantize and produce trie binary.
|
||||
binary_path = '/tmp/lm.binary'
|
||||
!build_binary -a 255 \
|
||||
-q 8 \
|
||||
trie \
|
||||
{lm_path} \
|
||||
{binary_path}
|
||||
os.remove(lm_path)
|
||||
lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). KenLM's built binaries must be in your PATH (lmplz, build_binary, filter).
|
||||
|
||||
The trie was then generated from the vocabulary of the language model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie
|
||||
./generate_trie ../data/alphabet.txt lm.binary trie
|
||||
|
59
data/lm/generate_lm.py
Normal file
59
data/lm/generate_lm.py
Normal file
@ -0,0 +1,59 @@
|
||||
import gzip
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from collections import Counter
|
||||
from urllib import request
|
||||
|
||||
def main():
    """Build DeepSpeech's ``lm.binary`` from the LibriSpeech LM corpus.

    Pipeline:
      1. Download the normalized LM training text from OpenSLR.
      2. Lowercase it while counting word frequencies.
      3. Train a pruned 5-gram ARPA model with KenLM's ``lmplz``.
      4. Filter the ARPA model to the 500k most frequent words.
      5. Quantize and write a trie binary named ``lm.binary`` into the
         current working directory.

    KenLM's built binaries (``lmplz``, ``filter``, ``build_binary``) must be
    on PATH; raises ``subprocess.CalledProcessError`` if any of them fails.
    All intermediate artifacts live in a temporary directory that is removed
    on exit.
    """
    # Grab corpus.
    url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'

    with tempfile.TemporaryDirectory() as tmp:
        data_upper = os.path.join(tmp, 'upper.txt.gz')
        print('Downloading {} into {}...'.format(url, data_upper))
        request.urlretrieve(url, data_upper)

        # Convert to lowercase and count word occurrences in a single
        # streaming pass (the corpus is too large to hold in memory).
        counter = Counter()
        data_lower = os.path.join(tmp, 'lower.txt.gz')
        print('Converting to lower case and counting word frequencies...')
        with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
            with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf-8') as upper:
                for line in upper:
                    line_lower = line.lower()
                    counter.update(line_lower.split())
                    lower.write(line_lower)

        # Build pruned LM.
        lm_path = os.path.join(tmp, 'lm.arpa')
        print('Creating ARPA file...')
        subprocess.run([
            'lmplz', '--order', '5',
            '--temp_prefix', tmp,
            '--memory', '50%',
            '--text', data_lower,
            '--arpa', lm_path,
            # Drop singleton 3-grams and above to keep the model small.
            '--prune', '0', '0', '1'
        ], check=True)

        # Filter LM using vocabulary of top 500k words, fed to KenLM's
        # `filter` tool on stdin, one word per line.
        filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
        vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
        print('Filtering ARPA file...')
        subprocess.run(
            ['filter', 'single', 'model:{}'.format(lm_path), filtered_path],
            input=vocab_str.encode('utf-8'),
            check=True,
        )

        # Quantize and produce trie binary in the current working directory.
        print('Building lm.binary...')
        subprocess.run([
            'build_binary',
            '-a', '255',  # pointer-array compression
            '-q', '8',    # quantize probabilities to 8 bits
            'trie',
            filtered_path,
            'lm.binary'
        ], check=True)


if __name__ == '__main__':
    main()
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
|
||||
size 1800894585
|
||||
oid sha256:a24953ce3f013bbf5f4a1c9f5a0e5482bc56eaa81638276de522f39e62ff3a56
|
||||
size 945699324
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f2024f2e83b252df33b5d6c5cced8186abd2764ce7124970c15c0174034c3f2e
|
||||
size 24480560
|
||||
oid sha256:0281e5e784ffccb4aeae5e7d64099058a0c22e42dbb7aa2d3ef2fbbff53db3ab
|
||||
size 12200736
|
||||
|
Loading…
x
Reference in New Issue
Block a user