Merge branch 'pr2801' (Fixes #2801)

This commit is contained in:
Reuben Morais 2020-04-14 13:07:50 +02:00
commit 40250988db
3 changed files with 216 additions and 54 deletions

View File

@ -1,8 +1,30 @@
The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_.
It is created with `KenLM <https://github.com/kpu/kenlm>`_.
The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter). You can download the LibriSpeech corpus with the following commands:
The scorer package was then built using the `generate_package.py` script:
.. code-block:: bash .. code-block:: bash
python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
As input you can use a plain text (e.g. `file.txt`) or gzipped (e.g. `file.txt.gz`) text file with one sentence in each line.
If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first and then pass the build directory to the script.
.. code-block:: bash
python3 data/lm/generate_lm.py --input_txt librispeech-lm-norm.txt.gz \
--output_dir . --top_k 500000 --kenlm_bins path/to/kenlm/build/bin/ \
--arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \
--binary_a_bits 255 --binary_q_bits 8 --binary_type trie
Afterwards you can use `generate_package.py` to generate the scorer package using the `lm.binary` and `vocab-500000.txt` files:
.. code-block:: bash
python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
--package kenlm.scorer --default_alpha 0.75 --default_beta 1.85

View File

@ -1,63 +1,203 @@
import argparse
import gzip import gzip
import io import io
import os import os
import subprocess import subprocess
import tempfile
from collections import Counter from collections import Counter
from urllib import request
def main(): import progressbar
# Grab corpus.
url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
with tempfile.TemporaryDirectory() as tmp:
data_upper = os.path.join(tmp, 'upper.txt.gz')
print('Downloading {} into {}...'.format(url, data_upper))
request.urlretrieve(url, data_upper)
def convert_and_filter_topk(args):
    """Lowercase the input corpus, count its words and save the top-k vocabulary.

    Reads ``args.input_txt`` (plain text, or gzip-compressed when the file
    name ends in ``.gz``), writes a lowercased gzip copy to
    ``<args.output_dir>/lower.txt.gz`` and writes the ``args.top_k`` most
    frequent words, one per line, to ``<args.output_dir>/vocab-<top_k>.txt``.
    Word statistics are printed to stdout.

    Args:
        args: Namespace providing ``input_txt``, ``output_dir`` and ``top_k``.

    Returns:
        Tuple ``(data_lower, vocab_str)``: the path of the lowercased gzip
        file and the newline-joined top-k words.
    """
    counter = Counter()
    data_lower = os.path.join(args.output_dir, "lower.txt.gz")

    print("\nConverting to lowercase and counting word occurrences ...")
    with io.TextIOWrapper(
        io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
    ) as file_out:

        # Open the input file either from input.txt or input.txt.gz
        _, file_extension = os.path.splitext(args.input_txt)
        if file_extension == ".gz":
            file_in = io.TextIOWrapper(
                io.BufferedReader(gzip.open(args.input_txt)), encoding="utf-8"
            )
        else:
            file_in = open(args.input_txt, encoding="utf-8")

        # The context manager closes the input even if reading/decoding fails.
        with file_in:
            for line in progressbar.progressbar(file_in):
                line_lower = line.lower()
                counter.update(line_lower.split())
                file_out.write(line_lower)

    # Save top-k words
    print("\nSaving top {} words ...".format(args.top_k))
    top_counter = counter.most_common(args.top_k)
    vocab_str = "\n".join(word for word, count in top_counter)
    vocab_path = "vocab-{}.txt".format(args.top_k)
    vocab_path = os.path.join(args.output_dir, vocab_path)
    # Explicit UTF-8: the corpus is read as UTF-8 and the same vocabulary is
    # later handed to KenLM as UTF-8 bytes, so do not depend on the locale.
    with open(vocab_path, "w", encoding="utf-8") as file:
        file.write(vocab_str)

    print("\nCalculating word statistics ...")
    total_words = sum(counter.values())
    print(" Your text file has {} words in total".format(total_words))
    print(" It has {} unique words".format(len(counter)))

    if not top_counter:
        # Empty input (or a non-positive top_k): the remaining statistics
        # would divide by zero or index into an empty list.
        return data_lower, vocab_str

    top_words_sum = sum(count for word, count in top_counter)
    word_fraction = (top_words_sum / total_words) * 100
    print(
        " Your top-{} words are {:.4f} percent of all words".format(
            args.top_k, word_fraction
        )
    )
    print(' Your most common word "{}" occurred {} times'.format(*top_counter[0]))
    last_word, last_count = top_counter[-1]
    print(
        ' The least common word in your top-k is "{}" with {} times'.format(
            last_word, last_count
        )
    )
    # Walk from the least common word upwards to find the first entry that is
    # strictly more frequent than the last one.
    for i, (w, c) in enumerate(reversed(top_counter)):
        if c > last_count:
            print(
                ' The first word with {} occurrences is "{}" at place {}'.format(
                    c, w, len(top_counter) - 1 - i
                )
            )
            break

    return data_lower, vocab_str
def build_lm(args, data_lower, vocab_str):
    """Run the KenLM tool chain to turn the lowercased corpus into lm.binary.

    Three executables from ``args.kenlm_bins`` are invoked in sequence:
    ``lmplz`` writes ``lm.arpa``, ``filter`` restricts it to the given
    vocabulary (``lm_filtered.arpa``) and ``build_binary`` writes
    ``lm.binary``. All files are placed in ``args.output_dir``.

    Args:
        args: Namespace providing ``output_dir``, ``kenlm_bins``,
            ``arpa_order``, ``max_arpa_memory``, ``arpa_prune``,
            ``binary_a_bits``, ``binary_q_bits`` and ``binary_type``.
        data_lower: Path of the text file passed to ``lmplz --text``.
        vocab_str: Vocabulary text fed to ``filter`` on stdin (UTF-8 encoded).

    Raises:
        subprocess.CalledProcessError: If any tool exits with a non-zero status.
    """

    def tool(name):
        # Executables are looked up inside the user-supplied KenLM directory.
        return os.path.join(args.kenlm_bins, name)

    def output(name):
        return os.path.join(args.output_dir, name)

    print("\nCreating ARPA file ...")
    arpa_file = output("lm.arpa")
    # The prune thresholds arrive as one '|'-separated string and are passed
    # to lmplz as separate arguments.
    prune_values = args.arpa_prune.split("|")
    lmplz_cmd = [
        tool("lmplz"),
        "--order", str(args.arpa_order),
        "--temp_prefix", args.output_dir,
        "--memory", args.max_arpa_memory,
        "--text", data_lower,
        "--arpa", arpa_file,
        "--prune",
    ]
    lmplz_cmd.extend(prune_values)
    subprocess.check_call(lmplz_cmd)

    # Filter LM using vocabulary of top-k words
    print("\nFiltering ARPA file using vocabulary of top-k words ...")
    filtered_file = output("lm_filtered.arpa")
    filter_cmd = [tool("filter"), "single", "model:{}".format(arpa_file), filtered_file]
    subprocess.run(filter_cmd, input=vocab_str.encode("utf-8"), check=True)

    # Quantize and produce the binary in the requested data structure.
    print("\nBuilding lm.binary ...")
    build_cmd = [
        tool("build_binary"),
        "-a", str(args.binary_a_bits),
        "-q", str(args.binary_q_bits),
        "-v",
        args.binary_type,
        filtered_file,
        output("lm.binary"),
    ]
    subprocess.check_call(build_cmd)
if __name__ == '__main__':
def main():
    """Parse the command line, build lm.binary and remove intermediate files.

    Every option is mandatory. After the vocabulary and the language model
    have been generated, the intermediate files ``lower.txt.gz``, ``lm.arpa``
    and ``lm_filtered.arpa`` are deleted from the output directory.
    """
    # (flag, help text, type) for each required option, in --help order.
    options = (
        (
            "--input_txt",
            "Path to a file.txt or file.txt.gz with sample sentences",
            str,
        ),
        ("--output_dir", "Directory path for the output", str),
        (
            "--top_k",
            "Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
            int,
        ),
        (
            "--kenlm_bins",
            "File path to the KENLM binaries lmplz, filter and build_binary",
            str,
        ),
        ("--arpa_order", "Order of k-grams in ARPA-file generation", int),
        (
            "--max_arpa_memory",
            "Maximum allowed memory usage for ARPA-file generation",
            str,
        ),
        (
            "--arpa_prune",
            "ARPA pruning parameters. Separate values with '|'",
            str,
        ),
        ("--binary_a_bits", "Build binary quantization value a in bits", int),
        ("--binary_q_bits", "Build binary quantization value q in bits", int),
        ("--binary_type", "Build binary data structure type", str),
    )

    parser = argparse.ArgumentParser(
        description="Generate lm.binary and top-k vocab for DeepSpeech."
    )
    for flag, help_text, value_type in options:
        parser.add_argument(flag, help=help_text, type=value_type, required=True)
    args = parser.parse_args()

    data_lower, vocab_str = convert_and_filter_topk(args)
    build_lm(args, data_lower, vocab_str)

    # Delete intermediate files
    for leftover in ("lower.txt.gz", "lm.arpa", "lm_filtered.arpa"):
        os.remove(os.path.join(args.output_dir, leftover))
if __name__ == "__main__":
main() main()

View File

@ -125,7 +125,7 @@ def main():
parser.add_argument( parser.add_argument(
"--force_utf8", "--force_utf8",
default="", default="",
help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary.", help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
) )
args = parser.parse_args() args = parser.parse_args()