Merge branch 'pr2801' (Fixes #2801)
This commit is contained in:
commit
40250988db
@ -1,8 +1,30 @@
|
||||
The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_.
|
||||
It is created with `KenLM <https://github.com/kpu/kenlm>`_.
|
||||
|
||||
The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
|
||||
|
||||
The scorer package was then built using the `generate_package.py` script:
|
||||
You can download the LibriSpeech normalized LM training text with the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
|
||||
python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
|
||||
|
||||
wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
|
||||
|
||||
|
||||
Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
|
||||
|
||||
As input you can use a plain text (e.g. `file.txt`) or gzipped (e.g. `file.txt.gz`) text file with one sentence in each line.
|
||||
|
||||
If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first and then pass the directory containing the built binaries (e.g. `path/to/kenlm/build/bin/`) to the script via `--kenlm_bins`.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 data/lm/generate_lm.py --input_txt librispeech-lm-norm.txt.gz \
|
||||
--output_dir . --top_k 500000 --kenlm_bins path/to/kenlm/build/bin/ \
|
||||
--arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \
|
||||
--binary_a_bits 255 --binary_q_bits 8 --binary_type trie
|
||||
|
||||
|
||||
Afterwards you can use `generate_package.py` to generate the scorer package using the lm.binary and vocab-500000.txt files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
|
||||
--package kenlm.scorer --default_alpha 0.75 --default_beta 1.85
|
||||
|
||||
@ -1,63 +1,203 @@
|
||||
import argparse
|
||||
import gzip
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from collections import Counter
|
||||
from urllib import request
|
||||
|
||||
def main():
|
||||
# Grab corpus.
|
||||
url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
|
||||
import progressbar
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
data_upper = os.path.join(tmp, 'upper.txt.gz')
|
||||
print('Downloading {} into {}...'.format(url, data_upper))
|
||||
request.urlretrieve(url, data_upper)
|
||||
|
||||
# Convert to lowercase and count word occurrences.
|
||||
def convert_and_filter_topk(args):
    """Lowercase the input text, count its words and save the top-k vocabulary.

    Reads ``args.input_txt`` as UTF-8 (through gzip when the file name ends in
    ``.gz``), writes a lowercased, gzip-compressed copy to
    ``<args.output_dir>/lower.txt.gz`` and writes the ``args.top_k`` most
    frequent whitespace-separated words, one per line, to
    ``<args.output_dir>/vocab-<top_k>.txt``. Word statistics are printed to
    stdout along the way.

    Args:
        args: Parsed command-line namespace; ``input_txt``, ``output_dir`` and
            ``top_k`` are read here.

    Returns:
        A tuple ``(data_lower, vocab_str)``: the path of the lowercased gzip
        file and the newline-joined top-k vocabulary.
    """
    counter = Counter()
    data_lower = os.path.join(args.output_dir, "lower.txt.gz")

    # Open the input file either from input.txt or input.txt.gz
    _, file_extension = os.path.splitext(args.input_txt)
    if file_extension == ".gz":
        file_in = gzip.open(args.input_txt, "rt", encoding="utf-8")
    else:
        file_in = open(args.input_txt, encoding="utf-8")

    print("\nConverting to lowercase and counting word occurrences ...")
    # Both handles are context-managed so they are closed even if reading,
    # writing or the progress bar raises part-way through.
    with file_in, gzip.open(data_lower, "wt", encoding="utf-8") as file_out:
        for line in progressbar.progressbar(file_in):
            line_lower = line.lower()
            counter.update(line_lower.split())
            file_out.write(line_lower)

    # Save top-k words
    print("\nSaving top {} words ...".format(args.top_k))
    top_counter = counter.most_common(args.top_k)
    vocab_str = "\n".join(word for word, count in top_counter)
    vocab_path = "vocab-{}.txt".format(args.top_k)
    vocab_path = os.path.join(args.output_dir, vocab_path)
    # Written as UTF-8 explicitly: the text was read as UTF-8 and the same
    # vocabulary is later passed on UTF-8 encoded, so the file must not depend
    # on the platform's default encoding.
    with open(vocab_path, "w+", encoding="utf-8") as file:
        file.write(vocab_str)

    print("\nCalculating word statistics ...")
    total_words = sum(counter.values())
    print(" Your text file has {} words in total".format(total_words))
    print(" It has {} unique words".format(len(counter)))
    if not top_counter:
        # Empty input (or an empty top-k selection): the remaining statistics
        # would divide by zero or index an empty list, so stop reporting here.
        return data_lower, vocab_str

    top_words_sum = sum(count for word, count in top_counter)
    word_fraction = (top_words_sum / total_words) * 100
    print(
        " Your top-{} words are {:.4f} percent of all words".format(
            args.top_k, word_fraction
        )
    )
    print(' Your most common word "{}" occurred {} times'.format(*top_counter[0]))
    last_word, last_count = top_counter[-1]
    print(
        ' The least common word in your top-k is "{}" with {} times'.format(
            last_word, last_count
        )
    )
    # Walk from the least frequent end to find the first word that occurs
    # strictly more often than the last one in the top-k list.
    for i, (w, c) in enumerate(reversed(top_counter)):
        if c > last_count:
            print(
                ' The first word with {} occurrences is "{}" at place {}'.format(
                    c, w, len(top_counter) - 1 - i
                )
            )
            break

    return data_lower, vocab_str
|
||||
|
||||
|
||||
def build_lm(args, data_lower, vocab_str):
    """Create ``lm.binary`` in ``args.output_dir`` by running the KenLM tools.

    Three executables are looked up in ``args.kenlm_bins`` and run in order:
    ``lmplz`` writes ``lm.arpa`` from ``data_lower``, ``filter`` writes
    ``lm_filtered.arpa`` with ``vocab_str`` fed on stdin as UTF-8, and
    ``build_binary`` turns the filtered file into ``lm.binary``.

    Args:
        args: Parsed command-line namespace; ``output_dir``, ``kenlm_bins``,
            ``arpa_order``, ``max_arpa_memory``, ``arpa_prune``,
            ``binary_a_bits``, ``binary_q_bits`` and ``binary_type`` are read.
        data_lower: Path of the lowercased text passed to ``lmplz --text``.
        vocab_str: Newline-joined vocabulary sent to ``filter`` on stdin.

    Raises:
        subprocess.CalledProcessError: If any tool exits with a non-zero status.
    """
    print("\nCreating ARPA file ...")
    lm_path = os.path.join(args.output_dir, "lm.arpa")
    subprocess.check_call(
        [
            os.path.join(args.kenlm_bins, "lmplz"),
            "--order",
            str(args.arpa_order),
            "--temp_prefix",
            args.output_dir,
            "--memory",
            args.max_arpa_memory,
            "--text",
            data_lower,
            "--arpa",
            lm_path,
            "--prune",
            # "0|0|1" on the command line becomes the separate values 0 0 1.
            *args.arpa_prune.split("|"),
        ]
    )

    # Filter LM using vocabulary of top-k words
    print("\nFiltering ARPA file using vocabulary of top-k words ...")
    filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
    subprocess.run(
        [
            os.path.join(args.kenlm_bins, "filter"),
            "single",
            "model:{}".format(lm_path),
            filtered_path,
        ],
        input=vocab_str.encode("utf-8"),
        check=True,
    )

    # Quantize and produce trie binary.
    print("\nBuilding lm.binary ...")
    binary_path = os.path.join(args.output_dir, "lm.binary")
    subprocess.check_call(
        [
            os.path.join(args.kenlm_bins, "build_binary"),
            "-a",
            str(args.binary_a_bits),
            "-q",
            str(args.binary_q_bits),
            "-v",
            args.binary_type,
            filtered_path,
            binary_path,
        ]
    )
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def main():
    """Parse the command line, build the language model and clean up.

    All options are required. After the vocabulary and ``lm.binary`` have been
    produced in ``--output_dir``, the intermediate files ``lower.txt.gz``,
    ``lm.arpa`` and ``lm_filtered.arpa`` are removed from that directory.

    Raises:
        SystemExit: On invalid command-line arguments (raised by argparse).
    """
    parser = argparse.ArgumentParser(
        description="Generate lm.binary and top-k vocab for DeepSpeech."
    )
    parser.add_argument(
        "--input_txt",
        help="Path to a file.txt or file.txt.gz with sample sentences",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--output_dir", help="Directory path for the output", type=str, required=True
    )
    parser.add_argument(
        "--top_k",
        help="Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
        type=int,
        required=True,
    )
    parser.add_argument(
        "--kenlm_bins",
        help="File path to the KENLM binaries lmplz, filter and build_binary",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--arpa_order",
        help="Order of k-grams in ARPA-file generation",
        type=int,
        required=True,
    )
    parser.add_argument(
        "--max_arpa_memory",
        help="Maximum allowed memory usage for ARPA-file generation",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--arpa_prune",
        help="ARPA pruning parameters. Separate values with '|'",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--binary_a_bits",
        help="Build binary quantization value a in bits",
        type=int,
        required=True,
    )
    parser.add_argument(
        "--binary_q_bits",
        help="Build binary quantization value q in bits",
        type=int,
        required=True,
    )
    parser.add_argument(
        "--binary_type",
        help="Build binary data structure type",
        type=str,
        required=True,
    )
    args = parser.parse_args()

    # Reject an unusable vocabulary size up front, before any files are
    # written, instead of failing later on an empty top-k list.
    if args.top_k < 1:
        parser.error("--top_k must be a positive integer")

    # Every output is written below this directory, so make sure it exists.
    os.makedirs(args.output_dir, exist_ok=True)

    data_lower, vocab_str = convert_and_filter_topk(args)
    build_lm(args, data_lower, vocab_str)

    # Delete intermediate files
    for intermediate in ("lower.txt.gz", "lm.arpa", "lm_filtered.arpa"):
        os.remove(os.path.join(args.output_dir, intermediate))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -125,7 +125,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--force_utf8",
|
||||
default="",
|
||||
help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary.",
|
||||
help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user