Add more parameters.

Implement requested review changes.
Daniel 2020-04-01 16:54:58 +02:00
parent a79cc0cee9
commit b27e0347b1
3 changed files with 135 additions and 107 deletions

View File

@@ -1,32 +1,23 @@
The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_.
It is created with `KenLM <https://github.com/kpu/kenlm>`_.

You can download the LibriSpeech corpus with the following commands:

.. code-block:: bash

    wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -O librispeech.txt.gz
    gunzip librispeech.txt.gz

Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`:

.. code-block:: bash

    python3 data/lm/generate_lm.py --input_txt path/to/vocab_sentences.txt --output_dir path/lm/
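The remaining parameters of `generate_lm.py` are optional and default to the values defined in the script (shown below). As a sketch with placeholder paths, a run that overrides some of them could look like this:

.. code-block:: bash

    python3 data/lm/generate_lm.py --input_txt path/to/vocab_sentences.txt --output_dir path/lm/ \
        --top_k 500000 --kenlm_bins path/to/kenlm/build/bin/ \
        --arpa_order 5 --max_arpa_memory "75%" --arpa_prune "0|0|1"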
Afterwards you can generate the scorer package with the above vocab-500000.txt and lm.binary files:

.. code-block:: bash

    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85
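`generate_package.py` also accepts a `--force_utf8` flag that forces UTF-8 mode on or off in the scorer package; when it is left unset, the mode is inferred from the vocabulary (see the Decoder documentation linked in the script's help text). A sketch, assuming the flag accepts a plain boolean string such as `true`:

.. code-block:: bash

    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
        --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85 --force_utf8 true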

View File

@@ -6,129 +6,172 @@ from collections import Counter
import progressbar


def convert_and_filter_topk(args):
    """ Convert to lowercase, count word occurrences and save top-k words to a file """

    counter = Counter()
    data_lower = os.path.join(args.output_dir, "lower.txt")

    print("\nConverting to lowercase and counting word occurrences ...")
    with open(data_lower, "w+", encoding="utf8") as file_out:
        with open(args.input_txt, encoding="utf8") as file_in:
            for line in progressbar.progressbar(file_in):
                line_lower = line.lower()
                counter.update(line_lower.split())
                file_out.write(line_lower)

    # Save top-k words
    print("\nSaving top {} words ...".format(args.top_k))
    top_counter = counter.most_common(args.top_k)
    vocab_str = "\n".join(word for word, count in top_counter)
    vocab_path = "vocab-{}.txt".format(args.top_k)
    vocab_path = os.path.join(args.output_dir, vocab_path)
    with open(vocab_path, "w+") as file:
        file.write(vocab_str)

    print("\nCalculating word statistics ...")
    total_words = sum(counter.values())
    print(" Your text file has {} words in total".format(total_words))
    print(" It has {} unique words".format(len(counter)))
    top_words_sum = sum(count for word, count in top_counter)
    word_fraction = (top_words_sum / total_words) * 100
    print(
        " Your top-{} words are {:.4f} percent of all words".format(
            args.top_k, word_fraction
        )
    )
    print(' Your most common word "{}" occurred {} times'.format(*top_counter[0]))
    last_word, last_count = top_counter[-1]
    print(
        ' The least common word in your top-k is "{}" with {} times'.format(
            last_word, last_count
        )
    )
    for i, (w, c) in enumerate(reversed(top_counter)):
        if c > last_count:
            print(
                ' The first word with {} occurrences is "{}" at place {}'.format(
                    c, w, len(top_counter) - 1 - i
                )
            )
            break

    return data_lower, vocab_str


def build_lm(args, data_lower, vocab_str):
    print("\nCreating ARPA file ...")
    lm_path = os.path.join(args.output_dir, "lm.arpa")
    subprocess.check_call(
        [
            args.kenlm_bins + "lmplz",
            "--order",
            str(args.arpa_order),
            "--temp_prefix",
            args.output_dir,
            "--memory",
            args.max_arpa_memory,
            "--text",
            data_lower,
            "--arpa",
            lm_path,
            "--prune",
            *args.arpa_prune.split("|"),
        ]
    )

    # Filter LM using vocabulary of top-k words
    print("\nFiltering ARPA file using vocabulary of top-k words ...")
    filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
    subprocess.run(
        [
            args.kenlm_bins + "filter",
            "single",
            "model:{}".format(lm_path),
            filtered_path,
        ],
        input=vocab_str.encode("utf-8"),
        check=True,
    )

    # Quantize and produce trie binary.
    print("\nBuilding lm.binary ...")
    binary_path = os.path.join(args.output_dir, "lm.binary")
    subprocess.check_call(
        [
            args.kenlm_bins + "build_binary",
            "-a",
            str(args.binary_a_bits),
            "-q",
            str(args.binary_q_bits),
            "-v",
            args.binary_type,
            filtered_path,
            binary_path,
        ]
    )


def main():
    parser = argparse.ArgumentParser(
        description="Generate lm.binary and top-k vocab for DeepSpeech."
    )
    parser.add_argument(
        "--input_txt",
        help="File path to a .txt with sample sentences",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--output_dir", help="Directory path for the output", type=str, required=True
    )
    parser.add_argument(
        "--top_k",
        help="Use top_k most frequent words for the vocab.txt file",
        type=int,
        default=500000,
    )
    parser.add_argument(
        "--kenlm_bins",
        help="File path to the KenLM binaries lmplz, filter and build_binary",
        type=str,
        default="/DeepSpeech/native_client/kenlm/build/bin/",
    )
    parser.add_argument(
        "--arpa_order",
        help="Order of k-grams in ARPA-file generation",
        type=int,
        default=5,
    )
    parser.add_argument(
        "--max_arpa_memory",
        help="Maximum allowed memory usage for ARPA-file generation",
        type=str,
        default="75%",
    )
    parser.add_argument(
        "--arpa_prune",
        help='ARPA pruning parameters. Separate values with "|"',
        type=str,
        default="0|0|1",
    )
    parser.add_argument(
        "--binary_a_bits",
        help="Build binary quantization value a in bits",
        type=int,
        default=255,
    )
    parser.add_argument(
        "--binary_q_bits",
        help="Build binary quantization value q in bits",
        type=int,
        default=8,
    )
    parser.add_argument(
        "--binary_type",
        help="Build binary data structure type",
        type=str,
        default="trie",
    )

    args = parser.parse_args()
@@ -136,12 +179,10 @@ def main():
    build_lm(args, data_lower, vocab_str)

    # Delete intermediate files
    os.remove(os.path.join(args.output_dir, "lower.txt"))
    os.remove(os.path.join(args.output_dir, "lm.arpa"))
    os.remove(os.path.join(args.output_dir, "lm_filtered.arpa"))


if __name__ == "__main__":
    main()
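The parameters added in this commit expose the full KenLM build: `--arpa_prune` is split on "|" and passed straight to `lmplz --prune`, while `--binary_a_bits`, `--binary_q_bits` and `--binary_type` feed `build_binary`. A sketch with placeholder paths, assuming one pruning threshold per n-gram order up to the chosen `--arpa_order`:

.. code-block:: bash

    python3 generate_lm.py --input_txt librispeech.txt --output_dir . \
        --arpa_order 4 --arpa_prune "0|0|1|2" \
        --binary_a_bits 255 --binary_q_bits 8 --binary_type trie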

View File

@@ -16,13 +16,13 @@ from ds_ctcdecoder import Scorer, Alphabet as NativeAlphabet
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
@@ -112,27 +112,23 @@ def main():
        required=True,
        help="Path of vocabulary file. Must contain words separated by whitespace.",
    )
    parser.add_argument("--package", required=True, help="Path to save scorer package.")
    parser.add_argument(
        "--default_alpha",
        type=float,
        required=True,
        help="Default value of alpha hyperparameter.",
    )
    parser.add_argument(
        "--default_beta",
        type=float,
        required=True,
        help="Default value of beta hyperparameter.",
    )
    parser.add_argument(
        "--force_utf8",
        default="",
        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
    )

    args = parser.parse_args()
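Note that `--default_alpha` and `--default_beta` are now required rather than defaulted, so every invocation of `generate_package.py` has to set them explicitly, matching the documentation example above:

.. code-block:: bash

    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary \
        --vocab vocab-500000.txt --package kenlm.scorer \
        --default_alpha 0.75 --default_beta 1.85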