diff --git a/data/lm/README.rst b/data/lm/README.rst
index c4d1631b..228d210c 100644
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -1,32 +1,23 @@
-Download librispeech corpus
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here `_.
+It is created with `KenLM `_.
+
+You can download the LibriSpeech corpus with the following commands:
 
 .. code-block:: bash
 
     wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -O librispeech.txt.gz
     gunzip librispeech.txt.gz
 
-|
-| Generate vocab-500000.txt and lm.binary files
-| Optional Parameters:
-
-* '--kenlm_bins path/to/bins/': Change the path of the kenlm binaries (defaults to directory in docker container)
-* '--top_k 500000': Change the number of most frequent words
-* '--arpa_order 5': Change order of k-grams in arpa-file generation
-* '--max_arpa_memory 75%': Set maximum allowed memory usage for arpa-file generation
+Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`:
 
 .. code-block:: bash
 
     python3 data/lm/generate_lm.py --input_txt path/to/vocab_sentences.txt --output_dir path/lm/
 
-|
-| Generate scorer package with the above vocab-500000.txt and lm.binary files
-| Optional Parameters:
-* '--default_alpha 0.75'
-* '--default_beta 1.85'
-* '--force_utf8 ""': See `link `_ for explanation
+Afterwards, you can generate the scorer package with the vocab-500000.txt and lm.binary files created above:
 
 .. code-block:: bash
 
-    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --package kenlm.scorer
+    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85
diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py
index f6446c62..410dd8a4 100644
--- a/data/lm/generate_lm.py
+++ b/data/lm/generate_lm.py
@@ -6,129 +6,172 @@
 from collections import Counter
 
 import progressbar
 
-# ======================================================================================================================
-
 
 def convert_and_filter_topk(args):
     """ Convert to lowercase, count word occurrences and save top-k words to a file """
 
     counter = Counter()
-    data_lower = os.path.join(args.output_dir, 'lower.txt')
+    data_lower = os.path.join(args.output_dir, "lower.txt")
 
-    print('\nConverting to lowercase and counting word occurrences ...')
-    with open(data_lower, 'w+', encoding='utf8') as file_out:
-        with open(args.input_txt, encoding='utf8') as file_in:
+    print("\nConverting to lowercase and counting word occurrences ...")
+    with open(data_lower, "w+", encoding="utf8") as file_out:
+        with open(args.input_txt, encoding="utf8") as file_in:
             for line in progressbar.progressbar(file_in):
                 line_lower = line.lower()
                 counter.update(line_lower.split())
                 file_out.write(line_lower)
 
     # Save top-k words
-    print('\nSaving top {} words ...'.format(args.top_k))
+    print("\nSaving top {} words ...".format(args.top_k))
     top_counter = counter.most_common(args.top_k)
-    vocab_str = '\n'.join(word for word, count in top_counter)
-    vocab_path = 'vocab-{}.txt'.format(args.top_k)
+    vocab_str = "\n".join(word for word, count in top_counter)
+    vocab_path = "vocab-{}.txt".format(args.top_k)
     vocab_path = os.path.join(args.output_dir, vocab_path)
-    with open(vocab_path, 'w+') as file:
+    with open(vocab_path, "w+") as file:
         file.write(vocab_str)
 
-    print('\nCalculating word statistics ...')
+    print("\nCalculating word statistics ...")
     total_words = sum(counter.values())
-    print('  Your text file has {} words in total'.format(total_words))
-    print('  It has {} unique words'.format(len(counter)))
+    print("  Your text file has {} words in total".format(total_words))
+    print("  It has {} unique words".format(len(counter)))
     top_words_sum = sum(count for word, count in top_counter)
     word_fraction = (top_words_sum / total_words) * 100
-    print('  Your top-{} words are {:.4f} percent of all words'.format(args.top_k, word_fraction))
+    print(
+        "  Your top-{} words are {:.4f} percent of all words".format(
+            args.top_k, word_fraction
+        )
+    )
     print('  Your most common word "{}" occurred {} times'.format(*top_counter[0]))
     last_word, last_count = top_counter[-1]
-    print('  The least common word in your top-k is "{}" with {} times'.format(last_word, last_count))
+    print(
+        '  The least common word in your top-k is "{}" with {} times'.format(
+            last_word, last_count
+        )
+    )
 
     for i, (w, c) in enumerate(reversed(top_counter)):
         if c > last_count:
-            print('  The first word with {} occurrences is "{}" at place {}'.format(c, w, len(top_counter) - 1 - i))
+            print(
+                '  The first word with {} occurrences is "{}" at place {}'.format(
+                    c, w, len(top_counter) - 1 - i
+                )
+            )
             break
 
     return data_lower, vocab_str
 
-# ======================================================================================================================
-
 
 def build_lm(args, data_lower, vocab_str):
-    print('\nCreating ARPA file ...')
-    lm_path = os.path.join(args.output_dir, 'lm.arpa')
-    subprocess.check_call([
-        args.kenlm_bins + 'lmplz',
-        '--order', str(args.arpa_order),
-        '--temp_prefix', args.output_dir,
-        '--memory', args.max_arpa_memory,
-        '--text', data_lower,
-        '--arpa', lm_path,
-        '--prune', '0', '0', '1'
-    ])
+    print("\nCreating ARPA file ...")
+    lm_path = os.path.join(args.output_dir, "lm.arpa")
+    subprocess.check_call(
+        [
+            args.kenlm_bins + "lmplz",
+            "--order",
+            str(args.arpa_order),
+            "--temp_prefix",
+            args.output_dir,
+            "--memory",
+            args.max_arpa_memory,
+            "--text",
+            data_lower,
+            "--arpa",
+            lm_path,
+            "--prune",
+            *args.arpa_prune.split("|"),
+        ]
+    )
 
     # Filter LM using vocabulary of top 500k words
-    print('\nFiltering ARPA file using vocabulary of top-k words ...')
-    filtered_path = os.path.join(args.output_dir, 'lm_filtered.arpa')
-    subprocess.run([
-        args.kenlm_bins + 'filter',
-        'single',
-        'model:{}'.format(lm_path),
-        filtered_path
-    ], input=vocab_str.encode('utf-8'), check=True)
+    print("\nFiltering ARPA file using vocabulary of top-k words ...")
+    filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
+    subprocess.run(
+        [
+            args.kenlm_bins + "filter",
+            "single",
+            "model:{}".format(lm_path),
+            filtered_path,
+        ],
+        input=vocab_str.encode("utf-8"),
+        check=True,
+    )
 
     # Quantize and produce trie binary.
-    print('\nBuilding lm.binary ...')
-    binary_path = os.path.join(args.output_dir, 'lm.binary')
-    subprocess.check_call([
-        args.kenlm_bins + 'build_binary',
-        '-a', '255',
-        '-q', '8',
-        '-v',
-        'trie',
-        filtered_path,
-        binary_path
-    ])
+    print("\nBuilding lm.binary ...")
+    binary_path = os.path.join(args.output_dir, "lm.binary")
+    subprocess.check_call(
+        [
+            args.kenlm_bins + "build_binary",
+            "-a",
+            str(args.binary_a_bits),
+            "-q",
+            str(args.binary_q_bits),
+            "-v",
+            args.binary_type,
+            filtered_path,
+            binary_path,
+        ]
+    )
 
-# ======================================================================================================================
-
 
 def main():
     parser = argparse.ArgumentParser(
-        description='Generate an lm.binary and top-k vocab for DeepSpeech.'
+ description="Generate lm.binary and top-k vocab for DeepSpeech." ) parser.add_argument( - '--input_txt', - help='File path to a .txt with sample sentences', + "--input_txt", + help="File path to a .txt with sample sentences", type=str, - required=True + required=True, ) parser.add_argument( - '--output_dir', - help='Directory path for the output', - type=str, - required=True + "--output_dir", help="Directory path for the output", type=str, required=True ) parser.add_argument( - '--top_k', - help='Use top_k most frequent words for the vocab.txt file', + "--top_k", + help="Use top_k most frequent words for the vocab.txt file", type=int, - default=500000 + default=500000, ) parser.add_argument( - '--kenlm_bins', - help='File path to the kenlm binaries lmplz, filter and build_binary', + "--kenlm_bins", + help="File path to the KENLM binaries lmplz, filter and build_binary", type=str, - default='/DeepSpeech/native_client/kenlm/build/bin/' + default="/DeepSpeech/native_client/kenlm/build/bin/", ) parser.add_argument( - '--arpa_order', - help='Order of k-grams in arpa-file generation', + "--arpa_order", + help="Order of k-grams in ARPA-file generation", type=int, - default=5 + default=5, ) parser.add_argument( - '--max_arpa_memory', - help='Maximum allowed memory usage in arpa-file generation', + "--max_arpa_memory", + help="Maximum allowed memory usage for ARPA-file generation", type=str, - default='75%' + default="75%", + ) + parser.add_argument( + "--arpa_prune", + help='ARPA pruning parameters. Separate values with "|"', + type=str, + default="0|0|1", + ) + parser.add_argument( + "--binary_a_bits", + help="Build binary quantization value a in bits", + type=int, + default=255, + ) + parser.add_argument( + "--binary_q_bits", + help="Build binary quantization value q in bits", + type=int, + default=8, + ) + parser.add_argument( + "--binary_type", + help="Build binary data structure type", + type=str, + default="trie", ) args = parser.parse_args() @@ -136,12 +179,10 @@ def main(): build_lm(args, data_lower, vocab_str) # Delete intermediate files - os.remove(os.path.join(args.output_dir, 'lower.txt')) - os.remove(os.path.join(args.output_dir, 'lm.arpa')) - os.remove(os.path.join(args.output_dir, 'lm_filtered.arpa')) + os.remove(os.path.join(args.output_dir, "lower.txt")) + os.remove(os.path.join(args.output_dir, "lm.arpa")) + os.remove(os.path.join(args.output_dir, "lm_filtered.arpa")) -# ====================================================================================================================== - -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/data/lm/generate_package.py b/data/lm/generate_package.py index 0df1ee96..98b6cd39 100644 --- a/data/lm/generate_package.py +++ b/data/lm/generate_package.py @@ -16,13 +16,13 @@ from ds_ctcdecoder import Scorer, Alphabet as NativeAlphabet def create_bundle( - alphabet_path, - lm_path, - vocab_path, - package_path, - force_utf8, - default_alpha, - default_beta, + alphabet_path, + lm_path, + vocab_path, + package_path, + force_utf8, + default_alpha, + default_beta, ): words = set() vocab_looks_char_based = True @@ -112,27 +112,23 @@ def main(): required=True, help="Path of vocabulary file. Must contain words separated by whitespace.", ) - parser.add_argument( - "--package", - required=True, - help="Path to save scorer package." 
-    )
+    parser.add_argument("--package", required=True, help="Path to save scorer package.")
     parser.add_argument(
         "--default_alpha",
         type=float,
-        default=0.75,
+        required=True,
         help="Default value of alpha hyperparameter.",
     )
     parser.add_argument(
         "--default_beta",
         type=float,
-        default=1.85,
+        required=True,
         help="Default value of beta hyperparameter.",
     )
     parser.add_argument(
         "--force_utf8",
         default="",
-        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. Using this wrong can result in a 'Segmentation fault' error at the model evaluation.",
+        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See for further explanation",
     )
 
     args = parser.parse_args()
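Note: putting the pieces above together, an end-to-end run of the two scripts might look like the following sketch. The paths are placeholders and every numeric value shown is simply the default defined by the argument parsers above, so this is illustrative only:

.. code-block:: bash

    # Paths below are placeholders; adjust them to your setup.
    # All option values shown are the script defaults.
    python3 data/lm/generate_lm.py --input_txt librispeech.txt --output_dir path/lm/ \
        --top_k 500000 --kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/ \
        --arpa_order 5 --max_arpa_memory "75%" --arpa_prune "0|0|1" \
        --binary_a_bits 255 --binary_q_bits 8 --binary_type trie

    # Package the resulting lm.binary and vocab-500000.txt into a scorer.
    python3 data/lm/generate_package.py --alphabet path/to/alphabet.txt \
        --lm path/lm/lm.binary --vocab path/lm/vocab-500000.txt \
        --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85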