Add more parameters.

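Implement requested changes: expose the KenLM pruning and build_binary settings as command-line options in generate_lm.py, make --default_alpha and --default_beta required in generate_package.py, and update the README accordingly.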
2020-04-01 16:54:58 +02:00 · 2020-04-01 16:54:58 +02:00 · b27e0347b1
commit b27e0347b1
parent a79cc0cee9
3 changed files with 135 additions and 107 deletions
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@ -1,32 +1,23 @@
-Download librispeech corpus
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_.
+It is created with `KenLM <https://github.com/kpu/kenlm>`_.
+
+You can download the LibriSpeech corpus with the following commands:

 .. code-block:: bash

    wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -O librispeech.txt.gz
    gunzip librispeech.txt.gz

-|
-| Generate vocab-500000.txt and lm.binary files
-| Optional Parameters:
-
-* '--kenlm_bins path/to/bins/':  Change the path of the kenlm binaries (defaults to directory in docker container)
-* '--top_k 500000': Change the number of most frequent words
-* '--arpa_order 5': Change order of k-grams in arpa-file generation
-* '--max_arpa_memory 75%': Set maximum allowed memory usage for arpa-file generation

+Then use the ``generate_lm.py`` script to generate ``lm.binary`` and ``vocab-500000.txt``.

 .. code-block:: bash

    python3 data/lm/generate_lm.py --input_txt path/to/vocab_sentences.txt --output_dir path/lm/

-|
-| Generate scorer package with the above vocab-500000.txt and lm.binary files
-| Optional Parameters:

-* '--default_alpha 0.75'
-* '--default_beta 1.85'
-* '--force_utf8 ""': See `link <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode>`_ for explanation
+Afterwards, you can generate the scorer package from the ``vocab-500000.txt`` and ``lm.binary`` files produced above:

 .. code-block:: bash

-    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --package kenlm.scorer
+    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85
--- a/data/lm/generate_lm.py
+++ b/data/lm/generate_lm.py
@ -6,129 +6,172 @@ from collections import Counter
 import progressbar


-# ======================================================================================================================
-
 def convert_and_filter_topk(args):
    """ Convert to lowercase, count word occurrences and save top-k words to a file """

    counter = Counter()
-    data_lower = os.path.join(args.output_dir, 'lower.txt')
+    data_lower = os.path.join(args.output_dir, "lower.txt")

-    print('\nConverting to lowercase and counting word occurrences ...')
-    with open(data_lower, 'w+', encoding='utf8') as file_out:
-        with open(args.input_txt, encoding='utf8') as file_in:
+    print("\nConverting to lowercase and counting word occurrences ...")
+    with open(data_lower, "w+", encoding="utf8") as file_out:
+        with open(args.input_txt, encoding="utf8") as file_in:
            for line in progressbar.progressbar(file_in):
                line_lower = line.lower()
                counter.update(line_lower.split())
                file_out.write(line_lower)

    # Save top-k words
-    print('\nSaving top {} words ...'.format(args.top_k))
+    print("\nSaving top {} words ...".format(args.top_k))
    top_counter = counter.most_common(args.top_k)
-    vocab_str = '\n'.join(word for word, count in top_counter)
-    vocab_path = 'vocab-{}.txt'.format(args.top_k)
+    vocab_str = "\n".join(word for word, count in top_counter)
+    vocab_path = "vocab-{}.txt".format(args.top_k)
    vocab_path = os.path.join(args.output_dir, vocab_path)
-    with open(vocab_path, 'w+') as file:
+    with open(vocab_path, "w+") as file:
        file.write(vocab_str)

-    print('\nCalculating word statistics ...')
+    print("\nCalculating word statistics ...")
    total_words = sum(counter.values())
-    print('  Your text file has {} words in total'.format(total_words))
-    print('  It has {} unique words'.format(len(counter)))
+    print("  Your text file has {} words in total".format(total_words))
+    print("  It has {} unique words".format(len(counter)))
    top_words_sum = sum(count for word, count in top_counter)
    word_fraction = (top_words_sum / total_words) * 100
-    print('  Your top-{} words are {:.4f} percent of all words'.format(args.top_k, word_fraction))
+    print(
+        "  Your top-{} words are {:.4f} percent of all words".format(
+            args.top_k, word_fraction
+        )
+    )
    print('  Your most common word "{}" occurred {} times'.format(*top_counter[0]))
    last_word, last_count = top_counter[-1]
-    print('  The least common word in your top-k is "{}" with {} times'.format(last_word, last_count))
+    print(
+        '  The least common word in your top-k is "{}" with {} times'.format(
+            last_word, last_count
+        )
+    )
    for i, (w, c) in enumerate(reversed(top_counter)):
        if c > last_count:
-            print('  The first word with {} occurrences is "{}" at place {}'.format(c, w, len(top_counter) - 1 - i))
+            print(
+                '  The first word with {} occurrences is "{}" at place {}'.format(
+                    c, w, len(top_counter) - 1 - i
+                )
+            )
            break

    return data_lower, vocab_str


-# ======================================================================================================================
-
 def build_lm(args, data_lower, vocab_str):
-    print('\nCreating ARPA file ...')
-    lm_path = os.path.join(args.output_dir, 'lm.arpa')
-    subprocess.check_call([
-        args.kenlm_bins + 'lmplz',
-        '--order', str(args.arpa_order),
-        '--temp_prefix', args.output_dir,
-        '--memory', args.max_arpa_memory,
-        '--text', data_lower,
-        '--arpa', lm_path,
-        '--prune', '0', '0', '1'
-    ])
+    print("\nCreating ARPA file ...")
+    lm_path = os.path.join(args.output_dir, "lm.arpa")
+    subprocess.check_call(
+        [
+            args.kenlm_bins + "lmplz",
+            "--order",
+            str(args.arpa_order),
+            "--temp_prefix",
+            args.output_dir,
+            "--memory",
+            args.max_arpa_memory,
+            "--text",
+            data_lower,
+            "--arpa",
+            lm_path,
+            "--prune",
+            *args.arpa_prune.split("|"),
+        ]
+    )

    # Filter LM using vocabulary of top 500k words
-    print('\nFiltering ARPA file using vocabulary of top-k words ...')
-    filtered_path = os.path.join(args.output_dir, 'lm_filtered.arpa')
-    subprocess.run([
-        args.kenlm_bins + 'filter',
-        'single',
-        'model:{}'.format(lm_path),
-        filtered_path
-    ], input=vocab_str.encode('utf-8'), check=True)
+    print("\nFiltering ARPA file using vocabulary of top-k words ...")
+    filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
+    subprocess.run(
+        [
+            args.kenlm_bins + "filter",
+            "single",
+            "model:{}".format(lm_path),
+            filtered_path,
+        ],
+        input=vocab_str.encode("utf-8"),
+        check=True,
+    )

    # Quantize and produce trie binary.
-    print('\nBuilding lm.binary ...')
-    binary_path = os.path.join(args.output_dir, 'lm.binary')
-    subprocess.check_call([
-        args.kenlm_bins + 'build_binary',
-        '-a', '255',
-        '-q', '8',
-        '-v',
-        'trie',
-        filtered_path,
-        binary_path
-    ])
+    print("\nBuilding lm.binary ...")
+    binary_path = os.path.join(args.output_dir, "lm.binary")
+    subprocess.check_call(
+        [
+            args.kenlm_bins + "build_binary",
+            "-a",
+            str(args.binary_a_bits),
+            "-q",
+            str(args.binary_q_bits),
+            "-v",
+            args.binary_type,
+            filtered_path,
+            binary_path,
+        ]
+    )


-# ======================================================================================================================
-
 def main():
    parser = argparse.ArgumentParser(
-        description='Generate an lm.binary and top-k vocab for DeepSpeech.'
+        description="Generate lm.binary and top-k vocab for DeepSpeech."
    )
    parser.add_argument(
-        '--input_txt',
-        help='File path to a .txt with sample sentences',
+        "--input_txt",
+        help="File path to a .txt with sample sentences",
        type=str,
-        required=True
+        required=True,
    )
    parser.add_argument(
-        '--output_dir',
-        help='Directory path for the output',
-        type=str,
-        required=True
+        "--output_dir", help="Directory path for the output", type=str, required=True
    )
    parser.add_argument(
-        '--top_k',
-        help='Use top_k most frequent words for the vocab.txt file',
+        "--top_k",
+        help="Use top_k most frequent words for the vocab.txt file",
        type=int,
-        default=500000
+        default=500000,
    )
    parser.add_argument(
-        '--kenlm_bins',
-        help='File path to the kenlm binaries lmplz, filter and build_binary',
+        "--kenlm_bins",
+        help="File path to the KENLM binaries lmplz, filter and build_binary",
        type=str,
-        default='/DeepSpeech/native_client/kenlm/build/bin/'
+        default="/DeepSpeech/native_client/kenlm/build/bin/",
    )
    parser.add_argument(
-        '--arpa_order',
-        help='Order of k-grams in arpa-file generation',
+        "--arpa_order",
+        help="Order of k-grams in ARPA-file generation",
        type=int,
-        default=5
+        default=5,
    )
    parser.add_argument(
-        '--max_arpa_memory',
-        help='Maximum allowed memory usage in arpa-file generation',
+        "--max_arpa_memory",
+        help="Maximum allowed memory usage for ARPA-file generation",
        type=str,
-        default='75%'
+        default="75%",
+    )
+    parser.add_argument(
+        "--arpa_prune",
+        help='ARPA pruning parameters. Separate values with "|"',
+        type=str,
+        default="0|0|1",
+    )
+    parser.add_argument(
+        "--binary_a_bits",
+        help="Build binary quantization value a in bits",
+        type=int,
+        default=255,
+    )
+    parser.add_argument(
+        "--binary_q_bits",
+        help="Build binary quantization value q in bits",
+        type=int,
+        default=8,
+    )
+    parser.add_argument(
+        "--binary_type",
+        help="Build binary data structure type",
+        type=str,
+        default="trie",
    )
    args = parser.parse_args()

@ -136,12 +179,10 @@ def main():
    build_lm(args, data_lower, vocab_str)

    # Delete intermediate files
-    os.remove(os.path.join(args.output_dir, 'lower.txt'))
-    os.remove(os.path.join(args.output_dir, 'lm.arpa'))
-    os.remove(os.path.join(args.output_dir, 'lm_filtered.arpa'))
+    os.remove(os.path.join(args.output_dir, "lower.txt"))
+    os.remove(os.path.join(args.output_dir, "lm.arpa"))
+    os.remove(os.path.join(args.output_dir, "lm_filtered.arpa"))


-# ======================================================================================================================
-
-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/data/lm/generate_package.py
+++ b/data/lm/generate_package.py
@ -16,13 +16,13 @@ from ds_ctcdecoder import Scorer, Alphabet as NativeAlphabet


 def create_bundle(
-        alphabet_path,
-        lm_path,
-        vocab_path,
-        package_path,
-        force_utf8,
-        default_alpha,
-        default_beta,
+    alphabet_path,
+    lm_path,
+    vocab_path,
+    package_path,
+    force_utf8,
+    default_alpha,
+    default_beta,
 ):
    words = set()
    vocab_looks_char_based = True
@ -112,27 +112,23 @@ def main():
        required=True,
        help="Path of vocabulary file. Must contain words separated by whitespace.",
    )
-    parser.add_argument(
-        "--package",
-        required=True,
-        help="Path to save scorer package."
-    )
+    parser.add_argument("--package", required=True, help="Path to save scorer package.")
    parser.add_argument(
        "--default_alpha",
        type=float,
-        default=0.75,
+        required=True,
        help="Default value of alpha hyperparameter.",
    )
    parser.add_argument(
        "--default_beta",
        type=float,
-        default=1.85,
+        required=True,
        help="Default value of beta hyperparameter.",
    )
    parser.add_argument(
        "--force_utf8",
        default="",
-        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. Using this wrong can result in a 'Segmentation fault' error at the model evaluation.",
+        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
    )
    args = parser.parse_args()