Merge branch 'pr2801' (Fixes #2801)

2020-04-14 13:07:50 +02:00 · 2020-04-14 13:07:50 +02:00 · 40250988db
commit 40250988db
parent c80d7f6f3d c27387fd98
3 changed files with 216 additions and 54 deletions
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@ -1,8 +1,30 @@
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_.
+It was created with `KenLM <https://github.com/kpu/kenlm>`_.

-The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
-
-The scorer package was then built using the `generate_package.py` script:
+You can download the LibriSpeech normalized LM training text with the following command:

 .. code-block:: bash
-    python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
-    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
+
+    wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
+
+
+Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
+
+As input you can use a plain text file (e.g. `file.txt`) or a gzipped text file (e.g. `file.txt.gz`) with one sentence per line.
+
+If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first and then pass the directory containing the built binaries (`lmplz`, `filter` and `build_binary`) to the script via `--kenlm_bins`.
+
+.. code-block:: bash
+
+    python3 data/lm/generate_lm.py --input_txt librispeech-lm-norm.txt.gz \
+      --output_dir . --top_k 500000 --kenlm_bins path/to/kenlm/build/bin/ \
+      --arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \
+      --binary_a_bits 255 --binary_q_bits 8 --binary_type trie
+
+
+Afterwards you can use `generate_package.py` to generate the scorer package from the `lm.binary` and `vocab-500000.txt` files:
+
+.. code-block:: bash
+
+    python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
+      --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85
--- a/data/lm/generate_lm.py
+++ b/data/lm/generate_lm.py
@ -1,63 +1,203 @@
+import argparse
 import gzip
 import io
 import os
 import subprocess
-import tempfile
-
 from collections import Counter
-from urllib import request

-def main():
-  # Grab corpus.
-  url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
+import progressbar

-  with tempfile.TemporaryDirectory() as tmp:
-    data_upper = os.path.join(tmp, 'upper.txt.gz')
-    print('Downloading {} into {}...'.format(url, data_upper))
-    request.urlretrieve(url, data_upper)

-    # Convert to lowercase and count word occurences.
+def convert_and_filter_topk(args):
+    """ Convert to lowercase, count word occurrences and save top-k words to a file """
+
    counter = Counter()
-    data_lower = os.path.join(tmp, 'lower.txt.gz')
-    print('Converting to lower case and counting word frequencies...')
-    with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
-      with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
-        for line in upper:
-          line_lower = line.lower()
-          counter.update(line_lower.split())
-          lower.write(line_lower)
+    data_lower = os.path.join(args.output_dir, "lower.txt.gz")

-    # Build pruned LM.
-    lm_path = os.path.join(tmp, 'lm.arpa')
-    print('Creating ARPA file...')
-    subprocess.check_call([
-      'lmplz', '--order', '5',
-               '--temp_prefix', tmp,
-               '--memory', '50%',
-               '--text', data_lower,
-               '--arpa', lm_path,
-               '--prune', '0', '0', '1'
-    ])
+    print("\nConverting to lowercase and counting word occurrences ...")
+    with io.TextIOWrapper(
+        io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
+    ) as file_out:

-    vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
-    with open('librispeech-vocab-500k.txt', 'w') as fout:
-      fout.write(vocab_str)
+        # Open the input file either from input.txt or input.txt.gz
+        _, file_extension = os.path.splitext(args.input_txt)
+        if file_extension == ".gz":
+            file_in = io.TextIOWrapper(
+                io.BufferedReader(gzip.open(args.input_txt)), encoding="utf-8"
+            )
+        else:
+            file_in = open(args.input_txt, encoding="utf-8")

-    # Filter LM using vocabulary of top 500k words
-    print('Filtering ARPA file...')
-    filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
-    subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True)
+        for line in progressbar.progressbar(file_in):
+            line_lower = line.lower()
+            counter.update(line_lower.split())
+            file_out.write(line_lower)
+
+        file_in.close()
+
+    # Save top-k words
+    print("\nSaving top {} words ...".format(args.top_k))
+    top_counter = counter.most_common(args.top_k)
+    vocab_str = "\n".join(word for word, count in top_counter)
+    vocab_path = "vocab-{}.txt".format(args.top_k)
+    vocab_path = os.path.join(args.output_dir, vocab_path)
+    with open(vocab_path, "w+") as file:
+        file.write(vocab_str)
+
+    print("\nCalculating word statistics ...")
+    total_words = sum(counter.values())
+    print("  Your text file has {} words in total".format(total_words))
+    print("  It has {} unique words".format(len(counter)))
+    top_words_sum = sum(count for word, count in top_counter)
+    word_fraction = (top_words_sum / total_words) * 100
+    print(
+        "  Your top-{} words are {:.4f} percent of all words".format(
+            args.top_k, word_fraction
+        )
+    )
+    print('  Your most common word "{}" occurred {} times'.format(*top_counter[0]))
+    last_word, last_count = top_counter[-1]
+    print(
+        '  The least common word in your top-k is "{}" with {} times'.format(
+            last_word, last_count
+        )
+    )
+    for i, (w, c) in enumerate(reversed(top_counter)):
+        if c > last_count:
+            print(
+                '  The first word with {} occurrences is "{}" at place {}'.format(
+                    c, w, len(top_counter) - 1 - i
+                )
+            )
+            break
+
+    return data_lower, vocab_str
+
+
+def build_lm(args, data_lower, vocab_str):
+    print("\nCreating ARPA file ...")
+    lm_path = os.path.join(args.output_dir, "lm.arpa")
+    subprocess.check_call(
+        [
+            os.path.join(args.kenlm_bins, "lmplz"),
+            "--order",
+            str(args.arpa_order),
+            "--temp_prefix",
+            args.output_dir,
+            "--memory",
+            args.max_arpa_memory,
+            "--text",
+            data_lower,
+            "--arpa",
+            lm_path,
+            "--prune",
+            *args.arpa_prune.split("|"),
+        ]
+    )
+
+    # Filter LM using vocabulary of top-k words
+    print("\nFiltering ARPA file using vocabulary of top-k words ...")
+    filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
+    subprocess.run(
+        [
+            os.path.join(args.kenlm_bins, "filter"),
+            "single",
+            "model:{}".format(lm_path),
+            filtered_path,
+        ],
+        input=vocab_str.encode("utf-8"),
+        check=True,
+    )

    # Quantize and produce trie binary.
-    print('Building lm.binary...')
-    subprocess.check_call([
-      'build_binary', '-a', '255',
-                      '-q', '8',
-                      '-v',
-                      'trie',
-                      filtered_path,
-                      'lm.binary'
-    ])
+    print("\nBuilding lm.binary ...")
+    binary_path = os.path.join(args.output_dir, "lm.binary")
+    subprocess.check_call(
+        [
+            os.path.join(args.kenlm_bins, "build_binary"),
+            "-a",
+            str(args.binary_a_bits),
+            "-q",
+            str(args.binary_q_bits),
+            "-v",
+            args.binary_type,
+            filtered_path,
+            binary_path,
+        ]
+    )

-if __name__ == '__main__':
-  main()
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate lm.binary and top-k vocab for DeepSpeech."
+    )
+    parser.add_argument(
+        "--input_txt",
+        help="Path to a file.txt or file.txt.gz with sample sentences",
+        type=str,
+        required=True,
+    )
+    parser.add_argument(
+        "--output_dir", help="Directory path for the output", type=str, required=True
+    )
+    parser.add_argument(
+        "--top_k",
+        help="Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
+        type=int,
+        required=True,
+    )
+    parser.add_argument(
+        "--kenlm_bins",
+        help="File path to the KENLM binaries lmplz, filter and build_binary",
+        type=str,
+        required=True,
+    )
+    parser.add_argument(
+        "--arpa_order",
+        help="Order of k-grams in ARPA-file generation",
+        type=int,
+        required=True,
+    )
+    parser.add_argument(
+        "--max_arpa_memory",
+        help="Maximum allowed memory usage for ARPA-file generation",
+        type=str,
+        required=True,
+    )
+    parser.add_argument(
+        "--arpa_prune",
+        help="ARPA pruning parameters. Separate values with '|'",
+        type=str,
+        required=True,
+    )
+    parser.add_argument(
+        "--binary_a_bits",
+        help="Build binary quantization value a in bits",
+        type=int,
+        required=True,
+    )
+    parser.add_argument(
+        "--binary_q_bits",
+        help="Build binary quantization value q in bits",
+        type=int,
+        required=True,
+    )
+    parser.add_argument(
+        "--binary_type",
+        help="Build binary data structure type",
+        type=str,
+        required=True,
+    )
+    args = parser.parse_args()
+
+    data_lower, vocab_str = convert_and_filter_topk(args)
+    build_lm(args, data_lower, vocab_str)
+
+    # Delete intermediate files
+    os.remove(os.path.join(args.output_dir, "lower.txt.gz"))
+    os.remove(os.path.join(args.output_dir, "lm.arpa"))
+    os.remove(os.path.join(args.output_dir, "lm_filtered.arpa"))
+
+
+if __name__ == "__main__":
+    main()
--- a/data/lm/generate_package.py
+++ b/data/lm/generate_package.py
@ -125,7 +125,7 @@ def main():
    parser.add_argument(
        "--force_utf8",
        default="",
-        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary.",
+        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
    )
    args = parser.parse_args()