Use os.path.join and add a description of kenlm parameter usage.

This commit is contained in:
Daniel 2020-04-03 17:58:52 +02:00
parent e862cd41db
commit e16b72ff28
2 changed files with 7 additions and 5 deletions

View File

@ -9,7 +9,9 @@ You can download the librispeech corpus with the following commands:
gunzip librispeech.txt.gz
Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
| Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
| If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first
and then pass the build path to the script `--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/`.
.. code-block:: bash

View File

@ -79,7 +79,7 @@ def build_lm(args, data_lower, vocab_str):
lm_path = os.path.join(args.output_dir, "lm.arpa")
subprocess.check_call(
[
args.kenlm_bins + "lmplz",
os.path.join(args.kenlm_bins, "lmplz"),
"--order",
str(args.arpa_order),
"--temp_prefix",
@ -95,12 +95,12 @@ def build_lm(args, data_lower, vocab_str):
]
)
# Filter LM using vocabulary of top 500k words
# Filter LM using vocabulary of top-k words
print("\nFiltering ARPA file using vocabulary of top-k words ...")
filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
subprocess.run(
[
args.kenlm_bins + "filter",
os.path.join(args.kenlm_bins, "filter"),
"single",
"model:{}".format(lm_path),
filtered_path,
@ -114,7 +114,7 @@ def build_lm(args, data_lower, vocab_str):
binary_path = os.path.join(args.output_dir, "lm.binary")
subprocess.check_call(
[
args.kenlm_bins + "build_binary",
os.path.join(args.kenlm_bins, "build_binary"),
"-a",
str(args.binary_a_bits),
"-q",