From e16b72ff28abfb60af25c0ed6fa9dc68459f49f6 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 3 Apr 2020 17:58:52 +0200 Subject: [PATCH] Use os.join and kenlm parameter usage description. --- data/lm/README.rst | 4 +++- data/lm/generate_lm.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/data/lm/README.rst b/data/lm/README.rst index 228d210c..595cd0c0 100644 --- a/data/lm/README.rst +++ b/data/lm/README.rst @@ -9,7 +9,9 @@ You can download the librispeech corpus with the following commands: gunzip librispeech.txt.gz -Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`. +| Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`. +| If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first + and then pass the build path to the script `--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/`. .. code-block:: bash diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py index 2099c2ae..4d4944f2 100644 --- a/data/lm/generate_lm.py +++ b/data/lm/generate_lm.py @@ -79,7 +79,7 @@ def build_lm(args, data_lower, vocab_str): lm_path = os.path.join(args.output_dir, "lm.arpa") subprocess.check_call( [ - args.kenlm_bins + "lmplz", + os.path.join(args.kenlm_bins, "lmplz"), "--order", str(args.arpa_order), "--temp_prefix", @@ -95,12 +95,12 @@ def build_lm(args, data_lower, vocab_str): ] ) - # Filter LM using vocabulary of top 500k words + # Filter LM using vocabulary of top-k words print("\nFiltering ARPA file using vocabulary of top-k words ...") filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa") subprocess.run( [ - args.kenlm_bins + "filter", + os.path.join(args.kenlm_bins, "filter"), "single", "model:{}".format(lm_path), filtered_path, @@ -114,7 +114,7 @@ def build_lm(args, data_lower, vocab_str): binary_path = os.path.join(args.output_dir, "lm.binary") subprocess.check_call( [ - args.kenlm_bins + "build_binary", + 
os.path.join(args.kenlm_bins, "build_binary"), "-a", str(args.binary_a_bits), "-q",