Use os.path.join and add a description of kenlm parameter usage.

This commit is contained in:
Daniel 2020-04-03 17:58:52 +02:00
parent e862cd41db
commit e16b72ff28
2 changed files with 7 additions and 5 deletions

View File

@ -9,7 +9,9 @@ You can download the librispeech corpus with the following commands:
gunzip librispeech.txt.gz
Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
| Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
| If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first
and then pass the build path to the script `--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/`.
.. code-block:: bash

View File

@ -79,7 +79,7 @@ def build_lm(args, data_lower, vocab_str):
lm_path = os.path.join(args.output_dir, "lm.arpa")
subprocess.check_call(
[
args.kenlm_bins + "lmplz",
os.path.join(args.kenlm_bins, "lmplz"),
"--order",
str(args.arpa_order),
"--temp_prefix",
@ -95,12 +95,12 @@ def build_lm(args, data_lower, vocab_str):
]
)
# Filter LM using vocabulary of top 500k words
# Filter LM using vocabulary of top-k words
print("\nFiltering ARPA file using vocabulary of top-k words ...")
filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
subprocess.run(
[
args.kenlm_bins + "filter",
os.path.join(args.kenlm_bins, "filter"),
"single",
"model:{}".format(lm_path),
filtered_path,
@ -114,7 +114,7 @@ def build_lm(args, data_lower, vocab_str):
binary_path = os.path.join(args.output_dir, "lm.binary")
subprocess.check_call(
[
args.kenlm_bins + "build_binary",
os.path.join(args.kenlm_bins, "build_binary"),
"-a",
str(args.binary_a_bits),
"-q",