From c29c0beb725213efc6f55a3d4f4d17ef533fbcf3 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 8 Apr 2020 20:23:04 +0200 Subject: [PATCH] Default to required params. --- data/lm/README.rst | 9 ++++++--- data/lm/generate_lm.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/data/lm/README.rst b/data/lm/README.rst index 2d8e5c71..130df2c3 100644 --- a/data/lm/README.rst +++ b/data/lm/README.rst @@ -11,15 +11,18 @@ You can download the librispeech corpus with the following commands: | Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`. | As input you can use a `file.txt` or `file.txt.gz` with one sentence in each line. | If you are not using the DeepSpeech docker container, you have to build `KenLM <https://github.com/kpu/kenlm>`_ first - and then pass the build path to the script `--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/`. + and then pass the build directory to the script. .. code-block:: bash - python3 data/lm/generate_lm.py --input_txt path/to/librispeech.txt.gz --output_dir path/lm/ + python3 data/lm/generate_lm.py --input_txt path/to/librispeech.txt.gz --output_dir path/lm/ --top_k 500000 \ + --kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/ --arpa_order 5 --max_arpa_memory "85%" \ + --arpa_prune "0|0|1" --binary_a_bits 255 --binary_q_bits 8 --binary_type trie Afterwards you can generate the scorer package with the above vocab-500000.txt and lm.binary files .. 
code-block:: bash - python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85 + python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ + --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85 diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py index 4d4944f2..52b8d731 100644 --- a/data/lm/generate_lm.py +++ b/data/lm/generate_lm.py @@ -144,49 +144,49 @@ def main(): "--top_k", help="Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.", type=int, - default=500000, + required=True, ) parser.add_argument( "--kenlm_bins", help="File path to the KENLM binaries lmplz, filter and build_binary", type=str, - default="/DeepSpeech/native_client/kenlm/build/bin/", + required=True, ) parser.add_argument( "--arpa_order", help="Order of k-grams in ARPA-file generation", type=int, - default=5, + required=True, ) parser.add_argument( "--max_arpa_memory", help="Maximum allowed memory usage for ARPA-file generation", type=str, - default="75%", + required=True, ) parser.add_argument( "--arpa_prune", help="ARPA pruning parameters. Separate values with '|'", type=str, - default="0|0|1", + required=True, ) parser.add_argument( "--binary_a_bits", help="Build binary quantization value a in bits", type=int, - default=255, + required=True, ) parser.add_argument( "--binary_q_bits", help="Build binary quantization value q in bits", type=int, - default=8, + required=True, ) parser.add_argument( "--binary_type", help="Build binary data structure type", type=str, - default="trie", + required=True, ) args = parser.parse_args()