From 117324e6659e95a17b6b31ffae556cba0f359319 Mon Sep 17 00:00:00 2001 From: david gauchard Date: Tue, 28 Apr 2020 11:58:41 +0200 Subject: [PATCH] Add a --discount_fallback option to generate_lm.py (#2945) --- data/lm/generate_lm.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py index 72b22ca1..659d5077 100644 --- a/data/lm/generate_lm.py +++ b/data/lm/generate_lm.py @@ -77,8 +77,7 @@ def convert_and_filter_topk(args): def build_lm(args, data_lower, vocab_str): print("\nCreating ARPA file ...") lm_path = os.path.join(args.output_dir, "lm.arpa") - subprocess.check_call( - [ + subargs = [ os.path.join(args.kenlm_bins, "lmplz"), "--order", str(args.arpa_order), @@ -93,7 +92,9 @@ def build_lm(args, data_lower, vocab_str): "--prune", *args.arpa_prune.split("|"), ] - ) + if args.discount_fallback: + subargs += ["--discount_fallback"] + subprocess.check_call(subargs) # Filter LM using vocabulary of top-k words print("\nFiltering ARPA file using vocabulary of top-k words ...") @@ -188,6 +189,12 @@ def main(): type=str, required=True, ) + parser.add_argument( + "--discount_fallback", + help="To try when such message is returned by kenlm: 'Could not calculate Kneser-Ney discounts [...] rerun with --discount_fallback'", + action="store_true", + ) + args = parser.parse_args() data_lower, vocab_str = convert_and_filter_topk(args)