diff --git a/data/lm/README.rst b/data/lm/README.rst index beba5dbc..ed7f017d 100644 --- a/data/lm/README.rst +++ b/data/lm/README.rst @@ -29,4 +29,4 @@ Afterwards you can use ``generate_package.py`` to generate the scorer package us .. code-block:: bash python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ - --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85 + --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 diff --git a/data/lm/kenlm.scorer b/data/lm/kenlm.scorer index 02d5de6c..d8581c05 100644 --- a/data/lm/kenlm.scorer +++ b/data/lm/kenlm.scorer @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ba04978fca285c34c99bf115ee61549937e422ac91def80122a767e114c035e -size 953436352 +oid sha256:d0cf926ab9cab54a8a7d70003b931b2d62ebd9105ed392d1ec9c840029867799 +size 953363776 diff --git a/training/deepspeech_training/util/flags.py b/training/deepspeech_training/util/flags.py index 63bc7bb4..fde946cc 100644 --- a/training/deepspeech_training/util/flags.py +++ b/training/deepspeech_training/util/flags.py @@ -174,8 +174,8 @@ def create_flags(): f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py') f.DEFINE_alias('scorer', 'scorer_path') f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions') - f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.') - f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.') + f.DEFINE_float('lm_alpha', 0.931289039105002, 'the alpha hyperparameter of the CTC decoder. Language Model weight.') + f.DEFINE_float('lm_beta', 1.1834137581510284, 'the beta hyperparameter of the CTC decoder. Word insertion weight.') f.DEFINE_float('cutoff_prob', 1.0, 'only consider characters until this probability mass is reached. 1.0 = disabled.') f.DEFINE_integer('cutoff_top_n', 300, 'only process this number of characters sorted by probability mass for each time step. If bigger than alphabet size, disabled.')