diff --git a/bin/run-tc-ldc93s1_checkpoint_bytes.sh b/bin/run-tc-ldc93s1_checkpoint_bytes.sh
index 0fdd3d8c..d6fe98e9 100755
--- a/bin/run-tc-ldc93s1_checkpoint_bytes.sh
+++ b/bin/run-tc-ldc93s1_checkpoint_bytes.sh
@@ -19,7 +19,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
   --n_hidden 100 --epochs 1 \
-  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' --utf8 \
+  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' --bytes_output_mode \
   --learning_rate 0.001 --dropout_rate 0.05 \
   --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' | tee /tmp/resume.log
diff --git a/bin/run-tc-ldc93s1_new_bytes.sh b/bin/run-tc-ldc93s1_new_bytes.sh
index b879bee6..5ce787d3 100755
--- a/bin/run-tc-ldc93s1_new_bytes.sh
+++ b/bin/run-tc-ldc93s1_new_bytes.sh
@@ -27,4 +27,4 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_bytes' \
   --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
   --audio_sample_rate ${audio_sample_rate} \
-  --utf8
+  --bytes_output_mode
diff --git a/bin/run-tc-ldc93s1_new_bytes_tflite.sh b/bin/run-tc-ldc93s1_new_bytes_tflite.sh
index b94608de..f1a79f12 100755
--- a/bin/run-tc-ldc93s1_new_bytes_tflite.sh
+++ b/bin/run-tc-ldc93s1_new_bytes_tflite.sh
@@ -21,6 +21,6 @@ python -u DeepSpeech.py --noshow_progressbar \
   --checkpoint_dir '/tmp/ckpt_bytes' \
   --export_dir '/tmp/train_bytes_tflite' \
   --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
-  --utf8 \
+  --bytes_output_mode \
   --audio_sample_rate ${audio_sample_rate} \
   --export_tflite
diff --git a/doc/Decoder.rst b/doc/Decoder.rst
index c335c317..a491ba82 100644
--- a/doc/Decoder.rst
+++ b/doc/Decoder.rst
@@ -42,7 +42,7 @@ Bytes output mode
 
 **Note**: Currently, Bytes output mode makes assumptions that hold for Chinese Mandarin models but do not hold for other language targets, such as not predicting spaces.
 
-In bytes output mode the model predicts UTF-8 bytes directly instead of letters from an alphabet file. This idea was proposed in the paper `Bytes Are All You Need `_. This mode is enabled with the ``--utf8`` flag at training and export time. At training time, the alphabet file is not used. Instead, the model is forced to have 256 labels, with labels 0-254 corresponding to UTF-8 byte values 1-255, and label 255 is used for the CTC blank symbol. If using an external scorer at decoding time, it MUST be built according to the instructions that follow.
+In bytes output mode the model predicts UTF-8 bytes directly instead of letters from an alphabet file. This idea was proposed in the paper `Bytes Are All You Need `_. This mode is enabled with the ``--bytes_output_mode`` flag at training and export time. At training time, the alphabet file is not used. Instead, the model is forced to have 256 labels, with labels 0-254 corresponding to UTF-8 byte values 1-255, and label 255 is used for the CTC blank symbol. If using an external scorer at decoding time, it MUST be built according to the instructions that follow.
 
 Bytes output mode can be useful for languages with very large alphabets, such as Mandarin written with Simplified Chinese characters. It may also be useful for building multi-language models, or as a base for transfer learning. Currently these cases are untested and unsupported. Note that bytes output mode makes assumptions that hold for Mandarin written with Simplified Chinese characters and may not hold for other languages.
 
@@ -58,7 +58,7 @@ corresponds to the following three "words", or UTF-8 byte sequences:
 
 At decoding time, the scorer is queried every time a Unicode codepoint is predicted, instead of when a space character is predicted. From the language modeling perspective, this is a character based model. From the implementation perspective, this is a word based model, because each character is composed of multiple labels.
 
-**Acoustic models trained with ``--utf8`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.**
+**Acoustic models trained with ``--bytes_output_mode`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.**
 
 UTF-8 scorers can be built by using an input corpus with space separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable bytes output mode, and it should print the message "Looks like a character based model."
diff --git a/training/deepspeech_training/util/config.py b/training/deepspeech_training/util/config.py
index 2bd580b5..0b9929e5 100755
--- a/training/deepspeech_training/util/config.py
+++ b/training/deepspeech_training/util/config.py
@@ -83,7 +83,7 @@ def initialize_globals():
     if not c.available_devices:
         c.available_devices = [c.cpu_device]
 
-    if FLAGS.utf8:
+    if FLAGS.bytes_output_mode:
         c.alphabet = UTF8Alphabet()
     else:
         c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))
diff --git a/training/deepspeech_training/util/evaluate_tools.py b/training/deepspeech_training/util/evaluate_tools.py
index e482211e..66fc8293 100644
--- a/training/deepspeech_training/util/evaluate_tools.py
+++ b/training/deepspeech_training/util/evaluate_tools.py
@@ -72,7 +72,7 @@ def calculate_and_print_report(wav_filenames, labels, decodings, losses, dataset
     samples.sort(key=lambda s: s.loss, reverse=True)
 
     # Then order by ascending WER/CER
-    if FLAGS.utf8:
+    if FLAGS.bytes_output_mode:
         samples.sort(key=lambda s: s.cer)
     else:
         samples.sort(key=lambda s: s.wer)
diff --git a/training/deepspeech_training/util/flags.py b/training/deepspeech_training/util/flags.py
index fe78f0b7..cf321594 100644
--- a/training/deepspeech_training/util/flags.py
+++ b/training/deepspeech_training/util/flags.py
@@ -156,7 +156,7 @@ def create_flags():
 
     # Decoder
 
-    f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
+    f.DEFINE_boolean('bytes_output_mode', False, 'enable Bytes Output Mode. When this is used the model outputs UTF-8 byte values directly rather than using an alphabet mapping. The --alphabet_config_path option will be ignored. See the training documentation for more details.')
     f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
     f.DEFINE_string('scorer_path', '', 'path to the external scorer file.')
     f.DEFINE_alias('scorer', 'scorer_path')
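
For reference, a minimal Python sketch of the label/byte correspondence that the doc/Decoder.rst hunk above describes (labels 0-254 map to UTF-8 byte values 1-255, label 255 is the CTC blank). The helper name is illustrative only and is not part of this change or the codebase:

    def labels_to_text(labels):
        """Illustrative only: decode a sequence of bytes-output-mode labels
        (with CTC blanks already removed) back into a UTF-8 string.
        Label i corresponds to byte value i + 1; label 255 is the CTC blank."""
        byte_values = bytes(label + 1 for label in labels if label != 255)
        return byte_values.decode("utf-8", errors="replace")

    # "早" is encoded in UTF-8 as the bytes E6 97 A9, i.e. labels E5 96 A8.
    assert labels_to_text([0xE5, 0x96, 0xA8]) == "早"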
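
Likewise, the scorer paragraph above expects an input corpus of space separated codepoints. A rough sketch of preparing such a corpus before building the scorer package; the file names are placeholders, not part of this change:

    import io

    def to_spaced_codepoints(line):
        # Emit every Unicode codepoint as its own space-separated "word",
        # dropping the original whitespace.
        return " ".join(ch for ch in line.strip() if not ch.isspace())

    # Hypothetical file names; adjust to your own corpus.
    with io.open("corpus.txt", encoding="utf-8") as fin, \
         io.open("corpus.codepoints.txt", "w", encoding="utf-8") as fout:
        for line in fin:
            converted = to_spaced_codepoints(line)
            if converted:
                fout.write(converted + "\n")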