Merge pull request #3360 from mozilla/utf8alphabet-python-bindings

Fix binding of UTF8Alphabet class in decoder package
2020-10-06 22:07:45 +02:00 · 2020-10-06 22:07:45 +02:00 · 07fcd5bcd1
commit 07fcd5bcd1
parent 421f44cf73 cc2763e0b7
16 changed files with 3758 additions and 21 deletions
--- a/bin/run-tc-ldc93s1_checkpoint_bytes.sh
+++ b/bin/run-tc-ldc93s1_checkpoint_bytes.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+
+# Smoke test: run bytes output mode training against the checkpoint directory
+# /tmp/ckpt_bytes and verify that a checkpoint was actually loaded.
+
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+train_log="/tmp/resume.log"
+train_status_file="/tmp/resume.status"
+
+if [ ! -f "${ldc93s1_csv}" ]; then
+    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+    python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
+fi
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+# A plain `python ... | tee` pipeline reports tee's exit status, so a crashed
+# training run would not be caught by `set -e`. Record python's own exit status
+# in a file from inside the pipeline and check it once the pipeline is done.
+rm -f "${train_status_file}"
+{
+  train_status=0
+  python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
+    --train_files "${ldc93s1_csv}" --train_batch_size 1 \
+    --dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
+    --test_files "${ldc93s1_csv}" --test_batch_size 1 \
+    --n_hidden 100 --epochs 1 \
+    --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' --bytes_output_mode \
+    --learning_rate 0.001 --dropout_rate 0.05 \
+    --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' || train_status=$?
+  echo "${train_status}" > "${train_status_file}"
+} | tee "${train_log}"
+
+train_status=$(cat "${train_status_file}")
+if [ "${train_status}" -ne 0 ]; then
+  echo "Training exited with status ${train_status}"
+  exit "${train_status}"
+fi
+
+if ! grep "Loading best validating checkpoint from" "${train_log}"; then
+  echo "Did not resume training from checkpoint"
+  exit 1
+fi
+
+exit 0
--- a/bin/run-tc-ldc93s1_new_bytes.sh
+++ b/bin/run-tc-ldc93s1_new_bytes.sh
@ -0,0 +1,30 @@
+#!/bin/sh
+
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+
+epoch_count=$1
+audio_sample_rate=$2
+
+if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
+    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+    python -u bin/import_ldc93s1.py ${ldc93s1_dir}
+fi;
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
+  --train_files ${ldc93s1_csv} --train_batch_size 1 \
+  --feature_cache '/tmp/ldc93s1_cache' \
+  --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
+  --test_files ${ldc93s1_csv} --test_batch_size 1 \
+  --n_hidden 100 --epochs $epoch_count \
+  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' \
+  --learning_rate 0.001 --dropout_rate 0.05  --export_dir '/tmp/train_bytes' \
+  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
+  --audio_sample_rate ${audio_sample_rate} \
+  --bytes_output_mode
--- a/bin/run-tc-ldc93s1_new_bytes_tflite.sh
+++ b/bin/run-tc-ldc93s1_new_bytes_tflite.sh
@ -0,0 +1,26 @@
+#!/bin/sh
+
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+
+audio_sample_rate=$1
+
+if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
+    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+    python -u bin/import_ldc93s1.py ${ldc93s1_dir}
+fi;
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+python -u DeepSpeech.py --noshow_progressbar \
+  --n_hidden 100 \
+  --checkpoint_dir '/tmp/ckpt_bytes' \
+  --export_dir '/tmp/train_bytes_tflite' \
+  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
+  --bytes_output_mode \
+  --audio_sample_rate ${audio_sample_rate} \
+  --export_tflite
--- a/data/smoke_test/pruned_lm.bytes.scorer
+++ b/data/smoke_test/pruned_lm.bytes.scorer
--- a/data/smoke_test/vocab.pruned.bytes.txt
+++ b/data/smoke_test/vocab.pruned.bytes.txt
--- a/doc/Decoder.rst
+++ b/doc/Decoder.rst
@ -42,7 +42,7 @@ Bytes output mode

 **Note**: Currently, Bytes output mode makes assumptions that hold for Chinese Mandarin models but do not hold for other language targets, such as not predicting spaces.

-In bytes output mode the model predicts UTF-8 bytes directly instead of letters from an alphabet file. This idea was proposed in the paper `Bytes Are All You Need <https://arxiv.org/abs/1811.09021>`_. This mode is enabled with the ``--utf8`` flag at training and export time. At training time, the alphabet file is not used. Instead, the model is forced to have 256 labels, with labels 0-254 corresponding to UTF-8 byte values 1-255, and label 255 is used for the CTC blank symbol. If using an external scorer at decoding time, it MUST be built according to the instructions that follow.
+In bytes output mode the model predicts UTF-8 bytes directly instead of letters from an alphabet file. This idea was proposed in the paper `Bytes Are All You Need <https://arxiv.org/abs/1811.09021>`_. This mode is enabled with the ``--bytes_output_mode`` flag at training and export time. At training time, the alphabet file is not used. Instead, the model is forced to have 256 labels, with labels 0-254 corresponding to UTF-8 byte values 1-255, and label 255 is used for the CTC blank symbol. If using an external scorer at decoding time, it MUST be built according to the instructions that follow.

 Bytes output mode can be useful for languages with very large alphabets, such as Mandarin written with Simplified Chinese characters. It may also be useful for building multi-language models, or as a base for transfer learning. Currently these cases are untested and unsupported. Note that bytes output mode makes assumptions that hold for Mandarin written with Simplified Chinese characters and may not hold for other languages.

@ -58,11 +58,11 @@ corresponds to the following three "words", or UTF-8 byte sequences:

 At decoding time, the scorer is queried every time a Unicode codepoint is predicted, instead of when a space character is predicted. From the language modeling perspective, this is a character based model. From the implementation perspective, this is a word based model, because each character is composed of multiple labels.

-**Acoustic models trained with ``--utf8`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.**
+**Acoustic models trained with ``--bytes_output_mode`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.**

 UTF-8 scorers can be built by using an input corpus with space separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable bytes output mode, and it should print the message "Looks like a character based (Bytes Are All You Need) model."

-If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_utf8`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED.
+If the message "Doesn't look like a character based (Bytes Are All You Need) model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_bytes_output_mode`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED.

 See :ref:`scorer-scripts` for more details on using ``generate_scorer_package``.

--- a/native_client/ctcdecode/init.py
+++ b/native_client/ctcdecode/init.py
@ -1,7 +1,6 @@
 from __future__ import absolute_import, division, print_function

 from . import swigwrapper # pylint: disable=import-self
-from .swigwrapper import UTF8Alphabet

 # This module is built with SWIG_PYTHON_STRICT_BYTE_CHAR so we must handle
 # string encoding explicitly, here and throughout this file.
@ -89,6 +88,56 @@ class Alphabet(swigwrapper.Alphabet):
        return res.decode('utf-8')


+class UTF8Alphabet(swigwrapper.UTF8Alphabet):
+    """
+    Convenience wrapper for swigwrapper.UTF8Alphabet which calls init in the
+    constructor and converts between Python strings and the byte strings
+    exchanged with the SWIG bindings (encoding arguments and decoding results
+    as UTF-8).
+    """
+    def __init__(self):
+        '''
+        Construct the underlying binding object and initialize it. Raises
+        ValueError if init returns a non-zero error code.
+        '''
+        super(UTF8Alphabet, self).__init__()
+        # init is called with an empty byte string here.
+        # NOTE(review): presumably the path argument is unused by the
+        # UTF-8 alphabet -- confirm against the native implementation.
+        err = self.init(b'')
+        if err != 0:
+            raise ValueError('UTF8Alphabet initialization failed with error code 0x{:X}'.format(err))
+
+    def CanEncodeSingle(self, input):
+        '''
+        Returns true if the single character/output class has a corresponding label
+        in the alphabet.
+        '''
+        # The string is passed to the binding as UTF-8 encoded bytes.
+        return super(UTF8Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
+
+    def CanEncode(self, input):
+        '''
+        Returns true if the entire string can be encoded into labels in this
+        alphabet.
+        '''
+        # The string is passed to the binding as UTF-8 encoded bytes.
+        return super(UTF8Alphabet, self).CanEncode(input.encode('utf-8'))
+
+    def EncodeSingle(self, input):
+        '''
+        Encode a single character/output class into a label. Character must be in
+        the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
+        '''
+        # The string is passed to the binding as UTF-8 encoded bytes.
+        return super(UTF8Alphabet, self).EncodeSingle(input.encode('utf-8'))
+
+    def Encode(self, input):
+        '''
+        Encode a sequence of character/output classes into a sequence of labels.
+        Characters are assumed to always take a single Unicode codepoint.
+        Characters must be in the alphabet, this method will assert that. Use
+        `CanEncode` and `CanEncodeSingle` to test.
+        '''
+        # Convert SWIG's UnsignedIntVec to a Python list
+        res = super(UTF8Alphabet, self).Encode(input.encode('utf-8'))
+        return [el for el in res]
+
+    def DecodeSingle(self, input):
+        '''
+        Decode a single label into a string. The bytes returned by the binding
+        are decoded as UTF-8.
+        '''
+        res = super(UTF8Alphabet, self).DecodeSingle(input)
+        # NOTE(review): bytes.decode raises UnicodeDecodeError if the bytes
+        # returned for one label are not valid UTF-8 on their own (e.g. one
+        # byte of a multi-byte sequence) -- confirm whether that can happen.
+        return res.decode('utf-8')
+
+    def Decode(self, input):
+        '''Decode a sequence of labels into a string.'''
+        res = super(UTF8Alphabet, self).Decode(input)
+        # The binding returns bytes; decode them as UTF-8.
+        return res.decode('utf-8')
+
+

 def ctc_beam_search_decoder(probs_seq,
                            alphabet,
--- a/native_client/generate_scorer_package.cpp
+++ b/native_client/generate_scorer_package.cpp
@ -20,7 +20,7 @@ create_package(absl::optional<string> alphabet_path,
               string lm_path,
               string vocab_path,
               string package_path,
-               absl::optional<bool> force_utf8,
+               absl::optional<bool> force_bytes_output_mode,
               float default_alpha,
               float default_beta)
 {
@ -43,27 +43,27 @@ create_package(absl::optional<string> alphabet_path,
         << (vocab_looks_char_based ? "Looks" : "Doesn't look")
         << " like a character based (Bytes Are All You Need) model.\n";

-    if (!force_utf8.has_value()) {
-        force_utf8 = vocab_looks_char_based;
-        cerr << "--force_utf8 was not specified, using value "
+    if (!force_bytes_output_mode.has_value()) {
+        force_bytes_output_mode = vocab_looks_char_based;
+        cerr << "--force_bytes_output_mode was not specified, using value "
             << "infered from vocabulary contents: "
             << (vocab_looks_char_based ? "true" : "false") << "\n";
    }

-    if (!force_utf8.value() && !alphabet_path.has_value()) {
+    if (!force_bytes_output_mode.value() && !alphabet_path.has_value()) {
        cerr << "No --alphabet file specified, not using bytes output mode, can't continue.\n";
        return 1;
    }

    Scorer scorer;
-    if (force_utf8.value()) {
+    if (force_bytes_output_mode.value()) {
        scorer.set_alphabet(UTF8Alphabet());
    } else {
        Alphabet alphabet;
        alphabet.init(alphabet_path->c_str());
        scorer.set_alphabet(alphabet);
    }
-    scorer.set_utf8_mode(force_utf8.value());
+    scorer.set_utf8_mode(force_bytes_output_mode.value());
    scorer.reset_params(default_alpha, default_beta);
    int err = scorer.load_lm(lm_path);
    if (err != DS_ERR_SCORER_NO_TRIE) {
@ -96,13 +96,13 @@ main(int argc, char** argv)
    po::options_description desc("Options");
    desc.add_options()
        ("help", "show help message")
-        ("alphabet", po::value<string>(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.")
+        ("alphabet", po::value<string>(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using bytes output mode.")
        ("lm", po::value<string>(), "Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.")
        ("vocab", po::value<string>(), "Path of vocabulary file. Must contain words separated by whitespace.")
        ("package", po::value<string>(), "Path to save scorer package.")
        ("default_alpha", po::value<float>(), "Default value of alpha hyperparameter (float).")
        ("default_beta", po::value<float>(), "Default value of beta hyperparameter (float).")
-        ("force_utf8", po::value<bool>(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://deepspeech.readthedocs.io/en/master/Decoder.html#utf-8-mode> for further explanation.")
+        ("force_bytes_output_mode", po::value<bool>(), "Boolean flag, force set or unset bytes output mode in the scorer package. If not set, infers from the vocabulary. See <https://deepspeech.readthedocs.io/en/master/Decoder.html#bytes-output-mode> for further explanation.")
    ;

    po::variables_map vm;
@ -122,10 +122,10 @@ main(int argc, char** argv)
        }
    }

-    // Parse optional --force_utf8
-    absl::optional<bool> force_utf8 = absl::nullopt;
-    if (vm.count("force_utf8")) {
-        force_utf8 = vm["force_utf8"].as<bool>();
+    // Parse optional --force_bytes_output_mode
+    absl::optional<bool> force_bytes_output_mode = absl::nullopt;
+    if (vm.count("force_bytes_output_mode")) {
+        force_bytes_output_mode = vm["force_bytes_output_mode"].as<bool>();
    }

    // Parse optional --alphabet
@ -138,7 +138,7 @@ main(int argc, char** argv)
                   vm["lm"].as<string>(),
                   vm["vocab"].as<string>(),
                   vm["package"].as<string>(),
-                   force_utf8,
+                   force_bytes_output_mode,
                   vm["default_alpha"].as<float>(),
                   vm["default_beta"].as<float>());

--- a/taskcluster/tc-all-utils.sh
+++ b/taskcluster/tc-all-utils.sh
@ -98,6 +98,7 @@ download_data()
  ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
  cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
  cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
+  cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.bytes.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.bytes.scorer
  cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
 }

--- a/taskcluster/tc-cpp-bytes-ds-tests.sh
+++ b/taskcluster/tc-cpp-bytes-ds-tests.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -xe
+
+source $(dirname "$0")/tc-tests-utils.sh
+
+bitrate=$1
+set_ldc_sample_filename "${bitrate}"
+
+download_material "${TASKCLUSTER_TMP_DIR}/ds"
+
+export PATH=${TASKCLUSTER_TMP_DIR}/ds/:$PATH
+
+# Bytes output mode with LDC93S1 takes too long to converge so we simply test
+# that loading the model won't crash
+check_versions
--- a/taskcluster/tc-train-extra-tests.sh
+++ b/taskcluster/tc-train-extra-tests.sh
@ -54,10 +54,30 @@ pushd ${HOME}/DeepSpeech/ds/

    # Test --metrics_files training argument
    time ./bin/run-tc-ldc93s1_new_metrics.sh 2 "${sample_rate}"
+
+    # Test training with bytes output mode
+    time ./bin/run-tc-ldc93s1_new_bytes.sh 200 "${sample_rate}"
+    time ./bin/run-tc-ldc93s1_new_bytes_tflite.sh "${sample_rate}"
 popd

+# Save exported model artifacts from bytes output mode training
+cp /tmp/train_bytes/output_graph.pb ${TASKCLUSTER_ARTIFACTS}/output_graph.pb
+cp /tmp/train_bytes_tflite/output_graph.tflite ${TASKCLUSTER_ARTIFACTS}/output_graph.tflite
+
 pushd ${HOME}/DeepSpeech/ds/
+    python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target /tmp/
+popd
+
+/tmp/convert_graphdef_memmapped_format --in_graph=/tmp/train_bytes/output_graph.pb --out_graph=/tmp/train_bytes/output_graph.pbmm
+cp /tmp/train_bytes/output_graph.pbmm ${TASKCLUSTER_ARTIFACTS}
+
+# Test resuming from checkpoints created above
+pushd ${HOME}/DeepSpeech/ds/
+    # SDB, resuming from checkpoint
    time ./bin/run-tc-ldc93s1_checkpoint_sdb.sh
+
+    # Bytes output mode, resuming from checkpoint
+    time ./bin/run-tc-ldc93s1_checkpoint_bytes.sh
 popd

 virtualenv_deactivate "${pyalias}" "deepspeech"
--- a/taskcluster/test-cpp_16k_bytes-darwin-amd64-opt.yml
+++ b/taskcluster/test-cpp_16k_bytes-darwin-amd64-opt.yml
@ -0,0 +1,12 @@
+build:
+  template_file: test-darwin-opt-base.tyml
+  dependencies:
+    - "darwin-amd64-cpu-opt"
+    - "test-training-extra_16k-linux-amd64-py36m-opt"
+    - "homebrew_tests-darwin-amd64"
+  test_model_task: "test-training-extra_16k-linux-amd64-py36m-opt"
+  args:
+    tests_cmdline: "$TASKCLUSTER_TASK_DIR/DeepSpeech/ds/taskcluster/tc-cpp-bytes-ds-tests.sh 16k"
+  metadata:
+    name: "DeepSpeech OSX AMD64 CPU C++ tests (Bytes Output Model, 16kHz)"
+    description: "Testing DeepSpeech C++ for OSX/AMD64, CPU only, optimized version (Bytes Output Model, 16kHz)"
--- a/taskcluster/test-cpp_16k_bytes-linux-amd64-opt.yml
+++ b/taskcluster/test-cpp_16k_bytes-linux-amd64-opt.yml
@ -0,0 +1,12 @@
+build:
+  template_file: test-linux-opt-base.tyml
+  dependencies:
+    - "linux-amd64-cpu-opt"
+    - "test-training-extra_16k-linux-amd64-py36m-opt"
+  test_model_task: "test-training-extra_16k-linux-amd64-py36m-opt"
+  args:
+    tests_cmdline: "${system.homedir.linux}/DeepSpeech/ds/taskcluster/tc-cpp-bytes-ds-tests.sh 16k"
+  workerType: "${docker.dsTests}"
+  metadata:
+    name: "DeepSpeech Linux AMD64 CPU C++ tests (Bytes Output Model, 16kHz)"
+    description: "Testing DeepSpeech C++ for Linux/AMD64, CPU only, optimized version (Bytes Output Model, 16kHz)"
--- a/training/deepspeech_training/util/config.py
+++ b/training/deepspeech_training/util/config.py
@ -83,7 +83,7 @@ def initialize_globals():
    if not c.available_devices:
        c.available_devices = [c.cpu_device]

-    if FLAGS.utf8:
+    if FLAGS.bytes_output_mode:
        c.alphabet = UTF8Alphabet()
    else:
        c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))
--- a/training/deepspeech_training/util/evaluate_tools.py
+++ b/training/deepspeech_training/util/evaluate_tools.py
@ -72,7 +72,7 @@ def calculate_and_print_report(wav_filenames, labels, decodings, losses, dataset
    samples.sort(key=lambda s: s.loss, reverse=True)

    # Then order by ascending WER/CER
-    if FLAGS.utf8:
+    if FLAGS.bytes_output_mode:
        samples.sort(key=lambda s: s.cer)
    else:
        samples.sort(key=lambda s: s.wer)
--- a/training/deepspeech_training/util/flags.py
+++ b/training/deepspeech_training/util/flags.py
@ -156,7 +156,7 @@ def create_flags():

    # Decoder

-    f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
+    f.DEFINE_boolean('bytes_output_mode', False, 'enable Bytes Output Mode. When this is used the model outputs UTF-8 byte values directly rather than using an alphabet mapping. The --alphabet_config_path option will be ignored. See the training documentation for more details.')
    f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
    f.DEFINE_string('scorer_path', '', 'path to the external scorer file.')
    f.DEFINE_alias('scorer', 'scorer_path')