Merge pull request #3360 from mozilla/utf8alphabet-python-bindings

Fix binding of UTF8Alphabet class in decoder package
This commit is contained in:
Reuben Morais 2020-10-06 22:07:45 +02:00 committed by GitHub
commit 07fcd5bcd1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 3758 additions and 21 deletions

View File

@ -0,0 +1,31 @@
#!/bin/sh
# Smoke test: run one more epoch of bytes output mode training against the
# checkpoint directory /tmp/ckpt_bytes and verify, from the captured output,
# that an existing checkpoint was loaded.
set -xe

ldc93s1_dir="./data/smoke_test"
ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"

# Import the LDC93S1 sample only when its CSV is not already present.
if [ ! -f "${ldc93s1_csv}" ]; then
    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
    python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
fi

# Force only one visible device because we have a single-sample dataset
# and when trying to run on multiple devices (like GPUs), this will break
export CUDA_VISIBLE_DEVICES=0

# stdout is duplicated into /tmp/resume.log for the check below. The exit
# status of this pipeline is tee's, so the grep is what decides pass/fail.
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
    --train_files "${ldc93s1_csv}" --train_batch_size 1 \
    --dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
    --test_files "${ldc93s1_csv}" --test_batch_size 1 \
    --n_hidden 100 --epochs 1 \
    --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' --bytes_output_mode \
    --learning_rate 0.001 --dropout_rate 0.05 \
    --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' | tee /tmp/resume.log

# Success requires the checkpoint-loading message to appear in the log.
if grep "Loading best validating checkpoint from" /tmp/resume.log; then
    exit 0
fi

echo "Did not resume training from checkpoint"
exit 1

30
bin/run-tc-ldc93s1_new_bytes.sh Executable file
View File

@ -0,0 +1,30 @@
#!/bin/sh
# Train a small bytes output mode model on the single-sample LDC93S1 dataset,
# writing checkpoints to /tmp/ckpt_bytes and the exported model to
# /tmp/train_bytes.
#
# Usage: run-tc-ldc93s1_new_bytes.sh EPOCH_COUNT AUDIO_SAMPLE_RATE
set -xe

# Abort immediately with a usage message when an argument is missing or
# empty. Otherwise the unquoted empty expansion would make DeepSpeech.py
# consume the following flag as the value, and only after the dataset
# download below had already run.
epoch_count="${1:?usage: $0 EPOCH_COUNT AUDIO_SAMPLE_RATE}"
audio_sample_rate="${2:?usage: $0 EPOCH_COUNT AUDIO_SAMPLE_RATE}"

ldc93s1_dir="./data/smoke_test"
ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"

# Import the LDC93S1 sample only when its CSV is not already present.
if [ ! -f "${ldc93s1_csv}" ]; then
    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
    python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
fi

# Force only one visible device because we have a single-sample dataset
# and when trying to run on multiple devices (like GPUs), this will break
export CUDA_VISIBLE_DEVICES=0

python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
    --train_files "${ldc93s1_csv}" --train_batch_size 1 \
    --feature_cache '/tmp/ldc93s1_cache' \
    --dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
    --test_files "${ldc93s1_csv}" --test_batch_size 1 \
    --n_hidden 100 --epochs "${epoch_count}" \
    --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' \
    --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_bytes' \
    --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
    --audio_sample_rate "${audio_sample_rate}" \
    --bytes_output_mode

View File

@ -0,0 +1,26 @@
#!/bin/sh
# Export a TFLite model in bytes output mode from the checkpoint directory
# /tmp/ckpt_bytes into /tmp/train_bytes_tflite. No train/dev/test files are
# passed to DeepSpeech.py here.
#
# Usage: <this script> AUDIO_SAMPLE_RATE
set -xe

# Abort immediately with a usage message when the argument is missing or
# empty. Otherwise the unquoted empty expansion would make DeepSpeech.py
# consume --export_tflite as the sample rate value.
audio_sample_rate="${1:?usage: $0 AUDIO_SAMPLE_RATE}"

ldc93s1_dir="./data/smoke_test"
ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"

# Import the LDC93S1 sample only when its CSV is not already present.
if [ ! -f "${ldc93s1_csv}" ]; then
    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
    python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
fi

# Force only one visible device because we have a single-sample dataset
# and when trying to run on multiple devices (like GPUs), this will break
export CUDA_VISIBLE_DEVICES=0

python -u DeepSpeech.py --noshow_progressbar \
    --n_hidden 100 \
    --checkpoint_dir '/tmp/ckpt_bytes' \
    --export_dir '/tmp/train_bytes_tflite' \
    --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
    --bytes_output_mode \
    --audio_sample_rate "${audio_sample_rate}" \
    --export_tflite

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -42,7 +42,7 @@ Bytes output mode
**Note**: Currently, bytes output mode makes assumptions, such as not predicting spaces, that hold for Chinese Mandarin models but do not hold for other language targets.
In bytes output mode the model predicts UTF-8 bytes directly instead of letters from an alphabet file. This idea was proposed in the paper `Bytes Are All You Need <https://arxiv.org/abs/1811.09021>`_. This mode is enabled with the ``--utf8`` flag at training and export time. At training time, the alphabet file is not used. Instead, the model is forced to have 256 labels, with labels 0-254 corresponding to UTF-8 byte values 1-255, and label 255 is used for the CTC blank symbol. If using an external scorer at decoding time, it MUST be built according to the instructions that follow.
In bytes output mode, the model predicts UTF-8 bytes directly instead of letters from an alphabet file. This idea was proposed in the paper `Bytes Are All You Need <https://arxiv.org/abs/1811.09021>`_. This mode is enabled with the ``--bytes_output_mode`` flag at training and export time. At training time, the alphabet file is not used. Instead, the model is forced to have 256 labels, with labels 0-254 corresponding to UTF-8 byte values 1-255 and label 255 used for the CTC blank symbol. If using an external scorer at decoding time, it MUST be built according to the instructions that follow.
Bytes output mode can be useful for languages with very large alphabets, such as Mandarin written with Simplified Chinese characters. It may also be useful for building multi-language models, or as a base for transfer learning. Currently these cases are untested and unsupported. Note that bytes output mode makes assumptions that hold for Mandarin written with Simplified Chinese characters and may not hold for other languages.
@ -58,11 +58,11 @@ corresponds to the following three "words", or UTF-8 byte sequences:
At decoding time, the scorer is queried every time a Unicode codepoint is predicted, instead of when a space character is predicted. From the language modeling perspective, this is a character-based model. From the implementation perspective, this is a word-based model, because each character is composed of multiple labels.
**Acoustic models trained with ``--utf8`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.**
**Acoustic models trained with ``--bytes_output_mode`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.**
UTF-8 scorers can be built by using an input corpus with space separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable bytes output mode, and it should print the message "Looks like a character based model."
If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_utf8`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED.
If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_bytes_output_mode`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED.
See :ref:`scorer-scripts` for more details on using ``generate_scorer_package``.

View File

@ -1,7 +1,6 @@
from __future__ import absolute_import, division, print_function
from . import swigwrapper # pylint: disable=import-self
from .swigwrapper import UTF8Alphabet
# This module is built with SWIG_PYTHON_STRICT_BYTE_CHAR so we must handle
# string encoding explicitly, here and throughout this file.
@ -89,6 +88,56 @@ class Alphabet(swigwrapper.Alphabet):
return res.decode('utf-8')
class UTF8Alphabet(swigwrapper.UTF8Alphabet):
    """Wrapper around the SWIG ``UTF8Alphabet`` binding.

    The constructor calls ``init`` itself, and the methods convert between
    Python ``str`` values and the UTF-8 ``bytes`` exchanged with the SWIG
    layer.
    """

    def __init__(self):
        super(UTF8Alphabet, self).__init__()
        status = self.init(b'')
        if status != 0:
            raise ValueError('UTF8Alphabet initialization failed with error code 0x{:X}'.format(status))

    def CanEncodeSingle(self, input):
        '''
        Report whether a single character/output class has a corresponding
        label in the alphabet.
        '''
        encoded = input.encode('utf-8')
        return super(UTF8Alphabet, self).CanEncodeSingle(encoded)

    def CanEncode(self, input):
        '''
        Report whether the whole string can be encoded into labels of this
        alphabet.
        '''
        encoded = input.encode('utf-8')
        return super(UTF8Alphabet, self).CanEncode(encoded)

    def EncodeSingle(self, input):
        '''
        Encode one character/output class into a label. The character must be
        in the alphabet, this method will assert that. Use `CanEncodeSingle`
        to test beforehand.
        '''
        encoded = input.encode('utf-8')
        return super(UTF8Alphabet, self).EncodeSingle(encoded)

    def Encode(self, input):
        '''
        Encode a sequence of character/output classes into a list of labels.
        Characters are assumed to always take a single Unicode codepoint.
        Characters must be in the alphabet, this method will assert that. Use
        `CanEncode` and `CanEncodeSingle` to test beforehand.
        '''
        encoded = input.encode('utf-8')
        labels = super(UTF8Alphabet, self).Encode(encoded)
        # Materialize SWIG's UnsignedIntVec as a plain Python list.
        return list(labels)

    def DecodeSingle(self, input):
        '''
        Pass `input` to the underlying `DecodeSingle` and return its result
        decoded as UTF-8 text.
        '''
        # NOTE(review): presumably a label standing for one byte of a
        # multi-byte sequence cannot be decoded on its own - confirm.
        raw = super(UTF8Alphabet, self).DecodeSingle(input)
        return raw.decode('utf-8')

    def Decode(self, input):
        '''Decode a sequence of labels into a string.'''
        raw = super(UTF8Alphabet, self).Decode(input)
        return raw.decode('utf-8')
def ctc_beam_search_decoder(probs_seq,
alphabet,

View File

@ -20,7 +20,7 @@ create_package(absl::optional<string> alphabet_path,
string lm_path,
string vocab_path,
string package_path,
absl::optional<bool> force_utf8,
absl::optional<bool> force_bytes_output_mode,
float default_alpha,
float default_beta)
{
@ -43,27 +43,27 @@ create_package(absl::optional<string> alphabet_path,
<< (vocab_looks_char_based ? "Looks" : "Doesn't look")
<< " like a character based (Bytes Are All You Need) model.\n";
if (!force_utf8.has_value()) {
force_utf8 = vocab_looks_char_based;
cerr << "--force_utf8 was not specified, using value "
if (!force_bytes_output_mode.has_value()) {
force_bytes_output_mode = vocab_looks_char_based;
cerr << "--force_bytes_output_mode was not specified, using value "
<< "infered from vocabulary contents: "
<< (vocab_looks_char_based ? "true" : "false") << "\n";
}
if (!force_utf8.value() && !alphabet_path.has_value()) {
if (!force_bytes_output_mode.value() && !alphabet_path.has_value()) {
cerr << "No --alphabet file specified, not using bytes output mode, can't continue.\n";
return 1;
}
Scorer scorer;
if (force_utf8.value()) {
if (force_bytes_output_mode.value()) {
scorer.set_alphabet(UTF8Alphabet());
} else {
Alphabet alphabet;
alphabet.init(alphabet_path->c_str());
scorer.set_alphabet(alphabet);
}
scorer.set_utf8_mode(force_utf8.value());
scorer.set_utf8_mode(force_bytes_output_mode.value());
scorer.reset_params(default_alpha, default_beta);
int err = scorer.load_lm(lm_path);
if (err != DS_ERR_SCORER_NO_TRIE) {
@ -96,13 +96,13 @@ main(int argc, char** argv)
po::options_description desc("Options");
desc.add_options()
("help", "show help message")
("alphabet", po::value<string>(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.")
("alphabet", po::value<string>(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using bytes output mode.")
("lm", po::value<string>(), "Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.")
("vocab", po::value<string>(), "Path of vocabulary file. Must contain words separated by whitespace.")
("package", po::value<string>(), "Path to save scorer package.")
("default_alpha", po::value<float>(), "Default value of alpha hyperparameter (float).")
("default_beta", po::value<float>(), "Default value of beta hyperparameter (float).")
("force_utf8", po::value<bool>(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://deepspeech.readthedocs.io/en/master/Decoder.html#utf-8-mode> for further explanation.")
("force_bytes_output_mode", po::value<bool>(), "Boolean flag, force set or unset bytes output mode in the scorer package. If not set, infers from the vocabulary. See <https://deepspeech.readthedocs.io/en/master/Decoder.html#bytes-output-mode> for further explanation.")
;
po::variables_map vm;
@ -122,10 +122,10 @@ main(int argc, char** argv)
}
}
// Parse optional --force_utf8
absl::optional<bool> force_utf8 = absl::nullopt;
if (vm.count("force_utf8")) {
force_utf8 = vm["force_utf8"].as<bool>();
// Parse optional --force_bytes_output_mode
absl::optional<bool> force_bytes_output_mode = absl::nullopt;
if (vm.count("force_bytes_output_mode")) {
force_bytes_output_mode = vm["force_bytes_output_mode"].as<bool>();
}
// Parse optional --alphabet
@ -138,7 +138,7 @@ main(int argc, char** argv)
vm["lm"].as<string>(),
vm["vocab"].as<string>(),
vm["package"].as<string>(),
force_utf8,
force_bytes_output_mode,
vm["default_alpha"].as<float>(),
vm["default_beta"].as<float>());

View File

@ -98,6 +98,7 @@ download_data()
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.bytes.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.bytes.scorer
cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
}

View File

@ -0,0 +1,16 @@
#!/bin/bash
# C++ test entry point for the bytes output mode model.
#
# Usage: tc-cpp-bytes-ds-tests.sh BITRATE

set -xe

# Quote the command substitution so a checkout path containing whitespace
# is not word-split before being passed to `source`.
source "$(dirname "$0")/tc-tests-utils.sh"

bitrate=$1
set_ldc_sample_filename "${bitrate}"

download_material "${TASKCLUSTER_TMP_DIR}/ds"

export PATH="${TASKCLUSTER_TMP_DIR}/ds/:${PATH}"

# Bytes output mode with LDC93S1 takes too long to converge so we simply test
# that loading the model won't crash
check_versions

View File

@ -54,10 +54,30 @@ pushd ${HOME}/DeepSpeech/ds/
# Test --metrics_files training argument
time ./bin/run-tc-ldc93s1_new_metrics.sh 2 "${sample_rate}"
# Test training with bytes output mode
time ./bin/run-tc-ldc93s1_new_bytes.sh 200 "${sample_rate}"
time ./bin/run-tc-ldc93s1_new_bytes_tflite.sh "${sample_rate}"
popd
# Save exported model artifacts from bytes output mode training
cp /tmp/train_bytes/output_graph.pb ${TASKCLUSTER_ARTIFACTS}/output_graph.pb
cp /tmp/train_bytes_tflite/output_graph.tflite ${TASKCLUSTER_ARTIFACTS}/output_graph.tflite
pushd ${HOME}/DeepSpeech/ds/
python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target /tmp/
popd
/tmp/convert_graphdef_memmapped_format --in_graph=/tmp/train_bytes/output_graph.pb --out_graph=/tmp/train_bytes/output_graph.pbmm
cp /tmp/train_bytes/output_graph.pbmm ${TASKCLUSTER_ARTIFACTS}
# Test resuming from checkpoints created above
pushd ${HOME}/DeepSpeech/ds/
# SDB, resuming from checkpoint
time ./bin/run-tc-ldc93s1_checkpoint_sdb.sh
# Bytes output mode, resuming from checkpoint
time ./bin/run-tc-ldc93s1_checkpoint_bytes.sh
popd
virtualenv_deactivate "${pyalias}" "deepspeech"

View File

@ -0,0 +1,12 @@
# Taskcluster task definition: DeepSpeech C++ tests for the bytes output
# model at 16kHz on OSX/AMD64 (CPU only, optimized build).
# Runs taskcluster/tc-cpp-bytes-ds-tests.sh with the argument "16k".
# NOTE(review): test_model_task presumably names the training task whose
# exported model is tested - confirm against test-darwin-opt-base.tyml.
build:
template_file: test-darwin-opt-base.tyml
dependencies:
- "darwin-amd64-cpu-opt"
- "test-training-extra_16k-linux-amd64-py36m-opt"
- "homebrew_tests-darwin-amd64"
test_model_task: "test-training-extra_16k-linux-amd64-py36m-opt"
args:
tests_cmdline: "$TASKCLUSTER_TASK_DIR/DeepSpeech/ds/taskcluster/tc-cpp-bytes-ds-tests.sh 16k"
metadata:
name: "DeepSpeech OSX AMD64 CPU C++ tests (Bytes Output Model, 16kHz)"
description: "Testing DeepSpeech C++ for OSX/AMD64, CPU only, optimized version (Bytes Output Model, 16kHz)"

View File

@ -0,0 +1,12 @@
# Taskcluster task definition: DeepSpeech C++ tests for the bytes output
# model at 16kHz on Linux/AMD64 (CPU only, optimized build).
# Runs taskcluster/tc-cpp-bytes-ds-tests.sh with the argument "16k".
# NOTE(review): test_model_task presumably names the training task whose
# exported model is tested - confirm against test-linux-opt-base.tyml.
build:
template_file: test-linux-opt-base.tyml
dependencies:
- "linux-amd64-cpu-opt"
- "test-training-extra_16k-linux-amd64-py36m-opt"
test_model_task: "test-training-extra_16k-linux-amd64-py36m-opt"
args:
tests_cmdline: "${system.homedir.linux}/DeepSpeech/ds/taskcluster/tc-cpp-bytes-ds-tests.sh 16k"
workerType: "${docker.dsTests}"
metadata:
name: "DeepSpeech Linux AMD64 CPU C++ tests (Bytes Output Model, 16kHz)"
description: "Testing DeepSpeech C++ for Linux/AMD64, CPU only, optimized version (Bytes Output Model, 16kHz)"

View File

@ -83,7 +83,7 @@ def initialize_globals():
if not c.available_devices:
c.available_devices = [c.cpu_device]
if FLAGS.utf8:
if FLAGS.bytes_output_mode:
c.alphabet = UTF8Alphabet()
else:
c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

View File

@ -72,7 +72,7 @@ def calculate_and_print_report(wav_filenames, labels, decodings, losses, dataset
samples.sort(key=lambda s: s.loss, reverse=True)
# Then order by ascending WER/CER
if FLAGS.utf8:
if FLAGS.bytes_output_mode:
samples.sort(key=lambda s: s.cer)
else:
samples.sort(key=lambda s: s.wer)

View File

@ -156,7 +156,7 @@ def create_flags():
# Decoder
f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
f.DEFINE_boolean('bytes_output_mode', False, 'enable Bytes Output Mode mode. When this is used the model outputs UTF-8 byte values directly rather than using an alphabet mapping. The --alphabet_config_path option will be ignored. See the training documentation for more details.')
f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
f.DEFINE_string('scorer_path', '', 'path to the external scorer file.')
f.DEFINE_alias('scorer', 'scorer_path')