Add transfer learning test

This commit is contained in:
Reuben Morais 2020-02-16 19:22:57 +01:00
parent 5bba9ea5d1
commit f32fd7a33f
5 changed files with 140 additions and 60 deletions

View File

@ -23,7 +23,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--learning_rate 0.001 --dropout_rate 0.05 \
--scorer_path 'data/smoke_test/pruned_lm.scorer' | tee /tmp/resume.log
if ! grep "Restored variables from most recent checkpoint" /tmp/resume.log; then
if ! grep "Loading best validating checkpoint from" /tmp/resume.log; then
echo "Did not resume training from checkpoint"
exit 1
else

View File

@ -1,13 +1,19 @@
#!/bin/sh
'''
This bash script is for running minimum working examples
of transfer learning for continuous integration tests
to be run on Taskcluster.
'''
# This bash script is for running minimum working examples
# of transfer learning for continuous integration tests
# to be run on Taskcluster.
set -xe
ru_csv="data/smoke_test/russian_sample_data/ru.csv"
epoch_count=$1
ru_dir="./data/smoke_test/russian_sample_data"
ru_csv="${ru_dir}/ru.csv"
ldc93s1_dir="./data/smoke_test"
ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
python -u bin/import_ldc93s1.py ${ldc93s1_dir}
fi;
# Force only one visible device because we have a single-sample dataset
# and when trying to run on multiple devices (like GPUs), this will break
@ -20,101 +26,98 @@ for LOAD in 'init' 'last' 'auto'; do
echo "########################################################"
echo "#### Train ENGLISH model with just --checkpoint_dir ####"
echo "########################################################"
python -u DeepSpeech.py --noshow_progressbar --noearly_stop\
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--alphabet_config_path "./data/alphabet.txt" \
--load "$LOAD" \
--train_files "./data/ldc93s1/ldc93s1.csv" --train_batch_size 1 \
--dev_files "./data/ldc93s1/ldc93s1.csv" --dev_batch_size 1 \
--test_files "./data/ldc93s1/ldc93s1.csv" --test_batch_size 1 \
--checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--train_files "${ldc93s1_csv}" --train_batch_size 1 \
--dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
--test_files "${ldc93s1_csv}" --test_batch_size 1 \
--scorer_path '' \
--checkpoint_dir '/tmp/ckpt/transfer/eng' \
--n_hidden 100 \
--epochs 10 \
"$@"
--epochs 10
echo "##############################################################################"
echo "#### Train ENGLISH model with --save_checkpoint_dir --load_checkpoint_dir ####"
echo "##############################################################################"
python -u DeepSpeech.py --noshow_progressbar --noearly_stop\
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--alphabet_config_path "./data/alphabet.txt" \
--load "$LOAD" \
--train_files "./data/ldc93s1/ldc93s1.csv" --train_batch_size 1 \
--dev_files "./data/ldc93s1/ldc93s1.csv" --dev_batch_size 1 \
--test_files "./data/ldc93s1/ldc93s1.csv" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--train_files "${ldc93s1_csv}" --train_batch_size 1 \
--dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
--test_files "${ldc93s1_csv}" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/eng' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng' \
--scorer_path '' \
--n_hidden 100 \
--epochs 10 \
"$@"
--epochs 10
echo "#################################################################################"
echo "#### Transfer Russian model with --save_checkpoint_dir --load_checkpoint_dir ####"
echo "#################################################################################"
python -u DeepSpeech.py --noshow_progressbar --noearly_stop\
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--drop_source_layers 1 \
--alphabet_config_path "${ru_dir}/alphabet.ru" \
--load 'last' \
--train_files "${ru_dir}/ru.csv" --train_batch_size 1 \
--dev_files "${ru_dir}/ru.csv" --dev_batch_size 1 \
--test_files "${ru_dir}/ru.csv" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/ru-cudnn' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--train_files "${ru_csv}" --train_batch_size 1 \
--dev_files "${ru_csv}" --dev_batch_size 1 \
--test_files "${ru_csv}" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/ru' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng' \
--scorer_path '' \
--n_hidden 100 \
--epochs 10 \
"$@"
--epochs 10
done
echo "#######################################################"
echo "##### Train ENGLISH model and transfer to RUSSIAN #####"
echo "##### while iterating over loading logic with CUDNN ###"
echo "##### while iterating over loading logic #####"
echo "#######################################################"
for LOAD in 'init' 'last' 'auto'; do
echo "########################################################"
echo "#### Train ENGLISH model with just --checkpoint_dir ####"
echo "########################################################"
python -u DeepSpeech.py --noshow_progressbar --noearly_stop\
--train_cudnn\
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--alphabet_config_path "./data/alphabet.txt" \
--load "$LOAD" \
--train_files "./data/ldc93s1/ldc93s1.csv" --train_batch_size 1 \
--dev_files "./data/ldc93s1/ldc93s1.csv" --dev_batch_size 1 \
--test_files "./data/ldc93s1/ldc93s1.csv" --test_batch_size 1 \
--checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--train_files "${ldc93s1_csv}" --train_batch_size 1 \
--dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
--test_files "${ldc93s1_csv}" --test_batch_size 1 \
--checkpoint_dir '/tmp/ckpt/transfer/eng' \
--scorer_path '' \
--n_hidden 100 \
--epochs 10 \
"$@"
--epochs 10
echo "##############################################################################"
echo "#### Train ENGLISH model with --save_checkpoint_dir --load_checkpoint_dir ####"
echo "##############################################################################"
python -u DeepSpeech.py --noshow_progressbar --noearly_stop\
--train_cudnn\
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--alphabet_config_path "./data/alphabet.txt" \
--load "$LOAD" \
--train_files "./data/ldc93s1/ldc93s1.csv" --train_batch_size 1 \
--dev_files "./data/ldc93s1/ldc93s1.csv" --dev_batch_size 1 \
--test_files "./data/ldc93s1/ldc93s1.csv" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--train_files "${ldc93s1_csv}" --train_batch_size 1 \
--dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
--test_files "${ldc93s1_csv}" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/eng' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng' \
--scorer_path '' \
--n_hidden 100 \
--epochs 10 \
"$@"
--epochs 10
echo "####################################################################################"
echo "#### Transfer to RUSSIAN model with --save_checkpoint_dir --load_checkpoint_dir ####"
echo "####################################################################################"
python -u DeepSpeech.py --noshow_progressbar --noearly_stop\
--load_cudnn\
python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--drop_source_layers 1 \
--alphabet_config_path "${ru_dir}/alphabet.ru" \
--load 'last' \
--train_files "${ru_dir}/ru.csv" --train_batch_size 1 \
--dev_files "${ru_dir}/ru.csv" --dev_batch_size 1 \
--test_files "${ru_dir}/ru.csv" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/ru-cudnn' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng-cudnn' \
--train_files "${ru_csv}" --train_batch_size 1 \
--dev_files "${ru_csv}" --dev_batch_size 1 \
--test_files "${ru_csv}" --test_batch_size 1 \
--save_checkpoint_dir '/tmp/ckpt/transfer/ru' \
--load_checkpoint_dir '/tmp/ckpt/transfer/eng' \
--scorer_path '' \
--n_hidden 100 \
--epochs 10 \
"$@"
--epochs 10
done

View File

@ -0,0 +1,65 @@
#!/bin/bash
#
# Taskcluster entry point for the transfer-learning smoke test.
#
# Usage: tc-transfer-tests.sh <python-version>:<abi-flag>
#   e.g. tc-transfer-tests.sh 3.6.4:m
#
# Builds the requested Python with pyenv, creates a virtualenv, installs the
# training requirements and the ds_ctcdecoder wheel, then runs
# bin/run-tc-transfer.sh from the DeepSpeech checkout.
#
# The helpers install_pyenv, install_pyenv_virtualenv and verify_ctcdecoder_url
# are not defined in this script; they are expected to be provided by the
# sourced tc-tests-utils.sh. Likewise DS_VERSION, DECODER_ARTIFACTS_ROOT,
# PY37_LDPATH, PY37_SOURCE_PACKAGE and TASKCLUSTER_ARTIFACTS are read but never
# set here -- presumably they come from tc-tests-utils.sh or the task
# environment (TODO confirm).

set -xe

source "$(dirname "$0")/tc-tests-utils.sh"

pyver_full=$1

if [ -z "${pyver_full}" ]; then
  echo "No python version given, aborting."
  exit 1
fi

# Split "<version>:<abi-flag>" into its two fields.
pyver=$(echo "${pyver_full}" | cut -d':' -f1)

# 2.7.x => 27
pyver_pkg=$(echo "${pyver}" | cut -d'.' -f1,2 | tr -d '.')

py_unicode_type=$(echo "${pyver_full}" | cut -d':' -f2)

# Start from a known-empty value so a stray `pyconf` inherited from the
# environment can never leak into PYTHON_CONFIGURE_OPTS below.
pyconf=""
if [ "${py_unicode_type}" = "m" ]; then
  pyconf="ucs2"
elif [ "${py_unicode_type}" = "mu" ]; then
  pyconf="ucs4"
fi

unset PYTHON_BIN_PATH
unset PYTHONPATH

export PYENV_ROOT="${HOME}/ds-train/.pyenv"
export PATH="${PYENV_ROOT}/bin:${HOME}/bin:$PATH"

# Best effort: failures to create these directories are deliberately ignored.
mkdir -p "${PYENV_ROOT}" || true
mkdir -p "${TASKCLUSTER_ARTIFACTS}" || true
mkdir -p /tmp/train || true
mkdir -p /tmp/train_tflite || true

install_pyenv "${PYENV_ROOT}"
install_pyenv_virtualenv "$(pyenv root)/plugins/pyenv-virtualenv"

PYENV_NAME=deepspeech-train
PYTHON_CONFIGURE_OPTS="--enable-unicode=${pyconf}" pyenv install "${pyver}"
pyenv virtualenv "${pyver}" "${PYENV_NAME}"
source "${PYENV_ROOT}/versions/${pyver}/envs/${PYENV_NAME}/bin/activate"

# pip output is piped through cat; pipefail makes a pip failure abort the
# script instead of being masked by cat's zero exit status.
set -o pipefail
pip install --upgrade pip==19.3.1 setuptools==45.0.0 wheel==0.33.6 | cat
pip install --upgrade -r "${HOME}/DeepSpeech/ds/requirements.txt" | cat
set +o pipefail

pushd "${HOME}/DeepSpeech/ds/"
  verify_ctcdecoder_url
popd

# Assemble the wheel file name for the ds_ctcdecoder package matching this
# interpreter: normalised version, cpXY tags, ABI flag and platform tag.
platform=$(python -c 'import sys; import platform; plat = platform.system().lower(); arch = platform.machine().lower(); plat = "manylinux1" if plat == "linux" and arch == "x86_64" else plat; plat = "macosx_10_10" if plat == "darwin" else plat; sys.stdout.write("%s_%s" % (plat, platform.machine()));')
whl_ds_version="$(python -c 'from pkg_resources import parse_version; print(parse_version("'"${DS_VERSION}"'"))')"
decoder_pkg="ds_ctcdecoder-${whl_ds_version}-cp${pyver_pkg}-cp${pyver_pkg}${py_unicode_type}-${platform}.whl"

decoder_pkg_url="${DECODER_ARTIFACTS_ROOT}/${decoder_pkg}"

# Same pipefail guard as above: without it a failed decoder install would be
# hidden by `| cat` and only surface later inside the test run.
# PY37_SOURCE_PACKAGE is intentionally left unquoted so that an empty value
# expands to no argument at all rather than an empty-string argument.
set -o pipefail
LD_LIBRARY_PATH=${PY37_LDPATH}:$LD_LIBRARY_PATH pip install --verbose --only-binary :all: ${PY37_SOURCE_PACKAGE} "${decoder_pkg_url}" | cat
set +o pipefail

pushd "${HOME}/DeepSpeech/ds/"
  time ./bin/run-tc-transfer.sh
popd

deactivate

View File

@ -0,0 +1,12 @@
# Task definition for the "DeepSpeech Linux AMD64 CPU transfer learning Py3.6"
# test (see the metadata entries below).
build:
# Base template this task definition refers to; how it is merged is decided by
# the task generator, which is not part of this file.
template_file: test-linux-opt-base.tyml
# Tasks listed as dependencies of this one.
# NOTE(review): "linux-amd64-ctc-opt" presumably produces the ds_ctcdecoder
# wheel installed by the test script -- confirm against that task.
dependencies:
- "linux-amd64-ctc-opt"
# Package installation command for system setup; the ${...} placeholder is
# substituted from values defined outside this file.
system_setup:
>
apt-get -qq -y install ${python.packages_trusty.apt}
# Command line for the tests: tc-transfer-tests.sh is invoked with the single
# argument "3.6.4:m" (format: <python version>:<ABI flag suffix>).
args:
tests_cmdline: "${system.homedir.linux}/DeepSpeech/ds/taskcluster/tc-transfer-tests.sh 3.6.4:m"
# Human-readable task name and description.
metadata:
name: "DeepSpeech Linux AMD64 CPU transfer learning Py3.6"
description: "Training a DeepSpeech LDC93S1 model with transfer learning for Linux/AMD64 16kHz Python 3.6, CPU only, optimized version"

View File

@ -56,11 +56,11 @@ def _load_checkpoint(session, checkpoint_path):
init_vars.add(v)
load_vars -= init_vars
for v in load_vars:
for v in sorted(load_vars, key=lambda v: v.op.name):
log_info('Loading variable from checkpoint: %s' % (v.op.name))
v.load(ckpt.get_tensor(v.op.name), session=session)
for v in init_vars:
for v in sorted(init_vars, key=lambda v: v.op.name):
log_info('Initializing variable: %s' % (v.op.name))
session.run(v.initializer)