From fb4f5b6a84ac2fb810f961c474c183d17cb56a90 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Mon, 5 Oct 2020 16:04:20 +0200
Subject: [PATCH] Add some coverage for training and inference in bytes output
 mode

---
 bin/run-tc-ldc93s1_checkpoint_bytes.sh        | 31 +++++++++++++++++++
 bin/run-tc-ldc93s1_new_bytes.sh               | 30 ++++++++++++++++++
 bin/run-tc-ldc93s1_new_bytes_tflite.sh        | 26 ++++++++++++++++
 taskcluster/tc-all-utils.sh                   |  1 +
 taskcluster/tc-cpp-bytes-ds-tests.sh          | 16 ++++++++++
 taskcluster/tc-train-extra-tests.sh           | 20 ++++++++++++
 .../test-cpp_16k_bytes-darwin-amd64-opt.yml   | 12 +++++++
 .../test-cpp_16k_bytes-linux-amd64-opt.yml    | 12 +++++++
 8 files changed, 148 insertions(+)
 create mode 100755 bin/run-tc-ldc93s1_checkpoint_bytes.sh
 create mode 100755 bin/run-tc-ldc93s1_new_bytes.sh
 create mode 100755 bin/run-tc-ldc93s1_new_bytes_tflite.sh
 create mode 100644 taskcluster/tc-cpp-bytes-ds-tests.sh
 create mode 100644 taskcluster/test-cpp_16k_bytes-darwin-amd64-opt.yml
 create mode 100644 taskcluster/test-cpp_16k_bytes-linux-amd64-opt.yml

diff --git a/bin/run-tc-ldc93s1_checkpoint_bytes.sh b/bin/run-tc-ldc93s1_checkpoint_bytes.sh
new file mode 100755
index 00000000..0fdd3d8c
--- /dev/null
+++ b/bin/run-tc-ldc93s1_checkpoint_bytes.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+
+if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
+    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+    python -u bin/import_ldc93s1.py ${ldc93s1_dir}
+fi;
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
+  --train_files ${ldc93s1_csv} --train_batch_size 1 \
+  --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
+  --test_files ${ldc93s1_csv} --test_batch_size 1 \
+  --n_hidden 100 --epochs 1 \
+  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' --utf8 \
+  --learning_rate 0.001 --dropout_rate 0.05 \
+  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' | tee /tmp/resume.log
+
+if ! grep "Loading best validating checkpoint from" /tmp/resume.log; then
+  echo "Did not resume training from checkpoint"
+  exit 1
+else
+  exit 0
+fi
diff --git a/bin/run-tc-ldc93s1_new_bytes.sh b/bin/run-tc-ldc93s1_new_bytes.sh
new file mode 100755
index 00000000..b879bee6
--- /dev/null
+++ b/bin/run-tc-ldc93s1_new_bytes.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+
+epoch_count=$1
+audio_sample_rate=$2
+
+if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
+    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+    python -u bin/import_ldc93s1.py ${ldc93s1_dir}
+fi;
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
+  --train_files ${ldc93s1_csv} --train_batch_size 1 \
+  --feature_cache '/tmp/ldc93s1_cache' \
+  --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
+  --test_files ${ldc93s1_csv} --test_batch_size 1 \
+  --n_hidden 100 --epochs $epoch_count \
+  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_bytes' \
+  --learning_rate 0.001 --dropout_rate 0.05  --export_dir '/tmp/train_bytes' \
+  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
+  --audio_sample_rate ${audio_sample_rate} \
+  --utf8
diff --git a/bin/run-tc-ldc93s1_new_bytes_tflite.sh b/bin/run-tc-ldc93s1_new_bytes_tflite.sh
new file mode 100755
index 00000000..b94608de
--- /dev/null
+++ b/bin/run-tc-ldc93s1_new_bytes_tflite.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# Export a TFLite model from the /tmp/ckpt_bytes checkpoint to /tmp/train_bytes_tflite.
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+# The audio sample rate is required; abort early with a usage hint if missing.
+audio_sample_rate=${1:?"usage: $0 AUDIO_SAMPLE_RATE"}
+
+if [ ! -f "${ldc93s1_csv}" ]; then
+    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+    python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
+fi
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+python -u DeepSpeech.py --noshow_progressbar \
+  --n_hidden 100 \
+  --checkpoint_dir '/tmp/ckpt_bytes' \
+  --export_dir '/tmp/train_bytes_tflite' \
+  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
+  --utf8 \
+  --audio_sample_rate "${audio_sample_rate}" \
+  --export_tflite
diff --git a/taskcluster/tc-all-utils.sh b/taskcluster/tc-all-utils.sh
index 2e8d0d76..3f877c5a 100755
--- a/taskcluster/tc-all-utils.sh
+++ b/taskcluster/tc-all-utils.sh
@@ -98,6 +98,7 @@ download_data()
   ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
   cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
   cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
+  cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.bytes.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.bytes.scorer
   cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
 }
 
diff --git a/taskcluster/tc-cpp-bytes-ds-tests.sh b/taskcluster/tc-cpp-bytes-ds-tests.sh
new file mode 100644
index 00000000..20669af6
--- /dev/null
+++ b/taskcluster/tc-cpp-bytes-ds-tests.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -xe
+
+source $(dirname "$0")/tc-tests-utils.sh
+
+bitrate=$1
+set_ldc_sample_filename "${bitrate}"
+
+download_material "${TASKCLUSTER_TMP_DIR}/ds"
+
+export PATH=${TASKCLUSTER_TMP_DIR}/ds/:$PATH
+
+# Bytes output mode with LDC93S1 takes too long to converge so we simply test
+# that loading the model won't crash
+check_versions
diff --git a/taskcluster/tc-train-extra-tests.sh b/taskcluster/tc-train-extra-tests.sh
index 62ec225e..8ecf9465 100644
--- a/taskcluster/tc-train-extra-tests.sh
+++ b/taskcluster/tc-train-extra-tests.sh
@@ -54,10 +54,30 @@ pushd ${HOME}/DeepSpeech/ds/
 
     # Test --metrics_files training argument
     time ./bin/run-tc-ldc93s1_new_metrics.sh 2 "${sample_rate}"
+
+    # Test training with bytes output mode
+    time ./bin/run-tc-ldc93s1_new_bytes.sh 200 "${sample_rate}"
+    time ./bin/run-tc-ldc93s1_new_bytes_tflite.sh "${sample_rate}"
 popd
 
+# Save exported model artifacts from bytes output mode training
+cp /tmp/train_bytes/output_graph.pb "${TASKCLUSTER_ARTIFACTS}/output_graph.pb"
+cp /tmp/train_bytes_tflite/output_graph.tflite "${TASKCLUSTER_ARTIFACTS}/output_graph.tflite"
+
 pushd ${HOME}/DeepSpeech/ds/
+    python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target /tmp/
+popd
+# Convert the exported graph to the .pbmm format and save that artifact too
+/tmp/convert_graphdef_memmapped_format --in_graph=/tmp/train_bytes/output_graph.pb --out_graph=/tmp/train_bytes/output_graph.pbmm
+cp /tmp/train_bytes/output_graph.pbmm "${TASKCLUSTER_ARTIFACTS}"
+
+# Test resuming from checkpoints created above
+pushd ${HOME}/DeepSpeech/ds/
+    # SDB, resuming from checkpoint
     time ./bin/run-tc-ldc93s1_checkpoint_sdb.sh
+
+    # Bytes output mode, resuming from checkpoint
+    time ./bin/run-tc-ldc93s1_checkpoint_bytes.sh
 popd
 
 virtualenv_deactivate "${pyalias}" "deepspeech"
diff --git a/taskcluster/test-cpp_16k_bytes-darwin-amd64-opt.yml b/taskcluster/test-cpp_16k_bytes-darwin-amd64-opt.yml
new file mode 100644
index 00000000..0b1151c1
--- /dev/null
+++ b/taskcluster/test-cpp_16k_bytes-darwin-amd64-opt.yml
@@ -0,0 +1,12 @@
+build:
+  template_file: test-darwin-opt-base.tyml
+  dependencies:
+    - "darwin-amd64-cpu-opt"
+    - "test-training-extra_16k-linux-amd64-py36m-opt"
+    - "homebrew_tests-darwin-amd64"
+  test_model_task: "test-training-extra_16k-linux-amd64-py36m-opt"
+  args:
+    tests_cmdline: "$TASKCLUSTER_TASK_DIR/DeepSpeech/ds/taskcluster/tc-cpp-bytes-ds-tests.sh 16k"
+  metadata:
+    name: "DeepSpeech OSX AMD64 CPU C++ tests (Bytes Output Model, 16kHz)"
+    description: "Testing DeepSpeech C++ for OSX/AMD64, CPU only, optimized version (Bytes Output Model, 16kHz)"
diff --git a/taskcluster/test-cpp_16k_bytes-linux-amd64-opt.yml b/taskcluster/test-cpp_16k_bytes-linux-amd64-opt.yml
new file mode 100644
index 00000000..4d483392
--- /dev/null
+++ b/taskcluster/test-cpp_16k_bytes-linux-amd64-opt.yml
@@ -0,0 +1,12 @@
+build:
+  template_file: test-linux-opt-base.tyml
+  dependencies:
+    - "linux-amd64-cpu-opt"
+    - "test-training-extra_16k-linux-amd64-py36m-opt"
+  test_model_task: "test-training-extra_16k-linux-amd64-py36m-opt"
+  args:
+    tests_cmdline: "${system.homedir.linux}/DeepSpeech/ds/taskcluster/tc-cpp-bytes-ds-tests.sh 16k"
+  workerType: "${docker.dsTests}"
+  metadata:
+    name: "DeepSpeech Linux AMD64 CPU C++ tests (Bytes Output Model, 16kHz)"
+    description: "Testing DeepSpeech C++ for Linux/AMD64, CPU only, optimized version (Bytes Output Model, 16kHz)"