diff --git a/bin/run-tc-ldc93s1_new_metrics.sh b/bin/run-tc-ldc93s1_new_metrics.sh
new file mode 100755
index 00000000..01403bf1
--- /dev/null
+++ b/bin/run-tc-ldc93s1_new_metrics.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+set -xe
+
+ldc93s1_dir="./data/smoke_test"
+ldc93s1_csv="${ldc93s1_dir}/ldc93s1.csv"
+
+epoch_count=$1
+audio_sample_rate=$2
+
+if [ ! -f "${ldc93s1_csv}" ]; then
+  echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+  python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
+fi
+
+# Force only one visible device because we have a single-sample dataset
+# and when trying to run on multiple devices (like GPUs), this will break
+export CUDA_VISIBLE_DEVICES=0
+
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
+  --train_files "${ldc93s1_csv}" --train_batch_size 1 \
+  --dev_files "${ldc93s1_csv}" --dev_batch_size 1 \
+  --test_files "${ldc93s1_csv}" --test_batch_size 1 \
+  --metrics_files "${ldc93s1_csv}" \
+  --n_hidden 100 --epochs "${epoch_count}" \
+  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt_metrics' \
+  --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_metrics' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
+  --audio_sample_rate "${audio_sample_rate}"
diff --git a/taskcluster/tc-train-extra-tests.sh b/taskcluster/tc-train-extra-tests.sh
index dfdcf9dd..62ec225e 100644
--- a/taskcluster/tc-train-extra-tests.sh
+++ b/taskcluster/tc-train-extra-tests.sh
@@ -51,6 +51,9 @@ pushd ${HOME}/DeepSpeech/ds/
   # Testing interleaved source (SDB+CSV combination) - run twice to test preprocessed features
   time ./bin/run-tc-ldc93s1_new_sdb_csv.sh 109 "${sample_rate}"
   time ./bin/run-tc-ldc93s1_new_sdb_csv.sh 1 "${sample_rate}"
+
+  # Test --metrics_files training argument
+  time ./bin/run-tc-ldc93s1_new_metrics.sh 2 "${sample_rate}"
 popd

 pushd ${HOME}/DeepSpeech/ds/