From 0c669723c7e4993f03aaa0267e7f1c8abd675428 Mon Sep 17 00:00:00 2001 From: Tilman Kamp <5991088+tilmankamp@users.noreply.github.com> Date: Tue, 4 Sep 2018 18:49:44 +0200 Subject: [PATCH] Default training setup for new cluster --- .compute | 107 +++++++++++------------------------------------------ .gitignore | 1 + .install | 10 +++++ 3 files changed, 33 insertions(+), 85 deletions(-) create mode 100755 .install diff --git a/.compute b/.compute index d60d6cd3..923cbe09 100755 --- a/.compute +++ b/.compute @@ -1,89 +1,26 @@ #!/bin/bash -# this script gets executed on every node of an allocation; -# in our case we use the provided COMPUTE_* env variables -# to construct a distributed TensorFlow cluster definition +source ../tmp/venv/bin/activate -set -o pipefail +data="${DATA_ROOT}/shared/data" +fis="${data}/LDC/fisher" +swb="${data}/LDC/LDC97S62/swb" +lbs="${data}/OpenSLR/LibriSpeech/librivox" -# activating standard TensorFlow Python virtual environment -source /usr/local/tensorflow/bin/activate - -# the standard script to execute -if [ -f ".run" ]; then - base_cmd="./.run" -fi - -# reading the comma separated node list into array "nodes" -IFS=',' read -r -a nodes <<< "$COMPUTE_NODES" -# keep fist node for convenience -first=${nodes[0]} -# hostname for debugging -hostname=`hostname` -# log timestamp prefix -time_format='[%Y-%m-%d %H:%M:%.S]' -sub_processes=1 - -if ((${#nodes[@]} == 1)); then - # there is only one (this) node - so we are alone and we don't need a cluster definition - echo "Starting single node process on $hostname ..." - logfile=../single.log - touch $logfile # guarantees existence for "tail -f *.log" - $base_cmd --job_name localhost "$@" 2>&1 | ts "$time_format [single ]" >$logfile & -else - # there is more than one node so we will build a cluster definition - - # defining all cluster ports in a way that avoids collisions with other cluster allocations - # (that could eventually get scheduled on the same node) - ((port_base=10000 + (COMPUTE_JOB_NUMBER * 100) % 50000)) - ((coord_port=port_base)) - ((ps_port=port_base + 1)) - ((worker_port=port_base + 2)) - for node in "${nodes[@]}"; do - worker_hosts[$worker_port]="$node:$worker_port" - ((worker_port=worker_port + 1)) - done - - # converting worker_hosts array of host:port pairs into a comma separated list - worker_hosts=$(printf ",%s" "${worker_hosts[@]}") - worker_hosts=${worker_hosts:1} - - # shared cluster configuration - # assert: for this job it should be exactly the same on all allocated nodes - cluster="--coord_host $first --coord_port $coord_port --ps_hosts=$first:$ps_port --worker_hosts=$worker_hosts" - - # helpful for debugging potential networking issues - echo "Starting allocated node no. $COMPUTE_NODE_INDEX on $hostname of cluster (coordinator: $first:$coord_port, ps: $first:$ps_port, workers: $worker_hosts) ..." - - # starting the parameter server side by side with first worker on the first node; - # so for the moment we only run one ps per allocation - if ((COMPUTE_NODE_INDEX == 0)); then - # CUDA_VISIBLE_DEVICES="" - as the parameter server does not require a GPU; - # the GPU would be shared with the worker on the same machine; - # it turned out that this would reduce available GPU memory for the worker by almost 50% - sub_processes=2 - logfile=../ps_$COMPUTE_NODE_INDEX.log - touch $logfile # guarantees existence for "tail -f *.log" - CUDA_VISIBLE_DEVICES="" $base_cmd $cluster --job_name ps --task_index 0 "$@" 2>&1 | ts "$time_format [ps $COMPUTE_NODE_INDEX]" >$logfile & - fi - - # starting the worker - logfile=../worker_$COMPUTE_NODE_INDEX.log - touch $logfile # guarantees existence for "tail -f *.log" - $base_cmd $cluster --job_name worker --task_index $COMPUTE_NODE_INDEX "$@" 2>&1 | ts "$time_format [worker $COMPUTE_NODE_INDEX]" >$logfile & -fi - -for index in $(seq 1 $sub_processes); -do - # "wait -n" waits for any sub-process to exit - # doing this sub_processes times will wait for all sub-processes to finish - # in case of any sub-process failing, it will exit immediately - wait -n - code=$? - if ((code > 0)); then - echo "One compute process failed with exit code $code." - exit $code - else - echo "One compute process succeeded." - fi -done +python3 -u DeepSpeech.py \ + --train_files "${fis}-train.csv","${swb}-train.csv","${lbs}-train-clean-100.csv","${lbs}-train-clean-360.csv","${lbs}-train-other-500.csv" \ + --dev_files "${lbs}-dev-clean.csv"\ + --test_files "${lbs}-test-clean.csv" \ + --train_batch_size 24 \ + --dev_batch_size 48 \ + --test_batch_size 48 \ + --n_hidden 2048 \ + --learning_rate 0.0001 \ + --dropout_rate 0.2367 \ + --epoch 13 \ + --display_step 0 \ + --validation_step 1 \ + --log_level 0 \ + --checkpoint_dir "../keep" \ + --summary_dir "../keep/summaries" \ + --decoder_library_path "../tmp/native_client/libctc_decoder_with_kenlm.so" diff --git a/.gitignore b/.gitignore index 35194f3f..c0d1ad73 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc *.swp *.DS_Store +.pit* /.run /werlog.js /runs diff --git a/.install b/.install new file mode 100755 index 00000000..f6174d0f --- /dev/null +++ b/.install @@ -0,0 +1,10 @@ +#!/bin/bash + +virtualenv -p python3 ../tmp/venv +source ../tmp/venv/bin/activate +pip install -r <(grep -v tensorflow requirements.txt) +pip install tensorflow-gpu==1.6 + +python3 util/taskcluster.py --arch gpu --target ../tmp/native_client + +mkdir -p ../keep/summaries