Fixed dropout handling and other fixes

This commit is contained in:
Tilman Kamp 2016-10-11 14:39:36 +02:00
parent 2890264b04
commit f3439b72d5
2 changed files with 93 additions and 90 deletions

View File

@ -76,18 +76,19 @@
},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"import os.path\n",
"import json\n",
"import datetime\n",
"import tempfile\n",
"import subprocess\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"import json\n",
"import subprocess\n",
"import datetime\n",
"from util.log import merge_logs\n",
"from util.gpu import get_available_gpus\n",
"from util.importers.ted_lium import read_data_sets\n",
"from util.text import sparse_tensor_value_to_text, wers\n",
"from tensorflow.python.ops import ctc_ops\n",
"from util.importers.ted_lium import read_data_sets"
"from tensorflow.python.ops import ctc_ops"
]
},
{
@ -125,7 +126,7 @@
"training_iters = 1250 # TODO: Determine a reasonable value for this\n",
"batch_size = 1 # TODO: Determine a reasonable value for this\n",
"display_step = 10 # TODO: Determine a reasonable value for this\n",
"validation_step = 50\n",
"validation_step = 50 # TODO: Determine a reasonable value for this\n",
"checkpoint_step = 1000 # TODO: Determine a reasonable value for this\n",
"checkpoint_dir = tempfile.gettempdir() # TODO: Determine a reasonable value for this"
]
@ -147,7 +148,7 @@
},
"outputs": [],
"source": [
"dropout_rate = 0.01 # TODO: Validate this is a reasonable value"
"dropout_rate = 0.05 # TODO: Validate this is a reasonable value"
]
},
{
@ -400,7 +401,7 @@
},
"outputs": [],
"source": [
"def BiRNN(batch_x, n_steps, dropout):\n",
"def BiRNN(batch_x, n_steps, dropout_rate):\n",
" # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
" batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
" # Reshape to prepare input for first layer\n",
@ -410,20 +411,17 @@
" b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
" h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
" layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
" if dropout:\n",
" layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
" h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
" if dropout:\n",
" layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
" h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
" layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
" if dropout:\n",
" layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
" \n",
" # Define lstm cells with tensorflow\n",
" # Forward direction cell\n",
@ -449,8 +447,7 @@
" b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
" h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
" layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
" if dropout:\n",
" layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
" #Hidden layer of logits\n",
" b6 = variable_on_cpu('b6', [n_hidden_6], tf.random_normal_initializer())\n",
" h6 = variable_on_cpu('h6', [n_hidden_5, n_hidden_6], tf.random_normal_initializer())\n",
@ -471,7 +468,7 @@
"source": [
"The first few lines of the function `BiRNN`\n",
"```python\n",
"def BiRNN(batch_x, n_steps, dropout=True):\n",
"def BiRNN(batch_x, n_steps, dropout_rate):\n",
" # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
" batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
" # Reshape to prepare input for first layer\n",
@ -486,8 +483,7 @@
" b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
" h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
" layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
" if dropout:\n",
" layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
" ...\n",
"```\n",
"pass `batch_x` through the first layer of the non-recurrent neural network, then applies dropout to the result.\n",
@ -497,15 +493,13 @@
" #Hidden layer with clipped RELU activation and dropout\n",
" b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
" h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
" if dropout: \n",
" layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip) \n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
" h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
" layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
" if dropout:\n",
" layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
"```\n",
"\n",
"Next we create the forward and backward LSTM units\n",
@ -549,8 +543,7 @@
" b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
" h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
" layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
" if dropout:\n",
" layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
"```\n",
"\n",
"The next line of `BiRNN`\n",
@ -591,7 +584,7 @@
"source": [
"In accord with [Deep Speech: Scaling up end-to-end speech recognition](http://arxiv.org/abs/1412.5567), the loss function used by our network should be the CTC loss function[[2]](http://www.cs.toronto.edu/~graves/preprint.pdf). Conveniently, this loss function is implemented in TensorFlow. Thus, we can simply make use of this implementation to define our loss.\n",
"\n",
"To do so we introduce a utility function `calculate_accuracy_and_loss()` that calculates the average loss for a mini-batch along with the accuracy"
"To do so we introduce a utility function `calculate_accuracy_and_loss()` beam search decodes a mini-batch and calculates the average loss and accuracy. Next to loss and accuracy it returns the decoded result and the batch's original Y."
]
},
{
@ -602,12 +595,12 @@
},
"outputs": [],
"source": [
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout=False):\n",
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
" # Obtain the next batch of data\n",
" batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
"\n",
" # Calculate the logits of the batch using BiRNN\n",
" logits = BiRNN(batch_x, n_steps, dropout=dropout)\n",
" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
" \n",
" # CTC loss requires the logits be time major\n",
" logits = tf.transpose(logits, [1, 0, 2])\n",
@ -618,12 +611,16 @@
" # Calculate the average loss across the batch\n",
" avg_loss = tf.reduce_mean(total_loss)\n",
" \n",
" # Compute the accuracy\n",
" # Beam search decode the batch\n",
" decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n",
" \n",
" # Compute the edit (Levenshtein) distance \n",
" distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n",
" \n",
" # Compute the accuracy \n",
" accuracy = tf.reduce_mean(distance)\n",
"\n",
" # Return avg_loss and accuracy\n",
" # Return results to the caller\n",
" return avg_loss, accuracy, decoded, batch_y"
]
},
@ -633,7 +630,7 @@
"source": [
"The first lines of `calculate_accuracy_and_loss()`\n",
"```python\n",
"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
" # Obtain the next batch of data\n",
" batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
"```\n",
@ -642,7 +639,7 @@
"The next line\n",
"```python\n",
" # Calculate the logits from the BiRNN\n",
" logits = BiRNN(batch_x)\n",
" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
"```\n",
"calls `BiRNN()` with a batch of data and does inference on the batch.\n",
"\n",
@ -659,17 +656,22 @@
"```\n",
"calculate the average loss using tensor flow's `ctc_loss` operator. \n",
"\n",
"The next lines compute the accuracy\n",
"The next lines first beam decode the batch and then compute the accuracy on base of the Levenshtein distance between the decoded batch and the batch's original Y.\n",
"```python\n",
" # Compute the accuracy\n",
" # Beam search decode the batch\n",
" decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n",
" accuracy = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y))\n",
" \n",
" # Compute the edit (Levenshtein) distance \n",
" distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n",
" \n",
" # Compute the accuracy \n",
" accuracy = tf.reduce_mean(distance)\n",
"```\n",
"\n",
"Finally, the `avg_loss`, accuracy and the decoded batch are returned to the caller\n",
"Finally, the `avg_loss`, accuracy, the decoded batch and the original batch's Y are returned to the caller\n",
"```python\n",
" # Return avg_loss and accuracy\n",
" return avg_loss, accuracy, decoded\n",
" # Return results to the caller\n",
" return avg_loss, accuracy, decoded, batch_y\n",
"```"
]
},
@ -866,22 +868,25 @@
" with tf.device(available_devices[i]):\n",
" # Create a scope for all operations of tower i\n",
" with tf.name_scope('tower_%d' % i) as scope:\n",
" # Calculate the avg_loss and accuracy for this tower\n",
" # Calculate the avg_loss and accuracy and retrieve the decoded \n",
" # batch along with the original batch's labels (Y) of this tower\n",
" avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(\\\n",
" n_steps, \\\n",
" batch_set, \\\n",
" dropout=(optimizer is not None) \\\n",
" )\n",
" n_steps, \\\n",
" batch_set, \\\n",
" dropout_rate if (optimizer is not None) else 0.0 \\\n",
" )\n",
" \n",
" # Allow for variables to be re-used by the next tower\n",
" tf.get_variable_scope().reuse_variables()\n",
" \n",
" # Retain tower's gradients\n",
" # Retain tower's decoded batch\n",
" tower_decodings.append(decoded)\n",
" \n",
" # Retain tower's labels\n",
" # Retain tower's labels (Y)\n",
" tower_labels.append(labels)\n",
" \n",
" # If we are in training, there will be an optimizer given and \n",
" # only then we will compute and retain gradients on base of the loss\n",
" if optimizer is not None:\n",
" # Compute gradients for model parameters using tower's mini-batch\n",
" gradients = optimizer.compute_gradients(avg_loss)\n",
@ -1032,7 +1037,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally we define the log directory plus some helpers."
"Finally we define the top directory for all logs and our current log sub-directory of it.\n",
"We also add some log helpers."
]
},
{
@ -1043,7 +1049,8 @@
},
"outputs": [],
"source": [
"log_dir = '%s/%s' % (\"logs\", time.strftime(\"%Y%m%d-%H%M%S\"))\n",
"logs_dir = \"logs\"\n",
"log_dir = '%s/%s' % (logs_dir, time.strftime(\"%Y%m%d-%H%M%S\"))\n",
"\n",
"def get_git_revision_hash():\n",
" return subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()\n",
@ -1074,7 +1081,7 @@
},
"outputs": [],
"source": [
"def forward(session, data_set):\n",
"def decode_batch(data_set):\n",
" # Set n_steps parameter\n",
" n_steps = data_set.max_batch_seq_len\n",
"\n",
@ -1188,7 +1195,7 @@
" get_tower_results(n_steps, data_sets.train, optimizer)\n",
" \n",
" # Validation step preparation\n",
" validation_tower_decodings, validation_tower_labels = forward(session, data_sets.validation)\n",
" validation_tower_decodings, validation_tower_labels = decode_batch(data_sets.validation)\n",
"\n",
" # Average tower gradients\n",
" avg_tower_gradients = average_gradients(tower_gradients)\n",
@ -1218,10 +1225,9 @@
" # Define total accuracy for the epoch\n",
" total_accuracy = 0\n",
" \n",
" # Validation step to determine the best point in time to stop\n",
" # Validation step\n",
" if epoch % validation_step == 0:\n",
" _, last_validation_wer = print_wer_report(session, \"Validation\", validation_tower_decodings, validation_tower_labels)\n",
" # TODO: Determine on base of WER, if model starts overfitting\n",
" print\n",
"\n",
" # Loop over the batches\n",
@ -1307,7 +1313,7 @@
"outputs": [],
"source": [
"# Test network\n",
"test_decodings, test_labels = forward(session, ted_lium.test)\n",
"test_decodings, test_labels = decode_batch(ted_lium.test)\n",
"_, test_wer = print_wer_report(session, \"Test\", test_decodings, test_labels)"
]
},
@ -1380,41 +1386,6 @@
"Let's also re-populate a central JS file, that contains all the dumps at once."
]
},
{
"cell_type": "code",
"execution_count": null,
@ -1422,7 +1393,9 @@
"collapsed": true
},
"outputs": [],
"source": []
"source": [
"merge_logs(logs_dir)"
]
}
],
"metadata": {

util/log/__init__.py Normal file
View File

@ -0,0 +1,30 @@
import os
import os.path
def merge_logs(logs_dir):
    written = False

    # All direct sub directories of the logs directory
    dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]

    # Let's first populate a temporary file and rename it afterwards - guarantees an interruption-free web experience
    nhf = '%s/%s' % (logs_dir, 'new_hyper.js')

    with open(nhf, 'w') as dump_file:
        # Assigning a global variable that the report page can pick up after loading the data as a regular script
        dump_file.write('window.ALL_THE_DATA = [')
        for d in dirs:
            hf = os.path.join(d, "hyper.json")
            if os.path.isfile(hf):
                # Separate by comma if there was already something written
                if written:
                    dump_file.write(',\n')
                written = True
                # Append the whole file
                dump_file.write(open(hf, 'r').read())
        dump_file.write('];')

    # Finally we rename the temporary file and overwrite a potentially existing active one
    os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))
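
For reference, a minimal usage sketch (illustrative, not part of the committed file): the notebook above calls this helper after a run has written its hyper.json dump, so that logs/hyper.js again contains all dumps at once.

    from util.log import merge_logs
    merge_logs("logs")  # rewrites logs/hyper.js with window.ALL_THE_DATA = [...]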