Reintroduced feed_dict for context dependent dropout rates

Tilman Kamp 2016-10-12 11:48:55 +02:00
parent f3439b72d5
commit 9fb60a7ebc
1 changed file with 30 additions and 22 deletions


@@ -148,7 +148,16 @@
},
"outputs": [],
"source": [
"dropout_rate = 0.05 # TODO: Validate this is a reasonable value"
"dropout_rate = 0.05 # TODO: Validate this is a reasonable value\n",
"\n",
"# This global placeholder will be used for all dropout definitions\n",
"dropout_rate_placeholder = tf.placeholder(tf.float32)\n",
"\n",
"# The feed_dict used for training employs the given dropout_rate\n",
"feed_dict_train = { dropout_rate_placeholder: dropout_rate }\n",
"\n",
"# While the feed_dict used for validation, test and train progress reporting employs zero dropout\n",
"feed_dict = { dropout_rate_placeholder: 0.0 }"
]
},
{
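The hunk above replaces the plain `dropout_rate` constant with a placeholder that is fed at run time, so a single graph can run with dropout enabled (training) or disabled (validation, test and progress reporting). A minimal runnable sketch of that switching pattern, using one hypothetical dropout layer instead of the notebook's full network:

```python
import tensorflow as tf

dropout_rate = 0.05

# Global placeholder used by every dropout definition
dropout_rate_placeholder = tf.placeholder(tf.float32)

# Training feeds the configured dropout rate, reporting feeds zero
feed_dict_train = {dropout_rate_placeholder: dropout_rate}
feed_dict = {dropout_rate_placeholder: 0.0}

# Hypothetical one-layer graph, just to show the run-time switch
x = tf.placeholder(tf.float32, [None, 4])
y = tf.nn.dropout(x, 1.0 - dropout_rate_placeholder)

with tf.Session() as session:
    data = [[1.0, 2.0, 3.0, 4.0]]

    # Training-style run: some units are zeroed, the rest rescaled
    fd = dict(feed_dict_train)
    fd[x] = data
    print(session.run(y, feed_dict=fd))

    # Reporting-style run: keep probability 1.0, output equals the input
    fd = dict(feed_dict)
    fd[x] = data
    print(session.run(y, feed_dict=fd))
```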
@@ -401,7 +410,7 @@
},
"outputs": [],
"source": [
"def BiRNN(batch_x, n_steps, dropout_rate):\n",
"def BiRNN(batch_x, n_steps):\n",
" # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
" batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
" # Reshape to prepare input for first layer\n",
@@ -411,17 +420,17 @@
" b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
" h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
" layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate_placeholder))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
" h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate_placeholder))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
" h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
" layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate_placeholder))\n",
" \n",
" # Define lstm cells with tensorflow\n",
" # Forward direction cell\n",
@@ -447,7 +456,7 @@
" b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
" h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
" layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate_placeholder))\n",
" #Hidden layer of logits\n",
" b6 = variable_on_cpu('b6', [n_hidden_6], tf.random_normal_initializer())\n",
" h6 = variable_on_cpu('h6', [n_hidden_5, n_hidden_6], tf.random_normal_initializer())\n",
@@ -468,7 +477,7 @@
"source": [
"The first few lines of the function `BiRNN`\n",
"```python\n",
"def BiRNN(batch_x, n_steps, dropout_rate):\n",
"def BiRNN(batch_x, n_steps):\n",
" # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
" batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
" # Reshape to prepare input for first layer\n",
@@ -483,7 +492,7 @@
" b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
" h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
" layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate_placeholder))\n",
" ...\n",
"```\n",
"pass `batch_x` through the first layer of the non-recurrent neural network, then applies dropout to the result.\n",
@@ -494,12 +503,12 @@
" b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
" h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip) \n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate_placeholder))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
" h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
" layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate_placeholder))\n",
"```\n",
"\n",
"Next we create the forward and backward LSTM units\n",
@@ -543,7 +552,7 @@
" b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
" h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
" layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate_placeholder))\n",
"```\n",
"\n",
"The next line of `BiRNN`\n",
@@ -584,7 +593,7 @@
"source": [
"In accord with [Deep Speech: Scaling up end-to-end speech recognition](http://arxiv.org/abs/1412.5567), the loss function used by our network should be the CTC loss function[[2]](http://www.cs.toronto.edu/~graves/preprint.pdf). Conveniently, this loss function is implemented in TensorFlow. Thus, we can simply make use of this implementation to define our loss.\n",
"\n",
"To do so we introduce a utility function `calculate_accuracy_and_loss()` beam search decodes a mini-batch and calculates the average loss and accuracy. Next to loss and accuracy it returns the decoded result and the batch's original Y."
"To do so we introduce a utility function `calculate_accuracy_and_loss()` that beam search decodes a mini-batch and calculates the average loss and accuracy. Next to loss and accuracy it returns the decoded result and the batch's original Y."
]
},
{
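Since the cell above relies on TensorFlow's built-in CTC implementation, a small self-contained toy example of the loss op may help. The argument order shown matches the 2016-era API (`inputs, labels, sequence_length`); later releases reorder it to (`labels, inputs, sequence_length`), so treat the exact call as illustrative:

```python
import numpy as np
import tensorflow as tf

# Toy shapes: logits are time-major [n_steps, batch_size, n_classes],
# with the last class index reserved for the CTC blank.
n_steps, batch_size, n_classes = 5, 1, 4
logits = tf.constant(np.random.randn(n_steps, batch_size, n_classes).astype(np.float32))

# Sparse labels for a single reference sequence "0 1 2"
labels = tf.SparseTensor([[0, 0], [0, 1], [0, 2]], [0, 1, 2], [1, 3])
seq_len = tf.constant([n_steps], dtype=tf.int32)

# Per-sequence CTC loss, averaged over the (toy) batch
total_loss = tf.nn.ctc_loss(logits, labels, seq_len)
avg_loss = tf.reduce_mean(total_loss)

with tf.Session() as session:
    print(session.run(avg_loss))
```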
@@ -595,12 +604,12 @@
},
"outputs": [],
"source": [
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
" # Obtain the next batch of data\n",
" batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
"\n",
" # Calculate the logits of the batch using BiRNN\n",
" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
" logits = BiRNN(batch_x, n_steps)\n",
" \n",
" # CTC loss requires the logits be time major\n",
" logits = tf.transpose(logits, [1, 0, 2])\n",
@@ -630,7 +639,7 @@
"source": [
"The first lines of `calculate_accuracy_and_loss()`\n",
"```python\n",
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
" # Obtain the next batch of data\n",
" batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
"```\n",
@@ -639,7 +648,7 @@
"The next line\n",
"```python\n",
" # Calculate the logits from the BiRNN\n",
" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
" logits = BiRNN(batch_x, n_steps)\n",
"```\n",
"calls `BiRNN()` with a batch of data and does inference on the batch.\n",
"\n",
@@ -872,8 +881,7 @@
" # batch along with the original batch's labels (Y) of this tower\n",
" avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(\\\n",
" n_steps, \\\n",
" batch_set, \\\n",
" dropout_rate if (optimizer is not None) else 0.0 \\\n",
" batch_set \\\n",
" )\n",
" \n",
" # Allow for variables to be re-used by the next tower\n",
@@ -1121,7 +1129,7 @@
" \n",
" # Iterating over the towers\n",
" for i in range(len(tower_decodings)):\n",
" decoded, labels = session.run([tower_decodings[i], tower_labels[i]])\n",
" decoded, labels = session.run([tower_decodings[i], tower_labels[i]], feed_dict)\n",
" originals.extend(sparse_tensor_value_to_text(labels))\n",
" results.extend(sparse_tensor_value_to_text(decoded))\n",
" \n",
@@ -1233,14 +1241,14 @@
" # Loop over the batches\n",
" for batch in range(total_batch/len(available_devices)):\n",
" # Compute the average loss for the last batch\n",
" _, batch_avg_loss = session.run([apply_gradient_op, tower_loss])\n",
" _, batch_avg_loss = session.run([apply_gradient_op, tower_loss], feed_dict_train)\n",
"\n",
" # Add batch to total_accuracy\n",
" total_accuracy += session.run(accuracy)\n",
" total_accuracy += session.run(accuracy, feed_dict_train)\n",
"\n",
" # Log all variable states in current step\n",
" step = epoch * total_batch + batch * len(available_devices)\n",
" summary_str = session.run(merged)\n",
" summary_str = session.run(merged, feed_dict_train)\n",
" writer.add_summary(summary_str, step)\n",
" writer.flush()\n",
" \n",