Reintroduced feed_dict for context dependent dropout rates
commit 9fb60a7ebc
parent f3439b72d5
@@ -148,7 +148,16 @@
 },
 "outputs": [],
 "source": [
-"dropout_rate = 0.05 # TODO: Validate this is a reasonable value"
+"dropout_rate = 0.05 # TODO: Validate this is a reasonable value\n",
+"\n",
+"# This global placeholder will be used for all dropout definitions\n",
+"dropout_rate_placeholder = tf.placeholder(tf.float32)\n",
+"\n",
+"# The feed_dict used for training employs the given dropout_rate\n",
+"feed_dict_train = { dropout_rate_placeholder: dropout_rate }\n",
+"\n",
+"# While the feed_dict used for validation, test and train progress reporting employs zero dropout\n",
+"feed_dict = { dropout_rate_placeholder: 0.0 }"
 ]
 },
 {
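For context, the hunk above boils down to the following pattern; this is a minimal sketch assuming a TensorFlow 1.x-style graph API (`tf.placeholder`, `tf.nn.dropout` taking a keep probability), with the input tensor invented purely for illustration:

```python
import tensorflow as tf  # assumes a TF 1.x-era API

dropout_rate = 0.05  # rate used during training

# One global placeholder drives every dropout op in the graph
dropout_rate_placeholder = tf.placeholder(tf.float32)

# Training feeds the configured rate; validation/test/reporting feed zero
feed_dict_train = {dropout_rate_placeholder: dropout_rate}
feed_dict = {dropout_rate_placeholder: 0.0}

# Any layer built against the placeholder picks up whichever rate is fed at run time.
# `some_layer` is a stand-in for layer_1 ... layer_5 in the notebook.
some_layer = tf.ones([16, 128])
some_layer = tf.nn.dropout(some_layer, 1.0 - dropout_rate_placeholder)  # keep_prob = 1 - rate
```

Because the dropout rate now lives in a placeholder rather than a Python argument, the same graph can be run with or without dropout, which is what the later hunks rely on.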
@@ -401,7 +410,7 @@
 },
 "outputs": [],
 "source": [
-"def BiRNN(batch_x, n_steps, dropout_rate):\n",
+"def BiRNN(batch_x, n_steps):\n",
 " # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
 " batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
 " # Reshape to prepare input for first layer\n",
@@ -411,17 +420,17 @@
 " b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
 " h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
 " layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
-" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
+" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate_placeholder))\n",
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
 " h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
 " layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
-" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
+" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate_placeholder))\n",
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
 " h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
 " layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
-" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
+" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate_placeholder))\n",
 " \n",
 " # Define lstm cells with tensorflow\n",
 " # Forward direction cell\n",
@@ -447,7 +456,7 @@
 " b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
 " h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
 " layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
-" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
+" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate_placeholder))\n",
 " #Hidden layer of logits\n",
 " b6 = variable_on_cpu('b6', [n_hidden_6], tf.random_normal_initializer())\n",
 " h6 = variable_on_cpu('h6', [n_hidden_5, n_hidden_6], tf.random_normal_initializer())\n",
@@ -468,7 +477,7 @@
 "source": [
 "The first few lines of the function `BiRNN`\n",
 "```python\n",
-"def BiRNN(batch_x, n_steps, dropout_rate):\n",
+"def BiRNN(batch_x, n_steps):\n",
 " # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
 " batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
 " # Reshape to prepare input for first layer\n",
@@ -483,7 +492,7 @@
 " b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
 " h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
 " layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
-" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
+" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate_placeholder))\n",
 " ...\n",
 "```\n",
 "pass `batch_x` through the first layer of the non-recurrent neural network, then applies dropout to the result.\n",
@@ -494,12 +503,12 @@
 " b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
 " h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
 " layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip) \n",
-" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
+" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate_placeholder))\n",
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
 " h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
 " layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
-" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
+" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate_placeholder))\n",
 "```\n",
 "\n",
 "Next we create the forward and backward LSTM units\n",
@@ -543,7 +552,7 @@
 " b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
 " h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
 " layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
-" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
+" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate_placeholder))\n",
 "```\n",
 "\n",
 "The next line of `BiRNN`\n",
@@ -584,7 +593,7 @@
 "source": [
 "In accord with [Deep Speech: Scaling up end-to-end speech recognition](http://arxiv.org/abs/1412.5567), the loss function used by our network should be the CTC loss function[[2]](http://www.cs.toronto.edu/~graves/preprint.pdf). Conveniently, this loss function is implemented in TensorFlow. Thus, we can simply make use of this implementation to define our loss.\n",
 "\n",
-"To do so we introduce a utility function `calculate_accuracy_and_loss()` beam search decodes a mini-batch and calculates the average loss and accuracy. Next to loss and accuracy it returns the decoded result and the batch's original Y."
+"To do so we introduce a utility function `calculate_accuracy_and_loss()` that beam search decodes a mini-batch and calculates the average loss and accuracy. Next to loss and accuracy it returns the decoded result and the batch's original Y."
 ]
 },
 {
@@ -595,12 +604,12 @@
 },
 "outputs": [],
 "source": [
-"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
+"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
 " # Obtain the next batch of data\n",
 " batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
 "\n",
 " # Calculate the logits of the batch using BiRNN\n",
-" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
+" logits = BiRNN(batch_x, n_steps)\n",
 " \n",
 " # CTC loss requires the logits be time major\n",
 " logits = tf.transpose(logits, [1, 0, 2])\n",
@@ -630,7 +639,7 @@
 "source": [
 "The first lines of `calculate_accuracy_and_loss()`\n",
 "```python\n",
-"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
+"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
 " # Obtain the next batch of data\n",
 " batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
 "```\n",
@@ -639,7 +648,7 @@
 "The next line\n",
 "```python\n",
 " # Calculate the logits from the BiRNN\n",
-" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
+" logits = BiRNN(batch_x, n_steps)\n",
 "```\n",
 "calls `BiRNN()` with a batch of data and does inference on the batch.\n",
 "\n",
@@ -872,8 +881,7 @@
 " # batch along with the original batch's labels (Y) of this tower\n",
 " avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(\\\n",
 " n_steps, \\\n",
-" batch_set, \\\n",
-" dropout_rate if (optimizer is not None) else 0.0 \\\n",
+" batch_set \\\n",
 " )\n",
 " \n",
 " # Allow for variables to be re-used by the next tower\n",
@@ -1121,7 +1129,7 @@
 " \n",
 " # Iterating over the towers\n",
 " for i in range(len(tower_decodings)):\n",
-" decoded, labels = session.run([tower_decodings[i], tower_labels[i]])\n",
+" decoded, labels = session.run([tower_decodings[i], tower_labels[i]], feed_dict)\n",
 " originals.extend(sparse_tensor_value_to_text(labels))\n",
 " results.extend(sparse_tensor_value_to_text(decoded))\n",
 " \n",
@@ -1233,14 +1241,14 @@
 " # Loop over the batches\n",
 " for batch in range(total_batch/len(available_devices)):\n",
 " # Compute the average loss for the last batch\n",
-" _, batch_avg_loss = session.run([apply_gradient_op, tower_loss])\n",
+" _, batch_avg_loss = session.run([apply_gradient_op, tower_loss], feed_dict_train)\n",
 "\n",
 " # Add batch to total_accuracy\n",
-" total_accuracy += session.run(accuracy)\n",
+" total_accuracy += session.run(accuracy, feed_dict_train)\n",
 "\n",
 " # Log all variable states in current step\n",
 " step = epoch * total_batch + batch * len(available_devices)\n",
-" summary_str = session.run(merged)\n",
+" summary_str = session.run(merged, feed_dict_train)\n",
 " writer.add_summary(summary_str, step)\n",
 " writer.flush()\n",
 " \n",
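Putting the last two hunks together, the run-time usage reduces to feeding one dict or the other into `session.run`; below is a small self-contained sketch under the same TF 1.x assumptions, with a dummy tensor standing in for the notebook's real loss, accuracy, and summary nodes:

```python
import tensorflow as tf  # TF 1.x-style graph execution assumed

dropout_rate_placeholder = tf.placeholder(tf.float32)
feed_dict_train = {dropout_rate_placeholder: 0.05}
feed_dict = {dropout_rate_placeholder: 0.0}

# Dummy node standing in for apply_gradient_op / tower_loss / accuracy / merged
dropped = tf.nn.dropout(tf.ones([4, 8]), 1.0 - dropout_rate_placeholder)
loss_like = tf.reduce_mean(dropped)

with tf.Session() as session:
    # Training step: the configured dropout rate is applied
    train_value = session.run(loss_like, feed_dict_train)
    # Validation / progress reporting: the same graph, zero dropout
    eval_value = session.run(loss_like, feed_dict)
    print(train_value, eval_value)
```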