Fixed dropout handling and other fixes
parent 2890264b04
commit f3439b72d5

DeepSpeech.ipynb (153 changed lines)
@@ -76,18 +76,19 @@
},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"import os.path\n",
"import json\n",
"import datetime\n",
"import tempfile\n",
"import subprocess\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"import json\n",
"import subprocess\n",
"import datetime\n",
"from util.log import merge_logs\n",
"from util.gpu import get_available_gpus\n",
"from util.importers.ted_lium import read_data_sets\n",
"from util.text import sparse_tensor_value_to_text, wers\n",
"from tensorflow.python.ops import ctc_ops\n",
"from util.importers.ted_lium import read_data_sets"
"from tensorflow.python.ops import ctc_ops"
]
},
{
@@ -125,7 +126,7 @@
"training_iters = 1250 # TODO: Determine a reasonable value for this\n",
"batch_size = 1 # TODO: Determine a reasonable value for this\n",
"display_step = 10 # TODO: Determine a reasonable value for this\n",
"validation_step = 50\n",
"validation_step = 50 # TODO: Determine a reasonable value for this\n",
"checkpoint_step = 1000 # TODO: Determine a reasonable value for this\n",
"checkpoint_dir = tempfile.gettempdir() # TODO: Determine a reasonable value for this"
]
@@ -147,7 +148,7 @@
},
"outputs": [],
"source": [
"dropout_rate = 0.01 # TODO: Validate this is a reasonable value"
"dropout_rate = 0.05 # TODO: Validate this is a reasonable value"
]
},
{
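The headline change of this commit is visible here: `dropout_rate` now expresses the probability of dropping a unit and is converted to a keep probability at the point where dropout is applied, so a rate of `0.0` disables dropout entirely. A minimal sketch of that convention against the TensorFlow API the notebook uses (the placeholder tensor below is illustrative, not from the notebook):

```python
import tensorflow as tf

dropout_rate = 0.05  # probability of dropping a unit during training

# Illustrative activation tensor, not part of the notebook
x = tf.placeholder(tf.float32, [None, 128], name='x')

# tf.nn.dropout takes a *keep* probability, hence the (1.0 - dropout_rate) conversion
train_out = tf.nn.dropout(x, 1.0 - dropout_rate)

# At validation/test time the caller simply passes a rate of 0.0,
# which makes dropout a no-op (keep probability 1.0)
eval_out = tf.nn.dropout(x, 1.0 - 0.0)
```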
@@ -400,7 +401,7 @@
},
"outputs": [],
"source": [
"def BiRNN(batch_x, n_steps, dropout):\n",
"def BiRNN(batch_x, n_steps, dropout_rate):\n",
" # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
" batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
" # Reshape to prepare input for first layer\n",
@@ -410,20 +411,17 @@
" b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
" h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
" layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
" if dropout:\n",
" layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
" h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
" if dropout:\n",
" layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
" h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
" layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
" if dropout:\n",
" layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
" \n",
" # Define lstm cells with tensorflow\n",
" # Forward direction cell\n",
@@ -449,8 +447,7 @@
" b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
" h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
" layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
" if dropout:\n",
" layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
" #Hidden layer of logits\n",
" b6 = variable_on_cpu('b6', [n_hidden_6], tf.random_normal_initializer())\n",
" h6 = variable_on_cpu('h6', [n_hidden_5, n_hidden_6], tf.random_normal_initializer())\n",
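Each feed-forward layer of `BiRNN` now follows the same unconditional pattern: affine transform, ReLU clipped at `relu_clip`, then dropout driven by `dropout_rate`. A hedged sketch of that pattern as a stand-alone helper (the helper and the simplified `variable_on_cpu` below are illustrative, not part of the notebook):

```python
import tensorflow as tf

def variable_on_cpu(name, shape, initializer):
    # Simplified stand-in for the notebook's helper: pin the variable to the CPU.
    with tf.device('/cpu:0'):
        return tf.get_variable(name=name, shape=shape, initializer=initializer)

def clipped_relu_dropout_layer(x, name, shape, relu_clip, dropout_rate):
    # Illustrative helper (not in the notebook): one hidden layer as used in BiRNN.
    b = variable_on_cpu('b_' + name, [shape[1]], tf.random_normal_initializer())
    h = variable_on_cpu('h_' + name, shape, tf.random_normal_initializer())
    layer = tf.minimum(tf.nn.relu(tf.add(tf.matmul(x, h), b)), relu_clip)
    # A dropout_rate of 0.0 (validation/test) leaves the activations untouched.
    return tf.nn.dropout(layer, 1.0 - dropout_rate)
```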
@@ -471,7 +468,7 @@
"source": [
"The first few lines of the function `BiRNN`\n",
"```python\n",
"def BiRNN(batch_x, n_steps, dropout=True):\n",
"def BiRNN(batch_x, n_steps, dropout_rate):\n",
" # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
" batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
" # Reshape to prepare input for first layer\n",
@@ -486,8 +483,7 @@
" b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
" h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
" layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
" if dropout:\n",
" layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
" ...\n",
"```\n",
"pass `batch_x` through the first layer of the non-recurrent neural network, then apply dropout to the result.\n",
@@ -497,15 +493,13 @@
" #Hidden layer with clipped RELU activation and dropout\n",
" b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
" h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
" if dropout: \n",
" layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip) \n",
" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
" #Hidden layer with clipped RELU activation and dropout\n",
" b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
" h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
" layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
" if dropout:\n",
" layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
"```\n",
"\n",
"Next we create the forward and backward LSTM units\n",
@@ -549,8 +543,7 @@
" b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
" h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
" layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
" if dropout:\n",
" layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n",
" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
"```\n",
"\n",
"The next line of `BiRNN`\n",
@@ -591,7 +584,7 @@
"source": [
"In accord with [Deep Speech: Scaling up end-to-end speech recognition](http://arxiv.org/abs/1412.5567), the loss function used by our network should be the CTC loss function[[2]](http://www.cs.toronto.edu/~graves/preprint.pdf). Conveniently, this loss function is implemented in TensorFlow. Thus, we can simply make use of this implementation to define our loss.\n",
"\n",
"To do so we introduce a utility function `calculate_accuracy_and_loss()` that calculates the average loss for a mini-batch along with the accuracy"
"To do so we introduce a utility function `calculate_accuracy_and_loss()` that beam search decodes a mini-batch and calculates the average loss and accuracy. In addition to the loss and accuracy, it returns the decoded result and the batch's original Y."
]
},
{
@@ -602,12 +595,12 @@
},
"outputs": [],
"source": [
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout=False):\n",
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
" # Obtain the next batch of data\n",
" batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
"\n",
" # Calculate the logits of the batch using BiRNN\n",
" logits = BiRNN(batch_x, n_steps, dropout=dropout)\n",
" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
" \n",
" # CTC loss requires the logits be time major\n",
" logits = tf.transpose(logits, [1, 0, 2])\n",
@@ -618,12 +611,16 @@
" # Calculate the average loss across the batch\n",
" avg_loss = tf.reduce_mean(total_loss)\n",
" \n",
" # Compute the accuracy\n",
" # Beam search decode the batch\n",
" decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n",
" \n",
" # Compute the edit (Levenshtein) distance \n",
" distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n",
" \n",
" # Compute the accuracy \n",
" accuracy = tf.reduce_mean(distance)\n",
"\n",
" # Return avg_loss and accuracy\n",
" # Return results to the caller\n",
" return avg_loss, accuracy, decoded, batch_y"
]
},
@@ -633,7 +630,7 @@
"source": [
"The first lines of `calculate_accuracy_and_loss()`\n",
"```python\n",
"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
" # Obtain the next batch of data\n",
" batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
"```\n",
@@ -642,7 +639,7 @@
"The next line\n",
"```python\n",
" # Calculate the logits from the BiRNN\n",
" logits = BiRNN(batch_x)\n",
" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
"```\n",
"calls `BiRNN()` with a batch of data and does inference on the batch.\n",
"\n",
@@ -659,17 +656,22 @@
"```\n",
"calculate the average loss using TensorFlow's `ctc_loss` operator.\n",
"\n",
"The next lines compute the accuracy\n",
"The next lines first beam search decode the batch and then compute the accuracy based on the Levenshtein distance between the decoded batch and the batch's original Y.\n",
"```python\n",
" # Compute the accuracy\n",
" # Beam search decode the batch\n",
" decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n",
" accuracy = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y))\n",
" \n",
" # Compute the edit (Levenshtein) distance \n",
" distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n",
" \n",
" # Compute the accuracy \n",
" accuracy = tf.reduce_mean(distance)\n",
"```\n",
"\n",
"Finally, the `avg_loss`, accuracy and the decoded batch are returned to the caller\n",
"Finally, the `avg_loss`, accuracy, the decoded batch and the original batch's Y are returned to the caller\n",
"```python\n",
" # Return avg_loss and accuracy\n",
" return avg_loss, accuracy, decoded\n",
" # Return results to the caller\n",
" return avg_loss, accuracy, decoded, batch_y\n",
"```"
]
},
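For orientation, this is roughly how a caller could evaluate the graph nodes that `calculate_accuracy_and_loss()` returns; the snippet assumes an existing `session` and is a usage sketch, not code from the notebook:

```python
# Hypothetical usage sketch: evaluate the returned graph nodes in one run.
# avg_loss and accuracy come back as floats, decoded[0] as a SparseTensorValue
# holding the beam-search result, batch_y as the batch's original labels.
loss_value, accuracy_value, decoded_value, labels_value = session.run(
    [avg_loss, accuracy, decoded[0], batch_y])
```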
@@ -866,22 +868,25 @@
" with tf.device(available_devices[i]):\n",
" # Create a scope for all operations of tower i\n",
" with tf.name_scope('tower_%d' % i) as scope:\n",
" # Calculate the avg_loss and accuracy for this tower\n",
" # Calculate the avg_loss and accuracy and retrieve the decoded \n",
" # batch along with the original batch's labels (Y) of this tower\n",
" avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(\\\n",
" n_steps, \\\n",
" batch_set, \\\n",
" dropout=(optimizer is not None) \\\n",
" )\n",
" n_steps, \\\n",
" batch_set, \\\n",
" dropout_rate if (optimizer is not None) else 0.0 \\\n",
" )\n",
" \n",
" # Allow for variables to be re-used by the next tower\n",
" tf.get_variable_scope().reuse_variables()\n",
" \n",
" # Retain tower's gradients\n",
" # Retain tower's decoded batch\n",
" tower_decodings.append(decoded)\n",
" \n",
" # Retain tower's labels\n",
" # Retain tower's labels (Y)\n",
" tower_labels.append(labels)\n",
" \n",
" # If we are in training, an optimizer will be given and \n",
" # only then will we compute and retain gradients based on the loss\n",
" if optimizer is not None:\n",
" # Compute gradients for model parameters using tower's mini-batch\n",
" gradients = optimizer.compute_gradients(avg_loss)\n",
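Stripped of the tower machinery, the dropout-related change in this hunk is the third argument: a numeric rate that is zeroed outside of training instead of a boolean flag. As a sketch (the `tower_dropout_rate` name is illustrative, the rest follows the notebook):

```python
# Training towers get the configured rate; validation/test towers get 0.0,
# which makes tf.nn.dropout a no-op inside BiRNN.
tower_dropout_rate = dropout_rate if (optimizer is not None) else 0.0
avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(n_steps, batch_set, tower_dropout_rate)
```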
@@ -1032,7 +1037,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally we define the log directory plus some helpers."
"Finally we define the top-level directory for all logs and our current log sub-directory within it.\n",
"We also add some log helpers."
]
},
{
@@ -1043,7 +1049,8 @@
},
"outputs": [],
"source": [
"log_dir = '%s/%s' % (\"logs\", time.strftime(\"%Y%m%d-%H%M%S\"))\n",
"logs_dir = \"logs\"\n",
"log_dir = '%s/%s' % (logs_dir, time.strftime(\"%Y%m%d-%H%M%S\"))\n",
"\n",
"def get_git_revision_hash():\n",
" return subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()\n",
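The `logs_dir`/`log_dir` split simply gives every run its own timestamped sub-directory under a fixed top-level `logs` directory, for example (the timestamp is of course just an example value):

```python
import time

logs_dir = "logs"
log_dir = '%s/%s' % (logs_dir, time.strftime("%Y%m%d-%H%M%S"))
# e.g. log_dir == "logs/20161006-142530"
```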
@@ -1074,7 +1081,7 @@
},
"outputs": [],
"source": [
"def forward(session, data_set):\n",
"def decode_batch(data_set):\n",
" # Set n_steps parameter\n",
" n_steps = data_set.max_batch_seq_len\n",
"\n",
@@ -1188,7 +1195,7 @@
" get_tower_results(n_steps, data_sets.train, optimizer)\n",
" \n",
" # Validation step preparation\n",
" validation_tower_decodings, validation_tower_labels = forward(session, data_sets.validation)\n",
" validation_tower_decodings, validation_tower_labels = decode_batch(data_sets.validation)\n",
"\n",
" # Average tower gradients\n",
" avg_tower_gradients = average_gradients(tower_gradients)\n",
@@ -1218,10 +1225,9 @@
" # Define total accuracy for the epoch\n",
" total_accuracy = 0\n",
" \n",
" # Validation step to determine the best point in time to stop\n",
" # Validation step\n",
" if epoch % validation_step == 0:\n",
" _, last_validation_wer = print_wer_report(session, \"Validation\", validation_tower_decodings, validation_tower_labels)\n",
" # TODO: Determine, based on the WER, if the model starts overfitting\n",
" print\n",
"\n",
" # Loop over the batches\n",
@@ -1307,7 +1313,7 @@
"outputs": [],
"source": [
"# Test network\n",
"test_decodings, test_labels = forward(session, ted_lium.test)\n",
"test_decodings, test_labels = decode_batch(ted_lium.test)\n",
"_, test_wer = print_wer_report(session, \"Test\", test_decodings, test_labels)"
]
},
@@ -1380,41 +1386,6 @@
"Let's also re-populate a central JS file that contains all the dumps at once."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"written = False\n",
"logs_dir = \"logs\"\n",
"\n",
"# All direct sub directories of the logs directory\n",
"dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]\n",
"\n",
"# Let's first populate a temporal file and rename it afterwards - guarantees an interruption free web experience\n",
"nhf = '%s/%s' % (logs_dir, 'new_hyper.js')\n",
"\n",
"with open(nhf, 'w') as dump_file:\n",
" # Assigning a global variable that the report page can pick up after loading the data as a regular script\n",
" dump_file.write('window.ALL_THE_DATA = [')\n",
" for d in dirs:\n",
" hf = os.path.join(d, \"hyper.json\")\n",
" if os.path.isfile(hf):\n",
" # Separate by comma if there was already something written\n",
" if written:\n",
" dump_file.write(',\\n')\n",
" written = True\n",
" # Append the whole file\n",
" dump_file.write(open(hf, 'r').read())\n",
" dump_file.write('];')\n",
" \n",
"# Finally we rename the file temporal file and overwrite a potentially existing active one\n",
"os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -1422,7 +1393,9 @@
"collapsed": true
},
"outputs": [],
"source": []
"source": [
"merge_logs(logs_dir)"
]
}
],
"metadata": {
util/log/__init__.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import os
import os.path

def merge_logs(logs_dir):
    written = False

    # All direct sub directories of the logs directory
    dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]

    # Let's first populate a temporary file and rename it afterwards - guarantees an interruption-free web experience
    nhf = '%s/%s' % (logs_dir, 'new_hyper.js')

    with open(nhf, 'w') as dump_file:
        # Assigning a global variable that the report page can pick up after loading the data as a regular script
        dump_file.write('window.ALL_THE_DATA = [')
        for d in dirs:
            hf = os.path.join(d, "hyper.json")
            if os.path.isfile(hf):
                # Separate by comma if there was already something written
                if written:
                    dump_file.write(',\n')
                written = True
                # Append the whole file
                dump_file.write(open(hf, 'r').read())
        dump_file.write('];')

    # Finally we rename the temporary file and overwrite a potentially existing active one
    os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))
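For reference, a hedged sketch of how the new helper is meant to be used, mirroring the `merge_logs(logs_dir)` call added to the notebook (the directory layout is inferred from the code above):

```python
from util.log import merge_logs

# Each run's sub-directory of logs/ is expected to contain a hyper.json dump;
# merge_logs concatenates all of them into logs/hyper.js as a single
# `window.ALL_THE_DATA = [...]` assignment the report page can load as a script.
merge_logs("logs")
```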