diff --git a/DeepSpeech.ipynb b/DeepSpeech.ipynb
index 54d2888a..1f043206 100644
--- a/DeepSpeech.ipynb
+++ b/DeepSpeech.ipynb
@@ -76,18 +76,19 @@
    },
    "outputs": [],
    "source": [
+    "import os\n",
     "import time\n",
-    "import os.path\n",
+    "import json\n",
+    "import datetime\n",
     "import tempfile\n",
+    "import subprocess\n",
     "import numpy as np\n",
     "import tensorflow as tf\n",
-    "import json\n",
-    "import subprocess\n",
-    "import datetime\n",
+    "from util.log import merge_logs\n",
     "from util.gpu import get_available_gpus\n",
+    "from util.importers.ted_lium import read_data_sets\n",
     "from util.text import sparse_tensor_value_to_text, wers\n",
-    "from tensorflow.python.ops import ctc_ops\n",
-    "from util.importers.ted_lium import read_data_sets"
+    "from tensorflow.python.ops import ctc_ops"
    ]
   },
   {
@@ -125,7 +126,7 @@
     "training_iters = 1250 # TODO: Determine a reasonable value for this\n",
     "batch_size = 1 # TODO: Determine a reasonable value for this\n",
     "display_step = 10 # TODO: Determine a reasonable value for this\n",
-    "validation_step = 50\n",
+    "validation_step = 50 # TODO: Determine a reasonable value for this\n",
     "checkpoint_step = 1000 # TODO: Determine a reasonable value for this\n",
     "checkpoint_dir = tempfile.gettempdir() # TODO: Determine a reasonable value for this"
    ]
   },
@@ -147,7 +148,7 @@
    },
    "outputs": [],
    "source": [
-    "dropout_rate = 0.01 # TODO: Validate this is a reasonable value"
+    "dropout_rate = 0.05 # TODO: Validate this is a reasonable value"
    ]
   },
   {
@@ -400,7 +401,7 @@
    },
    "outputs": [],
    "source": [
-    "def BiRNN(batch_x, n_steps, dropout):\n",
+    "def BiRNN(batch_x, n_steps, dropout_rate):\n",
     "    # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
     "    batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
     "    # Reshape to prepare input for first layer\n",
@@ -410,20 +411,17 @@
     "    b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
     "    h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
     "    layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
-    "    if dropout:\n",
-    "        layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
+    "    layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
     "    #Hidden layer with clipped RELU activation and dropout\n",
     "    b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
     "    h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
     "    layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
-    "    if dropout:\n",
-    "        layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
+    "    layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
     "    #Hidden layer with clipped RELU activation and dropout\n",
     "    b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
     "    h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
     "    layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
-    "    if dropout:\n",
-    "        layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
+    "    layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
     "    \n",
     "    # Define lstm cells with tensorflow\n",
     "    # Forward direction cell\n",
@@ -449,8 +447,7 @@
     "    b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
     "    h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
relu_clip)\n", - " if dropout:\n", - " layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n", + " layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n", " #Hidden layer of logits\n", " b6 = variable_on_cpu('b6', [n_hidden_6], tf.random_normal_initializer())\n", " h6 = variable_on_cpu('h6', [n_hidden_5, n_hidden_6], tf.random_normal_initializer())\n", @@ -471,7 +468,7 @@ "source": [ "The first few lines of the function `BiRNN`\n", "```python\n", - "def BiRNN(batch_x, n_steps, dropout=True):\n", + "def BiRNN(batch_x, n_steps, dropout_rate):\n", " # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n", " batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n", " # Reshape to prepare input for first layer\n", @@ -486,8 +483,7 @@ " b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n", " h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n", " layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n", - " if dropout:\n", - " layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n", + " layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n", " ...\n", "```\n", "pass `batch_x` through the first layer of the non-recurrent neural network, then applies dropout to the result.\n", @@ -497,15 +493,13 @@ " #Hidden layer with clipped RELU activation and dropout\n", " b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n", " h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n", - " layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n", - " if dropout: \n", - " layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n", + " layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip) \n", + " layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n", " #Hidden layer with clipped RELU activation and dropout\n", " b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n", " h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n", " layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n", - " if dropout:\n", - " layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n", + " layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n", "```\n", "\n", "Next we create the forward and backward LSTM units\n", @@ -549,8 +543,7 @@ " b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n", " h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n", " layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n", - " if dropout:\n", - " layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n", + " layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n", "```\n", "\n", "The next line of `BiRNN`\n", @@ -591,7 +584,7 @@ "source": [ "In accord with [Deep Speech: Scaling up end-to-end speech recognition](http://arxiv.org/abs/1412.5567), the loss function used by our network should be the CTC loss function[[2]](http://www.cs.toronto.edu/~graves/preprint.pdf). Conveniently, this loss function is implemented in TensorFlow. 
Thus, we can simply make use of this implementation to define our loss.\n", "\n", - "To do so we introduce a utility function `calculate_accuracy_and_loss()` that calculates the average loss for a mini-batch along with the accuracy" + "To do so we introduce a utility function `calculate_accuracy_and_loss()` beam search decodes a mini-batch and calculates the average loss and accuracy. Next to loss and accuracy it returns the decoded result and the batch's original Y." ] }, { @@ -602,12 +595,12 @@ }, "outputs": [], "source": [ - "def calculate_accuracy_and_loss(n_steps, batch_set, dropout=False):\n", + "def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n", " # Obtain the next batch of data\n", " batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n", "\n", " # Calculate the logits of the batch using BiRNN\n", - " logits = BiRNN(batch_x, n_steps, dropout=dropout)\n", + " logits = BiRNN(batch_x, n_steps, dropout_rate)\n", " \n", " # CTC loss requires the logits be time major\n", " logits = tf.transpose(logits, [1, 0, 2])\n", @@ -618,12 +611,16 @@ " # Calculate the average loss across the batch\n", " avg_loss = tf.reduce_mean(total_loss)\n", " \n", - " # Compute the accuracy\n", + " # Beam search decode the batch\n", " decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n", + " \n", + " # Compute the edit (Levenshtein) distance \n", " distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n", + " \n", + " # Compute the accuracy \n", " accuracy = tf.reduce_mean(distance)\n", "\n", - " # Return avg_loss and accuracy\n", + " # Return results to the caller\n", " return avg_loss, accuracy, decoded, batch_y" ] }, @@ -633,7 +630,7 @@ "source": [ "The first lines of `calculate_accuracy_and_loss()`\n", "```python\n", - "def calculate_accuracy_and_loss(n_steps, batch_set):\n", + "def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n", " # Obtain the next batch of data\n", " batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n", "```\n", @@ -642,7 +639,7 @@ "The next line\n", "```python\n", " # Calculate the logits from the BiRNN\n", - " logits = BiRNN(batch_x)\n", + " logits = BiRNN(batch_x, n_steps, dropout_rate)\n", "```\n", "calls `BiRNN()` with a batch of data and does inference on the batch.\n", "\n", @@ -659,17 +656,22 @@ "```\n", "calculate the average loss using tensor flow's `ctc_loss` operator. 
\n", "\n", - "The next lines compute the accuracy\n", + "The next lines first beam decode the batch and then compute the accuracy on base of the Levenshtein distance between the decoded batch and the batch's original Y.\n", "```python\n", - " # Compute the accuracy\n", + " # Beam search decode the batch\n", " decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n", - " accuracy = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y))\n", + " \n", + " # Compute the edit (Levenshtein) distance \n", + " distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n", + " \n", + " # Compute the accuracy \n", + " accuracy = tf.reduce_mean(distance)\n", "```\n", "\n", - "Finally, the `avg_loss`, accuracy and the decoded batch are returned to the caller\n", + "Finally, the `avg_loss`, accuracy, the decoded batch and the original batch's Y are returned to the caller\n", "```python\n", - " # Return avg_loss and accuracy\n", - " return avg_loss, accuracy, decoded\n", + " # Return results to the caller\n", + " return avg_loss, accuracy, decoded, batch_y\n", "```" ] }, @@ -866,22 +868,25 @@ " with tf.device(available_devices[i]):\n", " # Create a scope for all operations of tower i\n", " with tf.name_scope('tower_%d' % i) as scope:\n", - " # Calculate the avg_loss and accuracy for this tower\n", + " # Calculate the avg_loss and accuracy and retrieve the decoded \n", + " # batch along with the original batch's labels (Y) of this tower\n", " avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(\\\n", - " n_steps, \\\n", - " batch_set, \\\n", - " dropout=(optimizer is not None) \\\n", - " )\n", + " n_steps, \\\n", + " batch_set, \\\n", + " dropout_rate if (optimizer is not None) else 0.0 \\\n", + " )\n", " \n", " # Allow for variables to be re-used by the next tower\n", " tf.get_variable_scope().reuse_variables()\n", " \n", - " # Retain tower's gradients\n", + " # Retain tower's decoded batch\n", " tower_decodings.append(decoded)\n", " \n", - " # Retain tower's labels\n", + " # Retain tower's labels (Y)\n", " tower_labels.append(labels)\n", " \n", + " # If we are in training, there will be an optimizer given and \n", + " # only then we will compute and retain gradients on base of the loss\n", " if optimizer is not None:\n", " # Compute gradients for model parameters using tower's mini-batch\n", " gradients = optimizer.compute_gradients(avg_loss)\n", @@ -1032,7 +1037,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally we define the log directory plus some helpers." + "Finally we define the top directory for all logs and our current log sub-directory of it.\n", + "We also add some log helpers." 
    ]
   },
   {
@@ -1043,7 +1049,8 @@
    },
    "outputs": [],
    "source": [
-    "log_dir = '%s/%s' % (\"logs\", time.strftime(\"%Y%m%d-%H%M%S\"))\n",
+    "logs_dir = \"logs\"\n",
+    "log_dir = '%s/%s' % (logs_dir, time.strftime(\"%Y%m%d-%H%M%S\"))\n",
     "\n",
     "def get_git_revision_hash():\n",
     "    return subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()\n",
@@ -1074,7 +1081,7 @@
    },
    "outputs": [],
    "source": [
-    "def forward(session, data_set):\n",
+    "def decode_batch(data_set):\n",
     "    # Set n_steps parameter\n",
     "    n_steps = data_set.max_batch_seq_len\n",
     "\n",
@@ -1188,7 +1195,7 @@
     "    get_tower_results(n_steps, data_sets.train, optimizer)\n",
     "\n",
     "# Validation step preparation\n",
-    "validation_tower_decodings, validation_tower_labels = forward(session, data_sets.validation)\n",
+    "validation_tower_decodings, validation_tower_labels = decode_batch(data_sets.validation)\n",
     "\n",
     "# Average tower gradients\n",
     "avg_tower_gradients = average_gradients(tower_gradients)\n",
@@ -1218,10 +1225,9 @@
     "    # Define total accuracy for the epoch\n",
     "    total_accuracy = 0\n",
     "    \n",
-    "    # Validation step to determine the best point in time to stop\n",
+    "    # Validation step\n",
     "    if epoch % validation_step == 0:\n",
     "        _, last_validation_wer = print_wer_report(session, \"Validation\", validation_tower_decodings, validation_tower_labels)\n",
-    "        # TODO: Determine on base of WER, if model starts overfitting\n",
     "        print\n",
     "\n",
     "    # Loop over the batches\n",
@@ -1307,7 +1313,7 @@
    "outputs": [],
    "source": [
     "# Test network\n",
-    "test_decodings, test_labels = forward(session, ted_lium.test)\n",
+    "test_decodings, test_labels = decode_batch(ted_lium.test)\n",
     "_, test_wer = print_wer_report(session, \"Test\", test_decodings, test_labels)"
    ]
   },
@@ -1380,41 +1386,6 @@
     "Let's also re-populate a central JS file, that contains all the dumps at once."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "written = False\n",
-    "logs_dir = \"logs\"\n",
-    "\n",
-    "# All direct sub directories of the logs directory\n",
-    "dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]\n",
-    "\n",
-    "# Let's first populate a temporal file and rename it afterwards - guarantees an interruption free web experience\n",
-    "nhf = '%s/%s' % (logs_dir, 'new_hyper.js')\n",
-    "\n",
-    "with open(nhf, 'w') as dump_file:\n",
-    "    # Assigning a global variable that the report page can pick up after loading the data as a regular script\n",
-    "    dump_file.write('window.ALL_THE_DATA = [')\n",
-    "    for d in dirs:\n",
-    "        hf = os.path.join(d, \"hyper.json\")\n",
-    "        if os.path.isfile(hf):\n",
-    "            # Separate by comma if there was already something written\n",
-    "            if written:\n",
-    "                dump_file.write(',\\n')\n",
-    "            written = True\n",
-    "            # Append the whole file\n",
-    "            dump_file.write(open(hf, 'r').read())\n",
-    "    dump_file.write('];')\n",
-    "    \n",
-    "# Finally we rename the file temporal file and overwrite a potentially existing active one\n",
-    "os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
@@ -1422,7 +1393,9 @@
     "collapsed": true
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "merge_logs(logs_dir)"
+   ]
   }
  ],
  "metadata": {
diff --git a/util/log/__init__.py b/util/log/__init__.py
new file mode 100644
index 00000000..1762f6f0
--- /dev/null
+++ b/util/log/__init__.py
@@ -0,0 +1,30 @@
+
+import os
+import os.path
+
+def merge_logs(logs_dir):
+
+    written = False
+
+    # All direct sub-directories of the logs directory
+    dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]
+
+    # Let's first populate a temporary file and rename it afterwards - guarantees an interruption-free web experience
+    nhf = '%s/%s' % (logs_dir, 'new_hyper.js')
+
+    with open(nhf, 'w') as dump_file:
+        # Assigning a global variable that the report page can pick up after loading the data as a regular script
+        dump_file.write('window.ALL_THE_DATA = [')
+        for d in dirs:
+            hf = os.path.join(d, "hyper.json")
+            if os.path.isfile(hf):
+                # Separate by comma if there was already something written
+                if written:
+                    dump_file.write(',\n')
+                written = True
+                # Append the whole file
+                dump_file.write(open(hf, 'r').read())
+        dump_file.write('];')
+
+    # Finally we rename the temporary file and overwrite a potentially existing active one
+    os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))
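Note on the new helper: the `util.log.merge_logs` module added above replaces the notebook cell that previously rebuilt `logs/hyper.js` inline. A minimal usage sketch, assuming (as in the notebook) a `logs` directory whose per-run sub-directories each contain a `hyper.json` dump:

```python
# Minimal sketch of calling the new helper outside the notebook.
# "logs" mirrors the notebook's logs_dir value; any directory containing
# per-run sub-directories with hyper.json files would work the same way.
from util.log import merge_logs

# Writes logs/new_hyper.js first, then renames it to logs/hyper.js so the
# report page never sees a half-written file.
merge_logs("logs")
```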