Fixed dropout handling and other fixes

parent 2890264b04
commit f3439b72d5

DeepSpeech.ipynb (153 lines changed)
@@ -76,18 +76,19 @@
 },
 "outputs": [],
 "source": [
+"import os\n",
 "import time\n",
-"import os.path\n",
+"import json\n",
+"import datetime\n",
 "import tempfile\n",
+"import subprocess\n",
 "import numpy as np\n",
 "import tensorflow as tf\n",
-"import json\n",
-"import subprocess\n",
-"import datetime\n",
+"from util.log import merge_logs\n",
 "from util.gpu import get_available_gpus\n",
+"from util.importers.ted_lium import read_data_sets\n",
 "from util.text import sparse_tensor_value_to_text, wers\n",
-"from tensorflow.python.ops import ctc_ops\n",
-"from util.importers.ted_lium import read_data_sets"
+"from tensorflow.python.ops import ctc_ops"
 ]
 },
 {
@@ -125,7 +126,7 @@
 "training_iters = 1250 # TODO: Determine a reasonable value for this\n",
 "batch_size = 1 # TODO: Determine a reasonable value for this\n",
 "display_step = 10 # TODO: Determine a reasonable value for this\n",
-"validation_step = 50\n",
+"validation_step = 50 # TODO: Determine a reasonable value for this\n",
 "checkpoint_step = 1000 # TODO: Determine a reasonable value for this\n",
 "checkpoint_dir = tempfile.gettempdir() # TODO: Determine a reasonable value for this"
 ]
@@ -147,7 +148,7 @@
 },
 "outputs": [],
 "source": [
-"dropout_rate = 0.01 # TODO: Validate this is a reasonable value"
+"dropout_rate = 0.05 # TODO: Validate this is a reasonable value"
 ]
 },
 {
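A note for readers of this hunk: `dropout_rate` is the probability of dropping an activation, while `tf.nn.dropout` in the TensorFlow releases this notebook targets takes the keep probability, which is why the layer code below passes `(1.0 - dropout_rate)`. A minimal sketch of that relationship (shapes and values are illustrative only):

```python
import tensorflow as tf

dropout_rate = 0.05             # probability of dropping an activation
keep_prob = 1.0 - dropout_rate  # what tf.nn.dropout actually expects

x = tf.ones([4, 8])
# Kept activations are scaled by 1/keep_prob, so the expected value is unchanged
y = tf.nn.dropout(x, keep_prob)
```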
@@ -400,7 +401,7 @@
 },
 "outputs": [],
 "source": [
-"def BiRNN(batch_x, n_steps, dropout):\n",
+"def BiRNN(batch_x, n_steps, dropout_rate):\n",
 " # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
 " batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
 " # Reshape to prepare input for first layer\n",
@@ -410,20 +411,17 @@
 " b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
 " h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
 " layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
-" if dropout:\n",
-" layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
+" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
 " h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
 " layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
-" if dropout:\n",
-" layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
+" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
 " h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
 " layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
-" if dropout:\n",
-" layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
+" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
 " \n",
 " # Define lstm cells with tensorflow\n",
 " # Forward direction cell\n",
@@ -449,8 +447,7 @@
 " b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
 " h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
 " layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
-" if dropout:\n",
-" layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n",
+" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
 " #Hidden layer of logits\n",
 " b6 = variable_on_cpu('b6', [n_hidden_6], tf.random_normal_initializer())\n",
 " h6 = variable_on_cpu('h6', [n_hidden_5, n_hidden_6], tf.random_normal_initializer())\n",
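The pattern this commit moves to, replacing the boolean `dropout` flag with a numeric `dropout_rate` that is applied unconditionally, works because a rate of 0.0 (keep probability 1.0) turns `tf.nn.dropout` into an identity. A condensed sketch of one such layer under that convention (names follow the notebook, but this is not the notebook's exact code):

```python
import tensorflow as tf

def clipped_relu_layer(x, w, b, relu_clip, dropout_rate):
    # Clipped ReLU as in BiRNN, then dropout applied unconditionally;
    # passing dropout_rate == 0.0 keeps every activation.
    layer = tf.minimum(tf.nn.relu(tf.add(tf.matmul(x, w), b)), relu_clip)
    return tf.nn.dropout(layer, 1.0 - dropout_rate)
```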
@@ -471,7 +468,7 @@
 "source": [
 "The first few lines of the function `BiRNN`\n",
 "```python\n",
-"def BiRNN(batch_x, n_steps, dropout=True):\n",
+"def BiRNN(batch_x, n_steps, dropout_rate):\n",
 " # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]\n",
 " batch_x = tf.transpose(batch_x, [1, 0, 2]) # Permute n_steps and batch_size\n",
 " # Reshape to prepare input for first layer\n",
@@ -486,8 +483,7 @@
 " b1 = variable_on_cpu('b1', [n_hidden_1], tf.random_normal_initializer())\n",
 " h1 = variable_on_cpu('h1', [n_input + 2*n_input*n_context, n_hidden_1], tf.random_normal_initializer())\n",
 " layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), relu_clip)\n",
-" if dropout:\n",
-" layer_1 = tf.nn.dropout(layer_1, (1 - dropout_rate))\n",
+" layer_1 = tf.nn.dropout(layer_1, (1.0 - dropout_rate))\n",
 " ...\n",
 "```\n",
 "pass `batch_x` through the first layer of the non-recurrent neural network, then applies dropout to the result.\n",
@@ -497,15 +493,13 @@
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b2 = variable_on_cpu('b2', [n_hidden_2], tf.random_normal_initializer())\n",
 " h2 = variable_on_cpu('h2', [n_hidden_1, n_hidden_2], tf.random_normal_initializer())\n",
-" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip)\n",
-" if dropout: \n",
-" layer_2 = tf.nn.dropout(layer_2, (1 - dropout_rate))\n",
+" layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), relu_clip) \n",
+" layer_2 = tf.nn.dropout(layer_2, (1.0 - dropout_rate))\n",
 " #Hidden layer with clipped RELU activation and dropout\n",
 " b3 = variable_on_cpu('b3', [n_hidden_3], tf.random_normal_initializer())\n",
 " h3 = variable_on_cpu('h3', [n_hidden_2, n_hidden_3], tf.random_normal_initializer())\n",
 " layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), relu_clip)\n",
-" if dropout:\n",
-" layer_3 = tf.nn.dropout(layer_3, (1 - dropout_rate))\n",
+" layer_3 = tf.nn.dropout(layer_3, (1.0 - dropout_rate))\n",
 "```\n",
 "\n",
 "Next we create the forward and backward LSTM units\n",
@@ -549,8 +543,7 @@
 " b5 = variable_on_cpu('b5', [n_hidden_5], tf.random_normal_initializer())\n",
 " h5 = variable_on_cpu('h5', [(2 * n_cell_dim), n_hidden_5], tf.random_normal_initializer())\n",
 " layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(outputs, h5), b5)), relu_clip)\n",
-" if dropout:\n",
-" layer_5 = tf.nn.dropout(layer_5, (1 - dropout_rate))\n",
+" layer_5 = tf.nn.dropout(layer_5, (1.0 - dropout_rate))\n",
 "```\n",
 "\n",
 "The next line of `BiRNN`\n",
@@ -591,7 +584,7 @@
 "source": [
 "In accord with [Deep Speech: Scaling up end-to-end speech recognition](http://arxiv.org/abs/1412.5567), the loss function used by our network should be the CTC loss function[[2]](http://www.cs.toronto.edu/~graves/preprint.pdf). Conveniently, this loss function is implemented in TensorFlow. Thus, we can simply make use of this implementation to define our loss.\n",
 "\n",
-"To do so we introduce a utility function `calculate_accuracy_and_loss()` that calculates the average loss for a mini-batch along with the accuracy"
+"To do so we introduce a utility function `calculate_accuracy_and_loss()` that beam search decodes a mini-batch and calculates its average loss and accuracy. Along with the loss and accuracy it returns the decoded result and the batch's original Y."
 ]
 },
 {
@@ -602,12 +595,12 @@
 },
 "outputs": [],
 "source": [
-"def calculate_accuracy_and_loss(n_steps, batch_set, dropout=False):\n",
+"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
 " # Obtain the next batch of data\n",
 " batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
 "\n",
 " # Calculate the logits of the batch using BiRNN\n",
-" logits = BiRNN(batch_x, n_steps, dropout=dropout)\n",
+" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
 " \n",
 " # CTC loss requires the logits be time major\n",
 " logits = tf.transpose(logits, [1, 0, 2])\n",
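For context when reading this hunk, the cell goes on to feed the time-major logits into TensorFlow's CTC loss. A rough sketch of that step, assuming the 2016-era `ctc_ops.ctc_loss(inputs, labels, sequence_length)` argument order used by the notebook (later TensorFlow releases reordered these arguments, so treat the signature as an assumption):

```python
import tensorflow as tf
from tensorflow.python.ops import ctc_ops

# Placeholders standing in for the notebook's batch; shapes are illustrative
batch_logits = tf.placeholder(tf.float32, [None, None, 29])  # [batch, time, classes]
batch_y = tf.sparse_placeholder(tf.int32)                    # sparse label indices
batch_seq_len = tf.placeholder(tf.int32, [None])             # per-example lengths

# CTC expects time-major inputs: [time, batch, classes]
time_major_logits = tf.transpose(batch_logits, [1, 0, 2])

# Per-example CTC loss, averaged over the mini-batch
total_loss = ctc_ops.ctc_loss(time_major_logits, batch_y, batch_seq_len)
avg_loss = tf.reduce_mean(total_loss)
```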
@@ -618,12 +611,16 @@
 " # Calculate the average loss across the batch\n",
 " avg_loss = tf.reduce_mean(total_loss)\n",
 " \n",
-" # Compute the accuracy\n",
+" # Beam search decode the batch\n",
 " decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n",
+" \n",
+" # Compute the edit (Levenshtein) distance \n",
 " distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n",
+" \n",
+" # Compute the accuracy \n",
 " accuracy = tf.reduce_mean(distance)\n",
 "\n",
-" # Return avg_loss and accuracy\n",
+" # Return results to the caller\n",
 " return avg_loss, accuracy, decoded, batch_y"
 ]
 },
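A side note on the naming here: `tf.edit_distance` normalizes by the length of the ground truth by default, so the tensor called `accuracy` is really a mean normalized Levenshtein distance (a label error rate where lower is better), not an accuracy in the usual sense. A small self-contained illustration of the same computation (the label values are made up):

```python
import tensorflow as tf

# One example: ground truth [1, 2] vs. hypothesis [1, 3], as sparse label indices
truth = tf.SparseTensor([[0, 0], [0, 1]], [1, 2], [1, 2])
hyp = tf.SparseTensor([[0, 0], [0, 1]], [1, 3], [1, 2])

# One substitution over a truth of length 2 gives a normalized distance of 0.5
distance = tf.edit_distance(hyp, truth)   # normalize=True is the default
mean_distance = tf.reduce_mean(distance)  # what the notebook stores as "accuracy"

with tf.Session() as session:
    print(session.run(mean_distance))     # ~0.5
```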
@@ -633,7 +630,7 @@
 "source": [
 "The first lines of `calculate_accuracy_and_loss()`\n",
 "```python\n",
-"def calculate_accuracy_and_loss(n_steps, batch_set):\n",
+"def calculate_accuracy_and_loss(n_steps, batch_set, dropout_rate):\n",
 " # Obtain the next batch of data\n",
 " batch_x, batch_y, batch_seq_len = batch_set.next_batch(batch_size)\n",
 "```\n",
@@ -642,7 +639,7 @@
 "The next line\n",
 "```python\n",
 " # Calculate the logits from the BiRNN\n",
-" logits = BiRNN(batch_x)\n",
+" logits = BiRNN(batch_x, n_steps, dropout_rate)\n",
 "```\n",
 "calls `BiRNN()` with a batch of data and does inference on the batch.\n",
 "\n",
@@ -659,17 +656,22 @@
 "```\n",
 "calculate the average loss using tensor flow's `ctc_loss` operator. \n",
 "\n",
-"The next lines compute the accuracy\n",
+"The next lines first beam search decode the batch and then compute the accuracy based on the Levenshtein distance between the decoded batch and the batch's original Y.\n",
 "```python\n",
-" # Compute the accuracy\n",
+" # Beam search decode the batch\n",
 " decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, batch_seq_len)\n",
-" accuracy = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y))\n",
+" \n",
+" # Compute the edit (Levenshtein) distance \n",
+" distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)\n",
+" \n",
+" # Compute the accuracy \n",
+" accuracy = tf.reduce_mean(distance)\n",
 "```\n",
 "\n",
-"Finally, the `avg_loss`, accuracy and the decoded batch are returned to the caller\n",
+"Finally, the `avg_loss`, accuracy, the decoded batch and the original batch's Y are returned to the caller\n",
 "```python\n",
-" # Return avg_loss and accuracy\n",
-" return avg_loss, accuracy, decoded\n",
+" # Return results to the caller\n",
+" return avg_loss, accuracy, decoded, batch_y\n",
 "```"
 ]
 },
@@ -866,22 +868,25 @@
 " with tf.device(available_devices[i]):\n",
 " # Create a scope for all operations of tower i\n",
 " with tf.name_scope('tower_%d' % i) as scope:\n",
-" # Calculate the avg_loss and accuracy for this tower\n",
+" # Calculate the avg_loss and accuracy and retrieve the decoded \n",
+" # batch along with the original batch's labels (Y) of this tower\n",
 " avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(\\\n",
 " n_steps, \\\n",
 " batch_set, \\\n",
-" dropout=(optimizer is not None) \\\n",
+" dropout_rate if (optimizer is not None) else 0.0 \\\n",
 " )\n",
 " \n",
 " # Allow for variables to be re-used by the next tower\n",
 " tf.get_variable_scope().reuse_variables()\n",
 " \n",
-" # Retain tower's gradients\n",
+" # Retain tower's decoded batch\n",
 " tower_decodings.append(decoded)\n",
 " \n",
-" # Retain tower's labels\n",
+" # Retain tower's labels (Y)\n",
 " tower_labels.append(labels)\n",
 " \n",
+" # If we are in training, there will be an optimizer given and \n",
+" # only then will we compute and retain gradients based on the loss\n",
 " if optimizer is not None:\n",
 " # Compute gradients for model parameters using tower's mini-batch\n",
 " gradients = optimizer.compute_gradients(avg_loss)\n",
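The net effect of the changed call is that only training towers apply dropout; validation and test towers are built without an optimizer and therefore pass a rate of 0.0, running the same graph with dropout disabled. A tiny standalone sketch of that selection (a hypothetical helper, not part of the notebook):

```python
def tower_dropout_rate(dropout_rate, optimizer):
    # Training towers (optimizer present) keep the configured rate;
    # validation/test towers reuse the graph with dropout turned off.
    return dropout_rate if optimizer is not None else 0.0

assert tower_dropout_rate(0.05, optimizer=object()) == 0.05
assert tower_dropout_rate(0.05, optimizer=None) == 0.0
```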
@@ -1032,7 +1037,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Finally we define the log directory plus some helpers."
+"Finally we define the top-level directory for all logs and the current run's log sub-directory within it.\n",
+"We also add some log helpers."
 ]
 },
 {
@@ -1043,7 +1049,8 @@
 },
 "outputs": [],
 "source": [
-"log_dir = '%s/%s' % (\"logs\", time.strftime(\"%Y%m%d-%H%M%S\"))\n",
+"logs_dir = \"logs\"\n",
+"log_dir = '%s/%s' % (logs_dir, time.strftime(\"%Y%m%d-%H%M%S\"))\n",
 "\n",
 "def get_git_revision_hash():\n",
 " return subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()\n",
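To make the resulting layout concrete, splitting the path into `logs_dir` and a time-stamped `log_dir` yields one sub-directory per run under a fixed top-level directory (the timestamp below is of course illustrative):

```python
import time

logs_dir = "logs"
log_dir = '%s/%s' % (logs_dir, time.strftime("%Y%m%d-%H%M%S"))

# e.g. log_dir == "logs/20161114-153042"; merge_logs(logs_dir) later scans all
# such run directories for their hyper.json dumps.
print(log_dir)
```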
@@ -1074,7 +1081,7 @@
 },
 "outputs": [],
 "source": [
-"def forward(session, data_set):\n",
+"def decode_batch(data_set):\n",
 " # Set n_steps parameter\n",
 " n_steps = data_set.max_batch_seq_len\n",
 "\n",
@@ -1188,7 +1195,7 @@
 " get_tower_results(n_steps, data_sets.train, optimizer)\n",
 " \n",
 " # Validation step preparation\n",
-" validation_tower_decodings, validation_tower_labels = forward(session, data_sets.validation)\n",
+" validation_tower_decodings, validation_tower_labels = decode_batch(data_sets.validation)\n",
 "\n",
 " # Average tower gradients\n",
 " avg_tower_gradients = average_gradients(tower_gradients)\n",
@@ -1218,10 +1225,9 @@
 " # Define total accuracy for the epoch\n",
 " total_accuracy = 0\n",
 " \n",
-" # Validation step to determine the best point in time to stop\n",
+" # Validation step\n",
 " if epoch % validation_step == 0:\n",
 " _, last_validation_wer = print_wer_report(session, \"Validation\", validation_tower_decodings, validation_tower_labels)\n",
-" # TODO: Determine on base of WER, if model starts overfitting\n",
 " print\n",
 "\n",
 " # Loop over the batches\n",
@@ -1307,7 +1313,7 @@
 "outputs": [],
 "source": [
 "# Test network\n",
-"test_decodings, test_labels = forward(session, ted_lium.test)\n",
+"test_decodings, test_labels = decode_batch(ted_lium.test)\n",
 "_, test_wer = print_wer_report(session, \"Test\", test_decodings, test_labels)"
 ]
 },
@@ -1380,41 +1386,6 @@
 "Let's also re-populate a central JS file, that contains all the dumps at once."
 ]
 },
-{
-"cell_type": "code",
-"execution_count": 34,
-"metadata": {
-"collapsed": true
-},
-"outputs": [],
-"source": [
-"written = False\n",
-"logs_dir = \"logs\"\n",
-"\n",
-"# All direct sub directories of the logs directory\n",
-"dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]\n",
-"\n",
-"# Let's first populate a temporal file and rename it afterwards - guarantees an interruption free web experience\n",
-"nhf = '%s/%s' % (logs_dir, 'new_hyper.js')\n",
-"\n",
-"with open(nhf, 'w') as dump_file:\n",
-" # Assigning a global variable that the report page can pick up after loading the data as a regular script\n",
-" dump_file.write('window.ALL_THE_DATA = [')\n",
-" for d in dirs:\n",
-" hf = os.path.join(d, \"hyper.json\")\n",
-" if os.path.isfile(hf):\n",
-" # Separate by comma if there was already something written\n",
-" if written:\n",
-" dump_file.write(',\\n')\n",
-" written = True\n",
-" # Append the whole file\n",
-" dump_file.write(open(hf, 'r').read())\n",
-" dump_file.write('];')\n",
-" \n",
-"# Finally we rename the file temporal file and overwrite a potentially existing active one\n",
-"os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))"
-]
-},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -1422,7 +1393,9 @@
 "collapsed": true
 },
 "outputs": [],
-"source": []
+"source": [
+"merge_logs(logs_dir)"
+]
 }
 ],
 "metadata": {
util/log/__init__.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+
+import os
+import os.path
+
+def merge_logs(logs_dir):
+
+    written = False
+
+    # All direct sub directories of the logs directory
+    dirs = [os.path.join(logs_dir, o) for o in os.listdir(logs_dir) if os.path.isdir(os.path.join(logs_dir, o))]
+
+    # Let's first populate a temporary file and rename it afterwards - guarantees an interruption free web experience
+    nhf = '%s/%s' % (logs_dir, 'new_hyper.js')
+
+    with open(nhf, 'w') as dump_file:
+        # Assigning a global variable that the report page can pick up after loading the data as a regular script
+        dump_file.write('window.ALL_THE_DATA = [')
+        for d in dirs:
+            hf = os.path.join(d, "hyper.json")
+            if os.path.isfile(hf):
+                # Separate by comma if there was already something written
+                if written:
+                    dump_file.write(',\n')
+                written = True
+                # Append the whole file
+                dump_file.write(open(hf, 'r').read())
+        dump_file.write('];')
+
+    # Finally we rename the temporary file and overwrite a potentially existing active one
+    os.rename(nhf, '%s/%s' % (logs_dir, 'hyper.js'))
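The notebook's final cell now drives this helper via `merge_logs(logs_dir)`; it can be used the same way from any script. A minimal usage sketch:

```python
from util.log import merge_logs

# Scans every sub-directory of "logs" for a hyper.json dump and rewrites
# logs/hyper.js, the single file the report page loads as a regular script.
merge_logs("logs")
```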