Implementation of layer-norm in the training script
parent c5db91413f
commit 2fcba677bb
@@ -74,7 +74,7 @@ def create_overlapping_windows(batch_x):
     return batch_x
 
 
-def dense(name, x, units, dropout_rate=None, relu=True):
+def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
     with tfv1.variable_scope(name):
         bias = variable_on_cpu('bias', [units], tfv1.zeros_initializer())
         weights = variable_on_cpu('weights', [x.shape[-1], units], tfv1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
@@ -84,6 +84,10 @@ def dense(name, x, units, dropout_rate=None, relu=True):
         if relu:
             output = tf.minimum(tf.nn.relu(output), FLAGS.relu_clip)
 
+        if layer_norm:
+            with tfv1.variable_scope(name):
+                output = tf.contrib.layers.layer_norm(output)
+
         if dropout_rate is not None:
             output = tf.nn.dropout(output, rate=dropout_rate)
 
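For reference (a sketch, not code from this commit): with its default arguments, tf.contrib.layers.layer_norm normalizes each example's activations over the feature axis and then applies a learned per-feature scale and shift. A minimal NumPy illustration of that computation for a 2-D [batch, units] activation follows; the names layer_norm_reference, gamma, beta and eps are placeholders, not identifiers from this repository.

import numpy as np

def layer_norm_reference(x, gamma, beta, eps=1e-6):
    # Normalize each row (one example's activations) to zero mean and unit
    # variance over the feature axis, then rescale and shift with the learned
    # per-feature parameters gamma and beta.
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + eps)
    return gamma * x_hat + beta

# Example: a batch of two 4-unit activation vectors.
x = np.array([[1.0, 2.0, 3.0, 4.0],
              [2.0, 2.0, 2.0, 2.0]])
out = layer_norm_reference(x, gamma=np.ones(4), beta=np.zeros(4))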
@@ -177,9 +181,9 @@ def create_model(batch_x, seq_length, dropout, reuse=False, batch_size=None, pre
 
     # The next three blocks will pass `batch_x` through three hidden layers with
     # clipped RELU activation and dropout.
-    layers['layer_1'] = layer_1 = dense('layer_1', batch_x, Config.n_hidden_1, dropout_rate=dropout[0])
-    layers['layer_2'] = layer_2 = dense('layer_2', layer_1, Config.n_hidden_2, dropout_rate=dropout[1])
-    layers['layer_3'] = layer_3 = dense('layer_3', layer_2, Config.n_hidden_3, dropout_rate=dropout[2])
+    layers['layer_1'] = layer_1 = dense('layer_1', batch_x, Config.n_hidden_1, dropout_rate=dropout[0], layer_norm=FLAGS.layer_norm)
+    layers['layer_2'] = layer_2 = dense('layer_2', layer_1, Config.n_hidden_2, dropout_rate=dropout[1], layer_norm=FLAGS.layer_norm)
+    layers['layer_3'] = layer_3 = dense('layer_3', layer_2, Config.n_hidden_3, dropout_rate=dropout[2], layer_norm=FLAGS.layer_norm)
 
     # `layer_3` is now reshaped into `[n_steps, batch_size, 2*n_cell_dim]`,
     # as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`.
@@ -196,7 +200,7 @@ def create_model(batch_x, seq_length, dropout, reuse=False, batch_size=None, pre
     layers['rnn_output_state'] = output_state
 
     # Now we feed `output` to the fifth hidden layer with clipped RELU activation
-    layers['layer_5'] = layer_5 = dense('layer_5', output, Config.n_hidden_5, dropout_rate=dropout[5])
+    layers['layer_5'] = layer_5 = dense('layer_5', output, Config.n_hidden_5, dropout_rate=dropout[5], layer_norm=FLAGS.layer_norm)
 
     # Now we apply a final linear layer creating `n_classes` dimensional vectors, the logits.
     layers['layer_6'] = layer_6 = dense('layer_6', layer_5, Config.n_hidden_6, relu=False)
@@ -135,6 +135,7 @@ def create_flags():
     # Geometry
 
     f.DEFINE_integer('n_hidden', 2048, 'layer width to use when initialising layers')
+    f.DEFINE_boolean('layer_norm', True, 'whether to use layer-normalization after each fully-connected layer (except the last one)')
 
     # Initialization
 
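With layer_norm defaulting to True, each dense(..., layer_norm=FLAGS.layer_norm) call above normalizes its clipped-ReLU output before dropout, while the final linear layer_6 stays un-normalized, matching the "(except the last one)" note in the flag's help text. As a rough sketch of the same per-layer pattern in TF2/Keras terms (not code from this repository; RELU_CLIP, DROPOUT_RATE and NUM_CLASSES are stand-in constants):

import tensorflow as tf

RELU_CLIP = 20.0      # stand-in for FLAGS.relu_clip
DROPOUT_RATE = 0.05   # stand-in for the per-layer dropout rates
NUM_CLASSES = 29      # placeholder output size

def clipped_relu_dense(units, layer_norm=True):
    # Dense -> clipped ReLU -> optional layer norm -> dropout, mirroring the
    # dense(..., layer_norm=...) helper modified in this commit.
    block = [
        tf.keras.layers.Dense(units),
        tf.keras.layers.Lambda(lambda t: tf.minimum(tf.nn.relu(t), RELU_CLIP)),
    ]
    if layer_norm:
        block.append(tf.keras.layers.LayerNormalization())
    block.append(tf.keras.layers.Dropout(DROPOUT_RATE))
    return tf.keras.Sequential(block)

# Three normalized hidden layers followed by an un-normalized linear output layer.
model = tf.keras.Sequential([clipped_relu_dense(2048) for _ in range(3)] +
                            [tf.keras.layers.Dense(NUM_CLASSES)])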