From b51d81f87f5de3c26b2db59ae6ec6b5f963acd7d Mon Sep 17 00:00:00 2001
From: Scott Zhu
Date: Mon, 10 Dec 2018 12:42:26 -0800
Subject: [PATCH] Update the default recurrent activation function for unified
 LSTM to 'sigmoid'.

I believe that, for historical reasons, the recurrent activation function for
LSTM defaults to hard_sigmoid because it is faster than sigmoid. With the new
LSTM, the performance issue should be fixed by grappler swapping in the CuDNN
backend.

PiperOrigin-RevId: 224863406
---
 tensorflow/python/keras/layers/recurrent.py  | 17 ++++++------
 .../python/keras/layers/unified_lstm_test.py | 27 ++++++++++---------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 86a69e45d90..fb4c1736b18 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -2546,13 +2546,11 @@ class UnifiedLSTM(LSTM):
   Arguments:
     units: Positive integer, dimensionality of the output space.
     activation: Activation function to use.
-      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
-      is applied
-      (ie. "linear" activation: `a(x) = x`).
+      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+      is applied (ie. "linear" activation: `a(x) = x`).
     recurrent_activation: Activation function to use for the recurrent step.
-      Default: hard sigmoid (`hard_sigmoid`). If you pass `None`, no
-      activation is applied
-      (ie. "linear" activation: `a(x) = x`).
+      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
+      applied (ie. "linear" activation: `a(x) = x`).
     use_bias: Boolean, whether the layer uses a bias vector.
     kernel_initializer: Initializer for the `kernel` weights matrix, used for
       the linear transformation of the inputs..
@@ -2602,7 +2600,7 @@ class UnifiedLSTM(LSTM):
   def __init__(self,
                units,
                activation='tanh',
-               recurrent_activation='hard_sigmoid',
+               recurrent_activation='sigmoid',
                use_bias=True,
                kernel_initializer='glorot_uniform',
                recurrent_initializer='orthogonal',
@@ -2663,8 +2661,9 @@ class UnifiedLSTM(LSTM):
     self._num_inputs = None
     self._dropout_mask = None
     self.could_use_cudnn = (
-        activation == 'tanh' and recurrent_dropout == 0 and
-        not unroll and use_bias and bias_regularizer is None)
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias and
+        bias_regularizer is None)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
     # LSTM does not support constants. Ignore it during process.
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
index 932b2d331dc..a2b523b00e0 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -161,17 +161,20 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
       existing_loss = loss_value
 
   @parameterized.named_parameters(
-      ('_non_tan_activation', 'relu', 0, False, True, None),
-      ('_use_recurrent_dropout', 'tanh', 0.1, False, True, None),
-      ('_unroll', 'tanh', 0, True, True, None),
-      ('_not_use_bias', 'tanh', 0, False, False, None),
-      ('_use_bias_regularizer', 'tanh', 0, False, True, 'l2')
+      ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, None),
+      ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True, None),
+      ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True, None),
+      ('unroll', 'tanh', 'sigmoid', 0, True, True, None),
+      ('not_use_bias', 'tanh', 'sigmoid', 0, False, False, None),
+      ('use_bias_regularizer', 'tanh', 'sigmoid', 0, False, True, 'l2')
   )
   @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_could_use_defun_backend(self, activation, recurrent_dropout,
-                                   unroll, use_bias, bias_regularizer):
+  def test_could_use_defun_backend(self, activation, recurrent_activation,
+                                   recurrent_dropout, unroll, use_bias,
+                                   bias_regularizer):
     layer = UnifiedLSTM(1,
                         activation=activation,
+                        recurrent_activation=recurrent_activation,
                         recurrent_dropout=recurrent_dropout,
                         unroll=unroll,
                         use_bias=use_bias,
@@ -270,22 +273,22 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
     with test_util.device(use_gpu=False):
-      # Note that CuDNN use 'sigmoid' as activation. Force the CPU
-      # implementation to use 'sigmoid' so that it will generate same output as
-      # CuDNN implementation.
-      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
+      layer = UnifiedLSTM(rnn_state_size)
       output = layer(inputs)
       cpu_model = keras.models.Model(inputs, output)
       weights = cpu_model.get_weights()
       y_1 = cpu_model.predict(x_train)
 
     with test_util.device(use_gpu=True):
-      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
+      layer = UnifiedLSTM(rnn_state_size)
       output = layer(inputs)
       gpu_model = keras.models.Model(inputs, output)
       gpu_model.set_weights(weights)
       y_2 = gpu_model.predict(x_train)
 
+    # Note that CuDNN uses 'sigmoid' as the recurrent activation, so the
+    # unified LSTM now uses 'sigmoid' as the default. Construct the canonical
+    # LSTM with sigmoid to produce the same output.
     with test_util.device(use_gpu=True):
       layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
       output = layer(inputs)
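For reference, below is a minimal sketch (not part of the patch) of what the new
default means in practice, assuming UnifiedLSTM is importable from
tensorflow.python.keras.layers.recurrent as in the test file above: with the
defaults the layer satisfies the could_use_cudnn condition introduced in
recurrent.py, while overriding recurrent_activation opts the layer out of the
CuDNN-backed path.

# Sketch only; mirrors the could_use_cudnn condition added in recurrent.py.
from tensorflow.python.keras.layers.recurrent import UnifiedLSTM

# Defaults after this change: activation='tanh', recurrent_activation='sigmoid',
# recurrent_dropout=0, unroll=False, use_bias=True, bias_regularizer=None, so
# the layer is eligible for the CuDNN-compatible defun backend.
layer = UnifiedLSTM(32)
assert layer.could_use_cudnn

# Overriding the recurrent activation (e.g. restoring 'hard_sigmoid') falls
# back to the generic implementation, since CuDNN only supports sigmoid here.
legacy_layer = UnifiedLSTM(32, recurrent_activation='hard_sigmoid')
assert not legacy_layer.could_use_cudnn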