From fca5923fab6fddf3f73dda2a0bb2933213962cdf Mon Sep 17 00:00:00 2001
From: Hongkun Yu
Date: Wed, 7 Oct 2020 22:03:00 -0700
Subject: [PATCH] Keep return_attention_scores consistent with MultiHeadAttention, before TF 2.4 release.

PiperOrigin-RevId: 336016413
Change-Id: If9cbc68586fddf1da221cfda35dd4b72b3b68897
---
 RELEASE.md                                    |  4 +
 .../python/keras/layers/dense_attention.py    | 24 ++--
 .../keras/layers/dense_attention_test.py      | 91 +++++++++----------
 ...low.keras.layers.-additive-attention.pbtxt |  2 +-
 .../tensorflow.keras.layers.-attention.pbtxt  |  2 +-
 ...low.keras.layers.-additive-attention.pbtxt |  2 +-
 .../tensorflow.keras.layers.-attention.pbtxt  |  2 +-
 7 files changed, 62 insertions(+), 65 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 94ac98deb56..0fd29a0027a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -209,6 +209,10 @@ h# Release 2.4.0
 * Improvements to Keras preprocessing layers:
   * TextVectorization can now accept a vocabulary list or file as an init
     arg.
+  * In `Attention` and `AdditiveAttention` layers, the `call()` method now
+    accepts a `return_attention_scores` argument. When set to
+    True, the layer returns the attention scores as an additional output
+    argument.
 * `tf.function` / AutoGraph:
   * Added `experimental_follow_type_hints` argument for `tf.function`. When
diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index ab2912505ef..89750606c17 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -49,8 +49,6 @@ class BaseDenseAttention(Layer):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -69,6 +67,8 @@ class BaseDenseAttention(Layer):
         `mask==False` do not contribute to the result.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
 
   Output:
 
@@ -77,12 +77,11 @@ class BaseDenseAttention(Layer):
     `[batch_size, Tq, Tv]`.
   """
 
-  def __init__(self, causal=False, dropout=0.0, return_attention_scores=False,
+  def __init__(self, causal=False, dropout=0.0,
                **kwargs):
     super(BaseDenseAttention, self).__init__(**kwargs)
     self.causal = causal
     self.dropout = dropout
-    self.return_attention_scores = return_attention_scores
     self.supports_masking = True
 
   def _calculate_scores(self, query, key):
@@ -140,7 +139,11 @@ class BaseDenseAttention(Layer):
     return math_ops.matmul(weights, value), weights
 
   # TODO(b/125916026): Consider exposing a __call__ method with named args.
-  def call(self, inputs, mask=None, training=None):
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           return_attention_scores=False):
     self._validate_call_args(inputs=inputs, mask=mask)
     q = inputs[0]
     v = inputs[1]
@@ -170,7 +173,7 @@ class BaseDenseAttention(Layer):
       # Mask of shape [batch_size, Tq, 1].
       q_mask = array_ops.expand_dims(q_mask, axis=-1)
       result *= math_ops.cast(q_mask, dtype=result.dtype)
-    if self.return_attention_scores:
+    if return_attention_scores:
       return result, attention_scores
     return result
 
@@ -209,7 +212,6 @@ class BaseDenseAttention(Layer):
     config = {
         'causal': self.causal,
         'dropout': self.dropout,
-        'return_attention_scores': self.return_attention_scores,
     }
     base_config = super(BaseDenseAttention, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -239,8 +241,6 @@ class Attention(BaseDenseAttention):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -257,6 +257,8 @@ class Attention(BaseDenseAttention):
       * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
 
@@ -378,8 +380,6 @@ class AdditiveAttention(BaseDenseAttention):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
      attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -398,6 +398,8 @@ class AdditiveAttention(BaseDenseAttention):
        `mask==False` do not contribute to the result.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
 
   Output:
 
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index 5df53a8d1fb..8570f41b34f 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -82,8 +82,8 @@ class BaseDenseAttentionTest(test.TestCase, parameterized.TestCase):
     # => softmax_scores000 = exp(1)/(exp(1) + exp(0)) = 0.73105857863
     #    softmax_scores001 = exp(0)/(exp(1) + exp(0)) = 0.26894142137
     #    softmax_scores002 = 0
-    expected_scores = np.array(
-        [[[0.73105857863, 0.26894142137, 0.]]], dtype=np.float32)
+    expected_scores = np.array([[[0.73105857863, 0.26894142137, 0.]]],
+                               dtype=np.float32)
     self.assertAllClose(expected_scores, actual_scores)
     # Expected tensor of shape [1, 1, 1].
     # expected000 = 0.73105857863 * 1.6 + 0.26894142137 * 0.7 - 0 * 0.8
@@ -187,8 +187,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_calculate_scores_multi_dim(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Key tensor of shape [1, 3, 4]
     k = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -204,8 +203,8 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24
     # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84
     # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44
-    expected = np.array(
-        [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32)
+    expected = np.array([[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_calculate_scores_one_dim_batch_size_two(self):
@@ -241,8 +240,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -257,8 +255,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_with_key(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -342,12 +339,16 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     q_mask = np.array([[True, False]], dtype=np.bool_)
     # Value mask tensor of shape [1, 3]
     v_mask = np.array([[True, True, False]], dtype=np.bool_)
-    attention_layer = dense_attention.Attention(
-        return_attention_scores=return_attention_scores)
+    attention_layer = dense_attention.Attention()
     if return_attention_scores:
-      actual, actual_scores = attention_layer([q, v], mask=[q_mask, v_mask])
+      actual, actual_scores = attention_layer(
+          [q, v],
+          mask=[q_mask, v_mask],
+          return_attention_scores=return_attention_scores)
     else:
-      actual = attention_layer([q, v], mask=[q_mask, v_mask])
+      actual = attention_layer([q, v],
+                               mask=[q_mask, v_mask],
+                               return_attention_scores=return_attention_scores)
 
     # Expected scores of shape [1, 2, 3]
     # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8], [-0.5*1.6, -0.5*0.7, 0.5*0.8]]]
@@ -365,10 +366,9 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #                              = 0.61063923394
     # attention_distribution012 = 0
     if return_attention_scores:
-      expected_scores = np.array(
-          [[[0.72908792234, 0.27091207765, 0.],
-            [0.38936076605, 0.61063923394, 0.]]],
-          dtype=np.float32)
+      expected_scores = np.array([[[0.72908792234, 0.27091207765, 0.],
+                                   [0.38936076605, 0.61063923394, 0.]]],
+                                 dtype=np.float32)
       self.assertAllClose(expected_scores, actual_scores)
     # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False.
     # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8
@@ -406,12 +406,13 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
   def test_self_attention_causal(self, return_attention_scores):
     # Query-value tensor of shape [1, 3, 1]
     q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
-    attention_layer = dense_attention.Attention(
-        causal=True, return_attention_scores=return_attention_scores)
+    attention_layer = dense_attention.Attention(causal=True)
     if return_attention_scores:
-      actual, actual_scores = attention_layer([q, q])
+      actual, actual_scores = attention_layer(
+          [q, q], return_attention_scores=return_attention_scores)
     else:
-      actual = attention_layer([q, q])
+      actual = attention_layer([q, q],
+                               return_attention_scores=return_attention_scores)
 
     # Expected scores of shape [1, 3, 3]
     # scores = [[0.25, 0.4, -0.15], [0.4, 0.64, -0.24], [-0.15, -0.24, 0.09]]
@@ -426,8 +427,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #    = [0.31395396638, 0.28693232061, 0.399113713]
     if return_attention_scores:
       expected_scores = np.array(
-          [[[1., 0., 0.],
-            [0.44028635073, 0.55971364926, 0.],
+          [[[1., 0., 0.], [0.44028635073, 0.55971364926, 0.],
             [0.31395396638, 0.28693232061, 0.399113713]]],
           dtype=np.float32)
       self.assertAllClose(expected_scores, actual_scores)
@@ -437,8 +437,8 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #               = 0.66791409477
     # expected020 = 0.31395396638 * 0.5 +0.28693232061 * 0.8 -0.399113713 * 0.3
     #               = 0.26678872577
-    expected = np.array(
-        [[[0.5], [0.66791409477], [0.26678872577]]], dtype=np.float32)
+    expected = np.array([[[0.5], [0.66791409477], [0.26678872577]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_inputs_not_list(self):
@@ -501,24 +501,20 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose([[[0], [1]]], actual)
 
   @parameterized.named_parameters(
-      ('', False, False),
-      ('use_scale', True, False),
-      ('return_attention_scores', False, True),
+      ('', False),
+      ('use_scale', True),
   )
-  def test_serialization(self, use_scale, return_attention_scores):
+  def test_serialization(self, use_scale):
     # Test serialization with use_scale
-    layer = dense_attention.Attention(
-        use_scale=use_scale, return_attention_scores=return_attention_scores)
+    layer = dense_attention.Attention(use_scale=use_scale)
 
     config = keras.layers.serialize(layer)
     new_layer = keras.layers.deserialize(config)
     self.assertEqual(new_layer.use_scale, use_scale)
-    self.assertEqual(new_layer.return_attention_scores, return_attention_scores)
 
     config = layer.get_config()
     new_layer = dense_attention.Attention.from_config(config)
     self.assertEqual(new_layer.use_scale, use_scale)
-    self.assertEqual(new_layer.return_attention_scores, return_attention_scores)
 
 
 @combinations.generate(combinations.combine(mode=['graph', 'eager']))
@@ -542,8 +538,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_calculate_scores_multi_dim(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Key tensor of shape [1, 3, 4]
     k = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -562,10 +557,9 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     # expected011 = 0.5*tanh(2.+2.5) + 0.6*tanh(2.1+2.6) + 0.7*tanh(2.2+2.7) + 0.8*tanh(2.3+2.8) = 2.59964024652
     # expected012 = 0.5*tanh(2.+3.5) + 0.6*tanh(2.1+3.6) + 0.7*tanh(2.2+3.7) + 0.8*tanh(2.3+3.8) = 2.59995130916
     # pylint:enable=line-too-long
-    expected = np.array(
-        [[[2.58044532581, 2.59734317449, 2.59964024652],
-          [2.59734317449, 2.59964024652, 2.59995130916]]],
-        dtype=np.float32)
+    expected = np.array([[[2.58044532581, 2.59734317449, 2.59964024652],
+                          [2.59734317449, 2.59964024652, 2.59995130916]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_calculate_scores_one_dim_batch_size_two(self):
@@ -582,14 +576,13 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     # Expected tensor of shape [2, 1, 1].
     # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683
     # expected100 = 0.5 * tanh(2.1 + 2.6) = 0.49991728277
-    expected = np.array(
-        [[[0.49550372683]], [[0.49991728277]]], dtype=np.float32)
+    expected = np.array([[[0.49550372683]], [[0.49991728277]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_shape(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -604,8 +597,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_no_scale(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -620,8 +612,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_with_key(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -779,8 +770,8 @@ class LowerTriangularMaskTest(test.TestCase, parameterized.TestCase):
 
   def test_orthogonal_shape(self):
     actual = dense_attention._lower_triangular_mask([3, 2])
-    expected = np.array(
-        [[True, False], [True, True], [True, True]], dtype=np.bool_)
+    expected = np.array([[True, False], [True, True], [True, True]],
+                        dtype=np.bool_)
     self.assertAllEqual(expected, actual)
 
   def test_three_dim(self):
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
index 96b809486a7..0c85d31934a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
index ae2cb7f7e20..b2c3156cf7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
index 96b809486a7..0c85d31934a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
  }
  member_method {
    name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
index ae2cb7f7e20..b2c3156cf7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
  }
  member_method {
    name: "compute_mask"
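
A minimal usage sketch of the call()-time flag this patch standardizes on, assuming TF 2.4 or any build containing this change; the tensor shapes and random inputs below are illustrative only and are not taken from the commit:

import numpy as np
import tensorflow as tf

# Query of shape [batch, Tq, dim] and value of shape [batch, Tv, dim].
query = np.random.rand(2, 4, 8).astype(np.float32)
value = np.random.rand(2, 6, 8).astype(np.float32)

# return_attention_scores is no longer a constructor argument.
layer = tf.keras.layers.Attention()

# Default call: a single tensor of shape [batch, Tq, dim].
result = layer([query, value])

# With the new call() argument: the attention scores (after masking and
# softmax), of shape [batch, Tq, Tv], come back as an additional output.
result, scores = layer([query, value], return_attention_scores=True)
print(result.shape)  # (2, 4, 8)
print(scores.shape)  # (2, 4, 6)

Taking the flag at call time mirrors MultiHeadAttention, which already accepts return_attention_scores in call(), and it keeps the flag out of the layer's get_config(), as the dense_attention.py and serialization-test changes above show.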