Keep return_attention_scores consistent with MultiHeadAttention, before TF 2.4 release.
PiperOrigin-RevId: 336016413
Change-Id: If9cbc68586fddf1da221cfda35dd4b72b3b68897
parent 37cf5b43c3
commit fca5923fab
@@ -209,6 +209,10 @@ # Release 2.4.0
 *   Improvements to Keras preprocessing layers:
     *   TextVectorization can now accept a vocabulary list or file as an
         init arg.
+*   In `Attention` and `AdditiveAttention` layers, the `call()` method now
+    accepts a `return_attention_scores` argument. When set to
+    True, the layer returns the attention scores as an additional output
+    argument.
 *   `tf.function` / AutoGraph:
 
     *   Added `experimental_follow_type_hints` argument for `tf.function`. When
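A minimal usage sketch of the call-time `return_attention_scores` argument described in the release note above (assumes TF 2.4 or later; the tensor shapes are illustrative, not taken from this diff):

import tensorflow as tf

# Toy inputs: [batch, Tq, dim] query and [batch, Tv, dim] value.
query = tf.random.normal([2, 4, 8])
value = tf.random.normal([2, 6, 8])

layer = tf.keras.layers.Attention()
# The flag is now passed per call instead of to the constructor.
result, scores = layer([query, value], return_attention_scores=True)
print(result.shape)  # (2, 4, 8)
print(scores.shape)  # (2, 4, 6) -- [batch_size, Tq, Tv]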
@@ -49,8 +49,6 @@ class BaseDenseAttention(Layer):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -69,6 +67,8 @@ class BaseDenseAttention(Layer):
       `mask==False` do not contribute to the result.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
 
   Output:
 
@@ -77,12 +77,11 @@ class BaseDenseAttention(Layer):
       `[batch_size, Tq, Tv]`.
   """
 
-  def __init__(self, causal=False, dropout=0.0, return_attention_scores=False,
+  def __init__(self, causal=False, dropout=0.0,
                **kwargs):
     super(BaseDenseAttention, self).__init__(**kwargs)
     self.causal = causal
     self.dropout = dropout
-    self.return_attention_scores = return_attention_scores
     self.supports_masking = True
 
   def _calculate_scores(self, query, key):
@@ -140,7 +139,11 @@ class BaseDenseAttention(Layer):
     return math_ops.matmul(weights, value), weights
 
   # TODO(b/125916026): Consider exposing a __call__ method with named args.
-  def call(self, inputs, mask=None, training=None):
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           return_attention_scores=False):
     self._validate_call_args(inputs=inputs, mask=mask)
     q = inputs[0]
     v = inputs[1]
@@ -170,7 +173,7 @@ class BaseDenseAttention(Layer):
       # Mask of shape [batch_size, Tq, 1].
       q_mask = array_ops.expand_dims(q_mask, axis=-1)
       result *= math_ops.cast(q_mask, dtype=result.dtype)
-    if self.return_attention_scores:
+    if return_attention_scores:
       return result, attention_scores
     return result
 
@@ -209,7 +212,6 @@ class BaseDenseAttention(Layer):
     config = {
         'causal': self.causal,
         'dropout': self.dropout,
-        'return_attention_scores': self.return_attention_scores,
     }
     base_config = super(BaseDenseAttention, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
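Because the flag is dropped from `get_config()` above, a serialized layer no longer records it; score output is requested per call instead. A small sketch of the resulting config round-trip (assumes TF 2.4 or later):

import tensorflow as tf

layer = tf.keras.layers.Attention(dropout=0.1)
config = layer.get_config()
# The base config now carries constructor state such as `causal` and
# `dropout`; `return_attention_scores` is no longer a key.
print('return_attention_scores' in config)  # False
restored = tf.keras.layers.Attention.from_config(config)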
@@ -239,8 +241,6 @@ class Attention(BaseDenseAttention):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -257,6 +257,8 @@ class Attention(BaseDenseAttention):
       * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
         If given, will apply the mask such that values at positions where
         `mask==False` do not contribute to the result.
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
 
@@ -378,8 +380,6 @@ class AdditiveAttention(BaseDenseAttention):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -398,6 +398,8 @@ class AdditiveAttention(BaseDenseAttention):
       `mask==False` do not contribute to the result.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
 
   Output:
 
@@ -82,8 +82,8 @@ class BaseDenseAttentionTest(test.TestCase, parameterized.TestCase):
     # => softmax_scores000 = exp(1)/(exp(1) + exp(0)) = 0.73105857863
     #    softmax_scores001 = exp(0)/(exp(1) + exp(0)) = 0.26894142137
     #    softmax_scores002 = 0
-    expected_scores = np.array(
-        [[[0.73105857863, 0.26894142137, 0.]]], dtype=np.float32)
+    expected_scores = np.array([[[0.73105857863, 0.26894142137, 0.]]],
+                               dtype=np.float32)
     self.assertAllClose(expected_scores, actual_scores)
     # Expected tensor of shape [1, 1, 1].
     # expected000 = 0.73105857863 * 1.6 + 0.26894142137 * 0.7 - 0 * 0.8
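The constants in the test comments above can be reproduced directly; a short check of the masked softmax (illustrative only):

import numpy as np

# Softmax over scores [1., 0.] with the third position masked out.
scores = np.array([1., 0.])
weights = np.exp(scores) / np.exp(scores).sum()
print(weights)  # ~[0.73105858, 0.26894142]; the masked position contributes 0.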
@@ -187,8 +187,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_calculate_scores_multi_dim(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Key tensor of shape [1, 3, 4]
     k = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -204,8 +203,8 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24
     # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84
     # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44
-    expected = np.array(
-        [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32)
+    expected = np.array([[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_calculate_scores_one_dim_batch_size_two(self):
@@ -241,8 +240,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -257,8 +255,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_with_key(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -342,12 +339,16 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     q_mask = np.array([[True, False]], dtype=np.bool_)
     # Value mask tensor of shape [1, 3]
     v_mask = np.array([[True, True, False]], dtype=np.bool_)
-    attention_layer = dense_attention.Attention(
-        return_attention_scores=return_attention_scores)
+    attention_layer = dense_attention.Attention()
     if return_attention_scores:
-      actual, actual_scores = attention_layer([q, v], mask=[q_mask, v_mask])
+      actual, actual_scores = attention_layer(
+          [q, v],
+          mask=[q_mask, v_mask],
+          return_attention_scores=return_attention_scores)
     else:
-      actual = attention_layer([q, v], mask=[q_mask, v_mask])
+      actual = attention_layer([q, v],
+                               mask=[q_mask, v_mask],
+                               return_attention_scores=return_attention_scores)
 
     # Expected scores of shape [1, 2, 3]
     # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8], [-0.5*1.6, -0.5*0.7, 0.5*0.8]]]
@@ -365,10 +366,9 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #   = 0.61063923394
     #   attention_distribution012 = 0
     if return_attention_scores:
-      expected_scores = np.array(
-          [[[0.72908792234, 0.27091207765, 0.],
-            [0.38936076605, 0.61063923394, 0.]]],
-          dtype=np.float32)
+      expected_scores = np.array([[[0.72908792234, 0.27091207765, 0.],
+                                   [0.38936076605, 0.61063923394, 0.]]],
+                                  dtype=np.float32)
       self.assertAllClose(expected_scores, actual_scores)
     # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False.
     # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8
@@ -406,12 +406,13 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
   def test_self_attention_causal(self, return_attention_scores):
     # Query-value tensor of shape [1, 3, 1]
     q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
-    attention_layer = dense_attention.Attention(
-        causal=True, return_attention_scores=return_attention_scores)
+    attention_layer = dense_attention.Attention(causal=True)
     if return_attention_scores:
-      actual, actual_scores = attention_layer([q, q])
+      actual, actual_scores = attention_layer(
+          [q, q], return_attention_scores=return_attention_scores)
     else:
-      actual = attention_layer([q, q])
+      actual = attention_layer([q, q],
+                               return_attention_scores=return_attention_scores)
 
     # Expected scores of shape [1, 3, 3]
     # scores = [[0.25, 0.4, -0.15], [0.4, 0.64, -0.24], [-0.15, -0.24, 0.09]]
@@ -426,8 +427,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #   = [0.31395396638, 0.28693232061, 0.399113713]
     if return_attention_scores:
       expected_scores = np.array(
-          [[[1., 0., 0.],
-            [0.44028635073, 0.55971364926, 0.],
+          [[[1., 0., 0.], [0.44028635073, 0.55971364926, 0.],
             [0.31395396638, 0.28693232061, 0.399113713]]],
           dtype=np.float32)
       self.assertAllClose(expected_scores, actual_scores)
@@ -437,8 +437,8 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #               = 0.66791409477
     # expected020 = 0.31395396638 * 0.5 +0.28693232061 * 0.8 -0.399113713 * 0.3
     #               = 0.26678872577
-    expected = np.array(
-        [[[0.5], [0.66791409477], [0.26678872577]]], dtype=np.float32)
+    expected = np.array([[[0.5], [0.66791409477], [0.26678872577]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_inputs_not_list(self):
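The causal expectations above follow from a lower-triangular mask: row i of the scores is softmaxed only over positions 0..i. A small check of row 1 (illustrative only):

import numpy as np

q = np.array([0.5, 0.8, -0.3])
# Row 1 attends to positions 0 and 1 only: scores = [0.8*0.5, 0.8*0.8].
row1 = q[1] * q[:2]
weights = np.exp(row1) / np.exp(row1).sum()
print(weights)  # ~[0.44028635, 0.55971365], matching expected_scores above.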
@@ -501,24 +501,20 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose([[[0], [1]]], actual)
 
   @parameterized.named_parameters(
-      ('', False, False),
-      ('use_scale', True, False),
-      ('return_attention_scores', False, True),
+      ('', False),
+      ('use_scale', True),
   )
-  def test_serialization(self, use_scale, return_attention_scores):
+  def test_serialization(self, use_scale):
     # Test serialization with use_scale
-    layer = dense_attention.Attention(
-        use_scale=use_scale, return_attention_scores=return_attention_scores)
+    layer = dense_attention.Attention(use_scale=use_scale)
 
     config = keras.layers.serialize(layer)
     new_layer = keras.layers.deserialize(config)
     self.assertEqual(new_layer.use_scale, use_scale)
-    self.assertEqual(new_layer.return_attention_scores, return_attention_scores)
 
     config = layer.get_config()
     new_layer = dense_attention.Attention.from_config(config)
     self.assertEqual(new_layer.use_scale, use_scale)
-    self.assertEqual(new_layer.return_attention_scores, return_attention_scores)
 
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
@@ -542,8 +538,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_calculate_scores_multi_dim(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Key tensor of shape [1, 3, 4]
     k = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -562,10 +557,9 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
    # expected011 = 0.5*tanh(2.+2.5) + 0.6*tanh(2.1+2.6) + 0.7*tanh(2.2+2.7) + 0.8*tanh(2.3+2.8) = 2.59964024652
    # expected012 = 0.5*tanh(2.+3.5) + 0.6*tanh(2.1+3.6) + 0.7*tanh(2.2+3.7) + 0.8*tanh(2.3+3.8) = 2.59995130916
    # pylint:enable=line-too-long
-    expected = np.array(
-        [[[2.58044532581, 2.59734317449, 2.59964024652],
-          [2.59734317449, 2.59964024652, 2.59995130916]]],
-        dtype=np.float32)
+    expected = np.array([[[2.58044532581, 2.59734317449, 2.59964024652],
+                          [2.59734317449, 2.59964024652, 2.59995130916]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_calculate_scores_one_dim_batch_size_two(self):
@@ -582,14 +576,13 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     # Expected tensor of shape [2, 1, 1].
     # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683
     # expected100 = 0.5 * tanh(2.1 + 2.6) = 0.49991728277
-    expected = np.array(
-        [[[0.49550372683]], [[0.49991728277]]], dtype=np.float32)
+    expected = np.array([[[0.49550372683]], [[0.49991728277]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_shape(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -604,8 +597,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_no_scale(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -620,8 +612,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_with_key(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -779,8 +770,8 @@ class LowerTriangularMaskTest(test.TestCase, parameterized.TestCase):
 
   def test_orthogonal_shape(self):
     actual = dense_attention._lower_triangular_mask([3, 2])
-    expected = np.array(
-        [[True, False], [True, True], [True, True]], dtype=np.bool_)
+    expected = np.array([[True, False], [True, True], [True, True]],
+                        dtype=np.bool_)
     self.assertAllEqual(expected, actual)
 
   def test_three_dim(self):
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
  }
  member_method {
    name: "compute_mask"
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
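The golden argspec updates above can be cross-checked against an installed build (assumes TF 2.4 or later):

import inspect
import tensorflow as tf

sig = inspect.signature(tf.keras.layers.Attention.call)
print(list(sig.parameters))
# ['self', 'inputs', 'mask', 'training', 'return_attention_scores']
print(sig.parameters['return_attention_scores'].default)  # False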