From fca5923fab6fddf3f73dda2a0bb2933213962cdf Mon Sep 17 00:00:00 2001
From: Hongkun Yu
Date: Wed, 7 Oct 2020 22:03:00 -0700
Subject: [PATCH] Keep return_attention_scores consistent with MultiHeadAttention, before TF 2.4 release.

PiperOrigin-RevId: 336016413
Change-Id: If9cbc68586fddf1da221cfda35dd4b72b3b68897
---
 RELEASE.md                                    |  4 +
 .../python/keras/layers/dense_attention.py    | 24 ++--
 .../keras/layers/dense_attention_test.py      | 91 +++++++++----------
 ...low.keras.layers.-additive-attention.pbtxt |  2 +-
 .../tensorflow.keras.layers.-attention.pbtxt  |  2 +-
 ...low.keras.layers.-additive-attention.pbtxt |  2 +-
 .../tensorflow.keras.layers.-attention.pbtxt  |  2 +-
 7 files changed, 62 insertions(+), 65 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 94ac98deb56..0fd29a0027a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -209,6 +209,10 @@ h# Release 2.4.0
 * Improvements to Keras preprocessing layers:
   * TextVectorization can now accept a vocabulary list or file as an init
     arg.
+  * In `Attention` and `AdditiveAttention` layers, the `call()` method now
+    accepts a `return_attention_scores` argument. When set to
+    True, the layer returns the attention scores as an additional output
+    argument.
 * `tf.function` / AutoGraph:
   * Added `experimental_follow_type_hints` argument for `tf.function`. When
diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index ab2912505ef..89750606c17 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -49,8 +49,6 @@ class BaseDenseAttention(Layer):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -69,6 +67,8 @@ class BaseDenseAttention(Layer):
         `mask==False` do not contribute to the result.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
 
   Output:
 
@@ -77,12 +77,11 @@ class BaseDenseAttention(Layer):
     `[batch_size, Tq, Tv]`.
   """
 
-  def __init__(self, causal=False, dropout=0.0, return_attention_scores=False,
+  def __init__(self, causal=False, dropout=0.0,
                **kwargs):
     super(BaseDenseAttention, self).__init__(**kwargs)
     self.causal = causal
     self.dropout = dropout
-    self.return_attention_scores = return_attention_scores
     self.supports_masking = True
 
   def _calculate_scores(self, query, key):
@@ -140,7 +139,11 @@ class BaseDenseAttention(Layer):
     return math_ops.matmul(weights, value), weights
 
   # TODO(b/125916026): Consider exposing a __call__ method with named args.
-  def call(self, inputs, mask=None, training=None):
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           return_attention_scores=False):
     self._validate_call_args(inputs=inputs, mask=mask)
     q = inputs[0]
     v = inputs[1]
@@ -170,7 +173,7 @@ class BaseDenseAttention(Layer):
       # Mask of shape [batch_size, Tq, 1].
       q_mask = array_ops.expand_dims(q_mask, axis=-1)
       result *= math_ops.cast(q_mask, dtype=result.dtype)
-    if self.return_attention_scores:
+    if return_attention_scores:
       return result, attention_scores
     return result
 
@@ -209,7 +212,6 @@ class BaseDenseAttention(Layer):
     config = {
         'causal': self.causal,
         'dropout': self.dropout,
-        'return_attention_scores': self.return_attention_scores,
     }
     base_config = super(BaseDenseAttention, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -239,8 +241,6 @@ class Attention(BaseDenseAttention):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
       attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -257,6 +257,8 @@ class Attention(BaseDenseAttention):
       * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
 
@@ -378,8 +380,6 @@ class AdditiveAttention(BaseDenseAttention):
       flow of information from the future towards the past.
     dropout: Float between 0 and 1. Fraction of the units to drop for the
      attention scores.
-    return_attention_scores: bool, it `True`, returns the attention scores
-      (after masking and softmax) as an additional output argument.
 
   Call Arguments:
 
@@ -398,6 +398,8 @@ class AdditiveAttention(BaseDenseAttention):
        `mask==False` do not contribute to the result.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
+    return_attention_scores: bool, it `True`, returns the attention scores
+      (after masking and softmax) as an additional output argument.
 
   Output:
 
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index 5df53a8d1fb..8570f41b34f 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -82,8 +82,8 @@ class BaseDenseAttentionTest(test.TestCase, parameterized.TestCase):
     # => softmax_scores000 = exp(1)/(exp(1) + exp(0)) = 0.73105857863
     #    softmax_scores001 = exp(0)/(exp(1) + exp(0)) = 0.26894142137
     #    softmax_scores002 = 0
-    expected_scores = np.array(
-        [[[0.73105857863, 0.26894142137, 0.]]], dtype=np.float32)
+    expected_scores = np.array([[[0.73105857863, 0.26894142137, 0.]]],
+                               dtype=np.float32)
     self.assertAllClose(expected_scores, actual_scores)
     # Expected tensor of shape [1, 1, 1].
     # expected000 = 0.73105857863 * 1.6 + 0.26894142137 * 0.7 - 0 * 0.8
@@ -187,8 +187,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_calculate_scores_multi_dim(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Key tensor of shape [1, 3, 4]
     k = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -204,8 +203,8 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24
     # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84
     # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44
-    expected = np.array(
-        [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32)
+    expected = np.array([[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_calculate_scores_one_dim_batch_size_two(self):
@@ -241,8 +240,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -257,8 +255,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_with_key(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -342,12 +339,16 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     q_mask = np.array([[True, False]], dtype=np.bool_)
     # Value mask tensor of shape [1, 3]
     v_mask = np.array([[True, True, False]], dtype=np.bool_)
-    attention_layer = dense_attention.Attention(
-        return_attention_scores=return_attention_scores)
+    attention_layer = dense_attention.Attention()
     if return_attention_scores:
-      actual, actual_scores = attention_layer([q, v], mask=[q_mask, v_mask])
+      actual, actual_scores = attention_layer(
+          [q, v],
+          mask=[q_mask, v_mask],
+          return_attention_scores=return_attention_scores)
     else:
-      actual = attention_layer([q, v], mask=[q_mask, v_mask])
+      actual = attention_layer([q, v],
+                               mask=[q_mask, v_mask],
+                               return_attention_scores=return_attention_scores)
 
     # Expected scores of shape [1, 2, 3]
     # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8], [-0.5*1.6, -0.5*0.7, 0.5*0.8]]]
@@ -365,10 +366,9 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #                              = 0.61063923394
     # attention_distribution012 = 0
     if return_attention_scores:
-      expected_scores = np.array(
-          [[[0.72908792234, 0.27091207765, 0.],
-            [0.38936076605, 0.61063923394, 0.]]],
-          dtype=np.float32)
+      expected_scores = np.array([[[0.72908792234, 0.27091207765, 0.],
+                                   [0.38936076605, 0.61063923394, 0.]]],
+                                 dtype=np.float32)
       self.assertAllClose(expected_scores, actual_scores)
     # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False.
     # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8
@@ -406,12 +406,13 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
   def test_self_attention_causal(self, return_attention_scores):
     # Query-value tensor of shape [1, 3, 1]
     q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
-    attention_layer = dense_attention.Attention(
-        causal=True, return_attention_scores=return_attention_scores)
+    attention_layer = dense_attention.Attention(causal=True)
     if return_attention_scores:
-      actual, actual_scores = attention_layer([q, q])
+      actual, actual_scores = attention_layer(
+          [q, q], return_attention_scores=return_attention_scores)
     else:
-      actual = attention_layer([q, q])
+      actual = attention_layer([q, q],
+                               return_attention_scores=return_attention_scores)
 
     # Expected scores of shape [1, 3, 3]
     # scores = [[0.25, 0.4, -0.15], [0.4, 0.64, -0.24], [-0.15, -0.24, 0.09]]
@@ -426,8 +427,7 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #    = [0.31395396638, 0.28693232061, 0.399113713]
     if return_attention_scores:
       expected_scores = np.array(
-          [[[1., 0., 0.],
-            [0.44028635073, 0.55971364926, 0.],
+          [[[1., 0., 0.], [0.44028635073, 0.55971364926, 0.],
             [0.31395396638, 0.28693232061, 0.399113713]]],
           dtype=np.float32)
       self.assertAllClose(expected_scores, actual_scores)
@@ -437,8 +437,8 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     #               = 0.66791409477
     # expected020 = 0.31395396638 * 0.5 +0.28693232061 * 0.8 -0.399113713 * 0.3
     #               = 0.26678872577
-    expected = np.array(
-        [[[0.5], [0.66791409477], [0.26678872577]]], dtype=np.float32)
+    expected = np.array([[[0.5], [0.66791409477], [0.26678872577]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_inputs_not_list(self):
@@ -501,24 +501,20 @@ class AttentionTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose([[[0], [1]]], actual)
 
   @parameterized.named_parameters(
-      ('', False, False),
-      ('use_scale', True, False),
-      ('return_attention_scores', False, True),
+      ('', False),
+      ('use_scale', True),
   )
-  def test_serialization(self, use_scale, return_attention_scores):
+  def test_serialization(self, use_scale):
     # Test serialization with use_scale
-    layer = dense_attention.Attention(
-        use_scale=use_scale, return_attention_scores=return_attention_scores)
+    layer = dense_attention.Attention(use_scale=use_scale)
 
     config = keras.layers.serialize(layer)
     new_layer = keras.layers.deserialize(config)
     self.assertEqual(new_layer.use_scale, use_scale)
-    self.assertEqual(new_layer.return_attention_scores, return_attention_scores)
 
     config = layer.get_config()
     new_layer = dense_attention.Attention.from_config(config)
     self.assertEqual(new_layer.use_scale, use_scale)
-    self.assertEqual(new_layer.return_attention_scores, return_attention_scores)
 
 
 @combinations.generate(combinations.combine(mode=['graph', 'eager']))
@@ -542,8 +538,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_calculate_scores_multi_dim(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Key tensor of shape [1, 3, 4]
     k = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -562,10 +557,9 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     # expected011 = 0.5*tanh(2.+2.5) + 0.6*tanh(2.1+2.6) + 0.7*tanh(2.2+2.7) + 0.8*tanh(2.3+2.8) = 2.59964024652
     # expected012 = 0.5*tanh(2.+3.5) + 0.6*tanh(2.1+3.6) + 0.7*tanh(2.2+3.7) + 0.8*tanh(2.3+3.8) = 2.59995130916
     # pylint:enable=line-too-long
-    expected = np.array(
-        [[[2.58044532581, 2.59734317449, 2.59964024652],
-          [2.59734317449, 2.59964024652, 2.59995130916]]],
-        dtype=np.float32)
+    expected = np.array([[[2.58044532581, 2.59734317449, 2.59964024652],
+                          [2.59734317449, 2.59964024652, 2.59995130916]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_calculate_scores_one_dim_batch_size_two(self):
@@ -582,14 +576,13 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     # Expected tensor of shape [2, 1, 1].
     # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683
     # expected100 = 0.5 * tanh(2.1 + 2.6) = 0.49991728277
-    expected = np.array(
-        [[[0.49550372683]], [[0.49991728277]]], dtype=np.float32)
+    expected = np.array([[[0.49550372683]], [[0.49991728277]]],
+                        dtype=np.float32)
     self.assertAllClose(expected, actual)
 
   def test_shape(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -604,8 +597,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_no_scale(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -620,8 +612,7 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
 
   def test_shape_with_key(self):
     # Query tensor of shape [1, 2, 4]
-    q = np.array(
-        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 4]
     v = np.array(
         [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
@@ -779,8 +770,8 @@ class LowerTriangularMaskTest(test.TestCase, parameterized.TestCase):
 
   def test_orthogonal_shape(self):
     actual = dense_attention._lower_triangular_mask([3, 2])
-    expected = np.array(
-        [[True, False], [True, True], [True, True]], dtype=np.bool_)
+    expected = np.array([[True, False], [True, True], [True, True]],
+                        dtype=np.bool_)
     self.assertAllEqual(expected, actual)
 
   def test_three_dim(self):
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
index 96b809486a7..0c85d31934a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
index ae2cb7f7e20..b2c3156cf7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
index 96b809486a7..0c85d31934a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
  }
  member_method {
    name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
index ae2cb7f7e20..b2c3156cf7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt
@@ -150,7 +150,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
  }
  member_method {
    name: "compute_mask"
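
A minimal usage sketch of the call()-time flag this patch standardizes on, assuming TF 2.4 or any build containing this change; the tensor shapes and random inputs below are illustrative only and are not taken from the commit:

import numpy as np
import tensorflow as tf

# Query of shape [batch, Tq, dim] and value of shape [batch, Tv, dim].
query = np.random.rand(2, 4, 8).astype(np.float32)
value = np.random.rand(2, 6, 8).astype(np.float32)

# return_attention_scores is no longer a constructor argument.
layer = tf.keras.layers.Attention()

# Default call: a single tensor of shape [batch, Tq, dim].
result = layer([query, value])

# With the new call() argument: the attention scores (after masking and
# softmax), of shape [batch, Tq, Tv], come back as an additional output.
result, scores = layer([query, value], return_attention_scores=True)
print(result.shape)  # (2, 4, 8)
print(scores.shape)  # (2, 4, 6)

Taking the flag at call time mirrors MultiHeadAttention, which already accepts return_attention_scores in call(), and it keeps the flag out of the layer's get_config(), as the dense_attention.py and serialization-test changes above show.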