From 1a99f30ec75c9a851af3ddde8eb954d9c6920af3 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Sun, 10 Jan 2021 19:07:11 +0000
Subject: [PATCH 1/3] Fix InvalidArgumentError when mixed precision policy is used in Attention/AdditiveAttention layer

This PR addresses the issue raised in #46064, where an InvalidArgumentError
is thrown when a mixed precision policy is used with the Keras
Attention/AdditiveAttention layers.

This PR fixes #46064.

Signed-off-by: Yong Tang
---
 .../python/keras/layers/dense_attention.py      |  2 +-
 .../python/keras/layers/dense_attention_test.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index 34879524b64..6cebcb2f892 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -126,7 +126,7 @@ class BaseDenseAttention(Layer):
     if scores_mask is not None:
       padding_mask = math_ops.logical_not(scores_mask)
       # Bias so padding positions do not contribute to attention distribution.
-      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=K.floatx())
+      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
     if training is None:
       training = K.learning_phase()
     weights = nn.softmax(scores)
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index 8570f41b34f..b0d146c5bc6 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -24,9 +24,12 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.keras import combinations
+from tensorflow.python.keras.mixed_precision import policy
 from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.layers import dense_attention
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test


@@ -757,6 +760,19 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     new_layer = dense_attention.AdditiveAttention.from_config(config)
     self.assertEqual(new_layer.use_scale, True)

+  def test_mixed_float16_policy(self):
+    # Test case for GitHub issue:
+    # https://github.com/tensorflow/tensorflow/issues/46064
+    try:
+      policy.set_policy('mixed_float16')
+      q = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=1), 'float16')
+      v = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=2), 'float16')
+      k = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=3), 'float16')
+      layer = dense_attention.AdditiveAttention(causal=True)
+      _ = layer([q, v, k])
+    finally:
+      policy.set_policy('float32')
+

 @combinations.generate(combinations.combine(mode=['graph', 'eager']))
 class LowerTriangularMaskTest(test.TestCase, parameterized.TestCase):

From 739896bfdd96c81ca8cd28d159619b97cd6e7b71 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Fri, 15 Jan 2021 18:09:29 +0000
Subject: [PATCH 2/3] Update to use policy.policy_scope to address review comment.

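policy_scope sets the global mixed precision policy for the duration of a
with block and restores the previous policy on exit, even if the body raises,
so the test no longer needs a manual try/finally around set_policy. A minimal
standalone sketch of the pattern (illustration only, not part of the diff
below):

    from tensorflow.python.keras.layers import dense_attention
    from tensorflow.python.keras.mixed_precision import policy

    with policy.policy_scope('mixed_float16'):
      # Layers built here compute in float16 and keep their variables
      # in float32.
      layer = dense_attention.AdditiveAttention(causal=True)
    # On exit the previous global policy is restored automatically,
    # even if the body raised an exception.
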
Signed-off-by: Yong Tang
---
 tensorflow/python/keras/layers/dense_attention_test.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index b0d146c5bc6..e5717735a8a 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -763,15 +763,12 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
   def test_mixed_float16_policy(self):
     # Test case for GitHub issue:
     # https://github.com/tensorflow/tensorflow/issues/46064
-    try:
-      policy.set_policy('mixed_float16')
+    with policy.policy_scope('mixed_float16'):
       q = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=1), 'float16')
       v = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=2), 'float16')
       k = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=3), 'float16')
       layer = dense_attention.AdditiveAttention(causal=True)
       _ = layer([q, v, k])
-    finally:
-      policy.set_policy('float32')


 @combinations.generate(combinations.combine(mode=['graph', 'eager']))

From a265344a534ffdbff4f6ab64289cd5abe4b65eea Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Fri, 15 Jan 2021 22:33:12 +0000
Subject: [PATCH 3/3] Add test in layer_correctness_test.py, and use 65504 for float16 padding_mask in Attention.

Update: additionally pass causal=True to Attention as well (per review
feedback).

Signed-off-by: Yong Tang
---
 tensorflow/python/keras/layers/dense_attention.py          | 6 +++++-
 .../python/keras/mixed_precision/layer_correctness_test.py | 6 ++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index 6cebcb2f892..54f657b113d 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -126,7 +126,11 @@ class BaseDenseAttention(Layer):
     if scores_mask is not None:
       padding_mask = math_ops.logical_not(scores_mask)
       # Bias so padding positions do not contribute to attention distribution.
-      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
+      # Note 65504. is the max float16 value.
+      if scores.dtype is dtypes.float16:
+        scores -= 65504. * math_ops.cast(padding_mask, dtype=scores.dtype)
+      else:
+        scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
     if training is None:
       training = K.learning_phase()
     weights = nn.softmax(scores)
diff --git a/tensorflow/python/keras/mixed_precision/layer_correctness_test.py b/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
index bbccc8721cd..a583bc0bd9e 100644
--- a/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
+++ b/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
@@ -139,6 +139,12 @@ class LayerCorrectnessTest(keras_parameterized.TestCase):
        (2, 2, 2)),
       ('Bidirectional', lambda: wrappers.Bidirectional(recurrent.SimpleRNN(units=4)),
        (2, 2, 2)),
+      ('AttentionLayer',
+       lambda: dense_attention.Attention(causal=True),
+       [(2, 2, 3), (2, 3, 3), (2, 3, 3)]),
+      ('AdditiveAttentionLayerCausal',
+       lambda: dense_attention.AdditiveAttention(causal=True),
+       [(2, 3, 4), (2, 3, 4), (2, 3, 4)]),
   )
   def test_layer(self, f32_layer_fn, input_shape, rtol=2e-3, atol=2e-3,
                  input_data=None):
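
Background on the 65504. constant in PATCH 3/3: 1.e9 is not representable in
float16 (the largest finite float16 value is 65504.), so a float16 bias of
that size becomes inf, and inf times the zero entries of the padding mask is
NaN, which would poison even the unmasked attention scores. A standalone
NumPy illustration of the arithmetic (not the layer code itself):

    import numpy as np

    bias = np.float16(1e9)                   # overflows to inf in float16
    print(bias, np.finfo(np.float16).max)    # inf 65504.0

    # 0/1 padding mask: 1 marks padded positions, 0 marks valid ones.
    padding_mask = np.array([0., 1.], dtype=np.float16)
    scores = np.zeros(2, dtype=np.float16)

    print(scores - bias * padding_mask)                 # [nan -inf]: inf * 0 is nan
    print(scores - np.float16(65504.) * padding_mask)   # [0. -65504.]: finite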