From 1a99f30ec75c9a851af3ddde8eb954d9c6920af3 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Sun, 10 Jan 2021 19:07:11 +0000
Subject: [PATCH 1/3] Fix InvalidArgumentError when mixed precision policy is used in Attention/AdditiveAttention layer

This PR addresses the issue raised in #46064, where an InvalidArgumentError
is thrown when a mixed precision policy is used with the Keras
Attention/AdditiveAttention layers.

This PR fixes #46064.

Signed-off-by: Yong Tang
---
 .../python/keras/layers/dense_attention.py      |  2 +-
 .../python/keras/layers/dense_attention_test.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index 34879524b64..6cebcb2f892 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -126,7 +126,7 @@ class BaseDenseAttention(Layer):
     if scores_mask is not None:
       padding_mask = math_ops.logical_not(scores_mask)
       # Bias so padding positions do not contribute to attention distribution.
-      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=K.floatx())
+      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
     if training is None:
       training = K.learning_phase()
     weights = nn.softmax(scores)
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index 8570f41b34f..b0d146c5bc6 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -24,9 +24,12 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.keras import combinations
+from tensorflow.python.keras.mixed_precision import policy
 from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.layers import dense_attention
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test


@@ -757,6 +760,19 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
     new_layer = dense_attention.AdditiveAttention.from_config(config)
     self.assertEqual(new_layer.use_scale, True)

+  def test_mixed_float16_policy(self):
+    # Test case for GitHub issue:
+    # https://github.com/tensorflow/tensorflow/issues/46064
+    try:
+      policy.set_policy('mixed_float16')
+      q = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=1), 'float16')
+      v = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=2), 'float16')
+      k = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=3), 'float16')
+      layer = dense_attention.AdditiveAttention(causal=True)
+      _ = layer([q, v, k])
+    finally:
+      policy.set_policy('float32')
+

 @combinations.generate(combinations.combine(mode=['graph', 'eager']))
 class LowerTriangularMaskTest(test.TestCase, parameterized.TestCase):

From 739896bfdd96c81ca8cd28d159619b97cd6e7b71 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Fri, 15 Jan 2021 18:09:29 +0000
Subject: [PATCH 2/3] Update to use policy.policy_scope to address review comment.

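policy_scope sets the global mixed precision policy for the duration of a
with block and restores the previous policy on exit, even if the body raises,
so the test no longer needs a manual try/finally around set_policy. A minimal
standalone sketch of the pattern (illustration only, not part of the diff
below):

    from tensorflow.python.keras.layers import dense_attention
    from tensorflow.python.keras.mixed_precision import policy

    with policy.policy_scope('mixed_float16'):
      # Layers built here compute in float16 and keep their variables
      # in float32.
      layer = dense_attention.AdditiveAttention(causal=True)
    # On exit the previous global policy is restored automatically,
    # even if the body raised an exception.
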
Signed-off-by: Yong Tang
---
 tensorflow/python/keras/layers/dense_attention_test.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index b0d146c5bc6..e5717735a8a 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -763,15 +763,12 @@ class AdditiveAttentionTest(test.TestCase, parameterized.TestCase):
   def test_mixed_float16_policy(self):
     # Test case for GitHub issue:
     # https://github.com/tensorflow/tensorflow/issues/46064
-    try:
-      policy.set_policy('mixed_float16')
+    with policy.policy_scope('mixed_float16'):
       q = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=1), 'float16')
       v = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=2), 'float16')
       k = math_ops.cast(random_ops.random_uniform((2, 3, 4), seed=3), 'float16')
       layer = dense_attention.AdditiveAttention(causal=True)
       _ = layer([q, v, k])
-    finally:
-      policy.set_policy('float32')


 @combinations.generate(combinations.combine(mode=['graph', 'eager']))

From a265344a534ffdbff4f6ab64289cd5abe4b65eea Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Fri, 15 Jan 2021 22:33:12 +0000
Subject: [PATCH 3/3] Add test in layer_correctness_test.py, and use 65504 for float16 padding_mask in Attention.

Update: additionally pass causal=True to Attention as well (per review
feedback).

Signed-off-by: Yong Tang
---
 tensorflow/python/keras/layers/dense_attention.py          | 6 +++++-
 .../python/keras/mixed_precision/layer_correctness_test.py | 6 ++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index 6cebcb2f892..54f657b113d 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -126,7 +126,11 @@ class BaseDenseAttention(Layer):
     if scores_mask is not None:
       padding_mask = math_ops.logical_not(scores_mask)
       # Bias so padding positions do not contribute to attention distribution.
-      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
+      # Note 65504. is the max float16 value.
+      if scores.dtype is dtypes.float16:
+        scores -= 65504. * math_ops.cast(padding_mask, dtype=scores.dtype)
+      else:
+        scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
     if training is None:
       training = K.learning_phase()
     weights = nn.softmax(scores)
diff --git a/tensorflow/python/keras/mixed_precision/layer_correctness_test.py b/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
index bbccc8721cd..a583bc0bd9e 100644
--- a/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
+++ b/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
@@ -139,6 +139,12 @@ class LayerCorrectnessTest(keras_parameterized.TestCase):
        (2, 2, 2)),
       ('Bidirectional', lambda: wrappers.Bidirectional(recurrent.SimpleRNN(units=4)),
        (2, 2, 2)),
+      ('AttentionLayer',
+       lambda: dense_attention.Attention(causal=True),
+       [(2, 2, 3), (2, 3, 3), (2, 3, 3)]),
+      ('AdditiveAttentionLayerCausal',
+       lambda: dense_attention.AdditiveAttention(causal=True),
+       [(2, 3, 4), (2, 3, 4), (2, 3, 4)]),
   )
   def test_layer(self, f32_layer_fn, input_shape, rtol=2e-3, atol=2e-3,
                  input_data=None):
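
Background on the 65504. constant in PATCH 3/3: 1.e9 is not representable in
float16 (the largest finite float16 value is 65504.), so a float16 bias of
that size becomes inf, and inf times the zero entries of the padding mask is
NaN, which would poison even the unmasked attention scores. A standalone
NumPy illustration of the arithmetic (not the layer code itself):

    import numpy as np

    bias = np.float16(1e9)                   # overflows to inf in float16
    print(bias, np.finfo(np.float16).max)    # inf 65504.0

    # 0/1 padding mask: 1 marks padded positions, 0 marks valid ones.
    padding_mask = np.array([0., 1.], dtype=np.float16)
    scores = np.zeros(2, dtype=np.float16)

    print(scores - bias * padding_mask)                 # [nan -inf]: inf * 0 is nan
    print(scores - np.float16(65504.) * padding_mask)   # [0. -65504.]: finite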