Rename LossScalingGradientTape to LossScaleGradientTape.

This makes it more consistent with LossScale and LossScaleOptimizer.

Since LossScalingGradientTape has not yet shipped in a stable release, the rename does not break any existing users.
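For reference, a minimal usage sketch of the renamed class, adapted from the docstring example updated in this change. It assumes the experimental endpoints as exported at this commit (tf.mixed_precision.experimental.LossScaleGradientTape, tf.train.experimental.DynamicLossScale); the small Dense layer, SGD optimizer, and random batch are illustrative placeholders, not part of this change.

import tensorflow as tf

# A dynamic loss scale starts high and adjusts itself if non-finite gradients appear.
loss_scale = tf.train.experimental.DynamicLossScale()
dense = tf.keras.layers.Dense(3)
opt = tf.keras.optimizers.SGD(0.1)

features = tf.random.normal([4, 5])
labels = tf.one_hot([0, 1, 2, 0], depth=3)

with tf.mixed_precision.experimental.LossScaleGradientTape(loss_scale) as tape:
  logits = dense(features)  # run the model and get logits
  loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)

# gradient() scales the loss, differentiates, and returns already-unscaled gradients.
grads = tape.gradient(loss, dense.trainable_variables)
opt.apply_gradients(zip(grads, dense.trainable_variables))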

PiperOrigin-RevId: 272067559
Author: Reed Wanderman-Milne (2019-09-30 14:35:03 -07:00), committed by TensorFlower Gardener
parent 0175008120
commit e323183bf6
4 changed files with 27 additions and 27 deletions

[File 1 of 4: the loss_scaling_gradient_tape module]

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Contains Loss Scaling Gradient Tape."""
+"""Contains Loss Scale Gradient Tape."""
 from __future__ import absolute_import
 from __future__ import division
@@ -26,8 +26,8 @@ from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
-@tf_export("mixed_precision.experimental.LossScalingGradientTape", v1=[])
-class LossScalingGradientTape(backprop.GradientTape):
+@tf_export("mixed_precision.experimental.LossScaleGradientTape", v1=[])
+class LossScaleGradientTape(backprop.GradientTape):
   """A gradient tape that scales losses and unscales resulting gradients.
   Operates as a normal gradient tape, but takes in a
@@ -51,7 +51,7 @@ class LossScalingGradientTape(backprop.GradientTape):
   model_loss_scale = tf.train.experimental.DynamicLossScale()
   for step in training_steps:
-    with LossScalingGradientTape(model_loss_scale) as tape:
+    with LossScaleGradientTape(model_loss_scale) as tape:
       logits = ...  # Run model and get logits
       loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                      labels=labels)
@@ -66,7 +66,7 @@ class LossScalingGradientTape(backprop.GradientTape):
                loss_scale,
                persistent=False,
                watch_accessed_variables=True):
-    """Creates a new LossScalingGradientTape.
+    """Creates a new LossScaleGradientTape.
     Args:
       loss_scale: `tf.train.experimental.LossScale` object that
@@ -89,8 +89,8 @@ class LossScalingGradientTape(backprop.GradientTape):
       raise ValueError("`loss_scale` must be an instance of LossScale.")
     # always make a persistent tape to loop over loss scaling
-    super(LossScalingGradientTape, self).__init__(True,
+    super(LossScaleGradientTape, self).__init__(True,
                                                   watch_accessed_variables)
     self._outer_persistent = persistent
     self._loss_scale = loss_scale
@@ -142,7 +142,7 @@ class LossScalingGradientTape(backprop.GradientTape):
     loss_scale = self._loss_scale()
     scaled_target = nest.map_structure(lambda t: t * loss_scale, target)
-    old_grads = super(LossScalingGradientTape, self).gradient(
+    old_grads = super(LossScaleGradientTape, self).gradient(
        scaled_target, sources, output_gradients, unconnected_gradients)
     inv_loss_scale = 1.0 / self._loss_scale()
     grads = nest.map_structure(lambda g: inv_loss_scale * g, old_grads)

[File 2 of 4: the loss_scaling_gradient_tape tests]

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for lsgt.LossScalingGradientTape."""
+"""Tests for lsgt.LossScaleGradientTape."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,13 +27,13 @@ from tensorflow.python.training.experimental import loss_scale as loss_scale_mod
 from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt
-class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
+class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase):
   @parameterized.parameters(loss_scale_module.FixedLossScale,
                             loss_scale_module.DynamicLossScale)
   def test_basic_tapes_eager_mode(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * x
     dy_dx = g.gradient(y, x)
@@ -47,7 +47,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
     @def_function.function
     def _inner_test():
       x = constant_op.constant(3.0)
-      with lsgt.LossScalingGradientTape(loss_scale) as g:
+      with lsgt.LossScaleGradientTape(loss_scale) as g:
         g.watch(x)
         y = x * x
       return g.gradient(y, x)
@@ -57,9 +57,9 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_nested_tapes(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
-      with lsgt.LossScalingGradientTape(loss_scale(32)) as gg:
+      with lsgt.LossScaleGradientTape(loss_scale(32)) as gg:
        gg.watch(x)
        y = x * x
      dy_dx = gg.gradient(y, x)
@@ -71,7 +71,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_non_persistent_tapes_error(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32), persistent=False) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32), persistent=False) as g:
       g.watch(x)
       y = x * x
       z = y * y
@@ -83,7 +83,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_persistent_tapes(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32), persistent=True) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32), persistent=True) as g:
       g.watch(x)
       y = x * x
       z = y * y
@@ -97,7 +97,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_nested_sources(self, loss_scale):
     x = (constant_op.constant(19.0), (constant_op.constant(8.),
                                       constant_op.constant(9.)))
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * 13
     dy_dx = g.gradient(y, x)
@@ -107,7 +107,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_nested_targets(self, loss_scale):
     w = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(w)
       x = w * 5
       y = w * 7
@@ -119,7 +119,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_scaling_inf_gradient(self, loss_scale):
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * np.inf
     dy_dx = g.gradient(y, x)
@@ -129,7 +129,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_scaling_nan_gradient(self, loss_scale):
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * np.nan
     dy_dx = g.gradient(y, x)
@@ -139,7 +139,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_dynamic_scale_to_one_on_non_finite_gradient(self, non_finite_term):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * non_finite_term
     g.gradient(y, x)
@@ -150,7 +150,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                                                   is_non_finite):
     loss_scale = loss_scale_module.FixedLossScale(32)
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * non_finite_term
     dy_dx = g.gradient(y, x)
@@ -160,7 +160,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_dynamic_loss_scaling_down_loop(self):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * (3.0 * (10**37))  # grad will be inf after scaling
     dy_dx = g.gradient(y, x)
@@ -170,7 +170,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_dynamic_loss_scaling_inf_target_post_scale(self):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0)
     x = constant_op.constant(3.0 * (10**37))
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * 3.0  # target will be inf after scaling
     dy_dx = g.gradient(y, x)

[File 3 of 4: API golden .pbtxt for tensorflow.mixed_precision.experimental.LossScalingGradientTape]

@@ -1,6 +1,6 @@
-path: "tensorflow.mixed_precision.experimental.LossScalingGradientTape"
+path: "tensorflow.mixed_precision.experimental.LossScaleGradientTape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.experimental.loss_scaling_gradient_tape.LossScalingGradientTape\'>"
+  is_instance: "<class \'tensorflow.python.training.experimental.loss_scaling_gradient_tape.LossScaleGradientTape\'>"
   is_instance: "<class \'tensorflow.python.eager.backprop.GradientTape\'>"
   is_instance: "<type \'object\'>"
   member_method {

[File 4 of 4: API golden .pbtxt for tensorflow.mixed_precision.experimental]

@@ -1,7 +1,7 @@
 path: "tensorflow.mixed_precision.experimental"
 tf_module {
   member {
-    name: "LossScalingGradientTape"
+    name: "LossScaleGradientTape"
     mtype: "<type \'type\'>"
   }
 }
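For intuition, here is a minimal sketch of the scale-then-unscale mechanics that the gradient() hunk in the first file performs, written with a plain tf.GradientTape so it does not depend on the experimental class; the 32.0 loss scale is an arbitrary illustrative value.

import tensorflow as tf

loss_scale = 32.0
x = tf.constant(3.0)
with tf.GradientTape() as tape:
  tape.watch(x)                          # constants are not watched automatically
  y = x * x
  scaled_y = y * loss_scale              # scale the target before differentiating
scaled_grad = tape.gradient(scaled_y, x)
grad = scaled_grad * (1.0 / loss_scale)  # unscale to recover the true gradient
print(grad.numpy())                      # 6.0, i.e. d(x*x)/dx at x = 3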