Rename LossScalingGradientTape to LossScaleGradientTape.
This makes it more consistent with LossScale and LossScaleOptimizer. Since LossScalingGradientTape is not yet in a stable release, there is no need to worry about breaking anyone.

PiperOrigin-RevId: 272067559
parent 0175008120
commit e323183bf6
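As a quick orientation for reviewers, the docstring example touched in this diff boils down to the usage pattern below under the new name. This is a minimal sketch, not part of the change itself: `model`, `dataset`, and the optimizer step are hypothetical placeholders, and it assumes the `tf.train.experimental.DynamicLossScale` and `tf.keras.optimizers.SGD` APIs available at this revision.

```python
# Sketch of the renamed API, adapted from the docstring example in this change.
# `model` and `dataset` are hypothetical placeholders.
import tensorflow as tf

loss_scale = tf.train.experimental.DynamicLossScale()  # adjusts itself on inf/NaN grads
opt = tf.keras.optimizers.SGD(1.0)

for features, labels in dataset:
  with tf.mixed_precision.experimental.LossScaleGradientTape(loss_scale) as tape:
    logits = model(features)  # run model and get logits
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
  # gradient() multiplies the loss by the current scale, differentiates,
  # then divides the resulting gradients by the same scale.
  grads = tape.gradient(loss, model.trainable_variables)
  opt.apply_gradients(zip(grads, model.trainable_variables))
```

Note that only the class and its exported symbol are renamed; the module path `tensorflow.python.training.experimental.loss_scaling_gradient_tape` is unchanged.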
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Contains Loss Scaling Gradient Tape."""
+"""Contains Loss Scale Gradient Tape."""

 from __future__ import absolute_import
 from __future__ import division
@@ -26,8 +26,8 @@ from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export


-@tf_export("mixed_precision.experimental.LossScalingGradientTape", v1=[])
-class LossScalingGradientTape(backprop.GradientTape):
+@tf_export("mixed_precision.experimental.LossScaleGradientTape", v1=[])
+class LossScaleGradientTape(backprop.GradientTape):
   """A gradient tape that scales losses and unscales resulting gradients.

   Operates as a normal gradient tape, but takes in a
@@ -51,7 +51,7 @@ class LossScalingGradientTape(backprop.GradientTape):
   model_loss_scale = tf.train.experimental.DynamicLossScale()

   for step in training_steps:
-    with LossScalingGradientTape(model_loss_scale) as tape:
+    with LossScaleGradientTape(model_loss_scale) as tape:
       logits = ...  # Run model and get logits
       loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                      labels=labels)
@@ -66,7 +66,7 @@ class LossScalingGradientTape(backprop.GradientTape):
                loss_scale,
                persistent=False,
                watch_accessed_variables=True):
-    """Creates a new LossScalingGradientTape.
+    """Creates a new LossScaleGradientTape.

     Args:
       loss_scale: `tf.train.experimental.LossScale` object that
@@ -89,8 +89,8 @@ class LossScalingGradientTape(backprop.GradientTape):
       raise ValueError("`loss_scale` must be an instance of LossScale.")

     # always make a persistent tape to loop over loss scaling
-    super(LossScalingGradientTape, self).__init__(True,
+    super(LossScaleGradientTape, self).__init__(True,
                                                 watch_accessed_variables)
     self._outer_persistent = persistent
     self._loss_scale = loss_scale

@@ -142,7 +142,7 @@ class LossScalingGradientTape(backprop.GradientTape):
       loss_scale = self._loss_scale()
       scaled_target = nest.map_structure(lambda t: t * loss_scale, target)

-      old_grads = super(LossScalingGradientTape, self).gradient(
+      old_grads = super(LossScaleGradientTape, self).gradient(
           scaled_target, sources, output_gradients, unconnected_gradients)
       inv_loss_scale = 1.0 / self._loss_scale()
       grads = nest.map_structure(lambda g: inv_loss_scale * g, old_grads)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for lsgt.LossScalingGradientTape."""
+"""Tests for lsgt.LossScaleGradientTape."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,13 +27,13 @@ from tensorflow.python.training.experimental import loss_scale as loss_scale_module
 from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt


-class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
+class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase):

   @parameterized.parameters(loss_scale_module.FixedLossScale,
                             loss_scale_module.DynamicLossScale)
   def test_basic_tapes_eager_mode(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * x
     dy_dx = g.gradient(y, x)
@@ -47,7 +47,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
     @def_function.function
     def _inner_test():
       x = constant_op.constant(3.0)
      with lsgt.LossScalingGradientTape(loss_scale) as g:
-      with lsgt.LossScalingGradientTape(loss_scale) as g:
+      with lsgt.LossScaleGradientTape(loss_scale) as g:
        g.watch(x)
        y = x * x
       return g.gradient(y, x)
@@ -57,9 +57,9 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_nested_tapes(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
-      with lsgt.LossScalingGradientTape(loss_scale(32)) as gg:
+      with lsgt.LossScaleGradientTape(loss_scale(32)) as gg:
         gg.watch(x)
         y = x * x
       dy_dx = gg.gradient(y, x)
@@ -71,7 +71,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_non_persistent_tapes_error(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32), persistent=False) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32), persistent=False) as g:
       g.watch(x)
       y = x * x
       z = y * y
@@ -83,7 +83,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_persistent_tapes(self, loss_scale):
     x = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32), persistent=True) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32), persistent=True) as g:
       g.watch(x)
       y = x * x
       z = y * y
@@ -97,7 +97,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_nested_sources(self, loss_scale):
     x = (constant_op.constant(19.0), (constant_op.constant(8.),
                                       constant_op.constant(9.)))
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * 13
     dy_dx = g.gradient(y, x)
@@ -107,7 +107,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_nested_targets(self, loss_scale):
     w = constant_op.constant(3.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(w)
       x = w * 5
       y = w * 7
@@ -119,7 +119,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_scaling_inf_gradient(self, loss_scale):
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * np.inf
     dy_dx = g.gradient(y, x)
@@ -129,7 +129,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                             loss_scale_module.DynamicLossScale)
   def test_scaling_nan_gradient(self, loss_scale):
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale(32)) as g:
+    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
       g.watch(x)
       y = x * np.nan
     dy_dx = g.gradient(y, x)
@@ -139,7 +139,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_dynamic_scale_to_one_on_non_finite_gradient(self, non_finite_term):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * non_finite_term
     g.gradient(y, x)
@@ -150,7 +150,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
                                                      is_non_finite):
     loss_scale = loss_scale_module.FixedLossScale(32)
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * non_finite_term
     dy_dx = g.gradient(y, x)
@@ -160,7 +160,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_dynamic_loss_scaling_down_loop(self):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
     x = constant_op.constant(1.0)
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * (3.0 * (10**37))  # grad will be inf after scaling
     dy_dx = g.gradient(y, x)
@@ -170,7 +170,7 @@ class LossScalingGradientTapeTest(test.TestCase, parameterized.TestCase):
   def test_dynamic_loss_scaling_inf_target_post_scale(self):
     loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0)
     x = constant_op.constant(3.0 * (10**37))
-    with lsgt.LossScalingGradientTape(loss_scale) as g:
+    with lsgt.LossScaleGradientTape(loss_scale) as g:
       g.watch(x)
       y = x * 3.0  # target will be inf after scaling
     dy_dx = g.gradient(y, x)

@@ -1,6 +1,6 @@
-path: "tensorflow.mixed_precision.experimental.LossScalingGradientTape"
+path: "tensorflow.mixed_precision.experimental.LossScaleGradientTape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.experimental.loss_scaling_gradient_tape.LossScalingGradientTape\'>"
+  is_instance: "<class \'tensorflow.python.training.experimental.loss_scaling_gradient_tape.LossScaleGradientTape\'>"
   is_instance: "<class \'tensorflow.python.eager.backprop.GradientTape\'>"
   is_instance: "<type \'object\'>"
   member_method {
@@ -1,7 +1,7 @@
 path: "tensorflow.mixed_precision.experimental"
 tf_module {
   member {
-    name: "LossScalingGradientTape"
+    name: "LossScaleGradientTape"
     mtype: "<type \'type\'>"
   }
 }