Support clipping limits for TPU embedding in tpu_estimator.py.
PiperOrigin-RevId: 243928920
commit 7d2fd55ef6
parent 18daf67542
@@ -216,16 +216,23 @@ VariablesAndOps = collections.namedtuple(
 class _OptimizationParameters(object):
   """Parameters common to all optimizations."""
 
-  def __init__(self, learning_rate, use_gradient_accumulation):
+  def __init__(self, learning_rate, use_gradient_accumulation,
+               clip_weight_min, clip_weight_max):
     self.learning_rate = learning_rate
     self.use_gradient_accumulation = use_gradient_accumulation
+    self.clip_weight_min = clip_weight_min
+    self.clip_weight_max = clip_weight_max
 
 
 class AdagradParameters(_OptimizationParameters):
   """Optimization parameters for Adagrad."""
 
-  def __init__(self, learning_rate, initial_accumulator=0.1,
-               use_gradient_accumulation=True):
+  def __init__(self,
+               learning_rate,
+               initial_accumulator=0.1,
+               use_gradient_accumulation=True,
+               clip_weight_min=None,
+               clip_weight_max=None):
     """Optimization parameters for Adagrad.
 
     Args:
@@ -235,9 +242,12 @@ class AdagradParameters(_OptimizationParameters):
         gradients calculation less accurate but faster. Please see
         `optimization_parameters.proto` for details.
         for details.
+      clip_weight_min: the minimum value to clip by; None means -infinity.
+      clip_weight_max: the maximum value to clip by; None means +infinity.
     """
-    super(AdagradParameters, self).__init__(learning_rate,
-                                            use_gradient_accumulation)
+    super(AdagradParameters,
+          self).__init__(learning_rate, use_gradient_accumulation,
+                         clip_weight_min, clip_weight_max)
     if initial_accumulator <= 0:
       raise ValueError('Adagrad initial_accumulator must be positive')
     self.initial_accumulator = initial_accumulator
@@ -246,13 +256,16 @@ class AdagradParameters(_OptimizationParameters):
 class AdamParameters(_OptimizationParameters):
   """Optimization parameters for Adam."""
 
-  def __init__(self, learning_rate,
+  def __init__(self,
+               learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-08,
                lazy_adam=True,
                sum_inside_sqrt=True,
-               use_gradient_accumulation=True):
+               use_gradient_accumulation=True,
+               clip_weight_min=None,
+               clip_weight_max=None):
     """Optimization parameters for Adam.
 
     Args:
@@ -270,9 +283,12 @@ class AdamParameters(_OptimizationParameters):
         gradients calculation less accurate but faster. Please see
         `optimization_parameters.proto` for details.
         for details.
+      clip_weight_min: the minimum value to clip by; None means -infinity.
+      clip_weight_max: the maximum value to clip by; None means +infinity.
     """
-    super(AdamParameters, self).__init__(learning_rate,
-                                         use_gradient_accumulation)
+    super(AdamParameters,
+          self).__init__(learning_rate, use_gradient_accumulation,
+                         clip_weight_min, clip_weight_max)
     if beta1 < 0. or beta1 >= 1.:
       raise ValueError('beta1 must be between 0. and 1; got {}.'.format(beta1))
     if beta2 < 0. or beta2 >= 1.:
@@ -291,15 +307,19 @@ class AdamParameters(_OptimizationParameters):
 
 
 class StochasticGradientDescentParameters(_OptimizationParameters):
-  """Optimization parameters for stochastic gradient descent.
-
-  Args:
-    learning_rate: a floating point value. The learning rate.
-  """
-
-  def __init__(self, learning_rate):
-    super(StochasticGradientDescentParameters, self).__init__(
-        learning_rate, False)
+  """Optimization parameters for stochastic gradient descent."""
+
+  def __init__(self, learning_rate, clip_weight_min=None,
+               clip_weight_max=None):
+    """Optimization parameters for stochastic gradient descent.
+
+    Args:
+      learning_rate: a floating point value. The learning rate.
+      clip_weight_min: the minimum value to clip by; None means -infinity.
+      clip_weight_max: the maximum value to clip by; None means +infinity.
+    """
+    super(StochasticGradientDescentParameters,
+          self).__init__(learning_rate, False, clip_weight_min, clip_weight_max)
 
 
 class TPUEmbedding(object):
@@ -566,6 +586,12 @@ class TPUEmbedding(object):
           optimization_parameters_pb2.GradientAccumulationStatus.ENABLED
           if self._optimization_parameters.use_gradient_accumulation else
           optimization_parameters_pb2.GradientAccumulationStatus.DISABLED)
+      if self._optimization_parameters.clip_weight_min is not None:
+        table_descriptor.optimization_parameters.clipping_limits.lower.value = (
+            self._optimization_parameters.clip_weight_min)
+      if self._optimization_parameters.clip_weight_max is not None:
+        table_descriptor.optimization_parameters.clipping_limits.upper.value = (
+            self._optimization_parameters.clip_weight_max)
       self._optimizer_handler.set_optimization_parameters(table_descriptor)
 
     config_proto.mode = self._mode
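
For reference, a minimal usage sketch of the new clipping arguments. The import path and clip values below are illustrative assumptions; only the constructor signatures come from this change:

    # Hypothetical usage; the module path is an assumption for this revision,
    # and the clip values are illustrative, not recommendations.
    from tensorflow.python.tpu import tpu_embedding

    # Clip Adagrad-updated embedding weights to [-1.0, 1.0]. Passing None for
    # either bound leaves that side unbounded (-infinity / +infinity).
    adagrad_params = tpu_embedding.AdagradParameters(
        learning_rate=0.1,
        initial_accumulator=0.1,
        use_gradient_accumulation=True,
        clip_weight_min=-1.0,
        clip_weight_max=1.0)

    # StochasticGradientDescentParameters and AdamParameters take the same
    # two keyword arguments.
    sgd_params = tpu_embedding.StochasticGradientDescentParameters(
        learning_rate=0.05, clip_weight_min=-1.0, clip_weight_max=1.0)

The limits end up in the embedding config proto as `clipping_limits.lower.value` and `clipping_limits.upper.value`, as shown in the last hunk above.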