Added beta parameter from FTRL paper to main optimizer class.
PiperOrigin-RevId: 325290409
Change-Id: I0aa85a26b188b9ab3e1faa7462dca4c5d81f8712

commit 7df6aa0253 (parent 3cb0418477)
@@ -95,6 +95,7 @@
 * Error messages when Functional API construction goes wrong (and when ops cannot be converted to Keras layers automatically) should be clearer and easier to understand.
 * `Optimizer.minimize` can now accept a loss `Tensor` and a `GradientTape`
   as an alternative to accepting a `callable` loss.
+* Added `beta` parameter to FTRL optimizer to match paper.
 * `tf.function` / AutoGraph:
   * Added `experimental_follow_type_hints` argument for `tf.function`. When
     True, the function may use type annotations to optimize the tracing
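The release note above introduces the new constructor argument. A minimal usage sketch, assuming a TensorFlow build that includes this change; the `tf.compat.v1` alias, graph-mode session, and toy loss are illustrative only:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

var = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(var * var)

# `beta` is the new argument; it defaults to None, which the constructor
# treats as 0.0 (i.e. the previous behavior).
opt = tf.train.FtrlOptimizer(
    learning_rate=3.0,
    initial_accumulator_value=0.1,
    beta=0.1)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(10):
    sess.run(train_op)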
@@ -49,7 +49,8 @@ class FtrlOptimizer(optimizer.Optimizer):
                name="Ftrl",
                accum_name=None,
                linear_name=None,
-               l2_shrinkage_regularization_strength=0.0):
+               l2_shrinkage_regularization_strength=0.0,
+               beta=None):
     r"""Construct a new FTRL optimizer.
 
     Args:
@@ -79,10 +80,11 @@ class FtrlOptimizer(optimizer.Optimizer):
         function w.r.t. the weights w.
         Specifically, in the absence of L1 regularization, it is equivalent to
         the following update rule:
-        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
-                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
+        w_{t+1} = w_t - lr_t / (beta + 2*L2*lr_t) * g_t -
+                  2*L2_shrinkage*lr_t / (beta + 2*L2*lr_t) * w_t
         where lr_t is the learning rate at t.
         When input is sparse shrinkage will only happen on the active weights.
+      beta: A float value; corresponds to the beta parameter in the paper.
 
     Raises:
       ValueError: If one of the arguments is invalid.
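Written out, the simplified rule quoted in the docstring (dense updates, no L1 term) is easy to evaluate directly. A small NumPy sketch of just that formula, with illustrative values, not the actual kernel implementation:

import numpy as np


def ftrl_simplified_step(w, g, lr, l2, l2_shrinkage, beta):
  """One step of the simplified rule from the docstring above:

  w_{t+1} = w_t - lr_t / (beta + 2*L2*lr_t) * g_t
                - 2*L2_shrinkage*lr_t / (beta + 2*L2*lr_t) * w_t
  """
  denom = beta + 2.0 * l2 * lr
  return w - lr / denom * g - 2.0 * l2_shrinkage * lr / denom * w


w = np.array([1.0, 2.0])
g = np.array([0.1, 0.2])
print(ftrl_simplified_step(w, g, lr=3.0, l2=0.1, l2_shrinkage=0.0, beta=0.1))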
@@ -119,12 +121,13 @@ class FtrlOptimizer(optimizer.Optimizer):
     self._initial_accumulator_value = initial_accumulator_value
     self._l1_regularization_strength = l1_regularization_strength
     self._l2_regularization_strength = l2_regularization_strength
+    self._beta = (0.0 if beta is None else beta)
     self._l2_shrinkage_regularization_strength = (
         l2_shrinkage_regularization_strength)
     self._learning_rate_tensor = None
     self._learning_rate_power_tensor = None
     self._l1_regularization_strength_tensor = None
-    self._l2_regularization_strength_tensor = None
+    self._adjusted_l2_regularization_strength_tensor = None
     self._l2_shrinkage_regularization_strength_tensor = None
     self._accum_name = accum_name
     self._linear_name = linear_name
@@ -142,8 +145,14 @@ class FtrlOptimizer(optimizer.Optimizer):
         self._learning_rate, name="learning_rate")
     self._l1_regularization_strength_tensor = ops.convert_to_tensor(
         self._l1_regularization_strength, name="l1_regularization_strength")
-    self._l2_regularization_strength_tensor = ops.convert_to_tensor(
-        self._l2_regularization_strength, name="l2_regularization_strength")
+    # L2 regularization strength with beta added in so that the underlying
+    # TensorFlow ops do not need to include that parameter.
+    self._adjusted_l2_regularization_strength_tensor = ops.convert_to_tensor(
+        self._l2_regularization_strength + self._beta /
+        (2. * self._learning_rate),
+        name="adjusted_l2_regularization_strength")
+    assert self._adjusted_l2_regularization_strength_tensor is not None
+    self._beta_tensor = ops.convert_to_tensor(self._beta, name="beta")
     self._l2_shrinkage_regularization_strength_tensor = ops.convert_to_tensor(
         self._l2_shrinkage_regularization_strength,
         name="l2_shrinkage_regularization_strength")
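The comment in the hunk above is the key design point: rather than threading a new `beta` input through every FTRL kernel, the constructor folds it into the L2 strength as `l2 + beta / (2 * lr)`. A minimal sketch of that folding arithmetic only, assuming (as the FTRL kernels document) that the L2 strength enters the per-coordinate quadratic term doubled, next to a `1 / lr`-scaled accumulator term; it also shows the `beta=None` default, which is mapped to 0.0 and therefore leaves the strength unchanged:

def adjusted_l2_strength(l2, beta, lr):
  # Mirrors the constructor default: beta=None behaves like beta=0.0.
  beta = 0.0 if beta is None else beta
  return l2 + beta / (2.0 * lr)


lr, l2, beta = 3.0, 0.1, 0.1
adjusted = adjusted_l2_strength(l2, beta, lr)

# Doubling the adjusted strength, as the kernel does, adds exactly beta / lr.
assert abs(2.0 * adjusted - (2.0 * l2 + beta / lr)) < 1e-12

# The default (beta=None) reduces to the plain L2 strength.
assert adjusted_l2_strength(l2, None, lr) == l2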
@@ -162,7 +171,7 @@ class FtrlOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
                       var.dtype.base_dtype),
         math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
         use_locking=self._use_locking)
@@ -175,7 +184,7 @@ class FtrlOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
                       var.dtype.base_dtype),
         math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
                       var.dtype.base_dtype),
@@ -194,7 +203,7 @@ class FtrlOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
                       var.dtype.base_dtype),
         math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
         use_locking=self._use_locking)
@@ -207,7 +216,7 @@ class FtrlOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
                       var.dtype.base_dtype),
         math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
                       var.dtype.base_dtype),
@@ -227,7 +236,7 @@ class FtrlOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
                       var.dtype.base_dtype),
         math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
         use_locking=self._use_locking)
@@ -241,7 +250,7 @@ class FtrlOptimizer(optimizer.Optimizer):
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
                       var.dtype.base_dtype),
         math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
                       grad.dtype.base_dtype),
@@ -260,7 +269,8 @@ class FtrlOptimizer(optimizer.Optimizer):
         indices,
         math_ops.cast(self._learning_rate_tensor, grad.dtype),
         math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype),
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
+                      grad.dtype),
         math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
         use_locking=self._use_locking)
     else:
@@ -272,7 +282,8 @@ class FtrlOptimizer(optimizer.Optimizer):
         indices,
         math_ops.cast(self._learning_rate_tensor, grad.dtype),
         math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype),
+        math_ops.cast(self._adjusted_l2_regularization_strength_tensor,
+                      grad.dtype),
         math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
                       grad.dtype),
         math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
@@ -161,6 +161,65 @@ class FtrlOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(
               np.array([-0.93460727, -1.86147261]), v1_val)
 
+  def testFtrlWithBeta(self):
+    # The v1 optimizers do not support eager execution
+    with ops.Graph().as_default():
+      for dtype in [dtypes.half, dtypes.float32]:
+        with self.cached_session():
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+          grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+          grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+          opt = ftrl.FtrlOptimizer(3.0, initial_accumulator_value=0.1, beta=0.1)
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+          v0_val, v1_val = self.evaluate([var0, var1])
+          self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+          self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+          # Run 10 steps FTRL
+          for _ in range(10):
+            update.run()
+          v0_val, v1_val = self.evaluate([var0, var1])
+          self.assertAllCloseAccordingToType(
+              np.array([-6.096838, -9.162214]), v0_val)
+          self.assertAllCloseAccordingToType(
+              np.array([-0.717741, -1.425132]), v1_val)
+
+  def testFtrlWithL2_Beta(self):
+    # The v1 optimizers do not support eager execution
+    with ops.Graph().as_default():
+      for dtype in [dtypes.half, dtypes.float32]:
+        with self.cached_session():
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+          grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+          grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+          opt = ftrl.FtrlOptimizer(
+              3.0,
+              initial_accumulator_value=0.1,
+              l1_regularization_strength=0.0,
+              l2_regularization_strength=0.1,
+              beta=0.1)
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+          v0_val, v1_val = self.evaluate([var0, var1])
+          self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+          self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+          # Run 10 steps FTRL
+          for _ in range(10):
+            update.run()
+          v0_val, v1_val = self.evaluate([var0, var1])
+          self.assertAllCloseAccordingToType(
+              np.array([-2.735487, -4.704625]), v0_val)
+          self.assertAllCloseAccordingToType(
+              np.array([-0.294335, -0.586556]), v1_val)
+
   def testFtrlWithL1_L2(self):
     # The v1 optimizers do not support eager execution
     with ops.Graph().as_default():
@@ -18,7 +18,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\', \'l2_shrinkage_regularization_strength\', \'beta\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "apply_gradients"