From 820368cd94782367e29a9fba1d5fd0d48cd1fe8f Mon Sep 17 00:00:00 2001
From: Zhenyu Tan
Date: Tue, 7 Apr 2020 10:33:09 -0700
Subject: [PATCH] Adadelta use case clarification.

Contributed from https://github.com/tensorflow/tensorflow/pull/36849 by
abhilash1910

PiperOrigin-RevId: 305288812
Change-Id: I710013945f0cd38db0f19247584a74b2fd35e0e5
---
 tensorflow/python/keras/optimizer_v2/adadelta.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index 99bd2f8e8bf..12f9e40c394 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -45,6 +45,19 @@ class Adadelta(optimizer_v2.OptimizerV2):
   don't have to set an initial learning rate. In this version, initial
   learning rate can be set, as in most other Keras optimizers.
 
+  According to section 4.3 ("Effective Learning rates") of the paper, near
+  the end of training step sizes converge to 1, which is effectively a high
+  learning rate that would normally cause divergence. This occurs only near
+  the end of training, because gradients and step sizes are small and the
+  epsilon constant in the numerator and denominator dominates past gradients
+  and parameter updates, driving the effective learning rate toward 1.
+
+  According to section 4.4 ("Speech Data"), where a large neural network with
+  4 hidden layers was trained on a corpus of US English data, ADADELTA was
+  used with 100 network replicas. The epsilon used was 1e-6 with rho=0.95,
+  which converged faster than ADAGRAD, via the following construction:
+  def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, decay=0., **kwargs):
+
   Args:
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
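
As an illustrative sketch (not part of the patch above): the section 4.4 hyperparameters map onto the Keras optimizer documented in this docstring, where the legacy `lr` argument corresponds to `learning_rate` in the OptimizerV2 API. The small model below is a hypothetical stand-in, not the 4-hidden-layer speech network from the paper.

# Minimal sketch of the section 4.4 configuration with tf.keras.optimizers.Adadelta
# (assumes TensorFlow 2.x; the model is a hypothetical toy example).
import tensorflow as tf

# Paper settings from section 4.4: lr=1.0, rho=0.95, epsilon=1e-6.
optimizer = tf.keras.optimizers.Adadelta(
    learning_rate=1.0, rho=0.95, epsilon=1e-6)

# Hypothetical toy model, used only to show the optimizer being wired in.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(20,)),
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])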