diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 4fa7c736154..965ae8669bb 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -64,7 +64,7 @@ class Adam(optimizer_v2.OptimizerV2):
   $$t := 0 \text{(Initialize timestep)}$$

   The update rule for `variable` with gradient `g` uses an optimization
-  described at the end of section2 of the paper:
+  described at the end of section 2 of the paper:

   $$t := t + 1$$
   $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -82,7 +82,7 @@ class Adam(optimizer_v2.OptimizerV2):
   $$t := 0 \text{(Initialize timestep)}$$

   The update rule for `variable` with gradient `g` uses an optimization
-  described at the end of section2 of the paper:
+  described at the end of section 2 of the paper:

   $$t := t + 1$$
   $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 81e03c73585..46ec3be54ec 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -52,7 +52,7 @@ class AdamOptimizer(optimizer.Optimizer):
   $$t := 0 \text{(Initialize timestep)}$$

   The update rule for `variable` with gradient `g` uses an optimization
-  described at the end of section2 of the paper:
+  described at the end of section 2 of the paper:

   $$t := t + 1$$
   $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
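
For context, the update rule these docstrings describe (the efficient formulation at the end of section 2 of the Kingma & Ba Adam paper, with bias correction folded into the effective learning rate `lr_t`) can be sketched in plain NumPy as below. This is a minimal sketch, not the optimizer's implementation; the function name `adam_update` and its default hyperparameters are illustrative only.

```python
import numpy as np

def adam_update(var, g, m, v, t,
                lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
  """One Adam step in the formulation the docstrings describe:
  bias correction is folded into the effective learning rate lr_t."""
  t += 1
  lr_t = lr * np.sqrt(1.0 - beta2**t) / (1.0 - beta1**t)
  m = beta1 * m + (1.0 - beta1) * g          # biased first-moment estimate
  v = beta2 * v + (1.0 - beta2) * g * g      # biased second-moment estimate
  var = var - lr_t * m / (np.sqrt(v) + epsilon)
  return var, m, v, t
```

Note that, matching the `\sqrt{v_t} + \epsilon` term in these docstrings, epsilon is added outside the square root, so it plays the role of the paper's "epsilon hat" rather than the epsilon in Algorithm 1.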