From 8d8ea6b0bdf1806b33445e3d7d42cdb5e4ec1ea9 Mon Sep 17 00:00:00 2001 From: mrTsjolder Date: Wed, 28 Nov 2018 23:43:36 +0100 Subject: [PATCH 1/6] first steps in cleaning doc citations --- .../python/training/learning_rate_decay.py | 46 +++++++++++-------- tensorflow/python/training/momentum.py | 16 ++++--- tensorflow/python/training/moving_averages.py | 17 +++++-- .../python/training/proximal_adagrad.py | 8 +++- .../training/proximal_gradient_descent.py | 5 +- tensorflow/python/training/rmsprop.py | 8 ++-- 6 files changed, 65 insertions(+), 35 deletions(-) diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py index ab9d923bedc..b092286176e 100644 --- a/tensorflow/python/training/learning_rate_decay.py +++ b/tensorflow/python/training/learning_rate_decay.py @@ -429,9 +429,6 @@ def inverse_time_decay(learning_rate, def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): """Applies cosine decay to the learning rate. - See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent - with Warm Restarts. https://arxiv.org/abs/1608.03983 - When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a cosine decay function to a provided initial learning rate. It requires a `global_step` value to @@ -468,6 +465,11 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): Raises: ValueError: if `global_step` is not supplied. + References: + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing @@ -494,9 +496,6 @@ def cosine_decay_restarts(learning_rate, name=None): """Applies cosine decay with restarts to the learning rate. - See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent - with Warm Restarts. https://arxiv.org/abs/1608.03983 - When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a cosine decay function with restarts to a provided initial learning rate. It requires a `global_step` @@ -536,6 +535,11 @@ def cosine_decay_restarts(learning_rate, Raises: ValueError: if `global_step` is not supplied. + References: + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing @@ -567,13 +571,6 @@ def linear_cosine_decay(learning_rate, name=None): """Applies linear cosine decay to the learning rate. - See [Bello et al., ICML2017] Neural Optimizer Search with RL. - https://arxiv.org/abs/1709.07417 - - For the idea of warm starts here controlled by `num_periods`, - see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent - with Warm Restarts. https://arxiv.org/abs/1608.03983 - Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used. @@ -618,6 +615,14 @@ def linear_cosine_decay(learning_rate, Raises: ValueError: if `global_step` is not supplied.
+ References: + Neural Optimizer Search with Reinforcement Learning: + [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) + ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing @@ -651,13 +656,6 @@ def noisy_linear_cosine_decay(learning_rate, name=None): """Applies noisy linear cosine decay to the learning rate. - See [Bello et al., ICML2017] Neural Optimizer Search with RL. - https://arxiv.org/abs/1709.07417 - - For the idea of warm starts here controlled by `num_periods`, - see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent - with Warm Restarts. https://arxiv.org/abs/1608.03983 - Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used. @@ -708,6 +706,14 @@ def noisy_linear_cosine_decay(learning_rate, Raises: ValueError: if `global_step` is not supplied. + References: + Neural Optimizer Search with Reinforcement Learning: + [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) + ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py index f3bc83bbfa1..ca1104496f2 100644 --- a/tensorflow/python/training/momentum.py +++ b/tensorflow/python/training/momentum.py @@ -54,17 +54,21 @@ class MomentumOptimizer(optimizer.Optimizer): name: Optional name prefix for the operations created when applying gradients. Defaults to "Momentum". use_nesterov: If `True` use Nesterov Momentum. - See [Sutskever et al., 2013]( - http://jmlr.org/proceedings/papers/v28/sutskever13.pdf). + See (Sutskever et al., 2013). This implementation always computes gradients at the value of the variable(s) passed to the optimizer. Using Nesterov Momentum makes the variable(s) track the values called `theta_t + mu*v_t` in the paper. - This implementation is an approximation of the original formula, valid - for high values of momentum. It will compute the "adjusted gradient" - in NAG by assuming that the new gradient will be estimated by the - current average gradient plus the product of momentum and the change + This implementation is an approximation of the original formula, valid + for high values of momentum. It will compute the "adjusted gradient" + in NAG by assuming that the new gradient will be estimated by the + current average gradient plus the product of momentum and the change in the average gradient.
+ References: + On the importance of initialization and momentum in deep learning: + [Sutskever et al., 2013](http://proceedings.mlr.press/v28/sutskever13.html) + ([pdf](http://proceedings.mlr.press/v28/sutskever13.pdf)) + @compatibility(eager) When eager execution is enabled, `learning_rate` and `momentum` can each be a callable that takes no arguments and returns the actual value to use. This diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py index cc58a952689..e1fa7ee2994 100644 --- a/tensorflow/python/training/moving_averages.py +++ b/tensorflow/python/training/moving_averages.py @@ -46,8 +46,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None): `zero_debias` optionally enables scaling by the mathematically correct debiasing factor of 1 - decay ** num_updates - See `ADAM: A Method for Stochastic Optimization` Section 3 for more details - (https://arxiv.org/abs/1412.6980). + See Section 3 of (Kingma et al., 2015) for more details. The names of the debias shadow variables, by default, include both the scope they were created in and the scope of the variables they debias. They are also @@ -72,12 +71,17 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None): value: A tensor with the same shape as 'variable'. decay: A float Tensor or float value. The moving average decay. zero_debias: A python bool. If true, assume the variable is 0-initialized - and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in + and unbias it, as in (Kingma et al., 2015). See docstring in `_zero_debias` for more details. name: Optional name of the returned operation. Returns: A tensor which if evaluated will compute and return the new moving average. + + References: + A Method for Stochastic Optimization: + [Kingma et al., 2015](https://arxiv.org/abs/1412.6980) + ([pdf](https://arxiv.org/pdf/1412.6980.pdf)) """ def update_fn(v, value, decay=decay): decay = ops.convert_to_tensor(1.0 - decay, name="decay") @@ -176,7 +180,7 @@ def _zero_debias(unbiased_var, value, decay): All exponential moving averages initialized with Tensors are initialized to 0, and therefore are biased to 0. Variables initialized to 0 and used as EMAs are similarly biased. This function creates the debias updated amount according to - a scale factor, as in https://arxiv.org/abs/1412.6980. + a scale factor, as in (Kingma et al., 2015). To demonstrate the bias the results from 0-initialization, take an EMA that was initialized to `0` with decay `b`. After `t` timesteps of seeing the @@ -201,6 +205,11 @@ def _zero_debias(unbiased_var, value, decay): Returns: The amount that the unbiased variable should be updated. Computing this tensor will also update the shadow variables appropriately. + + References: + A Method for Stochastic Optimization: + [Kingma et al., 2015](https://arxiv.org/abs/1412.6980) + ([pdf](https://arxiv.org/pdf/1412.6980.pdf)) """ with variable_scope.variable_scope( unbiased_var.name[:-len(":0")], values=[unbiased_var, diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py index 2ea628a56b4..857ba227d0f 100644 --- a/tensorflow/python/training/proximal_adagrad.py +++ b/tensorflow/python/training/proximal_adagrad.py @@ -31,7 +31,13 @@ class ProximalAdagradOptimizer(optimizer.Optimizer): # pylint: disable=line-too-long """Optimizer that implements the Proximal Adagrad algorithm. 
- See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf). + References: + Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: + [Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html) + ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)) + Efficient Learning using Forward-Backward Splitting: + [Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting) + ([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)) """ def __init__(self, learning_rate, initial_accumulator_value=0.1, diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py index 6eca0e6cb5f..194e28f73b3 100644 --- a/tensorflow/python/training/proximal_gradient_descent.py +++ b/tensorflow/python/training/proximal_gradient_descent.py @@ -32,7 +32,10 @@ class ProximalGradientDescentOptimizer(optimizer.Optimizer): # pylint: disable=line-too-long """Optimizer that implements the proximal gradient descent algorithm. - See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf). + References: + Efficient Learning using Forward-Backward Splitting: + [Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting) + ([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)) """ def __init__(self, learning_rate, l1_regularization_strength=0.0, diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py index fb53b5883f5..a0e79656c39 100644 --- a/tensorflow/python/training/rmsprop.py +++ b/tensorflow/python/training/rmsprop.py @@ -52,10 +52,12 @@ from tensorflow.python.util.tf_export import tf_export @tf_export(v1=["train.RMSPropOptimizer"]) class RMSPropOptimizer(optimizer.Optimizer): - """Optimizer that implements the RMSProp algorithm. + """Optimizer that implements the RMSProp algorithm (Tieleman et al., 2012). - See the - [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). + References: + Coursera slide 29: + Hinton, 2012 + ([pdf](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)) """ def __init__(self, From 593272265dd97a121cbd7f4f36560420e13fb08b Mon Sep 17 00:00:00 2001 From: mrTsjolder Date: Thu, 29 Nov 2018 22:06:24 +0100 Subject: [PATCH 2/6] fix docs for rest training module --- tensorflow/python/training/adadelta.py | 6 ++++-- tensorflow/python/training/adagrad.py | 7 ++++--- tensorflow/python/training/adagrad_da.py | 7 +++++-- tensorflow/python/training/adam.py | 6 ++++-- tensorflow/python/training/ftrl.py | 21 ++++++++++++------- tensorflow/python/training/moving_averages.py | 4 ++-- 6 files changed, 33 insertions(+), 18 deletions(-) diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py index dd210160004..7ba80f31946 100644 --- a/tensorflow/python/training/adadelta.py +++ b/tensorflow/python/training/adadelta.py @@ -29,8 +29,10 @@ from tensorflow.python.util.tf_export import tf_export class AdadeltaOptimizer(optimizer.Optimizer): """Optimizer that implements the Adadelta algorithm. - See [M. D.
Zeiler](http://arxiv.org/abs/1212.5701) - ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf)) + References: + ADADELTA - An Adaptive Learning Rate Method: + [Zeiler, 2012](http://arxiv.org/abs/1212.5701) + ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf)) """ def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-8, diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py index 10c043bae17..bee0aa5b7b9 100644 --- a/tensorflow/python/training/adagrad.py +++ b/tensorflow/python/training/adagrad.py @@ -32,9 +32,10 @@ from tensorflow.python.util.tf_export import tf_export class AdagradOptimizer(optimizer.Optimizer): """Optimizer that implements the Adagrad algorithm. - See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - or this - [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf). + References: + Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: + [Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html) + ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)) """ def __init__(self, learning_rate, initial_accumulator_value=0.1, diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py index e23b7134b3b..a5a07fa2333 100644 --- a/tensorflow/python/training/adagrad_da.py +++ b/tensorflow/python/training/adagrad_da.py @@ -30,8 +30,6 @@ from tensorflow.python.util.tf_export import tf_export class AdagradDAOptimizer(optimizer.Optimizer): """Adagrad Dual Averaging algorithm for sparse linear models. - See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf). - This optimizer takes care of regularization of unseen features in a mini batch by updating them when they are seen with a closed form update rule that is equivalent to having updated them on every mini-batch. @@ -40,6 +38,11 @@ class AdagradDAOptimizer(optimizer.Optimizer): trained model. This optimizer only guarantees sparsity for linear models. Be careful when using AdagradDA for deep networks as it will require careful initialization of the gradient accumulators for it to train. + + References: + Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: + [Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html) + ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)) """ def __init__(self, diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 46ec3be54ec..9ae86bbbe72 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -32,8 +32,10 @@ from tensorflow.python.util.tf_export import tf_export class AdamOptimizer(optimizer.Optimizer): """Optimizer that implements the Adam algorithm. - See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) - ([pdf](http://arxiv.org/pdf/1412.6980.pdf)). + References: + Adam - A Method for Stochastic Optimization: + [Kingma et al., 2015](https://arxiv.org/abs/1412.6980) + ([pdf](https://arxiv.org/pdf/1412.6980.pdf)) """ def __init__(self, diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py index a2ef3c76b4e..0007c0e80c5 100644 --- a/tensorflow/python/training/ftrl.py +++ b/tensorflow/python/training/ftrl.py @@ -29,11 +29,14 @@ from tensorflow.python.util.tf_export import tf_export class FtrlOptimizer(optimizer.Optimizer): """Optimizer that implements the FTRL algorithm. - See this [paper]( - https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf). 
- This version has support for both online L2 (the L2 penalty given in the paper - above) and shrinkage-type L2 (which is the addition of an L2 penalty to the - loss function). + This version has support for both online L2 (McMahan et al., 2013) and + shrinkage-type L2, which is the addition of an L2 penalty + to the loss function. + + References: + Ad-click prediction: + [McMahan et al., 2013](https://dl.acm.org/citation.cfm?id=2488200) + ([pdf](https://dl.acm.org/ft_gateway.cfm?id=2488200&ftid=1388399&dwn=1&CFID=32233078&CFTOKEN=d60fe57a294c056a-CB75C374-F915-E7A6-1573FBBC7BF7D526)) """ def __init__(self, @@ -53,8 +56,7 @@ class FtrlOptimizer(optimizer.Optimizer): learning_rate: A float value or a constant float `Tensor`. learning_rate_power: A float value, must be less or equal to zero. Controls how the learning rate decreases during training. Use zero for - a fixed learning rate. See section 3.1 in the - [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf). + a fixed learning rate. See section 3.1 in (McMahan et al., 2013). initial_accumulator_value: The starting value for accumulators. Only zero or positive values are allowed. l1_regularization_strength: A float value, must be greater than or @@ -84,6 +86,11 @@ class FtrlOptimizer(optimizer.Optimizer): Raises: ValueError: If one of the arguments is invalid. + + References: + Ad-click prediction: + [McMahan et al., 2013](https://dl.acm.org/citation.cfm?id=2488200) + ([pdf](https://dl.acm.org/ft_gateway.cfm?id=2488200&ftid=1388399&dwn=1&CFID=32233078&CFTOKEN=d60fe57a294c056a-CB75C374-F915-E7A6-1573FBBC7BF7D526)) """ super(FtrlOptimizer, self).__init__(use_locking, name) diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py index e1fa7ee2994..06ae174da2f 100644 --- a/tensorflow/python/training/moving_averages.py +++ b/tensorflow/python/training/moving_averages.py @@ -79,7 +79,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None): A tensor which if evaluated will compute and return the new moving average. References: - A Method for Stochastic Optimization: + Adam - A Method for Stochastic Optimization: [Kingma et al., 2015](https://arxiv.org/abs/1412.6980) ([pdf](https://arxiv.org/pdf/1412.6980.pdf)) """ @@ -207,7 +207,7 @@ def _zero_debias(unbiased_var, value, decay): tensor will also update the shadow variables appropriately. 
References: - A Method for Stochastic Optimization: + Adam - A Method for Stochastic Optimization: [Kingma et al., 2015](https://arxiv.org/abs/1412.6980) ([pdf](https://arxiv.org/pdf/1412.6980.pdf)) """ with variable_scope.variable_scope( unbiased_var.name[:-len(":0")], values=[unbiased_var, From d478447e4224dc6ef6ee8936a18ab30d2a0a36f8 Mon Sep 17 00:00:00 2001 From: mrTsjolder Date: Wed, 9 Jan 2019 09:26:33 +0100 Subject: [PATCH 3/6] Clean up doc citations in ops module --- tensorflow/python/ops/clip_ops.py | 9 +- tensorflow/python/ops/ctc_ops.py | 49 +++-- tensorflow/python/ops/distributions/beta.py | 10 +- .../python/ops/distributions/dirichlet.py | 10 +- tensorflow/python/ops/distributions/gamma.py | 10 +- .../python/ops/distributions/student_t.py | 10 +- tensorflow/python/ops/image_ops_impl.py | 6 +- tensorflow/python/ops/init_ops.py | 29 ++- .../ops/linalg/linear_operator_circulant.py | 7 +- tensorflow/python/ops/linalg_grad.py | 22 ++- tensorflow/python/ops/losses/losses_impl.py | 5 +- tensorflow/python/ops/metrics_impl.py | 7 +- tensorflow/python/ops/nn_impl.py | 52 ++++-- tensorflow/python/ops/nn_ops.py | 171 ++++++++++++------ tensorflow/python/ops/random_grad.py | 11 +- tensorflow/python/ops/random_ops.py | 11 +- tensorflow/python/ops/rnn_cell_impl.py | 55 ++++-- tensorflow/python/ops/signal/dct_ops.py | 14 +- tensorflow/python/ops/signal/mel_ops.py | 8 +- tensorflow/python/ops/signal/mfcc_ops.py | 11 +- tensorflow/python/ops/signal/signal.py | 6 - tensorflow/python/ops/signal/spectral_ops.py | 10 +- tensorflow/python/ops/signal/window_ops.py | 8 +- 23 files changed, 319 insertions(+), 212 deletions(-) diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py index 4b6de9a32ce..3e72b610a3b 100644 --- a/tensorflow/python/ops/clip_ops.py +++ b/tensorflow/python/ops/clip_ops.py @@ -243,9 +243,7 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None): Any of the entries of `t_list` that are of type `None` are ignored. - This is the correct way to perform gradient clipping (for example, see - [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063) - ([pdf](http://arxiv.org/pdf/1211.5063.pdf))). + This is the correct way to perform gradient clipping (Pascanu et al., 2012). However, it is slower than `clip_by_norm()` because all the parameters must be ready before the clipping operation can be performed. @@ -264,6 +262,11 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None): Raises: TypeError: If `t_list` is not a sequence. InvalidArgumentError: If global norm is not finite. + + References: + On the difficulty of training Recurrent Neural Networks: + [Pascanu et al., 2012](http://proceedings.mlr.press/v28/pascanu13.html) + ([pdf](http://proceedings.mlr.press/v28/pascanu13.pdf)) """ if (not isinstance(t_list, collections.Sequence) or isinstance(t_list, six.string_types)): diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index 80502daaac3..1bd59f7a387 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -50,12 +50,7 @@ def ctc_loss(labels, inputs=None, sequence_length=None, logits=None): """Computes the CTC (Connectionist Temporal Classification) Loss. - This op implements the CTC loss as presented in the article: - - [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber. - Connectionist Temporal Classification: Labeling Unsegmented Sequence Data - with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, - pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf) + This op implements the CTC loss as presented in (Graves et al., 2006).
Input requirements: @@ -153,6 +148,11 @@ def ctc_loss(labels, inputs=None, sequence_length=None, Raises: TypeError: if labels is not a `SparseTensor`. + + References: + Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: + [Graves et al., 2006](https://dl.acm.org/citation.cfm?id=1143891) + ([pdf](http://www.cs.toronto.edu/~graves/icml_2006.pdf)) """ # The second, third, etc output tensors contain the gradients. We use it in # _CTCLossGrad() below. @@ -590,12 +590,7 @@ def ctc_loss_v2(labels, logits, label_length, logit_length, blank_index=None, name=None): """Computes CTC (Connectionist Temporal Classification) loss. - This op implements the CTC loss as presented in the article: - - [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber. - Connectionist Temporal Classification: Labeling Unsegmented Sequence Data - with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, - pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf) + This op implements the CTC loss as presented in (Graves et al., 2006). Notes: - Same as the "Classic CTC" in TensorFlow 1.x's tf.nn.ctc_loss setting of @@ -632,6 +627,11 @@ def ctc_loss_v2(labels, logits, label_length, logit_length, Returns: loss: tensor of shape [batch_size], negative log probabilities. + + References: + Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: + [Graves et al., 2006](https://dl.acm.org/citation.cfm?id=1143891) + ([pdf](http://www.cs.toronto.edu/~graves/icml_2006.pdf)) """ if isinstance(labels, sparse_tensor.SparseTensor): if blank_index is None: @@ -677,21 +677,8 @@ def ctc_loss_dense(labels, logits, label_length, logit_length, blank_index=0, name=None): """Computes CTC (Connectionist Temporal Classification) loss. - This op implements the CTC loss as presented in the article: - - [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber. - Connectionist Temporal Classification: Labeling Unsegmented Sequence Data - with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, - pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf) - - Using the batched forward backward algorithm described in: - - [Sim, K. C., Narayanan, A., Bagby, T., Sainath, T. N., & Bacchiani, M. - Improving the efficiency of forward-backward algorithm using batched - computation in TensorFlow. - Automatic Speech Recognition and Understanding Workshop (ASRU), - 2017 IEEE (pp. 258-264). - ](https://ieeexplore.ieee.org/iel7/8260578/8268903/08268944.pdf) + This op implements the CTC loss as presented in (Graves et al., 2006), + using the batched forward backward algorithm described in (Sim et al., 2017). Notes: Significant differences from tf.nn.ctc_loss: @@ -733,6 +720,14 @@ def ctc_loss_dense(labels, logits, label_length, logit_length, Returns: loss: tensor of shape [batch_size], negative log probabilities.
+ + References: + Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: + [Graves et al., 2006](https://dl.acm.org/citation.cfm?id=1143891) + ([pdf](http://www.cs.toronto.edu/~graves/icml_2006.pdf)) + Improving the efficiency of forward-backward algorithm using batched computation in TensorFlow: + [Sim et al., 2017](https://ieeexplore.ieee.org/document/8268944) + ([pdf](http://bacchiani.net/resume/papers/ASRU2017.pdf)) """ with ops.name_scope(name, "ctc_loss_dense", diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py index 1d1a666317f..aefc0830ed4 100644 --- a/tensorflow/python/ops/distributions/beta.py +++ b/tensorflow/python/ops/distributions/beta.py @@ -91,10 +91,8 @@ class Beta(distribution.Distribution): density. Samples of this distribution are reparameterized (pathwise differentiable). - The derivatives are computed using the approach described in the paper - - [Michael Figurnov, Shakir Mohamed, Andriy Mnih. - Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) + The derivatives are computed using the approach described in + (Figurnov et al., 2018). #### Examples @@ -149,6 +147,10 @@ class Beta(distribution.Distribution): grads = tf.gradients(loss, [alpha, beta]) ``` + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) """ @deprecation.deprecated( diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py index 971ce46efbc..5c23b6e1490 100644 --- a/tensorflow/python/ops/distributions/dirichlet.py +++ b/tensorflow/python/ops/distributions/dirichlet.py @@ -97,10 +97,8 @@ class Dirichlet(distribution.Distribution): density. Samples of this distribution are reparameterized (pathwise differentiable). - The derivatives are computed using the approach described in the paper - - [Michael Figurnov, Shakir Mohamed, Andriy Mnih. - Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) + The derivatives are computed using the approach described in + (Figurnov et al., 2018). #### Examples @@ -155,6 +153,10 @@ class Dirichlet(distribution.Distribution): grads = tf.gradients(loss, alpha) ``` + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) """ @deprecation.deprecated( diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py index 57505d1b131..4f22323ba29 100644 --- a/tensorflow/python/ops/distributions/gamma.py +++ b/tensorflow/python/ops/distributions/gamma.py @@ -93,10 +93,8 @@ class Gamma(distribution.Distribution): `rate` is very large. See note in `tf.random_gamma` docstring. Samples of this distribution are reparameterized (pathwise differentiable). - The derivatives are computed using the approach described in the paper - - [Michael Figurnov, Shakir Mohamed, Andriy Mnih. - Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) + The derivatives are computed using the approach described in + (Figurnov et al., 2018).
#### Examples @@ -120,6 +118,10 @@ class Gamma(distribution.Distribution): grads = tf.gradients(loss, [concentration, rate]) ``` + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) """ @deprecation.deprecated( diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py index 351f5605e24..aad63a1cd24 100644 --- a/tensorflow/python/ops/distributions/student_t.py +++ b/tensorflow/python/ops/distributions/student_t.py @@ -82,10 +82,8 @@ class StudentT(distribution.Distribution): t-distribution std. dev. is `scale sqrt(df / (df - 2))` when `df > 2`. Samples of this distribution are reparameterized (pathwise differentiable). - The derivatives are computed using the approach described in the paper - - [Michael Figurnov, Shakir Mohamed, Andriy Mnih. - Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) + The derivatives are computed using the approach described in + (Figurnov et al., 2018). #### Examples @@ -139,6 +137,10 @@ class StudentT(distribution.Distribution): grads = tf.gradients(loss, [df, loc, scale]) ``` + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) """ @deprecation.deprecated( diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f2809b5be18..021b3af4906 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1594,7 +1594,8 @@ def adjust_contrast(images, contrast_factor): @tf_export('image.adjust_gamma') def adjust_gamma(image, gamma=1, gain=1): - """Performs Gamma Correction on the input image. + """Performs [Gamma Correction](http://en.wikipedia.org/wiki/Gamma_correction) + on the input image. Also known as Power Law Transform. This function transforms the input image pixelwise according to the equation `Out = In**gamma` @@ -1616,9 +1617,6 @@ def adjust_gamma(image, gamma=1, gain=1): the output image will be darker than the input image. For gamma less than 1, the histogram will shift towards right and the output image will be brighter than the input image. - - References: - [1] http://en.wikipedia.org/wiki/Gamma_correction """ with ops.name_scope(None, 'adjust_gamma', [image, gamma, gain]) as name: diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 035534ef49c..3b96641a209 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -417,8 +417,9 @@ class UniformUnitScaling(Initializer): calling the initializer. Only floating point types are supported. References: - [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558) - ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) + Random Walk Initialization for Training Very Deep Feedforward Networks: + [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558) + ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) """ @deprecated_args(None, @@ -581,6 +582,7 @@ class Orthogonal(Initializer): calling the initializer. Only floating point types are supported. 
References: + Exact solutions to the nonlinear dynamics of learning in deep linear neural networks: [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C) ([pdf](https://arxiv.org/pdf/1312.6120.pdf)) """ @@ -649,6 +651,7 @@ class ConvolutionDeltaOrthogonal(Initializer): calling the initializer. Only floating point types are supported. References: + Dynamical Isometry and a Mean Field Theory of CNNs: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) """ @@ -710,6 +713,7 @@ class ConvolutionOrthogonal(Initializer): calling the initializer. Only floating point types are supported. References: + Dynamical Isometry and a Mean Field Theory of CNNs: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) """ @@ -780,6 +784,7 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal): calling the initializer. Only floating point types are supported. References: + Dynamical Isometry and a Mean Field Theory of CNNs: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) """ @@ -922,6 +927,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal): calling the initializer. Only floating point types are supported. References: + Dynamical Isometry and a Mean Field Theory of CNNs: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) """ @@ -1043,6 +1049,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal): calling the initializer. Only floating point types are supported. References: + Dynamical Isometry and a Mean Field Theory of CNNs: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) """ @@ -1240,6 +1247,7 @@ class GlorotUniform(VarianceScaling): calling the initializer. Only floating point types are supported. References: + Understanding the difficulty of training deep feedforward neural networks: [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html) ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)) """ @@ -1279,6 +1287,7 @@ class GlorotNormal(VarianceScaling): calling the initializer. Only floating point types are supported. References: + Understanding the difficulty of training deep feedforward neural networks: [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html) ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)) """ @@ -1337,10 +1346,10 @@ def lecun_normal(seed=None): An initializer. References: - - Self-Normalizing Neural Networks, - [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) # pylint: disable=line-too-long + Self-Normalizing Neural Networks: + [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)) - - Efficient Backprop, + Efficient Backprop: [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf) """ return VarianceScaling( @@ -1362,10 +1371,10 @@ def lecun_uniform(seed=None): An initializer. 
References: - - Self-Normalizing Neural Networks, - [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) # pylint: disable=line-too-long + Self-Normalizing Neural Networks: + [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)) - - Efficient Backprop, + Efficient Backprop: [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf) """ return VarianceScaling( @@ -1388,8 +1397,8 @@ def he_normal(seed=None): An initializer. References: - [He et al., 2015] - (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) # pylint: disable=line-too-long + Delving Deep into Rectifiers: + [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf)) """ return VarianceScaling( diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py index b74baa5dfdb..19b422c59a9 100644 --- a/tensorflow/python/ops/linalg/linear_operator_circulant.py +++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py @@ -527,8 +527,6 @@ class LinearOperatorCirculant(_BaseLinearOperatorCirculant): This means that the result of matrix multiplication `v = Au` has `Lth` column given circular convolution between `h` with the `Lth` column of `u`. - See http://ee.stanford.edu/~gray/toeplitz.pdf - #### Description in terms of the frequency spectrum There is an equivalent description in terms of the [batch] spectrum `H` and @@ -684,6 +682,11 @@ class LinearOperatorCirculant(_BaseLinearOperatorCirculant): * If `is_X == False`, callers should expect the operator to not have `X`. * If `is_X == None` (the default), callers should have no expectation either way. + + References: + Toeplitz and Circulant Matrices - A Review: + [Gray, 2006](https://www.nowpublishers.com/article/Details/CIT-006) + ([pdf](https://ee.stanford.edu/~gray/toeplitz.pdf)) """ def __init__(self, diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py index f6c9d2c6a6d..f05ddd7120c 100644 --- a/tensorflow/python/ops/linalg_grad.py +++ b/tensorflow/python/ops/linalg_grad.py @@ -14,14 +14,22 @@ # ============================================================================== """Gradients for operators defined in linalg_ops.py. -Useful reference for derivative formulas is -An extended collection of matrix derivative results for forward and reverse -mode algorithmic differentiation by Mike Giles: -http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf +Useful reference for derivative formulas is (Mike Giles, 2008). -A detailed derivation of formulas for backpropagating through spectral layers -(SVD and Eig) by Ionescu, Vantzos & Sminchisescu: -https://arxiv.org/pdf/1509.07838v4.pdf +Ionescu et al. (2015) provide a detailed derivation of formulas for +backpropagating through spectral layers (SVD and Eig). 
+ +References: + An extended collection of matrix derivative results for + forward and reverse mode automatic differentiation: + [Mike Giles, 2008](https://ora.ox.ac.uk/objects/uuid:8d0c0a29-c92b-4153-a1d2-38b276e93124) + ([pdf](http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf)) + Matrix Backpropagation for Deep Networks with Structured Layers + [Ionescu et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/Ionescu_Matrix_Backpropagation_for_ICCV_2015_paper.html) + ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Ionescu_Matrix_Backpropagation_for_ICCV_2015_paper.pdf)) + Training Deep Networks with Structured Layers by Matrix Backpropagation: + [Ionescu et al., 2015](https://arxiv.org/abs/1509.07838) + ([pdf](https://arxiv.org/pdf/1509.07838.pdf)) """ from __future__ import absolute_import from __future__ import division diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py index 6cd1d8e5f8b..c51d35d1001 100644 --- a/tensorflow/python/ops/losses/losses_impl.py +++ b/tensorflow/python/ops/losses/losses_impl.py @@ -371,7 +371,8 @@ def hinge_loss(labels, logits, weights=1.0, scope=None, def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): - """Adds a Huber Loss term to the training procedure. + """Adds a [Huber Loss](https://en.wikipedia.org/wiki/Huber_loss) term + to the training procedure. For each value x in `error=labels-predictions`, the following is calculated: @@ -382,8 +383,6 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, where d is `delta`. - See: https://en.wikipedia.org/wiki/Huber_loss - `weights` acts as a coefficient for the loss. If a scalar is provided, then the loss is simply scaled by the given value. If `weights` is a tensor of size `[batch_size]`, then the total loss for each sample of the batch is rescaled diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py index e3292e081fe..ada8179409c 100644 --- a/tensorflow/python/ops/metrics_impl.py +++ b/tensorflow/python/ops/metrics_impl.py @@ -745,7 +745,7 @@ def auc(labels, epsilon = 1.0e-6 def interpolate_pr_auc(tp, fp, fn): - """Interpolation formula inspired by section 4 of Davis & Goadrich 2006. + """Interpolation formula inspired by section 4 of (Davis et al., 2006). Note here we derive & use a closed formula not present in the paper - as follows: @@ -774,6 +774,11 @@ def auc(labels, fn: false negative counts Returns: pr_auc: an approximation of the area under the P-R curve. + + References: + The Relationship Between Precision-Recall and ROC Curves: + [Davis et al., 2006](https://dl.acm.org/citation.cfm?id=1143874) + ([pdf](https://www.biostat.wisc.edu/~page/rocpr.pdf)) """ dtp = tp[:num_thresholds - 1] - tp[1:] p = tp + fp diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 1b4de7d3ac9..fb50904e8db 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -1192,7 +1192,6 @@ def batch_normalization(x, name=None): r"""Batch normalization. - As described in http://arxiv.org/abs/1502.03167. Normalizes a tensor by `mean` and `variance`, and applies (optionally) a `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\): @@ -1231,6 +1230,11 @@ def batch_normalization(x, Returns: the normalized, scaled, offset tensor. 
+ + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) """ with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]): inv = math_ops.rsqrt(variance + variance_epsilon) @@ -1255,8 +1259,6 @@ def fused_batch_norm( name=None): r"""Batch normalization. - As described in http://arxiv.org/abs/1502.03167. - Args: x: Input `Tensor` of 4 dimensions. scale: A `Tensor` of 1 dimension for scaling. @@ -1277,6 +1279,11 @@ def fused_batch_norm( Raises: ValueError: If mean or variance is not None when is_training is True. + + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) """ x = ops.convert_to_tensor(x, name="input") scale = ops.convert_to_tensor(scale, name="scale") @@ -1352,6 +1359,11 @@ def batch_norm_with_global_normalization(t=None, Returns: A batch-normalized `t`. + + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) """ t = deprecated_argument_lookup("input", input, "t", t) m = deprecated_argument_lookup("mean", mean, "m", m) @@ -1394,6 +1406,11 @@ def batch_norm_with_global_normalization_v2(input, Returns: A batch-normalized `t`. + + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) """ return batch_norm_with_global_normalization(t=input, m=mean, @@ -1723,12 +1740,6 @@ def nce_loss(weights, name="nce_loss"): """Computes and returns the noise-contrastive estimation training loss. - See [Noise-contrastive estimation: A new estimation principle for - unnormalized statistical - models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). - Also see our [Candidate Sampling Algorithms - Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf) - A common use case is to use this method for training, and calculate the full sigmoid loss for evaluation or inference. In this case, you must set `partition_strategy="div"` for the two losses to be consistent, as in the @@ -1788,9 +1799,9 @@ def nce_loss(weights, remove_accidental_hits: A `bool`. Whether to remove "accidental hits" where a sampled class equals one of the target classes. If set to `True`, this is a "Sampled Logistic" loss instead of NCE, and we are - learning to generate log-odds instead of log probabilities. See - our [Candidate Sampling Algorithms Reference] - (https://www.tensorflow.org/extras/candidate_sampling.pdf). + learning to generate log-odds instead of log probabilities. See + our Candidate Sampling Algorithms Reference + ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)). Default is False. partition_strategy: A string specifying the partitioning strategy, relevant if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported. @@ -1799,6 +1810,11 @@ def nce_loss(weights, Returns: A `batch_size` 1-D tensor of per-example NCE losses. 
+ + References: + Noise-contrastive estimation - A new estimation principle for unnormalized statistical models: + [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a) + ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf)) """ logits, labels = _compute_sampled_logits( weights=weights, @@ -1955,11 +1971,9 @@ def sampled_softmax_loss(weights, logits=logits) ``` - See our [Candidate Sampling Algorithms Reference] - (https://www.tensorflow.org/extras/candidate_sampling.pdf) - - Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007) - ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math. + See our Candidate Sampling Algorithms Reference + ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)). + Also see Section 3 of (Jean et al., 2014) for the math. Args: weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor` @@ -1990,6 +2004,10 @@ def sampled_softmax_loss(weights, Returns: A `batch_size` 1-D tensor of per-example sampled softmax losses. + References: + On Using Very Large Target Vocabulary for Neural Machine Translation: + [Jean et al., 2014](https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001) + ([pdf](http://aclweb.org/anthology/P15-1001)) """ logits, labels = _compute_sampled_logits( weights=weights, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index f95c5fc6d7f..3dafcd33711 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1390,15 +1390,10 @@ def atrous_conv2d(value, filters, rate, padding, name=None): the amount of computation. For a description of atrous convolution and how it can be used for dense - feature extraction, please see: [Semantic Image Segmentation with Deep - Convolutional Nets and Fully Connected CRFs](http://arxiv.org/abs/1412.7062). - The same operation is investigated further in [Multi-Scale Context Aggregation - by Dilated Convolutions](http://arxiv.org/abs/1511.07122). Previous works - that effectively use atrous convolution in different ways are, among others, - [OverFeat: Integrated Recognition, Localization and Detection using - Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image - Scanning with Deep Max-Pooling Convolutional Neural - Networks](http://arxiv.org/abs/1302.1700). + feature extraction, please see: (Chen et al., 2015). The same operation is + investigated further in (Yu et al., 2016). Previous works that effectively + use atrous convolution in different ways are, among others, + (Sermanet et al., 2014) and (Giusti et al., 2013). Atrous convolution is also closely related to the so-called noble identities in multi-rate signal processing. @@ -1479,6 +1474,20 @@ def atrous_conv2d(value, filters, rate, padding, name=None): Raises: ValueError: If input/output depth does not match `filters`' shape, or if padding is other than `'VALID'` or `'SAME'`. 
+ + References: + Multi-Scale Context Aggregation by Dilated Convolutions: + [Yu et al., 2016](https://arxiv.org/abs/1511.07122) + ([pdf](https://arxiv.org/pdf/1511.07122.pdf)) + Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected CRFs: + [Chen et al., 2015](http://arxiv.org/abs/1412.7062) + ([pdf](https://arxiv.org/pdf/1412.7062)) + OverFeat - Integrated Recognition, Localization and Detection using Convolutional Networks: + [Sermanet et al., 2014](https://arxiv.org/abs/1312.6229) + ([pdf](https://arxiv.org/pdf/1312.6229.pdf)) + Fast Image Scanning with Deep Max-Pooling Convolutional Neural Networks: + [Giusti et al., 2013](https://ieeexplore.ieee.org/abstract/document/6738831) + ([pdf](https://arxiv.org/pdf/1302.1700.pdf)) """ return convolution( input=value, @@ -1701,10 +1710,9 @@ def conv1d_transpose( name=None): """The transpose of `conv1d`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf), - but is really the transpose (gradient) of `conv1d` rather than an actual - deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is actually the transpose (gradient) of `conv1d` + rather than an actual deconvolution. Args: input: A 3-D `Tensor` of type `float` and shape @@ -1733,6 +1741,11 @@ def conv1d_transpose( ValueError: If input/output depth does not match `filter`'s shape, if `output_shape` is not at 3-element vector, if `padding` is other than `'VALID'` or `'SAME'`, or if `data_format` is invalid. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ with ops.name_scope(name, "conv1d_transpose", [input, filters, output_shape]) as name: @@ -2090,10 +2103,9 @@ def conv2d_transpose( dilations=None): """The transpose of `conv2d`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf), - but is really the transpose (gradient) of `conv2d` rather than an actual - deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is really the transpose (gradient) of `conv2d` + rather than an actual deconvolution. Args: value: A 4-D `Tensor` of type `float` and shape @@ -2130,6 +2142,11 @@ def conv2d_transpose( Raises: ValueError: If input/output depth does not match `filter`'s shape, or if padding is other than `'VALID'` or `'SAME'`. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ value = deprecated_argument_lookup("input", input, "value", value) filter = deprecated_argument_lookup("filters", filters, "filter", filter) @@ -2158,10 +2175,9 @@ def conv2d_transpose_v2( name=None): """The transpose of `conv2d`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is - actually the transpose (gradient) of `conv2d` rather than an actual - deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is really the transpose (gradient) of + `conv2d` rather than an actual deconvolution.
Args: input: A 4-D `Tensor` of type `float` and shape `[batch, height, width, @@ -2196,6 +2212,11 @@ def conv2d_transpose_v2( Raises: ValueError: If input/output depth does not match `filter`'s shape, or if padding is other than `'VALID'` or `'SAME'`. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ with ops.name_scope(name, "conv2d_transpose", [input, filter, output_shape]) as name: @@ -2226,10 +2247,9 @@ def atrous_conv2d_transpose(value, name=None): """The transpose of `atrous_conv2d`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf), - but is really the transpose (gradient) of `atrous_conv2d` rather than an - actual deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is really the transpose (gradient) of + `atrous_conv2d` rather than an actual deconvolution. Args: value: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC` @@ -2259,6 +2279,11 @@ def atrous_conv2d_transpose(value, ValueError: If input/output depth does not match `filters`' shape, or if padding is other than `'VALID'` or `'SAME'`, or if the `rate` is less than one, or if the output_shape is not a tensor with 4 elements. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ with ops.name_scope(name, "atrous_conv2d_transpose", [value, filters, output_shape]) as name: @@ -2421,10 +2446,9 @@ def conv3d_transpose( dilations=None): """The transpose of `conv3d`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf), - but is really the transpose (gradient) of `conv3d` rather than an actual - deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is really the transpose (gradient) of `conv3d` + rather than an actual deconvolution. Args: value: A 5-D `Tensor` of type `float` and shape @@ -2458,6 +2482,11 @@ def conv3d_transpose( Raises: ValueError: If input/output depth does not match `filter`'s shape, or if padding is other than `'VALID'` or `'SAME'`. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ filter = deprecated_argument_lookup("filters", filters, "filter", filter) value = deprecated_argument_lookup("input", input, "value", value) @@ -2483,10 +2512,9 @@ def conv3d_transpose_v2(input, # pylint: disable=redefined-builtin name=None): """The transpose of `conv3d`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is - actually the transpose (gradient) of `conv2d` rather than an actual - deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is really the transpose (gradient) of `conv3d` + rather than an actual deconvolution. 
Args: input: A 5-D `Tensor` of type `float` and shape `[batch, height, width, @@ -2517,6 +2545,11 @@ def conv3d_transpose_v2(input, # pylint: disable=redefined-builtin Returns: A `Tensor` with the same type as `value`. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ with ops.name_scope(name, "conv3d_transpose", [input, filter, output_shape]) as name: @@ -2556,10 +2589,9 @@ def conv_transpose(input, # pylint: disable=redefined-builtin name=None): """The transpose of `convolution`. - This operation is sometimes called "deconvolution" after [Deconvolutional - Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is - actually the transpose (gradient) of `convolution` rather than an actual - deconvolution. + This operation is sometimes called "deconvolution" after + (Zeiler et al., 2010), but is really the transpose (gradient) of `convolution` + rather than an actual deconvolution. Args: input: An N+2 dimensional `Tensor` of shape @@ -2597,6 +2629,11 @@ def conv_transpose(input, # pylint: disable=redefined-builtin Returns: A `Tensor` with the same type as `value`. + + References: + Deconvolutional Networks: + [Zeiler et al., 2010](https://ieeexplore.ieee.org/abstract/document/5539957) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.232.4023&rep=rep1&type=pdf)) """ with ops.name_scope(name, "conv_transpose", [input, filter, output_shape]) as name: @@ -2705,6 +2742,11 @@ def crelu(features, name=None, axis=-1): Returns: A `Tensor` with the same type as `features`. + + References: + Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units: + [Shang et al., 2016](http://proceedings.mlr.press/v48/shang16) + ([pdf](http://proceedings.mlr.press/v48/shang16.pdf)) """ with ops.name_scope(name, "CRelu", [features]) as name: features = ops.convert_to_tensor(features, name="features") @@ -2722,9 +2764,6 @@ crelu_v2.__doc__ = crelu.__doc__ def relu6(features, name=None): """Computes Rectified Linear 6: `min(max(features, 0), 6)`. - Source: [Convolutional Deep Belief Networks on CIFAR-10. A. - Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf) - Args: features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`, `int16`, or `int8`. name: A name for the operation (optional). Returns: A `Tensor` with the same type as `features`. + + References: + Convolutional Deep Belief Networks on CIFAR-10: + Krizhevsky, 2010 + ([pdf](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)) """ with ops.name_scope(name, "Relu6", [features]) as name: features = ops.convert_to_tensor(features, name="features") @@ -2742,10 +2786,6 @@ def leaky_relu(features, alpha=0.2, name=None): """Compute the Leaky ReLU activation function. - "Rectifier Nonlinearities Improve Neural Network Acoustic Models" - AL Maas, AY Hannun, AY Ng - Proc. ICML, 2013 - https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf - Args: features: A `Tensor` representing preactivation values. Must be one of the following types: `float16`, `float32`, `float64`, `int32`, `int64`. alpha: Slope of the activation function at x < 0. name: A name for the operation (optional). Returns: The activation value.
+ + References: + Rectifier Nonlinearities Improve Neural Network Acoustic Models: + [Maas et al., 2013](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.693.1422) + ([pdf](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.693.1422&rep=rep1&type=pdf)) """ with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: features = ops.convert_to_tensor(features, name="features") @@ -4328,9 +4373,6 @@ def fractional_max_pool(value, 3. K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size 4. length(row_pooling_sequence) = output_row_length+1 - For more details on fractional max pooling, see this paper: [Benjamin Graham, - Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) - Args: value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`. pooling_ratio: A list of `floats` that has length >= 4. Pooling ratio for @@ -4341,8 +4383,7 @@ def fractional_max_pool(value, ratio on height and width dimensions respectively. pseudo_random: An optional `bool`. Defaults to `False`. When set to `True`, generates the pooling sequence in a pseudorandom fashion, otherwise, in a - random fashion. Check paper [Benjamin Graham, Fractional - Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between + random fashion. Check (Graham, 2015) for difference between pseudorandom and random. overlapping: An optional `bool`. Defaults to `False`. When set to `True`, it means when pooling, the values at the boundary of adjacent pooling @@ -4366,6 +4407,11 @@ def fractional_max_pool(value, `value`. row_pooling_sequence: A `Tensor` of type `int64`. col_pooling_sequence: A `Tensor` of type `int64`. + + References: + Fractional Max-Pooling: + [Graham, 2015](https://arxiv.org/abs/1412.6071) + ([pdf](https://arxiv.org/pdf/1412.6071.pdf)) """ return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random, overlapping, deterministic, seed, seed2, @@ -4407,9 +4453,6 @@ def fractional_max_pool_v2(value, 3. K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size 4. length(row_pooling_sequence) = output_row_length+1 - For more details on fractional max pooling, see this paper: [Benjamin Graham, - Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) - Args: value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`. pooling_ratio: An int or list of `ints` that has length `1`, `2` or `4`. @@ -4420,8 +4463,7 @@ def fractional_max_pool_v2(value, 1.73 are pooling ratio on height and width dimensions respectively. pseudo_random: An optional `bool`. Defaults to `False`. When set to `True`, generates the pooling sequence in a pseudorandom fashion, otherwise, in a - random fashion. Check paper [Benjamin Graham, Fractional - Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between + random fashion. Check paper (Graham, 2015) for difference between pseudorandom and random. overlapping: An optional `bool`. Defaults to `False`. When set to `True`, it means when pooling, the values at the boundary of adjacent pooling @@ -4442,6 +4484,11 @@ def fractional_max_pool_v2(value, `value`. row_pooling_sequence: A `Tensor` of type `int64`. col_pooling_sequence: A `Tensor` of type `int64`. + + References: + Fractional Max-Pooling: + [Graham, 2015](https://arxiv.org/abs/1412.6071) + ([pdf](https://arxiv.org/pdf/1412.6071.pdf)) """ pooling_ratio = _get_sequence(pooling_ratio, 2, 3, "pooling_ratio") @@ -4486,8 +4533,7 @@ def fractional_avg_pool(value, ratio on height and width dimensions respectively. pseudo_random: An optional `bool`. Defaults to `False`. 
When set to `True`, generates the pooling sequence in a pseudorandom fashion, otherwise, in a - random fashion. Check paper [Benjamin Graham, Fractional - Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between + random fashion. Check paper (Graham, 2015) for difference between pseudorandom and random. overlapping: An optional `bool`. Defaults to `False`. When set to `True`, it means when pooling, the values at the boundary of adjacent pooling @@ -4511,6 +4557,11 @@ def fractional_avg_pool(value, `value`. row_pooling_sequence: A `Tensor` of type `int64`. col_pooling_sequence: A `Tensor` of type `int64`. + + References: + Fractional Max-Pooling: + [Graham, 2015](https://arxiv.org/abs/1412.6071) + ([pdf](https://arxiv.org/pdf/1412.6071.pdf)) """ return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random, overlapping, deterministic, seed, seed2, @@ -4541,8 +4592,7 @@ def fractional_avg_pool_v2(value, ratio on height and width dimensions respectively. pseudo_random: An optional `bool`. Defaults to `False`. When set to `True`, generates the pooling sequence in a pseudorandom fashion, otherwise, in a - random fashion. Check paper [Benjamin Graham, Fractional - Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between + random fashion. Check paper (Graham, 2015) for difference between pseudorandom and random. overlapping: An optional `bool`. Defaults to `False`. When set to `True`, it means when pooling, the values at the boundary of adjacent pooling @@ -4563,6 +4613,11 @@ def fractional_avg_pool_v2(value, `value`. row_pooling_sequence: A `Tensor` of type `int64`. col_pooling_sequence: A `Tensor` of type `int64`. + + References: + Fractional Max-Pooling: + [Graham, 2015](https://arxiv.org/abs/1412.6071) + ([pdf](https://arxiv.org/pdf/1412.6071.pdf)) """ if seed == 0: return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random, diff --git a/tensorflow/python/ops/random_grad.py b/tensorflow/python/ops/random_grad.py index baa8e2e2cd3..d6dbfab3558 100644 --- a/tensorflow/python/ops/random_grad.py +++ b/tensorflow/python/ops/random_grad.py @@ -36,8 +36,8 @@ def add_leading_unit_dimensions(x, num_dimensions): def _RandomGammaGrad(op, grad): # pylint: disable=invalid-name """Returns the gradient of a Gamma sample w.r.t. alpha. - The gradient is computed using implicit differentiation, see - "Implicit Reparameterization Gradients" (https://arxiv.org/abs/1805.08498). + The gradient is computed using implicit differentiation + (Figurnov et al., 2018). Args: op: A `RandomGamma` operation. We assume that the inputs to the operation @@ -46,7 +46,12 @@ def _RandomGammaGrad(op, grad): # pylint: disable=invalid-name `op.outputs[0]`. Returns: - A `Tensor` with derivatives `dloss / dalpha` + A `Tensor` with derivatives `dloss / dalpha`. + + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) """ shape = op.inputs[0] alpha = op.inputs[1] diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index cd332ede178..9eb5e642674 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -417,10 +417,8 @@ def random_gamma(shape, `alpha << 1` or large values of `beta`, i.e., `beta >> 1`. The samples are differentiable w.r.t. alpha and beta. 
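A rough sketch of what that differentiability means in practice (TF 1.x graph mode; the parameter values are arbitrary and only meant to illustrate the idea):

```python
import tensorflow as tf

# Gradients flow from gamma samples back to the distribution parameters via
# the implicit reparameterization approach referenced above.
alpha = tf.constant(2.0)
samples = tf.random_gamma([10000], alpha=alpha, beta=1.0, seed=42)
loss = tf.reduce_mean(samples)
grad = tf.gradients(loss, alpha)[0]

with tf.Session() as sess:
    # E[Gamma(alpha, 1)] = alpha, so the estimated d(mean)/d(alpha) should be
    # close to 1 (up to Monte Carlo noise).
    print(sess.run(grad))
```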
- The derivatives are computed using the approach described in the paper - - [Michael Figurnov, Shakir Mohamed, Andriy Mnih. - Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) + The derivatives are computed using the approach described in + (Figurnov et al., 2018). Example: @@ -466,6 +464,11 @@ def random_gamma(shape, samples: a `Tensor` of shape `tf.concat([shape, tf.shape(alpha + beta)], axis=0)` with values of type `dtype`. + + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) """ with ops.name_scope(name, "random_gamma", [shape, alpha, beta]): shape = ops.convert_to_tensor(shape, name="shape", dtype=dtypes.int32) diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index dc545e1d6ab..fc1d421b515 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -479,7 +479,7 @@ class BasicRNNCell(LayerRNNCell): @tf_export(v1=["nn.rnn_cell.GRUCell"]) class GRUCell(LayerRNNCell): - """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + """Gated Recurrent Unit cell. Note that this cell is not optimized for performance. Please use `tf.contrib.cudnn_rnn.CudnnGRU` for better performance on GPU, or @@ -501,6 +501,11 @@ class GRUCell(LayerRNNCell): of the first input). Required when `build` is called before `call`. **kwargs: Dict, keyword named properties for common layer attributes, like `trainable` etc when constructing the cell from configs of get_config(). + + References: + Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation: + [Cho et al., 2014](https://aclanthology.coli.uni-saarland.de/papers/D14-1179/d14-1179) + ([pdf](http://emnlp2014.org/papers/pdf/EMNLP2014179.pdf)) """ @deprecated(None, "This class is equivalent as tf.keras.layers.GRUCell," @@ -635,7 +640,7 @@ class BasicLSTMCell(LayerRNNCell): Basic LSTM recurrent network cell. - The implementation is based on: http://arxiv.org/abs/1409.2329. + The implementation is based on (Zaremba et al., 2015). We add forget_bias (default: 1) to the biases of the forget gate in order to reduce the scale of forgetting in the beginning of the training. @@ -650,6 +655,14 @@ class BasicLSTMCell(LayerRNNCell): `tf.contrib.cudnn_rnn.CudnnLSTM` for better performance on GPU, or `tf.contrib.rnn.LSTMBlockCell` and `tf.contrib.rnn.LSTMBlockFusedCell` for better performance on CPU. + + References: + Recurrent Neural Network Regularization: + [Zaremba et al., 2015](https://arxiv.org/abs/1409.2329) + ([pdf](https://arxiv.org/pdf/1409.2329.pdf)) + Long Short-Term Memory: + [Hochreiter et al., 1997](https://www.mitpressjournals.org/doi/abs/10.1162/neco.1997.9.8.1735) + ([pdf](http://ml.jku.at/publications/older/3504.pdf)) """ @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell," @@ -802,20 +815,8 @@ class BasicLSTMCell(LayerRNNCell): class LSTMCell(LayerRNNCell): """Long short-term memory unit (LSTM) recurrent network cell. - The default non-peephole implementation is based on: - - https://pdfs.semanticscholar.org/1154/0131eae85b2e11d53df7f1360eeb6476e7f4.pdf - - Felix Gers, Jurgen Schmidhuber, and Fred Cummins. - "Learning to forget: Continual prediction with LSTM." IET, 850-855, 1999. 
- - The peephole implementation is based on: - - https://research.google.com/pubs/archive/43905.pdf - - Hasim Sak, Andrew Senior, and Francoise Beaufays. - "Long short-term memory recurrent neural network architectures for - large scale acoustic modeling." INTERSPEECH, 2014. + The default non-peephole implementation is based on (Gers et al., 1999). + The peephole implementation is based on (Sak et al., 2014). The class uses optional peep-hole connections, optional cell clipping, and an optional projection layer. @@ -824,6 +825,17 @@ class LSTMCell(LayerRNNCell): `tf.contrib.cudnn_rnn.CudnnLSTM` for better performance on GPU, or `tf.contrib.rnn.LSTMBlockCell` and `tf.contrib.rnn.LSTMBlockFusedCell` for better performance on CPU. + + References: + Long short-term memory recurrent neural network architectures for large scale acoustic modeling: + [Sak et al., 2014](https://www.isca-speech.org/archive/interspeech_2014/i14_0338.html) + ([pdf](https://www.isca-speech.org/archive/archive_papers/interspeech_2014/i14_0338.pdf)) + Learning to forget: + [Gers et al., 1999](http://digital-library.theiet.org/content/conferences/10.1049/cp_19991218) + ([pdf](https://arxiv.org/pdf/1409.2329.pdf)) + Long Short-Term Memory: + [Hochreiter et al., 1997](https://www.mitpressjournals.org/doi/abs/10.1162/neco.1997.9.8.1735) + ([pdf](http://ml.jku.at/publications/older/3504.pdf)) """ @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell," @@ -1197,10 +1209,8 @@ class DropoutWrapperBase(object): """Create a cell with added input, state, and/or output dropout. If `variational_recurrent` is set to `True` (**NOT** the default behavior), - then the same dropout mask is applied at every step, as described in: - - Y. Gal, Z Ghahramani. "A Theoretically Grounded Application of Dropout in - Recurrent Neural Networks". https://arxiv.org/abs/1512.05287 + then the same dropout mask is applied at every step, as described in + (Gal et al., 2016). Otherwise a different dropout mask is applied at every time step. @@ -1256,6 +1266,11 @@ class DropoutWrapperBase(object): TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is provided but not `callable`. ValueError: if any of the keep_probs are not between 0 and 1. + + References: + A Theoretically Grounded Application of Dropout in Recurrent Neural Networks: + [Gal et al., 2016](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks) + ([pdf](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf)) """ super(DropoutWrapperBase, self).__init__(cell) assert_like_rnncell("cell", cell) diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py index 0d080c5977c..ee8566f9561 100644 --- a/tensorflow/python/ops/signal/dct_ops.py +++ b/tensorflow/python/ops/signal/dct_ops.py @@ -51,13 +51,13 @@ def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm): # TODO(rjryan): Implement `n` and `axis` parameters. @tf_export("signal.dct", v1=["signal.dct", "spectral.dct"]) def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin - """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`. + """Computes the 1D [Discrete Cosine Transform (DCT)](https://en.wikipedia.org/wiki/Discrete_cosine_transform) + of `input`. Currently only Types I, II and III are supported. Type I is implemented using a length `2N` padded `tf.spectral.rfft`. 
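Stepping back to the recurrent cells and the `DropoutWrapper` documented above, a rough usage sketch (TF 1.x; the sizes are placeholders, and only output/state dropout is used so that `input_size` does not have to be supplied):

```python
import tensorflow as tf

# Sketch: an LSTM cell wrapped with variational (per-sequence) dropout.
# With variational_recurrent=True the same dropout mask is reused at every
# time step, so dtype must be given up front to build the masks.
base_cell = tf.nn.rnn_cell.LSTMCell(num_units=128)
cell = tf.nn.rnn_cell.DropoutWrapper(
    base_cell,
    output_keep_prob=0.8,
    state_keep_prob=0.8,
    variational_recurrent=True,
    dtype=tf.float32)

inputs = tf.random_normal([4, 10, 64])  # [batch, time, features]
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
# outputs: [4, 10, 128]; state: an LSTMStateTuple(c, h) of [4, 128] tensors.
```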
Type II is implemented using a length `2N` padded `tf.spectral.rfft`, as - described here: - https://dsp.stackexchange.com/a/10606. + described [here](https://dsp.stackexchange.com/a/10606). Type III is a fairly straightforward inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`). @@ -83,8 +83,6 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disabl ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is not `-1`, or `norm` is not `None` or `'ortho'`. ValueError: If `type` is `1` and `norm` is `ortho`. - - [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform """ _validate_dct_arguments(input, type, n, axis, norm) with _ops.name_scope(name, "dct", [input]): @@ -151,7 +149,8 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disabl # TODO(rjryan): Implement `n` and `axis` parameters. @tf_export("signal.idct", v1=["signal.idct", "spectral.idct"]) def idct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin - """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`. + """Computes the 1D [Inverse Discrete Cosine Transform (DCT)](https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms) + of `input`. Currently only Types I, II and III are supported. Type III is the inverse of Type II, and vice versa. @@ -183,9 +182,6 @@ def idct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disab Raises: ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is not `-1`, or `norm` is not `None` or `'ortho'`. - - [idct]: - https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms """ _validate_dct_arguments(input, type, n, axis, norm) inverse_type = {1: 1, 2: 3, 3: 2}[type] diff --git a/tensorflow/python/ops/signal/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py index 6488e1df59b..a6fd2f1803d 100644 --- a/tensorflow/python/ops/signal/mel_ops.py +++ b/tensorflow/python/ops/signal/mel_ops.py @@ -95,12 +95,14 @@ def linear_to_mel_weight_matrix(num_mel_bins=20, upper_edge_hertz=3800.0, dtype=dtypes.float32, name=None): - """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel]. + """Returns a matrix to warp linear scale spectrograms to the + [mel scale](https://en.wikipedia.org/wiki/Mel_scale). Returns a weight matrix that can be used to re-weight a `Tensor` containing `num_spectrogram_bins` linearly sampled frequency information from `[0, sample_rate / 2]` into `num_mel_bins` frequency information from - `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel]. + `[lower_edge_hertz, upper_edge_hertz]` on the + [mel scale](https://en.wikipedia.org/wiki/Mel_scale). For example, the returned matrix `A` can be used to right-multiply a spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear @@ -144,8 +146,6 @@ def linear_to_mel_weight_matrix(num_mel_bins=20, ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not positive, lower_edge_hertz is negative, frequency edges are incorrectly ordered, or upper_edge_hertz is larger than the Nyquist frequency. 
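Looking back at `tf.signal.dct` and `tf.signal.idct` above, the Type II / Type III inverse relationship can be checked directly; a minimal sketch (using `'ortho'` normalization so the round trip is exact up to float error):

```python
import tensorflow as tf

# Sketch: idct undoes dct when both use the orthonormal scaling.
signals = tf.random_normal([3, 16])
coeffs = tf.signal.dct(signals, type=2, norm='ortho')
recovered = tf.signal.idct(coeffs, type=2, norm='ortho')
max_error = tf.reduce_max(tf.abs(signals - recovered))

with tf.Session() as sess:
    print(sess.run(max_error))  # tiny, on the order of 1e-6 for float32
```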
- - [mel]: https://en.wikipedia.org/wiki/Mel_scale """ with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name: # Note: As num_spectrogram_bins is passed to `math_ops.linspace` diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py index 675d60ee94d..902d63ac0f1 100644 --- a/tensorflow/python/ops/signal/mfcc_ops.py +++ b/tensorflow/python/ops/signal/mfcc_ops.py @@ -28,12 +28,14 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('signal.mfccs_from_log_mel_spectrograms') def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): - """Computes [MFCCs][mfcc] of `log_mel_spectrograms`. + """Computes [MFCCs](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) + of `log_mel_spectrograms`. Implemented with GPU-compatible ops and supports gradients. - [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of - taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s MFCCs + [Mel-Frequency Cepstral Coefficient (MFCC)](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) + calculation consists of taking the DCT-II of a log-magnitude mel-scale + spectrogram. [HTK](https://en.wikipedia.org/wiki/HTK_(software))'s MFCCs use a particular scaling of the DCT-II which is almost orthogonal normalization. We follow this convention. @@ -83,9 +85,6 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): Raises: ValueError: If `num_mel_bins` is not positive. - - [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum - [htk]: https://en.wikipedia.org/wiki/HTK_(software) """ with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms', [log_mel_spectrograms]): diff --git a/tensorflow/python/ops/signal/signal.py b/tensorflow/python/ops/signal/signal.py index cdc4d1c1911..97bcb0cdf04 100644 --- a/tensorflow/python/ops/signal/signal.py +++ b/tensorflow/python/ops/signal/signal.py @@ -26,12 +26,6 @@ guide. @@linear_to_mel_weight_matrix @@overlap_and_add @@stft - -[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window -[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window -[mel]: https://en.wikipedia.org/wiki/Mel_scale -[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum -[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ from __future__ import absolute_import diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py index ba1709b4cfc..a38547604e1 100644 --- a/tensorflow/python/ops/signal/spectral_ops.py +++ b/tensorflow/python/ops/signal/spectral_ops.py @@ -37,7 +37,8 @@ from tensorflow.python.util.tf_export import tf_export def stft(signals, frame_length, frame_step, fft_length=None, window_fn=window_ops.hann_window, pad_end=False, name=None): - """Computes the [Short-time Fourier Transform][stft] of `signals`. + """Computes the [Short-time Fourier Transform](https://en.wikipedia.org/wiki/Short-time_Fourier_transform) + of `signals`. Implemented with GPU-compatible ops and supports gradients. @@ -62,8 +63,6 @@ def stft(signals, frame_length, frame_step, fft_length=None, Raises: ValueError: If `signals` is not at least rank 1, `frame_length` is not scalar, or `frame_step` is not scalar. 
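Taken together, the signal ops above (`stft`, `linear_to_mel_weight_matrix`, `mfccs_from_log_mel_spectrograms`) form the usual log-mel/MFCC front end; a hedged sketch of that pipeline (a fake 16 kHz batch; all parameter values are illustrative only):

```python
import tensorflow as tf

# Sketch: waveform -> STFT magnitudes -> mel spectrogram -> MFCCs.
signals = tf.random_normal([2, 16000])                    # [batch, samples]
stfts = tf.signal.stft(signals, frame_length=400, frame_step=160,
                       fft_length=512)                    # default window_fn is a Hann window
spectrograms = tf.abs(stfts)                              # [2, frames, 257]

mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=40, num_spectrogram_bins=257, sample_rate=16000,
    lower_edge_hertz=20.0, upper_edge_hertz=7600.0)
mel_spectrograms = tf.tensordot(spectrograms, mel_matrix, 1)  # [2, frames, 40]
log_mel = tf.log(mel_spectrograms + 1e-6)

mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :13]
```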
- - [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ with ops.name_scope(name, 'stft', [signals, frame_length, frame_step]): @@ -160,7 +159,8 @@ def inverse_stft(stfts, fft_length=None, window_fn=window_ops.hann_window, name=None): - """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`. + """Computes the inverse [Short-time Fourier Transform](https://en.wikipedia.org/wiki/Short-time_Fourier_transform) + of `stfts`. To reconstruct an original waveform, a complimentary window function should be used in inverse_stft. Such a window function can be constructed with @@ -217,8 +217,6 @@ def inverse_stft(stfts, Raises: ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar, `frame_step` is not scalar, or `fft_length` is not scalar. - - [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ with ops.name_scope(name, 'inverse_stft', [stfts]): stfts = ops.convert_to_tensor(stfts, name='stfts') diff --git a/tensorflow/python/ops/signal/window_ops.py b/tensorflow/python/ops/signal/window_ops.py index 730c989cfe9..a9add54401a 100644 --- a/tensorflow/python/ops/signal/window_ops.py +++ b/tensorflow/python/ops/signal/window_ops.py @@ -32,7 +32,7 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('signal.hann_window') def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): - """Generate a [Hann window][hann]. + """Generate a [Hann window](https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows). Args: window_length: A scalar `Tensor` indicating the window length to generate. @@ -48,8 +48,6 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): Raises: ValueError: If `dtype` is not a floating point type. - - [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows """ return _raised_cosine_window(name, 'hann_window', window_length, periodic, dtype, 0.5, 0.5) @@ -58,7 +56,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): @tf_export('signal.hamming_window') def hamming_window(window_length, periodic=True, dtype=dtypes.float32, name=None): - """Generate a [Hamming][hamming] window. + """Generate a [Hamming](https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows) window. Args: window_length: A scalar `Tensor` indicating the window length to generate. @@ -74,8 +72,6 @@ def hamming_window(window_length, periodic=True, dtype=dtypes.float32, Raises: ValueError: If `dtype` is not a floating point type. - - [hamming]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows """ return _raised_cosine_window(name, 'hamming_window', window_length, periodic, dtype, 0.54, 0.46) From f1a917fd7ac07848ca796e1f7e31bed40f7ddb3d Mon Sep 17 00:00:00 2001 From: mrTsjolder Date: Wed, 9 Jan 2019 10:52:59 +0100 Subject: [PATCH 4/6] finish doc citation clean up --- tensorflow/python/distribute/all_reduce.py | 7 +++- tensorflow/python/layers/normalization.py | 45 ++++++++++++---------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/distribute/all_reduce.py b/tensorflow/python/distribute/all_reduce.py index bd7c45ae27a..58fdbbbecf7 100644 --- a/tensorflow/python/distribute/all_reduce.py +++ b/tensorflow/python/distribute/all_reduce.py @@ -427,7 +427,7 @@ def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None): """Construct a subgraph for recursive halving-doubling all-reduce. 
The recursive halving-doubling algorithm is described in - http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf + (Thakur et al., 2005). The concept is to arrange the participating n devices in a linear sequence where devices exchange data pairwise @@ -459,6 +459,11 @@ def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None): Raises: ValueError: num_devices not a power of 2, or tensor len not divisible by 2 the proper number of times. + + References: + Optimization of Collective Communication Operations in MPICH: + [Thakur et al., 2005](https://journals.sagepub.com/doi/abs/10.1177/1094342005051521) + ([pdf](http://wwwi10.lrr.in.tum.de/~gerndt/home/Teaching/HPCSeminar/mpich_multi_coll.pdf)) """ devices = [t.device for t in input_tensors] input_tensors, shape = _flatten_tensors(input_tensors)
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py index 02eb57b1bb2..e0dd2a0ed35 100644 --- a/tensorflow/python/layers/normalization.py +++ b/tensorflow/python/layers/normalization.py @@ -29,12 +29,7 @@ from tensorflow.python.util.tf_export import tf_export @tf_export(v1=['layers.BatchNormalization']) class BatchNormalization(keras_layers.BatchNormalization, base.Layer): - """Batch Normalization layer from http://arxiv.org/abs/1502.03167. - - "Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift" - - Sergey Ioffe, Christian Szegedy + """Batch Normalization layer from (Ioffe et al., 2015). Arguments: axis: An `int` or list of `int`, the axis or axes that should be @@ -66,9 +61,9 @@ class BatchNormalization(keras_layers.BatchNormalization, base.Layer): not safe to use when doing asynchronous distributed training. gamma_constraint: An optional projection function to be applied to the `gamma` weight after being updated by an `Optimizer`. - renorm: Whether to use Batch Renormalization - (https://arxiv.org/abs/1702.03275). This adds extra variables during - training. The inference is the same for either value of this parameter. + renorm: Whether to use Batch Renormalization (Ioffe, 2017). + This adds extra variables during training. + The inference is the same for either value of this parameter. renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to scalar `Tensors` used to clip the renorm correction. The correction `(r, d)` is used as `corrected_value = normalized_value * r + d`, with @@ -102,6 +97,14 @@ class BatchNormalization(keras_layers.BatchNormalization, base.Layer): `None`, no adjustment is applied. Cannot be specified if virtual_batch_size is specified. name: A string, the name of the layer. + + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) + Batch Renormalization - Towards Reducing Minibatch Dependence in Batch-Normalized Models: + [Ioffe, 2017](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models) + ([pdf](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models.pdf)) """ def __init__(self, @@ -182,14 +185,8 @@ def batch_normalization(inputs, fused=None, virtual_batch_size=None, adjustment=None): - """Functional interface for the batch normalization layer.
- - Reference: http://arxiv.org/abs/1502.03167 - - "Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift" - - Sergey Ioffe, Christian Szegedy + """Functional interface for the batch normalization layer from + (Ioffe et al., 2015). Note: when training, the moving_mean and moving_variance need to be updated. By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they @@ -245,9 +242,9 @@ def batch_normalization(inputs, name: String, the name of the layer. reuse: Boolean, whether to reuse the weights of a previous layer by the same name. - renorm: Whether to use Batch Renormalization - (https://arxiv.org/abs/1702.03275). This adds extra variables during - training. The inference is the same for either value of this parameter. + renorm: Whether to use Batch Renormalization (Ioffe, 2017). + This adds extra variables during training. + The inference is the same for either value of this parameter. renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to scalar `Tensors` used to clip the renorm correction. The correction `(r, d)` is used as `corrected_value = normalized_value * r + d`, with @@ -284,6 +281,14 @@ def batch_normalization(inputs, Raises: ValueError: if eager execution is enabled. + + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) + Batch Renormalization - Towards Reducing Minibatch Dependence in Batch-Normalized Models: + [Ioffe, 2017](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models) + ([pdf](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models.pdf)) """ layer = BatchNormalization( axis=axis,
From 4eb77e6062ad7d7cdebc14e308d494fef020dca1 Mon Sep 17 00:00:00 2001 From: mrTsjolder Date: Thu, 10 Jan 2019 20:39:22 +0100 Subject: [PATCH 5/6] Revert one-liners to original state --- tensorflow/python/ops/signal/dct_ops.py | 14 +++++++++----- tensorflow/python/ops/signal/mel_ops.py | 8 ++++---- tensorflow/python/ops/signal/mfcc_ops.py | 11 ++++++----- tensorflow/python/ops/signal/signal.py | 6 ++++++ tensorflow/python/ops/signal/spectral_ops.py | 10 ++++++---- tensorflow/python/ops/signal/window_ops.py | 8 ++++++-- 6 files changed, 37 insertions(+), 20 deletions(-)
diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py index ee8566f9561..0d080c5977c 100644 --- a/tensorflow/python/ops/signal/dct_ops.py +++ b/tensorflow/python/ops/signal/dct_ops.py @@ -51,13 +51,13 @@ def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm): # TODO(rjryan): Implement `n` and `axis` parameters. @tf_export("signal.dct", v1=["signal.dct", "spectral.dct"]) def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin - """Computes the 1D [Discrete Cosine Transform (DCT)](https://en.wikipedia.org/wiki/Discrete_cosine_transform) - of `input`. + """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`. Currently only Types I, II and III are supported. Type I is implemented using a length `2N` padded `tf.spectral.rfft`. Type II is implemented using a length `2N` padded `tf.spectral.rfft`, as - described [here](https://dsp.stackexchange.com/a/10606). + described here: + https://dsp.stackexchange.com/a/10606.
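Returning to the `batch_normalization` docstring above: the note about `tf.GraphKeys.UPDATE_OPS` is the part that is easiest to get wrong in practice, so here is a minimal sketch of the intended wiring (layer sizes and the optimizer are placeholders for illustration):

```python
import tensorflow as tf

# Sketch: the moving statistics are only updated when the UPDATE_OPS
# collection is run, so the train op is made to depend on it explicitly.
x = tf.placeholder(tf.float32, [None, 32])
is_training = tf.placeholder(tf.bool, [])

hidden = tf.layers.dense(x, 64)
normed = tf.layers.batch_normalization(hidden, training=is_training)
loss = tf.reduce_mean(tf.square(normed))

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
```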
Type III is a fairly straightforward inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`). @@ -83,6 +83,8 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disabl ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is not `-1`, or `norm` is not `None` or `'ortho'`. ValueError: If `type` is `1` and `norm` is `ortho`. + + [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform """ _validate_dct_arguments(input, type, n, axis, norm) with _ops.name_scope(name, "dct", [input]): @@ -149,8 +151,7 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disabl # TODO(rjryan): Implement `n` and `axis` parameters. @tf_export("signal.idct", v1=["signal.idct", "spectral.idct"]) def idct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin - """Computes the 1D [Inverse Discrete Cosine Transform (DCT)](https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms) - of `input`. + """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`. Currently only Types I, II and III are supported. Type III is the inverse of Type II, and vice versa. @@ -182,6 +183,9 @@ def idct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disab Raises: ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is not `-1`, or `norm` is not `None` or `'ortho'`. + + [idct]: + https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms """ _validate_dct_arguments(input, type, n, axis, norm) inverse_type = {1: 1, 2: 3, 3: 2}[type] diff --git a/tensorflow/python/ops/signal/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py index a6fd2f1803d..6488e1df59b 100644 --- a/tensorflow/python/ops/signal/mel_ops.py +++ b/tensorflow/python/ops/signal/mel_ops.py @@ -95,14 +95,12 @@ def linear_to_mel_weight_matrix(num_mel_bins=20, upper_edge_hertz=3800.0, dtype=dtypes.float32, name=None): - """Returns a matrix to warp linear scale spectrograms to the - [mel scale](https://en.wikipedia.org/wiki/Mel_scale). + """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel]. Returns a weight matrix that can be used to re-weight a `Tensor` containing `num_spectrogram_bins` linearly sampled frequency information from `[0, sample_rate / 2]` into `num_mel_bins` frequency information from - `[lower_edge_hertz, upper_edge_hertz]` on the - [mel scale](https://en.wikipedia.org/wiki/Mel_scale). + `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel]. For example, the returned matrix `A` can be used to right-multiply a spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear @@ -146,6 +144,8 @@ def linear_to_mel_weight_matrix(num_mel_bins=20, ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not positive, lower_edge_hertz is negative, frequency edges are incorrectly ordered, or upper_edge_hertz is larger than the Nyquist frequency. 
+ + [mel]: https://en.wikipedia.org/wiki/Mel_scale """ with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name: # Note: As num_spectrogram_bins is passed to `math_ops.linspace` diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py index 902d63ac0f1..675d60ee94d 100644 --- a/tensorflow/python/ops/signal/mfcc_ops.py +++ b/tensorflow/python/ops/signal/mfcc_ops.py @@ -28,14 +28,12 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('signal.mfccs_from_log_mel_spectrograms') def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): - """Computes [MFCCs](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) - of `log_mel_spectrograms`. + """Computes [MFCCs][mfcc] of `log_mel_spectrograms`. Implemented with GPU-compatible ops and supports gradients. - [Mel-Frequency Cepstral Coefficient (MFCC)](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) - calculation consists of taking the DCT-II of a log-magnitude mel-scale - spectrogram. [HTK](https://en.wikipedia.org/wiki/HTK_(software))'s MFCCs + [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of + taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s MFCCs use a particular scaling of the DCT-II which is almost orthogonal normalization. We follow this convention. @@ -85,6 +83,9 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): Raises: ValueError: If `num_mel_bins` is not positive. + + [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum + [htk]: https://en.wikipedia.org/wiki/HTK_(software) """ with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms', [log_mel_spectrograms]): diff --git a/tensorflow/python/ops/signal/signal.py b/tensorflow/python/ops/signal/signal.py index 97bcb0cdf04..cdc4d1c1911 100644 --- a/tensorflow/python/ops/signal/signal.py +++ b/tensorflow/python/ops/signal/signal.py @@ -26,6 +26,12 @@ guide. @@linear_to_mel_weight_matrix @@overlap_and_add @@stft + +[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window +[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window +[mel]: https://en.wikipedia.org/wiki/Mel_scale +[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ from __future__ import absolute_import diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py index a38547604e1..ba1709b4cfc 100644 --- a/tensorflow/python/ops/signal/spectral_ops.py +++ b/tensorflow/python/ops/signal/spectral_ops.py @@ -37,8 +37,7 @@ from tensorflow.python.util.tf_export import tf_export def stft(signals, frame_length, frame_step, fft_length=None, window_fn=window_ops.hann_window, pad_end=False, name=None): - """Computes the [Short-time Fourier Transform](https://en.wikipedia.org/wiki/Short-time_Fourier_transform) - of `signals`. + """Computes the [Short-time Fourier Transform][stft] of `signals`. Implemented with GPU-compatible ops and supports gradients. @@ -63,6 +62,8 @@ def stft(signals, frame_length, frame_step, fft_length=None, Raises: ValueError: If `signals` is not at least rank 1, `frame_length` is not scalar, or `frame_step` is not scalar. 
+ + [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ with ops.name_scope(name, 'stft', [signals, frame_length, frame_step]): @@ -159,8 +160,7 @@ def inverse_stft(stfts, fft_length=None, window_fn=window_ops.hann_window, name=None): - """Computes the inverse [Short-time Fourier Transform](https://en.wikipedia.org/wiki/Short-time_Fourier_transform) - of `stfts`. + """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`. To reconstruct an original waveform, a complimentary window function should be used in inverse_stft. Such a window function can be constructed with @@ -217,6 +217,8 @@ def inverse_stft(stfts, Raises: ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar, `frame_step` is not scalar, or `fft_length` is not scalar. + + [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ with ops.name_scope(name, 'inverse_stft', [stfts]): stfts = ops.convert_to_tensor(stfts, name='stfts') diff --git a/tensorflow/python/ops/signal/window_ops.py b/tensorflow/python/ops/signal/window_ops.py index a9add54401a..730c989cfe9 100644 --- a/tensorflow/python/ops/signal/window_ops.py +++ b/tensorflow/python/ops/signal/window_ops.py @@ -32,7 +32,7 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('signal.hann_window') def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): - """Generate a [Hann window](https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows). + """Generate a [Hann window][hann]. Args: window_length: A scalar `Tensor` indicating the window length to generate. @@ -48,6 +48,8 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): Raises: ValueError: If `dtype` is not a floating point type. + + [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows """ return _raised_cosine_window(name, 'hann_window', window_length, periodic, dtype, 0.5, 0.5) @@ -56,7 +58,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): @tf_export('signal.hamming_window') def hamming_window(window_length, periodic=True, dtype=dtypes.float32, name=None): - """Generate a [Hamming](https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows) window. + """Generate a [Hamming][hamming] window. Args: window_length: A scalar `Tensor` indicating the window length to generate. @@ -72,6 +74,8 @@ def hamming_window(window_length, periodic=True, dtype=dtypes.float32, Raises: ValueError: If `dtype` is not a floating point type. 
+ + [hamming]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows """ return _raised_cosine_window(name, 'hamming_window', window_length, periodic, dtype, 0.54, 0.46) From 4d2b7f2a0a21e9e8b1ea501fb1e220b37f002517 Mon Sep 17 00:00:00 2001 From: mrTsjolder Date: Tue, 9 Apr 2019 01:07:39 +0200 Subject: [PATCH 6/6] get rid of hidden character --- tensorflow/python/ops/rnn_cell_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index fc1d421b515..50ca6c74430 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -504,8 +504,8 @@ class GRUCell(LayerRNNCell): References: Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation: - [Cho et al., 2014](https://aclanthology.coli.uni-saarland.de/papers/D14-1179/d14-1179) - ([pdf](http://emnlp2014.org/papers/pdf/EMNLP2014179.pdf)) + [Cho et al., 2014](https://aclanthology.coli.uni-saarland.de/papers/D14-1179/d14-1179) + ([pdf](http://emnlp2014.org/papers/pdf/EMNLP2014179.pdf)) """ @deprecated(None, "This class is equivalent as tf.keras.layers.GRUCell,"