396 lines
14 KiB
Python
396 lines
14 KiB
Python
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ==============================================================================
|
|
"""Student's t distribution class."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import numpy as np
|
|
|
|
from tensorflow.python.framework import constant_op
|
|
from tensorflow.python.framework import dtypes
|
|
from tensorflow.python.framework import ops
|
|
from tensorflow.python.framework import tensor_shape
|
|
from tensorflow.python.ops import array_ops
|
|
from tensorflow.python.ops import check_ops
|
|
from tensorflow.python.ops import control_flow_ops
|
|
from tensorflow.python.ops import math_ops
|
|
from tensorflow.python.ops import nn
|
|
from tensorflow.python.ops import random_ops
|
|
from tensorflow.python.ops import special_math_ops
|
|
from tensorflow.python.ops.distributions import distribution
|
|
from tensorflow.python.ops.distributions import util as distribution_util
|
|
from tensorflow.python.util import deprecation
|
|
from tensorflow.python.util.tf_export import tf_export
|
|
|
|
|
|
__all__ = [
|
|
"StudentT",
|
|
"StudentTWithAbsDfSoftplusScale",
|
|
]
|
|
|
|
|
|
@tf_export(v1=["distributions.StudentT"])
class StudentT(distribution.Distribution):
  """Student's t-distribution.

  This distribution has parameters: degree of freedom `df`, location `loc`,
  and `scale`.

  #### Mathematical details

  The probability density function (pdf) is,

  ```none
  pdf(x; df, mu, sigma) = (1 + y**2 / df)**(-0.5 (df + 1)) / Z
  where,
  y = (x - mu) / sigma
  Z = abs(sigma) sqrt(df pi) Gamma(0.5 df) / Gamma(0.5 (df + 1))
  ```

  where:
  * `loc = mu`,
  * `scale = sigma`, and,
  * `Z` is the normalization constant, and,
  * `Gamma` is the [gamma function](
    https://en.wikipedia.org/wiki/Gamma_function).

  The StudentT distribution is a member of the [location-scale family](
  https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
  constructed as,

  ```none
  X ~ StudentT(df, loc=0, scale=1)
  Y = loc + scale * X
  ```

  Notice that `scale` has semantics more similar to standard deviation than
  variance. However it is not actually the std. deviation; the Student's
  t-distribution std. dev. is `scale sqrt(df / (df - 2))` when `df > 2`.

  Samples of this distribution are reparameterized (pathwise differentiable).
  The derivatives are computed using the approach described in
  (Figurnov et al., 2018).

  #### Examples

  Examples of initialization of one or a batch of distributions. Note that
  `df`, `loc` and `scale` are all required and must share one floating-point
  dtype.

  ```python
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  # Define a single scalar Student t distribution.
  single_dist = tfd.StudentT(df=3., loc=0., scale=1.)

  # Evaluate the pdf at 1, returning a scalar Tensor.
  single_dist.prob(1.)

  # Define a batch of two scalar valued Student t's.
  # The first has degrees of freedom 2, mean 1, and scale 11.
  # The second 3, 2 and 22.
  multi_dist = tfd.StudentT(df=[2., 3.], loc=[1., 2.], scale=[11., 22.])

  # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
  # returning a length two tensor.
  multi_dist.prob([0, 1.5])

  # Get 3 samples, returning a 3 x 2 tensor.
  multi_dist.sample(3)
  ```

  Arguments are broadcast when possible.

  ```python
  # Define a batch of two Student's t distributions.
  # Both have df 2 and mean 1, but different scales.
  dist = tfd.StudentT(df=2., loc=1., scale=[11., 22.])

  # Evaluate the pdf of both distributions on the same point, 3.0,
  # returning a length 2 tensor.
  dist.prob(3.0)
  ```

  Compute the gradients of samples w.r.t. the parameters:

  ```python
  df = tf.constant(2.0)
  loc = tf.constant(2.0)
  scale = tf.constant(11.0)
  dist = tfd.StudentT(df=df, loc=loc, scale=scale)
  samples = dist.sample(5)  # Shape [5]
  loss = tf.reduce_mean(tf.square(samples))  # Arbitrary loss function
  # Unbiased stochastic gradients of the loss function
  grads = tf.gradients(loss, [df, loc, scale])
  ```

  References:
    Implicit Reparameterization Gradients:
      [Figurnov et al., 2018]
      (http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients)
      ([pdf](http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf))
  """

  @deprecation.deprecated(
      "2019-01-01",
      "The TensorFlow Distributions library has moved to "
      "TensorFlow Probability "
      "(https://github.com/tensorflow/probability). You "
      "should update all references to use `tfp.distributions` "
      "instead of `tf.distributions`.",
      warn_once=True)
  def __init__(self,
               df,
               loc,
               scale,
               validate_args=False,
               allow_nan_stats=True,
               name="StudentT"):
    """Construct Student's t distributions.

    The distributions have degree of freedom `df`, location `loc` (the mean
    whenever `df > 1`), and scale `scale`.

    The parameters `df`, `loc`, and `scale` must be shaped in a way that
    supports broadcasting (e.g. `df + loc + scale` is a valid operation).

    Args:
      df: Floating-point `Tensor`. The degrees of freedom of the
        distribution(s). `df` must contain only positive values.
      loc: Floating-point `Tensor`. The location(s) of the distribution(s).
      scale: Floating-point `Tensor`. The scaling factor(s) for the
        distribution(s). Note that `scale` is not technically the standard
        deviation of this distribution but has semantics more similar to
        standard deviation than variance.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      ValueError: if `df`, `loc`, and `scale` do not share a single
        floating-point dtype (checked by
        `check_ops.assert_same_float_dtype`).
    """
    # Must stay the first statement so that only the constructor arguments
    # (and `self`) are captured.
    parameters = dict(locals())
    with ops.name_scope(name, values=[df, loc, scale]) as name:
      # The positivity check on `df` is only wired in when validation is
      # requested.
      with ops.control_dependencies([check_ops.assert_positive(df)]
                                    if validate_args else []):
        self._df = array_ops.identity(df, name="df")
        self._loc = array_ops.identity(loc, name="loc")
        self._scale = array_ops.identity(scale, name="scale")
        check_ops.assert_same_float_dtype(
            (self._df, self._loc, self._scale))
    super(StudentT, self).__init__(
        dtype=self._scale.dtype,
        reparameterization_type=distribution.FULLY_REPARAMETERIZED,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        parameters=parameters,
        graph_parents=[self._df, self._loc, self._scale],
        name=name)

  @staticmethod
  def _param_shapes(sample_shape):
    # Every parameter is mapped to the same int32 shape tensor.
    return dict(
        zip(("df", "loc", "scale"), (
            [ops.convert_to_tensor(
                sample_shape, dtype=dtypes.int32)] * 3)))

  @property
  def df(self):
    """Degrees of freedom in these Student's t distribution(s)."""
    return self._df

  @property
  def loc(self):
    """Locations of these Student's t distribution(s)."""
    return self._loc

  @property
  def scale(self):
    """Scaling factors of these Student's t distribution(s)."""
    return self._scale

  def _batch_shape_tensor(self):
    # Dynamic broadcast of the shapes of `df`, `loc` and `scale`.
    return array_ops.broadcast_dynamic_shape(
        array_ops.shape(self.df),
        array_ops.broadcast_dynamic_shape(
            array_ops.shape(self.loc), array_ops.shape(self.scale)))

  def _batch_shape(self):
    # Static broadcast of the shapes of `df`, `loc` and `scale`.
    return array_ops.broadcast_static_shape(
        array_ops.broadcast_static_shape(self.df.get_shape(),
                                         self.loc.get_shape()),
        self.scale.get_shape())

  def _event_shape_tensor(self):
    # Scalar events: the event shape is empty. `dtypes.int32` matches the
    # dtype spelling used by `_param_shapes`.
    return constant_op.constant([], dtype=dtypes.int32)

  def _event_shape(self):
    return tensor_shape.TensorShape([])

  def _sample_n(self, n, seed=None):
    # The sampling method comes from the fact that if:
    #   X ~ Normal(0, 1)
    #   Z ~ Chi2(df)
    #   Y = X / sqrt(Z / df)
    # then:
    #   Y ~ StudentT(df).
    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
    normal_sample = random_ops.random_normal(shape, dtype=self.dtype, seed=seed)
    # Broadcast `df` to the full batch shape before drawing the gamma sample.
    df = self.df * array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)
    # Gamma(alpha=df / 2, beta=1 / 2) plays the role of Chi2(df) above; it is
    # drawn with a seed derived from `seed` and the "student_t" salt.
    gamma_sample = random_ops.random_gamma(
        [n],
        0.5 * df,
        beta=0.5,
        dtype=self.dtype,
        seed=distribution_util.gen_new_seed(seed, salt="student_t"))
    samples = normal_sample * math_ops.rsqrt(gamma_sample / df)
    return samples * self.scale + self.loc  # Abs(scale) not wanted.

  def _log_prob(self, x):
    return self._log_unnormalized_prob(x) - self._log_normalization()

  def _log_unnormalized_prob(self, x):
    y = (x - self.loc) / self.scale  # Abs(scale) superfluous.
    return -0.5 * (self.df + 1.) * math_ops.log1p(y**2. / self.df)

  def _log_normalization(self):
    # log Z, with Z as given in the class docstring.
    return (math_ops.log(math_ops.abs(self.scale)) +
            0.5 * math_ops.log(self.df) +
            0.5 * np.log(np.pi) +
            math_ops.lgamma(0.5 * self.df) -
            math_ops.lgamma(0.5 * (self.df + 1.)))

  def _cdf(self, x):
    # Take Abs(scale) to make subsequent where work correctly.
    y = (x - self.loc) / math_ops.abs(self.scale)
    x_t = self.df / (y**2. + self.df)
    # `neg_cdf` is the value used for y < 0; the other side uses its
    # complement.
    neg_cdf = 0.5 * math_ops.betainc(0.5 * self.df, 0.5, x_t)
    return array_ops.where_v2(math_ops.less(y, 0.), neg_cdf, 1. - neg_cdf)

  def _entropy(self):
    # Pair up (df / 2, 1 / 2) along a new trailing axis, broadcast to the
    # batch shape, as the argument to `lbeta`.
    v = array_ops.ones(self.batch_shape_tensor(),
                       dtype=self.dtype)[..., array_ops.newaxis]
    u = v * self.df[..., array_ops.newaxis]
    beta_arg = array_ops.concat([u, v], -1) / 2.
    return (math_ops.log(math_ops.abs(self.scale)) +
            0.5 * math_ops.log(self.df) +
            special_math_ops.lbeta(beta_arg) +
            0.5 * (self.df + 1.) *
            (math_ops.digamma(0.5 * (self.df + 1.)) -
             math_ops.digamma(0.5 * self.df)))

  @distribution_util.AppendDocstring(
      """The mean of Student's T equals `loc` if `df > 1`, otherwise it is
      `NaN`. If `self.allow_nan_stats=False`, then an exception will be raised
      rather than returning `NaN`.""")
  def _mean(self):
    # Broadcast `loc` to the full batch shape.
    mean = self.loc * array_ops.ones(self.batch_shape_tensor(),
                                     dtype=self.dtype)
    if self.allow_nan_stats:
      # Undefined entries (df <= 1) are reported as NaN.
      nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
      return array_ops.where_v2(
          math_ops.greater(
              self.df,
              array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)),
          mean, array_ops.fill(self.batch_shape_tensor(), nan, name="nan"))
    else:
      # Undefined entries (df <= 1) trip the assertion instead.
      return control_flow_ops.with_dependencies(
          [
              check_ops.assert_less(
                  array_ops.ones([], dtype=self.dtype),
                  self.df,
                  message="mean not defined for components of df <= 1"),
          ],
          mean)

  @distribution_util.AppendDocstring("""
      The variance for Student's T equals

      ```
      scale**2 * df / (df - 2), when df > 2
      infinity,                 when 1 < df <= 2
      NaN,                      when df <= 1
      ```
      """)
  def _variance(self):
    # We need to put the tf.where inside the outer tf.where to ensure we never
    # hit a NaN in the gradient.
    denom = array_ops.where_v2(
        math_ops.greater(self.df, 2.), self.df - 2.,
        array_ops.ones_like(self.df))
    # Abs(scale) superfluous.
    var = (array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype) *
           math_ops.square(self.scale) * self.df / denom)
    # When 1 < df <= 2, variance is infinite.
    inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype())
    # Build the threshold in `self.dtype`: a bare Python float fill would not
    # follow the distribution's dtype, and the comparison against `self.df`
    # needs both sides to agree.
    two = np.array(2., dtype=self.dtype.as_numpy_dtype())
    result_where_defined = array_ops.where_v2(
        self.df > array_ops.fill(self.batch_shape_tensor(), two), var,
        array_ops.fill(self.batch_shape_tensor(), inf, name="inf"))

    if self.allow_nan_stats:
      # Undefined entries (df <= 1) are reported as NaN.
      nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
      return array_ops.where_v2(
          math_ops.greater(
              self.df,
              array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)),
          result_where_defined,
          array_ops.fill(self.batch_shape_tensor(), nan, name="nan"))
    else:
      # Undefined entries (df <= 1) trip the assertion instead.
      return control_flow_ops.with_dependencies(
          [
              check_ops.assert_less(
                  array_ops.ones([], dtype=self.dtype),
                  self.df,
                  message="variance not defined for components of df <= 1"),
          ],
          result_where_defined)

  def _mode(self):
    # The mode is `loc` itself.
    return array_ops.identity(self.loc)
|
|
|
|
|
|
class StudentTWithAbsDfSoftplusScale(StudentT):
  """StudentT with `df = floor(abs(df))` and `scale = softplus(scale)`."""

  @deprecation.deprecated(
      "2019-01-01",
      "Use `tfd.StudentT(tf.floor(tf.abs(df)), loc, "
      "tf.nn.softplus(scale)) instead.",
      warn_once=True)
  def __init__(self,
               df,
               loc,
               scale,
               validate_args=False,
               allow_nan_stats=True,
               name="StudentTWithAbsDfSoftplusScale"):
    """Construct a `StudentT` from unconstrained `df` and `scale`.

    Args:
      df: `Tensor` passed through `floor(abs(.))` before being used as the
        degrees of freedom.
      loc: `Tensor` forwarded unchanged as the location.
      scale: `Tensor` passed through `softplus` before being used as the
        scale.
      validate_args: Python `bool`, forwarded to `StudentT`.
      allow_nan_stats: Python `bool`, forwarded to `StudentT`.
      name: Python `str` name prefixed to Ops created by this class.
    """
    # Snapshot the raw constructor arguments before any other local exists.
    parameters = dict(locals())
    with ops.name_scope(name, values=[df, scale]) as name:
      # Transform the inputs first (df, then scale), then hand them to the
      # parent constructor.
      floored_abs_df = math_ops.floor(math_ops.abs(df))
      softplus_scale = nn.softplus(scale, name="softplus_scale")
      super(StudentTWithAbsDfSoftplusScale, self).__init__(
          df=floored_abs_df,
          loc=loc,
          scale=softplus_scale,
          validate_args=validate_args,
          allow_nan_stats=allow_nan_stats,
          name=name)
    # Report the untransformed arguments rather than the ones the parent
    # constructor recorded.
    self._parameters = parameters
|