Merge commit for internal changes
This commit is contained in:
commit
bf589e3da5
@ -1,6 +1,6 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
archive_dir = "eigen-eigen-db7b61411772"
|
||||
archive_dir = "eigen-eigen-0a13bf3e579d"
|
||||
|
||||
cc_library(
|
||||
name = "eigen",
|
||||
|
@ -24,6 +24,14 @@ py_library(
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "contrib_kernels",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//tensorflow/contrib/linear_optimizer/kernels:sdca_ops",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all_files",
|
||||
srcs = glob(
|
||||
|
@ -211,6 +211,18 @@ class FullyConnectedTest(tf.test.TestCase):
|
||||
tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
|
||||
self.assertEqual(1, cnt[0])
|
||||
|
||||
def test_empty_x_results_in_empty_output(self):
|
||||
# Empty x is common if someone masks their input with tf.boolean_mask in
|
||||
# order to drop missing entries, and in a particular batch all entries are
|
||||
# missing.
|
||||
with self.test_session():
|
||||
x = tf.constant([[]], shape=[0, 3])
|
||||
self.assertEqual(0, tf.size(x).eval())
|
||||
y = tf.contrib.layers.fully_connected(x, 2, activation_fn=tf.nn.softmax)
|
||||
tf.initialize_all_variables().run()
|
||||
expected_y = np.array([]).reshape(0,2)
|
||||
np.testing.assert_array_equal(expected_y, y.eval())
|
||||
|
||||
|
||||
class Convolution2dTest(tf.test.TestCase):
|
||||
|
||||
|
@ -22,16 +22,17 @@ These loss ops are, by design, minimal, enabling flexibility in how
|
||||
their output can be used.
|
||||
|
||||
@@reduce_batch_sum
|
||||
@@reduce_batch_mean
|
||||
|
||||
@@absolute_loss
|
||||
@@squared_loss
|
||||
@@logistic_loss
|
||||
|
||||
@@sum_absolute_loss
|
||||
@@sum_squared_loss
|
||||
@@mean_absolute_loss
|
||||
@@mean_squared_loss
|
||||
@@root_mean_squared_loss
|
||||
@@sum_logistic_loss
|
||||
|
||||
@@scalar_absolute_loss
|
||||
@@scalar_squared_loss
|
||||
@@scalar_logistic_loss
|
||||
"""
|
||||
|
||||
@ -39,14 +40,15 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.contrib.layers.python.framework import tensor_util
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import nn
|
||||
|
||||
__all__ = ["reduce_batch_sum", "reduce_batch_mean", "absolute_loss",
|
||||
"squared_loss", "sum_squared_loss", "mean_absolute_loss",
|
||||
"mean_squared_loss", "root_mean_squared_loss",
|
||||
__all__ = ["reduce_batch_sum", "absolute_loss", "squared_loss", "logistic_loss",
|
||||
"sum_absolute_loss", "sum_squared_loss", "sum_logistic_loss",
|
||||
"scalar_absolute_loss", "scalar_squared_loss",
|
||||
"scalar_logistic_loss"]
|
||||
|
||||
|
||||
@ -120,31 +122,11 @@ def reduce_batch_sum(x, name=None):
|
||||
return _reduce_batch(x, math_ops.reduce_sum, name)
|
||||
|
||||
|
||||
def reduce_batch_mean(x, name=None):
|
||||
"""Given a tensor `x`, returns the mean across all dimensions except dim 0.
|
||||
|
||||
Given a tensor with the number of dimensions > 1, reduce_batch_mean
|
||||
will calculate the mean across all dimensions except for dimension
|
||||
0. This function is useful for calculating the mean loss (error)
|
||||
across all examples in a batch when training. As an example, given a
|
||||
tensor of shape [batch_size, d1, d2], this function will calculate
|
||||
the mean across dimensions d1 and d2, returning a tensor of shape
|
||||
[batch_size].
|
||||
|
||||
Tensors of dimension 1 are returned as-is.
|
||||
|
||||
Args:
|
||||
x: A `Tensor` with dimension > 0.
|
||||
name: A name for the operation (optional).
|
||||
|
||||
Returns:
|
||||
A `Tensor` with values averaged across all dimensions > 0.
|
||||
|
||||
Raises:
|
||||
ValueError: If `x` has dimension 0.
|
||||
|
||||
"""
|
||||
return _reduce_batch(x, math_ops.reduce_mean, name)
|
||||
def _validate_predicted_and_target(predicted, target):
|
||||
# TODO(ptucker): Optionally add assert op for shape check, for cases when
|
||||
# shape is not fully defined at graph construction time?
|
||||
predicted.get_shape().assert_is_compatible_with(target.get_shape())
|
||||
tensor_util.assert_same_float_dtype([predicted, target])
|
||||
|
||||
|
||||
def absolute_loss(predicted, target, name=None):
|
||||
@ -172,12 +154,12 @@ def absolute_loss(predicted, target, name=None):
|
||||
with ops.op_scope([predicted, target], name, "absolute_loss") as scope:
|
||||
predicted = ops.convert_to_tensor(predicted, name="predicted")
|
||||
target = ops.convert_to_tensor(target, name="target")
|
||||
predicted.get_shape().assert_is_compatible_with(target.get_shape())
|
||||
_validate_predicted_and_target(predicted, target)
|
||||
return math_ops.abs(target - predicted, name=scope)
|
||||
|
||||
|
||||
def squared_loss(predicted, target, name=None):
|
||||
"""Computes and returns the per-example squared loss.
|
||||
"""Computes and returns the per-example squared loss, divided by 2.
|
||||
|
||||
Computes the per-example squared difference between the target and
|
||||
predicted tensors. The tensors must have the same shape.
|
||||
@ -200,27 +182,33 @@ def squared_loss(predicted, target, name=None):
|
||||
with ops.op_scope([predicted, target], name, "squared_loss") as scope:
|
||||
predicted = ops.convert_to_tensor(predicted, name="predicted")
|
||||
target = ops.convert_to_tensor(target, name="target")
|
||||
predicted.get_shape().assert_is_compatible_with(target.get_shape())
|
||||
return math_ops.square(target - predicted, name=scope)
|
||||
_validate_predicted_and_target(predicted, target)
|
||||
return math_ops.div(math_ops.square(target - predicted), 2.0, name=scope)
|
||||
|
||||
|
||||
def sum_squared_loss(predicted, target, name=None):
|
||||
"""Calculates 1/2 the sum of the squared loss across batches.
|
||||
def logistic_loss(logit, target, name=None):
|
||||
"""Calculates the logistic cross-entropy loss.
|
||||
|
||||
Computes the squared difference between the target and predicted
|
||||
tensors, sums across all dimensions except dimension 0, and divides
|
||||
by 2:
|
||||
**WARNING:** `logit` must be unscaled, while the `target` should be a
|
||||
normalized probability prediction. See
|
||||
`tf.nn.sigmoid_cross_entropy_with_logits` for more details.
|
||||
|
||||
losses = reduce_batch_sum(squared_loss(predicted, target)) / 2.0
|
||||
Args:
|
||||
logit: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
of predicted logit values.
|
||||
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
|
||||
target values. The shape of the target tensor should match the
|
||||
`predicted` tensor.
|
||||
name: A name for the operation (optional).
|
||||
|
||||
where `losses` is a tensor with dimensions [batch_size].
|
||||
Returns:
|
||||
A `Tensor` of the logistic cross-entropy loss.
|
||||
"""
|
||||
return nn.sigmoid_cross_entropy_with_logits(logit, target, name=name)
|
||||
|
||||
The tensors must have the same shape.
|
||||
|
||||
This function is equivalent to typical formulations of L2 loss, and
|
||||
similar to TensorFlow's l2_loss function. It differs from the
|
||||
l2_loss function by allowing the caller to specify both the
|
||||
predicted and target tensors.
|
||||
def _sum_loss(predicted, target, loss_fn, name="sum_loss"):
|
||||
"""Apply loss function, then sum across all non-batch dimensions.
|
||||
|
||||
Args:
|
||||
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
@ -228,30 +216,23 @@ def sum_squared_loss(predicted, target, name=None):
|
||||
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
|
||||
target values. The shape of the target tensor should match the
|
||||
`predicted` tensor.
|
||||
loss_fn: Loss to apply, takes 2 tensors as parameters and returns a tensor.
|
||||
name: A name for the operation (optional).
|
||||
|
||||
Returns:
|
||||
A `[batch_size]` tensor of squared losses summed across all dimensions
|
||||
except dimension 0, divided by 2.
|
||||
|
||||
Raises:
|
||||
ValueError: If `predicted` and `target` shapes do not match.
|
||||
|
||||
A `[batch_size]` tensor of losses, averaged across all dimensions except
|
||||
dimension 0.
|
||||
"""
|
||||
with ops.op_scope([predicted, target], name, "sum_squared_loss") as scope:
|
||||
return math_ops.div(
|
||||
reduce_batch_sum(squared_loss(predicted, target)),
|
||||
2.0,
|
||||
name=scope)
|
||||
return reduce_batch_sum(loss_fn(predicted, target), name=name)
|
||||
|
||||
|
||||
def mean_absolute_loss(predicted, target, name=None):
|
||||
"""Calculates the mean absolute loss across batches.
|
||||
def sum_absolute_loss(predicted, target, name="sum_absolute_loss"):
|
||||
"""Calculates the sum of absolute losses across batches.
|
||||
|
||||
Computes the absolute difference between the target and predicted
|
||||
tensors, averaged across all dimensions except dimension 0:
|
||||
|
||||
losses = reduce_batch_mean(absolute_loss(predicted, target))
|
||||
losses = reduce_batch_sum(absolute_loss(predicted, target))
|
||||
|
||||
where `losses` is a tensor with dimensions [batch_size].
|
||||
|
||||
@ -275,22 +256,26 @@ def mean_absolute_loss(predicted, target, name=None):
|
||||
ValueError: If `predicted` and `target` shapes do not match.
|
||||
|
||||
"""
|
||||
with ops.op_scope([predicted, target], name, "mean_absolute_loss") as scope:
|
||||
return reduce_batch_mean(absolute_loss(predicted, target), name=scope)
|
||||
return _sum_loss(predicted, target, absolute_loss, name=name)
|
||||
|
||||
|
||||
def mean_squared_loss(predicted, target, name=None):
|
||||
"""Calculates the mean squared loss across batches.
|
||||
def sum_squared_loss(predicted, target, name="sum_squared_loss"):
|
||||
"""Calculates the sum of the squared loss across batches.
|
||||
|
||||
Computes the squared difference between the target and predicted
|
||||
tensors, and averages across all dimensions except dimension 0:
|
||||
tensors, sums across all dimensions except dimension 0.
|
||||
|
||||
losses = reduce_batch_mean(squared_loss(predicted, target))
|
||||
losses = reduce_batch_sum(squared_loss(predicted, target))
|
||||
|
||||
where `losses` is a tensor with dimensions [batch_size].
|
||||
|
||||
The tensors must have the same shape.
|
||||
|
||||
This function is equivalent to typical formulations of L2 loss, and
|
||||
similar to TensorFlow's l2_loss function. It differs from the
|
||||
l2_loss function by allowing the caller to specify both the
|
||||
predicted and target tensors.
|
||||
|
||||
Args:
|
||||
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
of predicted values.
|
||||
@ -300,29 +285,63 @@ def mean_squared_loss(predicted, target, name=None):
|
||||
name: A name for the operation (optional).
|
||||
|
||||
Returns:
|
||||
A `[batch_size]` tensor of squared differences, averaged across
|
||||
all dimensions except dimension 0.
|
||||
A `[batch_size]` tensor of squared losses summed across all dimensions
|
||||
except dimension 0.
|
||||
|
||||
Raises:
|
||||
ValueError: If `predicted` and `target` shapes do not match.
|
||||
|
||||
"""
|
||||
with ops.op_scope([predicted, target], name, "mean_squared_loss") as scope:
|
||||
return reduce_batch_mean(squared_loss(predicted, target), name=scope)
|
||||
return _sum_loss(predicted, target, squared_loss, name=name)
|
||||
|
||||
|
||||
def root_mean_squared_loss(predicted, target, name=None):
|
||||
"""Calculates the root mean squared loss across batches.
|
||||
def sum_logistic_loss(logit, target, name="sum_logistic_loss"):
|
||||
"""Calculates the sum of the logistic loss across batches.
|
||||
|
||||
Computes the root mean squared loss between the target and predicted
|
||||
tensors, which is the square root of the mean squared differences
|
||||
between the predicted and target tensors:
|
||||
Computes the logistic between logit and predicted tensors, summed across all
|
||||
dimensions except dimension 0.
|
||||
|
||||
losses = sqrt(mean_squared_loss(predicted, target))
|
||||
**WARNING:** `logit` must be unscaled, while the `target` should be a
|
||||
normalized probability prediction. See
|
||||
`tf.nn.sigmoid_cross_entropy_with_logits` for more details.
|
||||
|
||||
where `losses` is a tensor with dimensions [batch_size].
|
||||
Args:
|
||||
logit: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
of predicted logit values.
|
||||
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
|
||||
target values. The shape of the target tensor should match the
|
||||
`predicted` tensor.
|
||||
name: A name for the operation (optional).
|
||||
|
||||
The tensors must have the same shape.
|
||||
Returns:
|
||||
A `[batch_size]` tensor of logistic losses summed across all dimensions
|
||||
except dimension 0.
|
||||
"""
|
||||
return _sum_loss(logit, target, logistic_loss, name=name)
|
||||
|
||||
|
||||
def _scalar_loss(predicted, target, loss_fn, name=None):
|
||||
"""Reduces losses to a scalar.
|
||||
|
||||
Args:
|
||||
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
of predicted values.
|
||||
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
|
||||
target values. The shape of the target tensor should match the
|
||||
`predicted` tensor.
|
||||
loss_fn: Loss to apply, takes 2 tensors as parameters and returns a tensor.
|
||||
name: A name for the operation (optional).
|
||||
|
||||
Returns:
|
||||
Caculate sum of losses per example, then average across batch.
|
||||
"""
|
||||
with ops.op_scope([predicted, target], name, "scalar_loss") as scope:
|
||||
return math_ops.reduce_mean(
|
||||
_sum_loss(predicted, target, loss_fn), name=scope)
|
||||
|
||||
|
||||
def scalar_absolute_loss(predicted, target, name="scalar_absolute_loss"):
|
||||
"""Reduces absolute losses to a scalar.
|
||||
|
||||
Args:
|
||||
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
@ -333,20 +352,29 @@ def root_mean_squared_loss(predicted, target, name=None):
|
||||
name: A name for the operation (optional).
|
||||
|
||||
Returns:
|
||||
A `[batch_size]` tensor of the root mean squared differences.
|
||||
|
||||
Raises:
|
||||
ValueError: If `predicted` and `target` shapes do not match.
|
||||
|
||||
Caculate sum of absolute losses per example, then average across batch.
|
||||
"""
|
||||
with ops.op_scope([predicted, target],
|
||||
name,
|
||||
"root_mean_squared_loss") as scope:
|
||||
return math_ops.sqrt(mean_squared_loss(predicted, target),
|
||||
name=scope)
|
||||
return _scalar_loss(predicted, target, loss_fn=absolute_loss, name=name)
|
||||
|
||||
|
||||
def scalar_logistic_loss(logit, target, name=None):
|
||||
def scalar_squared_loss(predicted, target, name="scalar_squared_loss"):
|
||||
"""Reduces squared losses to a scalar.
|
||||
|
||||
Args:
|
||||
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
|
||||
of predicted values.
|
||||
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
|
||||
target values. The shape of the target tensor should match the
|
||||
`predicted` tensor.
|
||||
name: A name for the operation (optional).
|
||||
|
||||
Returns:
|
||||
Caculate sum of squared losses per example, then average across batch.
|
||||
"""
|
||||
return _scalar_loss(predicted, target, loss_fn=squared_loss, name=name)
|
||||
|
||||
|
||||
def scalar_logistic_loss(logit, target, name="scalar_logistic_loss"):
|
||||
"""Calculates the logistic cross-entropy loss, averaged across batches.
|
||||
|
||||
**WARNING:** `logit` must be unscaled, while the `target` should be a
|
||||
@ -368,8 +396,5 @@ def scalar_logistic_loss(logit, target, name=None):
|
||||
Raises:
|
||||
ValueError: If `logit` and `target` shapes do not match.
|
||||
"""
|
||||
with ops.op_scope([logit, target], name,
|
||||
"scalar_logistic_loss") as scope:
|
||||
batch_loss = reduce_batch_sum(nn.sigmoid_cross_entropy_with_logits(logit,
|
||||
target))
|
||||
return math_ops.reduce_mean(batch_loss, [0], name=scope)
|
||||
return _scalar_loss(logit, target, loss_fn=logistic_loss, name=name)
|
||||
|
||||
|
@ -21,6 +21,10 @@ from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.layers.python.framework import tensor_util
|
||||
|
||||
pi = 3.14
|
||||
indiana_pi = 3.2 # https://en.wikipedia.org/wiki/Indiana_Pi_Bill
|
||||
|
||||
|
||||
class ReduceBatchSumTest(tf.test.TestCase):
|
||||
@ -89,72 +93,6 @@ class ReduceBatchSumTest(tf.test.TestCase):
|
||||
self.assertAllClose(expected_result, actual_result.eval())
|
||||
|
||||
|
||||
class ReduceBatchMeanTest(tf.test.TestCase):
|
||||
|
||||
def testDimensionNone(self):
|
||||
with self.test_session():
|
||||
input_array = np.array([
|
||||
[1.0, 2.0],
|
||||
[-1.0, -2.0]
|
||||
], dtype=np.float32)
|
||||
placeholder_vec = tf.placeholder(tf.float32, name="placeholder_vec")
|
||||
expected_result = np.array([1.5, -1.5])
|
||||
actual_result = tf.contrib.layers.reduce_batch_mean(placeholder_vec)
|
||||
self.assertEqual(actual_result.get_shape().as_list(), [None])
|
||||
self.assertAllClose(expected_result, actual_result.eval(feed_dict={
|
||||
placeholder_vec: input_array
|
||||
}))
|
||||
|
||||
def testDimension0(self):
|
||||
with self.test_session():
|
||||
input_vec = tf.constant(2.0)
|
||||
with self.assertRaises(ValueError):
|
||||
tf.contrib.layers.reduce_batch_mean(input_vec)
|
||||
|
||||
def testDimension1(self):
|
||||
with self.test_session():
|
||||
input_vec = tf.constant([1.0, 2.0])
|
||||
expected_result = np.array([1.0, 2.0])
|
||||
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
|
||||
self.assertAllClose(expected_result, actual_result.eval())
|
||||
|
||||
def testDimension2(self):
|
||||
with self.test_session():
|
||||
input_vec = tf.constant([
|
||||
[1.0, 2.0],
|
||||
[-1.0, -2.0]
|
||||
])
|
||||
expected_result = np.array([1.5, -1.5])
|
||||
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
|
||||
self.assertAllClose(expected_result, actual_result.eval())
|
||||
|
||||
def testReturnShape(self):
|
||||
with self.test_session():
|
||||
input_vec = tf.constant([
|
||||
[1.0, 2.0],
|
||||
[-1.0, -2.0]
|
||||
])
|
||||
expected_result = np.array([3.0, -3.0])
|
||||
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
|
||||
self.assertShapeEqual(expected_result, actual_result)
|
||||
|
||||
def testDimensionN(self):
|
||||
with self.test_session():
|
||||
input_vec = tf.constant([
|
||||
[
|
||||
[1.0, 2.0],
|
||||
[3.0, 4.0]
|
||||
],
|
||||
[
|
||||
[5.0, 6.0],
|
||||
[7.0, 8.0]
|
||||
]
|
||||
])
|
||||
expected_result = np.array([2.5, 6.5])
|
||||
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
|
||||
self.assertAllClose(expected_result, actual_result.eval())
|
||||
|
||||
|
||||
class AbsoluteLossTest(tf.test.TestCase):
|
||||
|
||||
def _getTestVectors(self):
|
||||
@ -191,7 +129,7 @@ class SquaredLossTest(tf.test.TestCase):
|
||||
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
|
||||
predicted = tf.constant([1.1, -0.2, 3.3, 1.6], shape=[2, 2],
|
||||
name="predicted")
|
||||
expected_loss = np.array([0.01, 0.04, 0.09, 0.16]).reshape(2, 2)
|
||||
expected_loss = np.array([0.005, 0.02, 0.045, 0.08]).reshape(2, 2)
|
||||
return target, predicted, expected_loss
|
||||
|
||||
def testSquaredLoss(self):
|
||||
@ -250,114 +188,108 @@ class SumSquaredLossTest(tf.test.TestCase):
|
||||
tf.contrib.layers.sum_squared_loss(incompatible_shape, target)
|
||||
|
||||
|
||||
class MeanAbsoluteLossTest(tf.test.TestCase):
|
||||
class ScalarAbsoluteLossTest(tf.test.TestCase):
|
||||
|
||||
def _getTestVectors(self):
|
||||
target = tf.constant([[0.0, 1.0, 2.0],
|
||||
[3.0, 2.0, 4.0]],
|
||||
shape=[2, 3],
|
||||
name="target")
|
||||
predicted = tf.constant([[3.0, -3.0, 0.0],
|
||||
[1.0, 2.0, 0.0]],
|
||||
shape=[2, 3],
|
||||
name="predicted")
|
||||
expected_loss = np.array([3.0, 2.0])
|
||||
return target, predicted, expected_loss
|
||||
|
||||
def testMeanAbsoluteLoss(self):
|
||||
def testScalarAbsoluteLoss(self):
|
||||
with self.test_session():
|
||||
target, predicted, expected_loss = self._getTestVectors()
|
||||
result = tf.contrib.layers.mean_absolute_loss(predicted, target)
|
||||
self.assertAllClose(expected_loss, result.eval())
|
||||
actual = tf.constant([pi], name="pi")
|
||||
actual_placeholder = tf.placeholder(tf.float32)
|
||||
label = tf.constant([indiana_pi], name="lbl")
|
||||
label_placeholder = tf.placeholder(tf.float32, name="lbl_ph")
|
||||
expected_loss = abs(indiana_pi - pi)
|
||||
|
||||
def testMeanAbsoluteLossReturnShape(self):
|
||||
# Both shapes are set.
|
||||
both_shapes_loss = tf.contrib.layers.scalar_absolute_loss(actual, label)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
both_shapes_loss.eval(), expected_loss, decimal=6)
|
||||
|
||||
# No shape for 'actual' - check that the loss layer can be created.
|
||||
no_actual_shape_loss = tf.contrib.layers.scalar_absolute_loss(
|
||||
actual_placeholder, label)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
no_actual_shape_loss.eval({actual_placeholder: [pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
# No shape for 'label' - check that the loss layer can be created.
|
||||
no_label_shape_loss = tf.contrib.layers.scalar_absolute_loss(
|
||||
actual, label_placeholder)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
no_label_shape_loss.eval({label_placeholder: [indiana_pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
# No shapes.
|
||||
no_shape_loss = tf.contrib.layers.scalar_absolute_loss(
|
||||
actual_placeholder, label_placeholder)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
no_shape_loss.eval({label_placeholder: [indiana_pi],
|
||||
actual_placeholder: [pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
# Evaluate the previous one again, but this time with different
|
||||
# (matching) shapes. This should still work.
|
||||
np.testing.assert_almost_equal(
|
||||
no_shape_loss.eval({label_placeholder: [indiana_pi, indiana_pi],
|
||||
actual_placeholder: [pi, pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
|
||||
class ScalarSquaredLossTest(tf.test.TestCase):
|
||||
|
||||
def testScalarSquaredLoss(self):
|
||||
with self.test_session():
|
||||
target, predicted, expected_loss = self._getTestVectors()
|
||||
result = tf.contrib.layers.mean_absolute_loss(predicted, target)
|
||||
self.assertShapeEqual(expected_loss, result)
|
||||
actual = tf.constant([pi], name="pi")
|
||||
actual_placeholder = tf.placeholder(tf.float32)
|
||||
label = tf.constant([indiana_pi], name="lbl")
|
||||
label_placeholder = tf.placeholder(tf.float32, name="lbl_ph")
|
||||
expected_loss = (indiana_pi - pi) * (indiana_pi - pi) / 2
|
||||
|
||||
def testInvalidShapesValueError(self):
|
||||
with self.test_session():
|
||||
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
|
||||
incompatible_shape = tf.constant([0.0, 1.1], shape=[2],
|
||||
name="incompatible_shape")
|
||||
with self.assertRaises(ValueError):
|
||||
tf.contrib.layers.mean_absolute_loss(incompatible_shape, target)
|
||||
# Both shapes are set.
|
||||
both_shapes_loss = tf.contrib.layers.scalar_squared_loss(actual, label)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
both_shapes_loss.eval(), expected_loss, decimal=6)
|
||||
|
||||
# No shape for 'actual' - check that the loss layer can be created.
|
||||
no_actual_shape_loss = tf.contrib.layers.scalar_squared_loss(
|
||||
actual_placeholder, label)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
no_actual_shape_loss.eval({actual_placeholder: [pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
# No shape for 'label' - check that the loss layer can be created.
|
||||
no_label_shape_loss = tf.contrib.layers.scalar_squared_loss(
|
||||
actual, label_placeholder)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
no_label_shape_loss.eval({label_placeholder: [indiana_pi]}),
|
||||
expected_loss,
|
||||
decimal=6)
|
||||
|
||||
# No shapes.
|
||||
no_shape_loss = tf.contrib.layers.scalar_squared_loss(
|
||||
actual_placeholder, label_placeholder)
|
||||
tf.initialize_all_variables().run()
|
||||
np.testing.assert_almost_equal(
|
||||
no_shape_loss.eval({label_placeholder: [indiana_pi],
|
||||
actual_placeholder: [pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
# Evaluate the previous one again, but this time with different
|
||||
# (matching) shapes. This should still work.
|
||||
np.testing.assert_almost_equal(
|
||||
no_shape_loss.eval({label_placeholder: [indiana_pi, indiana_pi],
|
||||
actual_placeholder: [pi, pi]}),
|
||||
expected_loss, decimal=6)
|
||||
|
||||
|
||||
class MeanSquaredLossTest(tf.test.TestCase):
|
||||
class ScalarLogisticLossTest(tf.test.TestCase):
|
||||
|
||||
def _getTestVectors(self):
|
||||
target = tf.constant([[0.0, 1.0, 2.0],
|
||||
[3.0, 2.0, 4.0]],
|
||||
shape=[2, 3],
|
||||
name="target")
|
||||
predicted = tf.constant([[3.0, -3.0, 0.0],
|
||||
[1.0, 2.0, 0.0]],
|
||||
shape=[2, 3],
|
||||
name="predicted")
|
||||
expected_loss = np.array([9.666667, 6.666667])
|
||||
return target, predicted, expected_loss
|
||||
|
||||
def testMeanSquaredLoss(self):
|
||||
with self.test_session():
|
||||
target, predicted, expected_loss = self._getTestVectors()
|
||||
result = tf.contrib.layers.mean_squared_loss(predicted, target)
|
||||
self.assertAllClose(expected_loss, result.eval())
|
||||
|
||||
def testMeanSquaredLossReturnShape(self):
|
||||
with self.test_session():
|
||||
target, predicted, expected_loss = self._getTestVectors()
|
||||
result = tf.contrib.layers.mean_squared_loss(predicted, target)
|
||||
self.assertShapeEqual(expected_loss, result)
|
||||
|
||||
def testInvalidShapesValueError(self):
|
||||
with self.test_session():
|
||||
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
|
||||
incompatible_shape = tf.constant([0.0, 1.1], shape=[2],
|
||||
name="incompatible_shape")
|
||||
with self.assertRaises(ValueError):
|
||||
tf.contrib.layers.mean_squared_loss(incompatible_shape, target)
|
||||
|
||||
|
||||
class RootMeanSquaredLossTest(tf.test.TestCase):
|
||||
|
||||
def _getTestVectors(self):
|
||||
target = tf.constant([[0.0, 1.0, 2.0],
|
||||
[3.0, 2.0, 4.0]],
|
||||
shape=[2, 3],
|
||||
name="target")
|
||||
predicted = tf.constant([[3.0, -3.0, 0.0],
|
||||
[1.0, 2.0, 0.0]],
|
||||
shape=[2, 3],
|
||||
name="predicted")
|
||||
expected_loss = np.array([3.109126, 2.5819889])
|
||||
return target, predicted, expected_loss
|
||||
|
||||
def testRootMeanSquaredLoss(self):
|
||||
with self.test_session():
|
||||
target, predicted, expected_loss = self._getTestVectors()
|
||||
result = tf.contrib.layers.root_mean_squared_loss(predicted, target)
|
||||
self.assertAllClose(expected_loss, result.eval())
|
||||
|
||||
def testRootMeanSquaredLossReturnShape(self):
|
||||
with self.test_session():
|
||||
target, predicted, expected_loss = self._getTestVectors()
|
||||
result = tf.contrib.layers.root_mean_squared_loss(predicted, target)
|
||||
self.assertShapeEqual(expected_loss, result)
|
||||
|
||||
def testInvalidShapesValueError(self):
|
||||
with self.test_session():
|
||||
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
|
||||
incompatible_shape = tf.constant([0.0, 1.1], shape=[2],
|
||||
name="incompatible_shape")
|
||||
with self.assertRaises(ValueError):
|
||||
tf.contrib.layers.root_mean_squared_loss(incompatible_shape, target)
|
||||
|
||||
|
||||
class MeanScalarLogisticLossTest(tf.test.TestCase):
|
||||
|
||||
def _get_mean_sigmoid_logistic_loss(self, logit, target):
|
||||
def _expected_loss(self, logit, target):
|
||||
sigmoid = 1.0 / (1.0 + np.exp(-logit))
|
||||
logistic_loss = (target * -np.log(sigmoid)) - (
|
||||
(1.0 - target) * np.log(1.0 - sigmoid))
|
||||
@ -365,14 +297,13 @@ class MeanScalarLogisticLossTest(tf.test.TestCase):
|
||||
|
||||
return np.sum(batch_losses) / len(batch_losses)
|
||||
|
||||
def test_mean__scalar_logistic_loss(self):
|
||||
def test_scalar_logistic_loss(self):
|
||||
logit = np.array([[9.45, -42], [4.2, 1], [-0.6, 20]])
|
||||
target = np.array([[0.8, 0.9], [0.45, 0.99999], [0.1, 0.0006]])
|
||||
expected_loss = self._get_mean_sigmoid_logistic_loss(logit, target)
|
||||
with self.test_session():
|
||||
result = tf.contrib.layers.scalar_logistic_loss(
|
||||
tf.constant(logit), tf.constant(target))
|
||||
self.assertAllClose(expected_loss, result.eval())
|
||||
self.assertAllClose(self._expected_loss(logit, target), result.eval())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -36,6 +36,7 @@ py_test(
|
||||
name = "sdca_ops_test",
|
||||
srcs = ["python/kernel_tests/sdca_ops_test.py"],
|
||||
srcs_version = "PY2AND3",
|
||||
tags = ["noasan"], # doesn't pass ASAN for some reason
|
||||
deps = [
|
||||
":sdca_ops_py",
|
||||
"//tensorflow:tensorflow_py",
|
||||
|
@ -112,12 +112,13 @@ def make_dense_variable_dict(num_dense_features, num_examples):
|
||||
def get_binary_predictions_for_logistic(predictions, cutoff=0.5):
|
||||
return tf.cast(
|
||||
tf.greater_equal(predictions, tf.ones_like(predictions) * cutoff),
|
||||
tf.float32)
|
||||
dtype=tf.float32)
|
||||
|
||||
|
||||
def get_binary_predictions_for_hinge(predictions):
|
||||
all_ones = tf.ones_like(predictions)
|
||||
return tf.add(tf.sign(predictions), all_ones) / 2
|
||||
return tf.cast(
|
||||
tf.greater_equal(predictions, tf.zeros_like(predictions)),
|
||||
dtype=tf.float32)
|
||||
|
||||
|
||||
# Setup the single container shared across all tests. This is testing proper
|
||||
|
@ -28,9 +28,11 @@ from tensorflow.python.framework import ops
|
||||
from tensorflow.python.framework.load_library import load_op_library
|
||||
from tensorflow.python.framework.ops import convert_to_tensor
|
||||
from tensorflow.python.framework.ops import name_scope
|
||||
from tensorflow.python.framework.ops import op_scope
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import control_flow_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import state_ops
|
||||
from tensorflow.python.ops import variables as var_ops
|
||||
from tensorflow.python.ops.nn import sigmoid_cross_entropy_with_logits
|
||||
from tensorflow.python.platform import resource_loader
|
||||
@ -55,6 +57,7 @@ def _maybe_load_sdca_ops():
|
||||
assert _sdca_ops, 'Could not load _sdca_ops.so'
|
||||
|
||||
|
||||
# TODO(rohananil): add op_scope to appropriate methods.
|
||||
class SdcaModel(object):
|
||||
"""Stochastic dual coordinate ascent solver for linear models.
|
||||
|
||||
@ -255,13 +258,20 @@ class SdcaModel(object):
|
||||
predictions = math_ops.sigmoid(predictions)
|
||||
return predictions
|
||||
|
||||
def minimize(self):
|
||||
def minimize(self, global_step=None, name=None):
|
||||
"""Add operations to train a linear model by minimizing the loss function.
|
||||
|
||||
Args:
|
||||
global_step: Optional `Variable` to increment by one after the
|
||||
variables have been updated.
|
||||
name: Optional name for the returned operation.
|
||||
|
||||
Returns:
|
||||
An Operation that updates the variables passed in the constructor.
|
||||
"""
|
||||
with name_scope('sdca/minimize'):
|
||||
# Technically, the op depends on a lot more than the variables,
|
||||
# but we'll keep the list short.
|
||||
with op_scope([], name, 'sdca/minimize'):
|
||||
sparse_features_indices = []
|
||||
sparse_features_values = []
|
||||
for sf in self._examples['sparse_features']:
|
||||
@ -301,7 +311,7 @@ class SdcaModel(object):
|
||||
assign_ops.append(var.assign(slot_var))
|
||||
assign_group = control_flow_ops.group(*assign_ops)
|
||||
with ops.control_dependencies([assign_group]):
|
||||
return _sdca_ops.sdca_shrink_l1(
|
||||
shrink_l1 = _sdca_ops.sdca_shrink_l1(
|
||||
self._convert_n_to_tensor(
|
||||
self._variables['sparse_features_weights'],
|
||||
as_ref=True),
|
||||
@ -310,6 +320,11 @@ class SdcaModel(object):
|
||||
as_ref=True),
|
||||
l1=self._options['symmetric_l1_regularization'],
|
||||
l2=self._symmetric_l2_regularization())
|
||||
if not global_step:
|
||||
return shrink_l1
|
||||
with ops.control_dependencies([shrink_l1]):
|
||||
with ops.colocate_with(global_step):
|
||||
return state_ops.assign_add(global_step, 1, name=name).op
|
||||
|
||||
def approximate_duality_gap(self):
|
||||
"""Add operations to compute the approximate duality gap.
|
||||
|
@ -968,7 +968,6 @@ tf_cuda_library(
|
||||
tf_cuda_library(
|
||||
name = "gpu_runtime",
|
||||
srcs = [
|
||||
"common_runtime/gpu/gpu_allocator_retry.cc",
|
||||
"common_runtime/gpu/gpu_bfc_allocator.cc",
|
||||
"common_runtime/gpu/gpu_debug_allocator.cc",
|
||||
"common_runtime/gpu/gpu_device.cc",
|
||||
@ -982,7 +981,6 @@ tf_cuda_library(
|
||||
"common_runtime/gpu_device_context.h",
|
||||
],
|
||||
hdrs = [
|
||||
"common_runtime/gpu/gpu_allocator_retry.h",
|
||||
"common_runtime/gpu/gpu_bfc_allocator.h",
|
||||
"common_runtime/gpu/gpu_debug_allocator.h",
|
||||
"common_runtime/gpu/gpu_device.h",
|
||||
@ -991,7 +989,6 @@ tf_cuda_library(
|
||||
"common_runtime/gpu/gpu_util.h",
|
||||
"common_runtime/gpu/pool_allocator.h",
|
||||
"common_runtime/gpu/process_state.h",
|
||||
"common_runtime/gpu/visitable_allocator.h",
|
||||
],
|
||||
copts = tf_copts(),
|
||||
linkstatic = 1,
|
||||
|
@ -420,18 +420,26 @@ void TF_Run_Helper(TF_Session* s, const char* handle,
|
||||
run_options->length)) {
|
||||
status->status =
|
||||
tensorflow::errors::InvalidArgument("Unparseable RunOptions proto");
|
||||
return;
|
||||
}
|
||||
if (run_outputs != nullptr && run_outputs->data != nullptr) {
|
||||
status->status = tensorflow::errors::InvalidArgument(
|
||||
"Passing non-empty run_outputs is invalid.");
|
||||
return;
|
||||
}
|
||||
RunOutputs run_outputs_proto;
|
||||
|
||||
RunOutputs run_outputs_proto;
|
||||
result = s->session->Run(run_options_proto, inputs, output_tensor_names,
|
||||
target_node_names, &outputs, &run_outputs_proto);
|
||||
|
||||
// Serialize back to upstream client, who now owns the new buffer
|
||||
int proto_size = run_outputs_proto.ByteSize();
|
||||
void* str_buf = reinterpret_cast<void*>(operator new(proto_size));
|
||||
run_outputs_proto.SerializeToArray(str_buf, proto_size);
|
||||
run_outputs->data = str_buf;
|
||||
run_outputs->length = proto_size;
|
||||
if (run_outputs != nullptr) {
|
||||
int proto_size = run_outputs_proto.ByteSize();
|
||||
void* str_buf = reinterpret_cast<void*>(operator new(proto_size));
|
||||
run_outputs_proto.SerializeToArray(str_buf, proto_size);
|
||||
run_outputs->data = str_buf;
|
||||
run_outputs->length = proto_size;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// NOTE(zongheng): PRun does not support RunOptions yet.
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
@ -21,9 +21,9 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
GPUAllocatorRetry::GPUAllocatorRetry() : env_(Env::Default()) {}
|
||||
AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {}
|
||||
|
||||
void* GPUAllocatorRetry::AllocateRaw(
|
||||
void* AllocatorRetry::AllocateRaw(
|
||||
std::function<void*(size_t alignment, size_t num_bytes,
|
||||
bool verbose_failure)>
|
||||
alloc_func,
|
@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
|
||||
#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
|
||||
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_
|
||||
#define TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_
|
||||
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
@ -23,9 +23,9 @@ limitations under the License.
|
||||
namespace tensorflow {
|
||||
|
||||
// A retrying wrapper for a memory allocator.
|
||||
class GPUAllocatorRetry {
|
||||
class AllocatorRetry {
|
||||
public:
|
||||
GPUAllocatorRetry();
|
||||
AllocatorRetry();
|
||||
|
||||
// Call 'alloc_func' to obtain memory. On first call,
|
||||
// 'verbose_failure' will be false. If return value is nullptr,
|
||||
@ -50,11 +50,11 @@ class GPUAllocatorRetry {
|
||||
};
|
||||
|
||||
// Implementation details below
|
||||
inline void GPUAllocatorRetry::NotifyDealloc() {
|
||||
inline void AllocatorRetry::NotifyDealloc() {
|
||||
mutex_lock l(mu_);
|
||||
memory_returned_.notify_all();
|
||||
}
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
|
||||
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_
|
702
tensorflow/core/common_runtime/bfc_allocator.cc
Normal file
702
tensorflow/core/common_runtime/bfc_allocator.cc
Normal file
@ -0,0 +1,702 @@
|
||||
/* Copyright 2015 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/common_runtime/bfc_allocator.h"
|
||||
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
#include "tensorflow/core/lib/core/bits.h"
|
||||
#include "tensorflow/core/lib/gtl/stl_util.h"
|
||||
#include "tensorflow/core/lib/strings/numbers.h"
|
||||
#include "tensorflow/core/lib/strings/str_util.h"
|
||||
#include "tensorflow/core/lib/strings/strcat.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
|
||||
bool allow_growth, const string& name)
|
||||
: suballocator_(sub_allocator),
|
||||
name_(name),
|
||||
free_chunks_list_(kInvalidChunkHandle),
|
||||
next_allocation_id_(1) {
|
||||
if (allow_growth) {
|
||||
// 1MiB smallest initial allocation, unless total memory available
|
||||
// is less.
|
||||
curr_region_allocation_bytes_ =
|
||||
RoundedBytes(std::min(total_memory, size_t{1048576}));
|
||||
} else {
|
||||
curr_region_allocation_bytes_ = RoundedBytes(total_memory);
|
||||
}
|
||||
|
||||
// Allocate the requested amount of memory.
|
||||
memory_limit_ = total_memory;
|
||||
stats_.bytes_limit = static_cast<int64>(total_memory);
|
||||
|
||||
// Create a bunch of bins of various good sizes.
|
||||
|
||||
// We create bins to fit all possible ranges that cover the
|
||||
// memory_limit_ starting from allocations up to 256 bytes to
|
||||
// allocations up to (and including) the memory limit.
|
||||
for (BinNum b = 0; b < kNumBins; b++) {
|
||||
size_t bin_size = BinNumToSize(b);
|
||||
VLOG(1) << "Creating bin of max chunk size "
|
||||
<< strings::HumanReadableNumBytes(bin_size);
|
||||
new (BinFromIndex(b)) Bin(this, bin_size);
|
||||
CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
|
||||
CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
|
||||
CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
|
||||
if (b + 1 < kNumBins) {
|
||||
CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BFCAllocator::~BFCAllocator() {
|
||||
// Return memory back.
|
||||
VLOG(2) << "Number of regions allocated: "
|
||||
<< region_manager_.regions().size();
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
suballocator_->Free(region.ptr(), region.memory_size());
|
||||
}
|
||||
|
||||
for (BinNum b = 0; b < kNumBins; b++) {
|
||||
BinFromIndex(b)->~Bin();
|
||||
}
|
||||
}
|
||||
|
||||
BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
|
||||
DCHECK_GE(h, 0);
|
||||
DCHECK_LT(h, static_cast<int>(chunks_.size()));
|
||||
return &(chunks_[h]);
|
||||
}
|
||||
|
||||
bool BFCAllocator::Extend(size_t rounded_bytes) {
|
||||
// Do we have enough space to handle the client's request?
|
||||
// If not, fail immediately.
|
||||
if (total_region_allocated_bytes_ + rounded_bytes > memory_limit_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If curr_region_allocation_bytes_ is not enough to satisfy the
|
||||
// allocation, keep multiplying by a power of two until that is
|
||||
// sufficient.
|
||||
bool increased_allocation = false;
|
||||
while (rounded_bytes > curr_region_allocation_bytes_) {
|
||||
curr_region_allocation_bytes_ *= 2;
|
||||
increased_allocation = true;
|
||||
}
|
||||
|
||||
// Try allocating.
|
||||
size_t bytes = curr_region_allocation_bytes_;
|
||||
void* mem_addr = suballocator_->Alloc(32, bytes);
|
||||
if (mem_addr == nullptr && !started_backpedal_) {
|
||||
// Only backpedal once.
|
||||
started_backpedal_ = true;
|
||||
|
||||
static constexpr float kBackpedalFactor = 0.9;
|
||||
|
||||
// Try allocating less memory.
|
||||
bytes = RoundedBytes(bytes * kBackpedalFactor);
|
||||
while (mem_addr == nullptr && bytes > rounded_bytes) {
|
||||
mem_addr = suballocator_->Alloc(32, bytes);
|
||||
bytes = RoundedBytes(bytes * kBackpedalFactor);
|
||||
}
|
||||
}
|
||||
|
||||
if (mem_addr == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!increased_allocation) {
|
||||
// Increase the region size of the next required allocation.
|
||||
curr_region_allocation_bytes_ *= 2;
|
||||
}
|
||||
|
||||
VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
|
||||
<< " bytes.";
|
||||
|
||||
total_region_allocated_bytes_ += bytes;
|
||||
VLOG(1) << "Total allocated bytes: "
|
||||
<< strings::HumanReadableNumBytes(total_region_allocated_bytes_);
|
||||
|
||||
VLOG(1) << "Allocated memory at " << mem_addr << " to "
|
||||
<< static_cast<void*>(static_cast<char*>(mem_addr) + bytes);
|
||||
region_manager_.AddAllocationRegion(mem_addr, bytes);
|
||||
|
||||
// Create one large chunk for the whole memory space that will
|
||||
// be chunked later.
|
||||
ChunkHandle h = AllocateChunk();
|
||||
BFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
c->ptr = mem_addr;
|
||||
c->size = bytes;
|
||||
c->allocation_id = -1;
|
||||
c->prev = kInvalidChunkHandle;
|
||||
c->next = kInvalidChunkHandle;
|
||||
|
||||
region_manager_.set_handle(c->ptr, h);
|
||||
|
||||
// TODO(vrv): Try to merge this new region with an existing region,
|
||||
// if the address space is contiguous, to avoid fragmentation
|
||||
// across regions.
|
||||
|
||||
// Insert the chunk into the right bin.
|
||||
InsertFreeChunkIntoBin(h);
|
||||
|
||||
// Invoke visitors on newly allocated region.
|
||||
for (auto visitor : region_visitors_) {
|
||||
visitor(mem_addr, bytes);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
|
||||
if (free_chunks_list_ != kInvalidChunkHandle) {
|
||||
ChunkHandle h = free_chunks_list_;
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
free_chunks_list_ = c->next;
|
||||
return h;
|
||||
} else {
|
||||
ChunkHandle h = chunks_.size();
|
||||
chunks_.resize(h + 1);
|
||||
return h;
|
||||
}
|
||||
}
|
||||
|
||||
void BFCAllocator::DeallocateChunk(ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
c->next = free_chunks_list_;
|
||||
free_chunks_list_ = h;
|
||||
}
|
||||
|
||||
void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
|
||||
// Fast path: Try once to allocate without getting the retry_helper_ involved
|
||||
void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
|
||||
if (r != nullptr) {
|
||||
return r;
|
||||
} else {
|
||||
static const int64 kMaxMillisToWait = 10000; // 10 seconds
|
||||
return retry_helper_.AllocateRaw(
|
||||
[this](size_t a, size_t nb, bool v) {
|
||||
return AllocateRawInternal(a, nb, v);
|
||||
},
|
||||
kMaxMillisToWait, unused_alignment, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
|
||||
const AllocationAttributes& allocation_attr) {
|
||||
if (allocation_attr.no_retry_on_failure) {
|
||||
// Return immediately upon the first failure if this is for allocating an
|
||||
// optional scratch space.
|
||||
void* result = AllocateRawInternal(unused_alignment, num_bytes, false);
|
||||
if (result == nullptr) {
|
||||
// The counter incrementing is not thread-safe. But we don't really care.
|
||||
// TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for
|
||||
// more general usage.
|
||||
static int log_counter = 0;
|
||||
if (log_counter < 10) {
|
||||
log_counter++;
|
||||
LOG(WARNING)
|
||||
<< "Ran out of memory trying to allocate "
|
||||
<< strings::HumanReadableNumBytes(num_bytes)
|
||||
<< ". The caller indicates that this is not a failure, but"
|
||||
<< " may mean that there could be performance gains if more"
|
||||
<< " memory is available.";
|
||||
}
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
return AllocateRaw(unused_alignment, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
size_t BFCAllocator::RoundedBytes(size_t bytes) {
|
||||
size_t rounded_bytes =
|
||||
(kMinAllocationSize *
|
||||
((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
|
||||
DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
|
||||
return rounded_bytes;
|
||||
}
|
||||
|
||||
void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
size_t num_bytes,
|
||||
bool dump_log_on_failure) {
|
||||
if (num_bytes == 0) {
|
||||
LOG(ERROR) << "tried to allocate 0 bytes";
|
||||
return nullptr;
|
||||
}
|
||||
// First, always allocate memory of at least kMinAllocationSize
|
||||
// bytes, and always allocate multiples of kMinAllocationSize bytes
|
||||
// so all memory addresses are nicely byte aligned.
|
||||
size_t rounded_bytes = RoundedBytes(num_bytes);
|
||||
|
||||
// The BFC allocator tries to find the best fit first.
|
||||
BinNum bin_num = BinNumForSize(rounded_bytes);
|
||||
|
||||
mutex_lock l(lock_);
|
||||
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
|
||||
if (ptr != nullptr) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Try to extend
|
||||
if (Extend(rounded_bytes)) {
|
||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
|
||||
if (ptr != nullptr) {
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
|
||||
// We searched all bins for an existing free chunk to use and
|
||||
// couldn't find one. This means we must have run out of memory,
|
||||
// Dump the memory log for analysis.
|
||||
if (dump_log_on_failure) {
|
||||
DumpMemoryLog(rounded_bytes);
|
||||
LOG(WARNING) << RenderOccupancy();
|
||||
LOG(WARNING) << "Ran out of memory trying to allocate "
|
||||
<< strings::HumanReadableNumBytes(num_bytes)
|
||||
<< ". See logs for memory state.";
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
|
||||
size_t num_bytes) {
|
||||
// First identify the first bin that could satisfy rounded_bytes.
|
||||
for (; bin_num < kNumBins; bin_num++) {
|
||||
// Start searching from the first bin for the smallest chunk that fits
|
||||
// rounded_bytes.
|
||||
Bin* b = BinFromIndex(bin_num);
|
||||
for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
|
||||
++citer) {
|
||||
const BFCAllocator::ChunkHandle h = (*citer);
|
||||
BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
|
||||
DCHECK(!chunk->in_use());
|
||||
if (chunk->size >= rounded_bytes) {
|
||||
// We found an existing chunk that fits us that wasn't in use, so remove
|
||||
// it from the free bin structure prior to using.
|
||||
RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
|
||||
|
||||
// If we can break the size of the chunk into two reasonably
|
||||
// large pieces, do so.
|
||||
//
|
||||
// TODO(vrv): What should be the criteria when deciding when
|
||||
// to split?
|
||||
if (chunk->size >= rounded_bytes * 2) {
|
||||
SplitChunk(h, rounded_bytes);
|
||||
chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
|
||||
}
|
||||
|
||||
// The requested size of the returned chunk is what the user
|
||||
// has allocated.
|
||||
chunk->requested_size = num_bytes;
|
||||
// Assign a unique id and increment the id counter, marking the
|
||||
// chunk as being in use.
|
||||
chunk->allocation_id = next_allocation_id_++;
|
||||
|
||||
// Update stats.
|
||||
++stats_.num_allocs;
|
||||
stats_.bytes_in_use += chunk->size;
|
||||
stats_.max_bytes_in_use =
|
||||
std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
|
||||
stats_.max_alloc_size =
|
||||
std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
|
||||
|
||||
VLOG(4) << "Returning: " << chunk->ptr;
|
||||
if (VLOG_IS_ON(4)) {
|
||||
LOG(INFO) << "A: " << RenderOccupancy();
|
||||
}
|
||||
return chunk->ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
|
||||
// Allocate the new chunk before we do any ChunkFromHandle
|
||||
ChunkHandle h_new_chunk = AllocateChunk();
|
||||
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
|
||||
|
||||
// Create a new chunk starting num_bytes after c
|
||||
BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
|
||||
new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
|
||||
region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
|
||||
|
||||
// Set the new sizes of the chunks.
|
||||
new_chunk->size = c->size - num_bytes;
|
||||
c->size = num_bytes;
|
||||
|
||||
// The new chunk is not in use.
|
||||
new_chunk->allocation_id = -1;
|
||||
|
||||
// Maintain the pointers.
|
||||
// c <-> c_neighbor becomes
|
||||
// c <-> new_chunk <-> c_neighbor
|
||||
BFCAllocator::ChunkHandle h_neighbor = c->next;
|
||||
new_chunk->prev = h;
|
||||
new_chunk->next = h_neighbor;
|
||||
c->next = h_new_chunk;
|
||||
if (h_neighbor != kInvalidChunkHandle) {
|
||||
Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
|
||||
c_neighbor->prev = h_new_chunk;
|
||||
}
|
||||
|
||||
// Add the newly free chunk to the free bin.
|
||||
InsertFreeChunkIntoBin(h_new_chunk);
|
||||
}
|
||||
|
||||
void BFCAllocator::DeallocateRaw(void* ptr) {
|
||||
DeallocateRawInternal(ptr);
|
||||
retry_helper_.NotifyDealloc();
|
||||
}
|
||||
|
||||
void BFCAllocator::DeallocateRawInternal(void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
LOG(ERROR) << "tried to deallocate nullptr";
|
||||
return;
|
||||
}
|
||||
mutex_lock l(lock_);
|
||||
|
||||
// Find the chunk from the ptr.
|
||||
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle);
|
||||
|
||||
// Consider coalescing it.
|
||||
FreeAndMaybeCoalesce(h);
|
||||
|
||||
if (VLOG_IS_ON(4)) {
|
||||
LOG(INFO) << "F: " << RenderOccupancy();
|
||||
}
|
||||
}
|
||||
|
||||
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1.
|
||||
// We merge Chunk(h2) into Chunk(h1).
|
||||
void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
|
||||
BFCAllocator::ChunkHandle h2) {
|
||||
Chunk* c1 = ChunkFromHandle(h1);
|
||||
Chunk* c2 = ChunkFromHandle(h2);
|
||||
// We can only merge chunks that are not in use.
|
||||
CHECK(!c1->in_use() && !c2->in_use());
|
||||
|
||||
// c1's prev doesn't change, still points to the same ptr, and is
|
||||
// still not in use.
|
||||
|
||||
// Fix up neighbor pointers
|
||||
//
|
||||
// c1 <-> c2 <-> c3 should become
|
||||
// c1 <-> c3
|
||||
|
||||
BFCAllocator::ChunkHandle h3 = c2->next;
|
||||
c1->next = h3;
|
||||
CHECK(c2->prev == h1);
|
||||
if (h3 != kInvalidChunkHandle) {
|
||||
BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
|
||||
c3->prev = h1;
|
||||
}
|
||||
|
||||
// Set the new size
|
||||
c1->size += c2->size;
|
||||
|
||||
DeleteChunk(h2);
|
||||
}
|
||||
|
||||
void BFCAllocator::DeleteChunk(ChunkHandle h) {
|
||||
// Delete h and cleanup all state
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
// VLOG(4) << "Removing: " << c->ptr;
|
||||
region_manager_.erase(c->ptr);
|
||||
DeallocateChunk(h);
|
||||
}
|
||||
|
||||
void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
|
||||
BinNum bin_num = BinNumForSize(c->size);
|
||||
Bin* new_bin = BinFromIndex(bin_num);
|
||||
c->bin_num = bin_num;
|
||||
new_bin->free_chunks.insert(h);
|
||||
}
|
||||
|
||||
void BFCAllocator::RemoveFreeChunkIterFromBin(
|
||||
BFCAllocator::Bin::FreeChunkSet* free_chunks,
|
||||
const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
|
||||
ChunkHandle h = *citer;
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
|
||||
free_chunks->erase(citer);
|
||||
c->bin_num = kInvalidBinNum;
|
||||
}
|
||||
|
||||
void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
|
||||
int count = BinFromIndex(c->bin_num)->free_chunks.erase(h);
|
||||
CHECK(count > 0) << "Could not find chunk in bin";
|
||||
c->bin_num = kInvalidBinNum;
|
||||
}
|
||||
|
||||
void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
|
||||
|
||||
// Mark the chunk as no longer in use
|
||||
c->allocation_id = -1;
|
||||
|
||||
// Updates the stats.
|
||||
stats_.bytes_in_use -= c->size;
|
||||
|
||||
// This chunk is no longer in-use, consider coalescing the chunk
|
||||
// with adjacent chunks.
|
||||
ChunkHandle chunk_to_reassign = h;
|
||||
|
||||
// If the next chunk is free, coalesce the two
|
||||
if (c->next != kInvalidChunkHandle) {
|
||||
Chunk* cnext = ChunkFromHandle(c->next);
|
||||
if (!cnext->in_use()) {
|
||||
// VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " <<
|
||||
// c->ptr;
|
||||
|
||||
chunk_to_reassign = h;
|
||||
|
||||
// Deletes c->next
|
||||
RemoveFreeChunkFromBin(c->next);
|
||||
Merge(h, ChunkFromHandle(h)->next);
|
||||
}
|
||||
}
|
||||
|
||||
// If the previous chunk is free, coalesce the two
|
||||
c = ChunkFromHandle(h);
|
||||
if (c->prev != kInvalidChunkHandle) {
|
||||
Chunk* cprev = ChunkFromHandle(c->prev);
|
||||
if (!cprev->in_use()) {
|
||||
// VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
|
||||
// << cprev->ptr;
|
||||
|
||||
chunk_to_reassign = c->prev;
|
||||
|
||||
// Deletes c
|
||||
RemoveFreeChunkFromBin(c->prev);
|
||||
Merge(ChunkFromHandle(h)->prev, h);
|
||||
c = ChunkFromHandle(h);
|
||||
}
|
||||
}
|
||||
|
||||
InsertFreeChunkIntoBin(chunk_to_reassign);
|
||||
}
|
||||
|
||||
void BFCAllocator::AddAllocVisitor(Visitor visitor) {
|
||||
VLOG(1) << "AddVisitor";
|
||||
mutex_lock l(lock_);
|
||||
region_visitors_.push_back(visitor);
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
visitor(region.ptr(), region.memory_size());
|
||||
}
|
||||
}
|
||||
|
||||
bool BFCAllocator::TracksAllocationSizes() { return true; }
|
||||
|
||||
size_t BFCAllocator::RequestedSize(void* ptr) {
|
||||
mutex_lock l(lock_);
|
||||
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle)
|
||||
<< "Asked for requested size of pointer we never allocated: " << ptr;
|
||||
BFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
return c->requested_size;
|
||||
}
|
||||
|
||||
size_t BFCAllocator::AllocatedSize(void* ptr) {
|
||||
mutex_lock l(lock_);
|
||||
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle)
|
||||
<< "Asked for allocated size of pointer we never allocated: " << ptr;
|
||||
BFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
return c->size;
|
||||
}
|
||||
|
||||
int64 BFCAllocator::AllocationId(void* ptr) {
|
||||
mutex_lock l(lock_);
|
||||
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle)
|
||||
<< "Asked for allocation id of pointer we never allocated: " << ptr;
|
||||
BFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
return c->allocation_id;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void RenderRegion(char* rendered, const size_t resolution,
|
||||
const size_t total_render_size, const size_t offset,
|
||||
const void* base_ptr, const void* ptr, const size_t size,
|
||||
const char c) {
|
||||
const char* base_ptr_c = static_cast<const char*>(base_ptr);
|
||||
const char* ptr_c = static_cast<const char*>(ptr);
|
||||
|
||||
size_t start_location =
|
||||
((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
|
||||
CHECK_GE(start_location, 0);
|
||||
CHECK_LT(start_location, resolution);
|
||||
size_t end_location =
|
||||
((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
|
||||
total_render_size;
|
||||
CHECK_GE(end_location, 0);
|
||||
CHECK_LT(end_location, resolution);
|
||||
|
||||
for (size_t i = start_location; i <= end_location; ++i) {
|
||||
rendered[i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
string BFCAllocator::RenderOccupancy() {
|
||||
// Make a buffer for the ASCII-art representation.
|
||||
const size_t resolution = 100;
|
||||
char rendered[resolution];
|
||||
|
||||
// Compute the total region size to render over
|
||||
size_t total_region_size = 0;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
total_region_size += region.memory_size();
|
||||
}
|
||||
|
||||
// Start out with everything empty
|
||||
RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
|
||||
total_region_size, '_');
|
||||
|
||||
size_t region_offset = 0;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
ChunkHandle h = region_manager_.get_handle(region.ptr());
|
||||
// Then render each chunk left to right.
|
||||
while (h != kInvalidChunkHandle) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
if (c->in_use()) {
|
||||
// Render the wasted space
|
||||
size_t wasted = c->size - c->requested_size;
|
||||
if (wasted > 0) {
|
||||
RenderRegion(rendered, resolution, total_region_size,
|
||||
region_offset + c->requested_size, region.ptr(), c->ptr,
|
||||
wasted, 'x');
|
||||
}
|
||||
// Then the occupied space
|
||||
RenderRegion(rendered, resolution, total_region_size, region_offset,
|
||||
region.ptr(), c->ptr, c->requested_size, '*');
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
region_offset += region.memory_size();
|
||||
}
|
||||
|
||||
return StringPiece(rendered, resolution).ToString();
|
||||
}
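The occupancy string returned above is easiest to read with a concrete example. The following is a standalone, simplified sketch of the same idea (ToyChunk and RenderToyOccupancy are invented names for illustration, not TensorFlow code): byte ranges are projected onto a fixed 100-character line, '_' marks free space and '*' marks bytes in use; the real renderer additionally marks internal fragmentation with 'x'.

  // Toy re-creation of the occupancy rendering, for illustration only.
  #include <cstddef>
  #include <iostream>
  #include <string>
  #include <vector>

  struct ToyChunk { std::size_t offset, size; bool in_use; };

  std::string RenderToyOccupancy(const std::vector<ToyChunk>& chunks,
                                 std::size_t total_size) {
    const std::size_t resolution = 100;
    std::string rendered(resolution, '_');
    for (const auto& c : chunks) {
      if (!c.in_use) continue;
      std::size_t start = (c.offset * resolution) / total_size;
      std::size_t end = ((c.offset + c.size - 1) * resolution) / total_size;
      for (std::size_t i = start; i <= end && i < resolution; ++i) {
        rendered[i] = '*';
      }
    }
    return rendered;
  }

  int main() {
    // A 1 MiB region with two allocations: 256 KiB at offset 0 and
    // 128 KiB at offset 512 KiB.
    std::vector<ToyChunk> chunks = {{0, 256 << 10, true},
                                    {512 << 10, 128 << 10, true}};
    std::cout << RenderToyOccupancy(chunks, 1 << 20) << "\n";
  }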
|
||||
|
||||
void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
|
||||
// For each bin: tally up the total number of chunks and bytes.
|
||||
// Note that bins hold only free chunks.
|
||||
for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
|
||||
Bin* b = BinFromIndex(bin_num);
|
||||
|
||||
size_t total_bytes_in_use = 0;
|
||||
size_t total_bytes_in_bin = 0;
|
||||
size_t total_requested_bytes_in_use = 0;
|
||||
size_t total_requested_bytes_in_bin = 0;
|
||||
size_t total_chunks_in_use = 0;
|
||||
size_t total_chunks_in_bin = 0;
|
||||
for (ChunkHandle h : b->free_chunks) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
total_bytes_in_bin += c->size;
|
||||
total_requested_bytes_in_bin += c->requested_size;
|
||||
++total_chunks_in_bin;
|
||||
if (c->in_use()) {
|
||||
total_bytes_in_use += c->size;
|
||||
total_requested_bytes_in_use += c->requested_size;
|
||||
++total_chunks_in_use;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(INFO) << "Bin (" << b->bin_size
|
||||
<< "): \tTotal Chunks: " << total_chunks_in_bin
|
||||
<< ", Chunks in use: " << total_chunks_in_use << " "
|
||||
<< strings::HumanReadableNumBytes(total_bytes_in_bin)
|
||||
<< " allocated for chunks. "
|
||||
<< strings::HumanReadableNumBytes(total_requested_bytes_in_bin)
|
||||
<< " client-requested for chunks. "
|
||||
<< strings::HumanReadableNumBytes(total_bytes_in_use)
|
||||
<< " in use in bin. "
|
||||
<< strings::HumanReadableNumBytes(total_requested_bytes_in_use)
|
||||
<< " client-requested in use in bin.";
|
||||
}
|
||||
|
||||
// Find the bin that we would have liked to allocate in, so we
|
||||
// can get some further analysis about fragmentation.
|
||||
Bin* b = BinForSize(num_bytes);
|
||||
|
||||
LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
|
||||
<< " was " << strings::HumanReadableNumBytes(b->bin_size)
|
||||
<< ", Chunk State: ";
|
||||
|
||||
for (ChunkHandle h : b->free_chunks) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
LOG(INFO) << c->DebugString(this, true);
|
||||
}
|
||||
|
||||
// Next show the chunks that are in use, and also summarize their
|
||||
// number by size.
|
||||
std::map<size_t, int> in_use_by_size;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
ChunkHandle h = region_manager_.get_handle(region.ptr());
|
||||
while (h != kInvalidChunkHandle) {
|
||||
const Chunk* c = ChunkFromHandle(h);
|
||||
if (c->in_use()) {
|
||||
in_use_by_size[c->size]++;
|
||||
LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size;
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
|
||||
h = region_manager_.get_handle(region.ptr());
|
||||
while (h != kInvalidChunkHandle) {
|
||||
const Chunk* c = ChunkFromHandle(h);
|
||||
if (!c->in_use()) {
|
||||
LOG(INFO) << "Free at " << c->ptr << " of size " << c->size;
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(INFO) << " Summary of in-use Chunks by size: ";
|
||||
size_t total_bytes = 0;
|
||||
for (auto& it : in_use_by_size) {
|
||||
LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
|
||||
<< strings::HumanReadableNumBytes(it.first * it.second);
|
||||
total_bytes += (it.first * it.second);
|
||||
}
|
||||
LOG(INFO) << "Sum Total of in-use chunks: "
|
||||
<< strings::HumanReadableNumBytes(total_bytes);
|
||||
LOG(INFO) << "Stats: \n" << stats_.DebugString();
|
||||
}
|
||||
|
||||
void BFCAllocator::GetStats(AllocatorStats* stats) {
|
||||
mutex_lock l(lock_);
|
||||
*stats = stats_;
|
||||
}
|
||||
|
||||
} // namespace tensorflow
|
413
tensorflow/core/common_runtime/bfc_allocator.h
Normal file
@ -0,0 +1,413 @@
|
||||
/* Copyright 2015 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
|
||||
#define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
#include "tensorflow/core/common_runtime/visitable_allocator.h"
|
||||
#include "tensorflow/core/lib/gtl/stl_util.h"
|
||||
#include "tensorflow/core/lib/strings/strcat.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/thread_annotations.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/protobuf/config.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// A memory allocator that implements a 'best-fit with coalescing'
|
||||
// algorithm. This is essentially a very simple version of Doug Lea's
|
||||
// malloc (dlmalloc).
|
||||
//
|
||||
// The goal of this allocator is to support defragmentation via
|
||||
// coalescing. One assumption we make is that the process using this
|
||||
// allocator owns pretty much all of the memory, and that nearly
|
||||
// all requests to allocate memory go through this interface.
|
||||
class BFCAllocator : public VisitableAllocator {
|
||||
public:
|
||||
// Takes ownership of sub_allocator.
|
||||
BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
|
||||
bool allow_growth, const string& name);
|
||||
~BFCAllocator() override;
|
||||
|
||||
string Name() override { return name_; }
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes,
|
||||
const AllocationAttributes& allocation_attr) override;
|
||||
void DeallocateRaw(void* ptr) override;
|
||||
|
||||
void AddAllocVisitor(Visitor visitor) override;
|
||||
|
||||
// Does nothing, because memory is never freed.
|
||||
void AddFreeVisitor(Visitor visitor) override {}
|
||||
|
||||
bool TracksAllocationSizes() override;
|
||||
|
||||
size_t RequestedSize(void* ptr) override;
|
||||
|
||||
size_t AllocatedSize(void* ptr) override;
|
||||
|
||||
int64 AllocationId(void* ptr) override;
|
||||
|
||||
void GetStats(AllocatorStats* stats) override;
|
||||
|
||||
private:
|
||||
struct Bin;
|
||||
|
||||
void* AllocateRawInternal(size_t alignment, size_t num_bytes,
|
||||
bool dump_log_on_failure);
|
||||
void DeallocateRawInternal(void* ptr);
|
||||
|
||||
// A ChunkHandle is an index into the chunks_ vector in BFCAllocator
|
||||
// kInvalidChunkHandle means an invalid chunk
|
||||
typedef int ChunkHandle;
|
||||
static const int kInvalidChunkHandle = -1;
|
||||
|
||||
typedef int BinNum;
|
||||
static const int kInvalidBinNum = -1;
|
||||
static const int kNumBins = 21;
|
||||
|
||||
// Chunks point to memory. Their prev/next pointers form a
|
||||
// doubly-linked list of addresses sorted by base address that
|
||||
// must be contiguous. Chunks contain information about whether
|
||||
// they are in use or whether they are free, and contain a pointer
|
||||
// to the bin they are in.
|
||||
struct Chunk {
|
||||
size_t size = 0; // Full size of buffer.
|
||||
|
||||
// We sometimes give chunks that are larger than needed to reduce
|
||||
// fragmentation. requested_size keeps track of what the client
|
||||
// actually wanted so we can understand whether our splitting
|
||||
// strategy is efficient.
|
||||
size_t requested_size = 0;
|
||||
|
||||
// allocation_id is set to -1 when the chunk is not in use. It is assigned a
|
||||
// value greater than zero before the chunk is returned from
|
||||
// AllocateRaw, and this value is unique among values assigned by
|
||||
// the parent allocator.
|
||||
int64 allocation_id = -1;
|
||||
void* ptr = nullptr; // pointer to granted subbuffer.
|
||||
|
||||
// If not kInvalidChunkHandle, the memory referred to by 'prev' is directly
|
||||
// preceding the memory used by this chunk. E.g., It should start
|
||||
// at 'ptr - prev->size'
|
||||
ChunkHandle prev = kInvalidChunkHandle;
|
||||
|
||||
// If not kInvalidChunkHandle, the memory referred to by 'next' is directly
|
||||
// following the memory used by this chunk. E.g., It should be at
|
||||
// 'ptr + size'
|
||||
ChunkHandle next = kInvalidChunkHandle;
|
||||
|
||||
// What bin are we in?
|
||||
BinNum bin_num = kInvalidBinNum;
|
||||
|
||||
bool in_use() const { return allocation_id != -1; }
|
||||
|
||||
string DebugString(BFCAllocator* a, bool recurse) {
|
||||
string dbg;
|
||||
strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size),
|
||||
" | Requested Size: ",
|
||||
strings::HumanReadableNumBytes(requested_size),
|
||||
" | in_use: ", in_use());
|
||||
if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
|
||||
Chunk* p = a->ChunkFromHandle(prev);
|
||||
strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
|
||||
}
|
||||
if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
|
||||
Chunk* n = a->ChunkFromHandle(next);
|
||||
strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
|
||||
}
|
||||
return dbg;
|
||||
}
|
||||
};
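A small standalone sketch of the invariant that prev/next encode (DemoChunk is an illustrative stand-in, not the Chunk struct above): neighbouring chunks are address-contiguous, so Merge can grow the left chunk in place and SplitChunk can carve a new chunk at ptr + num_bytes.

  #include <cassert>
  #include <cstddef>

  struct DemoChunk {
    char* ptr = nullptr;
    std::size_t size = 0;
    DemoChunk* prev = nullptr;
    DemoChunk* next = nullptr;
  };

  int main() {
    static char region[1024];
    DemoChunk a, b;
    a.ptr = region;        a.size = 256;  a.next = &b;
    b.ptr = region + 256;  b.size = 768;  b.prev = &a;
    // The invariant the allocator maintains across Split and Merge:
    assert(a.ptr + a.size == b.ptr);
    assert(b.prev == &a && a.next == &b);
    return 0;
  }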
|
||||
|
||||
// A Bin is a collection of similar-sized free chunks.
|
||||
struct Bin {
|
||||
// All chunks in this bin have >= bin_size memory.
|
||||
size_t bin_size = 0;
|
||||
|
||||
struct ChunkComparator {
|
||||
explicit ChunkComparator(BFCAllocator* allocator)
|
||||
: allocator_(allocator) {}
|
||||
// Sort first by size and then use pointer address as a tie breaker.
|
||||
bool operator()(const ChunkHandle ha, const ChunkHandle hb) const {
|
||||
const Chunk* a = allocator_->ChunkFromHandle(ha);
|
||||
const Chunk* b = allocator_->ChunkFromHandle(hb);
|
||||
if (a->size != b->size) {
|
||||
return a->size < b->size;
|
||||
}
|
||||
return a->ptr < b->ptr;
|
||||
}
|
||||
|
||||
private:
|
||||
BFCAllocator* allocator_; // The parent allocator
|
||||
};
|
||||
|
||||
typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
|
||||
// List of free chunks within the bin, sorted by chunk size.
|
||||
// Chunk * not owned.
|
||||
FreeChunkSet free_chunks;
|
||||
Bin(BFCAllocator* allocator, size_t bs)
|
||||
: bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
|
||||
};
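The point of ChunkComparator is that within a bin the free chunks are ordered by size, with the pointer as a tie breaker, so the smallest chunk that can satisfy a request is found first. A minimal standalone illustration, using (size, address) pairs as stand-ins for chunk handles rather than the real FreeChunkSet:

  #include <cstddef>
  #include <cstdint>
  #include <iostream>
  #include <set>
  #include <utility>

  // The pair ordering is exactly "by size, then by pointer".
  using FreeChunk = std::pair<std::size_t, std::uintptr_t>;

  int main() {
    std::set<FreeChunk> free_chunks = {
        {256, 0x1000}, {512, 0x3000}, {512, 0x2000}, {2048, 0x9000}};

    const std::size_t rounded_bytes = 300;
    // The first element not less than (300, 0) is the smallest chunk that
    // can satisfy the request: the 512-byte chunk with the lower address.
    auto it = free_chunks.lower_bound({rounded_bytes, 0});
    if (it != free_chunks.end()) {
      std::cout << "best fit: " << it->first << " bytes at 0x" << std::hex
                << it->second << "\n";  // 512 bytes at 0x2000
    }
  }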
|
||||
|
||||
static const size_t kMinAllocationBits = 8;
|
||||
static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
|
||||
|
||||
// AllocationRegion maps pointers to ChunkHandles for a single
|
||||
// contiguous memory region.
|
||||
//
|
||||
// This class is thread-compatible.
|
||||
class AllocationRegion {
|
||||
public:
|
||||
AllocationRegion(void* ptr, size_t memory_size)
|
||||
: ptr_(ptr),
|
||||
memory_size_(memory_size),
|
||||
end_ptr_(
|
||||
static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
|
||||
DCHECK_EQ(0, memory_size % kMinAllocationSize);
|
||||
const size_t n_handles =
|
||||
(memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
|
||||
handles_ = new ChunkHandle[n_handles];
|
||||
for (size_t i = 0; i < n_handles; i++) {
|
||||
handles_[i] = kInvalidChunkHandle;
|
||||
}
|
||||
}
|
||||
|
||||
AllocationRegion() {}
|
||||
|
||||
~AllocationRegion() { delete[] handles_; }
|
||||
|
||||
AllocationRegion(AllocationRegion&& other) { Swap(other); }
|
||||
|
||||
AllocationRegion& operator=(AllocationRegion&& other) {
|
||||
Swap(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void* ptr() const { return ptr_; }
|
||||
void* end_ptr() const { return end_ptr_; }
|
||||
size_t memory_size() const { return memory_size_; }
|
||||
ChunkHandle get_handle(const void* p) const {
|
||||
return handles_[IndexFor(p)];
|
||||
}
|
||||
void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
|
||||
void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }
|
||||
|
||||
private:
|
||||
void Swap(AllocationRegion& other) {
|
||||
std::swap(ptr_, other.ptr_);
|
||||
std::swap(memory_size_, other.memory_size_);
|
||||
std::swap(end_ptr_, other.end_ptr_);
|
||||
std::swap(handles_, other.handles_);
|
||||
}
|
||||
|
||||
int IndexFor(const void* p) const {
|
||||
std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
|
||||
std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
|
||||
DCHECK_GE(p_int, base_int);
|
||||
DCHECK_LT(p_int, base_int + memory_size_);
|
||||
return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
|
||||
}
|
||||
|
||||
// Metadata about the allocation region.
|
||||
void* ptr_ = nullptr;
|
||||
size_t memory_size_ = 0;
|
||||
void* end_ptr_ = nullptr;
|
||||
|
||||
// Array of size "memory_size / kMinAllocationSize". It is
|
||||
// indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
|
||||
// for the memory allocation represented by "p"
|
||||
ChunkHandle* handles_ = nullptr;
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
|
||||
};
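The handles_ array works because every chunk start is aligned to kMinAllocationSize (256 bytes), so (p - base) >> kMinAllocationBits yields a unique slot per possible chunk start. A tiny standalone illustration with a made-up base address:

  #include <cstdint>
  #include <iostream>

  int main() {
    const int kMinAllocationBits = 8;            // 256-byte granules
    const std::uintptr_t base = 0x10000000;      // made-up region base

    // Three possible chunk starts inside the region, all 256-byte aligned.
    const std::uintptr_t ptrs[] = {base, base + 256, base + 4096};
    for (std::uintptr_t p : ptrs) {
      std::cout << "offset " << (p - base) << " -> handles_["
                << ((p - base) >> kMinAllocationBits) << "]\n";
    }
    // Prints slots 0, 1 and 16: one slot per 256-byte granule of the region.
  }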
|
||||
|
||||
// RegionManager aggregates one or more "AllocationRegions" and provides
|
||||
// a layer of indirection from pointers to the underlying ChunkHandle,
|
||||
// allowing allocation across multiple discontiguous memory regions.
|
||||
//
|
||||
// This class is thread-compatible.
|
||||
class RegionManager {
|
||||
public:
|
||||
RegionManager() {}
|
||||
~RegionManager() {}
|
||||
|
||||
void AddAllocationRegion(void* ptr, size_t memory_size) {
|
||||
// Insert sorted by end_ptr
|
||||
auto entry =
|
||||
std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
|
||||
regions_.insert(entry, AllocationRegion(ptr, memory_size));
|
||||
}
|
||||
|
||||
ChunkHandle get_handle(const void* p) const {
|
||||
return RegionFor(p)->get_handle(p);
|
||||
}
|
||||
|
||||
void set_handle(const void* p, ChunkHandle h) {
|
||||
return MutableRegionFor(p)->set_handle(p, h);
|
||||
}
|
||||
void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
|
||||
|
||||
const std::vector<AllocationRegion>& regions() const { return regions_; }
|
||||
|
||||
private:
|
||||
static bool Comparator(const void* ptr, const AllocationRegion& other) {
|
||||
return ptr < other.end_ptr();
|
||||
}
|
||||
|
||||
AllocationRegion* MutableRegionFor(const void* p) {
|
||||
return const_cast<AllocationRegion*>(RegionFor(p));
|
||||
}
|
||||
|
||||
const AllocationRegion* RegionFor(const void* p) const {
|
||||
auto entry =
|
||||
std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
|
||||
|
||||
if (entry != regions_.end()) {
|
||||
return &(*entry);
|
||||
}
|
||||
|
||||
LOG(FATAL) << "Could not find Region for " << p;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<AllocationRegion> regions_;
|
||||
};
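Because regions_ is kept sorted by end_ptr, upper_bound with the comparator "ptr < region.end_ptr()" returns the first region whose end lies beyond the query pointer, which is the containing region whenever the pointer was allocated from one. A standalone sketch of the same lookup (ToyRegion is illustrative, not the AllocationRegion class):

  #include <algorithm>
  #include <cstdint>
  #include <iostream>
  #include <vector>

  struct ToyRegion {
    std::uintptr_t start;
    std::uintptr_t end;  // one past the last byte
  };

  int main() {
    std::vector<ToyRegion> regions = {{0x1000, 0x2000}, {0x8000, 0xA000}};
    const std::uintptr_t p = 0x9100;
    auto it = std::upper_bound(
        regions.begin(), regions.end(), p,
        [](std::uintptr_t ptr, const ToyRegion& r) { return ptr < r.end; });
    if (it != regions.end() && p >= it->start) {
      std::cout << "found region [0x" << std::hex << it->start << ", 0x"
                << it->end << ")\n";  // the second region
    }
  }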
|
||||
|
||||
// Returns 'bytes' rounded up to the next highest kMinAllocationSize.
|
||||
size_t RoundedBytes(size_t bytes);
|
||||
|
||||
// Try to add a new memory region that can satisfy an allocation of
|
||||
// 'rounded_bytes' bytes. Returns true on success and false on
|
||||
// failure.
|
||||
bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Returns a pointer to an underlying allocated chunk of size
|
||||
// 'rounded_bytes'.
|
||||
void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Splits the chunk specified by 'h' into two chunks, one at least
|
||||
// of size 'num_bytes'.
|
||||
void SplitChunk(ChunkHandle h, size_t num_bytes)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Merges the two chunk handles. Requires that the chunks are
|
||||
// contiguous in their allocation.
|
||||
void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Frees the memory represented by 'h', coalescing the chunk if
|
||||
// possible.
|
||||
void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Adds the chunk 'h' to the proper free bin.
|
||||
void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Removes the free chunk pointed to by 'c' from the set free_chunks.
|
||||
void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
|
||||
const Bin::FreeChunkSet::iterator& c)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Removes a free chunk from the bin.
|
||||
void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Removes the chunk metadata represented by 'h'.
|
||||
void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
AllocatorRetry retry_helper_;
|
||||
|
||||
// Structures immutable after construction
|
||||
size_t memory_limit_ = 0;
|
||||
inline int Log2FloorNonZero(uint64 n) {
|
||||
#if defined(__GNUC__)
|
||||
return 63 ^ __builtin_clzll(n);
|
||||
#else
|
||||
int r = 0;
|
||||
while (n > 0) {
|
||||
r++;
|
||||
n >>= 1;
|
||||
}
|
||||
// Subtract one so this portable fallback matches the __builtin_clzll
// branch above; both should return floor(log2(n)).
return r - 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Map from bin size to Bin
|
||||
Bin* BinFromIndex(BinNum index) {
|
||||
return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
|
||||
}
|
||||
size_t BinNumToSize(BinNum index) {
|
||||
return static_cast<size_t>(256) << index;
|
||||
}
|
||||
BinNum BinNumForSize(size_t bytes) {
|
||||
uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
|
||||
int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
|
||||
return b;
|
||||
}
|
||||
Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
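Putting BinNumToSize and BinNumForSize together: bin b holds free chunks of at least 256 << b bytes, and a request of `bytes` maps to floor(log2(max(bytes, 256) / 256)), capped at kNumBins - 1. A standalone worked example; the helpers below mirror the logic above rather than reusing the class members:

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <iostream>

  int Log2FloorNonZero(std::uint64_t n) {
    int r = -1;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r;
  }

  int BinNumForSize(std::size_t bytes) {
    const int kNumBins = 21;
    std::uint64_t v = std::max<std::size_t>(bytes, 256) >> 8;
    return std::min(kNumBins - 1, Log2FloorNonZero(v));
  }

  int main() {
    for (std::size_t bytes : {1ul, 256ul, 511ul, 512ul, 1024ul, 1048576ul}) {
      std::cout << bytes << " bytes -> bin " << BinNumForSize(bytes) << "\n";
    }
    // Prints bins 0, 0, 0, 1, 2 and 12: 256 B..511 B land in bin 0,
    // 512 B..1023 B in bin 1, and 1 MiB in bin 12 (256 << 12 == 1 MiB).
  }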
|
||||
|
||||
char bins_space_[sizeof(Bin) * kNumBins];
|
||||
|
||||
// The size of the current region allocation.
|
||||
size_t curr_region_allocation_bytes_;
|
||||
|
||||
// The total number of allocated bytes by the allocator.
|
||||
size_t total_region_allocated_bytes_ = 0;
|
||||
|
||||
// An indicator that expansion of a region has hit the limits
|
||||
// of the available memory.
|
||||
bool started_backpedal_ = false;
|
||||
|
||||
std::unique_ptr<SubAllocator> suballocator_;
|
||||
string name_;
|
||||
|
||||
// Structures mutable after construction
|
||||
mutable mutex lock_;
|
||||
RegionManager region_manager_ GUARDED_BY(lock_);
|
||||
|
||||
std::vector<Chunk> chunks_;
|
||||
ChunkHandle free_chunks_list_; // Ptr to head of linked list of free Chunks
|
||||
|
||||
// Called once on each region, ASAP.
|
||||
std::vector<Visitor> region_visitors_;
|
||||
|
||||
// Counter containing the next unique identifier to assign to a
|
||||
// newly-created chunk.
|
||||
int64 next_allocation_id_ GUARDED_BY(lock_);
|
||||
|
||||
// Stats.
|
||||
AllocatorStats stats_ GUARDED_BY(lock_);
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
|
||||
};
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
|
@ -1170,37 +1170,44 @@ FunctionBody* SymbolicGradientHelper::Compute() {
|
||||
Copy();
|
||||
|
||||
Graph* g = gbody_->graph;
|
||||
|
||||
const int num_y = gbody_->ret_nodes.size();
|
||||
|
||||
// Populate 'y_node_outputs_' with node function body outputs.
|
||||
// Populate 'y_grad_nodes' with initial gradient nodes for each return node of
|
||||
// the original function body (these will be 'arg' nodes in the function
|
||||
// gradient body).
|
||||
const int num_y = gbody_->ret_nodes.size();
|
||||
std::vector<Node*> y_grad_nodes;
|
||||
y_grad_nodes.reserve(num_y);
|
||||
std::vector<NodeOut> y_node_outputs;
|
||||
y_node_outputs.reserve(num_y);
|
||||
std::vector<NodeOut> y_grad_node_outputs;
|
||||
y_grad_node_outputs.reserve(num_y);
|
||||
for (int i = 0; i < num_y; ++i) {
|
||||
Node* y = gbody_->ret_nodes[i];
|
||||
y_node_outputs.push_back({y, 0});
|
||||
DCHECK_EQ(y->type_string(), kRetOp);
|
||||
const DataType dtype = y->input_type(0);
|
||||
const int index = gbody_->arg_nodes.size();
|
||||
Node* dy = AddArg(g, dtype, index);
|
||||
gbody_->arg_types.push_back(dtype);
|
||||
gbody_->arg_nodes.push_back(dy);
|
||||
y_grad_nodes.push_back(dy);
|
||||
y_grad_node_outputs.push_back({dy, 0});
|
||||
}
|
||||
|
||||
// Populate 'x_nodes' with function args (not including 'y_grad_nodes').
|
||||
// Populate 'x_nodes' with function args (excluding 'y_grad_node_outputs').
|
||||
const int num_x = fbody_->arg_nodes.size();
|
||||
std::vector<Node*> x_nodes;
|
||||
x_nodes.reserve(num_x);
|
||||
std::vector<NodeOut> x_node_outputs;
|
||||
x_node_outputs.reserve(num_x);
|
||||
for (size_t i = 0; i < fbody_->arg_nodes.size(); ++i) {
|
||||
x_nodes.push_back(gbody_->arg_nodes[i]);
|
||||
x_node_outputs.push_back({gbody_->arg_nodes[i], 0});
|
||||
}
|
||||
|
||||
// Call AddSymbolicGradients which will add nodes to graph 'g' that
|
||||
// compute the function gradient (adding an entry in 'x_grad_nodes' for
|
||||
// each node in 'x_nodes').
|
||||
std::vector<GradNodeOutput> x_grad_nodes(x_nodes.size());
|
||||
TF_CHECK_OK(AddSymbolicGradients(gbody_->ret_nodes, x_nodes, y_grad_nodes,
|
||||
&x_grad_nodes, g));
|
||||
// compute the function gradient (adding an entry in 'x_grad_node_outputs' for
|
||||
// each node in 'x_node_outputs').
|
||||
std::vector<NodeOut> x_grad_node_outputs;
|
||||
TF_CHECK_OK(AddSymbolicGradients(y_node_outputs, x_node_outputs,
|
||||
y_grad_node_outputs, &x_grad_node_outputs,
|
||||
g));
|
||||
|
||||
// Remove the old return nodes from the function body.
|
||||
for (Node* n : gbody_->ret_nodes) {
|
||||
@ -1211,7 +1218,7 @@ FunctionBody* SymbolicGradientHelper::Compute() {
|
||||
// Add new return nodes to the function gradient body for each node
|
||||
// in 'x_grad_nodes'.
|
||||
for (size_t i = 0; i < fbody_->arg_types.size(); ++i) {
|
||||
Endpoint grad = {x_grad_nodes[i].node, x_grad_nodes[i].index};
|
||||
Endpoint grad = {x_grad_node_outputs[i].node, x_grad_node_outputs[i].index};
|
||||
Node* ret = AddRet(g, grad, i);
|
||||
gbody_->ret_nodes.push_back(ret);
|
||||
}
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
|
||||
#include <vector>
|
||||
#include "tensorflow/core/lib/core/notification.h"
|
||||
@ -55,7 +55,7 @@ class FakeAllocator {
|
||||
}
|
||||
|
||||
private:
|
||||
GPUAllocatorRetry retry_;
|
||||
AllocatorRetry retry_;
|
||||
void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
|
||||
mutex mu_;
|
||||
size_t memory_capacity_ GUARDED_BY(mu_);
|
||||
|
@ -15,17 +15,7 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
|
||||
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
|
||||
#include "tensorflow/core/lib/core/bits.h"
|
||||
#include "tensorflow/core/lib/gtl/stl_util.h"
|
||||
#include "tensorflow/core/lib/strings/numbers.h"
|
||||
#include "tensorflow/core/lib/strings/str_util.h"
|
||||
#include "tensorflow/core/lib/strings/strcat.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/stream_executor.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
|
||||
namespace gpu = ::perftools::gputools;
|
||||
|
||||
@ -36,680 +26,9 @@ GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory)
|
||||
|
||||
GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory,
|
||||
const GPUOptions& gpu_options)
|
||||
: device_id_(device_id),
|
||||
free_chunks_list_(kInvalidChunkHandle),
|
||||
next_allocation_id_(1) {
|
||||
// Get a pointer to the stream_executor for this device
|
||||
stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
|
||||
|
||||
if (gpu_options.allow_growth()) {
|
||||
// 1MiB smallest initial allocation, unless total memory available
|
||||
// is less.
|
||||
curr_region_allocation_bytes_ =
|
||||
RoundedBytes(std::min(total_memory, size_t{1048576}));
|
||||
} else {
|
||||
curr_region_allocation_bytes_ = RoundedBytes(total_memory);
|
||||
}
|
||||
|
||||
// Allocate the requested amount of memory.
|
||||
gpu_memory_size_ = total_memory;
|
||||
stats_.bytes_limit = static_cast<int64>(total_memory);
|
||||
|
||||
// Create a bunch of bins of various good sizes.
|
||||
|
||||
// We create bins to fit all possible ranges that cover the
|
||||
// gpu_memory_size_ starting from allocations up to 256 bytes to
|
||||
// allocations up to (and including) the memory limit.
|
||||
for (BinNum b = 0; b < kNumBins; b++) {
|
||||
size_t bin_size = BinNumToSize(b);
|
||||
VLOG(1) << "Creating bin of max chunk size "
|
||||
<< strings::HumanReadableNumBytes(bin_size);
|
||||
new (BinFromIndex(b)) Bin(this, bin_size);
|
||||
CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
|
||||
CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
|
||||
CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
|
||||
if (b + 1 < kNumBins) {
|
||||
CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GPUBFCAllocator::~GPUBFCAllocator() {
|
||||
// Return memory back.
|
||||
VLOG(2) << "Number of regions allocated: "
|
||||
<< region_manager_.regions().size();
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
gpu::DeviceMemoryBase gpu_ptr{region.ptr()};
|
||||
stream_exec_->Deallocate(&gpu_ptr);
|
||||
}
|
||||
|
||||
for (BinNum b = 0; b < kNumBins; b++) {
|
||||
BinFromIndex(b)->~Bin();
|
||||
}
|
||||
}
|
||||
|
||||
GPUBFCAllocator::Chunk* GPUBFCAllocator::ChunkFromHandle(ChunkHandle h) {
|
||||
DCHECK_GE(h, 0);
|
||||
DCHECK_LT(h, static_cast<int>(chunks_.size()));
|
||||
return &(chunks_[h]);
|
||||
}
|
||||
|
||||
bool GPUBFCAllocator::Extend(size_t rounded_bytes) {
|
||||
// Do we have enough space to handle the client's request?
|
||||
// If not, fail immediately.
|
||||
if (total_region_allocated_bytes_ + rounded_bytes > gpu_memory_size_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If curr_region_allocation_bytes_ is not enough to satisfy the
|
||||
// allocation, keep multiplying by a power of two until that is
|
||||
// sufficient.
|
||||
bool increased_allocation = false;
|
||||
while (rounded_bytes > curr_region_allocation_bytes_) {
|
||||
curr_region_allocation_bytes_ *= 2;
|
||||
increased_allocation = true;
|
||||
}
|
||||
|
||||
// Try allocating.
|
||||
size_t bytes = curr_region_allocation_bytes_;
|
||||
gpu::DeviceMemory<char> gpu_mem = stream_exec_->AllocateArray<char>(bytes);
|
||||
if (gpu_mem == nullptr && !started_backpedal_) {
|
||||
// Only backpedal once.
|
||||
started_backpedal_ = true;
|
||||
|
||||
static constexpr float kBackpedalFactor = 0.9;
|
||||
|
||||
// Try allocating less memory.
|
||||
bytes = RoundedBytes(bytes * kBackpedalFactor);
|
||||
while (gpu_mem == nullptr && bytes > rounded_bytes) {
|
||||
gpu_mem = stream_exec_->AllocateArray<char>(bytes);
|
||||
bytes = RoundedBytes(bytes * kBackpedalFactor);
|
||||
}
|
||||
}
|
||||
|
||||
if (gpu_mem == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!increased_allocation) {
|
||||
// Increase the region size of the next required allocation.
|
||||
curr_region_allocation_bytes_ *= 2;
|
||||
}
|
||||
|
||||
VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
|
||||
<< " bytes.";
|
||||
|
||||
total_region_allocated_bytes_ += bytes;
|
||||
VLOG(1) << "Total allocated bytes: "
|
||||
<< strings::HumanReadableNumBytes(total_region_allocated_bytes_);
|
||||
|
||||
void* gpu_mem_base = gpu_mem.opaque();
|
||||
VLOG(1) << "Allocated memory at " << gpu_mem_base << " to "
|
||||
<< static_cast<void*>(static_cast<char*>(gpu_mem_base) + bytes);
|
||||
region_manager_.AddAllocationRegion(gpu_mem_base, bytes);
|
||||
|
||||
// Create one large chunk for the whole memory space that will
|
||||
// be chunked later.
|
||||
ChunkHandle h = AllocateChunk();
|
||||
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
c->ptr = gpu_mem_base;
|
||||
c->size = bytes;
|
||||
c->allocation_id = -1;
|
||||
c->prev = kInvalidChunkHandle;
|
||||
c->next = kInvalidChunkHandle;
|
||||
|
||||
region_manager_.set_handle(c->ptr, h);
|
||||
|
||||
// TODO(vrv): Try to merge this new region with an existing region,
|
||||
// if the address space is contiguous, to avoid fragmentation
|
||||
// across regions.
|
||||
|
||||
// Insert the chunk into the right bin.
|
||||
InsertFreeChunkIntoBin(h);
|
||||
|
||||
// Invoke visitors on newly allocated region.
|
||||
for (auto visitor : region_visitors_) {
|
||||
visitor(gpu_mem_base, bytes);
|
||||
}
|
||||
return true;
|
||||
}
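The growth policy in Extend is easiest to see with numbers: with allow_growth enabled the budget starts at 1 MiB, is doubled until the rounded request fits in a single new region, and is doubled once more after a successful extension so later regions get progressively larger. A minimal sketch of just the sizing arithmetic (the 3 MiB request is an invented example):

  #include <cstddef>
  #include <iostream>

  int main() {
    std::size_t curr_region_allocation_bytes = 1 << 20;  // 1 MiB starting budget
    const std::size_t rounded_bytes = 3u << 20;          // a 3 MiB request
    bool increased_allocation = false;

    // Double until a single new region can hold the request.
    while (rounded_bytes > curr_region_allocation_bytes) {
      curr_region_allocation_bytes *= 2;
      increased_allocation = true;
    }
    std::cout << "allocate a region of " << curr_region_allocation_bytes
              << " bytes\n";  // 4 MiB

    // If no doubling was needed, the real code doubles the budget afterwards
    // so that the next region is larger than this one.
    if (!increased_allocation) {
      curr_region_allocation_bytes *= 2;
    }
  }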
|
||||
|
||||
GPUBFCAllocator::ChunkHandle GPUBFCAllocator::AllocateChunk() {
|
||||
if (free_chunks_list_ != kInvalidChunkHandle) {
|
||||
ChunkHandle h = free_chunks_list_;
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
free_chunks_list_ = c->next;
|
||||
return h;
|
||||
} else {
|
||||
ChunkHandle h = chunks_.size();
|
||||
chunks_.resize(h + 1);
|
||||
return h;
|
||||
}
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::DeallocateChunk(ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
c->next = free_chunks_list_;
|
||||
free_chunks_list_ = h;
|
||||
}
|
||||
|
||||
void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
|
||||
// Fast path: Try once to allocate without getting the retry_helper_ involved
|
||||
void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
|
||||
if (r != nullptr) {
|
||||
return r;
|
||||
} else {
|
||||
static const int64 kMaxMillisToWait = 10000; // 10 seconds
|
||||
return retry_helper_.AllocateRaw(
|
||||
[this](size_t a, size_t nb, bool v) {
|
||||
return AllocateRawInternal(a, nb, v);
|
||||
},
|
||||
kMaxMillisToWait, unused_alignment, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
void* GPUBFCAllocator::AllocateRaw(
|
||||
size_t unused_alignment, size_t num_bytes,
|
||||
const AllocationAttributes& allocation_attr) {
|
||||
if (allocation_attr.no_retry_on_failure) {
|
||||
// Return immediately upon the first failure if this is for allocating an
|
||||
// optional scratch space.
|
||||
void* result = AllocateRawInternal(unused_alignment, num_bytes, false);
|
||||
if (result == nullptr) {
|
||||
// The counter incrementing is not thread-safe. But we don't really care.
|
||||
// TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for
|
||||
// more general usage.
|
||||
static int log_counter = 0;
|
||||
if (log_counter < 10) {
|
||||
log_counter++;
|
||||
LOG(WARNING)
|
||||
<< "Ran out of memory trying to allocate "
|
||||
<< strings::HumanReadableNumBytes(num_bytes)
|
||||
<< ". The caller indicates that this is not a failure, but"
|
||||
<< " may mean that there could be performance gains if more"
|
||||
<< " memory is available.";
|
||||
}
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
return AllocateRaw(unused_alignment, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
size_t GPUBFCAllocator::RoundedBytes(size_t bytes) {
|
||||
size_t rounded_bytes =
|
||||
(kMinAllocationSize *
|
||||
((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
|
||||
DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
|
||||
return rounded_bytes;
|
||||
}
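A short worked example of the rounding above; the free function below simply mirrors the formula with kMinAllocationSize fixed at 256, so every chunk size and every chunk start stays 256-byte aligned:

  #include <cstddef>
  #include <iostream>

  std::size_t RoundedBytesDemo(std::size_t bytes) {
    const std::size_t kMinAllocationSize = 256;
    return kMinAllocationSize *
           ((bytes + kMinAllocationSize - 1) / kMinAllocationSize);
  }

  int main() {
    for (std::size_t b : {1ul, 256ul, 257ul, 1000ul}) {
      std::cout << b << " -> " << RoundedBytesDemo(b) << "\n";
    }
    // Prints 256, 256, 512 and 1024.
  }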
|
||||
|
||||
void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
size_t num_bytes,
|
||||
bool dump_log_on_failure) {
|
||||
if (num_bytes == 0) {
|
||||
LOG(ERROR) << "tried to allocate 0 bytes";
|
||||
return nullptr;
|
||||
}
|
||||
// First, always allocate memory of at least kMinAllocationSize
|
||||
// bytes, and always allocate multiples of kMinAllocationSize bytes
|
||||
// so all memory addresses are nicely byte aligned.
|
||||
size_t rounded_bytes = RoundedBytes(num_bytes);
|
||||
|
||||
// The BFC allocator tries to find the best fit first.
|
||||
BinNum bin_num = BinNumForSize(rounded_bytes);
|
||||
|
||||
mutex_lock l(lock_);
|
||||
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
|
||||
if (ptr != nullptr) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Try to extend
|
||||
if (Extend(rounded_bytes)) {
|
||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
|
||||
if (ptr != nullptr) {
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
|
||||
// We searched all bins for an existing free chunk to use and
|
||||
// couldn't find one. This means we must have run out of memory,
|
||||
// Dump the memory log for analysis.
|
||||
if (dump_log_on_failure) {
|
||||
DumpMemoryLog(rounded_bytes);
|
||||
LOG(WARNING) << RenderOccupancy();
|
||||
LOG(WARNING) << "Ran out of memory trying to allocate "
|
||||
<< strings::HumanReadableNumBytes(num_bytes)
|
||||
<< ". See logs for memory state.";
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void* GPUBFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
|
||||
size_t num_bytes) {
|
||||
// First identify the first bin that could satisfy rounded_bytes.
|
||||
for (; bin_num < kNumBins; bin_num++) {
|
||||
// Start searching from the first bin for the smallest chunk that fits
|
||||
// rounded_bytes.
|
||||
Bin* b = BinFromIndex(bin_num);
|
||||
for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
|
||||
++citer) {
|
||||
const GPUBFCAllocator::ChunkHandle h = (*citer);
|
||||
GPUBFCAllocator::Chunk* chunk = ChunkFromHandle(h);
|
||||
DCHECK(!chunk->in_use());
|
||||
if (chunk->size >= rounded_bytes) {
|
||||
// We found an existing chunk that fits us that wasn't in use, so remove
|
||||
// it from the free bin structure prior to using.
|
||||
RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
|
||||
|
||||
// If we can break the size of the chunk into two reasonably
|
||||
// large pieces, do so.
|
||||
//
|
||||
// TODO(vrv): What should be the criteria when deciding when
|
||||
// to split?
|
||||
if (chunk->size >= rounded_bytes * 2) {
|
||||
SplitChunk(h, rounded_bytes);
|
||||
chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
|
||||
}
|
||||
|
||||
// The requested size of the returned chunk is what the user
|
||||
// has allocated.
|
||||
chunk->requested_size = num_bytes;
|
||||
// Assign a unique id and increment the id counter, marking the
|
||||
// chunk as being in use.
|
||||
chunk->allocation_id = next_allocation_id_++;
|
||||
|
||||
// Update stats.
|
||||
++stats_.num_allocs;
|
||||
stats_.bytes_in_use += chunk->size;
|
||||
stats_.max_bytes_in_use =
|
||||
std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
|
||||
stats_.max_alloc_size =
|
||||
std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
|
||||
|
||||
VLOG(4) << "Returning: " << chunk->ptr;
|
||||
if (VLOG_IS_ON(4)) {
|
||||
LOG(INFO) << "A: " << RenderOccupancy();
|
||||
}
|
||||
return chunk->ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::SplitChunk(GPUBFCAllocator::ChunkHandle h,
|
||||
size_t num_bytes) {
|
||||
// Allocate the new chunk before we do any ChunkFromHandle
|
||||
ChunkHandle h_new_chunk = AllocateChunk();
|
||||
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
|
||||
|
||||
// Create a new chunk starting num_bytes after c
|
||||
GPUBFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
|
||||
new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
|
||||
region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
|
||||
|
||||
// Set the new sizes of the chunks.
|
||||
new_chunk->size = c->size - num_bytes;
|
||||
c->size = num_bytes;
|
||||
|
||||
// The new chunk is not in use.
|
||||
new_chunk->allocation_id = -1;
|
||||
|
||||
// Maintain the pointers.
|
||||
// c <-> c_neighbor becomes
|
||||
// c <-> new_chunk <-> c_neighbor
|
||||
GPUBFCAllocator::ChunkHandle h_neighbor = c->next;
|
||||
new_chunk->prev = h;
|
||||
new_chunk->next = h_neighbor;
|
||||
c->next = h_new_chunk;
|
||||
if (h_neighbor != kInvalidChunkHandle) {
|
||||
Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
|
||||
c_neighbor->prev = h_new_chunk;
|
||||
}
|
||||
|
||||
// Add the newly free chunk to the free bin.
|
||||
InsertFreeChunkIntoBin(h_new_chunk);
|
||||
}
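A standalone sketch of the bookkeeping SplitChunk performs, using a toy struct rather than the real Chunk/ChunkHandle machinery: the front piece keeps num_bytes, the remainder becomes a new free chunk immediately after it, and the neighbour pointers are rewired from c <-> neighbor to c <-> new_chunk <-> neighbor.

  #include <cassert>
  #include <cstddef>
  #include <iostream>

  struct DemoChunk {
    char* ptr = nullptr;
    std::size_t size = 0;
    DemoChunk* prev = nullptr;
    DemoChunk* next = nullptr;
  };

  int main() {
    static char region[4096];
    const std::size_t num_bytes = 1536;  // rounded request

    DemoChunk c;                 // the free chunk found in a bin
    c.ptr = region;
    c.size = sizeof(region);

    DemoChunk new_chunk;         // the remainder produced by the split
    new_chunk.ptr = c.ptr + num_bytes;
    new_chunk.size = c.size - num_bytes;
    c.size = num_bytes;

    // Rewire neighbour pointers (no right-hand neighbour in this example).
    new_chunk.prev = &c;
    new_chunk.next = c.next;
    c.next = &new_chunk;

    assert(c.ptr + c.size == new_chunk.ptr);  // chunks stay contiguous
    std::cout << "allocated " << c.size << " bytes, free remainder "
              << new_chunk.size << " bytes\n";  // 1536 and 2560
  }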
|
||||
|
||||
void GPUBFCAllocator::DeallocateRaw(void* ptr) {
|
||||
DeallocateRawInternal(ptr);
|
||||
retry_helper_.NotifyDealloc();
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
LOG(ERROR) << "tried to deallocate nullptr";
|
||||
return;
|
||||
}
|
||||
mutex_lock l(lock_);
|
||||
|
||||
// Find the chunk from the ptr.
|
||||
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle);
|
||||
|
||||
// Consider coalescing it.
|
||||
FreeAndMaybeCoalesce(h);
|
||||
|
||||
if (VLOG_IS_ON(4)) {
|
||||
LOG(INFO) << "F: " << RenderOccupancy();
|
||||
}
|
||||
}
|
||||
|
||||
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
|
||||
// We merge Chunk(h2) into Chunk(h1).
|
||||
void GPUBFCAllocator::Merge(GPUBFCAllocator::ChunkHandle h1,
|
||||
GPUBFCAllocator::ChunkHandle h2) {
|
||||
Chunk* c1 = ChunkFromHandle(h1);
|
||||
Chunk* c2 = ChunkFromHandle(h2);
|
||||
// We can only merge chunks that are not in use.
|
||||
CHECK(!c1->in_use() && !c2->in_use());
|
||||
|
||||
// c1's prev doesn't change, still points to the same ptr, and is
|
||||
// still not in use.
|
||||
|
||||
// Fix up neighbor pointers
|
||||
//
|
||||
// c1 <-> c2 <-> c3 should become
|
||||
// c1 <-> c3
|
||||
|
||||
GPUBFCAllocator::ChunkHandle h3 = c2->next;
|
||||
c1->next = h3;
|
||||
CHECK(c2->prev == h1);
|
||||
if (h3 != kInvalidChunkHandle) {
|
||||
GPUBFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
|
||||
c3->prev = h1;
|
||||
}
|
||||
|
||||
// Set the new size
|
||||
c1->size += c2->size;
|
||||
|
||||
DeleteChunk(h2);
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::DeleteChunk(ChunkHandle h) {
|
||||
// Delete h and cleanup all state
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
// VLOG(4) << "Removing: " << c->ptr;
|
||||
region_manager_.erase(c->ptr);
|
||||
DeallocateChunk(h);
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::InsertFreeChunkIntoBin(GPUBFCAllocator::ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
|
||||
BinNum bin_num = BinNumForSize(c->size);
|
||||
Bin* new_bin = BinFromIndex(bin_num);
|
||||
c->bin_num = bin_num;
|
||||
new_bin->free_chunks.insert(h);
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::RemoveFreeChunkIterFromBin(
|
||||
GPUBFCAllocator::Bin::FreeChunkSet* free_chunks,
|
||||
const GPUBFCAllocator::Bin::FreeChunkSet::iterator& citer) {
|
||||
ChunkHandle h = *citer;
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
|
||||
free_chunks->erase(citer);
|
||||
c->bin_num = kInvalidBinNum;
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::RemoveFreeChunkFromBin(GPUBFCAllocator::ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
|
||||
int count = BinFromIndex(c->bin_num)->free_chunks.erase(h);
|
||||
CHECK(count > 0) << "Could not find chunk in bin";
|
||||
c->bin_num = kInvalidBinNum;
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::FreeAndMaybeCoalesce(GPUBFCAllocator::ChunkHandle h) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
|
||||
|
||||
// Mark the chunk as no longer in use
|
||||
c->allocation_id = -1;
|
||||
|
||||
// Updates the stats.
|
||||
stats_.bytes_in_use -= c->size;
|
||||
|
||||
// This chunk is no longer in-use, consider coalescing the chunk
|
||||
// with adjacent chunks.
|
||||
ChunkHandle chunk_to_reassign = h;
|
||||
|
||||
// If the next chunk is free, coalesce the two
|
||||
if (c->next != kInvalidChunkHandle) {
|
||||
Chunk* cnext = ChunkFromHandle(c->next);
|
||||
if (!cnext->in_use()) {
|
||||
// VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " <<
|
||||
// c->ptr;
|
||||
|
||||
chunk_to_reassign = h;
|
||||
|
||||
// Deletes c->next
|
||||
RemoveFreeChunkFromBin(c->next);
|
||||
Merge(h, ChunkFromHandle(h)->next);
|
||||
}
|
||||
}
|
||||
|
||||
// If the previous chunk is free, coalesce the two
|
||||
c = ChunkFromHandle(h);
|
||||
if (c->prev != kInvalidChunkHandle) {
|
||||
Chunk* cprev = ChunkFromHandle(c->prev);
|
||||
if (!cprev->in_use()) {
|
||||
// VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
|
||||
// << cprev->ptr;
|
||||
|
||||
chunk_to_reassign = c->prev;
|
||||
|
||||
// Deletes c
|
||||
RemoveFreeChunkFromBin(c->prev);
|
||||
Merge(ChunkFromHandle(h)->prev, h);
|
||||
c = ChunkFromHandle(h);
|
||||
}
|
||||
}
|
||||
|
||||
InsertFreeChunkIntoBin(chunk_to_reassign);
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::AddAllocVisitor(Visitor visitor) {
|
||||
VLOG(1) << "AddVisitor";
|
||||
mutex_lock l(lock_);
|
||||
region_visitors_.push_back(visitor);
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
visitor(region.ptr(), region.memory_size());
|
||||
}
|
||||
}
|
||||
|
||||
bool GPUBFCAllocator::TracksAllocationSizes() { return true; }
|
||||
|
||||
size_t GPUBFCAllocator::RequestedSize(void* ptr) {
|
||||
mutex_lock l(lock_);
|
||||
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle)
|
||||
<< "Asked for requested size of pointer we never allocated: " << ptr;
|
||||
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
return c->requested_size;
|
||||
}
|
||||
|
||||
size_t GPUBFCAllocator::AllocatedSize(void* ptr) {
|
||||
mutex_lock l(lock_);
|
||||
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle)
|
||||
<< "Asked for allocated size of pointer we never allocated: " << ptr;
|
||||
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
return c->size;
|
||||
}
|
||||
|
||||
int64 GPUBFCAllocator::AllocationId(void* ptr) {
|
||||
mutex_lock l(lock_);
|
||||
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
|
||||
CHECK(h != kInvalidChunkHandle)
|
||||
<< "Asked for allocation id of pointer we never allocated: " << ptr;
|
||||
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
|
||||
return c->allocation_id;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void RenderRegion(char* rendered, const size_t resolution,
|
||||
const size_t total_render_size, const size_t offset,
|
||||
const void* base_ptr, const void* ptr, const size_t size,
|
||||
const char c) {
|
||||
const char* base_ptr_c = static_cast<const char*>(base_ptr);
|
||||
const char* ptr_c = static_cast<const char*>(ptr);
|
||||
|
||||
size_t start_location =
|
||||
((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
|
||||
CHECK_GE(start_location, 0);
|
||||
CHECK_LT(start_location, resolution);
|
||||
size_t end_location =
|
||||
((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
|
||||
total_render_size;
|
||||
CHECK_GE(end_location, 0);
|
||||
CHECK_LT(end_location, resolution);
|
||||
|
||||
for (size_t i = start_location; i <= end_location; ++i) {
|
||||
rendered[i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
string GPUBFCAllocator::RenderOccupancy() {
|
||||
// Make a buffer for the ASCII-art representation.
|
||||
const size_t resolution = 100;
|
||||
char rendered[resolution];
|
||||
|
||||
// Compute the total region size to render over
|
||||
size_t total_region_size = 0;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
total_region_size += region.memory_size();
|
||||
}
|
||||
|
||||
// Start out with everything empty
|
||||
RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
|
||||
total_region_size, '_');
|
||||
|
||||
size_t region_offset = 0;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
ChunkHandle h = region_manager_.get_handle(region.ptr());
|
||||
// Then render each chunk left to right.
|
||||
while (h != kInvalidChunkHandle) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
if (c->in_use()) {
|
||||
// Render the wasted space
|
||||
size_t wasted = c->size - c->requested_size;
|
||||
if (wasted > 0) {
|
||||
RenderRegion(rendered, resolution, total_region_size,
|
||||
region_offset + c->requested_size, region.ptr(), c->ptr,
|
||||
wasted, 'x');
|
||||
}
|
||||
// Then the occupied space
|
||||
RenderRegion(rendered, resolution, total_region_size, region_offset,
|
||||
region.ptr(), c->ptr, c->requested_size, '*');
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
region_offset += region.memory_size();
|
||||
}
|
||||
|
||||
return StringPiece(rendered, resolution).ToString();
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::DumpMemoryLog(size_t num_bytes) {
|
||||
// For each bin: tally up the total number of chunks and bytes.
|
||||
// Note that bins hold only free chunks.
|
||||
for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
|
||||
Bin* b = BinFromIndex(bin_num);
|
||||
|
||||
size_t total_bytes_in_use = 0;
|
||||
size_t total_bytes_in_bin = 0;
|
||||
size_t total_requested_bytes_in_use = 0;
|
||||
size_t total_requested_bytes_in_bin = 0;
|
||||
size_t total_chunks_in_use = 0;
|
||||
size_t total_chunks_in_bin = 0;
|
||||
for (ChunkHandle h : b->free_chunks) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
total_bytes_in_bin += c->size;
|
||||
total_requested_bytes_in_bin += c->requested_size;
|
||||
++total_chunks_in_bin;
|
||||
if (c->in_use()) {
|
||||
total_bytes_in_use += c->size;
|
||||
total_requested_bytes_in_use += c->requested_size;
|
||||
++total_chunks_in_use;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(INFO) << "Bin (" << b->bin_size
|
||||
<< "): \tTotal Chunks: " << total_chunks_in_bin
|
||||
<< ", Chunks in use: " << total_chunks_in_use << " "
|
||||
<< strings::HumanReadableNumBytes(total_bytes_in_bin)
|
||||
<< " allocated for chunks. "
|
||||
<< strings::HumanReadableNumBytes(total_requested_bytes_in_bin)
|
||||
<< " client-requested for chunks. "
|
||||
<< strings::HumanReadableNumBytes(total_bytes_in_use)
|
||||
<< " in use in bin. "
|
||||
<< strings::HumanReadableNumBytes(total_requested_bytes_in_use)
|
||||
<< " client-requested in use in bin.";
|
||||
}
|
||||
|
||||
// Find the bin that we would have liked to allocate in, so we
|
||||
// can get some further analysis about fragmentation.
|
||||
Bin* b = BinForSize(num_bytes);
|
||||
|
||||
LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
|
||||
<< " was " << strings::HumanReadableNumBytes(b->bin_size)
|
||||
<< ", Chunk State: ";
|
||||
|
||||
for (ChunkHandle h : b->free_chunks) {
|
||||
Chunk* c = ChunkFromHandle(h);
|
||||
LOG(INFO) << c->DebugString(this, true);
|
||||
}
|
||||
|
||||
// Next show the chunks that are in use, and also summarize their
|
||||
// number by size.
|
||||
std::map<size_t, int> in_use_by_size;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
ChunkHandle h = region_manager_.get_handle(region.ptr());
|
||||
while (h != kInvalidChunkHandle) {
|
||||
const Chunk* c = ChunkFromHandle(h);
|
||||
if (c->in_use()) {
|
||||
in_use_by_size[c->size]++;
|
||||
LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size;
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
|
||||
h = region_manager_.get_handle(region.ptr());
|
||||
while (h != kInvalidChunkHandle) {
|
||||
const Chunk* c = ChunkFromHandle(h);
|
||||
if (!c->in_use()) {
|
||||
LOG(INFO) << "Free at " << c->ptr << " of size " << c->size;
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(INFO) << " Summary of in-use Chunks by size: ";
|
||||
size_t total_bytes = 0;
|
||||
for (auto& it : in_use_by_size) {
|
||||
LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
|
||||
<< strings::HumanReadableNumBytes(it.first * it.second);
|
||||
total_bytes += (it.first * it.second);
|
||||
}
|
||||
LOG(INFO) << "Sum Total of in-use chunks: "
|
||||
<< strings::HumanReadableNumBytes(total_bytes);
|
||||
LOG(INFO) << "Stats: \n" << stats_.DebugString();
|
||||
}
|
||||
|
||||
void GPUBFCAllocator::GetStats(AllocatorStats* stats) {
|
||||
mutex_lock l(lock_);
|
||||
*stats = stats_;
|
||||
}
|
||||
: BFCAllocator(
|
||||
new GPUMemAllocator(
|
||||
GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie()),
|
||||
total_memory, gpu_options.allow_growth(), "gpu_bfc") {}
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -21,396 +21,62 @@ limitations under the License.
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
|
||||
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
|
||||
#include "tensorflow/core/lib/gtl/stl_util.h"
|
||||
#include "tensorflow/core/lib/strings/strcat.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
#include "tensorflow/core/common_runtime/bfc_allocator.h"
|
||||
#include "tensorflow/core/platform/stream_executor.h"
|
||||
#include "tensorflow/core/platform/thread_annotations.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/protobuf/config.pb.h"
|
||||
|
||||
namespace gpu = ::perftools::gputools;
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// A GPU memory allocator that implements a 'best-fit with coalescing'
|
||||
// algorithm. This is essentially a very simple version of Doug Lea's
|
||||
// malloc (dlmalloc).
|
||||
//
|
||||
// The goal of this allocator is to support defragmentation via
|
||||
// coalescing. One assumption we make is that the process using this
|
||||
// allocator owns pretty much all of the GPU memory, and that nearly
|
||||
// all requests to allocate GPU memory go through this interface.
|
||||
class GPUBFCAllocator : public VisitableAllocator {
|
||||
// algorithm.
|
||||
class GPUBFCAllocator : public BFCAllocator {
|
||||
public:
|
||||
// 'device_id' refers to the StreamExecutor ID of the device within
|
||||
// the process and must reference a valid ID in the process.
|
||||
GPUBFCAllocator(int device_id, size_t total_memory);
|
||||
GPUBFCAllocator(int device_id, size_t total_memory,
|
||||
const GPUOptions& gpu_options);
|
||||
~GPUBFCAllocator() override;
|
||||
|
||||
string Name() override { return "gpu_bfc"; }
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
|
||||
void* AllocateRaw(size_t alignment, size_t num_bytes,
|
||||
const AllocationAttributes& allocation_attr) override;
|
||||
void DeallocateRaw(void* ptr) override;
|
||||
|
||||
void AddAllocVisitor(Visitor visitor) override;
|
||||
|
||||
// Does nothing, because gpu memory is never freed.
|
||||
void AddFreeVisitor(Visitor visitor) override {}
|
||||
|
||||
bool TracksAllocationSizes() override;
|
||||
|
||||
size_t RequestedSize(void* ptr) override;
|
||||
|
||||
size_t AllocatedSize(void* ptr) override;
|
||||
|
||||
int64 AllocationId(void* ptr) override;
|
||||
|
||||
void GetStats(AllocatorStats* stats) override;
|
||||
|
||||
private:
|
||||
struct Bin;
|
||||
|
||||
void* AllocateRawInternal(size_t alignment, size_t num_bytes,
|
||||
bool dump_log_on_failure);
|
||||
void DeallocateRawInternal(void* ptr);
|
||||
|
||||
// A ChunkHandle is an index into the chunks_ vector in GPUBFCAllocator
|
||||
// kInvalidChunkHandle means an invalid chunk
|
||||
typedef int ChunkHandle;
|
||||
static const int kInvalidChunkHandle = -1;
|
||||
|
||||
typedef int BinNum;
|
||||
static const int kInvalidBinNum = -1;
|
||||
static const int kNumBins = 21;
|
||||
|
||||
// Chunks point to GPU memory. Their prev/next pointers form a
|
||||
// doubly-linked list of addresses sorted by GPU base address that
|
||||
// must be contiguous. Chunks contain information about whether
|
||||
// they are in use or whether they are free, and contain a pointer
|
||||
// to the bin they are in.
|
||||
struct Chunk {
|
||||
size_t size = 0; // Full size of GPU buffer.
|
||||
|
||||
// We sometimes give chunks that are larger than needed to reduce
|
||||
// fragmentation. requested_size keeps track of what the client
|
||||
// actually wanted so we can understand whether our splitting
|
||||
// strategy is efficient.
|
||||
size_t requested_size = 0;
|
||||
|
||||
// allocation_id is set to -1 when the chunk is not in use. It is assigned a
|
||||
// value greater than zero before the chunk is returned from
|
||||
// AllocateRaw, and this value is unique among values assigned by
|
||||
// the parent allocator.
|
||||
int64 allocation_id = -1;
|
||||
void* ptr = nullptr; // pointer to granted GPU subbuffer.
|
||||
|
||||
// If not kInvalidChunkHandle, the memory referred to by 'prev' is directly
|
||||
// preceding the memory used by this chunk. E.g., It should start
|
||||
// at 'ptr - prev->size'
|
||||
ChunkHandle prev = kInvalidChunkHandle;
|
||||
|
||||
// If not kInvalidChunkHandle, the memory referred to by 'next' is directly
|
||||
// following the memory used by this chunk. E.g., It should be at
|
||||
// 'ptr + size'
|
||||
ChunkHandle next = kInvalidChunkHandle;
|
||||
|
||||
// What bin are we in?
|
||||
BinNum bin_num = kInvalidBinNum;
|
||||
|
||||
bool in_use() const { return allocation_id != -1; }
|
||||
|
||||
string DebugString(GPUBFCAllocator* a, bool recurse) {
|
||||
string dbg;
|
||||
strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size),
|
||||
" | Requested Size: ",
|
||||
strings::HumanReadableNumBytes(requested_size),
|
||||
" | in_use: ", in_use());
|
||||
if (recurse && prev != GPUBFCAllocator::kInvalidChunkHandle) {
|
||||
Chunk* p = a->ChunkFromHandle(prev);
|
||||
strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
|
||||
}
|
||||
if (recurse && next != GPUBFCAllocator::kInvalidChunkHandle) {
|
||||
Chunk* n = a->ChunkFromHandle(next);
|
||||
strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
|
||||
}
|
||||
return dbg;
|
||||
}
|
||||
};
|
||||
|
||||
// A Bin is a collection of similar-sized free chunks.
|
||||
struct Bin {
|
||||
// All chunks in this bin have >= bin_size memory.
|
||||
size_t bin_size = 0;
|
||||
|
||||
struct ChunkComparator {
|
||||
explicit ChunkComparator(GPUBFCAllocator* allocator)
|
||||
: allocator_(allocator) {}
|
||||
// Sort first by size and then use pointer address as a tie breaker.
|
||||
bool operator()(const ChunkHandle ha, const ChunkHandle hb) const {
|
||||
const Chunk* a = allocator_->ChunkFromHandle(ha);
|
||||
const Chunk* b = allocator_->ChunkFromHandle(hb);
|
||||
if (a->size != b->size) {
|
||||
return a->size < b->size;
|
||||
}
|
||||
return a->ptr < b->ptr;
|
||||
}
|
||||
|
||||
private:
|
||||
GPUBFCAllocator* allocator_; // The parent allocator
|
||||
};
|
||||
|
||||
typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
|
||||
// List of free chunks within the bin, sorted by chunk size.
|
||||
// Chunk * not owned.
|
||||
FreeChunkSet free_chunks;
|
||||
Bin(GPUBFCAllocator* allocator, size_t bs)
|
||||
: bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
|
||||
};
|
||||
|
||||
static const size_t kMinAllocationBits = 8;
|
||||
static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
|
||||
|
||||
// AllocationRegion maps pointers to ChunkHandles for a single
|
||||
// contiguous memory region.
|
||||
//
|
||||
// This class is thread-compatible.
|
||||
class AllocationRegion {
|
||||
public:
|
||||
AllocationRegion(void* ptr, size_t memory_size)
|
||||
: ptr_(ptr),
|
||||
memory_size_(memory_size),
|
||||
end_ptr_(
|
||||
static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
|
||||
DCHECK_EQ(0, memory_size % kMinAllocationSize);
|
||||
const size_t n_handles =
|
||||
(memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
|
||||
handles_ = new ChunkHandle[n_handles];
|
||||
for (size_t i = 0; i < n_handles; i++) {
|
||||
handles_[i] = kInvalidChunkHandle;
|
||||
}
|
||||
}
|
||||
|
||||
AllocationRegion() {}
|
||||
|
||||
~AllocationRegion() { delete[] handles_; }
|
||||
|
||||
AllocationRegion(AllocationRegion&& other) { Swap(other); }
|
||||
|
||||
AllocationRegion& operator=(AllocationRegion&& other) {
|
||||
Swap(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void* ptr() const { return ptr_; }
|
||||
void* end_ptr() const { return end_ptr_; }
|
||||
size_t memory_size() const { return memory_size_; }
|
||||
ChunkHandle get_handle(const void* p) const {
|
||||
return handles_[IndexFor(p)];
|
||||
}
|
||||
void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
|
||||
void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }
|
||||
|
||||
private:
|
||||
void Swap(AllocationRegion& other) {
|
||||
std::swap(ptr_, other.ptr_);
|
||||
std::swap(memory_size_, other.memory_size_);
|
||||
std::swap(end_ptr_, other.end_ptr_);
|
||||
std::swap(handles_, other.handles_);
|
||||
}
|
||||
|
||||
int IndexFor(const void* p) const {
|
||||
std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
|
||||
std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
|
||||
DCHECK_GE(p_int, base_int);
|
||||
DCHECK_LT(p_int, base_int + memory_size_);
|
||||
return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
|
||||
}
|
||||
|
||||
// Metadata about the allocation region.
|
||||
void* ptr_ = nullptr;
|
||||
size_t memory_size_ = 0;
|
||||
void* end_ptr_ = nullptr;
|
||||
|
||||
// Array of size "memory_size / kMinAllocationSize". It is
|
||||
// indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
|
||||
// for the memory allocation represented by "p"
|
||||
ChunkHandle* handles_ = nullptr;
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
|
||||
};
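The AllocationRegion above resolves a raw pointer back to a ChunkHandle by taking the pointer's offset from the region base and shifting it down by kMinAllocationBits, i.e. dividing by the 256-byte minimum allocation size. The following is a minimal standalone sketch of just that index arithmetic, reusing the constants shown in the diff; the 1 MiB region and sample offsets in main() are made-up values for illustration, not part of the change.

// region_index_sketch.cc -- illustrative only: mirrors the arithmetic in
// AllocationRegion::IndexFor, outside of the allocator.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static const size_t kMinAllocationBits = 8;
static const size_t kMinAllocationSize = 1 << kMinAllocationBits;  // 256 bytes

// Index of the handle slot covering pointer 'p' in a region starting at 'base'.
// (The real code DCHECKs that p lies inside the region; assumed here.)
int IndexFor(const void* p, const void* base) {
  std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
  std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(base);
  return static_cast<int>((p_int - base_int) >> kMinAllocationBits);
}

int main() {
  std::vector<char> region(1 << 20);  // pretend 1 MiB allocation region
  char* base = region.data();
  std::printf("offset 0    -> slot %d\n", IndexFor(base, base));         // 0
  std::printf("offset 256  -> slot %d\n", IndexFor(base + 256, base));   // 1
  std::printf("offset 4096 -> slot %d\n", IndexFor(base + 4096, base));  // 16
  return 0;
}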
|
||||
|
||||
// RegionManager aggregates one or more "AllocationRegions" and provides
|
||||
// a layer of indirection from pointers to the underlying ChunkHandle,
|
||||
// allowing allocation across multiple discontiguous memory regions.
|
||||
//
|
||||
// This class is thread-compatible.
|
||||
class RegionManager {
|
||||
public:
|
||||
RegionManager() {}
|
||||
~RegionManager() {}
|
||||
|
||||
void AddAllocationRegion(void* ptr, size_t memory_size) {
|
||||
// Insert sorted by end_ptr
|
||||
auto entry =
|
||||
std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
|
||||
regions_.insert(entry, AllocationRegion(ptr, memory_size));
|
||||
}
|
||||
|
||||
ChunkHandle get_handle(const void* p) const {
|
||||
return RegionFor(p)->get_handle(p);
|
||||
}
|
||||
|
||||
void set_handle(const void* p, ChunkHandle h) {
|
||||
return MutableRegionFor(p)->set_handle(p, h);
|
||||
}
|
||||
void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
|
||||
|
||||
const std::vector<AllocationRegion>& regions() const { return regions_; }
|
||||
|
||||
private:
|
||||
static bool Comparator(const void* ptr, const AllocationRegion& other) {
|
||||
return ptr < other.end_ptr();
|
||||
}
|
||||
|
||||
AllocationRegion* MutableRegionFor(const void* p) {
|
||||
return const_cast<AllocationRegion*>(RegionFor(p));
|
||||
}
|
||||
|
||||
const AllocationRegion* RegionFor(const void* p) const {
|
||||
auto entry =
|
||||
std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
|
||||
|
||||
if (entry != regions_.end()) {
|
||||
return &(*entry);
|
||||
}
|
||||
|
||||
LOG(FATAL) << "Could not find Region for " << p;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<AllocationRegion> regions_;
|
||||
};
|
||||
|
||||
// Returns 'bytes' rounded up to the next highest kMinAllocationSize.
|
||||
size_t RoundedBytes(size_t bytes);
|
||||
|
||||
// Try to add a new memory region that can satisfy an allocation of
|
||||
// 'rounded_bytes' bytes. Returns true on success and false on
|
||||
// failure.
|
||||
bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Returns a pointer to an underlying allocated chunk of size
|
||||
// 'rounded_bytes'.
|
||||
void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Splits the chunk specified by 'h' into two chunks, one at least
|
||||
// of size 'num_bytes'.
|
||||
void SplitChunk(ChunkHandle h, size_t num_bytes)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Merges the two chunk handles. Requires that the chunks are
|
||||
// contiguous in their allocation.
|
||||
void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Frees the memory represented by 'h', coalescing the chunk if
|
||||
// possible.
|
||||
void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Adds the chunk 'h' to the proper free bin.
|
||||
void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Removes the free chunk pointed to by 'c' from the set free_chunks.
|
||||
void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
|
||||
const Bin::FreeChunkSet::iterator& c)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Removes a free chunk from the bin.
|
||||
void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Removes the chunk metadata represented by 'h'.
|
||||
void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
GPUAllocatorRetry retry_helper_;
|
||||
|
||||
// Structures immutable after construction
|
||||
const int device_id_;
|
||||
size_t gpu_memory_size_ = 0;
|
||||
inline int Log2FloorNonZero(uint64 n) {
|
||||
#if defined(__GNUC__)
|
||||
return 63 ^ __builtin_clzll(n);
|
||||
#else
|
||||
int r = 0;
|
||||
while (n > 0) {
|
||||
r++;
|
||||
n >>= 1;
|
||||
}
|
||||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Map from bin size to Bin
|
||||
Bin* BinFromIndex(BinNum index) {
|
||||
return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
|
||||
}
|
||||
size_t BinNumToSize(BinNum index) {
|
||||
return static_cast<size_t>(256) << index;
|
||||
}
|
||||
BinNum BinNumForSize(size_t bytes) {
|
||||
uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
|
||||
int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
|
||||
return b;
|
||||
}
|
||||
Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
|
||||
|
||||
char bins_space_[sizeof(Bin) * kNumBins];
|
||||
|
||||
perftools::gputools::StreamExecutor* stream_exec_; // Not owned.
|
||||
|
||||
// The size of the current region allocation.
|
||||
size_t curr_region_allocation_bytes_;
|
||||
|
||||
// The total number of allocated bytes by the allocator.
|
||||
size_t total_region_allocated_bytes_ = 0;
|
||||
|
||||
// An indicator that expansion of a region has hit the limits
|
||||
// of the available GPU memory.
|
||||
bool started_backpedal_ = false;
|
||||
|
||||
// Structures mutable after construction
|
||||
mutable mutex lock_;
|
||||
RegionManager region_manager_ GUARDED_BY(lock_);
|
||||
|
||||
std::vector<Chunk> chunks_;
|
||||
ChunkHandle free_chunks_list_; // Ptr to head of linked list of free Chunks
|
||||
|
||||
// Called once on each region, ASAP.
|
||||
std::vector<Visitor> region_visitors_;
|
||||
|
||||
// Counter containing the next unique identifier to assign to a
|
||||
// newly-created chunk.
|
||||
int64 next_allocation_id_ GUARDED_BY(lock_);
|
||||
|
||||
// Stats.
|
||||
AllocatorStats stats_ GUARDED_BY(lock_);
|
||||
virtual ~GPUBFCAllocator() {}
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
|
||||
};
|
||||
|
||||
// Suballocator for GPU memory.
|
||||
class GPUMemAllocator : public SubAllocator {
|
||||
public:
|
||||
// Note: stream_exec cannot be null.
|
||||
explicit GPUMemAllocator(perftools::gputools::StreamExecutor* stream_exec)
|
||||
: stream_exec_(stream_exec) {
|
||||
CHECK(stream_exec_ != nullptr);
|
||||
}
|
||||
~GPUMemAllocator() override {}
|
||||
|
||||
void* Alloc(size_t alignment, size_t num_bytes) override {
|
||||
void* ptr = nullptr;
|
||||
if (num_bytes > 0) {
|
||||
ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void Free(void* ptr, size_t num_bytes) override {
|
||||
if (ptr != nullptr) {
|
||||
gpu::DeviceMemoryBase gpu_ptr(ptr);
|
||||
stream_exec_->Deallocate(&gpu_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
|
||||
};
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
|
||||
|
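The bin-selection helpers in GPUBFCAllocator above (BinNumToSize, BinNumForSize, Log2FloorNonZero) map a requested byte count to one of kNumBins power-of-two buckets whose smallest size is 256 bytes. Below is a hedged, standalone sketch of that mapping using the same constants as the diff (kMinAllocationBits = 8, kNumBins = 21); the main() driver and its sample sizes are only illustrative.

// bin_math_sketch.cc -- illustrative only: reproduces the bin-selection math
// from GPUBFCAllocator, not the allocator itself.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

static const int kMinAllocationBits = 8;  // smallest chunk is 256 bytes
static const int kNumBins = 21;

// Index of the highest set bit, i.e. floor(log2(n)), for n > 0.
inline int Log2FloorNonZero(uint64_t n) {
  int r = -1;
  while (n > 0) {
    r++;
    n >>= 1;
  }
  return r;
}

// Smallest chunk size handled by bin 'index': 256 << index.
size_t BinNumToSize(int index) { return static_cast<size_t>(256) << index; }

// Bin that a request of 'bytes' falls into.
int BinNumForSize(size_t bytes) {
  uint64_t v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
  return std::min(kNumBins - 1, Log2FloorNonZero(v));
}

int main() {
  const size_t sizes[] = {1, 256, 257, 1024, size_t(1) << 20, size_t(1) << 30};
  for (size_t bytes : sizes) {
    int bin = BinNumForSize(bytes);
    std::printf("%zu bytes -> bin %d (bin size %zu)\n", bytes, bin,
                BinNumToSize(bin));
  }
  return 0;
}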
@ -20,7 +20,7 @@ limitations under the License.
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
|
||||
#include "tensorflow/core/common_runtime/visitable_allocator.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/stream_executor.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
|
@ -226,30 +226,6 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
|
||||
}
|
||||
}
|
||||
|
||||
// Running the polling loop should clear the queue, without an explicit
// poll call here, given a moderate delay.
|
||||
TEST(EventMgr, LongDelayedPolling) {
|
||||
auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
|
||||
EventMgr em(stream_exec, GPUOptions());
|
||||
TEST_EventMgrHelper th(&em);
|
||||
EXPECT_EQ(0, th.queue_size());
|
||||
EXPECT_EQ(0, th.free_size());
|
||||
std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
|
||||
CHECK(stream.get());
|
||||
stream->Init();
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
TensorReferenceVector* v = new TensorReferenceVector;
|
||||
AddTensorReference(v, 100 * 1048576);
|
||||
th.QueueTensors(stream.get(), v);
|
||||
EXPECT_EQ(1 + i, th.queue_size());
|
||||
EXPECT_EQ(0, th.free_size());
|
||||
}
|
||||
th.StartPollingLoop();
|
||||
sleep(1);
|
||||
EXPECT_EQ(0, th.queue_size());
|
||||
EXPECT_EQ(5, th.free_size());
|
||||
}
|
||||
|
||||
// Deleting the EventMgr when events are still pending should shut
|
||||
// down gracefully.
|
||||
TEST(EventMgr, NonEmptyShutdown) {
|
||||
|
@ -24,7 +24,7 @@ limitations under the License.
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
|
||||
#include "tensorflow/core/common_runtime/visitable_allocator.h"
|
||||
#include "tensorflow/core/lib/core/bits.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
@ -35,14 +35,6 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// Interface of an object that does the underlying alloc/free of memory.
|
||||
class SubAllocator {
|
||||
public:
|
||||
virtual ~SubAllocator() {}
|
||||
virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
|
||||
virtual void Free(void* ptr, size_t num_bytes) = 0;
|
||||
};
|
||||
|
||||
// Interface of an object that rounds up integers.
|
||||
class RoundUpInterface {
|
||||
public:
|
||||
|
@ -187,9 +187,17 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
|
||||
gpu::Platform* gpu_platform = GPUMachineManager();
|
||||
gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
|
||||
CHECK(se);
|
||||
Allocator* allocator = new PoolAllocator(
|
||||
100 /*pool_size_limit*/, true /*auto_resize*/,
|
||||
new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host");
|
||||
Allocator* allocator = nullptr;
|
||||
static constexpr bool kCudaHostMemoryUseBFC = true;
|
||||
if (kCudaHostMemoryUseBFC) {
|
||||
allocator =
|
||||
new BFCAllocator(new CUDAHostAllocator(se), 1LL << 36 /*64GB max*/,
|
||||
true /*allow_growth*/, "cuda_host_bfc" /*name*/);
|
||||
} else {
|
||||
allocator = new PoolAllocator(
|
||||
100 /*pool_size_limit*/, true /*auto_resize*/,
|
||||
new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host");
|
||||
}
|
||||
if (LogMemory::IsEnabled()) {
|
||||
// Wrap the allocator to track allocation ids for better logging
|
||||
// at the cost of performance.
|
||||
|
@ -315,11 +315,20 @@ class ColocationGraph {
|
||||
device_set_->FindMatchingDevices(specified_device_name,
|
||||
&devices_matching_nodedef);
|
||||
if (devices_matching_nodedef.empty()) {
|
||||
// Sometimes it is almost impossible to understand the problem
|
||||
// without a list of available devices.
|
||||
std::vector<string> device_names;
|
||||
for (const Device* device : device_set_->devices()) {
|
||||
device_names.push_back(device->name());
|
||||
}
|
||||
std::sort(device_names.begin(), device_names.end());
|
||||
|
||||
return errors::InvalidArgument(
|
||||
"Could not satisfy explicit device specification '",
|
||||
node->def().device(),
|
||||
"' because no devices matching that specification "
|
||||
"are registered in this process");
|
||||
"are registered in this process; available devices: ",
|
||||
str_util::Join(device_names, ", "));
|
||||
} else if (specified_device_name.has_type) {
|
||||
return errors::InvalidArgument(
|
||||
"Could not satisfy explicit device specification '",
|
||||
|
@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
|
||||
#define TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
|
||||
#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
|
||||
#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
|
||||
|
||||
#include <functional>
|
||||
#include "tensorflow/core/framework/allocator.h"
|
||||
@ -42,4 +42,4 @@ class VisitableAllocator : public Allocator {
|
||||
virtual void AddFreeVisitor(Visitor visitor) = 0;
|
||||
};
|
||||
} // namespace tensorflow
|
||||
#endif // TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
|
||||
#endif // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
|
@ -292,6 +292,15 @@ Allocator* cpu_allocator();
|
||||
// AllocatorStats. By default, it's disabled.
|
||||
void EnableCPUAllocatorStats(bool enable);
|
||||
|
||||
// Abstract interface of an object that does the underlying suballoc/free of
|
||||
// memory for a higher-level allocator.
|
||||
class SubAllocator {
|
||||
public:
|
||||
virtual ~SubAllocator() {}
|
||||
virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
|
||||
virtual void Free(void* ptr, size_t num_bytes) = 0;
|
||||
};
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
|
||||
|
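The SubAllocator interface added to allocator.h above only declares raw Alloc/Free hooks; the GPU code plugs StreamExecutor into it, as shown later in GPUMemAllocator. As a hedged illustration of what a backend looks like, here is a trivial host-memory implementation. HostSubAllocator and its use of posix_memalign are assumptions for the sketch (and POSIX-specific), not part of the change; only the SubAllocator interface itself comes from the diff.

// host_sub_allocator_sketch.cc -- illustrative only.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Interface as introduced in tensorflow/core/framework/allocator.h.
class SubAllocator {
 public:
  virtual ~SubAllocator() {}
  virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;
};

// Hypothetical host-memory backend, for illustration only.
class HostSubAllocator : public SubAllocator {
 public:
  void* Alloc(size_t alignment, size_t num_bytes) override {
    if (num_bytes == 0) return nullptr;
    // posix_memalign (POSIX) needs a power-of-two alignment >= sizeof(void*).
    if (alignment < sizeof(void*)) alignment = sizeof(void*);
    void* ptr = nullptr;
    if (posix_memalign(&ptr, alignment, num_bytes) != 0) return nullptr;
    return ptr;
  }
  void Free(void* ptr, size_t num_bytes) override {
    // num_bytes is unused here; plain free() releases posix_memalign memory.
    free(ptr);
  }
};

int main() {
  HostSubAllocator sub;
  void* p = sub.Alloc(256, 1 << 20);  // 1 MiB, 256-byte aligned
  std::printf("allocated %p\n", p);
  sub.Free(p, 1 << 20);
  return 0;
}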
@ -38,6 +38,26 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(AllocatorAttributesTest, AllCombos) {
|
||||
for (bool on_host : {false, true}) {
|
||||
for (bool nic_compatible : {false, true}) {
|
||||
for (bool gpu_compatible : {false, true}) {
|
||||
for (bool track_sizes : {false, true}) {
|
||||
AllocatorAttributes aa;
|
||||
aa.set_on_host(on_host);
|
||||
aa.set_nic_compatible(nic_compatible);
|
||||
aa.set_gpu_compatible(gpu_compatible);
|
||||
aa.set_track_sizes(track_sizes);
|
||||
EXPECT_EQ(on_host, aa.on_host());
|
||||
EXPECT_EQ(nic_compatible, aa.nic_compatible());
|
||||
EXPECT_EQ(gpu_compatible, aa.gpu_compatible());
|
||||
EXPECT_EQ(track_sizes, aa.track_sizes());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(CPUAllocatorTest, Simple) {
|
||||
EnableCPUAllocatorStats(true);
|
||||
Allocator* a = cpu_allocator();
|
||||
|
@ -40,37 +40,30 @@ static const char* const kRetOp = "_Retval";
|
||||
static const char* const kGradientOp = "SymbolicGradient";
|
||||
static const char* const kNodeLabel = "Func";
|
||||
|
||||
// Represents the index-th output of a node.
|
||||
struct Endpoint {
|
||||
Node* node;
|
||||
int index;
|
||||
|
||||
// Returns the string name represents this endpoint.
|
||||
string name() const {
|
||||
if (index == 0) {
|
||||
return node->name();
|
||||
} else {
|
||||
return strings::StrCat(node->name(), ":", index);
|
||||
}
|
||||
string NodeOut::name() const {
|
||||
if (index == 0) {
|
||||
return node->name();
|
||||
} else {
|
||||
return strings::StrCat(node->name(), ":", index);
|
||||
}
|
||||
}
|
||||
|
||||
DataType dtype() const { return node->output_type(index); }
|
||||
};
|
||||
DataType NodeOut::dtype() const { return node->output_type(index); }
|
||||
|
||||
struct EndpointHash {
|
||||
uint64 operator()(const Endpoint& x) const {
|
||||
struct NodeOutHash {
|
||||
uint64 operator()(const NodeOut& x) const {
|
||||
return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*),
|
||||
x.index);
|
||||
}
|
||||
};
|
||||
|
||||
struct EndpointEq {
|
||||
bool operator()(const Endpoint& x, const Endpoint& y) const {
|
||||
struct NodeOutEq {
|
||||
bool operator()(const NodeOut& x, const NodeOut& y) const {
|
||||
return (x.node == y.node) && (x.index == y.index);
|
||||
}
|
||||
};
|
||||
|
||||
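The EndpointHash/EndpointEq functors being renamed to NodeOutHash/NodeOutEq above exist so that a {node, output index} pair can key the backprops_ unordered_map. The sketch below shows the same custom hash/equality pattern in a self-contained form; FakeNode, NodeKey, and the toy hash combine are invented for illustration and stand in for tensorflow::Node and Hash64, which are not reproduced here.

// node_out_map_sketch.cc -- illustrative only: an unordered_map keyed by a
// {node pointer, output index} pair with custom hash/equality functors.
#include <cstddef>
#include <cstdio>
#include <functional>
#include <unordered_map>
#include <vector>

struct FakeNode { int id; };                      // stand-in for tensorflow::Node
struct NodeKey { FakeNode* node; int index; };    // stand-in for NodeOut

struct NodeKeyHash {
  size_t operator()(const NodeKey& x) const {
    // Toy combine of pointer and index; TensorFlow uses Hash64 instead.
    size_t h = std::hash<const void*>()(x.node);
    return h ^ (std::hash<int>()(x.index) + 0x9e3779b9 + (h << 6) + (h >> 2));
  }
};

struct NodeKeyEq {
  bool operator()(const NodeKey& x, const NodeKey& y) const {
    return x.node == y.node && x.index == y.index;
  }
};

int main() {
  FakeNode a{1}, b{2};
  std::unordered_map<NodeKey, std::vector<int>, NodeKeyHash, NodeKeyEq> backprops;
  backprops[{&a, 0}].push_back(10);  // gradient flowing into output 0 of a
  backprops[{&a, 1}].push_back(11);  // ...and into output 1 of a
  backprops[{&b, 0}].push_back(20);
  std::printf("distinct (node, index) keys: %zu\n", backprops.size());  // 3
  return 0;
}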
static Node* AddZerosLike(Graph* g, Endpoint input) {
|
||||
static Node* AddZerosLike(Graph* g, NodeOut input) {
|
||||
DCHECK_LT(0, input.dtype());
|
||||
DCHECK_LT(input.dtype(), DT_FLOAT_REF);
|
||||
NodeDef ndef;
|
||||
@ -85,7 +78,7 @@ static Node* AddZerosLike(Graph* g, Endpoint input) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) {
|
||||
static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<NodeOut> grads) {
|
||||
const int num_x = n->num_inputs();
|
||||
const int num_y = n->num_outputs();
|
||||
CHECK_EQ(num_y, grads.size());
|
||||
@ -95,19 +88,19 @@ static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) {
|
||||
ndef.set_op(kGradientOp);
|
||||
|
||||
// The gradient node should have num_x + num_y inputs.
|
||||
std::vector<Endpoint> n_inputs(num_x);
|
||||
std::vector<NodeOut> n_inputs(num_x);
|
||||
for (const Edge* e : n->in_edges()) {
|
||||
if (e->IsControlEdge()) continue;
|
||||
n_inputs[e->dst_input()] = {e->src(), e->src_output()};
|
||||
}
|
||||
DataTypeVector in_types;
|
||||
for (const Endpoint& ep : n_inputs) {
|
||||
ndef.add_input(ep.name());
|
||||
in_types.push_back(ep.dtype());
|
||||
for (const NodeOut& nout : n_inputs) {
|
||||
ndef.add_input(nout.name());
|
||||
in_types.push_back(nout.dtype());
|
||||
}
|
||||
for (const Endpoint& ep : grads) {
|
||||
ndef.add_input(ep.name());
|
||||
in_types.push_back(ep.dtype());
|
||||
for (const NodeOut& nout : grads) {
|
||||
ndef.add_input(nout.name());
|
||||
in_types.push_back(nout.dtype());
|
||||
}
|
||||
CHECK_EQ(ndef.input_size(), num_x + num_y);
|
||||
|
||||
@ -128,34 +121,34 @@ static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) {
|
||||
|
||||
class SymbolicGradientBuilder {
|
||||
public:
|
||||
SymbolicGradientBuilder(gtl::ArraySlice<Node*> y_nodes,
|
||||
gtl::ArraySlice<Node*> x_nodes,
|
||||
gtl::ArraySlice<Node*> y_grad_nodes,
|
||||
std::vector<GradNodeOutput>* x_grad_nodes,
|
||||
SymbolicGradientBuilder(gtl::ArraySlice<NodeOut> y_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> x_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
|
||||
std::vector<NodeOut>* x_grad_node_outputs,
|
||||
Graph* graph);
|
||||
|
||||
Status Compute();
|
||||
|
||||
private:
|
||||
gtl::ArraySlice<Node*> y_nodes_;
|
||||
gtl::ArraySlice<Node*> x_nodes_;
|
||||
gtl::ArraySlice<Node*> y_grad_nodes_;
|
||||
std::vector<GradNodeOutput>* x_grad_nodes_;
|
||||
gtl::ArraySlice<NodeOut> y_node_outputs_;
|
||||
gtl::ArraySlice<NodeOut> x_node_outputs_;
|
||||
gtl::ArraySlice<NodeOut> y_grad_node_outputs_;
|
||||
std::vector<NodeOut>* x_grad_node_outputs_;
|
||||
Graph* graph_; // Not owned.
|
||||
|
||||
// A vector of output endpoints which represents backpropagated
|
||||
// gradients
|
||||
typedef std::vector<Endpoint> BackpropedGradients;
|
||||
typedef std::vector<NodeOut> BackpropedGradients;
|
||||
|
||||
// backprops_ is a map from an output endpoint to its accumulated
|
||||
// gradients. When an output endpoint has accumulated all its
|
||||
// backprops_ is a map from a node output to its accumulated
|
||||
// gradients. When a node output has accumulated all its
|
||||
// gradients, we add a node which sums them up.
|
||||
std::unordered_map<Endpoint, BackpropedGradients, EndpointHash, EndpointEq>
|
||||
std::unordered_map<NodeOut, BackpropedGradients, NodeOutHash, NodeOutEq>
|
||||
backprops_;
|
||||
|
||||
// pending[i] is count-down counter for i-th node's expected
|
||||
// backprops. When pending[i] becomes zero, we collected all
|
||||
// backprop gradients for all output endpoint of the ith-node.
|
||||
// backprop gradients for all outputs of the ith-node.
|
||||
std::vector<int> pending_;
|
||||
|
||||
// 'ready' keeps track of nodes that have been completely
|
||||
@ -163,7 +156,8 @@ class SymbolicGradientBuilder {
|
||||
// add dy as an input of the gradient function.
|
||||
std::deque<Node*> ready_;
|
||||
|
||||
// The set of nodes at which to stop backprop (and populate 'x_grad_nodes_').
|
||||
// The set of nodes at which to stop backprop.
|
||||
// Maps from node.id -> index of 'x_node_outputs_'
|
||||
std::unordered_map<int, int> stop_nodes_;
|
||||
|
||||
// Initialize pending_ and ready_.
|
||||
@ -173,33 +167,35 @@ class SymbolicGradientBuilder {
|
||||
// to 'dst', when the backprop algorithm constructs the node
|
||||
// 'dst_grad' which computes the gradient, we need to propagate it
|
||||
// to 'src'.
|
||||
void BackpropAlongEdge(const Endpoint& dst_grad, const Endpoint& src);
|
||||
void BackpropZerosAlongEdge(const Endpoint& src);
|
||||
void BackpropAlongEdge(const NodeOut& dst_grad, const NodeOut& src);
|
||||
void BackpropZerosAlongEdge(const NodeOut& src);
|
||||
|
||||
Endpoint SumGradients(const Endpoint& src);
|
||||
NodeOut SumGradients(const NodeOut& src);
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientBuilder);
|
||||
};
|
||||
|
||||
SymbolicGradientBuilder::SymbolicGradientBuilder(
|
||||
gtl::ArraySlice<Node*> y_nodes,
|
||||
gtl::ArraySlice<Node*> x_nodes,
|
||||
gtl::ArraySlice<Node*> y_grad_nodes,
|
||||
std::vector<GradNodeOutput>* x_grad_nodes,
|
||||
Graph* graph) : y_nodes_(y_nodes), x_nodes_(x_nodes),
|
||||
y_grad_nodes_(y_grad_nodes), x_grad_nodes_(x_grad_nodes),
|
||||
graph_(graph) {
|
||||
CHECK_EQ(y_nodes_.size(), y_grad_nodes.size());
|
||||
x_grad_nodes_->clear();
|
||||
x_grad_nodes_->resize(x_nodes_.size());
|
||||
stop_nodes_.reserve(x_nodes_.size());
|
||||
for (int i = 0; i < x_nodes_.size(); ++i) {
|
||||
stop_nodes_.insert(std::make_pair(x_nodes_[i]->id(), i));
|
||||
gtl::ArraySlice<NodeOut> y_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> x_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
|
||||
std::vector<NodeOut>* x_grad_node_outputs, Graph* graph)
|
||||
: y_node_outputs_(y_node_outputs),
|
||||
x_node_outputs_(x_node_outputs),
|
||||
y_grad_node_outputs_(y_grad_node_outputs),
|
||||
x_grad_node_outputs_(x_grad_node_outputs),
|
||||
graph_(graph) {
|
||||
CHECK_EQ(y_node_outputs_.size(), y_grad_node_outputs.size());
|
||||
x_grad_node_outputs_->clear();
|
||||
x_grad_node_outputs_->resize(x_node_outputs_.size());
|
||||
stop_nodes_.reserve(x_node_outputs_.size());
|
||||
for (int i = 0; i < x_node_outputs_.size(); ++i) {
|
||||
stop_nodes_.insert(std::make_pair(x_node_outputs_[i].node->id(), i));
|
||||
}
|
||||
}
|
||||
|
||||
void SymbolicGradientBuilder::BackpropAlongEdge(const Endpoint& dst_grad,
|
||||
const Endpoint& src) {
|
||||
void SymbolicGradientBuilder::BackpropAlongEdge(const NodeOut& dst_grad,
|
||||
const NodeOut& src) {
|
||||
CHECK_NOTNULL(src.node);
|
||||
auto iter = backprops_.find(src);
|
||||
if (iter != backprops_.end()) {
|
||||
@ -211,7 +207,7 @@ void SymbolicGradientBuilder::BackpropAlongEdge(const Endpoint& dst_grad,
|
||||
}
|
||||
}
|
||||
|
||||
void SymbolicGradientBuilder::BackpropZerosAlongEdge(const Endpoint& src) {
|
||||
void SymbolicGradientBuilder::BackpropZerosAlongEdge(const NodeOut& src) {
|
||||
CHECK_NOTNULL(src.node);
|
||||
auto iter = backprops_.find(src);
|
||||
if (iter != backprops_.end()) {
|
||||
@ -227,9 +223,9 @@ void SymbolicGradientBuilder::InitBackprop() {
|
||||
backprops_.clear();
|
||||
std::unordered_set<Node*> visited;
|
||||
std::deque<Node*> queue;
|
||||
for (Node* n : x_nodes_) {
|
||||
queue.push_back(n);
|
||||
visited.insert(n);
|
||||
for (const NodeOut& nout : x_node_outputs_) {
|
||||
queue.push_back(nout.node);
|
||||
visited.insert(nout.node);
|
||||
}
|
||||
|
||||
// Going forward to figure out which endpoints need backprop-ed.
|
||||
@ -255,20 +251,19 @@ void SymbolicGradientBuilder::InitBackprop() {
|
||||
}
|
||||
|
||||
{
|
||||
const int num_y = y_grad_nodes_.size();
|
||||
const int num_y = y_grad_node_outputs_.size();
|
||||
for (int i = 0; i < num_y; ++i) {
|
||||
Node* y = y_nodes_[i];
|
||||
Node* dy = y_grad_nodes_[i];
|
||||
Node* y = y_node_outputs_[i].node;
|
||||
for (const Edge* e : y->in_edges()) {
|
||||
if (e->IsControlEdge()) continue;
|
||||
BackpropAlongEdge({dy, e->dst_input()}, {e->src(), e->src_output()});
|
||||
BackpropAlongEdge(y_grad_node_outputs_[i], {e->src(), e->src_output()});
|
||||
}
|
||||
}
|
||||
}
|
||||
CHECK(!ready_.empty());
|
||||
}
|
||||
|
||||
Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) {
|
||||
NodeOut SymbolicGradientBuilder::SumGradients(const NodeOut& src) {
|
||||
const DataType dtype = src.dtype();
|
||||
auto iter = backprops_.find(src);
|
||||
CHECK(iter != backprops_.end());
|
||||
@ -286,8 +281,8 @@ Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) {
|
||||
NodeDef ndef;
|
||||
ndef.set_name(graph_->NewName(kNodeLabel));
|
||||
ndef.set_op("AddN"); // N-way Add
|
||||
for (const Endpoint& ep : grads) {
|
||||
ndef.add_input(ep.name());
|
||||
for (const NodeOut& nout : grads) {
|
||||
ndef.add_input(nout.name());
|
||||
}
|
||||
AddNodeAttr("N", static_cast<int64>(grads.size()), &ndef);
|
||||
AddNodeAttr("T", dtype, &ndef);
|
||||
@ -295,8 +290,8 @@ Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) {
|
||||
Node* add = graph_->AddNode(ndef, &s);
|
||||
TF_CHECK_OK(s);
|
||||
for (size_t i = 0; i < grads.size(); ++i) {
|
||||
const Endpoint& ep = grads[i];
|
||||
graph_->AddEdge(ep.node, ep.index, add, i);
|
||||
const NodeOut& nout = grads[i];
|
||||
graph_->AddEdge(nout.node, nout.index, add, i);
|
||||
}
|
||||
return {add, 0};
|
||||
}
|
||||
@ -312,7 +307,7 @@ Status SymbolicGradientBuilder::Compute() {
|
||||
InitBackprop();
|
||||
|
||||
// Backward propagation.
|
||||
gtl::InlinedVector<Endpoint, 8> dy;
|
||||
gtl::InlinedVector<NodeOut, 8> dy;
|
||||
while (!ready_.empty()) {
|
||||
// n has collected all gradients.
|
||||
Node* n = ready_.front();
|
||||
@ -324,11 +319,11 @@ Status SymbolicGradientBuilder::Compute() {
|
||||
|
||||
auto iter = stop_nodes_.find(n->id());
|
||||
if (iter != stop_nodes_.end()) {
|
||||
// Stop backprop and add gradient sum to 'x_grad_nodes'.
|
||||
// Stop backprop and add gradient sum to 'x_grad_node_outputs_'.
|
||||
// TODO(andydavis) Support stop nodes with more than one output.
|
||||
CHECK_EQ(1, num_y);
|
||||
Endpoint grad = SumGradients({n, 0});
|
||||
(*x_grad_nodes_)[iter->second] = {grad.node, grad.index};
|
||||
const int index = iter->second;
|
||||
(*x_grad_node_outputs_)[index] = SumGradients(x_node_outputs_[index]);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -350,6 +345,7 @@ Status SymbolicGradientBuilder::Compute() {
|
||||
|
||||
// Adds a gradient node with num_x + num_y inputs and num_x
|
||||
// outputs.
|
||||
// TODO(andydavis) Support primitive gradient ops.
|
||||
Node* grad = AddSymGrad(graph_, n, dy);
|
||||
for (const Edge* e : n->in_edges()) {
|
||||
if (e->IsControlEdge()) continue;
|
||||
@ -369,12 +365,13 @@ Status SymbolicGradientBuilder::Compute() {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AddSymbolicGradients(gtl::ArraySlice<Node*> y_nodes,
|
||||
gtl::ArraySlice<Node*> x_nodes,
|
||||
gtl::ArraySlice<Node*> y_grad_nodes,
|
||||
std::vector<GradNodeOutput>* x_grad_nodes,
|
||||
Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> x_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
|
||||
std::vector<NodeOut>* x_grad_node_outputs,
|
||||
Graph* graph) {
|
||||
SymbolicGradientBuilder builder(y_nodes, x_nodes, y_grad_nodes, x_grad_nodes,
|
||||
SymbolicGradientBuilder builder(y_node_outputs, x_node_outputs,
|
||||
y_grad_node_outputs, x_grad_node_outputs,
|
||||
graph);
|
||||
return builder.Compute();
|
||||
}
|
||||
|
@ -16,40 +16,41 @@ limitations under the License.
|
||||
#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
|
||||
#define THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
|
||||
|
||||
#include "tensorflow/core/graph/graph.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/lib/gtl/array_slice.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// GradNodeOutput represents a single gradient node output.
|
||||
struct GradNodeOutput {
|
||||
// Represents the output of 'node' at 'index'.
|
||||
struct NodeOut {
|
||||
Node* node;
|
||||
int index;
|
||||
|
||||
// Returns the string name that represents the output of this node.
|
||||
string name() const;
|
||||
// Returns the data type of the output of this node.
|
||||
DataType dtype() const;
|
||||
};
|
||||
|
||||
// NOTE: This API is a work in progress and will likely be changing frequently.
|
||||
//
|
||||
// Given initial gradient nodes 'y_grad_nodes' (which compute the symbolic
|
||||
// partial derivatives of some loss function 'L' w.r.t the inputs of each
|
||||
// node in 'y_nodes'), adds gradient nodes to 'graph' that compute the sum
|
||||
// of all gradients flowing into the single output of each node in 'x_nodes'.
|
||||
// Note that gradient nodes will not be added to 'graph' which compute
|
||||
// the symbolic partial derivative of 'L' w.r.t. each node in 'x_nodes' (i.e.
|
||||
// backprop will stop at these nodes). This restriction will be lifted in
|
||||
// a subsequent CL.
|
||||
// Given initial gradient-node outputs 'y_grad_node_outputs' (which compute the
|
||||
// symbolic partial derivatives of some loss function 'L' w.r.t the node outputs
|
||||
// 'y_node_outputs'), adds gradient nodes to 'graph' that compute the symbolic
|
||||
// partial derivatives of 'L' w.r.t the node outputs 'x_node_outputs'.
|
||||
//
|
||||
// REQUIRES: Each node in 'x_nodes' must have a single output (this
|
||||
// restriction will be removed in a subsequent change).
|
||||
// REQUIRES: Each node in 'x_node_outputs' to be unique, and so to have a single
|
||||
// output (this restriction will be removed in a subsequent change).
|
||||
|
||||
// TODO(andydavis) Add support for returning 'x_node' gradients by endpoint
|
||||
// (i.e. {node, index}).
|
||||
// TODO(andydavis) Add symbolic gradient support for general graphs (the current
|
||||
// implementation only supports gradients for functions). In particular,
|
||||
// the nodes in 'x_nodes' are currently restricted to have one output.
|
||||
Status AddSymbolicGradients(gtl::ArraySlice<Node*> y_nodes,
|
||||
gtl::ArraySlice<Node*> x_nodes,
|
||||
gtl::ArraySlice<Node*> y_grad_nodes,
|
||||
std::vector<GradNodeOutput>* x_grad_nodes,
|
||||
|
||||
Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> x_node_outputs,
|
||||
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
|
||||
std::vector<NodeOut>* x_grad_node_outputs,
|
||||
Graph* graph);
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -214,6 +214,21 @@ cc_header_only_library(
|
||||
deps = [":bounds_check"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "image_resizer_state",
|
||||
hdrs = ["image_resizer_state.h"],
|
||||
visibility = ["//visibility:private"],
|
||||
deps = [
|
||||
"//tensorflow/core:lib",
|
||||
"//third_party/eigen3",
|
||||
],
|
||||
)
|
||||
|
||||
cc_header_only_library(
|
||||
name = "image_resizer_state_lib",
|
||||
deps = [":image_resizer_state"],
|
||||
)
|
||||
|
||||
# OpKernel libraries ----------------------------------------------------------
|
||||
|
||||
tf_kernel_libraries(
|
||||
@ -221,7 +236,6 @@ tf_kernel_libraries(
|
||||
prefixes = [
|
||||
"bcast_ops",
|
||||
"bitcast_op",
|
||||
"depthtospace_op",
|
||||
"concat_op",
|
||||
"constant_op",
|
||||
"diag_op",
|
||||
@ -239,7 +253,6 @@ tf_kernel_libraries(
|
||||
"reverse_sequence_op",
|
||||
"shape_ops",
|
||||
"slice_op",
|
||||
"spacetodepth_op",
|
||||
"split_op",
|
||||
"tile_ops",
|
||||
"transpose_op",
|
||||
@ -250,6 +263,7 @@ tf_kernel_libraries(
|
||||
deps = [
|
||||
":bounds_check",
|
||||
":concat_lib",
|
||||
":depth_space_ops",
|
||||
":fill_functor",
|
||||
":ops_util",
|
||||
":split_lib",
|
||||
@ -545,6 +559,7 @@ tf_kernel_libraries(
|
||||
"sample_distorted_bounding_box_op",
|
||||
],
|
||||
deps = [
|
||||
":image_resizer_state",
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:image_ops_op_lib",
|
||||
"//tensorflow/core:lib",
|
||||
@ -830,6 +845,31 @@ tf_kernel_library(
|
||||
],
|
||||
)
|
||||
|
||||
tf_kernel_library(
|
||||
name = "depth_space_ops",
|
||||
srcs = [
|
||||
"depthtospace_op.cc",
|
||||
"spacetodepth_op.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"depthtospace_op.h",
|
||||
"spacetodepth_op.h",
|
||||
],
|
||||
gpu_srcs = [
|
||||
"depthtospace_op.h",
|
||||
"depthtospace_op_gpu.cu.cc",
|
||||
"spacetodepth_op.h",
|
||||
"spacetodepth_op_gpu.cu.cc",
|
||||
],
|
||||
visibility = ["//visibility:private"],
|
||||
deps = [
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:lib",
|
||||
"//third_party/eigen3",
|
||||
],
|
||||
alwayslink = 0,
|
||||
)
|
||||
|
||||
tf_kernel_libraries(
|
||||
name = "parsing",
|
||||
prefixes = [
|
||||
@ -1062,6 +1102,7 @@ filegroup(
|
||||
"slice_op.h",
|
||||
"softmax_op.cc",
|
||||
"softmax_op.h",
|
||||
"softmax_op_functor.h",
|
||||
"split_lib.h",
|
||||
"split_lib_cpu.cc",
|
||||
"split_op.cc",
|
||||
@ -1095,10 +1136,12 @@ filegroup(
|
||||
"batch_norm_op.h",
|
||||
"control_flow_ops.h",
|
||||
"conv_2d.h",
|
||||
"image_resizer_state.h",
|
||||
"maxpooling_op.h",
|
||||
"reduction_ops.h",
|
||||
"reduction_ops_common.h",
|
||||
"relu_op.h",
|
||||
"relu_op_functor.h",
|
||||
"save_restore_tensor.h",
|
||||
"softplus_op.h",
|
||||
"softsign_op.h",
|
||||
|
@ -113,6 +113,39 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
|
||||
perftools::gputools::DeviceMemory<T> typed(wrapped);
|
||||
return typed;
|
||||
}
|
||||
|
||||
class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
|
||||
public:
|
||||
using Stream = ::perftools::gputools::Stream;
|
||||
using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
|
||||
|
||||
CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
|
||||
|
||||
int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
|
||||
|
||||
perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
|
||||
Stream* stream, int64 byte_size) override {
|
||||
Tensor temporary_memory;
|
||||
|
||||
Status allocation_status(context_->allocate_temp(
|
||||
DT_UINT8, TensorShape({byte_size}), &temporary_memory));
|
||||
if (!allocation_status.ok()) {
|
||||
return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
|
||||
DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
|
||||
}
|
||||
// Hold the reference of the allocated tensors until the end of the
|
||||
// allocator.
|
||||
allocated_tensors_.push_back(temporary_memory);
|
||||
return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
|
||||
DeviceMemoryBytes::MakeFromByteSize(
|
||||
temporary_memory.flat<uint8>().data(),
|
||||
temporary_memory.flat<uint8>().size()));
|
||||
}
|
||||
|
||||
private:
|
||||
OpKernelContext* context_;
|
||||
std::vector<Tensor> allocated_tensors_;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
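The CublasScratchAllocator above follows a simple ownership pattern: every scratch buffer handed out for the batched GEMM is also appended to allocated_tensors_, so the memory stays alive until the allocator object is destroyed after the call completes. Below is a TensorFlow-free sketch of the same keep-alive idea; ScratchPool and its byte-vector backing store are invented for illustration and are not the actual scratch allocator.

// scratch_pool_sketch.cc -- illustrative only: buffers returned by Allocate()
// remain valid until the pool itself is destroyed.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

class ScratchPool {
 public:
  // Returns 'byte_size' bytes that stay valid for the lifetime of this pool;
  // there is no individual free, everything is released in ~ScratchPool.
  uint8_t* Allocate(size_t byte_size) {
    buffers_.emplace_back(new std::vector<uint8_t>(byte_size));
    return buffers_.back()->data();
  }

 private:
  std::vector<std::unique_ptr<std::vector<uint8_t>>> buffers_;
};

int main() {
  ScratchPool pool;
  uint8_t* a = pool.Allocate(1 << 10);
  uint8_t* b = pool.Allocate(1 << 20);
  a[0] = 1;
  b[0] = 2;
  std::printf("scratch buffers live at %p and %p until pool destruction\n",
              static_cast<void*>(a), static_cast<void*>(b));
  return 0;
}  // both buffers released here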
template <typename Scalar>
|
||||
@ -162,12 +195,14 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
|
||||
// where A, B and C are assumed to be in column major.
|
||||
// We want the output to be in row-major, so we can compute
|
||||
// C' = B' x A' (' stands for transpose)
|
||||
CublasScratchAllocator scratch_allocator(context);
|
||||
bool blas_launch_status =
|
||||
stream->ThenBlasGemmBatched(blas_transpose_b, blas_transpose_a, n, m, k,
|
||||
static_cast<Scalar>(1.0), b_ptrs,
|
||||
adj_y ? k : n, a_ptrs, adj_x ? m : k,
|
||||
static_cast<Scalar>(0.0), c_ptrs, n,
|
||||
batch_size)
|
||||
stream
|
||||
->ThenBlasGemmBatchedWithScratch(
|
||||
blas_transpose_b, blas_transpose_a, n, m, k,
|
||||
static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
|
||||
adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n, batch_size,
|
||||
&scratch_allocator)
|
||||
.ok();
|
||||
if (!blas_launch_status) {
|
||||
context->SetStatus(errors::Internal(
|
||||
@ -265,9 +300,7 @@ REGISTER_CPU(int32);
|
||||
REGISTER_CPU(complex64);
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
// TODO(kalakris): The GPU implementation is currently disabled due to issues
|
||||
// encountered in practice. See b/24534272.
|
||||
// REGISTER_GPU(float);
|
||||
REGISTER_GPU(float);
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
#undef REGISTER_CPU
|
||||
|
@ -45,7 +45,7 @@ class DecodeCSVOp : public OpKernel {
|
||||
OP_REQUIRES_OK(ctx, ctx->input("records", &records));
|
||||
OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults));
|
||||
|
||||
for (int i = 0; i < record_defaults.size(); ++i) {
|
||||
for (int64 i = 0; i < record_defaults.size(); ++i) {
|
||||
OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2,
|
||||
errors::InvalidArgument(
|
||||
"There should only be 1 default per field but field ", i,
|
||||
@ -53,7 +53,7 @@ class DecodeCSVOp : public OpKernel {
|
||||
}
|
||||
|
||||
auto records_t = records->flat<string>();
|
||||
int records_size = records_t.size();
|
||||
int64 records_size = records_t.size();
|
||||
|
||||
OpOutputList output;
|
||||
OP_REQUIRES_OK(ctx, ctx->output_list("output", &output));
|
||||
@ -63,7 +63,7 @@ class DecodeCSVOp : public OpKernel {
|
||||
output.allocate(i, records->shape(), &out);
|
||||
}
|
||||
|
||||
for (int i = 0; i < records_size; ++i) {
|
||||
for (int64 i = 0; i < records_size; ++i) {
|
||||
const StringPiece record(records_t(i));
|
||||
std::vector<string> fields;
|
||||
ExtractFields(ctx, record, &fields);
|
||||
@ -165,7 +165,7 @@ class DecodeCSVOp : public OpKernel {
|
||||
|
||||
void ExtractFields(OpKernelContext* ctx, StringPiece input,
|
||||
std::vector<string>* result) {
|
||||
int current_idx = 0;
|
||||
int64 current_idx = 0;
|
||||
if (!input.empty()) {
|
||||
while (static_cast<size_t>(current_idx) < input.size()) {
|
||||
if (input[current_idx] == '\n' || input[current_idx] == '\r') {
|
||||
|
@ -21,6 +21,8 @@ limitations under the License.
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "tensorflow/core/kernels/depthtospace_op.h"
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/op.h"
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
@ -60,8 +62,8 @@ class DepthToSpaceOp : public OpKernel {
|
||||
"instead of: ", dims));
|
||||
|
||||
const int batch_size = input.dim_size(0);
|
||||
const int height = input.dim_size(1);
|
||||
const int width = input.dim_size(2);
|
||||
const int input_height = input.dim_size(1);
|
||||
const int input_width = input.dim_size(2);
|
||||
const int input_depth = input.dim_size(3);
|
||||
|
||||
const int block_size_sq = block_size_ * block_size_;
|
||||
@ -73,41 +75,58 @@ class DepthToSpaceOp : public OpKernel {
|
||||
"should be divisible by: ", block_size_sq));
|
||||
|
||||
const int output_depth = input_depth / block_size_sq;
|
||||
const int output_width = width * block_size_;
|
||||
const int output_height = height * block_size_;
|
||||
const int output_width = input_width * block_size_;
|
||||
const int output_height = input_height * block_size_;
|
||||
|
||||
// Allocate output tensor.
|
||||
Tensor* outputs_tensor = nullptr;
|
||||
Tensor* output = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(
|
||||
0, TensorShape({batch_size, output_height,
|
||||
output_width, output_depth}),
|
||||
&outputs_tensor));
|
||||
&output));
|
||||
|
||||
auto Toutput = outputs_tensor->tensor<T, 4>();
|
||||
auto Tinput = input.tensor<T, 4>();
|
||||
typename TTypes<T, 4>::ConstTensor Tinput = input.tensor<T, 4>();
|
||||
typename TTypes<T, 4>::Tensor Toutput = output->tensor<T, 4>();
|
||||
|
||||
for (int b = 0; b < batch_size; ++b) {
|
||||
for (int h = 0; h < output_height; ++h) {
|
||||
const int in_h = h / block_size_;
|
||||
const int offset_h = (h % block_size_);
|
||||
for (int w = 0; w < output_width; ++w) {
|
||||
const int in_w = w / block_size_;
|
||||
const int offset_w = (w % block_size_);
|
||||
const int offset_d =
|
||||
(offset_h * block_size_ + offset_w) * output_depth;
|
||||
for (int d = 0; d < output_depth; ++d) {
|
||||
const int in_d = d + offset_d;
|
||||
Toutput(b, h, w, d) = Tinput(b, in_h, in_w, in_d);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
functor::DepthToSpaceOpFunctor<Device, T> functor;
|
||||
functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
|
||||
};
|
||||
|
||||
private:
|
||||
int block_size_;
|
||||
};
|
||||
|
||||
// Partial specialization of DepthToSpaceOpFunctor for a CPUDevice.
|
||||
namespace functor {
|
||||
template <typename T>
|
||||
struct DepthToSpaceOpFunctor<CPUDevice, T> {
|
||||
void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
|
||||
int block_size, typename TTypes<T, 4>::Tensor output) {
|
||||
const int batch_size = output.dimension(0);
|
||||
const int output_height = output.dimension(1);
|
||||
const int output_width = output.dimension(2);
|
||||
const int output_depth = output.dimension(3);
|
||||
|
||||
for (int b = 0; b < batch_size; ++b) {
|
||||
for (int h = 0; h < output_height; ++h) {
|
||||
const int in_h = h / block_size;
|
||||
const int offset_h = (h % block_size);
|
||||
for (int w = 0; w < output_width; ++w) {
|
||||
const int in_w = w / block_size;
|
||||
const int offset_w = (w % block_size);
|
||||
const int offset_d =
|
||||
(offset_h * block_size + offset_w) * output_depth;
|
||||
for (int d = 0; d < output_depth; ++d) {
|
||||
const int in_d = d + offset_d;
|
||||
output(b, h, w, d) = input(b, in_h, in_w, in_d);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace functor
|
||||
|
||||
#define REGISTER(type) \
|
||||
REGISTER_KERNEL_BUILDER( \
|
||||
Name("DepthToSpace").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
|
||||
@ -116,4 +135,10 @@ class DepthToSpaceOp : public OpKernel {
|
||||
TF_CALL_ALL_TYPES(REGISTER);
|
||||
#undef REGISTER
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
REGISTER_KERNEL_BUILDER(
|
||||
Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<float>("T"),
|
||||
DepthToSpaceOp<GPUDevice, float>);
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
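The DepthToSpaceOpFunctor CPU specialization above (and the CUDA kernel added in depthtospace_op_gpu.cu.cc below) both implement the same index mapping: output element (b, h, w, d) reads input element (b, h / block_size, w / block_size, d + (h % block_size * block_size + w % block_size) * output_depth). A standalone sketch of that mapping on a tiny tensor follows; the flat NHWC Offset helper and the 1x1x1x4 example are illustrative assumptions.

// depth_to_space_sketch.cc -- illustrative only: the same index arithmetic as
// the CPU functor, applied to a 1x1x1x4 input with block_size 2 (-> 1x2x2x1).
#include <cstdio>
#include <vector>

// Flat offset into an NHWC buffer of shape [batch, H, W, D].
int Offset(int b, int h, int w, int d, int H, int W, int D) {
  return ((b * H + h) * W + w) * D + d;
}

int main() {
  const int block_size = 2;
  const int batch = 1, in_h = 1, in_w = 1, in_d = 4;
  const int out_h = in_h * block_size, out_w = in_w * block_size;
  const int out_d = in_d / (block_size * block_size);

  std::vector<float> input = {0.f, 1.f, 2.f, 3.f};           // shape 1x1x1x4
  std::vector<float> output(batch * out_h * out_w * out_d);  // shape 1x2x2x1

  for (int b = 0; b < batch; ++b) {
    for (int h = 0; h < out_h; ++h) {
      const int in_row = h / block_size, offset_h = h % block_size;
      for (int w = 0; w < out_w; ++w) {
        const int in_col = w / block_size, offset_w = w % block_size;
        const int offset_d = (offset_h * block_size + offset_w) * out_d;
        for (int d = 0; d < out_d; ++d) {
          output[Offset(b, h, w, d, out_h, out_w, out_d)] =
              input[Offset(b, in_row, in_col, d + offset_d, in_h, in_w, in_d)];
        }
      }
    }
  }
  // Expected: 0 1 2 3 -- the four input channels become one 2x2 spatial patch.
  for (float v : output) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}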
44
tensorflow/core/kernels/depthtospace_op.h
Normal file
@ -0,0 +1,44 @@
|
||||
/* Copyright 2015 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_
|
||||
#define TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_
|
||||
// Functor definition for DepthToSpaceOp, must be compilable by nvcc.
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace functor {
|
||||
|
||||
// Functor used by DepthToSpaceOp to do the computations.
|
||||
template <typename Device, typename T>
|
||||
struct DepthToSpaceOpFunctor {
|
||||
// Implements the depth to space conversion.
|
||||
//
|
||||
// input: 4-D input tensor.
|
||||
// block_size: block size for the conversion.
|
||||
// output: 4-D output tensor.
|
||||
//
|
||||
// The dimensions of the tensors are guaranteed to be correct when the
|
||||
// functor is called.
|
||||
void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
|
||||
int block_size, typename TTypes<T, 4>::Tensor output);
|
||||
};
|
||||
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_
|
88
tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
Normal file
@ -0,0 +1,88 @@
|
||||
/* Copyright 2015 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/depthtospace_op.h"
|
||||
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/util/cuda_kernel_helper.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
typedef Eigen::GpuDevice GPUDevice;
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void D2S(const int32 nthreads, const dtype* input_ptr,
|
||||
const int block_size, const int batch_size,
|
||||
const int input_height, const int input_width,
|
||||
const int input_depth, const int output_height,
|
||||
const int output_width, const int output_depth,
|
||||
dtype* output_ptr) {
|
||||
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
|
||||
// out_idx = d + output_depth * (w + output_width * (h + output_height * b))
|
||||
const int d = out_idx % output_depth;
|
||||
const int out_idx2 = out_idx / output_depth;
|
||||
const int w = out_idx2 % output_width;
|
||||
const int out_idx3 = out_idx2 / output_width;
|
||||
const int h = out_idx3 % output_height;
|
||||
const int b = out_idx3 / output_height;
|
||||
|
||||
const int in_h = h / block_size;
|
||||
const int offset_h = h % block_size;
|
||||
const int in_w = w / block_size;
|
||||
const int offset_w = w % block_size;
|
||||
const int offset_d = (offset_h * block_size + offset_w) * output_depth;
|
||||
const int in_d = d + offset_d;
|
||||
const int inp_idx =
|
||||
in_d + input_depth * (in_w + input_width * (in_h + input_height * b));
|
||||
*(output_ptr + out_idx) = ldg(input_ptr + inp_idx);
|
||||
}
|
||||
}
|
||||
|
||||
// Specialization of DepthToSpaceOpFunctor for a GPUDevice.
|
||||
namespace functor {
|
||||
template <typename T>
|
||||
struct DepthToSpaceOpFunctor<GPUDevice, T> {
|
||||
void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
|
||||
int block_size, typename TTypes<T, 4>::Tensor output) {
|
||||
const int batch_size = output.dimension(0);
|
||||
const int input_height = input.dimension(1);
|
||||
const int input_width = input.dimension(2);
|
||||
const int input_depth = input.dimension(3);
|
||||
const int output_height = output.dimension(1);
|
||||
const int output_width = output.dimension(2);
|
||||
const int output_depth = output.dimension(3);
|
||||
|
||||
const int total_count =
|
||||
batch_size * output_height * output_width * output_depth;
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
|
||||
D2S<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
config.virtual_thread_count, input.data(), block_size, batch_size,
|
||||
input_height, input_width, input_depth, output_height, output_width,
|
||||
output_depth, output.data());
|
||||
}
|
||||
};
|
||||
} // end namespace functor
|
||||
|
||||
// Instantiate the GPU implementation for float.
|
||||
template struct functor::DepthToSpaceOpFunctor<GPUDevice, float>;
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
111
tensorflow/core/kernels/image_resizer_state.h
Normal file
@ -0,0 +1,111 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// This is a helper struct to package up the input and output
// parameters of an image resizer (the heights, widths, etc.). To
// reduce code duplication and ensure consistency across the different
// resizers, it performs the input validation.
|
||||
|
||||
#ifndef TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
|
||||
#define TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
|
||||
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
#include "tensorflow/core/framework/register_types.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/framework/tensor_shape.h"
|
||||
#include "tensorflow/core/framework/types.h"
|
||||
#include "tensorflow/core/kernels/bounds_check.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
struct ImageResizerState {
|
||||
explicit ImageResizerState(bool align_corners)
|
||||
: align_corners_(align_corners) {}
|
||||
|
||||
// ValidateAndCreateOutput checks the bounds on the input tensors
|
||||
// and requested size, sets up some of the resizing state such as the
|
||||
// height_scale and width_scale, and allocates the output.
|
||||
// If any of these operations fails, it sets an error status in
|
||||
// the context, which the caller must check.
|
||||
void ValidateAndCreateOutput(OpKernelContext* context, const Tensor& input) {
|
||||
OP_REQUIRES(context, input.dims() == 4,
|
||||
errors::InvalidArgument("input must be 4-dimensional",
|
||||
input.shape().DebugString()));
|
||||
const Tensor& shape_t = context->input(1);
|
||||
OP_REQUIRES(context, shape_t.dims() == 1,
|
||||
errors::InvalidArgument("shape_t must be 1-dimensional",
|
||||
shape_t.shape().DebugString()));
|
||||
OP_REQUIRES(context, shape_t.NumElements() == 2,
|
||||
errors::InvalidArgument("shape_t must have two elements",
|
||||
shape_t.shape().DebugString()));
|
||||
auto Svec = shape_t.vec<int32>();
|
||||
batch_size = input.dim_size(0);
|
||||
out_height = internal::SubtleMustCopy(Svec(0));
|
||||
out_width = internal::SubtleMustCopy(Svec(1));
|
||||
OP_REQUIRES(
|
||||
context,
|
||||
FastBoundsCheck(input.dim_size(1), std::numeric_limits<int32>::max()) &&
|
||||
FastBoundsCheck(input.dim_size(2),
|
||||
std::numeric_limits<int32>::max()),
|
||||
errors::InvalidArgument("input sizes must be between 0 and max int32"));
|
||||
|
||||
in_height = static_cast<int32>(input.dim_size(1));
|
||||
in_width = static_cast<int32>(input.dim_size(2));
|
||||
channels = input.dim_size(3);
|
||||
OP_REQUIRES(context, out_height > 0 && out_width > 0,
|
||||
errors::InvalidArgument("output dimensions must be positive"));
|
||||
OP_REQUIRES(
|
||||
context, channels > 0,
|
||||
errors::InvalidArgument("image must have at least one channel"));
|
||||
OP_REQUIRES(
|
||||
context, input.dim_size(1) > 0 && input.dim_size(2) > 0,
|
||||
errors::InvalidArgument("input image must be of non-zero size"));
|
||||
OP_REQUIRES_OK(context, context->allocate_output(
|
||||
0, TensorShape({input.dim_size(0), out_height,
|
||||
out_width, input.dim_size(3)}),
|
||||
&output));
|
||||
|
||||
height_scale = (align_corners_ && out_height > 1)
|
||||
? (in_height - 1) / static_cast<float>(out_height - 1)
|
||||
: in_height / static_cast<float>(out_height);
|
||||
width_scale = (align_corners_ && out_width > 1)
|
||||
? (in_width - 1) / static_cast<float>(out_width - 1)
|
||||
: in_width / static_cast<float>(out_width);
|
||||
}
|
||||
|
||||
int64 batch_size;
|
||||
int64 out_height;
|
||||
int64 out_width;
|
||||
int64 in_height;
|
||||
int64 in_width;
|
||||
int64 channels;
|
||||
float height_scale;
|
||||
float width_scale;
|
||||
Tensor* output;
|
||||
|
||||
private:
|
||||
bool align_corners_;
|
||||
};
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
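Reviewer note: for readers skimming the diff, here is a minimal sketch (not part of the commit) of the calling pattern this helper expects; the op class and name below are placeholders, but the pattern mirrors the ResizeArea/ResizeBilinear/ResizeNearestNeighbor changes later in this commit.

  // Hypothetical example only: how a resize kernel would use ImageResizerState.
  #include "tensorflow/core/framework/op_kernel.h"
  #include "tensorflow/core/kernels/image_resizer_state.h"

  namespace tensorflow {

  template <typename T>
  class ExampleResizeOp : public OpKernel {  // placeholder, not in this commit
   public:
    explicit ExampleResizeOp(OpKernelConstruction* context) : OpKernel(context) {
      OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
    }

    void Compute(OpKernelContext* context) override {
      const Tensor& input = context->input(0);
      ImageResizerState st(align_corners_);
      // Validates the input and size tensors, fills st.height_scale,
      // st.width_scale, st.in_*/out_* fields, and allocates st.output.
      st.ValidateAndCreateOutput(context, input);
      if (!context->status().ok()) return;  // the helper only sets the status

      // ... interpolate from input into st.output using st.height_scale etc. ...
    }

   private:
    bool align_corners_;
  };

  }  // namespace tensorflow

Note how the scale is computed: with align_corners the corner pixels are mapped onto each other, so resizing a height of 4 down to 2 gives height_scale = (4 - 1) / (2 - 1) = 3, whereas without it the scale is simply 4 / 2 = 2.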
@ -492,6 +492,8 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
// OD: output_depth
// KR: kernel_rows
// KC: kernel_cols
// STR: stride
// PAD: padding

#define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, \
                                 LABEL)                                  \
@ -509,12 +511,25 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));          \
  }                                                                          \
  static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) {               \
    BM_ConvFloatDepthwise(                                                   \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR,  \
        PAD, true,                                                           \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",  \
                        KR, "_", KC, "_", STR, "_", PAD, "_gpu"));           \
  }                                                                          \
  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL);                           \
  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL)
  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL);                           \
  BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL);

// TODO(andydavis,jmchen) Add more benchmarks.
// The configurations below are mostly from mobilenet models.
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);

static void BM_LRNFloat(int iters, int depth, int cols, int rows,
                        int batch_size, int range, int num_threads,
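Reviewer note: as a reading aid (not part of the commit), one of the invocations above decoded against the macro's parameter list; the long parameter names are inferred from the abbreviations and are only assumptions.

  // BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
  //   BS = 32        batch size
  //   R, C = 56, 56  input rows / cols
  //   ID = 128       input depth (assumed), DM = 1 depth multiplier (assumed)
  //   OD = 128       output depth
  //   KR, KC = 3, 3  kernel rows / cols
  //   STR = 2        stride, PAD = SAME padding
  // The macro expands into _cpu1, _cpu4 and (new in this change) _gpu benchmarks.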
@ -30,147 +30,6 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> {
 public:
  using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp;

  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
    functor::Relu<Device, T> functor;
    functor(context->eigen_device<Device>(), input.flat<T>(),
            output->flat<T>());
  }
};

// Out of line check to save code space (we have this code once, rather
// than once for every NDIMS * NumTypes * Num_different_relu_variants
// functions).
static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
                                   const Tensor& a) {
  OP_REQUIRES(context, a.IsSameSize(g),
              errors::InvalidArgument("g and a must be the same size"));
}
static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
                             const Tensor& a) {
  ValidateSameSizeHelper(context, g, a);
  return context->status().ok();
}

template <typename Device, typename T>
class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> {
 public:
  using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp;

  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                         const Tensor& a, Tensor* output);

  // INPUTS:
  //   g (gradients): backpropagated gradients
  //   a (inputs): either the inputs that were passed to ReluOp(), or its
  //               outputs (using either one yields the same result here).
  // OUTPUT:
  //   gradients to backprop
  template <int NDIMS>
  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
               Tensor* output) {
    OperateNoTemplate(context, g, a, output);
  }
};

template <typename Device, typename T>
void ReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
                                              const Tensor& g, const Tensor& a,
                                              Tensor* output) {
  if (!ValidateSameSize(context, g, a)) return;
  functor::ReluGrad<Device, T> functor;
  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
          output->flat<T>());
}

template <typename Device, typename T>
class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> {
 public:
  using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp;

  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
    functor::Relu6<Device, T> functor;
    functor(context->eigen_device<Device>(), input.flat<T>(),
            output->flat<T>());
  }
};

template <typename Device, typename T>
class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> {
 public:
  using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp;

  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                         const Tensor& a, Tensor* output);

  // INPUTS:
  //   g (gradients): backpropagated gradients
  //   a (inputs): inputs that were passed to Relu6Op()
  // OUTPUT:
  //   gradients to backprop
  template <int NDIMS>
  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
               Tensor* output) {
    OperateNoTemplate(context, g, a, output);
  }
};

template <typename Device, typename T>
void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
                                               const Tensor& g, const Tensor& a,
                                               Tensor* output) {
  if (!ValidateSameSize(context, g, a)) return;
  functor::Relu6Grad<Device, T> functor;
  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
          output->flat<T>());
}

template <typename Device, typename T>
class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> {
 public:
  using UnaryElementWiseOp<T, EluOp<Device, T>>::UnaryElementWiseOp;

  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
    functor::Elu<Device, T> functor;
    functor(context->eigen_device<Device>(), input.flat<T>(),
            output->flat<T>());
  }
};

template <typename Device, typename T>
class EluGradOp : public BinaryElementWiseOp<T, EluGradOp<Device, T>> {
 public:
  using BinaryElementWiseOp<T, EluGradOp<Device, T>>::BinaryElementWiseOp;

  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                         const Tensor& a, Tensor* output);

  // INPUTS:
  //   g (gradients): backpropagated gradients
  //   a (outputs): outputs of the EluOp()
  // OUTPUT:
  //   gradients to backprop
  template <int NDIMS>
  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
               Tensor* output) {
    OperateNoTemplate(context, g, a, output);
  }
};

template <typename Device, typename T>
void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
                                             const Tensor& g, const Tensor& a,
                                             Tensor* output) {
  if (!ValidateSameSize(context, g, a)) return;
  functor::EluGrad<Device, T> functor;
  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
          output->flat<T>());
}

#define REGISTER_RELU_KERNELS(type)                                 \
  REGISTER_KERNEL_BUILDER(                                          \
      Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"),    \
@ -13,118 +13,168 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#ifndef TENSORFLOW_KERNELS_RELU_OP_H_
#define TENSORFLOW_KERNELS_RELU_OP_H_
// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.

#define EIGEN_USE_THREADS

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/relu_op_functor.h"
#include "tensorflow/core/lib/core/errors.h"

namespace tensorflow {
namespace functor {

// Functor used by ReluOp to do the computations.
template <typename Device, typename T>
struct Relu {
  // Computes Relu activation.
  //
  // features: any shape.
  // activations: same shape as "features".
  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor activations) {
    activations.device(d) = features.cwiseMax(static_cast<T>(0));
class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> {
 public:
  using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp;

  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
    functor::Relu<Device, T> functor;
    functor(context->eigen_device<Device>(), input.flat<T>(),
            output->flat<T>());
  }
};

// Functor used by ReluGradOp to do the computations.
template <typename Device, typename T>
struct ReluGrad {
  // Computes ReluGrad backprops.
  //
  // gradients: gradients backpropagated to the Relu op.
  // features: either the inputs that were passed to the Relu op, or its
  //           outputs (using either one yields the same result here).
  // backprops: gradients to backpropagate to the Relu inputs.
  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
                  typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor backprops) {
    // NOTE: When the activation is exactly zero, we do not propagate the
    // associated gradient value. This allows the output of the Relu to be used,
    // as well as its input.
    backprops.device(d) =
        gradients * (features > features.constant(static_cast<T>(0)));
// Out of line check to save code space (we have this code once, rather
// than once for every NDIMS * NumTypes * Num_different_relu_variants
// functions).
struct ReluHelpers {
  static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
                                     const Tensor& a) {
    OP_REQUIRES(context, a.IsSameSize(g),
                errors::InvalidArgument("g and a must be the same size"));
  }
  static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
                               const Tensor& a) {
    ValidateSameSizeHelper(context, g, a);
    return context->status().ok();
  }
};

// Functor used by Relu6Op to do the computations.
template <typename Device, typename T>
struct Relu6 {
  // Computes Relu6 activation.
  //
  // features: any shape.
  // activations: same shape as "features".
  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor activations) {
    activations.device(d) =
        features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> {
 public:
  using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp;

  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                         const Tensor& a, Tensor* output);

  // INPUTS:
  //   g (gradients): backpropagated gradients
  //   a (inputs): either the inputs that were passed to ReluOp(), or its
  //               outputs (using either one yields the same result here).
  // OUTPUT:
  //   gradients to backprop
  template <int NDIMS>
  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
               Tensor* output) {
    OperateNoTemplate(context, g, a, output);
  }
};

// Functor used by Relu6GradOp to do the computations.
template <typename Device, typename T>
struct Relu6Grad {
  // Computes Relu6Grad backprops.
  //
  // gradients: gradients backpropagated to the Relu6 op.
  // features: inputs that were passed to the Relu6 op.
  // backprops: gradients to backpropagate to the Relu6 inputs.
  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
                  typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor backprops) {
    // NOTE: When the activation is exactly zero or six, we
    // arbitrarily choose to not propagate the associated gradient
    // value.
    backprops.device(d) = gradients *
                          (features > features.constant(static_cast<T>(0))) *
                          (features < features.constant(static_cast<T>(6)));
void ReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
                                              const Tensor& g, const Tensor& a,
                                              Tensor* output) {
  if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
  functor::ReluGrad<Device, T> functor;
  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
          output->flat<T>());
}

template <typename Device, typename T>
class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> {
 public:
  using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp;

  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
    functor::Relu6<Device, T> functor;
    functor(context->eigen_device<Device>(), input.flat<T>(),
            output->flat<T>());
  }
};

// Functor used by EluOp to do the computations.
template <typename Device, typename T>
struct Elu {
  // Computes Elu activation.
  //
  // features: any shape.
  // activations: same shape as "features".
  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor activations) {
    // features.constant(?)
    activations.device(d) =
        (features < static_cast<T>(0))
            .select(features.exp() - features.constant(static_cast<T>(1)),
                    features);
class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> {
 public:
  using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp;

  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                         const Tensor& a, Tensor* output);

  // INPUTS:
  //   g (gradients): backpropagated gradients
  //   a (inputs): inputs that were passed to Relu6Op()
  // OUTPUT:
  //   gradients to backprop
  template <int NDIMS>
  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
               Tensor* output) {
    OperateNoTemplate(context, g, a, output);
  }
};

// Functor used by EluGradOp to do the computations.
template <typename Device, typename T>
struct EluGrad {
  // Computes EluGrad backprops.
  //
  // gradients: gradients backpropagated to the Elu op.
  // activations: outputs of the Elu op.
  // backprops: gradients to backpropagate to the Elu inputs.
  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
                  typename TTypes<T>::ConstTensor activations,
                  typename TTypes<T>::Tensor backprops) {
    backprops.device(d) =
        (activations < static_cast<T>(0))
            .select((activations + static_cast<T>(1)) * gradients, gradients);
void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
                                               const Tensor& g, const Tensor& a,
                                               Tensor* output) {
  if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
  functor::Relu6Grad<Device, T> functor;
  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
          output->flat<T>());
}

template <typename Device, typename T>
class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> {
 public:
  using UnaryElementWiseOp<T, EluOp<Device, T>>::UnaryElementWiseOp;

  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
    functor::Elu<Device, T> functor;
    functor(context->eigen_device<Device>(), input.flat<T>(),
            output->flat<T>());
  }
};

}  // namespace functor
template <typename Device, typename T>
class EluGradOp : public BinaryElementWiseOp<T, EluGradOp<Device, T>> {
 public:
  using BinaryElementWiseOp<T, EluGradOp<Device, T>>::BinaryElementWiseOp;

  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                         const Tensor& a, Tensor* output);

  // INPUTS:
  //   g (gradients): backpropagated gradients
  //   a (outputs): outputs of the EluOp()
  // OUTPUT:
  //   gradients to backprop
  template <int NDIMS>
  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
               Tensor* output) {
    OperateNoTemplate(context, g, a, output);
  }
};

template <typename Device, typename T>
void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
                                             const Tensor& g, const Tensor& a,
                                             Tensor* output) {
  if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
  functor::EluGrad<Device, T> functor;
  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
          output->flat<T>());
}

}  // namespace tensorflow

#undef EIGEN_USE_THREADS

#endif  // TENSORFLOW_KERNELS_RELU_OP_H_
130 tensorflow/core/kernels/relu_op_functor.h Normal file
@ -0,0 +1,130 @@
/* Copyright 2015 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
#define TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"

namespace tensorflow {
namespace functor {

// Functor used by ReluOp to do the computations.
template <typename Device, typename T>
struct Relu {
  // Computes Relu activation.
  //
  // features: any shape.
  // activations: same shape as "features".
  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor activations) {
    activations.device(d) = features.cwiseMax(static_cast<T>(0));
  }
};

// Functor used by ReluGradOp to do the computations.
template <typename Device, typename T>
struct ReluGrad {
  // Computes ReluGrad backprops.
  //
  // gradients: gradients backpropagated to the Relu op.
  // features: either the inputs that were passed to the Relu op, or its
  //           outputs (using either one yields the same result here).
  // backprops: gradients to backpropagate to the Relu inputs.
  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
                  typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor backprops) {
    // NOTE: When the activation is exactly zero, we do not propagate the
    // associated gradient value. This allows the output of the Relu to be used,
    // as well as its input.
    backprops.device(d) =
        gradients * (features > features.constant(static_cast<T>(0)));
  }
};

// Functor used by Relu6Op to do the computations.
template <typename Device, typename T>
struct Relu6 {
  // Computes Relu6 activation.
  //
  // features: any shape.
  // activations: same shape as "features".
  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor activations) {
    activations.device(d) =
        features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
  }
};

// Functor used by Relu6GradOp to do the computations.
template <typename Device, typename T>
struct Relu6Grad {
  // Computes Relu6Grad backprops.
  //
  // gradients: gradients backpropagated to the Relu6 op.
  // features: inputs that were passed to the Relu6 op.
  // backprops: gradients to backpropagate to the Relu6 inputs.
  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
                  typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor backprops) {
    // NOTE: When the activation is exactly zero or six, we
    // arbitrarily choose to not propagate the associated gradient
    // value.
    backprops.device(d) = gradients *
                          (features > features.constant(static_cast<T>(0))) *
                          (features < features.constant(static_cast<T>(6)));
  }
};

// Functor used by EluOp to do the computations.
template <typename Device, typename T>
struct Elu {
  // Computes Elu activation.
  //
  // features: any shape.
  // activations: same shape as "features".
  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                  typename TTypes<T>::Tensor activations) {
    // features.constant(?)
    activations.device(d) =
        (features < static_cast<T>(0))
            .select(features.exp() - features.constant(static_cast<T>(1)),
                    features);
  }
};

// Functor used by EluGradOp to do the computations.
template <typename Device, typename T>
struct EluGrad {
  // Computes EluGrad backprops.
  //
  // gradients: gradients backpropagated to the Elu op.
  // activations: outputs of the Elu op.
  // backprops: gradients to backpropagate to the Elu inputs.
  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
                  typename TTypes<T>::ConstTensor activations,
                  typename TTypes<T>::Tensor backprops) {
    backprops.device(d) =
        (activations < static_cast<T>(0))
            .select((activations + static_cast<T>(1)) * gradients, gradients);
  }
};

}  // namespace functor
}  // namespace tensorflow

#endif  // TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
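Reviewer note: for reference, the elementwise formulas the functors above implement, written out (standard math implied by the code, with the zero/six boundary convention noted in the comments):

  \mathrm{relu}(x) = \max(x, 0), \qquad \tfrac{d}{dx}\,\mathrm{relu}(x) = [x > 0]
  \mathrm{relu6}(x) = \min(\max(x, 0), 6), \qquad \tfrac{d}{dx}\,\mathrm{relu6}(x) = [0 < x < 6]
  \mathrm{elu}(x) = \begin{cases} e^{x} - 1 & x < 0 \\ x & x \ge 0 \end{cases}, \qquad
  \tfrac{d}{dx}\,\mathrm{elu}(x) = \begin{cases} e^{x} = \mathrm{elu}(x) + 1 & x < 0 \\ 1 & x \ge 0 \end{cases}

The last identity is why EluGrad can be computed from the op's outputs alone: for negative inputs the derivative equals activations + 1, so the saved input is not needed.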
@ -19,7 +19,7 @@ limitations under the License.

#include <stdio.h>

#include "tensorflow/core/kernels/relu_op.h"
#include "tensorflow/core/kernels/relu_op_functor.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

@ -40,49 +41,22 @@ class ResizeAreaOp : public OpKernel {

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().DebugString()));
    const Tensor& shape_t = context->input(1);
    OP_REQUIRES(context, shape_t.dims() == 1,
                errors::InvalidArgument("shape_t must be 1-dimensional",
                                        shape_t.shape().DebugString()));
    OP_REQUIRES(context, shape_t.NumElements() == 2,
                errors::InvalidArgument("shape_t must have two elements",
                                        shape_t.shape().DebugString()));
    ImageResizerState st(align_corners_);
    st.ValidateAndCreateOutput(context, input);

    auto Svec = shape_t.vec<int32>();
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, TensorShape({input.dim_size(0), Svec(0),
                                                Svec(1), input.dim_size(3)}),
                                &output));
    const int64 batch_size = input.dim_size(0);
    const int64 in_height = input.dim_size(1);
    const int64 in_width = input.dim_size(2);
    const int64 channels = input.dim_size(3);
    const int64 out_height = output->dim_size(1);
    const int64 out_width = output->dim_size(2);
    if (!context->status().ok()) return;

    typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
    typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
    typename TTypes<float, 4>::Tensor output_data =
        st.output->tensor<float, 4>();

    // A temporary tensor for computing the sum.
    Tensor sum_tensor;
    OP_REQUIRES_OK(
        context, context->allocate_temp(DataTypeToEnum<float>::value,
                                        TensorShape({channels}), &sum_tensor));
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value,
                                                   TensorShape({st.channels}),
                                                   &sum_tensor));
    typename TTypes<float, 1>::Tensor sum_data = sum_tensor.vec<float>();

    const float height_scale =
        (align_corners_ && out_height > 1)
            ? (in_height - 1) / static_cast<float>(out_height - 1)
            : in_height / static_cast<float>(out_height);
    const float width_scale =
        (align_corners_ && out_width > 1)
            ? (in_width - 1) / static_cast<float>(out_width - 1)
            : in_width / static_cast<float>(out_width);

    // When using this algorithm for downsizing, the target pixel value is the
    // weighted average of all the source pixels. The weight is determined by
    // the contribution percentage of the source pixel.
@ -102,19 +76,19 @@ class ResizeAreaOp : public OpKernel {
    //   out[0] = (in[0] * 1.0 + in[1] * 1/3) * scale
    //   out[1] = (in[1] * 2/3 + in[2] * 2/3) * scale
    //   out[2] = (in[2] * 1/3 + in[3] * 1.0) * scale
    float scale = 1.0 / (height_scale * width_scale);
    for (int64 b = 0; b < batch_size; ++b) {
      for (int64 y = 0; y < out_height; ++y) {
        const float in_y = y * height_scale;
        const float in_y1 = (y + 1) * height_scale;
    float scale = 1.0 / (st.height_scale * st.width_scale);
    for (int64 b = 0; b < st.batch_size; ++b) {
      for (int64 y = 0; y < st.out_height; ++y) {
        const float in_y = y * st.height_scale;
        const float in_y1 = (y + 1) * st.height_scale;
        // The start and end height indices of all the cells that could
        // contribute to the target cell.
        int64 y_start = floor(in_y);
        int64 y_end = ceil(in_y1);

        for (int64 x = 0; x < out_width; ++x) {
          const float in_x = x * width_scale;
          const float in_x1 = (x + 1) * width_scale;
        for (int64 x = 0; x < st.out_width; ++x) {
          const float in_x = x * st.width_scale;
          const float in_x1 = (x + 1) * st.width_scale;
          // The start and end width indices of all the cells that could
          // contribute to the target cell.
          int64 x_start = floor(in_x);
@ -127,16 +101,16 @@ class ResizeAreaOp : public OpKernel {
          for (int64 j = x_start; j < x_end; ++j) {
            float scale_x =
                j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0);
            for (int64 c = 0; c < channels; ++c) {
            for (int64 c = 0; c < st.channels; ++c) {
#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
              sum_data(c) +=
                  input_data(b, BOUND(i, in_height), BOUND(j, in_width), c) *
                  scale_y * scale_x * scale;
              sum_data(c) += input_data(b, BOUND(i, st.in_height),
                                        BOUND(j, st.in_width), c) *
                             scale_y * scale_x * scale;
#undef BOUND
            }
          }
        }
        for (int64 c = 0; c < channels; ++c) {
        for (int64 c = 0; c < st.channels; ++c) {
          output_data(b, y, x, c) = sum_data(c);
        }
      }
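Reviewer note: a quick numeric check of the weighting scheme described above (illustrative only, 1-D case with in_width = 3 and out_width = 2, so width_scale = 1.5 and each output cell averages the input cells it overlaps):

  // out[0] covers [0.0, 1.5): out[0] = (in[0] * 1.0 + in[1] * 0.5) / 1.5
  // out[1] covers [1.5, 3.0): out[1] = (in[1] * 0.5 + in[2] * 1.0) / 1.5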
@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

@ -92,62 +93,28 @@ class ResizeBicubicOp : public OpKernel {

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().DebugString()));
    const Tensor& shape_t = context->input(1);
    OP_REQUIRES(context, shape_t.dims() == 1,
                errors::InvalidArgument("shape_t must be 1-dimensional",
                                        shape_t.shape().DebugString()));
    OP_REQUIRES(context, shape_t.NumElements() == 2,
                errors::InvalidArgument("shape_t must have two elements",
                                        shape_t.shape().DebugString()));
    ImageResizerState st(align_corners_);
    st.ValidateAndCreateOutput(context, input);

    auto Svec = shape_t.vec<int32>();
    // Initialize shape to the batch size of the input, then add
    // the rest of the dimensions
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, TensorShape({input.dim_size(0), Svec(0),
                                                Svec(1), input.dim_size(3)}),
                                &output));
    const int64 batch_size = input.dim_size(0);
    const int64 in_height = input.dim_size(1);
    const int64 in_width = input.dim_size(2);
    const int64 channels = input.dim_size(3);
    const int64 out_height = output->dim_size(1);
    const int64 out_width = output->dim_size(2);
    CHECK_GT(in_height, 0);
    CHECK_GT(in_width, 0);
    CHECK_GT(channels, 0);
    CHECK_GT(out_height, 0);
    CHECK_GT(out_width, 0);
    if (!context->status().ok()) return;

    typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
    typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();

    const float height_scale =
        (align_corners_ && out_height > 1)
            ? (in_height - 1) / static_cast<float>(out_height - 1)
            : in_height / static_cast<float>(out_height);
    const float width_scale =
        (align_corners_ && out_width > 1)
            ? (in_width - 1) / static_cast<float>(out_width - 1)
            : in_width / static_cast<float>(out_width);
    typename TTypes<float, 4>::Tensor output_data =
        st.output->tensor<float, 4>();

    std::array<float, 4> coeff = {{0.0, 0.0, 0.0, 0.0}};
    for (int64 b = 0; b < batch_size; ++b) {
      for (int64 y = 0; y < out_height; ++y) {
    for (int64 b = 0; b < st.batch_size; ++b) {
      for (int64 y = 0; y < st.out_height; ++y) {
        std::array<float, 4> y_weights;
        std::array<int64, 4> y_indices;
        GetWeightsAndIndices(height_scale, y, in_height, &y_weights,
        GetWeightsAndIndices(st.height_scale, y, st.in_height, &y_weights,
                             &y_indices);
        for (int64 x = 0; x < out_width; ++x) {
        for (int64 x = 0; x < st.out_width; ++x) {
          std::array<float, 4> x_weights;
          std::array<int64, 4> x_indices;
          GetWeightsAndIndices(width_scale, x, in_width, &x_weights,
          GetWeightsAndIndices(st.width_scale, x, st.in_width, &x_weights,
                               &x_indices);
          for (int64 c = 0; c < channels; ++c) {
          for (int64 c = 0; c < st.channels; ++c) {
            // Use a 4x4 patch to compute the interpolated output value at
            // (b, y, x, c).
            for (int64 i = 0; i < 4; ++i) {
@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

@ -39,64 +40,29 @@ class ResizeBilinearOp : public OpKernel {

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().DebugString()));
    const Tensor& shape_t = context->input(1);
    OP_REQUIRES(context, shape_t.dims() == 1,
                errors::InvalidArgument("shape_t must be 1-dimensional",
                                        shape_t.shape().DebugString()));
    OP_REQUIRES(context, shape_t.NumElements() == 2,
                errors::InvalidArgument("shape_t must have two elements",
                                        shape_t.shape().DebugString()));
    ImageResizerState st(align_corners_);
    st.ValidateAndCreateOutput(context, input);

    auto Svec = shape_t.vec<int32>();
    // Initialize shape to the batch size of the input, then add
    // the rest of the dimensions
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, TensorShape({input.dim_size(0), Svec(0),
                                                Svec(1), input.dim_size(3)}),
                                &output));

    const int64 batch_size = input.dim_size(0);
    const int64 in_height = input.dim_size(1);
    const int64 in_width = input.dim_size(2);
    const int64 channels = input.dim_size(3);
    const int64 out_height = output->dim_size(1);
    const int64 out_width = output->dim_size(2);
    CHECK_GT(in_height, 0);
    CHECK_GT(in_width, 0);
    CHECK_GT(channels, 0);
    CHECK_GT(out_height, 0);
    CHECK_GT(out_width, 0);
    if (!context->status().ok()) return;

    typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
    typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
    typename TTypes<float, 4>::Tensor output_data =
        st.output->tensor<float, 4>();

    const float height_scale =
        (align_corners_ && out_height > 1)
            ? (in_height - 1) / static_cast<float>(out_height - 1)
            : in_height / static_cast<float>(out_height);
    const float width_scale =
        (align_corners_ && out_width > 1)
            ? (in_width - 1) / static_cast<float>(out_width - 1)
            : in_width / static_cast<float>(out_width);

    for (int b = 0; b < batch_size; ++b) {
      for (int y = 0; y < out_height; ++y) {
        const float in_y = y * height_scale;
    for (int b = 0; b < st.batch_size; ++b) {
      for (int y = 0; y < st.out_height; ++y) {
        const float in_y = y * st.height_scale;
        const int top_y_index = static_cast<int>(floorf(in_y));
        const int bottom_y_index =
            std::min(static_cast<int64>(ceilf(in_y)), (in_height - 1));
            std::min(static_cast<int64>(ceilf(in_y)), (st.in_height - 1));
        const float y_lerp = in_y - top_y_index;
        for (int x = 0; x < out_width; ++x) {
          const float in_x = x * width_scale;
        for (int x = 0; x < st.out_width; ++x) {
          const float in_x = x * st.width_scale;
          const int left_x_index = static_cast<int>(floorf(in_x));
          const int right_x_index =
              std::min(static_cast<int64>(ceilf(in_x)), (in_width - 1));
              std::min(static_cast<int64>(ceilf(in_x)), (st.in_width - 1));
          const float x_lerp = in_x - left_x_index;
          for (int c = 0; c < channels; ++c) {
          for (int c = 0; c < st.channels; ++c) {
            const float top_left = input_data(b, top_y_index, left_x_index, c);
            const float top_right =
                input_data(b, top_y_index, right_x_index, c);
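Reviewer note: the loop body is truncated in this view; per channel it evaluates the standard bilinear blend of the four neighbours (the exact intermediate names in the elided lines are not shown here, so treat this as a sketch):

  // top    = top_left    + (top_right    - top_left)    * x_lerp
  // bottom = bottom_left + (bottom_right - bottom_left) * x_lerp
  // output_data(b, y, x, c) = top + (bottom - top) * y_lerp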
@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

@ -44,56 +45,28 @@ class ResizeNearestNeighborOp : public OpKernel {

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().DebugString()));
    const Tensor& shape_t = context->input(1);
    OP_REQUIRES(context, shape_t.dims() == 1,
                errors::InvalidArgument("shape_t must be 1-dimensional",
                                        shape_t.shape().DebugString()));
    OP_REQUIRES(context, shape_t.NumElements() == 2,
                errors::InvalidArgument("shape_t must have two elements",
                                        shape_t.shape().DebugString()));
    ImageResizerState st(align_corners_);
    st.ValidateAndCreateOutput(context, input);

    auto sizes = shape_t.vec<int32>();
    OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0,
                errors::InvalidArgument("shape_t's elements must be positive"));
    if (!context->status().ok()) return;

    // Initialize shape to the batch size of the input, then add
    // the rest of the dimensions
    Tensor* output = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output(0, TensorShape({input.dim_size(0), sizes(0),
                                                          sizes(1), input.dim_size(3)}),
                                          &output));

    const int64 batch_size = input.dim_size(0);
    const int64 in_height = input.dim_size(1);
    const int64 in_width = input.dim_size(2);
    const int64 channels = input.dim_size(3);
    const int64 out_height = output->dim_size(1);
    const int64 out_width = output->dim_size(2);
    OP_REQUIRES(context, st.in_height < (1 << 24) && st.in_width < (1 << 24),
                errors::InvalidArgument("nearest neighbor requires max height "
                                        "& width of 2^24"));

    typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
    typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
    typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();

    const float height_scale =
        (align_corners_ && out_height > 1)
            ? (in_height - 1) / static_cast<float>(out_height - 1)
            : in_height / static_cast<float>(out_height);
    const float width_scale =
        (align_corners_ && out_width > 1)
            ? (in_width - 1) / static_cast<float>(out_width - 1)
            : in_width / static_cast<float>(out_width);

    for (int b = 0; b < batch_size; ++b) {
      for (int y = 0; y < out_height; ++y) {
        const int in_y = std::min(static_cast<int64>(floorf(y * height_scale)),
                                  (in_height - 1));
        for (int x = 0; x < out_width; ++x) {
          const int in_x = std::min(static_cast<int64>(floorf(x * width_scale)),
                                    (in_width - 1));
          for (int c = 0; c < channels; ++c) {
    for (int b = 0; b < st.batch_size; ++b) {
      for (int y = 0; y < st.out_height; ++y) {
        const int in_y =
            std::min(static_cast<int64>(floorf(y * st.height_scale)),
                     (st.in_height - 1));
        for (int x = 0; x < st.out_width; ++x) {
          const int in_x =
              std::min(static_cast<int64>(floorf(x * st.width_scale)),
                       (st.in_width - 1));
          for (int c = 0; c < st.channels; ++c) {
            output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
          }
        }
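Reviewer note: in equation form, the mapping the loops above implement is simply

  in\_y = \min(\lfloor y \cdot height\_scale \rfloor,\; in\_height - 1), \qquad
  in\_x = \min(\lfloor x \cdot width\_scale \rfloor,\; in\_width - 1), \qquad
  output(b, y, x, c) = input(b, in\_y, in\_x, c).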
@ -28,29 +28,6 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
class SoftmaxOp : public OpKernel {
 public:
  explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
    log_ = StringPiece(name()).starts_with("Log");
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& logits_in = context->input(0);
    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                errors::InvalidArgument("logits must be 2-dimensional"));
    Tensor* softmax_out = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output(0, logits_in.shape(), &softmax_out));
    functor::SoftmaxFunctor<Device, T> functor;
    functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
            softmax_out->matrix<T>(), log_);
  }

 private:
  bool log_;
};

// Partial specialization for a CPUDevice, that uses the Eigen implementation
// from SoftmaxEigenImpl.
namespace functor {
@ -13,89 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_H_
#define TENSORFLOW_KERNELS_SOFTMAX_OP_H_
// Functor definition for SoftmaxOp, must be compilable by nvcc.

#define EIGEN_USE_THREADS

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/softmax_op_functor.h"

namespace tensorflow {
namespace functor {

// Functor used by SoftmaxOp to do the computations.
template <typename Device, typename T>
struct SoftmaxFunctor {
  // Computes Softmax or LogSoftmax activation.
  //
  // logits: dim: batch_size, num_classes.
  // softmax: dims: batch_size, num_classes.
  // log: boolean
  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
                  typename TTypes<T>::Matrix softmax, const bool log);
};
class SoftmaxOp : public OpKernel {
 public:
  explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
    log_ = StringPiece(name()).starts_with("Log");
  }

// Eigen code implementing SoftmaxFunctor::operator() or
// LogSoftmaxFunctor::operator().
// This code works for both CPU and GPU and is used by the functor
// specializations for both device types.
template <typename Device, typename T>
struct SoftmaxEigenImpl {
  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
                      typename TTypes<T>::Matrix softmax, const bool log) {
    const int kBatchDim = 0;
    const int kClassDim = 1;

    const int batch_size = logits.dimension(kBatchDim);
    const int num_classes = logits.dimension(kClassDim);

    // These arrays are used to reduce along the class dimension, and broadcast
    // the resulting value to all classes.
#if !defined(EIGEN_HAS_INDEX_LIST)
    Eigen::DSizes<int, 1> along_class(kClassDim);
    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
#else
    Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
    Eigen::IndexList<Eigen::type2index<1> > depth_dim;
    Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
    batch_by_one.set(0, batch_size);
    Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
    one_by_class.set(1, num_classes);
#endif
    // shifted_logits = logits - max(logits along classes);
    auto shifted_logits = (logits - logits.maximum(along_class)
                                        .eval()
                                        .reshape(batch_by_one)
                                        .broadcast(one_by_class));
    if (log) {
      // Calculate the log of the softmax
      // softmax = logits - max(logits along classes);
      softmax.device(d) = shifted_logits;
      // softmax = softmax - log(sum(exp(softmax along classes)));
      softmax.device(d) = (softmax -
                           softmax.exp().sum(along_class)
                               .eval()
                               .reshape(batch_by_one)
                               .broadcast(one_by_class)
                               .log());
    } else {
      // NOTE(touts): If you modify this implementation please run
      // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
      //
      // softmax = exp(logits - max(logits along classes));
      softmax.device(d) = shifted_logits.exp();
      // softmax = softmax / sum(softmax along classes);
      softmax.device(d) = (softmax /
                           softmax.sum(along_class)
                               .eval()
                               .reshape(batch_by_one)
                               .broadcast(one_by_class));
  void Compute(OpKernelContext* context) override {
    const Tensor& logits_in = context->input(0);
    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                errors::InvalidArgument("logits must be 2-dimensional"));
    Tensor* softmax_out = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output(0, logits_in.shape(), &softmax_out));
    if (logits_in.NumElements()) {
      functor::SoftmaxFunctor<Device, T> functor;
      functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
              softmax_out->matrix<T>(), log_);
    }
  }

 private:
  bool log_;
};

}  // namespace functor
}  // namespace tensorflow

#undef EIGEN_USE_THREADS

#endif  // TENSORFLOW_KERNELS_SOFTMAX_OP_H_
101 tensorflow/core/kernels/softmax_op_functor.h Normal file
@ -0,0 +1,101 @@
/* Copyright 2015 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
#define TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
// Functor definition for SoftmaxOp, must be compilable by nvcc.

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"

namespace tensorflow {
namespace functor {

// Functor used by SoftmaxOp to do the computations.
template <typename Device, typename T>
struct SoftmaxFunctor {
  // Computes Softmax or LogSoftmax activation.
  //
  // logits: dim: batch_size, num_classes.
  // softmax: dims: batch_size, num_classes.
  // log: boolean
  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
                  typename TTypes<T>::Matrix softmax, const bool log);
};

// Eigen code implementing SoftmaxFunctor::operator() or
// LogSoftmaxFunctor::operator().
// This code works for both CPU and GPU and is used by the functor
// specializations for both device types.
template <typename Device, typename T>
struct SoftmaxEigenImpl {
  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
                      typename TTypes<T>::Matrix softmax, const bool log) {
    const int kBatchDim = 0;
    const int kClassDim = 1;

    const int batch_size = logits.dimension(kBatchDim);
    const int num_classes = logits.dimension(kClassDim);

    // These arrays are used to reduce along the class dimension, and broadcast
    // the resulting value to all classes.
#if !defined(EIGEN_HAS_INDEX_LIST)
    Eigen::DSizes<int, 1> along_class(kClassDim);
    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
#else
    Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
    Eigen::IndexList<Eigen::type2index<1> > depth_dim;
    Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
    batch_by_one.set(0, batch_size);
    Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
    one_by_class.set(1, num_classes);
#endif
    // shifted_logits = logits - max(logits along classes);
    auto shifted_logits = (logits - logits.maximum(along_class)
                                        .eval()
                                        .reshape(batch_by_one)
                                        .broadcast(one_by_class));
    if (log) {
      // Calculate the log of the softmax
      // softmax = logits - max(logits along classes);
      softmax.device(d) = shifted_logits;
      // softmax = softmax - log(sum(exp(softmax along classes)));
      softmax.device(d) = (softmax -
                           softmax.exp().sum(along_class)
                               .eval()
                               .reshape(batch_by_one)
                               .broadcast(one_by_class)
                               .log());
    } else {
      // NOTE(touts): If you modify this implementation please run
      // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
      //
      // softmax = exp(logits - max(logits along classes));
      softmax.device(d) = shifted_logits.exp();
      // softmax = softmax / sum(softmax along classes);
      softmax.device(d) = (softmax /
                           softmax.sum(along_class)
                               .eval()
                               .reshape(batch_by_one)
                               .broadcast(one_by_class));
    }
  }
};

}  // namespace functor
}  // namespace tensorflow

#endif  // TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
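Reviewer note: the Eigen expressions above implement the usual numerically stabilized formulas; written out per row,

  s_i = x_i - \max_j x_j, \qquad
  \mathrm{softmax}_i = \frac{e^{s_i}}{\sum_j e^{s_j}}, \qquad
  \log\mathrm{softmax}_i = s_i - \log \sum_j e^{s_j}.

Subtracting the row maximum first keeps the exponentials in range without changing the result.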
@ -17,7 +17,7 @@ limitations under the License.

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/softmax_op.h"
#include "tensorflow/core/kernels/softmax_op_functor.h"

#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
@ -21,6 +21,8 @@ limitations under the License.
#include <string>
#include <utility>

#include "tensorflow/core/kernels/spacetodepth_op.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
@ -89,28 +91,44 @@ class SpaceToDepthOp : public OpKernel {
    auto Toutput = outputs_tensor->tensor<T, 4>();
    auto Tinput = input.tensor<T, 4>();

    for (int b = 0; b < batch_size; ++b) {
      for (int h = 0; h < height; ++h) {
        const int out_h = h / block_size_;
        const int offset_h = (h % block_size_);
        for (int w = 0; w < width; ++w) {
          const int out_w = w / block_size_;
          const int offset_w = (w % block_size_);
          const int offset_d =
              (offset_h * block_size_ + offset_w) * input_depth;
          for (int d = 0; d < input_depth; ++d) {
            const int out_d = d + offset_d;
            Toutput(b, out_h, out_w, out_d) = Tinput(b, h, w, d);
          }
        }
      }
    }
    functor::SpaceToDepthOpFunctor<Device, T> functor;
    functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
  };

 private:
  int block_size_;
};

// Partial specialization of SpaceToDepthOpFunctor for a CPUDevice.
namespace functor {
template <typename T>
struct SpaceToDepthOpFunctor<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = output.dimension(0);
    const int input_height = input.dimension(1);
    const int input_width = input.dimension(2);
    const int input_depth = input.dimension(3);

    for (int b = 0; b < batch_size; ++b) {
      for (int h = 0; h < input_height; ++h) {
        const int out_h = h / block_size;
        const int offset_h = (h % block_size);
        for (int w = 0; w < input_width; ++w) {
          const int out_w = w / block_size;
          const int offset_w = (w % block_size);
          const int offset_d = (offset_h * block_size + offset_w) * input_depth;
          for (int d = 0; d < input_depth; ++d) {
            const int out_d = d + offset_d;
            output(b, out_h, out_w, out_d) = input(b, h, w, d);
          }
        }
      }
    }
  }
};
}  // namespace functor

#define REGISTER(type)                                                    \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("SpaceToDepth").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
@ -119,4 +137,10 @@ class SpaceToDepthOp : public OpKernel {
TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    SpaceToDepthOp<GPUDevice, float>);
#endif  // GOOGLE_CUDA

}  // end namespace tensorflow
44 tensorflow/core/kernels/spacetodepth_op.h Normal file
@ -0,0 +1,44 @@
|
||||
/* Copyright 2015 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_
|
||||
#define TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_
|
||||
// Functor definition for XentOp, must be compilable by nvcc.
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace functor {
|
||||
|
||||
// Functor used by SpaceToDepthOp to do the computations.
|
||||
template <typename Device, typename T>
|
||||
struct SpaceToDepthOpFunctor {
|
||||
// Implements the space to depth conversion.
|
||||
//
|
||||
// input: 4-D input tensor.
|
||||
// block_size: block size for the conversion.
|
||||
// output: 4-D output tensor.
|
||||
//
|
||||
// The dimensions of the tensors are guaranteed to be right when the
|
||||
// functor is called.
|
||||
void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
|
||||
int block_size, typename TTypes<T, 4>::Tensor output);
|
||||
};
|
||||
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_
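From Python, the functor declared here is exercised through `tf.space_to_depth`, with `tf.depth_to_space` as its inverse; a small usage sketch matching the shapes used in the unit tests later in this change:

```python
import tensorflow as tf

x = tf.constant([[[[1], [2]], [[3], [4]]]], dtype=tf.float32)  # shape [1, 2, 2, 1]
y = tf.space_to_depth(x, 2)   # shape [1, 1, 1, 4], values [1, 2, 3, 4]
z = tf.depth_to_space(y, 2)   # back to shape [1, 2, 2, 1]
```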
|
89
tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
Normal file
@ -0,0 +1,89 @@
|
||||
/* Copyright 2015 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/spacetodepth_op.h"
|
||||
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/util/cuda_kernel_helper.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
typedef Eigen::GpuDevice GPUDevice;
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void S2D(const int32 nthreads, const dtype* input_ptr,
|
||||
const int block_size, const int batch_size,
|
||||
const int input_height, const int input_width,
|
||||
const int input_depth, const int output_height,
|
||||
const int output_width, const int output_depth,
|
||||
dtype* output_ptr) {
|
||||
CUDA_1D_KERNEL_LOOP(inp_idx, nthreads) {
|
||||
// inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
|
||||
const int d = inp_idx % input_depth;
|
||||
const int inp_idx2 = inp_idx / input_depth;
|
||||
const int w = inp_idx2 % input_width;
|
||||
const int inp_idx3 = inp_idx2 / input_width;
|
||||
const int h = inp_idx3 % input_height;
|
||||
const int b = inp_idx3 / input_height;
|
||||
|
||||
const int out_h = h / block_size;
|
||||
const int offset_h = h % block_size;
|
||||
const int out_w = w / block_size;
|
||||
const int offset_w = w % block_size;
|
||||
const int offset_d = (offset_h * block_size + offset_w) * input_depth;
|
||||
const int out_d = d + offset_d;
|
||||
const int out_idx =
|
||||
out_d +
|
||||
output_depth * (out_w + output_width * (out_h + output_height * b));
|
||||
*(output_ptr + out_idx) = ldg(input_ptr + inp_idx);
|
||||
}
|
||||
}
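The flat-index arithmetic in the CUDA kernel above can be checked in isolation; a Python sketch of the same decomposition (names are ours, for illustration only):

```python
def s2d_output_index(inp_idx, block_size, input_height, input_width, input_depth,
                     output_height, output_width, output_depth):
  # inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
  d = inp_idx % input_depth
  w = (inp_idx // input_depth) % input_width
  h = (inp_idx // (input_depth * input_width)) % input_height
  b = inp_idx // (input_depth * input_width * input_height)

  out_h, offset_h = divmod(h, block_size)
  out_w, offset_w = divmod(w, block_size)
  out_d = d + (offset_h * block_size + offset_w) * input_depth
  return out_d + output_depth * (out_w + output_width * (out_h + output_height * b))
```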
|
||||
|
||||
// Specialization of SpaceToDepthOpFunctor for a GPUDevice.
|
||||
namespace functor {
|
||||
template <typename T>
|
||||
struct SpaceToDepthOpFunctor<GPUDevice, T> {
|
||||
void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
|
||||
int block_size, typename TTypes<T, 4>::Tensor output) {
|
||||
const int batch_size = output.dimension(0);
|
||||
const int input_height = input.dimension(1);
|
||||
const int input_width = input.dimension(2);
|
||||
const int input_depth = input.dimension(3);
|
||||
const int output_height = output.dimension(1);
|
||||
const int output_width = output.dimension(2);
|
||||
const int output_depth = output.dimension(3);
|
||||
|
||||
const int total_count =
|
||||
batch_size * input_height * input_width * input_depth;
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
|
||||
S2D<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
config.virtual_thread_count, input.data(), block_size, batch_size,
|
||||
input_height, input_width, input_depth, output_height, output_width,
|
||||
output_depth, output.data());
|
||||
}
|
||||
};
|
||||
} // end namespace functor
|
||||
|
||||
// Instantiate the GPU implementation for float.
|
||||
template struct functor::SpaceToDepthOpFunctor<GPUDevice, float>;
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
@ -23,6 +23,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/framework/register_types.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/framework/tensor_shape.h"
|
||||
#include "tensorflow/core/kernels/bounds_check.h"
|
||||
#include "tensorflow/core/kernels/transpose_functor.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/lib/strings/str_util.h"
|
||||
@ -55,8 +56,8 @@ class InvertPermutationOp : public OpKernel {
|
||||
auto Tout = output->vec<int32>();
|
||||
std::fill_n(Tout.data(), N, -1);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
const int32 d = Tin(i);
|
||||
OP_REQUIRES(context, 0 <= d && d < N,
|
||||
const int32 d = internal::SubtleMustCopy(Tin(i));
|
||||
OP_REQUIRES(context, FastBoundsCheck(d, N),
|
||||
errors::InvalidArgument(d, " is not between 0 and ", N));
|
||||
OP_REQUIRES(context, Tout(d) == -1,
|
||||
errors::InvalidArgument(d, " is duplicated in the input."));
|
||||
@ -107,18 +108,26 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
|
||||
errors::InvalidArgument(
|
||||
"transpose expects a vector of size ", input.dims(),
|
||||
". But input(1) is a vector of size ", Vperm.size()));
|
||||
gtl::ArraySlice<int32> permutation(
|
||||
reinterpret_cast<const int32*>(Vperm.data()), dims);
|
||||
// Using volatile instead of SubtleMustCopy here so that the
// asynchrony boundary is the permutation.
|
||||
const volatile int32* perm_begin =
|
||||
reinterpret_cast<const volatile int32*>(Vperm.data());
|
||||
const std::vector<int32> permutation(perm_begin, perm_begin + dims);
|
||||
TensorShape shape;
|
||||
|
||||
// Check whether permutation is a permutation of integers of [0 .. dims).
|
||||
gtl::InlinedVector<bool, 8> bits(dims);
|
||||
for (const int32 d : permutation) {
|
||||
bool is_identity = true;
|
||||
for (int i = 0; i < dims; ++i) {
|
||||
const int32 d = permutation[i];
|
||||
OP_REQUIRES(
|
||||
ctx, 0 <= d && d < dims,
|
||||
errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
|
||||
bits[d] = true;
|
||||
shape.AddDim(input.dim_size(d));
|
||||
if (d != i) {
|
||||
is_identity = false;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < dims; ++i) {
|
||||
OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
|
||||
@ -126,8 +135,8 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
|
||||
str_util::Join(permutation, ","), "}."));
|
||||
}
|
||||
|
||||
// 0-D and 1-D transposes do nothing
|
||||
if (dims <= 1) {
|
||||
// 0-D, 1-D, and identity transposes do nothing.
|
||||
if (dims <= 1 || is_identity) {
|
||||
ctx->set_output(0, input);
|
||||
return;
|
||||
}
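A short Python sketch of the check added here: verify that `permutation` is a permutation of `[0, dims)` and detect the identity case that lets the op forward its input unchanged (illustrative only):

```python
def validate_permutation(permutation, dims):
  seen = [False] * dims
  is_identity = True
  for i, d in enumerate(permutation):
    if not 0 <= d < dims:
      raise ValueError("%d is out of range [0 .. %d)" % (d, dims))
    seen[d] = True
    if d != i:
      is_identity = False
  if not all(seen):
    raise ValueError("not a permutation of [0 .. %d)" % dims)
  return is_identity
```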
|
||||
|
@ -139,7 +139,8 @@ class Session {
|
||||
|
||||
/// \brief Like `Run`, but allows users to pass in a `RunOptions` proto and
|
||||
/// to retrieve non-Tensor metadata output via a `RunOutputs` proto for this
|
||||
/// step.
|
||||
/// step. `run_outputs` may be nullptr, in which case any metadata output is
|
||||
/// discarded.
|
||||
/// NOTE: This API is still experimental and may change.
|
||||
virtual Status Run(const RunOptions& run_options,
|
||||
const std::vector<std::pair<string, Tensor> >& inputs,
|
||||
@ -148,8 +149,8 @@ class Session {
|
||||
std::vector<Tensor>* outputs, RunOutputs* run_outputs);
|
||||
|
||||
/// \brief Sets up a graph for partial execution. All future feeds and
|
||||
/// fetches are specified by 'input_names' and 'output_names'. Returns
|
||||
/// 'handle' that can be used to perform a sequence of partial feeds and
|
||||
/// fetches are specified by `input_names` and `output_names`. Returns
|
||||
/// `handle` that can be used to perform a sequence of partial feeds and
|
||||
/// fetches.
|
||||
/// NOTE: This API is still experimental and may change.
|
||||
virtual Status PRunSetup(const std::vector<string>& input_names,
|
||||
@ -157,7 +158,7 @@ class Session {
|
||||
const std::vector<string>& target_nodes,
|
||||
string* handle);
|
||||
|
||||
/// \brief Continues the pending execution specified by 'handle' with the
|
||||
/// \brief Continues the pending execution specified by `handle` with the
|
||||
/// provided input tensors and fills `outputs` for the endpoints specified
|
||||
/// in `output_names`.
|
||||
/// NOTE: This API is still experimental and may change.
|
||||
|
@ -268,15 +268,26 @@ extern void TF_ExtendGraph(TF_Session*, const void* proto, size_t proto_len,
|
||||
// failure, inputs[] become the property of the implementation (the
|
||||
// implementation will eventually call TF_DeleteTensor on each input).
|
||||
//
|
||||
// The caller retains the ownership of both `run_options` and `run_outputs`, and
|
||||
// should manually call TF_DeleteBuffer on them.
|
||||
// Any NULL and non-NULL value combinations for (`run_options`,
|
||||
// `run_outputs`) are valid.
|
||||
//
|
||||
// - `run_options` may be NULL, in which case it will be ignored; or
|
||||
// non-NULL, in which case it must point to a `TF_Buffer` containing the
|
||||
// serialized representation of a `RunOptions` protocol buffer.
|
||||
// - `run_outputs` may be NULL, in which case it will be ignored; or non-NULL,
|
||||
// in which case it must point to an empty, freshly allocated `TF_Buffer`
|
||||
// that may be updated to contain the serialized representation of a
|
||||
// `RunOutputs` protocol buffer.
|
||||
//
|
||||
// The caller retains the ownership of `run_options` and/or `run_outputs` (when
|
||||
// not NULL) and should manually call TF_DeleteBuffer on them.
|
||||
//
|
||||
// On success, the tensors corresponding to output_names[0,noutputs-1]
|
||||
// are placed in outputs[], and these outputs[] become the property
|
||||
// of the caller (the caller must eventually call TF_DeleteTensor on
|
||||
// them).
|
||||
//
|
||||
// On failure, outputs[] contains nulls.
|
||||
// On failure, outputs[] contains NULLs.
|
||||
extern void TF_Run(TF_Session*,
|
||||
// RunOptions
|
||||
const TF_Buffer* run_options,
|
||||
@ -341,7 +352,7 @@ extern void TF_PRun(TF_Session*, const char* handle,
|
||||
// On success, place OK in status and return the newly created library handle.
|
||||
// The caller owns the library handle.
|
||||
//
|
||||
// On failure, place an error status in status and return nullptr.
|
||||
// On failure, place an error status in status and return NULL.
|
||||
extern TF_Library* TF_LoadLibrary(const char* library_filename,
|
||||
TF_Status* status);
|
||||
|
||||
|
@ -39,8 +39,10 @@ void Shard(int num_workers, thread::ThreadPool* workers, int64 total,
|
||||
// much. Let us assume each cost unit is 1ns, kMinCostPerShard=10000
|
||||
// is 10us.
|
||||
static const int64 kMinCostPerShard = 10000;
|
||||
const int num_shards = std::max(
|
||||
1, std::min<int>(num_workers, total * cost_per_unit / kMinCostPerShard));
|
||||
const int num_shards =
|
||||
std::max<int>(1, std::min(static_cast<int64>(num_workers),
|
||||
total * cost_per_unit / kMinCostPerShard));
|
||||
|
||||
// Each shard contains up to "block_size" units. [0, total) is sharded
|
||||
// into:
|
||||
// [0, block_size), [block_size, 2*block_size), ...
|
||||
|
@ -59,6 +59,25 @@ TEST(Shard, Basic) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(Shard, OverflowTest) {
|
||||
thread::ThreadPool threads(Env::Default(), "test", 3);
|
||||
mutex mu;
|
||||
for (auto workers : {1, 2, 3}) {
|
||||
const int64 total_elements = 1LL << 32;
|
||||
const int64 cost_per_unit = 10000;
|
||||
int num_shards = 0;
|
||||
int64 num_elements = 0;
|
||||
Shard(workers, &threads, total_elements, cost_per_unit,
|
||||
[&mu, &num_shards, &num_elements](int64 start, int64 limit) {
|
||||
mutex_lock l(mu);
|
||||
++num_shards;
|
||||
num_elements += limit - start;
|
||||
});
|
||||
EXPECT_EQ(num_shards, workers);
|
||||
EXPECT_EQ(num_elements, total_elements);
|
||||
}
|
||||
}
|
||||
|
||||
void BM_Sharding(int iters, int arg) {
|
||||
thread::ThreadPool threads(Env::Default(), "test", 16);
|
||||
const int64 total = 1LL << 30;
|
||||
|
@ -157,3 +157,17 @@ void ReadFileToVector(AAssetManager* const asset_manager,
|
||||
VLOG(0) << "Read " << str_vector->size() << " values from " << filename;
|
||||
}
|
||||
|
||||
void WriteProtoToFile(const char* const filename,
|
||||
const google::protobuf::MessageLite& message) {
|
||||
std::fstream outfile;
|
||||
outfile.open(filename, std::fstream::binary | std::fstream::out);
|
||||
if (outfile.fail()) {
|
||||
LOG(WARNING) << "Failed to write proto to " << filename;
|
||||
return;
|
||||
} else {
|
||||
google::protobuf::io::OstreamOutputStream raw_out(&outfile);
|
||||
google::protobuf::io::CodedOutputStream coded_out(&raw_out);
|
||||
message.SerializeToCodedStream(&coded_out);
|
||||
}
|
||||
VLOG(0) << "Wrote proto to " << filename;
|
||||
}
|
||||
|
@ -42,4 +42,7 @@ void ReadFileToString(AAssetManager* const asset_manager,
|
||||
void ReadFileToVector(AAssetManager* const asset_manager,
|
||||
const char* const filename, std::vector<std::string>* str_vector);
|
||||
|
||||
void WriteProtoToFile(const char* const filename,
|
||||
const google::protobuf::MessageLite& message);
|
||||
|
||||
#endif // ORG_TENSORFLOW_JNI_JNI_UTILS_H_
|
||||
|
@ -21,13 +21,16 @@ limitations under the License.
|
||||
|
||||
#include <jni.h>
|
||||
#include <pthread.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <queue>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/framework/types.pb.h"
|
||||
#include "tensorflow/core/lib/strings/stringprintf.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
@ -51,6 +54,12 @@ static int g_image_mean; // The image mean.
|
||||
static int g_num_runs = 0;
|
||||
static int64 g_timing_total_us = 0;
|
||||
|
||||
#ifdef SAVE_STEP_STATS
|
||||
static const bool kSaveStepStats = true;
|
||||
#else
|
||||
static const bool kSaveStepStats = false;
|
||||
#endif
|
||||
|
||||
inline static int64 CurrentThreadTimeUs() {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
@ -199,11 +208,30 @@ static std::string ClassifyImage(const RGBA* const bitmap_src,
|
||||
std::vector<tensorflow::Tensor> output_tensors;
|
||||
std::vector<std::string> output_names({"output:0"});
|
||||
|
||||
const int64 start_time = CurrentThreadTimeUs();
|
||||
tensorflow::Status s =
|
||||
session->Run(input_tensors, output_names, {}, &output_tensors);
|
||||
const int64 end_time = CurrentThreadTimeUs();
|
||||
tensorflow::Status s;
|
||||
int64 start_time, end_time;
|
||||
|
||||
if (kSaveStepStats) {
|
||||
RunOptions run_options;
|
||||
run_options.set_trace_level(RunOptions::FULL_TRACE);
|
||||
RunOutputs run_outputs;
|
||||
start_time = CurrentThreadTimeUs();
|
||||
s = session->Run(run_options, input_tensors, output_names, {},
|
||||
&output_tensors, &run_outputs);
|
||||
end_time = CurrentThreadTimeUs();
|
||||
assert(run_outputs.has_step_stats());
|
||||
|
||||
const StepStats& stats = run_outputs.step_stats();
|
||||
|
||||
mkdir("/sdcard/tf/", 0755);
|
||||
const string filename =
|
||||
strings::Printf("/sdcard/tf/stepstats%05d.pb", g_num_runs);
|
||||
WriteProtoToFile(filename.c_str(), stats);
|
||||
} else {
|
||||
start_time = CurrentThreadTimeUs();
|
||||
s = session->Run(input_tensors, output_names, {}, &output_tensors);
|
||||
end_time = CurrentThreadTimeUs();
|
||||
}
|
||||
const int64 elapsed_time_inf = end_time - start_time;
|
||||
g_timing_total_us += elapsed_time_inf;
|
||||
VLOG(0) << "End computing. Ran in " << elapsed_time_inf / 1000 << "ms ("
|
||||
|
@ -40,6 +40,7 @@ py_library(
|
||||
name = "platform",
|
||||
srcs = glob(["platform/**/*.py"]),
|
||||
srcs_version = "PY2AND3",
|
||||
deps = ["//tensorflow/core:protos_all_py"],
|
||||
)
|
||||
|
||||
py_library(
|
||||
@ -1006,6 +1007,7 @@ py_test(
|
||||
name = "session_test",
|
||||
srcs = ["client/session_test.py"],
|
||||
srcs_version = "PY2AND3",
|
||||
tags = ["noasan"],
|
||||
deps = [
|
||||
":framework",
|
||||
":framework_test_lib",
|
||||
@ -1034,12 +1036,12 @@ cpu_only_kernel_test_list = glob([
|
||||
"kernel_tests/attention_ops_test.py",
|
||||
"kernel_tests/barrier_ops_test.py",
|
||||
"kernel_tests/bcast_ops_test.py",
|
||||
"kernel_tests/benchmark_test.py",
|
||||
"kernel_tests/candidate_sampler_ops_test.py",
|
||||
"kernel_tests/cholesky_op_test.py",
|
||||
"kernel_tests/clip_ops_test.py",
|
||||
"kernel_tests/decode_csv_op_test.py",
|
||||
"kernel_tests/decode_raw_op_test.py",
|
||||
"kernel_tests/depthtospace_op_test.py",
|
||||
"kernel_tests/determinant_op_test.py",
|
||||
"kernel_tests/diag_op_test.py",
|
||||
"kernel_tests/edit_distance_op_test.py",
|
||||
@ -1069,7 +1071,6 @@ cpu_only_kernel_test_list = glob([
|
||||
"kernel_tests/sparse_reorder_op_test.py",
|
||||
"kernel_tests/sparse_to_dense_op_test.py",
|
||||
"kernel_tests/sparsemask_op_test.py",
|
||||
"kernel_tests/spacetodepth_op_test.py",
|
||||
"kernel_tests/summary_ops_test.py",
|
||||
"kernel_tests/template_test.py",
|
||||
"kernel_tests/topk_op_test.py",
|
||||
|
@ -59,7 +59,7 @@ from tensorflow.core.framework.attr_value_pb2 import *
|
||||
from tensorflow.core.protobuf.config_pb2 import *
|
||||
from tensorflow.core.util.event_pb2 import *
|
||||
# Import things out of contrib
|
||||
from tensorflow import contrib
|
||||
import tensorflow.contrib as contrib
|
||||
|
||||
# Framework
|
||||
from tensorflow.python.framework.framework_lib import *
|
||||
@ -101,6 +101,7 @@ from tensorflow.python.framework import framework_lib
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import constant_op
|
||||
from tensorflow.python.ops import control_flow_ops
|
||||
from tensorflow.python.ops import histogram_ops
|
||||
from tensorflow.python.ops import io_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import script_ops
|
||||
@ -117,8 +118,8 @@ _whitelist = set([app, compat, contrib, errors, flags, gfile, image,
|
||||
# strings of other modules.
|
||||
__all__ = make_all(__name__,
|
||||
[framework_lib, array_ops, client_lib, constant_op,
|
||||
control_flow_ops, io_ops, math_ops, nn, script_ops,
|
||||
sparse_ops, state_ops, train])
|
||||
control_flow_ops, histogram_ops, io_ops, math_ops, nn,
|
||||
script_ops, sparse_ops, state_ops, train])
|
||||
|
||||
# Symbols whitelisted for export without documentation.
|
||||
# TODO(cwhipkey): review these and move to contrib, expose through
|
||||
|
@ -294,7 +294,7 @@ class BaseSession(SessionInterface):
|
||||
[`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue).
|
||||
|
||||
The optional `options` argument expects a [`RunOptions`] proto. The options
|
||||
allow controling the behavior of this particular step (e.g. turning tracing
|
||||
allow controlling the behavior of this particular step (e.g. turning tracing
|
||||
on).
|
||||
|
||||
The optional `run_outputs` argument expects a [`RunOutputs`] proto. When
|
||||
|
@ -25,7 +25,6 @@ import numpy as np
|
||||
import six
|
||||
from six.moves import xrange # pylint: disable=redefined-builtin
|
||||
|
||||
from tensorflow.core.framework import step_stats_pb2
|
||||
from tensorflow.core.lib.core import error_codes_pb2
|
||||
from tensorflow.core.protobuf import config_pb2
|
||||
from tensorflow.python.client import session
|
||||
@ -927,13 +926,32 @@ class SessionTest(test_util.TensorFlowTestCase):
|
||||
sess.run(constant_op.constant(1.0),
|
||||
options=run_options,
|
||||
run_outputs=run_outputs)
|
||||
|
||||
self.assertTrue(run_outputs.HasField('step_stats'))
|
||||
self.assertEquals(len(run_outputs.step_stats.dev_stats), 1)
|
||||
|
||||
step_stats = step_stats_pb2.StepStats()
|
||||
self.assertEquals(len(step_stats.dev_stats), 0)
|
||||
def testRunOptionsRunOutputs(self):
|
||||
run_options = config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.FULL_TRACE)
|
||||
run_outputs = config_pb2.RunOutputs()
|
||||
|
||||
step_stats.CopyFrom(run_outputs.step_stats)
|
||||
self.assertEquals(len(step_stats.dev_stats), 1)
|
||||
with ops.device('/cpu:0'):
|
||||
with session.Session() as sess:
|
||||
# all combinations are valid
|
||||
sess.run(constant_op.constant(1.0), options=None, run_outputs=None)
|
||||
sess.run(constant_op.constant(1.0), options=None,
|
||||
run_outputs=run_outputs)
|
||||
self.assertTrue(not run_outputs.HasField('step_stats'))
|
||||
|
||||
sess.run(constant_op.constant(1.0), options=run_options,
|
||||
run_outputs=None)
|
||||
self.assertTrue(not run_outputs.HasField('step_stats'))
|
||||
|
||||
sess.run(constant_op.constant(1.0), options=run_options,
|
||||
run_outputs=run_outputs)
|
||||
|
||||
self.assertTrue(run_outputs.HasField('step_stats'))
|
||||
self.assertEquals(len(run_outputs.step_stats.dev_stats), 1)
|
||||
|
||||
def testFeedShapeCompatibility(self):
|
||||
with session.Session() as sess:
|
||||
|
@ -81,6 +81,7 @@ def all_libraries(module_to_name, members, documented):
|
||||
exclude_symbols=["sparse_matmul", "arg_min", "arg_max",
|
||||
"lin_space", "sparse_segment_mean_grad"],
|
||||
prefix=PREFIX_TEXT),
|
||||
library("histogram_ops", "Histograms"),
|
||||
library("control_flow_ops", "Control Flow", prefix=PREFIX_TEXT),
|
||||
library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"],
|
||||
prefix=PREFIX_TEXT),
|
||||
|
@ -165,9 +165,8 @@ class TensorFlowTestCase(googletest.TestCase):
|
||||
text_format.Merge(expected_message_maybe_ascii, expected_message)
|
||||
self._AssertProtoEquals(expected_message, message)
|
||||
else:
|
||||
assert False, ("Can't compare protos of type " +
|
||||
type(expected_message_maybe_ascii) + " and " +
|
||||
type(message))
|
||||
assert False, ("Can't compare protos of type %s and %s" %
|
||||
(type(expected_message_maybe_ascii), type(message)))
|
||||
|
||||
def assertProtoEqualsVersion(
|
||||
self, expected, actual, producer=versions.GRAPH_DEF_VERSION,
|
||||
|
158
tensorflow/python/kernel_tests/benchmark_test.py
Normal file
@ -0,0 +1,158 @@
|
||||
# Copyright 2016 Google Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
"""Tests for tensorflow.python.framework.importer."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from google.protobuf import text_format
|
||||
from tensorflow.core.util import test_log_pb2
|
||||
from tensorflow.python.platform import benchmark
|
||||
|
||||
|
||||
# Used by SomeRandomBenchmark class below.
|
||||
_ran_somebenchmark_1 = [False]
|
||||
_ran_somebenchmark_2 = [False]
|
||||
_ran_somebenchmark_but_shouldnt = [False]
|
||||
|
||||
|
||||
class SomeRandomBenchmark(tf.test.Benchmark):
|
||||
"""This Benchmark should automatically be registered in the registry."""
|
||||
|
||||
def _dontRunThisBenchmark(self):
|
||||
_ran_somebenchmark_but_shouldnt[0] = True
|
||||
|
||||
def notBenchmarkMethod(self):
|
||||
_ran_somebenchmark_but_shouldnt[0] = True
|
||||
|
||||
def benchmark1(self):
|
||||
_ran_somebenchmark_1[0] = True
|
||||
|
||||
def benchmark2(self):
|
||||
_ran_somebenchmark_2[0] = True
|
||||
|
||||
|
||||
class TestReportingBenchmark(tf.test.Benchmark):
|
||||
"""This benchmark (maybe) reports some stuff."""
|
||||
|
||||
def benchmarkReport1(self):
|
||||
self.report_benchmark(iters=1)
|
||||
|
||||
def benchmarkReport2(self):
|
||||
self.report_benchmark(
|
||||
iters=2, name="custom_benchmark_name",
|
||||
extras={"number_key": 3, "other_key": "string"})
|
||||
|
||||
|
||||
class BenchmarkTest(tf.test.TestCase):
|
||||
|
||||
def testGlobalBenchmarkRegistry(self):
|
||||
registry = list(benchmark.GLOBAL_BENCHMARK_REGISTRY)
|
||||
self.assertEqual(len(registry), 2)
|
||||
self.assertTrue(SomeRandomBenchmark in registry)
|
||||
self.assertTrue(TestReportingBenchmark in registry)
|
||||
|
||||
def testRunSomeRandomBenchmark(self):
|
||||
# Validate that SomeBenchmark has not run yet
|
||||
self.assertFalse(_ran_somebenchmark_1[0])
|
||||
self.assertFalse(_ran_somebenchmark_2[0])
|
||||
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
|
||||
|
||||
# Run other benchmarks, but this won't run the one we care about
|
||||
benchmark._run_benchmarks("unrelated")
|
||||
|
||||
# Validate that SomeBenchmark has not run yet
|
||||
self.assertFalse(_ran_somebenchmark_1[0])
|
||||
self.assertFalse(_ran_somebenchmark_2[0])
|
||||
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
|
||||
|
||||
# Run all the benchmarks, avoid generating any reports
|
||||
if benchmark.TEST_REPORTER_TEST_ENV in os.environ:
|
||||
del os.environ[benchmark.TEST_REPORTER_TEST_ENV]
|
||||
benchmark._run_benchmarks("SomeRandom")
|
||||
|
||||
# Validate that SomeRandomBenchmark ran correctly
|
||||
self.assertTrue(_ran_somebenchmark_1[0])
|
||||
self.assertTrue(_ran_somebenchmark_2[0])
|
||||
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
|
||||
|
||||
def testReportingBenchmark(self):
|
||||
tempdir = tf.test.get_temp_dir()
|
||||
try:
|
||||
tf.gfile.MakeDirs(tempdir)
|
||||
except OSError as e:
|
||||
# It's OK if the directory already exists.
|
||||
if " exists:" not in str(e):
|
||||
raise e
|
||||
|
||||
prefix = os.path.join(
|
||||
tempdir, "reporting_bench_%016x_" % random.getrandbits(64))
|
||||
expected_output_file = "%s%s" % (
|
||||
prefix, "TestReportingBenchmark.benchmarkReport1")
|
||||
expected_output_file_2 = "%s%s" % (
|
||||
prefix, "TestReportingBenchmark.custom_benchmark_name")
|
||||
try:
|
||||
self.assertFalse(tf.gfile.Exists(expected_output_file))
|
||||
# Run benchmark but without env, shouldn't write anything
|
||||
if benchmark.TEST_REPORTER_TEST_ENV in os.environ:
|
||||
del os.environ[benchmark.TEST_REPORTER_TEST_ENV]
|
||||
reporting = TestReportingBenchmark()
|
||||
reporting.benchmarkReport1() # This should run without writing anything
|
||||
self.assertFalse(tf.gfile.Exists(expected_output_file))
|
||||
|
||||
# Run benchmark with env, should write
|
||||
os.environ[benchmark.TEST_REPORTER_TEST_ENV] = prefix
|
||||
|
||||
reporting = TestReportingBenchmark()
|
||||
reporting.benchmarkReport1() # This should write
|
||||
reporting.benchmarkReport2() # This should write
|
||||
|
||||
# Check the files were written
|
||||
self.assertTrue(tf.gfile.Exists(expected_output_file))
|
||||
self.assertTrue(tf.gfile.Exists(expected_output_file_2))
|
||||
|
||||
# Check the contents are correct
|
||||
expected_1 = test_log_pb2.BenchmarkEntry()
|
||||
expected_1.name = "TestReportingBenchmark.benchmarkReport1"
|
||||
expected_1.iters = 1
|
||||
|
||||
expected_2 = test_log_pb2.BenchmarkEntry()
|
||||
expected_2.name = "TestReportingBenchmark.custom_benchmark_name"
|
||||
expected_2.iters = 2
|
||||
expected_2.extras["number_key"].double_value = 3
|
||||
expected_2.extras["other_key"].string_value = "string"
|
||||
|
||||
read_benchmark_1 = tf.gfile.GFile(expected_output_file, "r").read()
|
||||
read_benchmark_1 = text_format.Merge(
|
||||
read_benchmark_1, test_log_pb2.BenchmarkEntry())
|
||||
self.assertProtoEquals(expected_1, read_benchmark_1)
|
||||
|
||||
read_benchmark_2 = tf.gfile.GFile(expected_output_file_2, "r").read()
|
||||
read_benchmark_2 = text_format.Merge(
|
||||
read_benchmark_2, test_log_pb2.BenchmarkEntry())
|
||||
self.assertProtoEquals(expected_2, read_benchmark_2)
|
||||
|
||||
finally:
|
||||
tf.gfile.DeleteRecursively(tempdir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tf.test.main()
|
@ -25,12 +25,17 @@ import tensorflow as tf
|
||||
|
||||
class DepthToSpaceTest(tf.test.TestCase):
|
||||
|
||||
def _testOne(self, inputs, block_size, outputs):
|
||||
for use_gpu in [False, True]:
|
||||
with self.test_session(use_gpu=use_gpu):
|
||||
x_tf = tf.depth_to_space(tf.to_float(inputs), block_size)
|
||||
self.assertAllEqual(x_tf.eval(), outputs)
|
||||
|
||||
def testBasic(self):
|
||||
x_np = [[[[1, 2, 3, 4]]]]
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(), [[[[1], [2]], [[3], [4]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1], [2]], [[3], [4]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input dimensions. To make sure elements are
|
||||
# correctly ordered spatially.
|
||||
@ -40,12 +45,28 @@ class DepthToSpaceTest(tf.test.TestCase):
|
||||
[[9, 10, 11, 12],
|
||||
[13, 14, 15, 16]]]]
|
||||
block_size = 2
|
||||
with self.test_session(use_gpu=False):
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(), [[[[1], [2], [5], [6]],
|
||||
[[3], [4], [7], [8]],
|
||||
[[9], [10], [13], [14]],
|
||||
[[11], [12], [15], [16]]]])
|
||||
x_out = [[[[1], [2], [5], [6]],
|
||||
[[3], [4], [7], [8]],
|
||||
[[9], [10], [13], [14]],
|
||||
[[11], [12], [15], [16]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
def testBlockSize2Batch10(self):
|
||||
block_size = 2
|
||||
def batch_input_elt(i):
|
||||
return [[[1 * i, 2 * i, 3 * i, 4 * i],
|
||||
[5 * i, 6 * i, 7 * i, 8 * i]],
|
||||
[[9 * i, 10 * i, 11 * i, 12 * i],
|
||||
[13 * i, 14 * i, 15 * i, 16 * i]]]
|
||||
def batch_output_elt(i):
|
||||
return [[[1 * i], [2 * i], [5 * i], [6 * i]],
|
||||
[[3 * i], [4 * i], [7 * i], [8 * i]],
|
||||
[[9 * i], [10 * i], [13 * i], [14 * i]],
|
||||
[[11 * i], [12 * i], [15 * i], [16 * i]]]
|
||||
batch_size = 10
|
||||
x_np = [batch_input_elt(i) for i in xrange(batch_size)]
|
||||
x_out = [batch_output_elt(i) for i in xrange(batch_size)]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for different width and height.
|
||||
def testNonSquare(self):
|
||||
@ -53,46 +74,42 @@ class DepthToSpaceTest(tf.test.TestCase):
|
||||
[[5, 50, 6, 60, 7, 70, 8, 80]],
|
||||
[[9, 90, 10, 100, 11, 110, 12, 120]]]]
|
||||
block_size = 2
|
||||
with self.test_session(use_gpu=False):
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(), [[[[1, 10], [2, 20]],
|
||||
[[3, 30], [4, 40]],
|
||||
[[5, 50], [6, 60]],
|
||||
[[7, 70], [8, 80]],
|
||||
[[9, 90], [10, 100]],
|
||||
[[11, 110], [12, 120]]]])
|
||||
x_out = [[[[1, 10], [2, 20]],
|
||||
[[3, 30], [4, 40]],
|
||||
[[5, 50], [6, 60]],
|
||||
[[7, 70], [8, 80]],
|
||||
[[9, 90], [10, 100]],
|
||||
[[11, 110], [12, 120]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input dimensions. To make sure elements are
|
||||
# correctly ordered spatially.
|
||||
def testBlockSize4FlatInput(self):
|
||||
x_np = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]
|
||||
block_size = 4
|
||||
with self.test_session(use_gpu=False):
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(), [[[[1], [2], [5], [6]],
|
||||
[[3], [4], [7], [8]],
|
||||
[[9], [10], [13], [14]],
|
||||
[[11], [12], [15], [16]]]])
|
||||
x_out = [[[[1], [2], [5], [6]],
|
||||
[[3], [4], [7], [8]],
|
||||
[[9], [10], [13], [14]],
|
||||
[[11], [12], [15], [16]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input depths.
|
||||
# To make sure elements are properly interleaved in depth.
|
||||
def testDepthInterleaved(self):
|
||||
x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]
|
||||
block_size = 2
|
||||
with self.test_session(use_gpu=False):
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(), [[[[1, 10], [2, 20]],
|
||||
[[3, 30], [4, 40]]]])
|
||||
x_out = [[[[1, 10], [2, 20]],
|
||||
[[3, 30], [4, 40]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input depths. Here an odd depth.
|
||||
# To make sure elements are properly interleaved in depth.
|
||||
def testDepthInterleavedDepth3(self):
|
||||
x_np = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
|
||||
block_size = 2
|
||||
with self.test_session(use_gpu=False):
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(), [[[[1, 2, 3], [4, 5, 6]],
|
||||
[[7, 8, 9], [10, 11, 12]]]])
|
||||
x_out = [[[[1, 2, 3], [4, 5, 6]],
|
||||
[[7, 8, 9], [10, 11, 12]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input depths.
|
||||
# To make sure elements are properly interleaved in depth.
|
||||
@ -102,13 +119,11 @@ class DepthToSpaceTest(tf.test.TestCase):
|
||||
[[9, 90, 10, 100, 11, 110, 12, 120],
|
||||
[13, 130, 14, 140, 15, 150, 16, 160]]]]
|
||||
block_size = 2
|
||||
with self.test_session(use_gpu=False):
|
||||
x_tf = tf.depth_to_space(x_np, block_size)
|
||||
self.assertAllEqual(x_tf.eval(),
|
||||
[[[[1, 10], [2, 20], [5, 50], [6, 60]],
|
||||
[[3, 30], [4, 40], [7, 70], [8, 80]],
|
||||
[[9, 90], [10, 100], [13, 130], [14, 140]],
|
||||
[[11, 110], [12, 120], [15, 150], [16, 160]]]])
|
||||
x_out = [[[[1, 10], [2, 20], [5, 50], [6, 60]],
|
||||
[[3, 30], [4, 40], [7, 70], [8, 80]],
|
||||
[[9, 90], [10, 100], [13, 130], [14, 140]],
|
||||
[[11, 110], [12, 120], [15, 150], [16, 160]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Error handling:
|
||||
|
||||
@ -205,5 +220,6 @@ class DepthToSpaceGradientTest(tf.test.TestCase):
|
||||
block_size = 3
|
||||
self._compare(1, 2, 3, 2, block_size)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tf.test.main()
|
||||
|
@ -184,7 +184,8 @@ class RNNCellTest(tf.test.TestCase):
|
||||
x = tf.zeros([1, 1], dtype=tf.int32)
|
||||
m = tf.zeros([1, 2])
|
||||
g, new_m = tf.nn.rnn_cell.EmbeddingWrapper(
|
||||
tf.nn.rnn_cell.GRUCell(2), 3)(x, m)
|
||||
tf.nn.rnn_cell.GRUCell(2),
|
||||
embedding_classes=3, embedding_size=2)(x, m)
|
||||
sess.run([tf.initialize_all_variables()])
|
||||
res = sess.run([g, new_m], {x.name: np.array([[1]]),
|
||||
m.name: np.array([[0.1, 0.1]])})
|
||||
|
@ -19,7 +19,6 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import time
|
||||
import timeit
|
||||
|
||||
@ -953,6 +952,7 @@ def graph_creation_static_vs_dynamic_rnn_benchmark(max_time):
|
||||
|
||||
print("%d \t %f \t %f \t %f" %
|
||||
(max_time, delta_static, delta_dynamic, delta_dynamic/delta_static))
|
||||
return delta_static, delta_dynamic
|
||||
|
||||
|
||||
def _timer(sess, ops):
|
||||
@ -1013,6 +1013,8 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu):
|
||||
(batch_size, max_time, num_units, use_gpu, delta_static,
|
||||
delta_dynamic, delta_dynamic/delta_static))
|
||||
|
||||
return delta_static, delta_dynamic
|
||||
|
||||
|
||||
def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length,
|
||||
swap_memory):
|
||||
@ -1061,6 +1063,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units):
|
||||
|
||||
print("%d \t %d \t %d \t %f \t %f \t %f" %
|
||||
(batch_size, max_time, num_units, no_swap, swap, swap/no_swap))
|
||||
return no_swap, swap
|
||||
|
||||
|
||||
def rnn_long_sequence_benchmark(batch_size, seqlen, num_units,
|
||||
@ -1097,34 +1100,55 @@ def rnn_long_sequence_benchmark(batch_size, seqlen, num_units,
|
||||
elapsed/seqlen))
|
||||
|
||||
|
||||
def main(_):
|
||||
print("Graph Creation: Static Unroll vs. Dynamic Unroll LSTM")
|
||||
print("max_t \t dt(static) \t dt(dynamic) \t dt(dynamic)/dt(static)")
|
||||
for max_time in (1, 25, 50):
|
||||
graph_creation_static_vs_dynamic_rnn_benchmark(max_time)
|
||||
class BenchmarkRNN(tf.test.Benchmark):
|
||||
|
||||
print("Calculation: Static Unroll with Dynamic Flow LSTM "
|
||||
"vs. Dynamic Unroll LSTM")
|
||||
print("batch \t max_t \t units \t gpu \t dt(static) \t dt(dynamic) "
|
||||
"\t dt(dynamic)/dt(static)")
|
||||
for batch_size in (256,):
|
||||
for max_time in (50,):
|
||||
for num_units in (512, 256, 128):
|
||||
for use_gpu in (False, True):
|
||||
static_vs_dynamic_rnn_benchmark(
|
||||
batch_size, max_time, num_units, use_gpu)
|
||||
def benchmarkGraphCreationStaticVsDynamicLSTM(self):
|
||||
print("Graph Creation: Static Unroll vs. Dynamic Unroll LSTM")
|
||||
print("max_t \t dt(static) \t dt(dynamic) \t dt(dynamic)/dt(static)")
|
||||
for max_time in (1, 25, 50):
|
||||
s_dt, d_dt = graph_creation_static_vs_dynamic_rnn_benchmark(max_time)
|
||||
self.report_benchmark(name="graph_creation_time_static_T%02d" % max_time,
|
||||
iters=5, wall_time=s_dt)
|
||||
self.report_benchmark(name="graph_creation_time_dynamic_T%02d" % max_time,
|
||||
iters=5, wall_time=d_dt)
|
||||
|
||||
print("Calculation: Dynamic LSTM No Memory Swap vs. Memory Swap")
|
||||
print("batch \t max_t \t units \t no_swap \t swap \t swap/no_swap")
|
||||
for batch_size in (256, 512):
|
||||
for max_time in (100,):
|
||||
for num_units in (512, 256, 128):
|
||||
dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units)
|
||||
def benchmarkStaticUnrollVsDynamicFlowLSTM(self):
|
||||
print("Calculation: Static Unroll with Dynamic Flow LSTM "
|
||||
"vs. Dynamic Unroll LSTM")
|
||||
print("batch \t max_t \t units \t gpu \t dt(static) \t dt(dynamic) "
|
||||
"\t dt(dynamic)/dt(static)")
|
||||
for batch_size in (256,):
|
||||
for max_time in (50,):
|
||||
for num_units in (512, 256, 128):
|
||||
for use_gpu in (False, True):
|
||||
s_dt, d_dt = static_vs_dynamic_rnn_benchmark(
|
||||
batch_size, max_time, num_units, use_gpu)
|
||||
self.report_benchmark(
|
||||
name="static_unroll_time_T%02d_B%03d_N%03d_gpu_%s"
|
||||
% (max_time, batch_size, num_units, use_gpu),
|
||||
iters=10, wall_time=s_dt)
|
||||
self.report_benchmark(
|
||||
name="dynamic_unroll_time_T%02d_B%03d_N%03d_gpu_%s"
|
||||
% (max_time, batch_size, num_units, use_gpu),
|
||||
iters=10, wall_time=d_dt)
|
||||
|
||||
def benchmarkDynamicLSTMNoMemorySwapVsMemorySwap(self):
|
||||
print("Calculation: Dynamic LSTM No Memory Swap vs. Memory Swap")
|
||||
print("batch \t max_t \t units \t no_swap \t swap \t swap/no_swap")
|
||||
for batch_size in (256, 512):
|
||||
for max_time in (100,):
|
||||
for num_units in (512, 256, 128):
|
||||
no_swap, swap = dynamic_rnn_swap_memory_benchmark(
|
||||
batch_size, max_time, num_units)
|
||||
self.report_benchmark(
|
||||
name="dynamic_lstm_no_memory_swap_T%02d_B%03d_N%03d"
|
||||
% (max_time, batch_size, num_units),
|
||||
iters=10, wall_time=no_swap)
|
||||
self.report_benchmark(
|
||||
name="dynamic_lstm_with_memory_swap_T%02d_B%03d_N%03d"
|
||||
% (max_time, batch_size, num_units),
|
||||
iters=10, wall_time=swap)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if "--benchmarks" in sys.argv:
|
||||
sys.argv.remove("--benchmarks")
|
||||
tf.app.run()
|
||||
else:
|
||||
tf.test.main()
|
||||
tf.test.main()
|
||||
|
@ -121,6 +121,13 @@ class SoftmaxTest(tf.test.TestCase):
|
||||
self._testOverflow(use_gpu=False)
|
||||
|
||||
|
||||
def testEmpty(self):
|
||||
with self.test_session():
|
||||
x = tf.constant([[]], shape=[0, 3])
|
||||
self.assertEqual(0, tf.size(x).eval())
|
||||
expected_y = np.array([]).reshape(0, 3)
|
||||
np.testing.assert_array_equal(expected_y, tf.nn.softmax(x).eval())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tf.test.main()
|
||||
|
@ -25,13 +25,18 @@ import tensorflow as tf
|
||||
|
||||
class SpaceToDepthTest(tf.test.TestCase):
|
||||
|
||||
def _testOne(self, inputs, block_size, outputs):
|
||||
for use_gpu in [False, True]:
|
||||
with self.test_session(use_gpu=use_gpu):
|
||||
x_tf = tf.space_to_depth(tf.to_float(inputs), block_size)
|
||||
self.assertAllEqual(x_tf.eval(), outputs)
|
||||
|
||||
def testBasic(self):
|
||||
x_np = [[[[1], [2]],
|
||||
[[3], [4]]]]
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(out_tf.eval(), [[[[1, 2, 3, 4]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1, 2, 3, 4]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input dimensions. To make sure elements are
|
||||
# correctly ordered spatially.
|
||||
@ -40,14 +45,12 @@ class SpaceToDepthTest(tf.test.TestCase):
|
||||
[[3], [4], [7], [8]],
|
||||
[[9], [10], [13], [14]],
|
||||
[[11], [12], [15], [16]]]]
|
||||
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(out_tf.eval(), [[[[1, 2, 3, 4],
|
||||
[5, 6, 7, 8]],
|
||||
[[9, 10, 11, 12],
|
||||
[13, 14, 15, 16]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1, 2, 3, 4],
|
||||
[5, 6, 7, 8]],
|
||||
[[9, 10, 11, 12],
|
||||
[13, 14, 15, 16]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input dimensions. To make sure elements are
|
||||
# correctly ordered in depth. Here, larger block size.
|
||||
@ -56,34 +59,27 @@ class SpaceToDepthTest(tf.test.TestCase):
|
||||
[[3], [4], [7], [8]],
|
||||
[[9], [10], [13], [14]],
|
||||
[[11], [12], [15], [16]]]]
|
||||
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 4
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(
|
||||
out_tf.eval(),
|
||||
[[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]])
|
||||
block_size = 4
|
||||
x_out = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input depths.
|
||||
# To make sure elements are properly interleaved in depth.
|
||||
def testDepthInterleaved(self):
|
||||
x_np = [[[[1, 10], [2, 20]],
|
||||
[[3, 30], [4, 40]]]]
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(out_tf.eval(), [[[[1, 10, 2, 20, 3, 30, 4, 40]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input depths. Here an odd depth.
|
||||
# To make sure elements are properly interleaved in depth.
|
||||
def testDepthInterleavedDepth3(self):
|
||||
x_np = [[[[1, 2, 3], [4, 5, 6]],
|
||||
[[7, 8, 9], [10, 11, 12]]]]
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(out_tf.eval(),
|
||||
[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for larger input dimensions AND for larger input depths.
|
||||
# To make sure elements are properly interleaved in depth and ordered
|
||||
@ -93,14 +89,29 @@ class SpaceToDepthTest(tf.test.TestCase):
|
||||
[[3, 30], [4, 40], [7, 70], [8, 80]],
|
||||
[[9, 90], [10, 100], [13, 130], [14, 140]],
|
||||
[[11, 110], [12, 120], [15, 150], [16, 160]]]]
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(out_tf.eval(),
|
||||
[[[[1, 10, 2, 20, 3, 30, 4, 40],
|
||||
[5, 50, 6, 60, 7, 70, 8, 80]],
|
||||
[[9, 90, 10, 100, 11, 110, 12, 120],
|
||||
[13, 130, 14, 140, 15, 150, 16, 160]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40],
|
||||
[5, 50, 6, 60, 7, 70, 8, 80]],
|
||||
[[9, 90, 10, 100, 11, 110, 12, 120],
|
||||
[13, 130, 14, 140, 15, 150, 16, 160]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
def testBlockSize2Batch10(self):
|
||||
block_size = 2
|
||||
def batch_input_elt(i):
|
||||
return [[[1 * i], [2 * i], [5 * i], [6 * i]],
|
||||
[[3 * i], [4 * i], [7 * i], [8 * i]],
|
||||
[[9 * i], [10 * i], [13 * i], [14 * i]],
|
||||
[[11 * i], [12 * i], [15 * i], [16 * i]]]
|
||||
def batch_output_elt(i):
|
||||
return [[[1 * i, 2 * i, 3 * i, 4 * i],
|
||||
[5 * i, 6 * i, 7 * i, 8 * i]],
|
||||
[[9 * i, 10 * i, 11 * i, 12 * i],
|
||||
[13 * i, 14 * i, 15 * i, 16 * i]]]
|
||||
batch_size = 10
|
||||
x_np = [batch_input_elt(i) for i in xrange(batch_size)]
|
||||
x_out = [batch_output_elt(i) for i in xrange(batch_size)]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Tests for different width and height.
|
||||
def testNonSquare(self):
|
||||
@ -110,13 +121,11 @@ class SpaceToDepthTest(tf.test.TestCase):
|
||||
[[7, 70], [8, 80]],
|
||||
[[9, 90], [10, 100]],
|
||||
[[11, 110], [12, 120]]]]
|
||||
with self.test_session(use_gpu=False):
|
||||
block_size = 2
|
||||
out_tf = tf.space_to_depth(x_np, block_size)
|
||||
self.assertAllEqual(out_tf.eval(),
|
||||
[[[[1, 10, 2, 20, 3, 30, 4, 40]],
|
||||
[[5, 50, 6, 60, 7, 70, 8, 80]],
|
||||
[[9, 90, 10, 100, 11, 110, 12, 120]]]])
|
||||
block_size = 2
|
||||
x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40]],
|
||||
[[5, 50, 6, 60, 7, 70, 8, 80]],
|
||||
[[9, 90, 10, 100, 11, 110, 12, 120]]]]
|
||||
self._testOne(x_np, block_size, x_out)
|
||||
|
||||
# Error handling:
|
||||
|
||||
|
@ -405,6 +405,7 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
|
||||
ValueError: If shapes do not conform.
|
||||
|
||||
Examples:
|
||||
|
||||
```python
|
||||
# 2-D example
|
||||
a = [[1, 2], [3, 4], [5, 6]]
|
||||
|
@ -218,7 +218,7 @@ class QueueBase(object):
|
||||
return gen_data_flow_ops._queue_enqueue(self._queue_ref, vals, name=scope)
|
||||
|
||||
def enqueue_many(self, vals, name=None):
|
||||
"""Enqueues zero or elements to this queue.
|
||||
"""Enqueues zero or more elements to this queue.
|
||||
|
||||
This operation slices each component tensor along the 0th dimension to
|
||||
make multiple queue elements. All of the tensors in `vals` must have the
|
||||
|
@ -12,7 +12,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Operations for histograms."""
|
||||
# pylint: disable=g-short-docstring-punctuation
|
||||
"""## Histograms
|
||||
|
||||
@@histogram_fixed_width
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
@ -24,30 +28,34 @@ from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import clip_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import state_ops
|
||||
from tensorflow.python.ops import variable_scope
|
||||
|
||||
|
||||
def histogram_fixed_width(hist,
|
||||
new_values,
|
||||
def histogram_fixed_width(values,
|
||||
value_range,
|
||||
use_locking=False,
|
||||
name='histogram_fixed_width'):
|
||||
"""Update histogram Variable with new values.
|
||||
nbins=100,
|
||||
use_locking=True,
|
||||
dtype=dtypes.int32,
|
||||
name=None):
|
||||
"""Return histogram of values.
|
||||
|
||||
This Op fills histogram with counts of values falling within fixed-width,
|
||||
half-open bins.
|
||||
Given the tensor `values`, this operation returns a rank 1 histogram counting
|
||||
the number of entries in `values` that fell into every bin. The bins are
|
||||
equal width and determined by the arguments `value_range` and `nbins`.
|
||||
|
||||
Args:
|
||||
hist: 1-D mutable `Tensor`, e.g. a `Variable`.
|
||||
new_values: Numeric `Tensor`.
|
||||
values: Numeric `Tensor`.
|
||||
value_range: Shape [2] `Tensor`. values <= value_range[0] will be
mapped to hist[0], values >= value_range[1] will be mapped to hist[-1].
Must be same dtype as `values`.
|
||||
nbins: Integer number of bins in this histogram.
|
||||
use_locking: Boolean.
|
||||
If `True`, use locking during the operation (optional).
|
||||
name: A name for this operation (optional).
|
||||
dtype: dtype for returned histogram.
|
||||
name: A name for this operation (defaults to 'histogram_fixed_width').
|
||||
|
||||
Returns:
|
||||
An op that updates `hist` with `new_values` when evaluated.
|
||||
A `Variable` holding histogram of values.
|
||||
|
||||
Examples:
|
||||
```python
|
||||
@ -57,24 +65,21 @@ def histogram_fixed_width(hist,
|
||||
new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
|
||||
|
||||
with tf.default_session() as sess:
|
||||
hist = variables.Variable(array_ops.zeros(nbins, dtype=tf.int32))
|
||||
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
|
||||
value_range)
|
||||
hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
|
||||
variables.initialize_all_variables().run()
|
||||
sess.run(hist_update) => [2, 1, 1, 0, 2]
|
||||
sess.run(hist) => [2, 1, 1, 0, 2]
|
||||
```
|
||||
"""
|
||||
with ops.op_scope([hist, new_values, value_range], name) as scope:
|
||||
new_values = ops.convert_to_tensor(new_values, name='new_values')
|
||||
new_values = array_ops.reshape(new_values, [-1])
|
||||
with variable_scope.variable_op_scope(
|
||||
[values, value_range], name, 'histogram_fixed_width') as scope:
|
||||
values = ops.convert_to_tensor(values, name='values')
|
||||
values = array_ops.reshape(values, [-1])
|
||||
value_range = ops.convert_to_tensor(value_range, name='value_range')
|
||||
dtype = hist.dtype
|
||||
|
||||
# Map tensor values that fall within value_range to [0, 1].
|
||||
scaled_values = math_ops.truediv(new_values - value_range[0],
|
||||
scaled_values = math_ops.truediv(values - value_range[0],
|
||||
value_range[1] - value_range[0],
|
||||
name='scaled_values')
|
||||
nbins = math_ops.cast(hist.get_shape()[0], scaled_values.dtype)
|
||||
|
||||
# map tensor values within the open interval value_range to {0,.., nbins-1},
|
||||
# values outside the open interval will be zero or less, or nbins or more.
|
||||
@ -87,9 +92,18 @@ def histogram_fixed_width(hist,
|
||||
# Dummy vector to scatter.
|
||||
# TODO(langmore) Replace non-ideal creation of large dummy vector once an
|
||||
# alternative to scatter is available.
|
||||
updates = array_ops.ones([indices.get_shape()[0]], dtype=dtype)
|
||||
return state_ops.scatter_add(hist,
|
||||
indices,
|
||||
updates,
|
||||
use_locking=use_locking,
|
||||
name=scope)
|
||||
updates = array_ops.ones_like(indices, dtype=dtype)
|
||||
|
||||
hist = variable_scope.get_variable('hist',
|
||||
initializer=array_ops.zeros_initializer(
|
||||
[nbins],
|
||||
dtype=dtype),
|
||||
trainable=False)
|
||||
hist_assign_zero = hist.assign(array_ops.zeros_like(hist))
|
||||
|
||||
with ops.control_dependencies([hist_assign_zero]):
|
||||
return state_ops.scatter_add(hist,
|
||||
indices,
|
||||
updates,
|
||||
use_locking=use_locking,
|
||||
name=scope.name)
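For reference, an approximate NumPy equivalent of the new `histogram_fixed_width` semantics (values below `value_range[0]` fall into bin 0, values at or above `value_range[1]` into the last bin); a sketch only, not part of this change:

```python
import numpy as np

def histogram_fixed_width_np(values, value_range, nbins=100):
  lo, hi = value_range
  scaled = (np.asarray(values, dtype=np.float64).reshape(-1) - lo) / (hi - lo)
  indices = np.clip(np.floor(nbins * scaled), 0, nbins - 1).astype(np.int64)
  return np.bincount(indices, minlength=nbins)
```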
|
||||
|
@ -17,149 +17,132 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import test_util
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import histogram_ops
|
||||
from tensorflow.python.ops import variables
|
||||
from tensorflow.python.platform import googletest
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class HistogramFixedWidthTest(test_util.TensorFlowTestCase):
|
||||
class HistogramFixedWidthTest(tf.test.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.rng = np.random.RandomState(0)
|
||||
|
||||
def test_empty_input_gives_all_zero_counts(self):
|
||||
# Bins will be:
|
||||
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
|
||||
value_range = [0.0, 5.0]
|
||||
values = []
|
||||
expected_bin_counts = [0, 0, 0, 0, 0]
|
||||
with self.test_session():
|
||||
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
|
||||
tf.initialize_all_variables().run()
|
||||
|
||||
# Hist should start "fresh" with every eval.
|
||||
self.assertAllClose(expected_bin_counts, hist.eval())
|
||||
self.assertAllClose(expected_bin_counts, hist.eval())
|
||||
|
||||
def test_one_update_on_constant_input(self):
|
||||
# Bins will be:
|
||||
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
|
||||
nbins = [5]
|
||||
value_range = [0.0, 5.0]
|
||||
new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
|
||||
values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
|
||||
expected_bin_counts = [2, 1, 1, 0, 2]
|
||||
with self.test_session() as sess:
|
||||
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
|
||||
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
|
||||
value_range)
|
||||
variables.initialize_all_variables().run()
|
||||
self.assertTrue(hist.dtype.is_compatible_with(hist_update.dtype))
|
||||
updated_hist_array = sess.run(hist_update)
|
||||
with self.test_session():
|
||||
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
|
||||
tf.initialize_all_variables().run()
|
||||
|
||||
# The new updated_hist_array is returned by the updating op.
|
||||
self.assertAllClose(expected_bin_counts, updated_hist_array)
|
||||
|
||||
# hist should contain updated values, but eval() should not change it.
|
||||
# Hist should start "fresh" with every eval.
|
||||
self.assertAllClose(expected_bin_counts, hist.eval())
|
||||
self.assertAllClose(expected_bin_counts, hist.eval())
|
||||
|
||||
def test_one_update_on_constant_2d_input(self):
|
||||
# Bins will be:
|
||||
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
|
||||
nbins = [5]
|
||||
value_range = [0.0, 5.0]
|
||||
new_values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
|
||||
values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
|
||||
expected_bin_counts = [2, 1, 1, 0, 2]
|
||||
with self.test_session() as sess:
|
||||
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
|
||||
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
|
||||
value_range)
|
||||
variables.initialize_all_variables().run()
|
||||
self.assertTrue(hist.dtype.is_compatible_with(hist_update.dtype))
|
||||
updated_hist_array = sess.run(hist_update)
|
||||
with self.test_session():
|
||||
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
|
||||
tf.initialize_all_variables().run()
|
||||
|
||||
# The new updated_hist_array is returned by the updating op.
|
||||
self.assertAllClose(expected_bin_counts, updated_hist_array)
|
||||
|
||||
# hist should contain updated values, but eval() should not change it.
|
||||
# Hist should start "fresh" with every eval.
|
||||
self.assertAllClose(expected_bin_counts, hist.eval())
|
||||
self.assertAllClose(expected_bin_counts, hist.eval())
|
||||
|
||||
def test_two_updates_on_constant_input(self):
|
||||
# Bins will be:
|
||||
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
|
||||
nbins = [5]
|
||||
value_range = [0.0, 5.0]
|
||||
new_values_1 = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
|
||||
new_values_2 = [1.5, 4.5, 4.5, 4.5, 0.0, 0.0]
|
||||
values_1 = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
|
||||
values_2 = [1.5, 4.5, 4.5, 4.5, 0.0, 0.0]
|
||||
expected_bin_counts_1 = [2, 1, 1, 0, 2]
|
||||
expected_bin_counts_2 = [4, 2, 1, 0, 5]
|
||||
with self.test_session() as sess:
|
||||
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
|
||||
new_values = array_ops.placeholder(dtypes.float32, shape=[6])
|
||||
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
|
||||
value_range)
|
||||
variables.initialize_all_variables().run()
|
||||
updated_hist_array = sess.run(hist_update,
|
||||
feed_dict={new_values: new_values_1})
|
||||
expected_bin_counts_2 = [2, 1, 0, 0, 3]
|
||||
with self.test_session():
|
||||
values = tf.placeholder(tf.float32, shape=[6])
|
||||
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
|
||||
tf.initialize_all_variables().run()
|
||||
|
||||
# The new updated_hist_array is returned by the updating op.
|
||||
# hist should contain the updated values.
|
||||
self.assertAllClose(expected_bin_counts_1, updated_hist_array)
|
||||
self.assertAllClose(expected_bin_counts_1, hist.eval())
|
||||
|
||||
updated_hist_array = sess.run(hist_update,
|
||||
feed_dict={new_values: new_values_2})
|
||||
self.assertAllClose(expected_bin_counts_2, updated_hist_array)
|
||||
self.assertAllClose(expected_bin_counts_2, hist.eval())
|
||||
# The values in hist should depend on the current feed and nothing else.
|
||||
self.assertAllClose(expected_bin_counts_1,
|
||||
hist.eval(feed_dict={values: values_1}))
|
||||
self.assertAllClose(expected_bin_counts_2,
|
||||
hist.eval(feed_dict={values: values_2}))
|
||||
self.assertAllClose(expected_bin_counts_1,
|
||||
hist.eval(feed_dict={values: values_1}))
|
||||
self.assertAllClose(expected_bin_counts_1,
|
||||
hist.eval(feed_dict={values: values_1}))
|
||||
|
||||
def test_two_updates_on_scalar_input(self):
|
||||
# Bins will be:
|
||||
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
|
||||
nbins = [5]
|
||||
value_range = [0.0, 5.0]
|
||||
new_values_1 = 1.5
|
||||
new_values_2 = 2.5
|
||||
values_1 = 1.5
|
||||
values_2 = 2.5
|
||||
expected_bin_counts_1 = [0, 1, 0, 0, 0]
|
||||
expected_bin_counts_2 = [0, 1, 1, 0, 0]
|
||||
with self.test_session() as sess:
|
||||
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
|
||||
new_values = array_ops.placeholder(dtypes.float32, shape=[])
|
||||
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
|
||||
value_range)
|
||||
variables.initialize_all_variables().run()
|
||||
expected_bin_counts_2 = [0, 0, 1, 0, 0]
|
||||
with self.test_session():
|
||||
values = tf.placeholder(tf.float32, shape=[])
|
||||
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
|
||||
tf.initialize_all_variables().run()
|
||||
|
||||
# The new updated_hist_array is returned by the updating op.
|
||||
# hist should contain the updated values.
|
||||
updated_hist_array = sess.run(hist_update,
|
||||
feed_dict={new_values: new_values_1})
|
||||
self.assertAllClose(expected_bin_counts_1, updated_hist_array)
|
||||
self.assertAllClose(expected_bin_counts_1, hist.eval())
|
||||
# The values in hist should depend on the current feed and nothing else.
|
||||
self.assertAllClose(expected_bin_counts_2,
|
||||
hist.eval(feed_dict={values: values_2}))
|
||||
self.assertAllClose(expected_bin_counts_1,
|
||||
hist.eval(feed_dict={values: values_1}))
|
||||
self.assertAllClose(expected_bin_counts_1,
|
||||
hist.eval(feed_dict={values: values_1}))
|
||||
self.assertAllClose(expected_bin_counts_2,
|
||||
hist.eval(feed_dict={values: values_2}))
|
||||
|
||||
updated_hist_array = sess.run(hist_update,
|
||||
feed_dict={new_values: new_values_2})
|
||||
self.assertAllClose(expected_bin_counts_2, updated_hist_array)
|
||||
self.assertAllClose(expected_bin_counts_2, hist.eval())
|
||||
|
||||
def test_multiple_random_3d_updates_results_in_right_dist(self):
|
||||
# Update with uniform 3-D rvs. Resultant
|
||||
def test_multiple_random_accumulating_updates_results_in_right_dist(self):
|
||||
# Accumulate the updates in a new variable. Resultant
|
||||
# histogram should be uniform. Use only 3 bins because with many bins it
|
||||
# would be unlikely that all would be close to 1/n. If someone ever wants
|
||||
# to test that, it would be better to check that the cdf was linear.
|
||||
nbins = [3]
|
||||
value_range = [1.0, 4.14159]
|
||||
with self.test_session() as sess:
|
||||
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
|
||||
new_values = array_ops.placeholder(dtypes.float32, shape=[4, 4, 4])
|
||||
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
|
||||
value_range)
|
||||
variables.initialize_all_variables().run()
|
||||
values = tf.placeholder(tf.float32, shape=[4, 4, 4])
|
||||
hist = tf.histogram_fixed_width(values,
|
||||
value_range,
|
||||
nbins=3,
|
||||
dtype=tf.int64)
|
||||
|
||||
hist_accum = tf.Variable(tf.zeros_initializer([3], dtype=tf.int64))
|
||||
hist_accum = hist_accum.assign_add(hist)
|
||||
|
||||
tf.initialize_all_variables().run()
|
||||
|
||||
for _ in range(100):
|
||||
# Map the rv: U[0, 1] --> U[value_range[0], value_range[1]].
|
||||
new_values_arr = (
|
||||
values_arr = (
|
||||
value_range[0] +
|
||||
(value_range[1] - value_range[0]) * self.rng.rand(4, 4, 4))
|
||||
|
||||
# The new updated_hist_array is returned by the updating op.
|
||||
# hist should contain the updated values.
|
||||
updated_hist_array = sess.run(hist_update,
|
||||
feed_dict={new_values: new_values_arr})
|
||||
hist_accum_arr = sess.run(hist_accum, feed_dict={values: values_arr})
|
||||
|
||||
pmf = updated_hist_array / float(updated_hist_array.sum())
|
||||
pmf = hist_accum_arr / float(hist_accum_arr.sum())
|
||||
np.testing.assert_allclose(1 / 3, pmf, atol=0.02)
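Outside the diff, the accumulating pattern exercised by this test can be written as a short sketch. It assumes the post-change `tf.histogram_fixed_width(values, value_range, nbins)` form shown above, where the op returns per-call counts and any accumulation is done by a caller-owned variable; the batch values are made up for illustration.

```python
import tensorflow as tf

values = tf.placeholder(tf.float32, shape=[None])
# Counts for the current feed only; nothing is stored inside the op.
hist = tf.histogram_fixed_width(values, [0.0, 5.0], nbins=5)

# Caller-owned accumulator, updated once per run call.
hist_accum = tf.Variable(tf.zeros([5], dtype=tf.int32))
update_accum = hist_accum.assign_add(hist)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  for batch in ([0.0, 1.5, 2.0], [4.5, 4.5, 0.5]):
    sess.run(update_accum, feed_dict={values: batch})
  print(sess.run(hist_accum))  # bin counts summed over both batches
```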
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
googletest.main()
|
||||
tf.test.main()
|
||||
|
@ -92,6 +92,7 @@ The "producer" functions add a queue to the graph and a corresponding
|
||||
|
||||
@@match_filenames_once
|
||||
@@limit_epochs
|
||||
@@input_producer
|
||||
@@range_input_producer
|
||||
@@slice_input_producer
|
||||
@@string_input_producer
|
||||
|
@ -556,15 +556,13 @@ class EmbeddingWrapper(RNNCell):
|
||||
feed into your RNN.
|
||||
"""
|
||||
|
||||
def __init__(self, cell, embedding_classes=0, embedding=None,
|
||||
initializer=None):
|
||||
def __init__(self, cell, embedding_classes, embedding_size, initializer=None):
|
||||
"""Create a cell with an added input embedding.
|
||||
|
||||
Args:
|
||||
cell: an RNNCell, an embedding will be put before its inputs.
|
||||
embedding_classes: integer, how many symbols will be embedded.
|
||||
embedding: Variable, the embedding to use; if None, a new embedding
|
||||
will be created; if set, then embedding_classes is not required.
|
||||
embedding_size: integer, the size of the vectors we embed into.
|
||||
initializer: an initializer to use when creating the embedding;
|
||||
if None, the initializer from variable scope or a default one is used.
|
||||
|
||||
@ -574,21 +572,12 @@ class EmbeddingWrapper(RNNCell):
|
||||
"""
|
||||
if not isinstance(cell, RNNCell):
|
||||
raise TypeError("The parameter cell is not RNNCell.")
|
||||
if embedding_classes < 1 and embedding is None:
|
||||
raise ValueError("Pass embedding or embedding_classes must be > 0: %d."
|
||||
% embedding_classes)
|
||||
if embedding_classes > 0 and embedding is not None:
|
||||
if embedding.size[0] != embedding_classes:
|
||||
raise ValueError("You declared embedding_classes=%d but passed an "
|
||||
"embedding for %d classes." % (embedding.size[0],
|
||||
embedding_classes))
|
||||
if embedding.size[1] != cell.input_size:
|
||||
raise ValueError("You passed embedding with output size %d and a cell"
|
||||
" that accepts size %d." % (embedding.size[1],
|
||||
cell.input_size))
|
||||
if embedding_classes <= 0 or embedding_size <= 0:
|
||||
raise ValueError("Both embedding_classes and embedding_size must be > 0: "
|
||||
"%d, %d." % (embedding_classes, embedding_size))
|
||||
self._cell = cell
|
||||
self._embedding_classes = embedding_classes
|
||||
self._embedding = embedding
|
||||
self._embedding_size = embedding_size
|
||||
self._initializer = initializer
|
||||
|
||||
@property
|
||||
@ -607,20 +596,17 @@ class EmbeddingWrapper(RNNCell):
|
||||
"""Run the cell on embedded inputs."""
|
||||
with vs.variable_scope(scope or type(self).__name__): # "EmbeddingWrapper"
|
||||
with ops.device("/cpu:0"):
|
||||
if self._embedding:
|
||||
embedding = self._embedding
|
||||
if self._initializer:
|
||||
initializer = self._initializer
|
||||
elif vs.get_variable_scope().initializer:
|
||||
initializer = vs.get_variable_scope().initializer
|
||||
else:
|
||||
if self._initializer:
|
||||
initializer = self._initializer
|
||||
elif vs.get_variable_scope().initializer:
|
||||
initializer = vs.get_variable_scope().initializer
|
||||
else:
|
||||
# Default initializer for embeddings should have variance=1.
|
||||
sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1.
|
||||
initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
|
||||
embedding = vs.get_variable("embedding", [self._embedding_classes,
|
||||
self._cell.input_size],
|
||||
initializer=initializer)
|
||||
# Default initializer for embeddings should have variance=1.
|
||||
sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1.
|
||||
initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
|
||||
embedding = vs.get_variable("embedding", [self._embedding_classes,
|
||||
self._embedding_size],
|
||||
initializer=initializer)
|
||||
embedded = embedding_ops.embedding_lookup(
|
||||
embedding, array_ops.reshape(inputs, [-1]))
|
||||
return self._cell(embedded, state)
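As an illustration (not part of the change), the wrapper is now constructed with explicit `embedding_classes` and `embedding_size`; the vocabulary size below is arbitrary, `GRUCell` is just one possible wrapped cell, and the import path is an assumption about where `rnn_cell` lived at the time.

```python
from tensorflow.models.rnn import rnn_cell  # import path assumed for this era

cell = rnn_cell.GRUCell(128)
encoder_cell = rnn_cell.EmbeddingWrapper(
    cell, embedding_classes=10000, embedding_size=cell.input_size)
# encoder_cell now embeds integer symbol ids before feeding the wrapped cell,
# mirroring how the seq2seq code below constructs its encoder cells.
```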
|
||||
|
@ -311,7 +311,9 @@ def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
|
||||
"""
|
||||
with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"):
|
||||
# Encoder.
|
||||
encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
|
||||
encoder_cell = rnn_cell.EmbeddingWrapper(
|
||||
cell, embedding_classes=num_encoder_symbols,
|
||||
embedding_size=cell.input_size)
|
||||
_, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
|
||||
|
||||
# Decoder.
|
||||
@ -686,7 +688,9 @@ def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
|
||||
"""
|
||||
with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
|
||||
# Encoder.
|
||||
encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
|
||||
encoder_cell = rnn_cell.EmbeddingWrapper(
|
||||
cell, embedding_classes=num_encoder_symbols,
|
||||
embedding_size=cell.input_size)
|
||||
encoder_outputs, encoder_state = rnn.rnn(
|
||||
encoder_cell, encoder_inputs, dtype=dtype)
|
||||
|
||||
@ -772,7 +776,9 @@ def one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell,
|
||||
|
||||
with variable_scope.variable_scope(scope or "one2many_rnn_seq2seq"):
|
||||
# Encoder.
|
||||
encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
|
||||
encoder_cell = rnn_cell.EmbeddingWrapper(
|
||||
cell, embedding_classes=num_encoder_symbols,
|
||||
embedding_size=cell.input_size)
|
||||
_, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
|
||||
|
||||
# Decoder.
|
||||
|
@ -774,7 +774,7 @@ def _SerializeManySparseShape(op): # pylint: disable=invalid-name
|
||||
return [tensor_shape.matrix(None, 3)]
|
||||
|
||||
|
||||
def deserialize_many_sparse(serialized_sparse, dtype, name=None):
|
||||
def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
|
||||
"""Deserialize and concatenate `SparseTensors` from a serialized minibatch.
|
||||
|
||||
The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
|
||||
@ -823,6 +823,7 @@ def deserialize_many_sparse(serialized_sparse, dtype, name=None):
|
||||
serialized_sparse: 2-D `Tensor` of type `string` of shape `[N, 3]`.
The serialized and packed `SparseTensor` objects.
dtype: The `dtype` of the serialized `SparseTensor` objects.
rank: (optional) Python int, the rank of the `SparseTensor` objects.
name: A name prefix for the returned tensors (optional)
|
||||
|
||||
Returns:
|
||||
@ -835,6 +836,10 @@ def deserialize_many_sparse(serialized_sparse, dtype, name=None):
|
||||
gen_sparse_ops._deserialize_many_sparse(
|
||||
serialized_sparse, dtype, name=name))
|
||||
|
||||
# Feed rank data back in, if available
|
||||
output_indices.set_shape([None, rank])
|
||||
output_shape.set_shape([rank])
|
||||
|
||||
return ops.SparseTensor(output_indices, output_values, output_shape)
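A small sketch, not from the diff, of what the new `rank` argument buys: the two `set_shape` calls above can then fill in static dimensions for the deserialized output. It assumes the serialize/deserialize ops are exposed under `tf.`; otherwise call them through `sparse_ops` as the input pipeline code later in this commit does.

```python
import tensorflow as tf

st = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1.0, 2.0], shape=[4, 5])
serialized = tf.serialize_many_sparse(st)  # [N, 3] matrix of serialized rows
# With rank=2, the output's indices get shape [None, 2] and its dense shape [2].
roundtrip = tf.deserialize_many_sparse(serialized, dtype=tf.float32, rank=2)
```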
|
||||
|
||||
|
||||
|
@ -42,6 +42,7 @@ from tensorflow.python.ops.control_flow_ops import foldr
|
||||
from tensorflow.python.ops.control_flow_ops import map_fn
|
||||
from tensorflow.python.ops.data_flow_ops import *
|
||||
from tensorflow.python.ops.gradients import *
|
||||
from tensorflow.python.ops.histogram_ops import *
|
||||
from tensorflow.python.ops.init_ops import *
|
||||
from tensorflow.python.ops.io_ops import *
|
||||
from tensorflow.python.ops.linalg_ops import *
|
||||
|
213
tensorflow/python/platform/benchmark.py
Normal file
@ -0,0 +1,213 @@
|
||||
# Copyright 2016 Google Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
"""Utilities to run benchmarks."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import inspect
|
||||
import numbers
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
import six # pylint: disable=unused-import
|
||||
|
||||
from google.protobuf import text_format
|
||||
from tensorflow.core.util import test_log_pb2
|
||||
from tensorflow.python.platform import app
|
||||
from tensorflow.python.platform import gfile
|
||||
|
||||
# When a subclass of the Benchmark class is created, it is added to
# the registry automatically
GLOBAL_BENCHMARK_REGISTRY = set()

# Environment variable that determines whether benchmarks are written.
# See also tensorflow/core/util/reporter.h TestReporter::kTestReporterEnv.
TEST_REPORTER_TEST_ENV = "TEST_REPORT_FILE_PREFIX"
|
||||
|
||||
|
||||
def _global_report_benchmark(
|
||||
name, iters=None, cpu_time=None, wall_time=None,
|
||||
throughput=None, extras=None):
|
||||
"""Method for recording a benchmark directly.
|
||||
|
||||
Args:
|
||||
name: The BenchmarkEntry name.
|
||||
iters: (optional) How many iterations were run
|
||||
cpu_time: (optional) Total cpu time in seconds
|
||||
wall_time: (optional) Total wall time in seconds
|
||||
throughput: (optional) Throughput (in MB/s)
|
||||
extras: (optional) Dict mapping string keys to additional benchmark info.
|
||||
|
||||
Raises:
|
||||
TypeError: if extras is not a dict.
|
||||
IOError: if the benchmark output file already exists.
|
||||
"""
|
||||
if extras is not None:
|
||||
if not isinstance(extras, dict):
|
||||
raise TypeError("extras must be a dict")
|
||||
|
||||
test_env = os.environ.get(TEST_REPORTER_TEST_ENV, None)
|
||||
if test_env is None:
|
||||
# Reporting was not requested
|
||||
return
|
||||
|
||||
entry = test_log_pb2.BenchmarkEntry()
|
||||
entry.name = name
|
||||
if iters is not None:
|
||||
entry.iters = iters
|
||||
if cpu_time is not None:
|
||||
entry.cpu_time = cpu_time
|
||||
if wall_time is not None:
|
||||
entry.wall_time = wall_time
|
||||
if throughput is not None:
|
||||
entry.throughput = throughput
|
||||
if extras is not None:
|
||||
for (k, v) in extras.items():
|
||||
if isinstance(v, numbers.Number):
|
||||
entry.extras[k].double_value = v
|
||||
else:
|
||||
entry.extras[k].string_value = str(v)
|
||||
|
||||
serialized_entry = text_format.MessageToString(entry)
|
||||
|
||||
mangled_name = name.replace("/", "__")
|
||||
output_path = "%s%s" % (test_env, mangled_name)
|
||||
if gfile.Exists(output_path):
|
||||
raise IOError("File already exists: %s" % output_path)
|
||||
with gfile.GFile(output_path, "w") as out:
|
||||
out.write(serialized_entry)
|
||||
|
||||
|
||||
class _BenchmarkRegistrar(type):
|
||||
"""The Benchmark class registrar. Used by abstract Benchmark class."""
|
||||
|
||||
def __new__(mcs, clsname, base, attrs):
|
||||
newclass = super(_BenchmarkRegistrar, mcs).__new__(
|
||||
mcs, clsname, base, attrs)
|
||||
if len(newclass.mro()) > 2:
|
||||
# Only the base Benchmark abstract class has mro length 2.
|
||||
# The rest subclass from it and are therefore registered.
|
||||
GLOBAL_BENCHMARK_REGISTRY.add(newclass)
|
||||
return newclass
|
||||
|
||||
|
||||
class Benchmark(object):
|
||||
"""Abstract class that provides helper functions for running benchmarks.
|
||||
|
||||
Any class subclassing this one is immediately registered in the global
|
||||
benchmark registry.
|
||||
|
||||
Only methods whose names start with the word "benchmark" will be run during
|
||||
benchmarking.
|
||||
"""
|
||||
__metaclass__ = _BenchmarkRegistrar
|
||||
|
||||
def _get_name(self, overwrite_name):
|
||||
"""Returns full name of class and method calling report_benchmark."""
|
||||
|
||||
# Expect that the caller called report_benchmark, which called _get_name.
caller = inspect.stack()[2]
calling_class = caller[0].f_locals.get("self", None)
# Use the method name, or overwrite_name if it is provided.
name = overwrite_name if overwrite_name is not None else caller[3]
|
||||
if calling_class is not None:
|
||||
# Prefix the name with the class name.
|
||||
class_name = type(calling_class).__name__
|
||||
name = "%s.%s" % (class_name, name)
|
||||
return name
|
||||
|
||||
def report_benchmark(
|
||||
self,
|
||||
iters=None,
|
||||
cpu_time=None,
|
||||
wall_time=None,
|
||||
throughput=None,
|
||||
extras=None,
|
||||
name=None):
|
||||
"""Report a benchmark.
|
||||
|
||||
Args:
|
||||
iters: (optional) How many iterations were run
|
||||
cpu_time: (optional) Total cpu time in seconds
|
||||
wall_time: (optional) Total wall time in seconds
|
||||
throughput: (optional) Throughput (in MB/s)
|
||||
extras: (optional) Dict mapping string keys to additional benchmark info.
|
||||
name: (optional) Override the BenchmarkEntry name with `name`.
|
||||
Otherwise it is inferred from the calling class and top-level
|
||||
method name.
|
||||
"""
|
||||
name = self._get_name(overwrite_name=name)
|
||||
_global_report_benchmark(
|
||||
name=name, iters=iters, cpu_time=cpu_time, wall_time=wall_time,
|
||||
throughput=throughput, extras=extras)
|
||||
|
||||
|
||||
def _run_specific_benchmark(benchmark_class):
|
||||
benchmark = benchmark_class()
|
||||
attrs = dir(benchmark)
|
||||
# Only run methods of this class whose names start with "benchmark"
|
||||
for attr in attrs:
|
||||
if not attr.startswith("benchmark"):
|
||||
continue
|
||||
benchmark_fn = getattr(benchmark, attr)
|
||||
if not callable(benchmark_fn):
|
||||
continue
|
||||
# Call this benchmark method
|
||||
benchmark_fn()
|
||||
|
||||
|
||||
def _run_benchmarks(regex):
|
||||
"""Run benchmarks that match regex `regex`.
|
||||
|
||||
This function goes through the global benchmark registry, and matches
benchmark **class names** of the form "module.name.BenchmarkClass" to
the given regex. If a class matches, all of its benchmark methods
are run.
|
||||
|
||||
Args:
|
||||
regex: The string regular expression to match Benchmark classes against.
|
||||
"""
|
||||
registry = list(GLOBAL_BENCHMARK_REGISTRY)
|
||||
|
||||
# Match benchmarks in registry against regex
|
||||
for benchmark in registry:
|
||||
benchmark_name = "%s.%s" % (benchmark.__module__, benchmark.__name__)
|
||||
if re.search(regex, benchmark_name):
|
||||
# Found a match
|
||||
|
||||
_run_specific_benchmark(benchmark)
|
||||
|
||||
|
||||
def benchmarks_main(true_main=None):
|
||||
"""Run benchmarks as declared in args.
|
||||
|
||||
Args:
|
||||
true_main: True main function to run if benchmarks are not requested.
|
||||
"""
|
||||
argv = sys.argv
|
||||
found_arg = [arg for arg in argv
|
||||
if arg.startswith("--benchmarks=")
|
||||
or arg.startswith("-benchmarks=")]
|
||||
if found_arg:
|
||||
# Remove --benchmarks arg from sys.argv
|
||||
argv.remove(found_arg[0])
|
||||
|
||||
regex = found_arg[0].split("=")[1]
|
||||
app.run(lambda _: _run_benchmarks(regex))
|
||||
else:
|
||||
true_main()
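Not part of the new file, but a minimal sketch of the intended usage: subclassing registers the class via the metaclass, only `benchmark*` methods run, and results are reported when `--benchmarks=<regex>` is passed (plus `TEST_REPORT_FILE_PREFIX` if a report file is wanted). It assumes the `tf.test.Benchmark` and `tf.test.main` aliases wired up elsewhere in this commit; the matmul workload is arbitrary.

```python
import time

import tensorflow as tf


class MatmulBenchmark(tf.test.Benchmark):  # auto-registered by the metaclass

  def benchmarkMatmul(self):  # only methods named benchmark* are run
    with tf.Session() as sess:
      a = tf.random_normal([256, 256])
      product = tf.matmul(a, a)
      start = time.time()
      for _ in range(10):
        sess.run(product)
      self.report_benchmark(iters=10, wall_time=time.time() - start)


if __name__ == "__main__":
  tf.test.main()  # dispatches through benchmarks_main via the platform shim
```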
|
@ -23,8 +23,8 @@ import sys
|
||||
from tensorflow.python.platform import flags
|
||||
|
||||
|
||||
def run():
|
||||
def run(main=None):
|
||||
f = flags.FLAGS
|
||||
f._parse_flags()
|
||||
main = sys.modules['__main__'].main
|
||||
main = main or sys.modules['__main__'].main
|
||||
sys.exit(main(sys.argv))
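For illustration (not in the diff): with the new optional `main` argument, a script can hand its entry point to `run()` directly instead of relying on a module-level `main`. The function name here is arbitrary.

```python
import tensorflow as tf


def my_main(argv):
  print("flags parsed; argv:", argv)
  return 0


if __name__ == "__main__":
  tf.app.run(main=my_main)  # passing main=None falls back to the old lookup
```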
|
||||
|
@ -21,7 +21,20 @@ from __future__ import print_function
|
||||
# pylint: disable=g-import-not-at-top
|
||||
# pylint: disable=wildcard-import
|
||||
from . import control_imports
|
||||
from tensorflow.python.platform import benchmark
|
||||
|
||||
# Import the Benchmark class
|
||||
Benchmark = benchmark.Benchmark # pylint: disable=invalid-name
|
||||
|
||||
if control_imports.USE_OSS and control_imports.OSS_GOOGLETEST:
|
||||
from tensorflow.python.platform.default._googletest import *
|
||||
from tensorflow.python.platform.default._googletest import main as g_main
|
||||
else:
|
||||
from tensorflow.python.platform.google._googletest import *
|
||||
from tensorflow.python.platform.google._googletest import main as g_main
|
||||
|
||||
|
||||
# Redefine main to allow running benchmarks
|
||||
def main():
|
||||
# Benchmarks determine whether to run tests or not, by calling g_main
|
||||
benchmark.benchmarks_main(true_main=g_main)
|
||||
|
@ -72,6 +72,10 @@ from tensorflow.python.kernel_tests.gradient_checker import compute_gradient
|
||||
# pylint: enable=unused-import
|
||||
|
||||
|
||||
# Import Benchmark class
|
||||
Benchmark = googletest.Benchmark # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def main():
|
||||
"""Runs all unit tests."""
|
||||
return googletest.main()
|
||||
|
@ -131,6 +131,8 @@ class Coordinator(object):
|
||||
# Event set when threads must stop.
|
||||
self._stop_event = threading.Event()
|
||||
# Python exc_info to report.
|
||||
# If not None, it should hold the returned value of sys.exc_info(), which is
|
||||
# a tuple containing exception (type, value, traceback).
|
||||
self._exc_info_to_raise = None
|
||||
|
||||
def request_stop(self, ex=None):
|
||||
@ -138,6 +140,10 @@ class Coordinator(object):
|
||||
|
||||
After this is called, calls to `should_stop()` will return `True`.

Note: If an exception is being passed in, it must be in the context of
handling the exception (i.e. `try: ... except Exception as ex: ...`) and not
a newly created one.
|
||||
|
||||
Args:
|
||||
ex: Optional `Exception`, or Python `exc_info` tuple as returned by
|
||||
`sys.exc_info()`. If this is the first call to `request_stop()` the
|
||||
@ -154,6 +160,22 @@ class Coordinator(object):
|
||||
logging.info("Error reported to Coordinator: %s",
|
||||
compat.as_str_any(ex))
|
||||
self._exc_info_to_raise = sys.exc_info()
|
||||
# self._exc_info_to_raise should contain a tuple containing exception
|
||||
# (type, value, traceback)
|
||||
if (len(self._exc_info_to_raise) != 3 or
|
||||
not self._exc_info_to_raise[0] or
|
||||
not self._exc_info_to_raise[1]):
|
||||
# Raise, catch and record the exception here so that error happens
|
||||
# where expected.
|
||||
try:
|
||||
raise ValueError(
|
||||
"ex must be a tuple or sys.exc_info must return the current "
|
||||
"exception: %s"
|
||||
% self._exc_info_to_raise)
|
||||
except ValueError:
|
||||
# Record this error so it kills the coordinator properly.
|
||||
self._exc_info_to_raise = sys.exc_info()
|
||||
|
||||
self._stop_event.set()
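An illustrative sketch (not from the diff) of the calling pattern the check above enforces: `request_stop` receives either the live exception or a `sys.exc_info()` tuple captured inside the `except` block, and `join` later re-raises it. The worker body is made up.

```python
import sys
import threading

import tensorflow as tf


def worker(coord):
  try:
    raise RuntimeError("failure inside a worker thread")
  except Exception:
    coord.request_stop(sys.exc_info())  # the exception itself is also accepted


coord = tf.train.Coordinator()
thread = threading.Thread(target=worker, args=(coord,))
thread.start()
try:
  coord.join([thread])  # re-raises the recorded RuntimeError
except RuntimeError as e:
  print("caught:", e)
```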
|
||||
|
||||
def clear_stop(self):
|
||||
|
@ -84,20 +84,63 @@ def limit_epochs(tensor, num_epochs=None, name=None):
|
||||
return array_ops.identity(tensor, name=name)
|
||||
|
||||
|
||||
def _input_producer(input_tensor, dtype, num_epochs, shuffle, seed, capacity,
|
||||
shared_name, name, summary_name):
|
||||
if shuffle:
|
||||
input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)
|
||||
input_tensor = limit_epochs(input_tensor, num_epochs)
|
||||
def input_producer(input_tensor, element_shape=None, num_epochs=None,
|
||||
shuffle=True, seed=None, capacity=32, shared_name=None,
|
||||
summary_name=None, name=None):
|
||||
"""Output the rows of `input_tensor` to a queue for an input pipeline.
|
||||
|
||||
q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[dtype], shapes=[[]],
|
||||
shared_name=shared_name, name=name)
|
||||
enq = q.enqueue_many([input_tensor])
|
||||
queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
|
||||
logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name),
|
||||
math_ops.cast(q.size(), dtypes.float32) *
|
||||
(1. / capacity))
|
||||
return q
|
||||
Args:
input_tensor: A tensor with the rows to produce. Must be at least
one-dimensional. Must either have a fully-defined shape, or
`element_shape` must be defined.
element_shape: (Optional.) A `TensorShape` representing the shape of a
row of `input_tensor`, if it cannot be inferred.
num_epochs: (Optional.) An integer. If specified, `input_producer` produces
each row of `input_tensor` `num_epochs` times before generating an
`OutOfRange` error. If not specified, `input_producer` can cycle through
the rows of `input_tensor` an unlimited number of times.
shuffle: (Optional.) A boolean. If true, the rows are randomly shuffled
within each epoch.
seed: (Optional.) An integer. The seed to use if `shuffle` is true.
capacity: (Optional.) The capacity of the queue to be used for buffering
the input.
shared_name: (Optional.) If set, this queue will be shared under the given
name across multiple sessions.
summary_name: (Optional.) If set, a scalar summary for the current queue
size will be generated, using this name as part of the tag.
name: (Optional.) A name for the queue.
|
||||
|
||||
Returns:
|
||||
A queue with the output rows. A `QueueRunner` for the queue is
|
||||
added to the current `QUEUE_RUNNER` collection of the current
|
||||
graph.
|
||||
|
||||
Raises:
|
||||
ValueError: If the shape of the input cannot be inferred from the arguments.
|
||||
"""
|
||||
with ops.op_scope([input_tensor], name, "input_producer"):
|
||||
input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
|
||||
element_shape = input_tensor.get_shape()[1:].merge_with(element_shape)
|
||||
if not element_shape.is_fully_defined():
|
||||
raise ValueError("Either `input_tensor` must have a fully defined shape "
|
||||
"or `element_shape` must be specified")
|
||||
|
||||
if shuffle:
|
||||
input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)
|
||||
|
||||
input_tensor = limit_epochs(input_tensor, num_epochs)
|
||||
|
||||
q = data_flow_ops.FIFOQueue(capacity=capacity,
|
||||
dtypes=[input_tensor.dtype.base_dtype],
|
||||
shapes=[element_shape],
|
||||
shared_name=shared_name, name=name)
|
||||
enq = q.enqueue_many([input_tensor])
|
||||
queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
|
||||
if summary_name is not None:
|
||||
logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name),
|
||||
math_ops.cast(q.size(), dtypes.float32) *
|
||||
(1. / capacity))
|
||||
return q
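As a usage illustration (mirroring the `InputProducerTest` added later in this commit, not new behavior): when the input's static shape is unknown, `element_shape` must be supplied.

```python
import tensorflow as tf

rows = [[1, 2, 3, 4], [5, 6, 7, 8]]
# Hide the static shape so that element_shape is actually required.
input_tensor = tf.placeholder_with_default(rows, shape=None)
queue = tf.train.input_producer(
    input_tensor, element_shape=[4], num_epochs=1, shuffle=False)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  threads = tf.train.start_queue_runners(sess=sess)
  print(sess.run(queue.dequeue_many(2)))  # the two rows, in order
  for thread in threads:
    thread.join()
```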
|
||||
|
||||
|
||||
def string_input_producer(string_tensor, num_epochs=None, shuffle=True,
|
||||
@ -108,9 +151,9 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True,
|
||||
string_tensor: A 1-D string tensor with the strings to produce.
|
||||
num_epochs: An integer (optional). If specified, `string_input_producer`
|
||||
produces each string from `string_tensor` `num_epochs` times before
|
||||
generating an OutOfRange error. If not specified, `string_input_producer`
|
||||
can cycle through the strings in `string_tensor` an unlimited number of
|
||||
times.
|
||||
generating an `OutOfRange` error. If not specified,
|
||||
`string_input_producer` can cycle through the strings in `string_tensor`
|
||||
an unlimited number of times.
|
||||
shuffle: Boolean. If true, the strings are randomly shuffled within each
|
||||
epoch.
|
||||
seed: An integer (optional). Seed used if shuffle == True.
|
||||
@ -137,9 +180,9 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True,
|
||||
logging_ops.Assert(math_ops.greater(array_ops.size(string_tensor), 0),
|
||||
[not_null_err])]):
|
||||
string_tensor = array_ops.identity(string_tensor)
|
||||
return _input_producer(
|
||||
return input_producer(
|
||||
input_tensor=string_tensor,
|
||||
dtype=dtypes.string,
|
||||
element_shape=[],
|
||||
num_epochs=num_epochs,
|
||||
shuffle=shuffle,
|
||||
seed=seed,
|
||||
@ -173,8 +216,8 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
|
||||
"""
|
||||
with ops.op_scope([limit], name, "input_producer") as name:
|
||||
range_tensor = math_ops.range(limit)
|
||||
return _input_producer(
|
||||
range_tensor, dtypes.int32, num_epochs, shuffle, seed, capacity,
|
||||
return input_producer(
|
||||
range_tensor, [], num_epochs, shuffle, seed, capacity,
|
||||
shared_name, name, "fraction_of_%d_full" % capacity)
|
||||
|
||||
|
||||
@ -231,51 +274,104 @@ def _flatten(tensor_list_list):
|
||||
return [tensor for tensor_list in tensor_list_list for tensor in tensor_list]
|
||||
|
||||
|
||||
class _SparseMetaData(object):
|
||||
"""Store information about the Tensor: Is it sparse?, dtype, and rank."""
|
||||
|
||||
def __init__(self, sparse, dtype, rank):
|
||||
self._sparse = sparse
|
||||
self._dtype = dtype
|
||||
self._rank = rank
|
||||
|
||||
def __eq__(self, other):
|
||||
if self.sparse != other.sparse:
|
||||
return False
|
||||
if not self.sparse:
|
||||
return True
|
||||
if self.dtype != other.dtype:
|
||||
return False
|
||||
if not self.rank.is_compatible_with(other.rank):
|
||||
return False
|
||||
return True
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self.__eq__(other)
|
||||
|
||||
def __str__(self):
|
||||
return "[SparseMetaData(%s, %s, %s)]" % (self.sparse, self.dtype, self.rank)
|
||||
|
||||
def merge_with(self, other):
|
||||
if self != other:
|
||||
raise ValueError("SparseMetaData objects are incompatible: %s vs. %s"
|
||||
% (self, other))
|
||||
if self.sparse:
|
||||
self.rank.merge_with(other.rank)
|
||||
return self
|
||||
|
||||
@property
|
||||
def dtype(self):
|
||||
return self._dtype
|
||||
|
||||
@property
|
||||
def sparse(self):
|
||||
return self._sparse
|
||||
|
||||
@property
|
||||
def rank(self):
|
||||
return self._rank
|
||||
|
||||
|
||||
def _serialize_sparse_tensors(tensor_list, enqueue_many):
|
||||
"""Serialize SparseTensors for feeding into batch, etc."""
|
||||
is_sparse_list = [isinstance(t, ops.SparseTensor) for t in tensor_list]
|
||||
sparse_dtypes_list = [
|
||||
t.dtype if isinstance(t, ops.SparseTensor) else None
|
||||
sparse_info_list = [
|
||||
_SparseMetaData(sparse=True,
|
||||
dtype=t.dtype,
|
||||
rank=t.shape.get_shape().with_rank(1)[0])
|
||||
if isinstance(t, ops.SparseTensor)
|
||||
else _SparseMetaData(False, None, None)
|
||||
for t in tensor_list]
|
||||
|
||||
def _maybe_serialize(t, is_sparse):
|
||||
if not is_sparse:
|
||||
def _maybe_serialize(t, sparse):
|
||||
if not sparse:
|
||||
return t
|
||||
return (sparse_ops.serialize_many_sparse(t) if enqueue_many
|
||||
else sparse_ops.serialize_sparse(t))
|
||||
|
||||
serialized_list = [
|
||||
_maybe_serialize(t, is_sparse)
|
||||
for (t, is_sparse) in zip(tensor_list, is_sparse_list)]
|
||||
return serialized_list, is_sparse_list, sparse_dtypes_list
|
||||
_maybe_serialize(t, info.sparse) for (t, info)
|
||||
in zip(tensor_list, sparse_info_list)]
|
||||
|
||||
return serialized_list, sparse_info_list
|
||||
|
||||
|
||||
def _serialize_sparse_tensors_join(tensor_list_list, enqueue_many):
|
||||
"""Serialize SparseTensors for feeding into batch_join, etc."""
|
||||
(s0, is_sparse_list, sparse_dtypes_list) = _serialize_sparse_tensors(
|
||||
(s0, sparse_info_list) = _serialize_sparse_tensors(
|
||||
tensor_list_list[0], enqueue_many)
|
||||
serialized_list_list = [s0]
|
||||
for tensor_list in tensor_list_list[1:]:
|
||||
(s, is_sparse_candidate, sparse_dtypes_candidate) = (
|
||||
_serialize_sparse_tensors(tensor_list, enqueue_many))
|
||||
if is_sparse_candidate != is_sparse_list:
|
||||
s, sparse_info_candidate = _serialize_sparse_tensors(
|
||||
tensor_list, enqueue_many)
|
||||
if sparse_info_list != sparse_info_candidate:
|
||||
raise ValueError("Inconsistent SparseTensors list: %s vs. %s"
|
||||
% (tensor_list_list[0], tensor_list))
|
||||
if sparse_dtypes_candidate != sparse_dtypes_list:
|
||||
raise ValueError("Inconsistent SparseTensor dtypes in list: %s vs. %s"
|
||||
% (tensor_list_list[0], tensor_list))
|
||||
sparse_info_list = [
|
||||
info.merge_with(candidate)
|
||||
for (info, candidate) in zip(sparse_info_list, sparse_info_candidate)]
|
||||
serialized_list_list.append(s)
|
||||
return (serialized_list_list, is_sparse_list, sparse_dtypes_list)
|
||||
|
||||
return (serialized_list_list, sparse_info_list)
|
||||
|
||||
|
||||
def _deserialize_sparse_tensors(serialized_list, is_sparse_list, sparse_dtypes):
|
||||
def _deserialize_sparse_tensors(serialized_list, sparse_info_list):
|
||||
"""Deserialize SparseTensors after dequeue in batch, batch_join, etc."""
|
||||
received_sequence = isinstance(serialized_list, collections.Sequence)
|
||||
if not received_sequence:
|
||||
serialized_list = (serialized_list,)
|
||||
tensors = [sparse_ops.deserialize_many_sparse(s, sparse_dtype) if is_sparse
|
||||
else s
|
||||
for (s, is_sparse, sparse_dtype)
|
||||
in zip(serialized_list, is_sparse_list, sparse_dtypes)]
|
||||
tensors = [
|
||||
sparse_ops.deserialize_many_sparse(s, info.dtype, info.rank.value)
|
||||
if info.sparse else s
|
||||
for (s, info)
|
||||
in zip(serialized_list, sparse_info_list)]
|
||||
return tensors if received_sequence else tensors[0]
|
||||
|
||||
|
||||
@ -345,7 +441,8 @@ def _enqueue(queue, tensor_list, threads, enqueue_many):
|
||||
|
||||
|
||||
def batch(tensor_list, batch_size, num_threads=1, capacity=32,
|
||||
enqueue_many=False, shapes=None, shared_name=None, name=None):
|
||||
enqueue_many=False, shapes=None,
|
||||
shared_name=None, name=None):
|
||||
"""Creates batches of tensors in `tensor_list`.
|
||||
|
||||
This function is implemented using a queue. A `QueueRunner` for the
|
||||
@ -394,7 +491,7 @@ def batch(tensor_list, batch_size, num_threads=1, capacity=32,
|
||||
"""
|
||||
with ops.op_scope(tensor_list, name, "batch") as name:
|
||||
tensor_list = _validate(tensor_list)
|
||||
tensor_list, is_sparse, sparse_dtypes = _serialize_sparse_tensors(
|
||||
(tensor_list, sparse_info) = _serialize_sparse_tensors(
|
||||
tensor_list, enqueue_many)
|
||||
types = _dtypes([tensor_list])
|
||||
shapes = _shapes([tensor_list], shapes, enqueue_many)
|
||||
@ -407,7 +504,7 @@ def batch(tensor_list, batch_size, num_threads=1, capacity=32,
|
||||
math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity))
|
||||
|
||||
dequeued = queue.dequeue_many(batch_size, name=name)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
|
||||
return dequeued
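For illustration (not part of the diff): with the serialize/deserialize plumbing above, `tf.train.batch` can now carry a `SparseTensor` alongside dense tensors; the feature values are arbitrary.

```python
import tensorflow as tf

# One example: a sparse feature over a length-10 vector plus a dense label.
sparse_feature = tf.SparseTensor(indices=[[0], [7]], values=[1.0, 2.0], shape=[10])
dense_label = tf.constant(1)

# Each SparseTensor is serialized before the queue and deserialized after it.
batched_feature, batched_label = tf.train.batch(
    [sparse_feature, dense_label], batch_size=4)
# batched_feature is again a SparseTensor; batched_label has shape [4].
```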
|
||||
|
||||
|
||||
@ -478,8 +575,8 @@ def batch_join(tensor_list_list, batch_size, capacity=32, enqueue_many=False,
|
||||
"""
|
||||
with ops.op_scope(_flatten(tensor_list_list), name, "batch_join") as name:
|
||||
tensor_list_list = _validate_join(tensor_list_list)
|
||||
tensor_list_list, is_sparse, sparse_dtypes = (
|
||||
_serialize_sparse_tensors_join(tensor_list_list, enqueue_many))
|
||||
tensor_list_list, sparse_info = _serialize_sparse_tensors_join(
|
||||
tensor_list_list, enqueue_many)
|
||||
types = _dtypes(tensor_list_list)
|
||||
shapes = _shapes(tensor_list_list, shapes, enqueue_many)
|
||||
# TODO(josh11b,mrry): Switch to BatchQueue once it is written.
|
||||
@ -491,7 +588,7 @@ def batch_join(tensor_list_list, batch_size, capacity=32, enqueue_many=False,
|
||||
math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity))
|
||||
|
||||
dequeued = queue.dequeue_many(batch_size, name=name)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
|
||||
return dequeued
|
||||
|
||||
|
||||
@ -567,7 +664,7 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue,
|
||||
"""
|
||||
with ops.op_scope(tensor_list, name, "shuffle_batch") as name:
|
||||
tensor_list = _validate(tensor_list)
|
||||
tensor_list, is_sparse, sparse_dtypes = _serialize_sparse_tensors(
|
||||
tensor_list, sparse_info = _serialize_sparse_tensors(
|
||||
tensor_list, enqueue_many)
|
||||
types = _dtypes([tensor_list])
|
||||
shapes = _shapes([tensor_list], shapes, enqueue_many)
|
||||
@ -586,7 +683,7 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue,
|
||||
logging_ops.scalar_summary(summary_name, full)
|
||||
|
||||
dequeued = queue.dequeue_many(batch_size, name=name)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
|
||||
return dequeued
|
||||
|
||||
|
||||
@ -652,8 +749,8 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity,
|
||||
with ops.op_scope(
|
||||
_flatten(tensor_list_list), name, "shuffle_batch_join") as name:
|
||||
tensor_list_list = _validate_join(tensor_list_list)
|
||||
tensor_list_list, is_sparse, sparse_dtypes = (
|
||||
_serialize_sparse_tensors_join(tensor_list_list, enqueue_many))
|
||||
tensor_list_list, sparse_info = _serialize_sparse_tensors_join(
|
||||
tensor_list_list, enqueue_many)
|
||||
types = _dtypes(tensor_list_list)
|
||||
shapes = _shapes(tensor_list_list, shapes, enqueue_many)
|
||||
queue = data_flow_ops.RandomShuffleQueue(
|
||||
@ -671,5 +768,5 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity,
|
||||
logging_ops.scalar_summary(summary_name, full)
|
||||
|
||||
dequeued = queue.dequeue_many(batch_size, name=name)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
|
||||
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
|
||||
return dequeued
|
||||
|
@ -69,6 +69,60 @@ class LimitEpochsTest(tf.test.TestCase):
|
||||
love_me_two_times.eval()
|
||||
|
||||
|
||||
class InputProducerTest(tf.test.TestCase):
|
||||
|
||||
def testNoShuffle(self):
|
||||
with self.test_session():
|
||||
input_tensor = [[1, 2, 3, 4],
|
||||
[5, 6, 7, 8],
|
||||
[9, 10, 11, 12]]
|
||||
num_epochs = 2
|
||||
queue = tf.train.input_producer(
|
||||
input_tensor, num_epochs=num_epochs, shuffle=False)
|
||||
dequeue_many = queue.dequeue_many(len(input_tensor) * num_epochs)
|
||||
dequeue = queue.dequeue()
|
||||
tf.initialize_all_variables().run()
|
||||
threads = tf.train.start_queue_runners()
|
||||
|
||||
# No randomness, so just see repeated copies of the input.
|
||||
self.assertAllEqual(input_tensor * num_epochs, dequeue_many.eval())
|
||||
|
||||
# Reached the limit.
|
||||
with self.assertRaises(tf.errors.OutOfRangeError):
|
||||
dequeue.eval()
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
def testNoShapeInference(self):
|
||||
with self.test_session():
|
||||
# Disable shape inference for the input.
|
||||
input_value = [[1, 2, 3, 4],
|
||||
[5, 6, 7, 8],
|
||||
[9, 10, 11, 12]]
|
||||
input_tensor = tf.placeholder_with_default(input_value, shape=None)
|
||||
num_epochs = 2
|
||||
queue = tf.train.input_producer(
|
||||
input_tensor, element_shape=[4], num_epochs=num_epochs, shuffle=False)
|
||||
dequeue_many = queue.dequeue_many(len(input_value) * num_epochs)
|
||||
dequeue = queue.dequeue()
|
||||
tf.initialize_all_variables().run()
|
||||
threads = tf.train.start_queue_runners()
|
||||
|
||||
# No randomness, so just see repeated copies of the input.
|
||||
self.assertAllEqual(input_value * num_epochs, dequeue_many.eval())
|
||||
|
||||
# Reached the limit.
|
||||
with self.assertRaises(tf.errors.OutOfRangeError):
|
||||
dequeue.eval()
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
def testShapeError(self):
|
||||
input_tensor = tf.placeholder(tf.float32, None)
|
||||
with self.assertRaisesRegexp(ValueError, "fully defined shape"):
|
||||
_ = tf.train.input_producer(input_tensor)
|
||||
|
||||
|
||||
class StringInputProducerTest(tf.test.TestCase):
|
||||
|
||||
def testNoShuffle(self):
|
||||
|
@ -25,11 +25,14 @@ import time
|
||||
|
||||
import six
|
||||
|
||||
from tensorflow.core.framework import graph_pb2
|
||||
from tensorflow.core.framework import summary_pb2
|
||||
from tensorflow.core.util import event_pb2
|
||||
from tensorflow.python import pywrap_tensorflow
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.lib.io import tf_record
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import logging
|
||||
from tensorflow.python.util import compat
|
||||
|
||||
|
||||
@ -53,7 +56,8 @@ class SummaryWriter(object):
|
||||
@@close
|
||||
"""
|
||||
|
||||
def __init__(self, logdir, graph_def=None, max_queue=10, flush_secs=120):
|
||||
def __init__(self, logdir, graph=None, max_queue=10, flush_secs=120,
|
||||
graph_def=None):
|
||||
"""Creates a `SummaryWriter` and an event file.
|
||||
|
||||
On construction the summary writer creates a new event file in `logdir`.
|
||||
@ -61,7 +65,7 @@ class SummaryWriter(object):
|
||||
call one of the following functions: `add_summary()`, `add_session_log()`,
|
||||
`add_event()`, or `add_graph()`.
|
||||
|
||||
If you pass a `graph_def` protocol buffer to the constructor it is added to
|
||||
If you pass a `Graph` to the constructor it is added to
|
||||
the event file. (This is equivalent to calling `add_graph()` later).
|
||||
|
||||
TensorBoard will pick the graph from the file and display it graphically so
|
||||
@ -72,8 +76,8 @@ class SummaryWriter(object):
|
||||
...create a graph...
|
||||
# Launch the graph in a session.
|
||||
sess = tf.Session()
|
||||
# Create a summary writer, add the 'graph_def' to the event file.
|
||||
writer = tf.train.SummaryWriter(<some-directory>, sess.graph_def)
|
||||
# Create a summary writer, add the 'graph' to the event file.
|
||||
writer = tf.train.SummaryWriter(<some-directory>, sess.graph)
|
||||
```
|
||||
|
||||
The other arguments to the constructor control the asynchronous writes to
|
||||
@ -86,10 +90,11 @@ class SummaryWriter(object):
|
||||
|
||||
Args:
|
||||
logdir: A string. Directory where event file will be written.
|
||||
graph_def: A `GraphDef` protocol buffer.
|
||||
graph: A `Graph` object, such as `sess.graph`.
|
||||
max_queue: Integer. Size of the queue for pending events and summaries.
|
||||
flush_secs: Number. How often, in seconds, to flush the
|
||||
pending events and summaries to disk.
|
||||
graph_def: DEPRECATED: Use the `graph` argument instead.
|
||||
"""
|
||||
self._logdir = logdir
|
||||
if not gfile.IsDirectory(self._logdir):
|
||||
@ -100,8 +105,9 @@ class SummaryWriter(object):
|
||||
self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
|
||||
flush_secs)
|
||||
self._worker.start()
|
||||
if graph_def is not None:
|
||||
self.add_graph(graph_def)
|
||||
if graph is not None or graph_def is not None:
|
||||
# Calling it with both graph and graph_def for backward compatibility.
|
||||
self.add_graph(graph=graph, graph_def=graph_def)
|
||||
|
||||
def add_summary(self, summary, global_step=None):
|
||||
"""Adds a `Summary` protocol buffer to the event file.
|
||||
@ -154,23 +160,65 @@ class SummaryWriter(object):
|
||||
"""
|
||||
self._event_queue.put(event)
|
||||
|
||||
def add_graph(self, graph_def, global_step=None):
|
||||
"""Adds a `GraphDef` protocol buffer to the event file.
|
||||
|
||||
The graph described by the protocol buffer will be displayed by
|
||||
TensorBoard. Most users pass a graph in the constructor instead.
|
||||
|
||||
Args:
|
||||
graph_def: A `GraphDef` protocol buffer.
|
||||
global_step: Number. Optional global step counter to record with the
|
||||
graph.
|
||||
"""
|
||||
def _add_graph_def(self, graph_def, global_step=None):
|
||||
graph_bytes = graph_def.SerializeToString()
|
||||
event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes)
|
||||
if global_step is not None:
|
||||
event.step = int(global_step)
|
||||
self._event_queue.put(event)
|
||||
|
||||
def add_graph(self, graph, global_step=None, graph_def=None):
|
||||
"""Adds a `Graph` to the event file.
|
||||
|
||||
The graph described by the protocol buffer will be displayed by
|
||||
TensorBoard. Most users pass a graph in the constructor instead.
|
||||
|
||||
Args:
|
||||
graph: A `Graph` object, such as `sess.graph`.
|
||||
global_step: Number. Optional global step counter to record with the
|
||||
graph.
|
||||
graph_def: DEPRECATED. Use the `graph` parameter instead.
|
||||
|
||||
Raises:
|
||||
ValueError: If both graph and graph_def are passed to the method.
|
||||
"""
|
||||
|
||||
if graph is not None and graph_def is not None:
|
||||
raise ValueError("Please pass only graph, or graph_def (deprecated), "
|
||||
"but not both.")
|
||||
|
||||
if isinstance(graph, ops.Graph) or isinstance(graph_def, ops.Graph):
|
||||
# The user passed a `Graph`.
|
||||
|
||||
# Check if the user passed it via the graph or the graph_def argument and
|
||||
# correct for that.
|
||||
if not isinstance(graph, ops.Graph):
|
||||
logging.warning("When passing a `Graph` object, please use the `graph`"
|
||||
" named argument instead of `graph_def`.")
|
||||
graph = graph_def
|
||||
|
||||
# Serialize the graph with additional info.
|
||||
true_graph_def = graph.as_graph_def(add_shapes=True)
|
||||
elif (isinstance(graph, graph_pb2.GraphDef)
|
||||
or isinstance(graph_def, graph_pb2.GraphDef)):
|
||||
# The user passed a `GraphDef`.
|
||||
logging.warning("Passing a `GraphDef` to the SummaryWriter is deprecated."
|
||||
" Pass a `Graph` object instead, such as `sess.graph`.")
|
||||
|
||||
# Check if the user passed it via the graph or the graph_def argument and
|
||||
# correct for that.
|
||||
if isinstance(graph, graph_pb2.GraphDef):
|
||||
true_graph_def = graph
|
||||
else:
|
||||
true_graph_def = graph_def
|
||||
|
||||
else:
|
||||
# The user passed neither `Graph`, nor `GraphDef`.
|
||||
raise TypeError("The passed graph must be an instance of `Graph` "
|
||||
"or the deprecated `GraphDef`")
|
||||
# Finally, add the graph_def to the summary writer.
|
||||
self._add_graph_def(true_graph_def, global_step)
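For illustration (not in the diff), the two constructor forms this logic accepts, mirroring the tests added below; the log directories are placeholders.

```python
import tensorflow as tf

with tf.Graph().as_default() as g:
  tf.constant([12], name="douze")

# Preferred after this change: pass the Graph (shapes recorded via add_shapes).
writer = tf.train.SummaryWriter("/tmp/logs_graph", graph=g)
writer.close()

# Still accepted, with a deprecation warning: pass a GraphDef.
writer = tf.train.SummaryWriter("/tmp/logs_graph_def", graph_def=g.as_graph_def())
writer.close()
```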
|
||||
|
||||
def flush(self):
|
||||
"""Flushes the event file to disk.
|
||||
|
||||
|
@ -49,6 +49,25 @@ class SummaryWriterTestCase(tf.test.TestCase):
|
||||
def _assertRecent(self, t):
|
||||
self.assertTrue(abs(t - time.time()) < 5)
|
||||
|
||||
def _assertEventsWithGraph(self, test_dir, g, has_shapes):
|
||||
rr = self._EventsReader(test_dir)
|
||||
|
||||
# The first event should list the file_version.
|
||||
ev = next(rr)
|
||||
self._assertRecent(ev.wall_time)
|
||||
self.assertEquals("brain.Event:2", ev.file_version)
|
||||
|
||||
# The next event should have the graph.
|
||||
ev = next(rr)
|
||||
self._assertRecent(ev.wall_time)
|
||||
self.assertEquals(0, ev.step)
|
||||
ev_graph = tf.GraphDef()
|
||||
ev_graph.ParseFromString(ev.graph_def)
|
||||
self.assertProtoEquals(g.as_graph_def(add_shapes=has_shapes), ev_graph)
|
||||
|
||||
# We should be done.
|
||||
self.assertRaises(StopIteration, lambda: next(rr))
|
||||
|
||||
def testAddingSummaryAndGraph(self):
|
||||
test_dir = self._CleanTestDir("basics")
|
||||
sw = tf.train.SummaryWriter(test_dir)
|
||||
@ -105,30 +124,54 @@ class SummaryWriterTestCase(tf.test.TestCase):
|
||||
# We should be done.
|
||||
self.assertRaises(StopIteration, lambda: next(rr))
|
||||
|
||||
def testInitializingWithGraphDef(self):
|
||||
test_dir = self._CleanTestDir("basics_with_graph")
|
||||
def testGraphAsNamed(self):
|
||||
test_dir = self._CleanTestDir("basics_named_graph")
|
||||
with tf.Graph().as_default() as g:
|
||||
tf.constant([12], name="douze")
|
||||
sw = tf.train.SummaryWriter(test_dir, graph=g)
|
||||
sw.close()
|
||||
self._assertEventsWithGraph(test_dir, g, True)
|
||||
|
||||
def testGraphAsPositional(self):
|
||||
test_dir = self._CleanTestDir("basics_positional_graph")
|
||||
with tf.Graph().as_default() as g:
|
||||
tf.constant([12], name="douze")
|
||||
sw = tf.train.SummaryWriter(test_dir, g)
|
||||
sw.close()
|
||||
self._assertEventsWithGraph(test_dir, g, True)
|
||||
|
||||
def testGraphDefAsNamed(self):
|
||||
test_dir = self._CleanTestDir("basics_named_graph_def")
|
||||
with tf.Graph().as_default() as g:
|
||||
tf.constant([12], name="douze")
|
||||
gd = g.as_graph_def()
|
||||
sw = tf.train.SummaryWriter(test_dir, graph_def=gd)
|
||||
sw.close()
|
||||
rr = self._EventsReader(test_dir)
|
||||
self._assertEventsWithGraph(test_dir, g, False)
|
||||
|
||||
# The first event should list the file_version.
|
||||
ev = next(rr)
|
||||
self._assertRecent(ev.wall_time)
|
||||
self.assertEquals("brain.Event:2", ev.file_version)
|
||||
def testGraphDefAsPositional(self):
|
||||
test_dir = self._CleanTestDir("basics_positional_graph_def")
|
||||
with tf.Graph().as_default() as g:
|
||||
tf.constant([12], name="douze")
|
||||
gd = g.as_graph_def()
|
||||
sw = tf.train.SummaryWriter(test_dir, gd)
|
||||
sw.close()
|
||||
self._assertEventsWithGraph(test_dir, g, False)
|
||||
|
||||
# The next event should have the graph.
|
||||
ev = next(rr)
|
||||
self._assertRecent(ev.wall_time)
|
||||
self.assertEquals(0, ev.step)
|
||||
ev_graph = tf.GraphDef()
|
||||
ev_graph.ParseFromString(ev.graph_def)
|
||||
self.assertProtoEquals(gd, ev_graph)
|
||||
def testGraphAndGraphDef(self):
|
||||
with self.assertRaises(ValueError):
|
||||
test_dir = self._CleanTestDir("basics_graph_and_graph_def")
|
||||
with tf.Graph().as_default() as g:
|
||||
tf.constant([12], name="douze")
|
||||
gd = g.as_graph_def()
|
||||
sw = tf.train.SummaryWriter(test_dir, graph=g, graph_def=gd)
|
||||
sw.close()
|
||||
|
||||
# We should be done.
|
||||
self.assertRaises(StopIteration, lambda: next(rr))
|
||||
def testNeitherGraphNorGraphDef(self):
|
||||
with self.assertRaises(TypeError):
|
||||
test_dir = self._CleanTestDir("basics_string_instead_of_graph")
|
||||
sw = tf.train.SummaryWriter(test_dir, "string instead of graph object")
|
||||
sw.close()
|
||||
|
||||
# Checks that values returned from session Run() calls are added correctly to
|
||||
# summaries. These are numpy types so we need to check they fit in the
|
||||
|
@ -844,7 +844,7 @@ class SVSummaryThread(coordinator.LooperThread):
|
||||
self._sess = sess
|
||||
|
||||
def run_loop(self):
|
||||
if self._sv.global_step:
|
||||
if self._sv.global_step is not None:
|
||||
summary_strs, global_step = self._sess.run([self._sv.summary_op,
|
||||
self._sv.global_step])
|
||||
else:
|
||||
@ -912,7 +912,7 @@ class SVTimerCheckpointThread(coordinator.LooperThread):
|
||||
def run_loop(self):
|
||||
self._sv.saver.save(self._sess, self._sv.save_path,
|
||||
global_step=self._sv.global_step)
|
||||
if self._sv.summary_writer and self._sv.global_step:
|
||||
if self._sv.summary_writer and self._sv.global_step is not None:
|
||||
current_step = training_util.global_step(self._sess, self._sv.global_step)
|
||||
self._sv.summary_writer.add_session_log(
|
||||
SessionLog(status=SessionLog.CHECKPOINT,
|
||||
|
@ -50,6 +50,7 @@ namespace perftools {
|
||||
namespace gputools {
|
||||
|
||||
class Stream;
|
||||
class ScratchAllocator;
|
||||
|
||||
template <typename ElemT>
|
||||
class DeviceMemory;
|
||||
@ -880,14 +881,14 @@ class BlasSupport {
|
||||
const port::ArraySlice<DeviceMemory<float> *> &a, int lda,
|
||||
const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,
|
||||
const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
|
||||
int batch_count) = 0;
|
||||
int batch_count, ScratchAllocator *scratch_allocator) = 0;
|
||||
virtual bool DoBlasGemmBatched(
|
||||
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
||||
uint64 n, uint64 k, double alpha,
|
||||
const port::ArraySlice<DeviceMemory<double> *> &a, int lda,
|
||||
const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta,
|
||||
const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
|
||||
int batch_count) = 0;
|
||||
int batch_count, ScratchAllocator *scratch_allocator) = 0;
|
||||
virtual bool DoBlasGemmBatched(
|
||||
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
||||
uint64 n, uint64 k, std::complex<float> alpha,
|
||||
@ -895,7 +896,7 @@ class BlasSupport {
|
||||
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
|
||||
std::complex<float> beta,
|
||||
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
|
||||
int batch_count) = 0;
|
||||
int batch_count, ScratchAllocator *scratch_allocator) = 0;
|
||||
virtual bool DoBlasGemmBatched(
|
||||
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
||||
uint64 n, uint64 k, std::complex<double> alpha,
|
||||
@ -903,7 +904,7 @@ class BlasSupport {
|
||||
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
|
||||
std::complex<double> beta,
|
||||
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
|
||||
int batch_count) = 0;
|
||||
int batch_count, ScratchAllocator *scratch_allocator) = 0;
|
||||
|
||||
// Computes a matrix-matrix product where one input matrix is Hermitian:
|
||||
//
|
||||
@ -1140,7 +1141,7 @@ class BlasSupport {
|
||||
|
||||
// Macro used to quickly declare overrides for abstract virtuals in the
|
||||
// BlasSupport base class.
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \
|
||||
bool DoBlasAsum(Stream *stream, uint64 elem_count, \
|
||||
const DeviceMemory<float> &x, int incx, \
|
||||
DeviceMemory<float> *result) override; \
|
||||
@ -1626,14 +1627,14 @@ class BlasSupport {
|
||||
const port::ArraySlice<DeviceMemory<float> *> &a, int lda, \
|
||||
const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, \
|
||||
const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, \
|
||||
int batch_count) override; \
|
||||
int batch_count, ScratchAllocator *scratch_allocator) override; \
|
||||
bool DoBlasGemmBatched( \
|
||||
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
|
||||
uint64 m, uint64 n, uint64 k, double alpha, \
|
||||
const port::ArraySlice<DeviceMemory<double> *> &a, int lda, \
|
||||
const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, \
|
||||
const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, \
|
||||
int batch_count) override; \
|
||||
int batch_count, ScratchAllocator *scratch_allocator) override; \
|
||||
bool DoBlasGemmBatched( \
|
||||
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
|
||||
uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \
|
||||
@ -1641,7 +1642,7 @@ class BlasSupport {
|
||||
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, \
|
||||
std::complex<float> beta, \
|
||||
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, \
|
||||
int batch_count) override; \
|
||||
int batch_count, ScratchAllocator *scratch_allocator) override; \
|
||||
bool DoBlasGemmBatched( \
|
||||
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
|
||||
uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \
|
||||
@ -1650,7 +1651,7 @@ class BlasSupport {
|
||||
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, \
|
||||
int ldb, std::complex<double> beta, \
|
||||
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, \
|
||||
int ldc, int batch_count) override; \
|
||||
int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \
|
||||
bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \
|
||||
uint64 m, uint64 n, std::complex<float> alpha, \
|
||||
const DeviceMemory<std::complex<float>> &a, int lda, \
|
||||
|
@ -19,6 +19,7 @@ limitations under the License.

#include <complex>

#include "third_party/gpus/cuda/include/cublas_v2.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
@ -34,8 +35,8 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "third_party/gpus/cuda/include/cublas_v2.h"

namespace perftools {
namespace gputools {
@ -1707,37 +1708,64 @@ template <typename T, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
FuncT cublas_func, Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
int batch_count) {
std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec;
const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
for (int i = 0; i < batch_count; ++i) {
a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque()));
b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque()));
c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque()));
a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque()));
b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque()));
c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
}

typedef typename CUDAComplexT<T>::type CUDA_T;
SE_ASSIGN_OR_RETURN(
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));

if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(),
a_ptr_vec.data(), batch_count * sizeof(T *))
.ok() ||
!stream->ThenMemcpy(b_ptr_array->mutable_device_memory(),
b_ptr_vec.data(), batch_count * sizeof(T *))
.ok() ||
!stream->ThenMemcpy(c_ptr_array->mutable_device_memory(),
c_ptr_vec.data(), batch_count * sizeof(T *))
.ok()) {
const size_t size = batch_count * sizeof(CUDA_T *);

// Device-side copy of pointers to matrices.
DeviceMemory<CUDA_T *> a;
DeviceMemory<CUDA_T *> b;
DeviceMemory<CUDA_T *> c;

// If temporary space is allocated for device-side copies of pointers to
// matrices, that temporary space should not be freed until this function
// returns. Although the values for these unique_ptrs are not set here, they
// are declared at this scope so they will be destroyed when the function
// returns.
//
// If a scratch allocator is provided, these pointers will not be used at all.
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary;
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary;
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary;

// Decide how to allocate device-side copy of pointers to matrices based on
// whether a scratch allocator was passed.
if (scratch_allocator != nullptr) {
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes,
scratch_allocator->AllocateBytes(stream, size));
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes,
scratch_allocator->AllocateBytes(stream, size));
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes,
scratch_allocator->AllocateBytes(stream, size));
a = DeviceMemory<CUDA_T *>(a_bytes);
b = DeviceMemory<CUDA_T *>(b_bytes);
c = DeviceMemory<CUDA_T *>(c_bytes);
} else {
SE_ASSIGN_OR_RETURN(a_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(b_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(c_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory());
b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory());
c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory());
}

if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() ||
!stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() ||
!stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) {
return port::Status(port::error::INTERNAL,
"failed to copy memory from host to device in "
"CUDABlas::DoBlasGemmBatched");
@ -1746,13 +1774,9 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
bool ok = DoBlasInternal(
cublas_func, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
CUDAComplex(&alpha),
const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())),
lda,
const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())),
ldb, CUDAComplex(&beta),
const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc,
batch_count);
CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);

if (ok) {
return port::Status::OK();
@ -1767,10 +1791,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
int batch_count) {
int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}

bool CUDABlas::DoBlasGemmBatched(
@ -1779,10 +1804,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
int ldc, int batch_count) {
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}

bool CUDABlas::DoBlasGemmBatched(
@ -1793,10 +1819,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
int ldb, std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
int ldc, int batch_count) {
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}

bool CUDABlas::DoBlasGemmBatched(
@ -1807,10 +1834,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
int ldb, std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
int ldc, int batch_count) {
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}

bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,

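Note: as the comments above describe, the rewritten DoBlasGemmBatchedInternal takes the backing store for the device-side pointer arrays from the caller's scratch allocator when one is supplied, and otherwise falls back to stream-owned temporary memory held in unique_ptrs that stay alive until the function returns. The host-only sketch below mirrors that control flow; FakeAllocator and FakeTemporary are illustrative stand-ins, not ScratchAllocator or TemporaryDeviceMemory.

// Host-only sketch of "scratch allocator if provided, otherwise a temporary
// whose lifetime spans the call".
#include <cstddef>
#include <cstring>
#include <memory>
#include <vector>

struct FakeAllocator {
  // Returns caller-visible storage that outlives this call site.
  void *AllocateBytes(size_t size) {
    pool_.emplace_back(size);
    return pool_.back().data();
  }
  std::vector<std::vector<char>> pool_;
};

using FakeTemporary = std::vector<char>;  // stands in for TemporaryDeviceMemory

bool CopyPointerArray(const std::vector<const float *> &raw_ptrs,
                      FakeAllocator *scratch_allocator) {
  const size_t size = raw_ptrs.size() * sizeof(const float *);

  // Declared at function scope so a fallback temporary stays alive until
  // the function returns, matching the lifetime comment in the diff.
  std::unique_ptr<FakeTemporary> temporary;
  void *dst = nullptr;

  if (scratch_allocator != nullptr) {
    dst = scratch_allocator->AllocateBytes(size);       // caller-provided scratch
  } else {
    temporary = std::make_unique<FakeTemporary>(size);  // internal fallback
    dst = temporary->data();
  }

  // Stands in for stream->ThenMemcpy of the host pointer array to the device.
  std::memcpy(dst, raw_ptrs.data(), size);
  return true;
}

int main() {
  float x = 1.f;
  std::vector<const float *> ptrs = {&x, &x};
  FakeAllocator alloc;
  CopyPointerArray(ptrs, &alloc);   // scratch-allocator path
  CopyPointerArray(ptrs, nullptr);  // temporary-memory path
}
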
@ -93,7 +93,7 @@ class CUDABlas : public blas::BlasSupport {
const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
int batch_count);
int batch_count, ScratchAllocator *scratch_allocator);

// mutex that guards the cuBLAS handle for this device.
mutex mu_;

@ -2986,6 +2986,17 @@ Stream &Stream::ThenBlasGemmBatched(
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}

Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@ -2993,9 +3004,12 @@ Stream &Stream::ThenBlasGemmBatched(
ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
const port::ArraySlice<DeviceMemory<float> *> &, int,
const port::ArraySlice<DeviceMemory<float> *> &, int, float,
const port::ArraySlice<DeviceMemory<float> *> &, int, int> impl;
const port::ArraySlice<DeviceMemory<float> *> &, int, int,
ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}

Stream &Stream::ThenBlasGemmBatched(
@ -3004,6 +3018,17 @@ Stream &Stream::ThenBlasGemmBatched(
int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}

Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a,
int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@ -3011,9 +3036,12 @@ Stream &Stream::ThenBlasGemmBatched(
ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, double,
const port::ArraySlice<DeviceMemory<double> *> &, int,
const port::ArraySlice<DeviceMemory<double> *> &, int, double,
const port::ArraySlice<DeviceMemory<double> *> &, int, int> impl;
const port::ArraySlice<DeviceMemory<double> *> &, int, int,
ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}

Stream &Stream::ThenBlasGemmBatched(
@ -3024,6 +3052,19 @@ Stream &Stream::ThenBlasGemmBatched(
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}

Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<float> alpha,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@ -3035,9 +3076,11 @@ Stream &Stream::ThenBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &,
int, std::complex<float>,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &,
int, int> impl;
int, int, ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}

Stream &Stream::ThenBlasGemmBatched(
@ -3048,6 +3091,19 @@ Stream &Stream::ThenBlasGemmBatched(
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}

Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<double> alpha,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@ -3059,9 +3115,11 @@ Stream &Stream::ThenBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &,
int, std::complex<double>,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &,
int, int> impl;
int, int, ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}

Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) {

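Note: each legacy Stream::ThenBlasGemmBatched overload above now forwards to the new ThenBlasGemmBatchedWithScratch variant with a null allocator, so existing call sites keep their behavior while new call sites can supply reusable scratch space. A minimal sketch of that forwarding idiom, using hypothetical FakeStream and FakeScratchAllocator classes rather than the real Stream API:

// Sketch of "legacy overload forwards to the *WithScratch variant with
// nullptr"; the chainable return-*this style mirrors Stream::Then* calls.
#include <iostream>

class FakeScratchAllocator {};

class FakeStream {
 public:
  // Legacy entry point: unchanged signature for existing callers.
  FakeStream &ThenGemmBatched(int batch_count) {
    return ThenGemmBatchedWithScratch(batch_count, nullptr);
  }

  // New entry point: callers that want reusable scratch space pass an allocator.
  FakeStream &ThenGemmBatchedWithScratch(int batch_count,
                                         FakeScratchAllocator *scratch_allocator) {
    std::cout << "batch_count=" << batch_count << " scratch="
              << (scratch_allocator ? "caller" : "internal") << "\n";
    return *this;
  }
};

int main() {
  FakeStream stream;
  FakeScratchAllocator allocator;
  stream.ThenGemmBatched(8)                           // old call sites
        .ThenGemmBatchedWithScratch(8, &allocator);   // new call sites
}
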
@ -944,6 +944,34 @@ class Stream {
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a,
int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<float> alpha,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<double> alpha,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);

// See BlasSupport::DoBlasHemm.
Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m,

@ -115,7 +115,7 @@ The #center div contains tf-charts embedded inside tf-collapsable-panes.
<p>
Maybe data hasn't loaded yet, or maybe you need
to add some <code>tf.scalar_summary</code> ops to your graph, and
serialize them using the <code>tf.training.summary_io.SummaryWriter</code>.
serialize them using the <code>tf.train.SummaryWriter</code>.
</p>
</div>
</template>

@ -75,7 +75,6 @@ Properties out:
display: flex;
flex-grow: 1;
flex-shrink: 1;
height: 0px; /* hackhack So the flex-grow takes over and gives it space */
}
.x-button {
font-size: 13px;

@ -515,6 +515,13 @@ function addEdges(h: Hierarchy, graph: SlimGraph,
let sourceAncestorIndex = getPath(graph.nodes[baseEdge.v], sourcePath);
let destAncestorIndex = getPath(graph.nodes[baseEdge.w], destPath);

// If the hierarchical path cannot be found for either endpoint, then we
// cannot create the edge. This happens for example when a node has a
// control dependency on a summary node, which are embedded.
if (sourceAncestorIndex === -1 || destAncestorIndex === -1) {
return;
}

// Find the lowest shared ancestor between source and dest by looking for
// the highest nodes that differ between their ancestor paths.
while (sourcePath[sourceAncestorIndex] === destPath[destAncestorIndex]) {

@ -87,7 +87,7 @@ export const PARAMS = {
*/
labelHeight: 20,
/** X-space between each extracted node and the core graph. */
extractXOffset: 50,
extractXOffset: 15,
/** Y-space between each extracted node. */
extractYOffset: 20
},
@ -486,9 +486,24 @@ function layoutMetanode(renderNodeInfo: render.RenderGroupNodeInfo): void {
return height + yOffset + child.height;
}, 0);

// Compute the total padding between the core graph, in-extract and
// out-extract boxes.
let numParts = 0;
if (renderNodeInfo.isolatedInExtract.length > 0) {
numParts++;
}
if (renderNodeInfo.isolatedOutExtract.length > 0) {
numParts++;
}
if (renderNodeInfo.coreGraph.nodeCount() > 0) {
numParts++;
}
let offset = PARAMS.subscene.meta.extractXOffset;
let padding = numParts <= 1 ? 0 : (numParts <= 2 ? offset : 2 * offset);

// Add the in-extract and out-extract width to the core box width.
renderNodeInfo.coreBox.width += renderNodeInfo.inExtractBox.width +
renderNodeInfo.outExtractBox.width;
renderNodeInfo.outExtractBox.width + padding;
renderNodeInfo.coreBox.height =
params.labelHeight +
Math.max(

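Note: the padding rule introduced above works out to 0 when at most one of the core graph, in-extract box, and out-extract box is non-empty, extractXOffset when exactly two are, and 2 * extractXOffset when all three are present. As a worked example, if the offset read here were the 15 set in the PARAMS hunk above, a metanode with a core graph plus both extract boxes would add 2 × 15 = 30 pixels of padding to coreBox.width.
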
@ -964,8 +964,6 @@ export class RenderNodeInfo {

/** Label vertical offset from the center of node shape */
labelOffset: number;
/** X-space between each extracted node and the core graph. */
extractXOffset: number;
/** Rectangle radius (for making rounded rectangle) */
radius: number;

@ -1027,7 +1025,6 @@ export class RenderNodeInfo {

// Params for node box.
this.labelOffset = 0;
this.extractXOffset = 0;
this.radius = 0;

// Params for expanded node