Merge commit for internal changes

Vijay Vasudevan 2016-06-07 10:10:03 -07:00
commit b7c416926e
104 changed files with 4860 additions and 965 deletions

View File

@ -1,6 +1,6 @@
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-d02e6a705c30"
archive_dir = "eigen-eigen-0c0b79ecd74c"
cc_library(
name = "eigen",

View File

@ -7,7 +7,7 @@
include (ExternalProject)
set(eigen_archive_hash "d02e6a705c30")
set(eigen_archive_hash "0c0b79ecd74c")
set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR}
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3
)
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf)
set(eigen_HASH SHA256=b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

View File

@ -198,6 +198,30 @@ py_test(
],
)
py_test(
name = "experiment_test",
size = "small",
srcs = ["python/learn/tests/experiment_test.py"],
srcs_version = "PY2AND3",
deps = [
":learn",
"//tensorflow:tensorflow_py",
"//tensorflow/python:framework_test_lib",
],
)
py_test(
name = "learn_runner_test",
size = "small",
srcs = ["python/learn/tests/learn_runner_test.py"],
srcs_version = "PY2AND3",
deps = [
":learn",
"//tensorflow:tensorflow_py",
"//tensorflow/python:framework_test_lib",
],
)
py_test(
name = "tensor_signature_test",
srcs = ["python/learn/estimators/tensor_signature_test.py"],
@ -439,6 +463,19 @@ py_test(
],
)
py_test(
name = "stability_test",
size = "small",
srcs = ["python/learn/tests/stability_test.py"],
srcs_version = "PY2AND3",
deps = [
":learn",
"//tensorflow:tensorflow_py",
"//tensorflow/python:framework",
"//tensorflow/python:framework_test_lib",
],
)
py_binary(
name = "inspect_checkpoint",
srcs = [

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import numpy as np
# pylint: disable=wildcard-import
from tensorflow.contrib.learn.python.learn import datasets
from tensorflow.contrib.learn.python.learn import estimators
from tensorflow.contrib.learn.python.learn import graph_actions
@ -30,9 +31,9 @@ from tensorflow.contrib.learn.python.learn import monitors
from tensorflow.contrib.learn.python.learn import ops
from tensorflow.contrib.learn.python.learn import preprocessing
from tensorflow.contrib.learn.python.learn import utils
# pylint: disable=wildcard-import
from tensorflow.contrib.learn.python.learn.dataframe import *
from tensorflow.contrib.learn.python.learn.estimators import *
from tensorflow.contrib.learn.python.learn.experiment import Experiment
from tensorflow.contrib.learn.python.learn.graph_actions import evaluate
from tensorflow.contrib.learn.python.learn.graph_actions import infer
from tensorflow.contrib.learn.python.learn.graph_actions import NanLossDuringTrainingError

View File

@ -50,11 +50,11 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -145,11 +145,11 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
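The docstring fix above switches the examples to explicit keyword arguments. A minimal sketch of the corrected call pattern, assuming an already-constructed `estimator` (e.g. a `DNNClassifier`) and placeholder input functions and data:

```python
# Placeholders: input_fn_train/input_fn_eval return ({name: Tensor}, Tensor);
# x is a matrix of shape [n_samples, n_features].
estimator.fit(input_fn=input_fn_train, steps=100)
scores = estimator.evaluate(input_fn=input_fn_eval, steps=1)
predictions = estimator.predict(x=x)  # keyword form, per the corrected docstring
```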

View File

@ -397,12 +397,12 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
...
def input_fn_eval: # returns x, y
...
estimator.train(input_fn_train)
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -42,7 +42,14 @@ def _prepare_iris_data_for_logistic_regression():
return iris
def _iris_input_fn():
def _iris_input_multiclass_fn():
iris = tf.contrib.learn.datasets.load_iris()
return {
'feature': tf.constant(iris.data, dtype=tf.float32)
}, tf.constant(iris.target, shape=[150, 1], dtype=tf.int32)
def _iris_input_logistic_fn():
iris = _prepare_iris_data_for_logistic_regression()
return {
'feature': tf.constant(iris.data, dtype=tf.float32)
@ -64,8 +71,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_feature_columns=cont_features,
dnn_hidden_units=[3, 3])
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testLogisticRegression_TensorData(self):
@ -127,8 +134,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_feature_columns=cont_features,
dnn_hidden_units=[3, 3])
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_multiclass_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_multiclass_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testWeightColumn(self):
@ -210,8 +217,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_hidden_units=[3, 3],
dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=0.1))
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testCustomOptimizerByString(self):
@ -230,8 +237,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_hidden_units=[3, 3],
dnn_optimizer='Adagrad')
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testPredict(self):

View File

@ -111,8 +111,8 @@ class BaseEstimator(sklearn.BaseEstimator):
self._model_dir = model_dir
if self._model_dir is None:
self._model_dir = tempfile.mkdtemp()
logging.info('Using temporary folder as model directory: %s',
self._model_dir)
logging.warning('Using temporary folder as model directory: %s',
self._model_dir)
# Create a run configuration
if config is None:
@ -135,9 +135,8 @@ class BaseEstimator(sklearn.BaseEstimator):
self._graph = None
def fit(
self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
monitors=None):
def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
monitors=None):
"""Trains a model given training data `x` predictions and `y` targets.
Args:
@ -421,21 +420,20 @@ class BaseEstimator(sklearn.BaseEstimator):
monitors=None,
log_every_steps=100,
fail_on_nan_loss=True):
# TODO(wicke): This is a hack and needs to go.
if self._config.execution_mode not in ('all', 'train'):
return
# TODO(wicke): Remove this once Model and associated code are gone.
if hasattr(self._config, 'execution_mode'):
if self._config.execution_mode not in ('all', 'train'):
return
if not self._model_dir:
raise ValueError('Estimator\'s model_dir should be non-empty.')
# Stagger startup of worker sessions based on task id.
sleep_secs = min(self._config.training_worker_max_startup_secs,
self._config.task *
self._config.training_worker_session_startup_stagger_secs)
if sleep_secs:
logging.info('Waiting %d secs before starting task %d.', sleep_secs,
self._config.task)
time.sleep(sleep_secs)
# Stagger startup of worker sessions based on task id.
sleep_secs = min(
self._config.training_worker_max_startup_secs,
self._config.task *
self._config.training_worker_session_startup_stagger_secs)
if sleep_secs:
logging.info('Waiting %d secs before starting task %d.', sleep_secs,
self._config.task)
time.sleep(sleep_secs)
# Device allocation
device_fn = device_fn or self._device_fn
@ -454,7 +452,7 @@ class BaseEstimator(sklearn.BaseEstimator):
monitors += monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),
save_summary_steps=100,
save_summary_steps=self._config.save_summary_steps,
summary_writer=graph_actions.get_summary_writer(self._model_dir))
is_chief = self._config.task == 0
@ -478,8 +476,9 @@ class BaseEstimator(sklearn.BaseEstimator):
log_every_steps=log_every_steps,
supervisor_is_chief=is_chief,
supervisor_master=self._config.master,
supervisor_save_model_secs=self._config.save_checkpoints_secs,
feed_fn=feed_fn,
max_steps=steps,
steps=steps,
fail_on_nan_loss=fail_on_nan_loss,
monitors=monitors)
@ -513,8 +512,9 @@ class BaseEstimator(sklearn.BaseEstimator):
feed_fn=None,
metrics=None,
name=''):
# TODO(wicke): This is a hack and needs to go.
if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'):
# TODO(wicke): Remove this once Model and associated code are gone.
if (hasattr(self._config, 'execution_mode') and
self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset')):
return
# Check that model has been trained.
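The staggered-startup block above computes each worker's sleep time as its task id times the stagger interval, capped by the maximum startup delay. Purely illustrative arithmetic, using the defaults this commit sets in `RunConfig` (5 s stagger, 60 s cap):

```python
stagger_secs = 5    # RunConfig.training_worker_session_startup_stagger_secs
max_startup = 60    # RunConfig.training_worker_max_startup_secs

for task in (0, 3, 20):
    sleep_secs = min(max_startup, task * stagger_secs)
    print(task, sleep_secs)   # -> 0 0, 3 15, 20 60
```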

View File

@ -46,10 +46,10 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -126,10 +126,10 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a KeyError:
if `weight_column_name` is not None:
key=weight_column_name, value=a `Tensor`

View File

@ -24,79 +24,54 @@ from tensorflow.python import GPUOptions
class RunConfig(object):
"""This class specifies the specific configurations for the run.
"""This class specifies the specific configurations for the run."""
Parameters:
execution_mode: Runners use this flag to execute different tasks, like
training vs evaluation. 'all' (the default) executes both training and
eval.
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
training_worker_session_startup_stagger_secs: Seconds to sleep between the
startup of each worker task session (default: 5).
training_worker_max_startup_secs: Max seconds to wait before starting any
worker (default: 60).
eval_delay_secs: Number of seconds between the beginning of each eval run.
If one run takes more than this amount of time, the next run will start
immediately once that run completes (default 60).
eval_steps: Number of steps to run in each eval (default: 100).
num_cores: Number of cores to be used (default: 4).
verbose: Controls the verbosity, possible values:
0: the algorithm and debug information is muted.
1: trainer prints the progress.
2: log device placement is printed.
gpu_memory_fraction: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
As new files are created, older files are deleted.
If None or 0, all checkpoint files are kept.
Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
Attributes:
tf_master: Tensorflow master.
tf_config: Tensorflow Session Config proto.
tf_random_seed: Tensorflow random seed.
keep_checkpoint_max: Maximum number of checkpoints to keep.
keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
"""
# TODO(wicke): Cull unused options.
# TODO(wicke): Move options out once functionality is covered by monitors
def __init__(self,
execution_mode='all',
master='',
task=0,
num_ps_replicas=0,
training_worker_session_startup_stagger_secs=5,
training_worker_max_startup_secs=60,
eval_delay_secs=60,
eval_steps=100,
num_cores=4,
verbose=1,
log_device_placement=False,
gpu_memory_fraction=1,
tf_random_seed=42,
save_summary_steps=100,
save_checkpoints_secs=60,
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=10000):
self.execution_mode = execution_mode
"""Constructor.
Args:
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
num_cores: Number of cores to be used (default: 4).
log_device_placement: Log the op placement to devices (default: False).
gpu_memory_fraction: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
save_summary_steps: Save summaries every this many steps.
save_checkpoints_secs: Save checkpoints every this many seconds.
keep_checkpoint_max: The maximum number of recent checkpoint files to
keep. As new files are created, older files are deleted. If None or 0,
all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
"""
self.master = master
self.task = task
self.num_ps_replicas = num_ps_replicas
self.training_worker_session_startup_stagger_secs = (
training_worker_session_startup_stagger_secs)
self.training_worker_max_startup_secs = training_worker_max_startup_secs
self.eval_delay_secs = eval_delay_secs
self.eval_steps = eval_steps
gpu_options = GPUOptions(
per_process_gpu_memory_fraction=gpu_memory_fraction)
self.tf_config = ConfigProto(log_device_placement=(verbose > 1),
self.tf_config = ConfigProto(log_device_placement=log_device_placement,
inter_op_parallelism_threads=num_cores,
intra_op_parallelism_threads=num_cores,
gpu_options=gpu_options)
self.tf_random_seed = tf_random_seed
self.save_summary_steps = save_summary_steps
self.save_checkpoints_secs = save_checkpoints_secs
self.keep_checkpoint_max = keep_checkpoint_max
self.keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
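A construction sketch for the reworked `RunConfig`, using only keyword arguments visible in the new constructor above (values are illustrative):

```python
config = RunConfig(
    master='',                   # empty string = run locally
    task=0,
    num_ps_replicas=0,
    num_cores=4,
    log_device_placement=False,  # replaces the old verbose > 1 behavior
    gpu_memory_fraction=0.5,
    tf_random_seed=42,
    save_summary_steps=200,      # now consumed by the default monitors
    save_checkpoints_secs=120,
    keep_checkpoint_max=5)
```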

View File

@ -0,0 +1,134 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Experiment class collecting information needed for a single training run."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from tensorflow.python.platform import tf_logging as logging
class Experiment(object):
"""Experiment is a class containing all information needed to train a model.
"""
def __init__(self, estimator,
train_input_fn,
eval_input_fn,
eval_metrics=None):
"""Constructor for Experiment.
Args:
estimator: `Estimator` object.
train_input_fn: function, returns features and targets for training.
eval_input_fn: function, returns features and targets for evaluation.
eval_metrics: `dict` of string, metric function. If `None`, default set
is used.
"""
super(Experiment, self).__init__()
self._estimator = estimator
self._train_input_fn = train_input_fn
self._eval_input_fn = eval_input_fn
self._eval_metrics = eval_metrics
def train(self, steps=None, monitors=None, delay_secs=0):
"""Fit the estimator using the training data.
Train the estimator for `steps` steps, after waiting for `delay_secs`
seconds. If `steps` is `None`, train forever.
Args:
steps: Perform this many steps of training. `None`, the default, means
train forever.
monitors: A list of monitors to pass to the `Estimator`'s `fit` function.
delay_secs: Start training after this many seconds.
Returns:
The trained estimator.
"""
if delay_secs:
logging.info("Waiting %d secs before starting training.", delay_secs)
time.sleep(delay_secs)
return self._estimator.fit(input_fn=self._train_input_fn,
steps=steps, monitors=monitors)
def evaluate(self, steps=None, delay_secs=0):
"""Evaluate on the evaluation data.
Runs evaluation on the evaluation data and returns the result. If `steps`
is given, only run for this many steps. Start the evaluation after
`delay_secs` seconds.
Args:
steps: Run this many steps of evaluation.
delay_secs: Start evaluating after waiting for this many seconds.
Returns:
The result of the `evaluate` call to the `Estimator`.
"""
if delay_secs:
logging.info("Waiting %d secs before starting eval.", delay_secs)
time.sleep(delay_secs)
return self._estimator.evaluate(input_fn=self._eval_input_fn,
steps=steps,
metrics=self._eval_metrics)
def _continuous_eval(self, input_fn, steps=1000, delay_secs=0,
throttle_delay_secs=60):
"""Run continuous eval on the eval data.
Run `steps` steps of evaluation on the evaluation data set. This function
starts evaluating after `delay_secs` seconds and then runs no more than one
evaluation per `throttle_delay_secs`. It never returns.
Args:
input_fn: The input to use for this eval.
steps: Number of steps per evaluation run.
delay_secs: Start evaluating after this many seconds.
throttle_delay_secs: Do not re-evaluate unless the last evaluation was
started at least this many seconds ago.
"""
if delay_secs:
logging.info("Waiting %f secs before starting eval.", delay_secs)
time.sleep(delay_secs)
while True:
start = time.time()
self._estimator.evaluate(input_fn=input_fn,
steps=steps,
metrics=self._eval_metrics)
duration = time.time() - start
if duration < throttle_delay_secs:
difference = throttle_delay_secs - duration
logging.info("Waiting %f secs before starting next eval run.",
difference)
time.sleep(difference)
def continuous_eval(self, steps=1000, delay_secs=0, throttle_delay_secs=60):
self._continuous_eval(self._eval_input_fn, steps=steps,
delay_secs=delay_secs,
throttle_delay_secs=throttle_delay_secs)
def continuous_eval_on_train_data(self, steps=1000, delay_secs=0,
throttle_delay_secs=60):
self._continuous_eval(self._train_input_fn, steps=steps,
delay_secs=delay_secs,
throttle_delay_secs=throttle_delay_secs)
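A usage sketch for the new `Experiment` class; `my_estimator`, `train_input_fn`, and `eval_input_fn` are assumed to exist already:

```python
experiment = Experiment(
    estimator=my_estimator,
    train_input_fn=train_input_fn,
    eval_input_fn=eval_input_fn,
    eval_metrics=None)               # None -> the estimator's default metrics

experiment.train(steps=1000, delay_secs=5)   # sleep 5 s, then fit
results = experiment.evaluate(steps=100)     # one evaluation pass
# experiment.continuous_eval(steps=100)      # never returns; >= 60 s per cycle
```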

View File

@ -30,8 +30,9 @@ from six import reraise
from tensorflow.contrib.framework.python.ops import ops as contrib_ops
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.contrib.layers.python.layers import summaries
from tensorflow.contrib.learn.python.learn import monitors as monitors_lib
from tensorflow.contrib.learn.python.learn.utils import checkpoints
from tensorflow.core.framework import summary_pb2
from tensorflow.python.client import session as tf_session
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
@ -131,7 +132,7 @@ def train(graph,
supervisor_save_model_secs=600,
supervisor_save_summaries_steps=100,
feed_fn=None,
max_steps=None,
steps=None,
fail_on_nan_loss=True,
monitors=None):
"""Train a model.
@ -173,7 +174,7 @@ def train(graph,
`supervisor_save_summaries_steps` seconds when training.
feed_fn: A function that is called every iteration to produce a `feed_dict`
passed to `session.run` calls. Optional.
max_steps: Train until `global_step_tensor` evaluates to this value.
steps: Trains for this many steps (e.g. current global step + `steps`).
fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
evaluates to `NaN`. If false, continue training as if nothing happened.
monitors: List of `BaseMonitor` subclass instances. Used for callbacks
@ -192,28 +193,36 @@ def train(graph,
if not output_dir:
raise ValueError('Output directory should be non-empty.')
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
if global_step_tensor is None:
raise ValueError('No "global_step" was provided or found in the graph.')
with graph.as_default():
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
if global_step_tensor is None:
raise ValueError('No "global_step" was provided or found in the graph.')
summary_writer = (get_summary_writer(output_dir)
if supervisor_is_chief else None)
# Get current step.
try:
start_step = checkpoints.load_variable(
output_dir, global_step_tensor.name)
except (errors.NotFoundError, ValueError):
start_step = 0
# TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
if not supervisor_is_chief:
# monitors should run only on the chief.
monitors = []
elif not monitors:
monitors = monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),
save_summary_steps=supervisor_save_summaries_steps,
summary_writer=summary_writer)
summary_writer = (get_summary_writer(output_dir)
if supervisor_is_chief else None)
# Start monitors, can create graph parts.
for monitor in monitors:
monitor.begin(max_steps=max_steps)
# TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
if not supervisor_is_chief:
# monitors should run only on the chief.
monitors = []
elif not monitors:
monitors = monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),
save_summary_steps=supervisor_save_summaries_steps,
summary_writer=summary_writer)
# Start monitors, can create graph parts.
for monitor in monitors:
monitor.begin(max_steps=start_step + steps)
supervisor = tf_supervisor.Supervisor(
graph,
@ -235,6 +244,7 @@ def train(graph,
get_current_step = lambda: session.run(global_step_tensor)
start_step = get_current_step()
max_steps = start_step + steps
last_step = start_step
last_log_step = start_step
loss_value = None
@ -375,6 +385,28 @@ def _start_queue_runners(session, coord):
return threads
def _eval_results_to_str(eval_results):
return ', '.join('%s = %s' % (k, v) for k, v in eval_results.items())
def _write_summary_results(output_dir, eval_results, current_global_step):
"""Writes eval results into summary file in given dir."""
logging.info('Saving evaluation summary for %d step: %s' % (
current_global_step, _eval_results_to_str(eval_results)))
summary_writer = get_summary_writer(output_dir)
summary = summary_pb2.Summary()
for key in eval_results:
if eval_results[key] is None:
continue
value = summary.value.add()
value.tag = key
if (isinstance(eval_results[key], np.float32) or
isinstance(eval_results[key], float)):
value.simple_value = float(eval_results[key])
summary_writer.add_summary(summary, current_global_step)
summary_writer.close()
# TODO(ptucker): Add unit test.
def evaluate(graph,
output_dir,
@ -424,32 +456,26 @@ def evaluate(graph,
eval steps were run.
global_step: The global step this evaluation corresponds to.
"""
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
with graph.as_default():
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
for key, value in eval_dict.items():
if not summaries.is_summary_tag_unique(key):
continue
if isinstance(value, ops.Tensor):
summaries.summarize_tensor(value, tag=key)
# Create or get summary op, global_step and saver.
saver = _get_saver()
local_init_op = _get_local_init_op()
ready_op = _get_ready_op()
# Create or get summary op, global_step and saver.
summary_op = logging_ops.get_summary_op()
saver = _get_saver()
local_init_op = _get_local_init_op()
ready_op = _get_ready_op()
session_manager = session_manager_lib.SessionManager(
local_init_op=local_init_op,
ready_op=ready_op)
session, initialized = session_manager.recover_session(
master=supervisor_master,
saver=saver,
checkpoint_dir=checkpoint_path)
session_manager = session_manager_lib.SessionManager(
local_init_op=local_init_op,
ready_op=ready_op)
session, initialized = session_manager.recover_session(
master=supervisor_master,
saver=saver,
checkpoint_dir=checkpoint_path)
# Start queue runners.
coord = coordinator.Coordinator()
threads = _start_queue_runners(session, coord)
# Start queue runners.
coord = coordinator.Coordinator()
threads = _start_queue_runners(session, coord)
with session:
if not initialized:
@ -488,8 +514,7 @@ def evaluate(graph,
duration = time.time() - start_time
logging.info('Results after %d steps (%.3f sec/batch): %s.',
step, float(duration),
', '.join('%s = %s' % (k, v)
for k, v in eval_results.items()))
_eval_results_to_str(eval_results))
finally:
if eval_results is None or step != eval_step:
eval_results = session.run(eval_dict, feed_dict=feed_dict)
@ -498,20 +523,6 @@ def evaluate(graph,
coord.request_stop()
coord.join(threads, stop_grace_period_secs=120)
# Make our own summary writer and write a summary to the eval dir.
# Only is feed_fn is not provided.
# TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
# then we can save for non feed_fn as well.
if summary_op is not None and feed_fn is None:
summary_writer = None
try:
summary_writer = get_summary_writer(output_dir)
summary_str = session.run(summary_op)
if summary_str:
summary_writer.add_summary(summary_str, current_global_step)
finally:
if summary_writer:
summary_writer.close()
# catch OutOfRangeError which is thrown when queue is out of data (and for
# other reasons as well).
except errors.OutOfRangeError as e:
@ -526,6 +537,9 @@ def evaluate(graph,
else:
logging.warn('Input iterator is exhausted: %s.', e)
# Save summaries for this evaluation.
_write_summary_results(output_dir, eval_results, current_global_step)
return eval_results, current_global_step
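The `max_steps` -> `steps` rename above turns the argument from an absolute stopping point into a relative one: training resumes from the checkpointed global step and advances `steps` further. Illustrative numbers only:

```python
# Suppose the checkpoint in output_dir holds global_step == 300.
start_step = 300                 # recovered via checkpoints.load_variable(...)
steps = 100                      # the new, relative argument
max_steps = start_step + steps   # the run now stops at global step 400
# Under the old contract, max_steps=100 would have stopped immediately.
```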

View File

@ -0,0 +1,75 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Experiment."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib.learn.python.learn.experiment import Experiment
from tensorflow.python.platform import flags
from tensorflow.python.platform import tf_logging as logging
FLAGS = flags.FLAGS
flags.DEFINE_string('schedule', '', 'Schedule to run for this experiment. '
'A schedule identifies a method on the Experiment '
'instance returned by the function passed to the '
'run() call')
flags.DEFINE_string('output_dir', '', 'Base output directory. Made '
'available to the experiment builder function passed '
'to run(). All files written by the Experiment are '
'expected to be written into this directory.')
def run(experiment_fn):
"""Make and run an experiment."""
if not FLAGS.output_dir:
raise RuntimeError('Must specify an output directory (use --output_dir).')
if not FLAGS.schedule:
raise RuntimeError('Must specify a schedule (use --schedule).')
if not callable(experiment_fn):
raise TypeError('Experiment builder "%s" is not callable.' %
experiment_fn)
# Call the builder
experiment = experiment_fn(output_dir=FLAGS.output_dir)
if not isinstance(experiment, Experiment):
raise TypeError('Experiment builder did not return an Experiment '
'instance, got %s instead.' % type(experiment))
# Execute the schedule
taskname = FLAGS.schedule
if not hasattr(experiment, taskname):
logging.error('Schedule references non-existent task %s', taskname)
valid_tasks = [x for x in experiment.__dict__
if callable(getattr(experiment, x))]
logging.error('Allowed values for this experiment are: %s', valid_tasks)
raise ValueError('Schedule references non-existent task %s' % taskname)
task = getattr(experiment, taskname)
if not callable(task):
logging.error('Schedule references non-callable member %s', taskname)
valid_tasks = [x for x in experiment.__dict__
if callable(getattr(experiment, x))]
logging.error('Allowed values for this experiment are: %s', valid_tasks)
raise TypeError('Schedule references non-callable member %s' % taskname)
return task()
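A wiring sketch for `run`; the builder below is hypothetical, and `--schedule` must name a callable attribute of the returned `Experiment` (for example `train`, `evaluate`, or `continuous_eval`):

```python
def build_experiment(output_dir):
    # run() invokes this as experiment_fn(output_dir=FLAGS.output_dir).
    return Experiment(estimator=make_estimator(output_dir),  # placeholder
                      train_input_fn=train_input_fn,
                      eval_input_fn=eval_input_fn)

# e.g. python trainer.py --output_dir=/tmp/run --schedule=train
learn_runner.run(build_experiment)
```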

View File

@ -19,7 +19,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import six
from tensorflow.python.framework import ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import saver
from tensorflow.python.training import summary_io
@ -135,17 +140,27 @@ class PrintTensor(EveryN):
"""
def __init__(self, tensor_names, every_n=100, first_n=1):
"""Initializes PrintTensor monitor.
Args:
tensor_names: `dict` of tag to tensor names or
`iterable` of tensor names (strings).
every_n: Print every N steps.
first_n: Print first N steps.
"""
super(PrintTensor, self).__init__(every_n, first_n)
if not isinstance(tensor_names, dict):
tensor_names = {item: item for item in tensor_names}
self._tensor_names = tensor_names
def every_n_step_begin(self, unused_step):
return self._tensor_names
return list(self._tensor_names.values())
def every_n_step_end(self, step, outputs):
stats = []
for name in self._tensor_names:
if name in outputs:
stats.append("%s = %s" % (name, str(outputs[name])))
for tag, tensor_name in six.iteritems(self._tensor_names):
if tensor_name in outputs:
stats.append("%s = %s" % (tag, str(outputs[tensor_name])))
logging.info("Step %d: %s" % (step, ", ".join(stats)))
@ -179,14 +194,45 @@ class SummarySaver(EveryN):
class ValidationMonitor(EveryN):
"""Runs evaluation every n steps.
Can do early stopping on validation loss if `early_stopping_rounds` provided.
"""Runs evaluation of the Estimator every n steps.
Can do early stopping on validation metrics if
`early_stopping_rounds` is provided.
"""
def __init__(self, x=None, y=None, input_fn=None,
every_n_steps=100, early_stopping_rounds=None):
def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
every_n_steps=100, metrics=None, early_stopping_rounds=None,
early_stopping_metric="loss",
early_stopping_metric_minimize=True, name=None):
"""Initializes ValidationMonitor.
Args:
x: matrix or tensor of shape [n_samples, n_features...]. Can be
iterator that returns arrays of features. The training input
samples for fitting the model. If set, `input_fn` must be `None`.
y: vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
iterator that returns array of targets. The training target values
(class labels in classification, real numbers in regression). If set,
`input_fn` must be `None`.
input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
`None`.
batch_size: minibatch size to use on the input, defaults to first
dimension of `x`. Must be `None` if `input_fn` is provided.
every_n_steps: Runs this monitor every N steps.
metrics: Dict of metric ops to run. If None, the default metric functions
are used; if {}, no metrics are used.
early_stopping_rounds: If validation metric didn't go down for this many
steps, then stop training.
early_stopping_metric: `str`, name of the metric to early stop.
early_stopping_metric_minimize: `bool`, True if minimize, False
if maximize. For example, minimize `loss` or `mean_squared_error` and
maximize `accuracy` or `f1`.
name: `str`, appended to output sub-folder. If None uses `eval`
sub-folder, else `eval-%name%` is used to save summaries.
Raises:
ValueError: If both x and input_fn are provided.
"""
super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps,
first_n_steps=-1)
if x is None and input_fn is None:
@ -194,25 +240,64 @@ class ValidationMonitor(EveryN):
self.x = x
self.y = y
self.input_fn = input_fn
self.min_loss_step = 0
self.min_loss = None
self.batch_size = batch_size
self.metrics = metrics
self.early_stopping_rounds = early_stopping_rounds
self.early_stopping_metric = early_stopping_metric
self.early_stopping_metric_minimize = early_stopping_metric_minimize
self.name = name
self._best_value_step = None
self._best_value = None
self._early_stopped = False
self._latest_path = None
self._latest_path_step = None
@property
def early_stopped(self):
return self._early_stopped
@property
def best_step(self):
return self._best_value_step
@property
def best_value(self):
return self._best_value
def every_n_step_end(self, step, unused_outputs):
# Check that we are not running evaluation on the same checkpoint.
latest_path = saver.latest_checkpoint(self._estimator.model_dir)
if latest_path == self._latest_path:
logging.info("Skipping evaluation due to same checkpoint %s for step %d "
"as for step %d.", latest_path, step, self._latest_path_step)
return False
self._latest_path = latest_path
self._latest_path_step = step
# Run evaluation and log it.
outputs = self._estimator.evaluate(
x=self.x, y=self.y, input_fn=self.input_fn)
x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size,
metrics=self.metrics, name=self.name)
stats = []
for name in outputs:
stats.append("%s = %s" % (name, str(outputs[name])))
logging.info("Validation (step %d): %s" % (step, ", ".join(stats)))
# Early stopping logic.
if self.early_stopping_rounds is not None:
if self.min_loss is None or outputs["loss"] < self.min_loss:
self.min_loss = outputs["loss"]
self.min_loss_step = step
stop_now = (step - self.min_loss_step >= self.early_stopping_rounds)
if (self._best_value is None or
(self.early_stopping_metric_minimize and
outputs[self.early_stopping_metric] < self._best_value) or
(not self.early_stopping_metric_minimize and
outputs[self.early_stopping_metric] > self._best_value)):
self._best_value = outputs[self.early_stopping_metric]
self._best_value_step = step
stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
if stop_now:
logging.info("Stopping. Best step: {} with loss {}."
.format(self.min_loss_step, self.min_loss))
logging.info("Stopping. Best step: {} with {} = {}."
.format(self._best_value_step,
self.early_stopping_metric, self._best_value))
self._early_stopped = True
return True
return False
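A usage sketch matching the expanded signature above, mirroring the early-stopping test later in this commit; the data arrays and `estimator` are placeholders:

```python
val_monitor = ValidationMonitor(
    x=x_val, y=y_val,
    every_n_steps=50,
    early_stopping_rounds=100,
    early_stopping_metric='accuracy',
    early_stopping_metric_minimize=False)   # maximize accuracy

estimator.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
if val_monitor.early_stopped:
    print('best step:', val_monitor.best_step,
          'best accuracy:', val_monitor.best_value)
```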
@ -220,7 +305,7 @@ class ValidationMonitor(EveryN):
class CaptureVariable(EveryN):
"""Capture a variable value into a `list`.
It's useful for unit testing.
This monitor is useful for unit testing.
"""
def __init__(self, var_name, every_n=100, first_n=1):
@ -239,9 +324,85 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
output_dir=None, summary_writer=None):
monitors = []
if loss_op is not None:
monitors.append(PrintTensor([loss_op.name]))
monitors.append(PrintTensor(tensor_names={"loss": loss_op.name}))
if summary_op is not None:
monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
output_dir=output_dir,
summary_writer=summary_writer))
return monitors
class GraphDump(BaseMonitor):
"""Dumps almost all tensors in the graph at every step.
Note, this is very expensive, prefer `PrintTensor` or `CaptureVariable` if
you are not debugging.
"""
IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder",
"RandomUniform", "Cast", "RestoreSlice"]
def __init__(self, ignore_ops=None):
"""Initializes GraphDump monitor.
Args:
ignore_ops: `list` of string names of `Operation`s to ignore.
If `None` GraphDump.IGNORE_OPS list is used.
"""
self.ignore_ops = ignore_ops or GraphDump.IGNORE_OPS
self._data = []
def begin(self, max_steps):
self.tensors = []
graph = ops.get_default_graph()
graph_def = graph.as_graph_def()
for node in graph_def.node:
if node.op in self.ignore_ops:
continue
try:
self.tensors.append(graph.get_tensor_by_name(node.name + ":0"))
except KeyError:
pass
def step_begin(self, step):
return self.tensors
def step_end(self, step, outputs):
self._data.append(outputs)
@property
def data(self):
return self._data
def compare(self, other_dump, step, atol=1e-06):
"""Compares two `GraphDump` monitors and returns differences.
Args:
other_dump: Another `GraphDump` monitor.
step: `int`, step to compare on.
atol: `float`, absolute tolerance in comparison of floating arrays.
Returns:
Returns tuple:
matched: `list` of keys that matched.
non_matched: `dict` of keys to difference.
"""
non_matched = {}
matched = []
for key in self.data[step]:
if not isinstance(key, str) and not isinstance(key, unicode):
continue
value1, value2 = self.data[step][key], other_dump.data[step][key]
if isinstance(value1, str):
continue
if isinstance(value1, np.ndarray):
if not np.allclose(value1, value2, atol=atol):
non_matched[key] = value1 - value2
else:
matched.append(key)
else:
if value1 != value2:
non_matched[key] = (value1, value2)
else:
matched.append(key)
return matched, non_matched
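A debugging sketch for the new `GraphDump` monitor, following the (commented-out) pattern in the stability test added by this commit; the two regressors and the data are placeholders:

```python
graph_dump1, graph_dump2 = GraphDump(), GraphDump()
regressor1.fit(x=x, y=y, steps=1, monitors=[graph_dump1], batch_size=1)
regressor2.fit(x=x, y=y, steps=1, monitors=[graph_dump2], batch_size=1)

matched, non_matched = graph_dump1.compare(graph_dump2, 0, atol=1e-02)
print(sorted(non_matched))   # tensor names whose recorded values diverged
```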

View File

@ -28,6 +28,12 @@ from tensorflow.contrib.learn.python.learn.estimators._sklearn import accuracy_s
from tensorflow.contrib.learn.python.learn.estimators._sklearn import train_test_split
def _get_summary_events(folder):
if not tf.gfile.Exists(folder):
raise ValueError('Folder %s doesn\'t exist.' % folder)
return tf.contrib.testing.latest_summaries(folder)
class EarlyStoppingTest(tf.test.TestCase):
"""Early stopping tests."""
@ -35,36 +41,54 @@ class EarlyStoppingTest(tf.test.TestCase):
random.seed(42)
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data,
iris.target,
test_size=0.2,
random_state=42)
x_train, x_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train,
y_train,
test_size=0.2)
val_monitor = learn.monitors.ValidationMonitor(x_val,
y_val,
early_stopping_rounds=100)
x_train, x_val, y_train, y_val = train_test_split(
x_train, y_train, test_size=0.2, random_state=42)
val_monitor = learn.monitors.ValidationMonitor(
x_val, y_val, every_n_steps=50, early_stopping_rounds=100,
early_stopping_metric='accuracy', early_stopping_metric_minimize=False)
# classifier without early stopping - overfitting
classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
steps=1000)
classifier1 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, steps=1000)
classifier1.fit(x_train, y_train)
accuracy_score(y_test, classifier1.predict(x_test))
_ = accuracy_score(y_test, classifier1.predict(x_test))
# Full 1000 steps, 11 summaries and no evaluation summary.
# 11 summaries = first + every 100 out of 1000 steps.
self.assertEqual(11, len(_get_summary_events(classifier1.model_dir)))
with self.assertRaises(ValueError):
_get_summary_events(classifier1.model_dir + '/eval')
# classifier with early stopping - improved accuracy on testing set
classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
steps=1000)
classifier2 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, steps=2000,
config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
classifier2.fit(x_train, y_train, monitors=[val_monitor])
accuracy_score(y_test, classifier2.predict(x_test))
_ = accuracy_score(y_val, classifier2.predict(x_val))
_ = accuracy_score(y_test, classifier2.predict(x_test))
# Note, this test is unstable, so not checking for equality.
# See stability_test for examples of stability issues.
if val_monitor.early_stopped:
self.assertLess(val_monitor.best_step, 2000)
# Note: because the validation monitor stops after the best score occurs,
# the accuracy at the current checkpoint is lower.
# TODO(ipolosukhin): Time machine for restoring old checkpoints?
# Flaky: best_value is still not always better than the score2 value.
# self.assertGreater(val_monitor.best_value, score2_val)
# Early stopped; unstable, so only checking < max.
self.assertLess(len(_get_summary_events(classifier2.model_dir)), 21)
self.assertLess(len(_get_summary_events(
classifier2.model_dir + '/eval')), 4)
# TODO(ipolosukhin): Restore this?
# self.assertGreater(score2, score1, "No improvement using early stopping.")
if __name__ == "__main__":
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,119 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for TaskRunner and Experiment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import tensorflow as tf
class TestEstimator(object):
def __init__(self):
self.eval_count = 0
self.fit_count = 0
def evaluate(self, **kwargs):
tf.logging.info('evaluate called with args: %s' % kwargs)
self.eval_count += 1
if self.eval_count > 5:
tf.logging.info('Ran 6 evals. Done.')
raise StopIteration()
return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
def fit(self, **kwargs):
tf.logging.info('fit called with args: %s' % kwargs)
self.fit_count += 1
return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
class ExperimentTest(tf.test.TestCase):
def test_train(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
ex.train(delay_secs=0)
self.assertEquals(1, est.fit_count)
self.assertEquals(0, est.eval_count)
def test_train_delay(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input')
for delay in [0, 1, 3]:
start = time.time()
ex.train(delay_secs=delay)
duration = time.time() - start
tf.logging.info('train duration (expected %f): %f', delay, duration)
self.assertTrue(duration > delay - 0.5 and duration < delay + 0.5)
def test_evaluate(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
ex.evaluate(steps='steps', delay_secs=0)
self.assertEquals(1, est.eval_count)
self.assertEquals(0, est.fit_count)
def test_evaluate_delay(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input')
for delay in [0, 1, 3]:
start = time.time()
ex.evaluate(delay_secs=delay)
duration = time.time() - start
tf.logging.info('eval duration (expected %f): %f', delay, duration)
self.assertTrue(duration > delay - 0.5 and duration < delay + 0.5)
def test_continuous_eval(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
self.assertRaises(StopIteration, ex.continuous_eval,
delay_secs=0, throttle_delay_secs=0)
self.assertEquals(6, est.eval_count)
self.assertEquals(0, est.fit_count)
def test_continuous_eval_throttle_delay(self):
for delay in [0, 1, 2]:
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
start = time.time()
self.assertRaises(StopIteration, ex.continuous_eval,
delay_secs=0, throttle_delay_secs=delay)
duration = time.time() - start
expected = 5 * delay
tf.logging.info('eval duration (expected %f): %f', expected, duration)
self.assertTrue(duration > expected - 0.5 and duration < expected + 0.5)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,107 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""learn_main tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_runner
FLAGS = learn_runner.FLAGS
class TestExperiment(tf.contrib.learn.Experiment):
def __init__(self, default=None):
self.default = default
def simple_task(self):
return "simple_task, default=%s." % self.default
# pylint: disable=unused-argument
def build_experiment(output_dir):
tf.logging.info("In default build_experiment.")
return TestExperiment()
def build_non_experiment(output_dir):
return "Ceci n'est pas un Experiment."
# pylint: enable=unused-argument
class MainTest(tf.test.TestCase):
def setUp(self):
# Make sure the flags exist. It's unclear why this is necessary.
if not hasattr(FLAGS, "output_dir"):
learn_runner.flags.DEFINE_string("output_dir", "/tmp", "Fake")
if not hasattr(FLAGS, "schedule"):
learn_runner.flags.DEFINE_string("schedule", "simple_task", "Fake")
def test_run(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "simple_task"
self.assertEqual("simple_task, default=None.",
learn_runner.run(build_experiment))
def test_fail_no_output_dir(self):
FLAGS.output_dir = ""
FLAGS.schedule = "simple_test"
self.assertRaisesRegexp(RuntimeError,
"Must specify an output directory",
learn_runner.run, build_experiment)
def test_fail_no_schedule(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = ""
self.assertRaisesRegexp(RuntimeError, "Must specify a schedule",
learn_runner.run, build_experiment)
def test_fail_non_callable(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "simple_test"
self.assertRaisesRegexp(TypeError,
"Experiment builder .* is not callable",
learn_runner.run, "not callable")
def test_fail_not_experiment(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "simple_test"
self.assertRaisesRegexp(
TypeError, "Experiment builder did not return an Experiment",
learn_runner.run, build_non_experiment)
def test_fail_non_existent_task(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "mirage"
self.assertRaisesRegexp(
ValueError, "Schedule references non-existent task",
learn_runner.run, build_experiment)
def test_fail_non_callable_task(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "default"
self.assertRaisesRegexp(
TypeError, "Schedule references non-callable member",
learn_runner.run, build_experiment)
if __name__ == "__main__":
tf.test.main()

View File

@ -0,0 +1,88 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Non-linear estimator tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# import random
import tensorflow as tf
class StabilityTest(tf.test.TestCase):
"""Tests that estiamtors are reproducible."""
def testRandomStability(self):
my_seed, minval, maxval = 42, -0.3333, 0.3333
with tf.Graph().as_default() as g:
with self.test_session(graph=g) as session:
tf.set_random_seed(my_seed)
x = tf.random_uniform([10, 10], minval=minval, maxval=maxval)
val1 = session.run(x)
with tf.Graph().as_default() as g:
with self.test_session(graph=g) as session:
tf.set_random_seed(my_seed)
x = tf.random_uniform([10, 10], minval=minval, maxval=maxval)
val2 = session.run(x)
self.assertAllClose(val1, val2)
def testLinearRegression(self):
# TODO(ipolosukhin): This doesn't pass at all, but should...
pass
# random.seed(42)
# boston = tf.contrib.learn.datasets.load_boston()
# regressor = tf.contrib.learn.LinearRegressor()
# regressor.fit(x=boston.data, y=boston.target, steps=1)
# regressor2 = tf.contrib.learn.LinearRegressor()
# regressor2.fit(x=boston.data, y=boston.target, steps=1)
# self.assertAllClose(regressor.weights_, regressor2.weights_)
# self.assertAllClose(regressor.bias_, regressor2.bias_)
# self.assertAllClose(regressor.predict(boston.data),
# regressor2.predict(boston.data), atol=1e-05)
def testDNNRegression(self):
# TODO(ipolosukhin): This doesn't pass at all, but should...
# Either bugs or just general instability.
pass
# random.seed(42)
# boston = tf.contrib.learn.datasets.load_boston()
# regressor = tf.contrib.learn.DNNRegressor(
# hidden_units=[10],
# optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))
# graph_dump = tf.contrib.learn.monitors.GraphDump()
# regressor.fit(x=boston.data, y=boston.target, steps=1,
# monitors=[graph_dump], batch_size=1)
# regressor2 = tf.contrib.learn.DNNRegressor(
# hidden_units=[10],
# optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))
# graph_dump2 = tf.contrib.learn.monitors.GraphDump()
# regressor2.fit(x=boston.data, y=boston.target, steps=1,
# monitors=[graph_dump2], batch_size=1)
# _, non_match = graph_dump.compare(graph_dump2, 0, atol=1e-02)
# self.assertEmpty(non_match.keys())
# for weight1, weight2 in zip(regressor.weights_, regressor2.weights_):
# self.assertAllClose(weight1, weight2)
# for bias1, bias2 in zip(regressor.bias_, regressor2.bias_):
# self.assertAllClose(bias1, bias2)
# self.assertAllClose(regressor.predict(boston.data),
# regressor2.predict(boston.data), atol=1e-05)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,229 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <atomic>
#include <set>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/graph/quantize_training.h"
#include "tensorflow/core/common_runtime/executor.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/memory_types.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/subgraph.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
namespace {
// Node types to rewrite. Insert quantize_and_dequantize op for their inputs.
const std::unordered_set<string, StringPiece::Hasher> nodes_to_rewrite{
"MatMul", "Conv2D"};
// Contains necessary parameters to convert an edge.
struct EdgeToConvert {
// Edge is not owned here.
const Edge* edge;
int32 num_bits;
bool signed_input;
bool range_given;
float input_min;
float input_max;
EdgeToConvert(const Edge* e, int32 bits, bool sign, bool range, float min,
float max) {
edge = e;
num_bits = bits;
signed_input = sign;
range_given = range;
input_min = min;
input_max = max;
}
};
// Decide if a node is in the backward pass by checking whether its name
// starts with "gradients".
// TODO(jmchen): Make this check more robust as it is not guaranteed that the
// forward node will not be named with a leading "gradients".
inline bool IsGradientNode(const Graph* graph, const Node* node) {
static const string tag = "gradients";
return (node->name().compare(0, tag.size(), tag) == 0);
}
// Find the type of the input to set the parameters for the
// quantize_and_dequantize op.
// Returns true if the root tensor op type is known, false otherwise.
bool FindType(const Graph* graph, const Node* node, bool* signed_input,
bool* range_given, float* input_min, float* input_max) {
const string src_op = node->type_string();
if (src_op == "Const" || src_op == "Variable") {
*signed_input = true;
*range_given = false;
} else if (src_op == "Relu") {
// Range is not given for Relu.
*signed_input = false;
*range_given = false;
} else if (src_op == "Relu6") {
*signed_input = false;
*range_given = true;
*input_min = 0;
*input_max = 6;
} else if (src_op == "Sigmoid") {
*signed_input = false;
*range_given = true;
*input_min = 0;
*input_max = 1;
} else if (src_op == "Tanh") {
*signed_input = true;
*range_given = true;
*input_min = -1;
*input_max = 1;
} else if (src_op == "Reshape") {
// Reshape has 2 inputs and the first one is the tensor.
for (const Edge* edge : node->in_edges()) {
if (edge->src_output() != Graph::kControlSlot && edge->dst_input() == 0) {
FindType(graph, edge->src(), signed_input, range_given, input_min,
input_max);
}
}
} else if (src_op == "Identity" || src_op == "MaxPool" ||
src_op == "AvgPool" || src_op == "MaxPool3D" ||
src_op == "AvgPool3D") {
// All these Ops only have 1 data input.
for (const Edge* edge : node->in_edges()) {
if (edge->src_output() != Graph::kControlSlot) {
FindType(graph, edge->src(), signed_input, range_given, input_min,
input_max);
}
}
} else {
// Unknown type, could be the model input examples.
// TODO: Set the params for input with user's hint.
*signed_input = true;
*range_given = false;
return false;
}
return true;
}
// Insert conversion op, connect it to the graph and remove the old edge.
Status ProcessTargetEdges(Graph* graph,
const std::vector<EdgeToConvert>& target_edges) {
// Remember previous convert ops to avoid duplicated conversion on the same
// input.
std::unordered_map<string, Node*, StringPiece::Hasher> name_index;
for (const EdgeToConvert edge : target_edges) {
Node* convert_node;
string name =
strings::StrCat(edge.edge->src()->name(), "/_QuantizeAndDequantize");
auto iter = name_index.find(name);
if (iter == name_index.end()) {
TF_RETURN_IF_ERROR(NodeBuilder(name, "_QuantizeAndDequantize")
.Input(edge.edge->src())
.Attr("signed_input", edge.signed_input)
.Attr("num_bits", edge.num_bits)
.Attr("range_given", edge.range_given)
.Attr("input_min", edge.input_min)
.Attr("input_max", edge.input_max)
.Finalize(graph, &convert_node));
name_index[name] = convert_node;
} else {
convert_node = iter->second;
}
graph->AddEdge(convert_node, 0, edge.edge->dst(), edge.edge->dst_input());
graph->RemoveEdge(edge.edge);
}
return Status::OK();
}
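// Note: keying name_index by "<src-name>/_QuantizeAndDequantize" is what
// lets two consumers of the same tensor share one convert node, as in the
// shared Q for m1 and m2 in the test below.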
} // namespace
Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
if (graph == nullptr) {
return errors::InvalidArgument("Cannot accept a null graph pointer.");
}
if (num_bits < 1 || num_bits > 63) {
return errors::OutOfRange("num_bits should be in range [1, 63] but is: ",
num_bits);
}
int potential_input = 0;
std::vector<EdgeToConvert> target_edges;
for (Node* node : graph->nodes()) {
if (nodes_to_rewrite.find(node->type_string()) != nodes_to_rewrite.end() &&
!IsGradientNode(graph, node)) {
// Find out which types the inputs are and convert them accordingly.
// 1. Const/Variable OP: quantized as a signed tensor with no given range.
// 2. Activation OP: set the range according to the activation type.
// Currently we handle {Relu, Relu6, Sigmoid, Tanh}.
// 3. Identity OP: the quantization parameters depend on its input.
// 4. Pooling OPs: various pooling ops; these also depend on their input.
// 5. Reshape OP: also depends on the first input to this op.
// 6. Not-listed-above OP: if there is only one such op, treat it as the
// model input. However, if there is more than one unknown op, return an
// error for now to avoid unexpected behavior.
// Note: the list above may be incomplete. Please let us know if you hit
// the error so we can handle your case.
for (const Edge* edge : node->in_edges()) {
if (edge->src_output() == Graph::kControlSlot) {
// Skip the control dependency input.
continue;
} else {
bool signed_input = false;
bool range_given = false;
float input_min = 0;
float input_max = 0;
bool known_op = FindType(graph, edge->src(), &signed_input,
&range_given, &input_min, &input_max);
if (!known_op) {
// An unknown op is treated as a model input.
// Only one input is supported for now.
// TODO: Make this configurable if this remains the desired way to find
// inputs.
if (potential_input > 0) {
return errors::Unimplemented(
"Find a second unknown op: ", edge->src()->name(),
" with type: ", edge->src()->type_string(),
"; Unknown ops are considered as model input for now and "
"only 1 input is supported currently.");
}
potential_input++;
}
target_edges.emplace_back(edge, num_bits, signed_input, range_given,
input_min, input_max);
}
}
}
}
TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, target_edges));
return Status::OK();
}
} // namespace tensorflow

View File

@ -0,0 +1,37 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
#define TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
#include "tensorflow/core/graph/graph.h"
namespace tensorflow {
// Rewrites the graph for quantized training.
// Rewrites the forward pass to include the precision loss of quantization so
// that the model can learn to deal with this loss and achieve better accuracy
// when it is later quantized for inference.
// Note that num_bits must be in [1, 63] and 'g' must not be null.
//
// On success, returns OK.
//
// On failure, returns the error status. Possible errors include:
// - num_bits out of range.
// - g is null.
// - More than one unknown op encountered.
Status DoQuantizeTraining(int32 num_bits, Graph* g);
} // namespace tensorflow
#endif // TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
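A minimal caller sketch for the API above (illustrative only; it assumes `graph` was already constructed, e.g. via ConvertGraphDefToGraph):

// Hypothetical helper, not part of this commit.
Status RewriteForQuantizedTraining(Graph* graph) {
  const int32 num_bits = 8;  // must be in [1, 63]
  return DoQuantizeTraining(num_bits, graph);
}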

View File

@ -0,0 +1,161 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/graph/quantize_training.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
namespace {
class QuantizeTrainingTest : public ::testing::Test {
protected:
QuantizeTrainingTest() { Reset(); }
void Reset() { g_.reset(new Graph(OpRegistry::Global())); }
template <typename T>
Node* Constant(gtl::ArraySlice<T> values, TensorShape shape) {
return test::graph::Constant(g_.get(), test::AsTensor(values, shape));
}
std::unique_ptr<Graph> g_;
};
TEST_F(QuantizeTrainingTest, NormalGraph) {
// Construct the following graph
/*
m1 m2
/ \ / \
Relu Identity c
| |
a b
*/
Reset();
Graph* g = g_.get();
Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
g->AddControlEdge(g->source_node(), a);
g->AddControlEdge(g->source_node(), b);
g->AddControlEdge(g->source_node(), c);
Node* relu = test::graph::Relu(g, a);
Node* identity = test::graph::Identity(g, b);
Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
Node* m2 = test::graph::Matmul(g, identity, c, false, false);
g->AddControlEdge(m1, g->sink_node());
g->AddControlEdge(m2, g->sink_node());
// The graph after the rewriting should be:
// "Q" is the quantize_and_dequantize op.
// Note the Q in the middle is shared by both m1 and m2.
/*
m1 m2
/ \ / \
Q Q Q
| | |
Relu Identity c
| |
a b
*/
int num_bits = 8;
// 4 edges to modify
TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
// There should be 12 nodes in total including the source and sink nodes.
EXPECT_EQ(12, g->num_nodes());
// Nodes m1 and m2's inputs should be the quantize_and_dequantize op.
std::vector<Node*> target_nodes{m1, m2};
for (Node* n : target_nodes) {
for (Node* in : n->in_nodes()) {
EXPECT_EQ("_QuantizeAndDequantize", in->type_string());
}
}
// relu, identity, c should now connect to the quantize_and_dequantize nodes.
std::vector<Node*> target_inputs{relu, identity, c};
for (Node* n : target_inputs) {
for (Node* out : n->out_nodes()) {
EXPECT_EQ("_QuantizeAndDequantize", out->type_string());
}
}
// Quantize_and_dequantize node for identity should have signed_input==true.
NodeDef identity_Q = identity->out_nodes().begin()->def();
ASSERT_EQ("true",
SummarizeAttrValue(identity_Q.attr().find("signed_input")->second));
// Quantize_and_dequantize node for relu should have signed_input==false.
NodeDef relu_Q = relu->out_nodes().begin()->def();
ASSERT_EQ("false",
SummarizeAttrValue(relu_Q.attr().find("signed_input")->second));
}
TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
// Construct the same graph plus another backward Matmul.
Reset();
Graph* g = g_.get();
Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
g->AddControlEdge(g->source_node(), a);
g->AddControlEdge(g->source_node(), b);
g->AddControlEdge(g->source_node(), c);
Node* relu = test::graph::Relu(g, a);
Node* identity = test::graph::Identity(g, b);
Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
Node* m2 = test::graph::Matmul(g, identity, c, false, false);
g->AddControlEdge(m1, g->sink_node());
g->AddControlEdge(m2, g->sink_node());
// Add a Matmul node with name starting with "gradients".
Node* backward_m;
TF_ASSERT_OK(NodeBuilder(g->NewName("gradients/n"), "MatMul")
.Input(m1)
.Input(m2)
.Attr("transpose_a", true)
.Attr("transpose_b", false)
.Finalize(g, &backward_m));
g->AddControlEdge(backward_m, g->sink_node());
int num_bits = 8;
// Still 4 changes, since the inputs of the backward node will not be converted.
TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
// Nodes m1 and m2's inputs should now be the quantize_and_dequantize op.
EXPECT_EQ(13, g->num_nodes());
EXPECT_EQ(2, m2->num_inputs());
}
} // namespace
} // namespace tensorflow

View File

@ -384,6 +384,15 @@ Node* GetSessionTensor(Graph* g, Node* in) {
return ret;
}
Node* Relu(Graph* g, Node* in) {
Node* ret;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Relu")
.Input(in, 0)
.Attr("T", DT_FLOAT)
.Finalize(g, &ret));
return ret;
}
void ToGraphDef(Graph* g, GraphDef* gdef) { g->ToGraphDef(gdef); }
} // end namespace graph

View File

@ -169,6 +169,9 @@ Node* GetSessionTensor(Graph* g, Node* in);
// given in "tensors".
Node* Concat(Graph* g, Node* concat_dim, gtl::ArraySlice<Node*> tensors);
// Add a Relu node in "g".
Node* Relu(Graph* g, Node* in);
} // end namespace graph
} // end namespace test
} // end namespace tensorflow

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -99,10 +100,12 @@ class AvgPoolingOp : public UnaryOp<T> {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("AvgPool")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
AvgPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
AvgPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
AvgPoolingOp<CPUDevice, Eigen::half>);
#if GOOGLE_CUDA
template <typename T>
@ -181,14 +184,17 @@ namespace functor {
const Eigen::PaddingType& padding); \
extern template struct SpatialAvgPooling<GPUDevice, T>;
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
REGISTER_KERNEL_BUILDER(Name("AvgPool")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T"),
AvgPoolingOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
AvgPoolingOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
AvgPoolingOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
// The operation to compute AvgPool gradients.
@ -300,7 +306,7 @@ class AvgPoolingGradOp : public OpKernel {
GetBroadcastSize(c, in_cols, window_cols, col_stride,
pad_cols, &cindex, &csize));
T divide_coeff = 1.0 / (rsize * csize);
T divide_coeff(1.0 / (rsize * csize));
int64 output_index =
(b * out_backprop_rows + r) * out_backprop_cols + c;
for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) {
@ -337,16 +343,16 @@ class AvgPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.HostMemory("orig_input_shape"), \
AvgPoolingGradOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
TF_CALL_half(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
@ -416,6 +422,12 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.HostMemory("orig_input_shape")
.Label("cudnn"),
AvgPoolingGradOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.HostMemory("orig_input_shape")
.Label("cudnn"),
AvgPoolingGradOp<GPUDevice, Eigen::half>);
// A custom GPU kernel based AvgPoolingGrad implementation. It includes the
// padding as the candidates for the pooling operation.
@ -532,6 +544,11 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.TypeConstraint<float>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOpCustomGPUKernel<float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOpCustomGPUKernel<Eigen::half>);
#endif // GOOGLE_CUDA
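The hunks above show the registration idiom this commit applies across many kernels: explicit per-type REGISTER_KERNEL_BUILDER calls are folded into a single macro that is instantiated through the TF_CALL_* helpers from register_types.h, so the platform's type filtering applies automatically. A minimal sketch of the idiom, using a hypothetical MyOp kernel (names are illustrative, not from this commit):

#define REGISTER_MY_OP_CPU(T)                                 \
  REGISTER_KERNEL_BUILDER(                                    \
      Name("MyOp").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      MyOpKernel<CPUDevice, T>);
// TF_CALL_float(m) expands to m(float) on platforms where float kernels
// are enabled, and to nothing otherwise (e.g. some mobile builds), so the
// registration list tracks the platform's type whitelist.
TF_CALL_float(REGISTER_MY_OP_CPU);
TF_CALL_half(REGISTER_MY_OP_CPU);
#undef REGISTER_MY_OP_CPU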

View File

@ -33,6 +33,7 @@ typedef Eigen::GpuDevice GPUDevice;
#define DEFINE_GPU_KERNELS(T) \
template struct functor::SpatialAvgPooling<GPUDevice, T>;
DEFINE_GPU_KERNELS(Eigen::half)
DEFINE_GPU_KERNELS(float)
#undef DEFINE_GPU_KERNELS
@ -57,7 +58,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
dtype gradient(0);
const dtype* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
@ -104,6 +105,12 @@ template bool RunAvePoolBackwardNHWC(
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
float* const bottom_diff, const GPUDevice& d);
template bool RunAvePoolBackwardNHWC(
const Eigen::half* const top_diff, const int num, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* const bottom_diff, const GPUDevice& d);
} // end namespace tensorflow

View File

@ -159,9 +159,9 @@ class BatchNormGradOp : public OpKernel {
.TypeConstraint<T>("T"), \
BatchNormOp<CPUDevice, T>);
REGISTER_KERNEL(Eigen::half);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
TF_CALL_half(REGISTER_KERNEL);
TF_CALL_float(REGISTER_KERNEL);
TF_CALL_double(REGISTER_KERNEL);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
@ -179,8 +179,8 @@ namespace functor {
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
DECLARE_GPU_SPECS(Eigen::half);
DECLARE_GPU_SPECS(float);
TF_CALL_half(DECLARE_GPU_SPECS);
TF_CALL_float(DECLARE_GPU_SPECS);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -191,8 +191,8 @@ DECLARE_GPU_SPECS(float);
.TypeConstraint<T>("T"), \
BatchNormOp<GPUDevice, T>);
REGISTER_GPU_KERNEL(Eigen::half);
REGISTER_GPU_KERNEL(float);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA
@ -203,9 +203,9 @@ REGISTER_GPU_KERNEL(float);
.TypeConstraint<T>("T"), \
BatchNormGradOp<CPUDevice, T>);
REGISTER_KERNEL(Eigen::half);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
TF_CALL_half(REGISTER_KERNEL);
TF_CALL_float(REGISTER_KERNEL);
TF_CALL_double(REGISTER_KERNEL);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
@ -226,8 +226,8 @@ namespace functor {
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
DECLARE_GPU_SPECS(Eigen::half);
DECLARE_GPU_SPECS(float);
TF_CALL_half(DECLARE_GPU_SPECS);
TF_CALL_float(DECLARE_GPU_SPECS);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -238,8 +238,8 @@ DECLARE_GPU_SPECS(float);
.TypeConstraint<T>("T"), \
BatchNormGradOp<GPUDevice, T>);
REGISTER_GPU_KERNEL(Eigen::half);
REGISTER_GPU_KERNEL(float);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA

View File

@ -20,6 +20,7 @@ limitations under the License.
#include <numeric>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
@ -182,18 +183,14 @@ class CheckNumericsOp<GPUDevice, T> : public OpKernel {
} // namespace
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
CheckNumericsOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
CheckNumericsOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
CheckNumericsOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
CheckNumericsOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_GPU)

View File

@ -22,6 +22,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -622,35 +623,24 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
};
REGISTER_KERNEL_BUILDER(
Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, Eigen::half>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv2DCustomBackpropInputOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \
.Device(DEVICE_CPU) \
.Label("custom") \
.TypeConstraint<T>("T"), \
Conv2DCustomBackpropInputOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \
.Device(DEVICE_CPU) \
.Label("eigen_tensor") \
.TypeConstraint<T>("T"), \
Conv2DFastBackpropInputOp<CPUDevice, T>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<float>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<float>("T"),
Conv2DFastBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<Eigen::half>("T"),
Conv2DFastBackpropInputOp<CPUDevice, Eigen::half>);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
template <typename Device, class T>
class Conv2DFastBackpropFilterOp : public OpKernel {
@ -867,35 +857,24 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
};
REGISTER_KERNEL_BUILDER(
Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, Eigen::half>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv2DCustomBackpropFilterOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") \
.Device(DEVICE_CPU) \
.Label("custom") \
.TypeConstraint<T>("T"), \
Conv2DCustomBackpropFilterOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") \
.Device(DEVICE_CPU) \
.Label("eigen_tensor") \
.TypeConstraint<T>("T"), \
Conv2DFastBackpropFilterOp<CPUDevice, T>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<float>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<float>("T"),
Conv2DFastBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<Eigen::half>("T"),
Conv2DFastBackpropFilterOp<CPUDevice, Eigen::half>);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
// GPU definitions of both ops.
#if GOOGLE_CUDA

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -194,14 +195,13 @@ class Conv3DBackpropInputOp : public OpKernel {
Padding padding_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv3DBackpropInputOp<CPUDevice, float>);
#ifndef IS_MOBILE_PLATFORM
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<double>("T"),
Conv3DBackpropInputOp<CPUDevice, double>);
#endif
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
// Backprop for filter.
template <typename Device, class T>
@ -303,14 +303,13 @@ class Conv3DBackpropFilterOp : public OpKernel {
Padding padding_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv3DBackpropFilterOp<CPUDevice, float>);
#ifndef IS_MOBILE_PLATFORM
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<double>("T"),
Conv3DBackpropFilterOp<CPUDevice, double>);
#endif
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
// GPU definitions of both ops.
#if GOOGLE_CUDA

View File

@ -23,6 +23,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -245,12 +246,13 @@ class Conv2DOp : public BinaryOp<T> {
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};
REGISTER_KERNEL_BUILDER(
Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv2DOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
Conv2DOp<CPUDevice, Eigen::half>);
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv2DOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
#if GOOGLE_CUDA
int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -120,15 +121,13 @@ class Conv3DOp : public BinaryOp<T> {
Padding padding_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv3DOp<CPUDevice, float>);
#ifndef IS_MOBILE_PLATFORM
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<double>("T"),
Conv3DOp<CPUDevice, double>);
#endif
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA

View File

@ -566,16 +566,14 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
DepthwiseConv2dNativeBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNativeBackpropInput")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
DepthwiseConv2dNativeBackpropInputOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
@ -951,17 +949,15 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNativeBackpropFilter")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNativeBackpropFilter")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("DepthwiseConv2dNativeBackpropFilter") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(

View File

@ -376,14 +376,13 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
};
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<float>("T"),
DepthwiseConv2dNativeOp<CPUDevice, float>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DepthwiseConv2dNativeOp<CPUDevice, T>);
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
DepthwiseConv2dNativeOp<CPUDevice, double>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(

View File

@ -143,13 +143,11 @@ class DrawBoundingBoxesOp : public OpKernel {
}
};
REGISTER_KERNEL_BUILDER(
Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint<float>("T"),
DrawBoundingBoxesOp<float>);
REGISTER_KERNEL_BUILDER(Name("DrawBoundingBoxes")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
DrawBoundingBoxesOp<Eigen::half>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DrawBoundingBoxesOp<T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
} // namespace tensorflow

View File

@ -309,7 +309,7 @@ struct AvgPoolMeanReducer {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) {
typedef typename packet_traits<T>::type Packet;
packetCount_ = pset1<Packet>(0.0);
packetCount_ = pset1<Packet>(T(0.0));
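// Wrapping the literal in T(...) keeps the call well-formed for every
// instantiation, including T = Eigen::half; presumably the motivation,
// given the half support added throughout this commit.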
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#if GOOGLE_CUDA
@ -202,17 +203,19 @@ struct MatMulFunctor<CPUDevice, T> {
.Label("cublas"), \
MatMulOp<GPUDevice, T, true /* cublas */>)
REGISTER_CPU(float);
REGISTER_CPU(double);
REGISTER_CPU(int32);
REGISTER_CPU(Eigen::half);
REGISTER_CPU(complex64);
REGISTER_CPU(complex128);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
TF_CALL_half(REGISTER_CPU);
TF_CALL_int32(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
TF_CALL_complex128(REGISTER_CPU);
#if GOOGLE_CUDA
REGISTER_GPU(float);
REGISTER_GPU(double);
TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU);
#if CUDA_VERSION >= 7050
REGISTER_GPU(Eigen::half);
TF_CALL_half(REGISTER_GPU);
#endif
#endif // GOOGLE_CUDA

View File

@ -160,7 +160,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
const int in_end = limit * in_size;
EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
in_end - in_start);
in_shard.setConstant(0);
in_shard.setConstant(T(0));
// Backpropagate.
const int out_size = out_height * out_width * depth;
@ -187,8 +187,12 @@ static void SpatialMaxPoolWithArgMaxHelper(
params.tensor_in_batch, shard_cost, shard);
}
REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU),
MaxPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingOp<CPUDevice, Eigen::half>);
#if GOOGLE_CUDA
// Forward declarations for the functor specializations for GPU.
@ -212,6 +216,7 @@ DECLARE_GPU_SPEC(float);
// kernel_label_map.
REGISTER_KERNEL_BUILDER(Name("MaxPool")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.Label("eigen_tensor"),
MaxPoolingOp<Eigen::GpuDevice, float>);
#endif // GOOGLE_CUDA
@ -297,11 +302,16 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU),
MaxPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingGradOp<CPUDevice, Eigen::half>);
#ifdef GOOGLE_CUDA
template <typename T>
static void MaxPoolingBackwardCustomKernel(
OpKernelContext* context, const std::vector<int32>& size,
const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
@ -318,12 +328,12 @@ static void MaxPoolingBackwardCustomKernel(
}
MaxPoolBackwardNoMask(
tensor_in->flat<float>().data(), params.tensor_in_batch,
tensor_in->flat<T>().data(), params.tensor_in_batch,
params.tensor_in_rows, params.tensor_in_cols, params.depth,
params.out_height, params.out_width, params.window_rows,
params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
params.pad_cols, out_backprop.flat<float>().data(),
output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>());
params.pad_cols, out_backprop.flat<T>().data(),
output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
}
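// Note: T is not deducible from the arguments (Tensor is type-erased), so
// callers must supply it explicitly, e.g. MaxPoolingBackwardCustomKernel<T>.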
template <class T>
@ -378,8 +388,8 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
} else {
CHECK(data_format_ == FORMAT_NHWC)
<< "Non-Cudnn MaxPoolGrad only supports NHWC format";
MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_,
&tensor_in, out_backprop, output_shape);
MaxPoolingBackwardCustomKernel<T>(context, ksize_, stride_, padding_,
&tensor_in, out_backprop, output_shape);
}
}
@ -391,8 +401,12 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool use_dnn_;
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU),
MaxPoolingGradOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPoolingGradOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingGradOp<Eigen::GpuDevice, Eigen::half>);
#endif // GOOGLE_CUDA
@ -625,8 +639,12 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU),
MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingNoMaskOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
@ -649,8 +667,14 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax"),
.TypeConstraint<int64>("Targmax")
.TypeConstraint<float>("T"),
MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax")
.TypeConstraint<Eigen::half>("T"),
MaxPoolingWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
@ -675,10 +699,18 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
#endif // GOOGLE_CUDA

View File

@ -110,7 +110,7 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype maxval = -FLT_MAX;
dtype maxval = Eigen::NumTraits<dtype>::lowest();
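// NumTraits<dtype>::lowest() generalizes the old -FLT_MAX initializer so
// the same kernel works for Eigen::half as well as float.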
int maxidx = -1;
const dtype* bottom_data_n = bottom_data + n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
@ -149,7 +149,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype maxval = -FLT_MAX;
dtype maxval = Eigen::NumTraits<dtype>::lowest();
int maxidx = -1;
const dtype* bottom_data_n = bottom_data + n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
@ -165,8 +165,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
// Atomically accumulate the bottom diff. The index could still be
// uninitialized if all the bottom_data are NaN.
if (maxidx != -1) {
atomicAdd(bottom_diff + n * height * width * channels + maxidx,
top_diff[index]);
CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
top_diff[index]);
}
}
}
@ -185,8 +185,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
// bottom_offset: the pre-computed per-image offset of the maxpool input.
// This is equal to H*W*C.
// bottom_diff: the gradient with respect to the input.
// This function relies on atomicAdd to avoid race conditions. Also, before the
// kernel is run, you will need to make sure that bottom_diff is filled with
// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
// the kernel is run, you will need to make sure that bottom_diff is filled with
// zero first.
template <typename dtype>
__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
@ -194,8 +194,8 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
const int bottom_offset, dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int image_id = (index / top_offset);
atomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
top_diff[index]);
CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
top_diff[index]);
}
}
@ -219,6 +219,23 @@ bool MaxPoolForwardWithOptionalArgmax(
return d.ok();
}
bool MaxPoolForwardWithOptionalArgmax(
const Eigen::half* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, bottom_data, height, width, channels, pooled_height,
pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
top_data, mask);
return d.ok();
}
bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
@ -243,6 +260,30 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
return d.ok();
}
bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const Eigen::half* top_diff, Eigen::half* bottom_diff,
const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int bottom_size = batch * channels * height * width;
const int top_size = batch * channels * pooled_height * pooled_width;
SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
top_size, bottom_data, height, width, channels, pooled_height,
pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
top_diff, bottom_diff);
return d.ok();
}
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const float* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
@ -256,12 +297,27 @@ bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
return d.ok();
}
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const Eigen::half* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
Eigen::half* bottom_diff,
const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
return d.ok();
}
typedef Eigen::GpuDevice GPUDevice;
#define DEFINE_GPU_KERNELS(T) \
template struct functor::SpatialMaxPooling<GPUDevice, T>;
DEFINE_GPU_KERNELS(float)
DEFINE_GPU_KERNELS(Eigen::half)
#undef DEFINE_GPU_KERNELS

View File

@ -37,11 +37,24 @@ bool MaxPoolForwardWithOptionalArgmax(
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
float* top_data, int64* mask, const Eigen::GpuDevice& d);
bool MaxPoolForwardWithOptionalArgmax(
const Eigen::half* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d);
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const float* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
float* bottom_diff, const Eigen::GpuDevice& d);
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const Eigen::half* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
Eigen::half* bottom_diff,
const Eigen::GpuDevice& d);
bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
@ -51,6 +64,15 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const float* top_diff, float* bottom_diff,
const Eigen::GpuDevice& d);
bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const Eigen::half* top_diff, Eigen::half* bottom_diff,
const Eigen::GpuDevice& d);
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_

View File

@ -104,10 +104,8 @@ class PackOp : public OpKernel {
PackOp<CPUDevice, type>)
TF_CALL_ALL_TYPES(REGISTER_PACK);
REGISTER_PACK(quint8);
REGISTER_PACK(qint8);
REGISTER_PACK(qint32);
REGISTER_PACK(bfloat16);
TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
TF_CALL_bfloat16(REGISTER_PACK);
#undef REGISTER_PACK

View File

@ -124,6 +124,7 @@ namespace functor {
extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -368,7 +369,9 @@ void DnnPoolingGradOp<T>::Compute(
}
}
template class DnnPoolingOp<Eigen::half>;
template class DnnPoolingOp<float>;
template class DnnPoolingGradOp<Eigen::half>;
template class DnnPoolingGradOp<float>;
#endif // GOOGLE_CUDA

View File

@ -311,7 +311,7 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output,
}
}
}
DCHECK_GT(out_count.minCoeff(), 0);
DCHECK_GT(out_count.minCoeff(), T(0));
out_mat.array().rowwise() /= out_count.transpose().array();
}

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
@ -107,15 +108,14 @@ struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
};
} // namespace functor
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
QuantizeAndDequantizeOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
QuantizeAndDequantizeOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
QuantizeAndDequantizeOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")

View File

@ -451,11 +451,11 @@ class MultinomialOp : public OpKernel {
.TypeConstraint<IntType>("Tout"), \
RandomUniformIntOp<CPUDevice, IntType>);
REGISTER(Eigen::half);
REGISTER(float);
REGISTER(double);
REGISTER_INT(int32);
REGISTER_INT(int64);
TF_CALL_half(REGISTER);
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
TF_CALL_int32(REGISTER_INT);
TF_CALL_int64(REGISTER_INT);
#undef REGISTER
#undef REGISTER_INT
@ -505,11 +505,11 @@ REGISTER_INT(int64);
.TypeConstraint<IntType>("Tout"), \
RandomUniformIntOp<GPUDevice, IntType>);
REGISTER(Eigen::half);
REGISTER(float);
REGISTER(double);
REGISTER_INT(int32);
REGISTER_INT(int64);
TF_CALL_half(REGISTER);
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
TF_CALL_int32(REGISTER_INT);
TF_CALL_int64(REGISTER_INT);
#undef REGISTER
#undef REGISTER_INT

View File

@ -25,8 +25,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
// NOTE: We should have mean(complex64, int32), too, but that requires
// changing Eigen::internal::MeanReducer to cast int to complex<float>.
// We don't see an immediate need for mean(complex64, int32) anyway.
REGISTER_CPU_KERNELS(complex64);
REGISTER_CPU_KERNELS(complex128);
TF_CALL_complex64(REGISTER_CPU_KERNELS);
TF_CALL_complex128(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#if GOOGLE_CUDA

View File

@ -159,15 +159,12 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
#undef REGISTER_KERNEL
REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
ResizeBilinearOpGrad<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
ResizeBilinearOpGrad<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint<double>("T"),
ResizeBilinearOpGrad<CPUDevice, double>);
#define REGISTER_CPU_GRAD_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
ResizeBilinearOpGrad<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_GRAD_KERNEL);
TF_CALL_float(REGISTER_CPU_GRAD_KERNEL);
TF_CALL_double(REGISTER_CPU_GRAD_KERNEL);
} // namespace tensorflow

View File

@ -97,13 +97,13 @@ class ReverseOp : public OpKernel {
.HostMemory("dims"), \
ReverseOp<CPUDevice, T>)
REGISTER_KERNEL(uint8);
REGISTER_KERNEL(int8);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(bool);
REGISTER_KERNEL(Eigen::half);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
TF_CALL_uint8(REGISTER_KERNEL);
TF_CALL_int8(REGISTER_KERNEL);
TF_CALL_int32(REGISTER_KERNEL);
TF_CALL_bool(REGISTER_KERNEL);
TF_CALL_half(REGISTER_KERNEL);
TF_CALL_float(REGISTER_KERNEL);
TF_CALL_double(REGISTER_KERNEL);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
@ -129,13 +129,13 @@ namespace functor {
DECLARE_GPU_SPEC_DIM(T, 7) \
DECLARE_GPU_SPEC_DIM(T, 8)
DECLARE_GPU_SPEC(uint8);
DECLARE_GPU_SPEC(int8);
DECLARE_GPU_SPEC(int32);
DECLARE_GPU_SPEC(bool);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_int32(DECLARE_GPU_SPEC);
TF_CALL_bool(DECLARE_GPU_SPEC);
TF_CALL_half(DECLARE_GPU_SPEC);
TF_CALL_float(DECLARE_GPU_SPEC);
TF_CALL_double(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_SPEC_DIM
} // namespace functor
@ -147,11 +147,11 @@ DECLARE_GPU_SPEC(double);
.TypeConstraint<T>("T") \
.HostMemory("dims"), \
ReverseOp<GPUDevice, T>)
REGISTER_GPU_KERNEL(uint8);
REGISTER_GPU_KERNEL(int8);
REGISTER_GPU_KERNEL(Eigen::half);
REGISTER_GPU_KERNEL(float);
REGISTER_GPU_KERNEL(double);
TF_CALL_uint8(REGISTER_GPU_KERNEL);
TF_CALL_int8(REGISTER_GPU_KERNEL);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA

View File

@ -118,21 +118,16 @@ class LinSpaceOp : public OpKernel {
}
};
REGISTER_KERNEL_BUILDER(Name("LinSpace")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.HostMemory("start")
.HostMemory("stop")
.HostMemory("num")
.HostMemory("output"),
LinSpaceOp<float>);
REGISTER_KERNEL_BUILDER(Name("LinSpace")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T")
.HostMemory("start")
.HostMemory("stop")
.HostMemory("num")
.HostMemory("output"),
LinSpaceOp<double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("LinSpace") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.HostMemory("start") \
.HostMemory("stop") \
.HostMemory("num") \
.HostMemory("output"), \
LinSpaceOp<T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
} // namespace tensorflow

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/kernels/softmax_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
@ -40,27 +41,22 @@ struct SoftmaxFunctor<CPUDevice, T> {
};
} // namespace functor
REGISTER_KERNEL_BUILDER(
Name("Softmax").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
SoftmaxOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Softmax")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
SoftmaxOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Softmax")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
SoftmaxOp<CPUDevice, double>);
REGISTER_KERNEL_BUILDER(
Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
SoftmaxOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<float>("T"),
SoftmaxOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("LogSoftmax")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
SoftmaxOp<CPUDevice, double>);
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("Softmax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
SoftmaxOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#undef REGISTER_CPU
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
SoftmaxOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(

View File

@ -157,10 +157,15 @@ EIGEN_STRONG_INLINE Packet8f pinterleave4x64<Packet8f>(const Packet8f& from) {
return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from),
_MM_SHUFFLE(3, 1, 2, 0)));
#else
__int64_t tmp1 = _mm256_extract_epi64(_mm256_castps_si256(from), 1);
__int64_t tmp2 = _mm256_extract_epi64(_mm256_castps_si256(from), 2);
__m256i tmp3 = _mm256_insert_epi64(_mm256_castps_si256(from), tmp1, 2);
return _mm256_castsi256_ps(_mm256_insert_epi64(tmp3, tmp2, 1));
auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2);
auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3);
auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4);
auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5);
auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4);
tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5);
tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2);
tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3);
return _mm256_castsi256_ps(tmp5);
#endif
}
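// The fallback above swaps the two middle 64-bit lanes via 32-bit
// extract/insert, matching _MM_SHUFFLE(3, 1, 2, 0) in the AVX2 branch;
// presumably the 64-bit insert/extract intrinsics are avoided because they
// are unavailable on some targets (e.g. 32-bit builds).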
// Return a Packet with 4 floats loaded from 4 bfloat16 values

View File

@ -238,25 +238,25 @@ class SparseMatmulOpTest : public ::testing::Test {
TEST_F(SparseMatmulOpTest, BroadcastPacketTest) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0];
internal::pstore(data2, internal::pbroadcast_first<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_first<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize > 1) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[1];
internal::pstore(data2, internal::pbroadcast_second<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_second<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize > 2) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[2];
internal::pstore(data2, internal::pbroadcast_third<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_third<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize > 3) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[3];
internal::pstore(data2, internal::pbroadcast_fourth<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_fourth<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
}
}
@ -276,8 +276,8 @@ TEST_F(SparseMatmulOpTest, InterleavePacketTest) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[i];
}
internal::pstore(
data2, internal::pinterleave4x64<Packet>(internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pinterleave4x64<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
}
@ -294,8 +294,8 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) {
ref[i] = data3[i];
}
}
internal::pstore(data2, internal::pexpand_bf16_l<Packet>(
internal::pload<Packet>(data3_bfloat16)));
internal::pstoreu(data2, internal::pexpand_bf16_l<Packet>(
internal::ploadu<Packet>(data3_bfloat16)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize == 8) { // AVX
@ -311,18 +311,18 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) {
}
}
internal::pstore(data2, internal::pexpand_bf16_u<Packet>(
internal::pload<Packet>(data3_bfloat16)));
internal::pstoreu(data2, internal::pexpand_bf16_u<Packet>(
internal::ploadu<Packet>(data3_bfloat16)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
}
TEST_F(SparseMatmulOpTest, Bfloat16LoadTest) {
if (PacketSize >= 4) {
for (int i = 0; i < 4; ++i) ref[i] = data3[i];
internal::pstore(data2, internal::pload4bf16<Packet>(data3_bfloat16));
internal::pstoreu(data2, internal::pload4bf16<Packet>(data3_bfloat16));
ASSERT_TRUE(areApprox(ref, data2, 4));
internal::pstore(data2, internal::pload2bf16<Packet>(data3_bfloat16));
internal::pstoreu(data2, internal::pload2bf16<Packet>(data3_bfloat16));
ASSERT_TRUE(areApprox(ref, data2, 2));
}
}

View File

@ -26,6 +26,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/type_index.h"
#include "tensorflow/core/lib/core/errors.h"
@ -92,18 +93,22 @@ class TileOp : public OpKernel {
HANDLE_DIM(T, 4) \
HANDLE_DIM(T, 5)
HANDLE_TYPE(DT_BOOL);
HANDLE_TYPE(DT_FLOAT);
HANDLE_TYPE(DT_DOUBLE);
HANDLE_TYPE(DT_UINT8);
HANDLE_TYPE(DT_INT32);
HANDLE_TYPE(DT_INT16);
HANDLE_TYPE(DT_INT64);
HANDLE_TYPE(DT_HALF);
HANDLE_TYPE(DT_COMPLEX64);
HANDLE_TYPE(DT_COMPLEX128);
HANDLE_TYPE(DT_STRING); // when DEVICE=CPUDevice.
#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
// Invoke macro using TF_CALL_* so type-filtering for platform applies.
TF_CALL_bool(HANDLE_TYPE_NAME);
TF_CALL_float(HANDLE_TYPE_NAME);
TF_CALL_double(HANDLE_TYPE_NAME);
TF_CALL_uint8(HANDLE_TYPE_NAME);
TF_CALL_int32(HANDLE_TYPE_NAME);
TF_CALL_int16(HANDLE_TYPE_NAME);
TF_CALL_int64(HANDLE_TYPE_NAME);
TF_CALL_half(HANDLE_TYPE_NAME);
TF_CALL_string(HANDLE_TYPE_NAME); // when DEVICE=CPUDevice.
TF_CALL_complex64(HANDLE_TYPE_NAME);
TF_CALL_complex128(HANDLE_TYPE_NAME);
#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM
@ -165,17 +170,20 @@ inline void TileOp<Device>::HandleCase(
HANDLE_CASE(device, dtype, 4); \
HANDLE_CASE(device, dtype, 5);
HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
HANDLE_CASE_DIM(CPUDevice, DT_UINT8);
HANDLE_CASE_DIM(CPUDevice, DT_INT32);
HANDLE_CASE_DIM(CPUDevice, DT_INT16);
HANDLE_CASE_DIM(CPUDevice, DT_INT64);
HANDLE_CASE_DIM(CPUDevice, DT_HALF);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128);
HANDLE_CASE_DIM(CPUDevice, DT_STRING);
#define HANDLE_TYPE_NAME_CPU(T) \
HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum<T>::value);
TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
TF_CALL_string(HANDLE_TYPE_NAME_CPU);
#if GOOGLE_CUDA
HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
@ -186,6 +194,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_INT64);
HANDLE_CASE_DIM(GPUDevice, DT_HALF);
#endif // GOOGLE_CUDA
#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE
@ -249,13 +258,16 @@ class TileGradientOp : public OpKernel {
HANDLE_DIM(T, 4) \
HANDLE_DIM(T, 5)
HANDLE_TYPE(DT_FLOAT);
HANDLE_TYPE(DT_DOUBLE);
HANDLE_TYPE(DT_INT32);
HANDLE_TYPE(DT_INT16);
HANDLE_TYPE(DT_INT64);
HANDLE_TYPE(DT_HALF);
#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
TF_CALL_float(HANDLE_TYPE_NAME);
TF_CALL_double(HANDLE_TYPE_NAME);
TF_CALL_int32(HANDLE_TYPE_NAME);
TF_CALL_int16(HANDLE_TYPE_NAME);
TF_CALL_int64(HANDLE_TYPE_NAME);
TF_CALL_half(HANDLE_TYPE_NAME);
#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM
@ -390,14 +402,17 @@ inline void TileGradientOp<Device>::HandleCase(
HANDLE_CASE(device, dtype, 4); \
HANDLE_CASE(device, dtype, 5);
HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
HANDLE_CASE_DIM(CPUDevice, DT_INT16);
HANDLE_CASE_DIM(CPUDevice, DT_INT32);
HANDLE_CASE_DIM(CPUDevice, DT_INT64);
HANDLE_CASE_DIM(CPUDevice, DT_HALF);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128);
#define HANDLE_TYPE_NAME_CPU(T) \
HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum<T>::value);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
#if GOOGLE_CUDA
HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
@ -409,6 +424,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_HALF);
#endif // GOOGLE_CUDA
#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE

View File

@ -16,17 +16,28 @@ limitations under the License.
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/training_ops.h"
#include <algorithm>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
namespace {
template <class T>
inline T sgn(const T x) {
T zero(0);
T one(1);
return (x == zero ? zero : (x < zero ? -one : one));
}
}
namespace functor {
template <typename T>
struct ApplyGradientDescent<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@ -56,6 +67,34 @@ struct ApplyAdadelta<CPUDevice, T> {
}
};
template <typename T>
struct ApplyProximalGradientDescent<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad) {
// Note: this is the FOBOS update; for details, see:
// http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf
// TODO(xbing): merge the logic for ProximalGradientDescent and
// ProximalAdagrad.
auto prox_var = var;
// compute v = w - lr * grad.
prox_var.device(d) -= grad * lr();
if (l1() > 0) {
var.device(d) = prox_var.abs() - var.constant(lr() * l1());
// compute sign(v) * max(|v| - lr * l1, 0)
var.device(d) = prox_var.sign() * var.cwiseMax(T(0.0));
} else {
var.device(d) = prox_var;
}
if (l2() > 0) {
// compute v / (1.0 + l2 * lr)
var.device(d) = var / (var.constant(1.0) + var.constant(l2() * lr()));
}
}
};
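For reference, a minimal NumPy sketch of the FOBOS update this functor implements; an illustration under the same l1/l2 conventions as the code above, not TensorFlow API (the helper name is ours):

```python
import numpy as np

def proximal_gradient_descent(var, grad, lr, l1, l2):
    # Plain gradient step: v = w - lr * grad.
    prox_var = var - lr * grad
    if l1 > 0:
        # Soft-threshold: sign(v) * max(|v| - lr * l1, 0).
        var = np.sign(prox_var) * np.maximum(np.abs(prox_var) - lr * l1, 0.0)
    else:
        var = prox_var
    if l2 > 0:
        # Shrink toward zero: v / (1 + lr * l2).
        var = var / (1.0 + lr * l2)
    return var
```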
template <typename T>
struct ApplyAdagrad<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@ -67,6 +106,35 @@ struct ApplyAdagrad<CPUDevice, T> {
}
};
template <typename T>
struct ApplyProximalAdagrad<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
typename TTypes<T>::Flat accum,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad) {
// FOBOS update per the paper above, with an Adagrad learning rate.
accum.device(d) += grad.square();
// Adagrad learning rate.
auto learning_rate = accum.constant(lr()) * accum.rsqrt();
auto prox_var = var;
// compute v = w - lr * grad.
prox_var.device(d) -= grad * learning_rate;
if (l1() > 0) {
var.device(d) = prox_var.abs() - learning_rate * prox_var.constant(l1());
// compute sign(v) * max(|v| - lr * l1, 0)
var.device(d) = prox_var.sign() * var.cwiseMax(T(0.0));
} else {
var.device(d) = prox_var;
}
if (l2() > 0) {
var.device(d) =
var / (var.constant(1.0) + var.constant(l2()) * learning_rate);
}
}
};
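ApplyProximalAdagrad performs the same proximal step, only with a per-coordinate Adagrad learning rate; a hedged NumPy sketch of the same math (again an illustration, not the kernel):

```python
import numpy as np

def proximal_adagrad(var, accum, grad, lr, l1, l2):
    accum = accum + grad * grad
    # Per-coordinate Adagrad rate: lr / sqrt(accum).
    rate = lr / np.sqrt(accum)
    prox_var = var - rate * grad
    if l1 > 0:
        var = np.sign(prox_var) * np.maximum(np.abs(prox_var) - rate * l1, 0.0)
    else:
        var = prox_var
    if l2 > 0:
        var = var / (1.0 + l2 * rate)
    return var, accum
```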
template <typename T>
struct ApplyFtrl<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@ -221,10 +289,11 @@ class ApplyGradientDescentOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyGradientDescentOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -246,6 +315,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -347,10 +417,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyAdadeltaOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -374,6 +445,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
@ -483,7 +555,6 @@ class SparseApplyAdadeltaOp : public OpKernel {
accum_update_ =
accum_update_ * accum_update_.constant(rho_scalar) +
update.square() * update.constant(static_cast<T>(1) - rho_scalar);
auto v = var_flat.template chip<0>(index);
v -= update * update.constant(lr_scalar);
}
@ -505,14 +576,213 @@ class SparseApplyAdadeltaOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyAdadeltaOp<T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
template <typename Device, typename T>
class ApplyProximalGradientDescentOp : public OpKernel {
public:
explicit ApplyProximalGradientDescentOp(OpKernelConstruction* ctx)
: OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
OP_REQUIRES(
ctx, var.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(0)));
const Tensor& alpha = ctx->input(1);
OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
errors::InvalidArgument("alpha is not a scalar: ",
alpha.shape().DebugString()));
const Tensor& l1 = ctx->input(2);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& delta = ctx->input(4);
OP_REQUIRES(
ctx, var.shape().IsSameSize(delta.shape()),
errors::InvalidArgument("var and delta do not have the same shape",
var.shape().DebugString(), " ",
delta.shape().DebugString()));
const Device& device = ctx->template eigen_device<Device>();
functor::ApplyProximalGradientDescent<Device, T>()(
device, var.flat<T>(), alpha.scalar<T>(), l1.scalar<T>(),
l2.scalar<T>(), delta.flat<T>());
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
#define REGISTER_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \
.Device(DEVICE_##D) \
.TypeConstraint<T>("T"), \
ApplyProximalGradientDescentOp<D##Device, T>);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
template <typename T, typename Tindex>
class SparseApplyProximalGradientDescentOp : public OpKernel {
public:
explicit SparseApplyProximalGradientDescentOp(OpKernelConstruction* ctx)
: OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
errors::InvalidArgument("var must be at least 1 dimensional"));
const Tensor& lr = ctx->input(1);
OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
lr.shape().DebugString()));
const Tensor& l1 = ctx->input(2);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& grad = ctx->input(4);
const Tensor& indices = ctx->input(5);
OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
errors::InvalidArgument("indices must be one-dimensional"));
int64 inner_dim = 1;
for (int d = 1; d < var.dims(); d++) {
OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
errors::InvalidArgument(strings::StrCat(
"var and grad must match in dimension ", d)));
inner_dim *= grad.dim_size(d);
}
const Tindex N = indices.dim_size(0);
OP_REQUIRES(
ctx, grad.dim_size(0) == N,
errors::InvalidArgument(
"grad must be the same size as indices in the first dimension."));
if (N > 0) {
if (inner_dim > 1) {
const Tindex first_dim_size = var.dim_size(0);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat_outer_dims<T>();
auto grad_flat = grad.flat_outer_dims<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
// TODO(xbing): extract the common logic for the Fobos update.
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
auto g = grad_flat.template chip<0>(i);
auto v = var_flat.template chip<0>(index);
// compute learning_rate for current step.
auto learning_rate = v.constant(lr_scalar);
auto prox_v = v;
// v = w - g * learning_rate.
prox_v -= g * learning_rate;
if (l1_scalar > 0) {
v = prox_v.abs() - learning_rate * prox_v.constant(l1_scalar);
// compute sign(v) * max(|v| - lr * l1, 0)
v = prox_v.sign() * v.cwiseMax(static_cast<T>(0.0));
} else {
v = prox_v;
}
if (l2_scalar > 0) {
v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate);
}
}
} else {
CHECK_EQ(1, inner_dim);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat<T>();
auto grad_flat = grad.flat<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
const Tindex first_dim_size = var_flat.size();
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
const T& g = grad_flat(i);
auto learning_rate = lr_scalar;
auto prox_v = var_flat(index);
prox_v -= learning_rate * g;
if (l1_scalar > 0) {
var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar;
var_flat(index) =
sgn(prox_v) * std::max(var_flat(index), static_cast<T>(0.0));
} else {
var_flat(index) = prox_v;
}
if (l2_scalar > 0) {
var_flat(index) /= (1.0 + l2_scalar * learning_rate);
}
}
}
}
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
#define REGISTER_KERNELS(T, Tindices) \
REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyProximalGradientDescentOp<T, Tindices>);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
REGISTER_KERNELS(double, int64);
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -568,10 +838,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyAdagradOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -593,15 +864,80 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
class ApplyProximalAdagradOp : public OpKernel {
public:
explicit ApplyProximalAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
OP_REQUIRES(
ctx, var.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(0)));
OP_REQUIRES(
ctx, accum.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(1)));
OP_REQUIRES(
ctx, var.shape().IsSameSize(accum.shape()),
errors::InvalidArgument("var and accum do not have the same shape",
var.shape().DebugString(), " ",
accum.shape().DebugString()));
const Tensor& lr = ctx->input(2);
OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
lr.shape().DebugString()));
const Tensor& l1 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(4);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& grad = ctx->input(5);
OP_REQUIRES(
ctx, var.shape().IsSameSize(grad.shape()),
errors::InvalidArgument("var and grad do not have the same shape",
var.shape().DebugString(), " ",
grad.shape().DebugString()));
const Device& device = ctx->template eigen_device<Device>();
functor::ApplyProximalAdagrad<Device, T>()(
device, var.flat<T>(), accum.flat<T>(), lr.scalar<T>(), l1.scalar<T>(),
l2.scalar<T>(), grad.flat<T>());
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#define REGISTER_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER( \
Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyProximalAdagradOp<D##Device, T>);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
#undef REGISTER_KERNELS
namespace {
template <class T>
inline T sgn(const T x) {
T zero(0);
T one(1);
return (x == zero ? zero : (x < zero ? -one : one));
}
template <typename T>
inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1,
@ -730,9 +1066,167 @@ class SparseApplyAdagradOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyAdagradOp<T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
template <typename T, typename Tindex>
class SparseApplyProximalAdagradOp : public OpKernel {
public:
explicit SparseApplyProximalAdagradOp(OpKernelConstruction* ctx)
: OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
OP_REQUIRES(
ctx, var.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(0)));
OP_REQUIRES(
ctx, accum.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(1)));
OP_REQUIRES(
ctx, var.shape().IsSameSize(accum.shape()),
errors::InvalidArgument("var and accum do not have the same shape",
var.shape().DebugString(), " ",
accum.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
errors::InvalidArgument("var must be at least 1 dimensional"));
const Tensor& lr = ctx->input(2);
OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
lr.shape().DebugString()));
const Tensor& l1 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(4);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& grad = ctx->input(5);
const Tensor& indices = ctx->input(6);
OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
errors::InvalidArgument("indices must be one-dimensional"));
int64 inner_dim = 1;
for (int d = 1; d < var.dims(); d++) {
OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
errors::InvalidArgument(strings::StrCat(
"var and grad must match in dimension ", d)));
inner_dim *= grad.dim_size(d);
}
const Tindex N = indices.dim_size(0);
OP_REQUIRES(
ctx, grad.dim_size(0) == N,
errors::InvalidArgument(
"grad must be the same size as indices in the first dimension."));
if (N > 0) {
if (inner_dim > 1) {
const Tindex first_dim_size = var.dim_size(0);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat_outer_dims<T>();
auto accum_flat = accum.flat_outer_dims<T>();
auto grad_flat = grad.flat_outer_dims<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
auto a = accum_flat.template chip<0>(index);
auto g = grad_flat.template chip<0>(i);
auto v = var_flat.template chip<0>(index);
a += g.square();
// compute learning_rate for current step.
auto learning_rate = a.constant(lr_scalar) * a.rsqrt();
auto prox_v = v;
// v = w - g * learning_rate.
prox_v -= g * learning_rate;
if (l1_scalar > 0) {
v = prox_v.abs() - learning_rate * prox_v.constant(l1_scalar);
// compute sign(v) * max(|v| - lr * l1, 0)
v = prox_v.sign() * v.cwiseMax(static_cast<T>(0.0));
} else {
v = prox_v;
}
if (l2_scalar > 0) {
v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate);
}
}
} else {
CHECK_EQ(1, inner_dim);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat<T>();
auto accum_flat = accum.flat<T>();
auto grad_flat = grad.flat<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
const Tindex first_dim_size = accum_flat.size();
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
T& a = accum_flat(index);
const T& g = grad_flat(i);
a += g * g;
auto learning_rate = lr_scalar / std::sqrt(a);
auto prox_v = var_flat(index);
prox_v -= learning_rate * g;
if (l1_scalar > 0) {
var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar;
var_flat(index) =
sgn(prox_v) * std::max(var_flat(index), static_cast<T>(0.0));
} else {
var_flat(index) = prox_v;
}
if (l2_scalar > 0) {
var_flat(index) /= (1.0 + l2_scalar * learning_rate);
}
}
}
}
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
#define REGISTER_KERNELS(T, Tindices) \
REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyProximalAdagradOp<T, Tindices>);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
@ -822,10 +1316,13 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyFtrlOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
@ -1002,13 +1499,15 @@ class SparseApplyFtrlOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyFtrlOp<CPUDevice, T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
REGISTER_KERNELS(double, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -1070,10 +1569,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyMomentumOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -1096,6 +1596,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
@ -1129,7 +1630,7 @@ class SparseApplyMomentumOp : public OpKernel {
const Tensor& lr = ctx->input(2);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
errors::InvalidArgument("lr is not a scalar : ",
lr.shape().DebugString()));
const Tensor& grad = ctx->input(3);
const Tensor& indices = ctx->input(4);
@ -1188,13 +1689,15 @@ class SparseApplyMomentumOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyMomentumOp<T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
REGISTER_KERNELS(double, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -1237,7 +1740,7 @@ class ApplyAdamOp : public OpKernel {
errors::InvalidArgument("beta2_power is not a scalar: ",
beta2_power.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
errors::InvalidArgument("lr is not a scalar : ",
lr.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
errors::InvalidArgument("beta1 is not a scalar: ",
@ -1285,10 +1788,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyAdamOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -1316,6 +1820,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -1352,7 +1857,7 @@ class ApplyRMSPropOp : public OpKernel {
const Tensor& grad = ctx->input(7);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
errors::InvalidArgument("lr is not a scalar : ",
lr.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()),
errors::InvalidArgument("rho is not a scalar: ",
@ -1400,10 +1905,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyRMSPropOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -1428,6 +1934,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
} // namespace tensorflow

View File

@ -44,6 +44,24 @@ struct ApplyAdadelta {
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct FobosElasticNet {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyProximalGradientDescent {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyAdagrad {
void operator()(const Device& d, typename TTypes<T>::Flat var,
@ -52,6 +70,16 @@ struct ApplyAdagrad {
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyProximalAdagrad {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::Flat accum,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyFtrl {
void operator()(const Device& d, typename TTypes<T>::Flat var,

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/kernels/xent_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
@ -86,18 +87,14 @@ struct XentFunctor<CPUDevice, T> {
};
} // namespace functor
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
SoftmaxXentWithLogitsOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
SoftmaxXentWithLogitsOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
SoftmaxXentWithLogitsOp<CPUDevice, double>);
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
SoftmaxXentWithLogitsOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")

View File

@ -1825,6 +1825,127 @@ op {
}
}
}
op {
name: "ApplyProximalAdagrad"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "ApplyProximalGradientDescent"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "delta"
type_attr: "T"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "ApplyRMSProp"
input_arg {
@ -3011,6 +3132,63 @@ op {
}
}
}
op {
name: "AvgPool"
input_arg {
name: "value"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
}
}
op {
name: "AvgPool3D"
input_arg {
@ -3232,6 +3410,67 @@ op {
}
}
}
op {
name: "AvgPoolGrad"
input_arg {
name: "orig_input_shape"
type: DT_INT32
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
}
}
op {
name: "BatchCholesky"
input_arg {
@ -11801,6 +12040,124 @@ op {
}
}
}
op {
name: "MaxPool"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "MaxPool"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
}
op {
name: "MaxPool3D"
input_arg {
@ -12014,6 +12371,73 @@ op {
}
}
}
op {
name: "MaxPoolGrad"
input_arg {
name: "orig_input"
type_attr: "T"
}
input_arg {
name: "orig_output"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "MaxPoolGradWithArgmax"
input_arg {
@ -12065,6 +12489,70 @@ op {
}
}
}
op {
name: "MaxPoolGradWithArgmax"
input_arg {
name: "input"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
input_arg {
name: "argmax"
type_attr: "Targmax"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "Targmax"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "MaxPoolWithArgmax"
input_arg {
@ -12115,6 +12603,69 @@ op {
}
}
}
op {
name: "MaxPoolWithArgmax"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
output_arg {
name: "argmax"
type_attr: "Targmax"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "Targmax"
type: "type"
default_value {
type: DT_INT64
}
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "Maximum"
input_arg {
@ -20499,6 +21050,155 @@ op {
}
}
}
op {
name: "SparseApplyProximalAdagrad"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
input_arg {
name: "indices"
type_attr: "Tindices"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "SparseApplyProximalGradientDescent"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
input_arg {
name: "indices"
type_attr: "Tindices"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "SparseConcat"
input_arg {

View File

@ -154,22 +154,25 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
// clang-format off
*g = FDH::Define(
// Arg defs
{"input: float", "grad: float"},
{"input: T", "grad: T"},
// Ret val defs
{"output: float"},
{"output: T"},
// Attr defs
{"ksize: list(int) >= 4",
{"T: {float, half} = DT_FLOAT",
"ksize: list(int) >= 4",
"strides: list(int) >= 4",
GetPaddingAttrString()},
// Nodes
{
// Invoke MaxPool again to recompute the outputs (removed by CSE?).
{{"maxpool"}, "MaxPool", {"input"},
/*Attrs=*/{{"ksize", "$ksize"},
/*Attrs=*/{{"T", "$T"},
{"ksize", "$ksize"},
{"strides", "$strides"},
{"padding", "$padding"}}},
{{"output"}, "MaxPoolGrad", {"input", "maxpool", "grad"},
/*Attrs=*/{{"ksize", "$ksize"},
/*Attrs=*/{{"T", "$T"},
{"ksize", "$ksize"},
{"strides", "$strides"},
{"padding", "$padding"}}}
});
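The gradient function above recomputes MaxPool and hands its output to MaxPoolGrad, which routes each output gradient back to the argmax position of its input window. A toy NumPy sketch of that routing, assuming a 2x2 window, stride 2, VALID padding, and a single channel (hypothetical helper, not the kernel):

```python
import numpy as np

def max_pool_grad_2x2(inp, grad_out):
    # Route each pooled gradient back to the input cell that won the max.
    h, w = inp.shape
    grad_in = np.zeros_like(inp)
    for i in range(0, h, 2):
        for j in range(0, w, 2):
            window = inp[i:i + 2, j:j + 2]
            r, c = np.unravel_index(np.argmax(window), window.shape)
            grad_in[i + r, j + c] += grad_out[i // 2, j // 2]
    return grad_in
```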

View File

@ -28,7 +28,7 @@ REGISTER_OP("AvgPool")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, double}")
.Attr("T: {float, half, double}")
.Doc(R"doc(
Performs average pooling on the input.
@ -55,7 +55,7 @@ REGISTER_OP("AvgPoolGrad")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, double}")
.Attr("T: {float, half, double}")
.Doc(R"doc(
Computes gradients of the average pooling function.
@ -642,12 +642,13 @@ output: The gradients for LRN.
// --------------------------------------------------------------------------
REGISTER_OP("MaxPool")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Input("input: float")
.Output("output: float")
.Input("input: T")
.Output("output: T")
.Doc(R"doc(
Performs max pooling on the input.
@ -669,10 +670,11 @@ REGISTER_OP("MaxPoolGrad")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Input("orig_input: float")
.Input("orig_output: float")
.Input("grad: float")
.Output("output: float")
.Input("orig_input: T")
.Input("orig_output: T")
.Input("grad: T")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.Doc(R"doc(
Computes gradients of the maxpooling function.
@ -696,9 +698,10 @@ REGISTER_OP("MaxPoolWithArgmax")
.Attr("strides: list(int) >= 4")
.Attr("Targmax: {int32, int64} = DT_INT64")
.Attr(GetPaddingAttrString())
.Input("input: float")
.Output("output: float")
.Input("input: T")
.Output("output: T")
.Output("argmax: Targmax")
.Attr("T: {float, half} = DT_FLOAT")
.Doc(R"doc(
Performs max pooling on the input and outputs both max values and indices.
@ -720,10 +723,11 @@ REGISTER_OP("MaxPoolGradWithArgmax")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr("Targmax: {int32, int64}")
.Input("input: float")
.Input("grad: float")
.Input("input: T")
.Input("grad: T")
.Input("argmax: Targmax")
.Output("output: float")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.Doc(R"doc(
Computes gradients of the maxpooling function.

View File

@ -591,12 +591,12 @@ op {
}
input_arg {
name: "l1"
description: "Scaling factor. Must be a scalar."
description: "L1 regulariation. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "Scaling factor. Must be a scalar."
description: "L2 regulariation. Must be a scalar."
type_attr: "T"
}
input_arg {
@ -767,6 +767,146 @@ op {
summary: "Update \'*var\' according to the momentum scheme."
description: "accum = accum * momentum + grad\nvar -= lr * accum"
}
op {
name: "ApplyProximalAdagrad"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
description: "Scaling factor. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "grad"
description: "The gradient."
type_attr: "T"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
}
op {
name: "ApplyProximalGradientDescent"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
description: "Scaling factor. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "delta"
description: "The change."
type_attr: "T"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
}
op {
name: "ApplyRMSProp"
input_arg {
@ -1251,6 +1391,7 @@ op {
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
@ -1447,6 +1588,7 @@ op {
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
@ -6614,12 +6756,25 @@ op {
input_arg {
name: "input"
description: "4-D input to pool over."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "output"
description: "The max pooled output tensor."
type: DT_FLOAT
type_attr: "T"
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
attr {
name: "ksize"
@ -6798,22 +6953,22 @@ op {
input_arg {
name: "orig_input"
description: "The original input tensor."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "orig_output"
description: "The original output tensor."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "grad"
description: "4-D. Gradients w.r.t. the output of `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "output"
description: "Gradients w.r.t. the input to `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
attr {
name: "ksize"
@ -6854,6 +7009,19 @@ op {
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
summary: "Computes gradients of the maxpooling function."
}
op {
@ -6861,12 +7029,12 @@ op {
input_arg {
name: "input"
description: "The original input."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "grad"
description: "4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the\noutput of `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "argmax"
@ -6876,7 +7044,7 @@ op {
output_arg {
name: "output"
description: "Gradients w.r.t. the input of `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
attr {
name: "ksize"
@ -6913,6 +7081,19 @@ op {
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
summary: "Computes gradients of the maxpooling function."
}
op {
@ -6920,12 +7101,12 @@ op {
input_arg {
name: "input"
description: "4-D with shape `[batch, height, width, channels]`. Input to pool over."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "output"
description: "The max pooled output tensor."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "argmax"
@ -6970,6 +7151,19 @@ op {
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
summary: "Performs max pooling on the input and outputs both max values and indices."
description: "The indices in `argmax` are flattened, so that a maximum value at position\n`[b, y, x, c]` becomes flattened index\n`((b * height + y) * width + x) * channels + c`."
}
@ -11380,12 +11574,12 @@ op {
}
input_arg {
name: "l1"
description: "Scaling factor. Must be a scalar."
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "Scaling factor. Must be a scalar."
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
@ -11525,6 +11719,176 @@ op {
summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
description: "That is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum"
}
op {
name: "SparseApplyProximalAdagrad"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
description: "Learning rate. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "grad"
description: "The gradient."
type_attr: "T"
}
input_arg {
name: "indices"
description: "A vector of indices into the first dimension of var and accum."
type_attr: "Tindices"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
}
op {
name: "SparseApplyProximalGradientDescent"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
description: "Scaling factor. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "grad"
description: "The gradient."
type_attr: "T"
}
input_arg {
name: "indices"
description: "A vector of indices into the first dimension of var and accum."
type_attr: "Tindices"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
}
op {
name: "SparseConcat"
input_arg {

View File

@ -35,6 +35,59 @@ use_locking: If `True`, the subtraction will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("ApplyProximalGradientDescent")
.Input("var: Ref(T)")
.Input("alpha: T")
.Input("l1: T")
.Input("l2: T")
.Input("delta: T")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Update '*var' as FOBOS algorithm with fixed learning rate.
prox_v = var - alpha * delta
var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
var: Should be from a Variable().
alpha: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
delta: The change.
out: Same as "var".
use_locking: If True, the subtraction will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("SparseApplyProximalGradientDescent")
.Input("var: Ref(T)")
.Input("alpha: T")
.Input("l1: T")
.Input("l2: T")
.Input("grad: T")
.Input("indices: Tindices")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("Tindices: {int32, int64}")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Sparse update '*var' as FOBOS algorithm with fixed learning rate.
That is for rows we have grad for, we update var as follows:
prox_v = var - alpha * grad
var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
var: Should be from a Variable().
alpha: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
grad: The gradient.
indices: A vector of indices into the first dimension of var and accum.
out: Same as "var".
use_locking: If True, the subtraction will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("ApplyAdadelta")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
@ -117,6 +170,33 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
contention.
)doc");
REGISTER_OP("ApplyProximalAdagrad")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
.Input("lr: T")
.Input("l1: T")
.Input("l2: T")
.Input("grad: T")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
accum += grad * grad
prox_v = var - lr * grad * (1 / sqrt(accum))
var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
var: Should be from a Variable().
accum: Should be from a Variable().
grad: The gradient.
lr: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
out: Same as "var".
use_locking: If True, updating of the var and accum tensors will be protected by
a lock; otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("SparseApplyAdagrad")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
@ -145,6 +225,39 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
contention.
)doc");
REGISTER_OP("SparseApplyProximalAdagrad")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
.Input("lr: T")
.Input("l1: T")
.Input("l2: T")
.Input("grad: T")
.Input("indices: Tindices")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("Tindices: {int32, int64}")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
That is for rows we have grad for, we update var and accum as follows:
accum += grad * grad
prox_v = var
prox_v -= lr * grad * (1 / sqrt(accum))
var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
var: Should be from a Variable().
accum: Should be from a Variable().
lr: Learning rate. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
grad: The gradient.
indices: A vector of indices into the first dimension of var and accum.
out: Same as "var".
use_locking: If True, updating of the var and accum tensors will be protected by
a lock; otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("ApplyFtrl")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
@ -171,8 +284,8 @@ accum: Should be from a Variable().
linear: Should be from a Variable().
grad: The gradient.
lr: Scaling factor. Must be a scalar.
l1: Scaling factor. Must be a scalar.
l2: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
lr_power: Scaling factor. Must be a scalar.
out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected
@ -210,8 +323,8 @@ linear: Should be from a Variable().
grad: The gradient.
indices: A vector of indices into the first dimension of var and accum.
lr: Scaling factor. Must be a scalar.
l1: Scaling factor. Must be a scalar.
l2: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
lr_power: Scaling factor. Must be a scalar.
out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected

View File

@ -69,9 +69,9 @@ class Stat {
: static_cast<HighPrecisionValueType>(sum_) / count_;
}
ValueType rms() const { return sqrt(squared_sum_ / count_); }
ValueType std_deviation() const { return all_same() ? 0 : rms() - avg(); }
ValueType std_deviation() const {
return all_same() ? 0 : sqrt(squared_sum_ / count_ - avg() * avg());
}
void OutputToStream(std::ostream* stream) const {
if (empty()) {
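A quick numeric check of why the old one-liner was wrong: `rms() - avg()` is not the standard deviation, whereas `sqrt(squared_sum_ / count_ - avg()^2)` matches the population definition (plain Python, made-up values):

```python
import math

xs = [1.0, 2.0, 3.0, 4.0]
n = len(xs)
mean = sum(xs) / n
mean_sq = sum(x * x for x in xs) / n

old = math.sqrt(mean_sq) - mean                        # 0.2386... (wrong)
new = math.sqrt(mean_sq - mean * mean)                 # 1.1180... (population std)
ref = math.sqrt(sum((x - mean) ** 2 for x in xs) / n)  # definition

assert abs(new - ref) < 1e-12
```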

View File

@ -18,35 +18,38 @@ from __future__ import print_function
from sklearn import datasets
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
from tensorflow.contrib import learn
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
test_size=0.2,
random_state=42)
def main(unused_argv):
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
test_size=0.2,
random_state=42)
val_monitor = learn.monitors.ValidationMonitor(X_val, y_val,
early_stopping_rounds=200)
x_train, x_val, y_train, y_val = train_test_split(
x_train, y_train, test_size=0.2, random_state=42)
val_monitor = learn.monitors.ValidationMonitor(
x_val, y_val, early_stopping_rounds=200)
# classifier with early stopping on training data
classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
model_dir='/tmp/iris_model/')
classifier1.fit(X_train, y_train, steps=2000)
score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))
# classifier with early stopping on training data
classifier1 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/')
classifier1.fit(x=x_train, y=y_train, steps=2000)
score1 = metrics.accuracy_score(y_test, classifier1.predict(x_test))
# classifier with early stopping on validation data
classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
model_dir='/tmp/iris_model_val/')
classifier2.fit(X_train, y_train, val_monitor, steps=2000)
score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))
# classifier with early stopping on validation data, save frequently for
# monitor to pick up new checkpoints.
classifier2 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/',
config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
score2 = metrics.accuracy_score(y_test, classifier2.predict(x_test))
# In many applications, the score is improved by using early stopping
print(score2 > score1)
# In many applications, the score is improved by using early stopping
print(score2 > score1)
if __name__ == '__main__':
tf.app.run()

View File

@ -164,7 +164,7 @@ def train():
feed_dict=feed_dict(True),
options=run_options,
run_metadata=run_metadata)
train_writer.add_run_metadata(run_metadata, 'step%d' % i)
train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
train_writer.add_summary(summary, i)
print('Adding run metadata for', i)
else: # Record a summary

View File

@ -216,7 +216,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
For example:
```prettyprint
```python
# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
shape(t) ==> [2, 2, 3]
```
@ -224,7 +224,7 @@ shape(t) ==> [2, 2, 3]
##### Args:
* <b>`input`</b>: A `Tensor`.
* <b>`input`</b>: A `Tensor` or `SparseTensor`.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:

View File

@ -796,11 +796,11 @@ A classifier for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -1140,11 +1140,11 @@ A regressor for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -2245,10 +2245,10 @@ Linear classifier model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -2579,10 +2579,10 @@ Linear regressor model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a KeyError:
if `weight_column_name` is not None:
key=weight_column_name, value=a `Tensor`
@ -4238,50 +4238,33 @@ Perform various training, evaluation, and inference actions on a graph.
### `class tf.contrib.learn.RunConfig` {#RunConfig}
This class specifies the configurations for the run.
Parameters:
execution_mode: Runners use this flag to execute different tasks, like
training vs evaluation. 'all' (the default) executes both training and
eval.
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
training_worker_session_startup_stagger_secs: Seconds to sleep between the
startup of each worker task session (default: 5).
training_worker_max_startup_secs: Max seconds to wait before starting any
worker (default: 60).
eval_delay_secs: Number of seconds between the beginning of each eval run.
If one run takes more than this amount of time, the next run will start
immediately once that run completes (default 60).
eval_steps: Number of steps to run in each eval (default: 100).
num_cores: Number of cores to be used (default: 4).
verbose: Controls the verbosity, possible values:
0: the algorithm and debug information is muted.
1: trainer prints the progress.
2: log device placement is printed.
gpu_memory_fraction: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
As new files are created, older files are deleted.
If None or 0, all checkpoint files are kept.
Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
Attributes:
tf_master: Tensorflow master.
tf_config: Tensorflow Session Config proto.
tf_random_seed: Tensorflow random seed.
keep_checkpoint_max: Maximum number of checkpoints to keep.
keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
- - -
#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
Constructor.
##### Args:
* <b>`master`</b>: TensorFlow master. Empty string (the default) for local.
* <b>`task`</b>: Task id of the replica running the training (default: 0).
* <b>`num_ps_replicas`</b>: Number of parameter server tasks to use (default: 0).
* <b>`num_cores`</b>: Number of cores to be used (default: 4).
* <b>`log_device_placement`</b>: Log the op placement to devices (default: False).
* <b>`gpu_memory_fraction`</b>: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
* <b>`tf_random_seed`</b>: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
* <b>`save_summary_steps`</b>: Save summaries every this many steps.
* <b>`save_checkpoints_secs`</b>: Save checkpoints every this many seconds.
* <b>`keep_checkpoint_max`</b>: The maximum number of recent checkpoint files to
keep. As new files are created, older files are deleted. If None or 0,
all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
checkpoint files are kept.)
* <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
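A short usage sketch of the updated constructor; the argument values below are illustrative:

```python
import tensorflow as tf

config = tf.contrib.learn.RunConfig(
    num_cores=8,
    gpu_memory_fraction=0.5,
    save_summary_steps=100,
    save_checkpoints_secs=60,
    keep_checkpoint_max=5)
estimator = tf.contrib.learn.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10], n_classes=3, config=config)
```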
@ -4394,7 +4377,7 @@ Run `output_dict` tensors `n` times, with the same `feed_dict` each run.
- - -
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
Train a model.
@ -4437,7 +4420,7 @@ program is terminated with exit code 1.
`supervisor_save_summaries_steps` seconds when training.
* <b>`feed_fn`</b>: A function that is called every iteration to produce a `feed_dict`
passed to `session.run` calls. Optional.
* <b>`max_steps`</b>: Train until `global_step_tensor` evaluates to this value.
* <b>`steps`</b>: Trains for this many steps (e.g. current global step + `steps`).
* <b>`fail_on_nan_loss`</b>: If true, raise `NanLossDuringTrainingError` if `loss_op`
evaluates to `NaN`. If false, continue training as if nothing happened.
* <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks

View File

@ -18,10 +18,10 @@ Linear regressor model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a KeyError:
if `weight_column_name` is not None:
key=weight_column_name, value=a `Tensor`

View File

@ -18,10 +18,10 @@ Linear classifier model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -1,4 +1,4 @@
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
Train a model.
@ -41,7 +41,7 @@ program is terminated with exit code 1.
`supervisor_save_summaries_steps` seconds when training.
* <b>`feed_fn`</b>: A function that is called every iteration to produce a `feed_dict`
passed to `session.run` calls. Optional.
* <b>`max_steps`</b>: Train until `global_step_tensor` evaluates to this value.
* <b>`steps`</b>: Trains for this many steps (e.g. current global step + `steps`).
* <b>`fail_on_nan_loss`</b>: If true, raise `NanLossDuringTrainingError` if `loss_op`
evaluates to `NaN`. If false, continue training as if nothing happened.
* <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks

View File

@ -6,7 +6,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
For example:
```prettyprint
```python
# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
shape(t) ==> [2, 2, 3]
```
@ -14,7 +14,7 @@ shape(t) ==> [2, 2, 3]
##### Args:
* <b>`input`</b>: A `Tensor`.
* <b>`input`</b>: A `Tensor` or `SparseTensor`.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:

View File

@ -21,11 +21,11 @@ A classifier for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -1,47 +1,30 @@
This class specifies the configurations for the run.
- - -
Parameters:
execution_mode: Runners use this flag to execute different tasks, like
training vs evaluation. 'all' (the default) executes both training and
eval.
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
training_worker_session_startup_stagger_secs: Seconds to sleep between the
startup of each worker task session (default: 5).
training_worker_max_startup_secs: Max seconds to wait before starting any
worker (default: 60).
eval_delay_secs: Number of seconds between the beginning of each eval run.
If one run takes more than this amount of time, the next run will start
immediately once that run completes (default 60).
eval_steps: Number of steps to run in each eval (default: 100).
num_cores: Number of cores to be used (default: 4).
verbose: Controls the verbosity, possible values:
0: the algorithm and debug information is muted.
1: trainer prints the progress.
2: log device placement is printed.
gpu_memory_fraction: Fraction of GPU memory used by the process on
#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
Constructor.
##### Args:
* <b>`master`</b>: TensorFlow master. Empty string (the default) for local.
* <b>`task`</b>: Task id of the replica running the training (default: 0).
* <b>`num_ps_replicas`</b>: Number of parameter server tasks to use (default: 0).
* <b>`num_cores`</b>: Number of cores to be used (default: 4).
* <b>`log_device_placement`</b>: Log the op placement to devices (default: False).
* <b>`gpu_memory_fraction`</b>: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
* <b>`tf_random_seed`</b>: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
As new files are created, older files are deleted.
If None or 0, all checkpoint files are kept.
Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
* <b>`save_summary_steps`</b>: Save summaries every this many steps.
* <b>`save_checkpoints_secs`</b>: Save checkpoints every this many seconds.
* <b>`keep_checkpoint_max`</b>: The maximum number of recent checkpoint files to
keep. As new files are created, older files are deleted. If None or 0,
all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
checkpoint files are kept.)
* <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
Attributes:
tf_master: Tensorflow master.
tf_config: Tensorflow Session Config proto.
tf_random_seed: Tensorflow random seed.
keep_checkpoint_max: Maximum number of checkpoints to keep.
keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
- - -
#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}

View File

@ -18,21 +18,28 @@ a probability distribution for each entry, see
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
`logits` must have the shape `[batch_size, num_classes]`
and dtype `float32` or `float64`.
`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
A common use case is to have logits of shape `[batch_size, num_classes]` and
labels of shape `[batch_size]`, but higher dimensions are supported.
##### Args:
* <b>`logits`</b>: Unscaled log probabilities.
* <b>`labels`</b>: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
values will result in a loss of 0, but incorrect gradient computations.
* <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
`[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
* <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
`int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
Other values will result in a loss of 0, but incorrect gradient
computations.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
softmax cross entropy loss.
A `Tensor` of the same shape as `labels` and of the same type as `logits`
with the softmax cross entropy loss.
##### Raises:
* <b>`ValueError`</b>: If logits are scalars (need to have rank >= 1) or if the rank
of the labels is not equal to the rank of the logits minus one.
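A minimal sketch of the newly supported higher-rank case; shapes are illustrative:

```python
import tensorflow as tf

# Rank-3 logits [batch, time, num_classes] with rank-2 labels [batch, time].
logits = tf.random_normal([4, 7, 10])
labels = tf.zeros([4, 7], dtype=tf.int64)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
# loss has the same shape as labels: [4, 7].
```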

View File

@ -21,11 +21,11 @@ A regressor for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -9,7 +9,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
##### Args:
* <b>`input`</b>: A `Tensor` of type `float32`.
* <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `half`.
4-D with shape `[batch, height, width, channels]`. Input to pool over.
* <b>`ksize`</b>: A list of `ints` that has length `>= 4`.
The size of the window for each dimension of the input tensor.
@ -25,6 +25,6 @@ The indices in `argmax` are flattened, so that a maximum value at position
A tuple of `Tensor` objects (output, argmax).
* <b>`output`</b>: A `Tensor` of type `float32`. The max pooled output tensor.
* <b>`output`</b>: A `Tensor`. Has the same type as `input`. The max pooled output tensor.
* <b>`argmax`</b>: A `Tensor` of type `Targmax`. 4-D. The flattened indices of the max values chosen for each output.
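A short sketch of calling the op (per the tests elsewhere in this change, `MaxPoolWithArgmax` is implemented only on GPU; shapes are illustrative):

```python
import tensorflow as tf

# A flattened argmax index for a value at position [b, y, x, c] is
# ((b * height + y) * width + x) * channels + c.
inp = tf.random_normal([1, 4, 4, 1])
output, argmax = tf.nn.max_pool_with_argmax(
    inp, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
```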

View File

@ -3,8 +3,8 @@
Masks elements of `IndexedSlices`.
Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
contains a subset of the slices of `a`. Only the slices at indices specified
in `mask_indices` are returned.
contains a subset of the slices of `a`. Only the slices at indices not
specified in `mask_indices` are returned.
This is useful when you need to extract a subset of slices in an
`IndexedSlices` object.
@ -18,7 +18,7 @@ a.indices => [12, 26, 37, 45]
tf.shape(a.values) => [4, 10]
# `b` will be the subset of `a` slices at its second and third indices, so
# we want to mask of its first and last indices (which are at absolute
# we want to mask its first and last indices (which are at absolute
# indices 12, 45)
b = tf.sparse_mask(a, [12, 45])

View File

@ -690,7 +690,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
##### Args:
* <b>`input`</b>: A `Tensor` of type `float32`.
* <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `half`.
4-D with shape `[batch, height, width, channels]`. Input to pool over.
* <b>`ksize`</b>: A list of `ints` that has length `>= 4`.
The size of the window for each dimension of the input tensor.
@ -706,7 +706,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
A tuple of `Tensor` objects (output, argmax).
* <b>`output`</b>: A `Tensor` of type `float32`. The max pooled output tensor.
* <b>`output`</b>: A `Tensor`. Has the same type as `input`. The max pooled output tensor.
* <b>`argmax`</b>: A `Tensor` of type `Targmax`. 4-D. The flattened indices of the max values chosen for each output.
@ -1244,23 +1244,30 @@ a probability distribution for each entry, see
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
`logits` must have the shape `[batch_size, num_classes]`
and dtype `float32` or `float64`.
`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
A common use case is to have logits of shape `[batch_size, num_classes]` and
labels of shape `[batch_size]`, but higher dimensions are supported.
##### Args:
* <b>`logits`</b>: Unscaled log probabilities.
* <b>`labels`</b>: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
values will result in a loss of 0, but incorrect gradient computations.
* <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
`[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
* <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
`int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
Other values will result in a loss of 0, but incorrect gradient
computations.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
softmax cross entropy loss.
A `Tensor` of the same shape as `labels` and of the same type as `logits`
with the softmax cross entropy loss.
##### Raises:
* <b>`ValueError`</b>: If logits are scalars (need to have rank >= 1) or if the rank
of the labels is not equal to the rank of the logits minus one.
- - -

View File

@ -1895,8 +1895,8 @@ Requires `updates.shape = indices.shape + ref.shape[1:]`.
Masks elements of `IndexedSlices`.
Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
contains a subset of the slices of `a`. Only the slices at indices specified
in `mask_indices` are returned.
contains a subset of the slices of `a`. Only the slices at indices not
specified in `mask_indices` are returned.
This is useful when you need to extract a subset of slices in an
`IndexedSlices` object.
@ -1910,7 +1910,7 @@ a.indices => [12, 26, 37, 45]
tf.shape(a.values) => [4, 10]
# `b` will be the subset of `a` slices at its second and third indices, so
# we want to mask of its first and last indices (which are at absolute
# we want to mask its first and last indices (which are at absolute
# indices 12, 45)
b = tf.sparse_mask(a, [12, 45])

View File

@ -10,7 +10,7 @@ There are three main methods of getting data into a TensorFlow program:
[TOC]
## Feeding
## Feeding
TensorFlow's feed mechanism lets you inject data into any Tensor in a
computation graph. A python computation can thus feed data directly into the
@ -377,11 +377,11 @@ Again, the example queue will have some elements queued, so training will
continue until those are exhausted. If the example queue is a
[`RandomShuffleQueue`](../../api_docs/python/io_ops.md#RandomShuffleQueue), say
because you are using `shuffle_batch` or `shuffle_batch_join`, it normally will
avoid ever going having fewer than its `min_after_dequeue` attr elements
buffered. However, once the queue is closed that restriction will be lifted and
the queue will eventually empty. At that point the actual training threads,
when they try and dequeue from example queue, will start getting `OutOfRange`
errors and exiting. Once all the training threads are done,
avoid ever having fewer than its `min_after_dequeue` attr elements buffered.
However, once the queue is closed, that restriction will be lifted and the
queue will eventually empty. At that point the actual training threads, when
they try to dequeue from the example queue, will start getting `OutOfRange`
errors and exiting. Once all the training threads are done,
[`tf.train.Coordinator.join`](../../api_docs/python/train.md#Coordinator.join)
will return and you can exit cleanly.
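A hedged sketch of such a pipeline; the tensor shape and queue parameters below are illustrative:

```python
import tensorflow as tf

example = tf.random_normal([28, 28])  # stand-in for a parsed record
# With min_after_dequeue=1000, the underlying RandomShuffleQueue keeps at
# least 1000 buffered elements until its input is closed, after which it
# drains and further dequeues eventually raise OutOfRange errors.
batch = tf.train.shuffle_batch(
    [example], batch_size=32, capacity=2000, min_after_dequeue=1000)
```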

View File

@ -146,7 +146,7 @@ for step in xrange(1000000):
# When done, ask the threads to stop.
coord.request_stop()
# And wait for them to actually do it.
coord.join(threads)
coord.join(enqueue_threads)
```
## Handling Exceptions

View File

@ -178,9 +178,9 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte
[bazel](https://github.com/bazelbuild/bazel)).
Next:
```
```bash
cd tensorflow/models/rnn/ptb
python ptb_word_lm --data_path=/tmp/simple-examples/data/ --model small
python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small
```
There are 3 supported model configurations in the tutorial code: "small",

View File

@ -48,9 +48,19 @@ EVAL_FREQUENCY = 100 # Number of steps between evaluations.
tf.app.flags.DEFINE_boolean("self_test", False, "True if running a self test.")
tf.app.flags.DEFINE_boolean('use_fp16', False,
"Use half floats instead of full floats if True.")
FLAGS = tf.app.flags.FLAGS
def data_type():
"""Return the type of the activations, weights, and placeholder variables."""
if FLAGS.use_fp16:
return tf.float16
else:
return tf.float32
def maybe_download(filename):
"""Download the data from Yann's website, unless it's already here."""
if not tf.gfile.Exists(WORK_DIRECTORY):
@ -142,11 +152,11 @@ def main(argv=None): # pylint: disable=unused-argument
# These placeholder nodes will be fed a batch of training data at each
# training step using the {feed_dict} argument to the Run() call below.
train_data_node = tf.placeholder(
tf.float32,
data_type(),
shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
eval_data = tf.placeholder(
tf.float32,
data_type(),
shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
# The variables below hold all the trainable weights. They are passed an
@ -155,24 +165,24 @@ def main(argv=None): # pylint: disable=unused-argument
conv1_weights = tf.Variable(
tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32.
stddev=0.1,
seed=SEED))
conv1_biases = tf.Variable(tf.zeros([32]))
conv2_weights = tf.Variable(
tf.truncated_normal([5, 5, 32, 64],
stddev=0.1,
seed=SEED))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64]))
seed=SEED, dtype=data_type()))
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
conv2_weights = tf.Variable(tf.truncated_normal(
[5, 5, 32, 64], stddev=0.1,
seed=SEED, dtype=data_type()))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
fc1_weights = tf.Variable( # fully connected, depth 512.
tf.truncated_normal(
[IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
stddev=0.1,
seed=SEED))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512]))
fc2_weights = tf.Variable(
tf.truncated_normal([512, NUM_LABELS],
tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
stddev=0.1,
seed=SEED))
fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))
seed=SEED,
dtype=data_type()))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS],
stddev=0.1,
seed=SEED,
dtype=data_type()))
fc2_biases = tf.Variable(tf.constant(
0.1, shape=[NUM_LABELS], dtype=data_type()))
# We will replicate the model structure for the training subgraph, as well
# as the evaluation subgraphs, while sharing the trainable parameters.
@ -230,7 +240,7 @@ def main(argv=None): # pylint: disable=unused-argument
# Optimizer: set up a variable that's incremented once per batch and
# controls the learning rate decay.
batch = tf.Variable(0)
batch = tf.Variable(0, dtype=data_type())
# Decay once per epoch, using an exponential schedule starting at 0.01.
learning_rate = tf.train.exponential_decay(
0.01, # Base learning rate.

View File

@ -99,6 +99,42 @@ def GetShrunkInceptionMaxPoolShapes(shrink=30):
class PoolingTest(tf.test.TestCase):
def _VerifyOneType(self, pool_func, input_sizes, ksize, strides, padding,
data_format, data_type, expected, use_gpu):
"""Verifies the output values of the pooling function.
Args:
pool_func: Function to be called, e.g. tf.nn.max_pool or tf.nn.avg_pool.
input_sizes: Input tensor dimensions.
ksize: The kernel size dimensions.
strides: The stride dimensions.
padding: Padding type.
data_format: The data format we use to run the pooling operation.
data_type: The data type to use to run the pooling operation.
expected: An array containing the expected operation outputs.
use_gpu: Whether we are running on GPU.
"""
total_size = 1
for s in input_sizes:
total_size *= s
# Initializes the input tensor with an array containing incrementing
# numbers from 1.
x = [f * 1.0 for f in range(1, total_size + 1)]
with self.test_session(use_gpu=use_gpu) as sess:
t = tf.constant(x, shape=input_sizes, dtype=data_type)
if data_format == "NCHW":
t = NHWCToNCHW(t)
ksize = NHWCToNCHW(ksize)
strides = NHWCToNCHW(strides)
t = pool_func(t, ksize=ksize, strides=strides, padding=padding,
data_format=data_format)
if data_format == "NCHW":
t = NCHWToNHWC(t)
actual = t.eval()
self.assertAllCloseAccordingToType(expected, actual.flatten())
self.assertShapeEqual(actual, t)
def _VerifyOneTest(self, pool_func, input_sizes, ksize, strides, padding,
data_format, expected, use_gpu):
"""Verifies the output values of the pooling function.
@ -114,25 +150,12 @@ class PoolingTest(tf.test.TestCase):
expected: An array containing the expected operation outputs.
use_gpu: Whether we are running on GPU.
"""
total_size = 1
for s in input_sizes:
total_size *= s
# Initializes the input tensor with array containing incrementing
# numbers from 1.
x = [f * 1.0 for f in range(1, total_size + 1)]
with self.test_session(use_gpu=use_gpu) as sess:
t = tf.constant(x, shape=input_sizes)
if data_format == "NCHW":
t = NHWCToNCHW(t)
ksize = NHWCToNCHW(ksize)
strides = NHWCToNCHW(strides)
t = pool_func(t, ksize=ksize, strides=strides, padding=padding,
data_format=data_format)
if data_format == "NCHW":
t = NCHWToNHWC(t)
actual = t.eval()
self.assertAllClose(expected, actual.flatten())
self.assertShapeEqual(actual, t)
self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
data_format, tf.float32, expected, use_gpu)
if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
data_format, tf.float16, expected, use_gpu)
def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
expected, use_gpu):
@ -372,32 +395,40 @@ class PoolingTest(tf.test.TestCase):
def testKernelSmallerThanStrideValid(self):
for use_gpu in [True, False]:
self._VerifyValues(tf.nn.max_pool, input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1],
padding="VALID",
expected=[9, 12, 30, 33],
use_gpu=use_gpu)
self._VerifyValues(tf.nn.max_pool,
input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1],
strides=[1, 3, 3, 1],
padding="VALID",
expected=[9, 12, 30, 33],
use_gpu=use_gpu)
self._VerifyValues(tf.nn.avg_pool, input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1],
padding="VALID",
expected=[5, 8, 26, 29],
use_gpu=use_gpu)
self._VerifyValues(tf.nn.avg_pool,
input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1],
strides=[1, 3, 3, 1],
padding="VALID",
expected=[5, 8, 26, 29],
use_gpu=use_gpu)
def testKernelSmallerThanStrideSame(self):
for use_gpu in [True, False]:
for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]:
self._VerifyValues(pool_func, input_sizes=[1, 3, 3, 1],
ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 7, 9],
use_gpu=use_gpu)
for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]:
self._VerifyValues(pool_func,
input_sizes=[1, 3, 3, 1],
ksize=[1, 1, 1, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 7, 9],
use_gpu=use_gpu)
self._VerifyValues(pool_func, input_sizes=[1, 4, 4, 1],
ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 9, 11],
use_gpu=use_gpu)
self._VerifyValues(pool_func,
input_sizes=[1, 4, 4, 1],
ksize=[1, 1, 1, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 9, 11],
use_gpu=use_gpu)
def _testDepthwiseMaxPoolInvalidConfig(self, in_size, ksize, strides,
error_msg, use_gpu=False):
@ -425,43 +456,49 @@ class PoolingTest(tf.test.TestCase):
# The following are tests that verify that the CPU and GPU implementations
# produce the same results.
def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding):
tensor_input = np.random.rand(*input_shape).astype(np.float32)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
gpu_val = out_op.eval()
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
cpu_val = out_op.eval()
self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
for dtype in np.float32, np.float16:
tensor_input = np.random.rand(*input_shape).astype(dtype)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
gpu_val = out_op.eval()
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
cpu_val = out_op.eval()
self.assertAllCloseAccordingToType(cpu_val, gpu_val)
def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
padding):
# Generate numbers in a narrow range, so that there are many duplicates
# in the input.
tensor_input = np.random.random_integers(0, 3,
input_shape).astype(np.float32)
tensor_output = np.random.rand(*output_shape).astype(np.float32)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
_, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
argmax = argmax_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
ksize, strides, padding)
gpu_val = out_op.eval()
self.assertShapeEqual(gpu_val, out_op)
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
orig_out = out_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize,
strides, padding)
cpu_val = out_op.eval()
self.assertShapeEqual(cpu_val, out_op)
self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
for dtype in np.float32, np.float16:
# Generate numbers in a narrow range, so that there are many duplicates
# in the input.
tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
tensor_output = np.random.rand(*output_shape).astype(dtype)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
_, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
argmax = argmax_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
ksize, strides, padding)
gpu_val = out_op.eval()
self.assertShapeEqual(gpu_val, out_op)
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
orig_out = out_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize, strides,
padding)
cpu_val = out_op.eval()
self.assertShapeEqual(cpu_val, out_op)
if dtype == np.float16:
# The CPU version accumulates its gradient in fp16, so it is less
# accurate than the GPU version, which does the accumulation in fp32.
self.assertAllClose(cpu_val, gpu_val, rtol=0.01, atol=0.01)
else:
self.assertAllClose(cpu_val, gpu_val)
def testMaxPoolingWithArgmax(self):
# MaxPoolWithArgMax is implemented only on GPU.

View File

@ -30,6 +30,9 @@ from tensorflow.python.ops import sparse_ops
class SparseXentTest(tf.test.TestCase):
def _npXent(self, features, labels):
is_higher_dim = len(features.shape) > 2
features = np.reshape(features, [-1, features.shape[-1]])
labels = np.reshape(labels, [-1])
batch_dim = 0
class_dim = 1
batch_size = features.shape[batch_dim]
@ -40,14 +43,15 @@ class SparseXentTest(tf.test.TestCase):
labels_mat[np.arange(batch_size), labels] = 1.0
bp = (probs - labels_mat)
l = -np.sum(labels_mat * np.log(probs + 1.0e-20), axis=1)
return l, bp
return l, bp, is_higher_dim
def _testXent(self, np_features, np_labels, use_gpu=False):
np_loss, np_backprop = self._npXent(np_features, np_labels)
np_loss, np_backprop, is_higher_dim = self._npXent(np_features, np_labels)
with self.test_session(use_gpu=use_gpu) as sess:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
np_features, np_labels)
backprop = loss.op.outputs[1]
backprop = (loss.op.inputs[0].op.outputs[1] if is_higher_dim
else loss.op.outputs[1])
tf_loss, tf_backprop = sess.run([loss, backprop])
self.assertAllCloseAccordingToType(np_loss, tf_loss)
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
@ -71,14 +75,6 @@ class SparseXentTest(tf.test.TestCase):
self._testSingleClass(use_gpu=True)
self._testSingleClass(use_gpu=False)
def testRankTooLarge(self):
np_features = np.array(
[[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(np.float32)
np_labels = np.array([1, 2])
self.assertRaisesRegexp(
ValueError, "must have rank 2",
tf.nn.sparse_softmax_cross_entropy_with_logits, np_features, np_labels)
def testNpXent(self):
# We create 2 batches of logits for testing.
# batch 0 is the boring uniform distribution: 1, 1, 1, 1, with target 3.
@ -104,7 +100,7 @@ class SparseXentTest(tf.test.TestCase):
# With a hard 1, the backprop is [0.032 - 1.0 = -0.968, 0.087, 0.237, 0.644]
# The loss for this batch is [1.0 * -log(0.25), 1.0 * -log(0.032)]
# = [1.3862, 3.4420]
np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels))
self.assertAllClose(np.array([[0.25, 0.25, 0.25, -0.75],
[-0.968, 0.087, 0.237, 0.6439]]),
np_backprop,
@ -114,15 +110,21 @@ class SparseXentTest(tf.test.TestCase):
def testShapeMismatch(self):
with self.test_session():
with self.assertRaises(ValueError):
with self.assertRaisesRegexp(ValueError, ".*Rank mismatch:*"):
tf.nn.sparse_softmax_cross_entropy_with_logits(
[[0., 1.], [2., 3.]], [[0, 2]])
[[0., 1.], [2., 3.], [2., 3.]], [[0, 2]])
def testNotMatrix(self):
def testScalar(self):
with self.test_session():
with self.assertRaises(ValueError):
with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"):
tf.nn.sparse_softmax_cross_entropy_with_logits(
[0., 1., 2., 3.], [0, 2])
tf.constant(1.0), tf.constant(0))
def testVector(self):
with self.test_session():
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
tf.constant([1.0]), tf.constant(0))
self.assertAllClose(0.0, loss.eval())
def testFloat(self):
for label_dtype in np.int32, np.int64:
@ -155,6 +157,31 @@ class SparseXentTest(tf.test.TestCase):
print("cross entropy gradient err = ", err)
self.assertLess(err, 5e-8)
def _testHighDim(self, use_gpu, features, labels):
np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels))
# manually reshape loss
np_loss = np.reshape(np_loss, np.array(labels).shape)
with self.test_session(use_gpu=use_gpu) as sess:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
features, labels)
backprop = loss.op.inputs[0].op.outputs[1]
tf_loss, tf_backprop = sess.run([loss, backprop])
self.assertAllCloseAccordingToType(np_loss, tf_loss)
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
def testHighDim(self):
features = [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]
labels = [[3], [0]]
self._testHighDim(True, features, labels)
self._testHighDim(False, features, labels)
def testHighDim2(self):
features = [[[1., 1., 1., 1.], [2., 2., 2., 2.]],
[[1., 2., 3., 4.], [5., 6., 7., 8.]]]
labels = [[3, 2], [0, 3]]
self._testHighDim(True, features, labels)
self._testHighDim(False, features, labels)
def _sparse_vs_dense_xent_benchmark_dense(labels, logits):
labels = tf.identity(labels)

View File

@ -126,6 +126,7 @@ def shape(input, name=None):
else:
return gen_array_ops.shape(input, name=name)
def rank(input, name=None):
"""Returns the rank of a tensor.
@ -612,8 +613,8 @@ def sparse_mask(a, mask_indices, name=None):
"""Masks elements of `IndexedSlices`.
Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
contains a subset of the slices of `a`. Only the slices at indices specified
in `mask_indices` are returned.
contains a subset of the slices of `a`. Only the slices at indices not
specified in `mask_indices` are returned.
This is useful when you need to extract a subset of slices in an
`IndexedSlices` object.
@ -627,7 +628,7 @@ def sparse_mask(a, mask_indices, name=None):
tf.shape(a.values) => [4, 10]
# `b` will be the subset of `a` slices at its second and third indices, so
# we want to mask of its first and last indices (which are at absolute
# we want to mask its first and last indices (which are at absolute
# indices 12, 45)
b = tf.sparse_mask(a, [12, 45])

View File

@ -440,30 +440,65 @@ def sparse_softmax_cross_entropy_with_logits(logits, labels, name=None):
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
`logits` must have the shape `[batch_size, num_classes]`
and dtype `float32` or `float64`.
`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
A common use case is to have logits of shape `[batch_size, num_classes]` and
labels of shape `[batch_size]`, but higher dimensions are supported.
Args:
logits: Unscaled log probabilities.
labels: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
values will result in a loss of 0, but incorrect gradient computations.
logits: Unscaled log probabilities of rank `r` and shape
`[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
`int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
Other values will result in a loss of 0, but incorrect gradient
computations.
name: A name for the operation (optional).
Returns:
A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
softmax cross entropy loss.
A `Tensor` of the same shape as `labels` and of the same type as `logits`
with the softmax cross entropy loss.
Raises:
ValueError: If logits are scalars (need to have rank >= 1) or if the rank
of the labels is not equal to the rank of the logits minus one.
"""
# TODO(pcmurray) Raise an error when the label is not an index in
# [0, num_classes). Note: This could break users who call this with bad
# labels, but disregard the bad results.
# The second output tensor contains the gradients. We use it in
# _CrossEntropyGrad() in nn_grad but not here.
cost, unused_backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
logits, labels, name=name)
return cost
# Reshape logits and labels to rank 2.
with ops.op_scope([labels, logits], name,
"SparseSoftmaxCrossEntropyWithLogits"):
labels = ops.convert_to_tensor(labels)
logits = ops.convert_to_tensor(logits)
# Store label shape for result later.
labels_static_shape = labels.get_shape()
labels_shape = array_ops.shape(labels)
if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
raise ValueError("Logits cannot be scalars - received shape %s." %
logits.get_shape())
if logits.get_shape().ndims is not None and (
labels_static_shape.ndims is not None and
labels_static_shape.ndims != logits.get_shape().ndims - 1):
raise ValueError("Rank mismatch: Labels rank (received %s) should equal "
"logits rank (received %s) - 1." %
(labels_static_shape.ndims, logits.get_shape().ndims))
# Check if no reshapes are required.
if logits.get_shape().ndims == 2:
cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
logits, labels, name=name)
return cost
# Reshape logits to 2 dim, labels to 1 dim.
num_classes = array_ops.gather(array_ops.shape(logits),
array_ops.rank(logits) - 1)
logits = array_ops.reshape(logits, [-1, num_classes])
labels = array_ops.reshape(labels, [-1])
# The second output tensor contains the gradients. We use it in
# _CrossEntropyGrad() in nn_grad but not here.
cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
logits, labels, name=name)
cost = array_ops.reshape(cost, labels_shape)
cost.set_shape(labels_static_shape)
return cost
@ops.RegisterShape("SparseSoftmaxCrossEntropyWithLogits")

View File

@ -0,0 +1,101 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ProximalAdagrad for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import ops
from tensorflow.python.ops import constant_op
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
class ProximalAdagradOptimizer(optimizer.Optimizer):
# pylint: disable=line-too-long
"""Optimizer that implements the Proximal Adagrad algorithm.
See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
@@__init__
"""
def __init__(self, learning_rate, initial_accumulator_value=0.1,
l1_regularization_strength=0.0, l2_regularization_strength=0.0,
use_locking=False, name="ProximalAdagrad"):
"""Construct a new ProximalAdagrad optimizer.
Args:
learning_rate: A `Tensor` or a floating point value. The learning rate.
initial_accumulator_value: A floating point value.
Starting value for the accumulators, must be positive.
l1_regularization_strength: A float value, must be greater than or
equal to zero.
l2_regularization_strength: A float value, must be greater than or
equal to zero.
use_locking: If `True` use locks for update operations.
name: Optional name prefix for the operations created when applying
gradients. Defaults to "ProximalAdagrad".
Raises:
ValueError: If the `initial_accumulator_value` is invalid.
"""
if initial_accumulator_value <= 0.0:
raise ValueError("initial_accumulator_value must be positive: %s" %
initial_accumulator_value)
super(ProximalAdagradOptimizer, self).__init__(use_locking, name)
self._learning_rate = learning_rate
self._initial_accumulator_value = initial_accumulator_value
self._l1_regularization_strength = l1_regularization_strength
self._l2_regularization_strength = l2_regularization_strength
# Created in _prepare().
self._l1_regularization_strength_tensor = None
self._l2_regularization_strength_tensor = None
self._learning_rate_tensor = None
def _create_slots(self, var_list):
for v in var_list:
with ops.colocate_with(v):
val = constant_op.constant(self._initial_accumulator_value,
shape=v.get_shape())
self._get_or_make_slot(v, val, "accumulator", self._name)
def _prepare(self):
self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
name="learning_rate")
self._l1_regularization_strength_tensor = ops.convert_to_tensor(
self._l1_regularization_strength,
name="l1_regularization_strength")
self._l2_regularization_strength_tensor = ops.convert_to_tensor(
self._l2_regularization_strength,
name="l2_regularization_strength")
def _apply_dense(self, grad, var):
acc = self.get_slot(var, "accumulator")
return training_ops.apply_proximal_adagrad(
var, acc, self._learning_rate_tensor,
self._l1_regularization_strength_tensor,
self._l2_regularization_strength_tensor,
grad, use_locking=self._use_locking)
def _apply_sparse(self, grad, var):
acc = self.get_slot(var, "accumulator")
return training_ops.sparse_apply_proximal_adagrad(
var, acc, self._learning_rate_tensor,
self._l1_regularization_strength_tensor,
self._l2_regularization_strength_tensor,
grad.values, grad.indices,
use_locking=self._use_locking)
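A condensed usage sketch, mirroring the functional tests that follow:

```python
import tensorflow as tf

var = tf.Variable([1.0, 2.0])
grad = tf.constant([0.1, 0.2])
opt = tf.train.ProximalAdagradOptimizer(
    3.0, initial_accumulator_value=0.1,
    l1_regularization_strength=0.001, l2_regularization_strength=2.0)
update = opt.apply_gradients([(grad, var)])
```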

View File

@ -0,0 +1,205 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functional tests for Proximal Adagrad operations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
class ProximalAdagradOptimizerTest(tf.test.TestCase):
def testProximalAdagradwithoutRegularization(self):
with self.test_session() as sess:
var0 = tf.Variable([0.0, 0.0])
var1 = tf.Variable([0.0, 0.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([0.0, 0.0], v0_val)
self.assertAllClose([0.0, 0.0], v1_val)
# Run 3 steps of Proximal Adagrad.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([-2.60260963, -4.29698515]),
v0_val)
self.assertAllClose(np.array([-0.28432083, -0.56694895]),
v1_val)
def testProximalAdagradwithoutRegularization2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 3 steps of Proximal Adagrad.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([-1.60261, -2.296985]),
v0_val)
self.assertAllClose(np.array([3.715679, 2.433051]),
v1_val)
def testProximalAdagradWithL1(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.001,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 10 steps of Proximal Adagrad.
for _ in range(10):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.662907, 0.767398]),
v0_val)
self.assertAllClose(np.array([2.959304, 1.029232]),
v1_val)
def testProximalAdagradWithL1_L2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.001,
l2_regularization_strength=2.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 10 steps of Proximal Adagrad.
for _ in range(10):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.043069, 0.080461]),
v0_val)
self.assertAllClose(np.array([0.004069, 0.008578]),
v1_val)
def applyOptimizer(self, opt, steps=5, is_sparse=False):
if is_sparse:
var0 = tf.Variable([[1.0], [2.0]])
var1 = tf.Variable([[3.0], [4.0]])
grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]),
tf.constant([0]),
tf.constant([2, 1]))
grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]),
tf.constant([1]),
tf.constant([2, 1]))
else:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0, 4.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
sess = tf.get_default_session()
v0_val, v1_val = sess.run([var0, var1])
if is_sparse:
self.assertAllClose([[1.0], [2.0]], v0_val)
self.assertAllClose([[3.0], [4.0]], v1_val)
else:
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([3.0, 4.0], v1_val)
# Run ProximalAdagrad for a few steps.
for _ in range(steps):
update.run()
v0_val, v1_val = sess.run([var0, var1])
return v0_val, v1_val
def testEquivAdagradwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0))
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1))
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
def testEquivSparseAdagradwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0),
is_sparse=True)
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1),
is_sparse=True)
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
if __name__ == "__main__":
tf.test.main()

View File

@ -0,0 +1,81 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ProximalGradientDescent for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import ops
# pylint: disable=unused-import
from tensorflow.python.ops import math_ops
# pylint: enable=unused-import
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
class ProximalGradientDescentOptimizer(optimizer.Optimizer):
# pylint: disable=line-too-long
"""Optimizer that implements the proximal gradient descent algorithm.
See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
@@__init__
"""
def __init__(self, learning_rate, l1_regularization_strength=0.0,
l2_regularization_strength=0.0, use_locking=False,
name="ProximalGradientDescent"):
"""Construct a new proximal gradient descent optimizer.
Args:
learning_rate: A Tensor or a floating point value. The learning
rate to use.
l1_regularization_strength: A float value, must be greater than or
equal to zero.
l2_regularization_strength: A float value, must be greater than or
equal to zero.
use_locking: If True use locks for update operations.
name: Optional name prefix for the operations created when applying
gradients. Defaults to "ProximalGradientDescent".
"""
super(ProximalGradientDescentOptimizer, self).__init__(use_locking, name)
self._learning_rate = learning_rate
self._l1_regularization_strength = l1_regularization_strength
self._l2_regularization_strength = l2_regularization_strength
self._l1_regularization_strength_tensor = None
self._l2_regularization_strength_tensor = None
def _apply_dense(self, grad, var):
return training_ops.apply_proximal_gradient_descent(
var,
self._learning_rate_tensor,
self._l1_regularization_strength_tensor,
self._l2_regularization_strength_tensor,
grad,
use_locking=self._use_locking).op
def _apply_sparse(self, grad, var):
delta = ops.IndexedSlices(grad.values * self._learning_rate_tensor,
grad.indices, grad.dense_shape)
return var.scatter_sub(delta, use_locking=self._use_locking)
def _prepare(self):
self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
name="learning_rate")
self._l1_regularization_strength_tensor = ops.convert_to_tensor(
self._l1_regularization_strength, name="l1_regularization_strength")
self._l2_regularization_strength_tensor = ops.convert_to_tensor(
self._l2_regularization_strength, name="l2_regularization_strength")
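
`_apply_dense` above defers to `training_ops.apply_proximal_gradient_descent`; per the referenced FOBOS paper, the rule is a gradient step followed by L1 soft-thresholding and L2 shrinkage. A minimal NumPy sketch for intuition (the kernel is authoritative, and `proximal_gradient_descent_step` is a name invented here); with l1 = l2 = 0 it reduces to vanilla gradient descent, which the equivalence tests in the next file depend on:

```python
import numpy as np

def proximal_gradient_descent_step(var, grad, lr, l1, l2):
  """One dense FOBOS-style step (NumPy sketch, not the C++ kernel)."""
  prox = var - lr * grad  # ordinary gradient step
  # L1 soft-thresholding, then L2 shrinkage.
  var = np.sign(prox) * np.maximum(np.abs(prox) - lr * l1, 0.0)
  return var / (1.0 + lr * l2)

# Three unregularized steps from zero with lr=3.0 and grads [0.1, 0.2]
# move each coordinate by 3 * 3.0 * grad, i.e. to [-0.9, -1.8] -- the
# values asserted in testProximalGradientDescentwithoutRegularization.
v = np.zeros(2)
for _ in range(3):
  v = proximal_gradient_descent_step(v, np.array([0.1, 0.2]), 3.0, 0.0, 0.0)
```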

View File

@@ -0,0 +1,178 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functional tests for Proximal Gradient Descent operations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
class ProximalGradientDescentOptimizerTest(tf.test.TestCase):
def testProximalGradientDescentwithoutRegularization(self):
with self.test_session() as sess:
var0 = tf.Variable([0.0, 0.0])
var1 = tf.Variable([0.0, 0.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([0.0, 0.0], v0_val)
self.assertAllClose([0.0, 0.0], v1_val)
# Run 3 steps of proximal gradient descent.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([-0.9, -1.8]),
v0_val)
self.assertAllClose(np.array([-0.09, -0.18]),
v1_val)
def testProximalGradientDescentwithoutRegularization2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 3 steps of proximal gradient descent.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.1, 0.2]),
v0_val)
self.assertAllClose(np.array([3.91, 2.82]),
v1_val)
def testProximalGradientDescentWithL1_L2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.001,
l2_regularization_strength=2.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 10 steps of proximal gradient descent.
for _ in range(10):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.037125, 0.074625]),
v0_val)
self.assertAllClose(np.array([0.003375, 0.007125]),
v1_val)
def applyOptimizer(self, opt, steps=5, is_sparse=False):
if is_sparse:
var0 = tf.Variable([[1.0], [2.0]])
var1 = tf.Variable([[3.0], [4.0]])
grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]),
tf.constant([0]),
tf.constant([2, 1]))
grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]),
tf.constant([1]),
tf.constant([2, 1]))
else:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0, 4.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
sess = tf.get_default_session()
v0_val, v1_val = sess.run([var0, var1])
if is_sparse:
self.assertAllClose([[1.0], [2.0]], v0_val)
self.assertAllClose([[3.0], [4.0]], v1_val)
else:
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([3.0, 4.0], v1_val)
# Run the given optimizer for a few steps.
for _ in range(steps):
update.run()
v0_val, v1_val = sess.run([var0, var1])
return v0_val, v1_val
def testEquivSparseGradientDescentwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0),
is_sparse=True)
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.GradientDescentOptimizer(3.0), is_sparse=True)
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
def testEquivGradientDescentwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0))
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.GradientDescentOptimizer(3.0))
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
if __name__ == "__main__":
tf.test.main()
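
End to end, the new optimizer drops into the usual `Optimizer.minimize` workflow. A minimal, hypothetical usage sketch (the toy variable, constants, and step count are illustrative only, and it assumes the kernels added by this change are registered):

```python
import tensorflow as tf

w = tf.Variable([0.0])
loss = tf.reduce_sum(tf.square(w - 3.0))  # toy objective, minimum at w = 3
opt = tf.train.ProximalGradientDescentOptimizer(
    0.1, l1_regularization_strength=0.01)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  for _ in range(200):
    sess.run(train_op)
  # w approaches 3.0, pulled slightly toward zero by the L1 term.
  print(sess.run(w))
```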

View File

@@ -151,6 +151,7 @@ from tensorflow.python.ops import state_ops
from tensorflow.python.training.adadelta import AdadeltaOptimizer
from tensorflow.python.training.adagrad import AdagradOptimizer
from tensorflow.python.training.proximal_adagrad import ProximalAdagradOptimizer
from tensorflow.python.training.adam import AdamOptimizer
from tensorflow.python.training.ftrl import FtrlOptimizer
from tensorflow.python.training.momentum import MomentumOptimizer
@@ -158,6 +159,7 @@ from tensorflow.python.training.moving_averages import ExponentialMovingAverage
from tensorflow.python.training.optimizer import Optimizer
from tensorflow.python.training.rmsprop import RMSPropOptimizer
from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
from tensorflow.python.training.proximal_gradient_descent import ProximalGradientDescentOptimizer
from tensorflow.python.training.sync_replicas_optimizer import SyncReplicasOptimizer
# Utility classes for training.
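
With these two import lines in place, both proximal optimizers are reachable from the public `tf.train` namespace:

```python
import tensorflow as tf

adagrad = tf.train.ProximalAdagradOptimizer(0.1)
sgd = tf.train.ProximalGradientDescentOptimizer(0.1)
```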

View File

@@ -69,6 +69,17 @@ def _ApplyAdagradShape(op):
grad_shape = op.inputs[3].get_shape().merge_with(accum_shape)
return [grad_shape]
@ops.RegisterShape("ApplyProximalAdagrad")
def _ApplyProximalAdagradShape(op):
"""Shape function for the ApplyProximalAdagrad op."""
var_shape = op.inputs[0].get_shape()
accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
_AssertInputIsScalar(op, 2) # lr
_AssertInputIsScalar(op, 3) # l1
_AssertInputIsScalar(op, 4) # l2
grad_shape = op.inputs[5].get_shape().merge_with(accum_shape)
return [grad_shape]
@ops.RegisterShape("ApplyFtrl")
def _ApplyFtrlShape(op):
@@ -133,6 +144,32 @@ def _ApplyGradientDescentShape(op):
delta_shape = op.inputs[2].get_shape().merge_with(var_shape)
return [delta_shape]
@ops.RegisterShape("ApplyProximalGradientDescent")
def _ApplyProximalGradientDescentShape(op):
"""Shape function for the ApplyProximalGradientDescent op."""
var_shape = op.inputs[0].get_shape()
_AssertInputIsScalar(op, 1) # alpha
_AssertInputIsScalar(op, 2) # l1
_AssertInputIsScalar(op, 3) # l2
delta_shape = op.inputs[4].get_shape().merge_with(var_shape)
return [delta_shape]
@ops.RegisterShape("SparseApplyProximalGradientDescent")
def _SparseApplyProximalGradientDescentShape(op):
"""Shape function for the SparseApplyGradientDescent op."""
var_shape = op.inputs[0].get_shape()
_AssertInputIsScalar(op, 1) # lr
_AssertInputIsScalar(op, 2) # l1
_AssertInputIsScalar(op, 3) # l2
grad_shape = op.inputs[4].get_shape().merge_with(
tensor_shape.TensorShape([None]).concatenate(var_shape[1:]))
unused_indices_shape = op.inputs[5].get_shape().merge_with(
tensor_shape.vector(grad_shape[0]))
return [var_shape]
@ops.RegisterShape("SparseApplyAdadelta")
def _SparseApplyAdadeltaShape(op):
"""Shape function for the SparseApplyAdadelta op."""
@@ -148,6 +185,7 @@ def _SparseApplyAdadeltaShape(op):
tensor_shape.vector(grad_shape[0]))
return [accum_update_shape]
@ops.RegisterShape("SparseApplyAdagrad")
def _SparseApplyAdagradShape(op):
"""Shape function for the SparseApplyAdagrad op."""
@@ -161,6 +199,21 @@ def _SparseApplyAdagradShape(op):
return [accum_shape]
@ops.RegisterShape("SparseApplyProximalAdagrad")
def _SparseApplyProximalAdagradShape(op):
"""Shape function for the SparseApplyProximalAdagrad op."""
var_shape = op.inputs[0].get_shape()
accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
_AssertInputIsScalar(op, 2) # lr
_AssertInputIsScalar(op, 3) # l1
_AssertInputIsScalar(op, 4) # l2
grad_shape = op.inputs[5].get_shape().merge_with(
tensor_shape.TensorShape([None]).concatenate(accum_shape[1:]))
unused_indices_shape = op.inputs[6].get_shape().merge_with(
tensor_shape.vector(grad_shape[0]))
return [accum_shape]
@ops.RegisterShape("SparseApplyFtrl")
def _SparseApplyFtrlShape(op):
"""Shape function for the SparseApplyFtrl op."""

View File

@@ -1876,6 +1876,40 @@ bool CudnnSupport::DoPoolForward(
return true;
}
bool CudnnSupport::DoPoolForward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<Eigen::half>* output_data) {
mutex_lock lock{dnn_handle_mutex_};
auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
AsCUDAStreamValue(stream));
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
return false;
}
// Alpha is the scaling factor for input.
float alpha = 1.0;
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
status = dynload::cudnnPoolingForward(
parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
output_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to enqueue forward pooling on stream: "
<< ToString(status);
return false;
}
return true;
}
bool CudnnSupport::DoPoolBackward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
@@ -1914,6 +1948,43 @@ bool CudnnSupport::DoPoolBackward(
return true;
}
bool CudnnSupport::DoPoolBackward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
DeviceMemory<Eigen::half>* output_diff_data) {
mutex_lock lock{dnn_handle_mutex_};
auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
AsCUDAStreamValue(stream));
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
return false;
}
// Alpha is the scaling factor for input.
float alpha = 1.0;
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
status = dynload::cudnnPoolingBackward(
parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
src_desc.handle(), output_diff_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to enqueue backward pooling on stream: "
<< ToString(status);
return false;
}
return true;
}
bool CudnnSupport::DoNormalize(
Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
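
The two overloads above plumb `Eigen::half` pooling through to `cudnnPoolingForward`/`cudnnPoolingBackward` with `CUDNN_DATA_HALF` tensor descriptors (note the alpha/beta scaling factors remain `float` even for half data). Assuming the corresponding TensorFlow GPU kernels register a half dtype, the change would surface in Python roughly as below; whether `tf.nn.max_pool` accepts `tf.float16` at this point depends on registrations outside this diff, so treat this strictly as a sketch:

```python
import tensorflow as tf

images = tf.cast(tf.random_uniform([8, 24, 24, 3]), tf.float16)
pooled = tf.nn.max_pool(images, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='VALID')

with tf.Session() as sess:
  print(sess.run(pooled).dtype)  # float16; runs via cuDNN on GPU
```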

View File

@@ -201,6 +201,13 @@ class CudnnSupport : public dnn::DnnSupport {
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<float>* output_data) override;
bool DoPoolForward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<Eigen::half>* output_data) override;
bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
@@ -210,6 +217,15 @@ class CudnnSupport : public dnn::DnnSupport {
const DeviceMemory<float>& input_diff_data,
DeviceMemory<float>* output_diff_data) override;
bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
DeviceMemory<Eigen::half>* output_diff_data) override;
bool DoNormalize(Stream* stream,
const dnn::NormalizeDescriptor& normalize_descriptor,
const DeviceMemory<float>& input_data,

View File

@@ -1011,6 +1011,13 @@ class DnnSupport {
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<float>* output_data) = 0;
virtual bool DoPoolForward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<Eigen::half>* output_data) = 0;
// Performs differentiation of the pooling operation.
virtual bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1021,6 +1028,15 @@ class DnnSupport {
const DeviceMemory<float>& input_diff_data,
DeviceMemory<float>* output_diff_data) = 0;
virtual bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
DeviceMemory<Eigen::half>* output_diff_data) = 0;
// Applies local response normalization to the values from
// input_data and writes the result to output_data. See comments on
// NormalizeDescriptor for a description of local response

View File

@@ -909,6 +909,30 @@ Stream &Stream::ThenPoolForward(
return *this;
}
Stream &Stream::ThenPoolForward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<Eigen::half> *output_data) {
VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
PARAM(input_data), PARAM(output_dimensions), PARAM(output_data));
if (ok()) {
if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
input_data, output_dimensions,
output_data));
} else {
SetError();
LOG(WARNING)
<< "attempting to perform DNN operation using StreamExecutor "
"without DNN support";
}
}
return *this;
}
Stream &Stream::ThenPoolBackward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
@@ -936,6 +960,33 @@ Stream &Stream::ThenPoolBackward(
return *this;
}
Stream &Stream::ThenPoolBackward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
const DeviceMemory<Eigen::half> &output_data,
const DeviceMemory<Eigen::half> &input_diff_data,
DeviceMemory<Eigen::half> *output_diff_data) {
VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
PARAM(input_diff_data), PARAM(output_diff_data));
if (ok()) {
if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
CheckError(dnn->DoPoolBackward(this, pooling_dimensions, input_dimensions,
input_data, output_dimensions, output_data,
input_diff_data, output_diff_data));
} else {
SetError();
LOG(WARNING)
<< "attempting to perform DNN operation using StreamExecutor "
"without DNN support";
}
}
return *this;
}
Stream &Stream::ThenNormalize(
const dnn::NormalizeDescriptor &normalize_descriptor,
const DeviceMemory<float> &input_data, DeviceMemory<float> *output_data) {

View File

@@ -421,6 +421,12 @@ class Stream {
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<float> *output_data);
Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<Eigen::half> *output_data);
Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<float> &input_data,
@@ -429,6 +435,14 @@ class Stream {
const DeviceMemory<float> &input_diff_data,
DeviceMemory<float> *output_diff_data);
Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
const DeviceMemory<Eigen::half> &output_data,
const DeviceMemory<Eigen::half> &input_diff_data,
DeviceMemory<Eigen::half> *output_diff_data);
Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor,
const DeviceMemory<float> &input_data,
DeviceMemory<float> *output_data);

View File

@@ -6,8 +6,8 @@
def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive(
name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/d02e6a705c30.tar.gz",
sha256 = "532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf",
url = "https://bitbucket.org/eigen/eigen/get/0c0b79ecd74c.tar.gz",
sha256 = "b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b",
build_file = path_prefix + "eigen.BUILD",
)

View File

@@ -1 +1 @@
#include "eigen-eigen-d02e6a705c30/Eigen/Cholesky"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Cholesky"

View File

@@ -1 +1 @@
#include "eigen-eigen-d02e6a705c30/Eigen/Core"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Core"

View File

@@ -1 +1 @@
#include "eigen-eigen-d02e6a705c30/Eigen/Eigenvalues"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Eigenvalues"

Some files were not shown because too many files have changed in this diff.