diff --git a/eigen.BUILD b/eigen.BUILD index 79bafe65b62..e32f3aab492 100644 --- a/eigen.BUILD +++ b/eigen.BUILD @@ -1,6 +1,6 @@ package(default_visibility = ["//visibility:public"]) -archive_dir = "eigen-eigen-d02e6a705c30" +archive_dir = "eigen-eigen-0c0b79ecd74c" cc_library( name = "eigen", diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake index db409760faa..d3075ab9d23 100644 --- a/tensorflow/contrib/cmake/external/eigen.cmake +++ b/tensorflow/contrib/cmake/external/eigen.cmake @@ -7,7 +7,7 @@ include (ExternalProject) -set(eigen_archive_hash "d02e6a705c30") +set(eigen_archive_hash "0c0b79ecd74c") set(eigen_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR} @@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS ${tensorflow_source_dir}/third_party/eigen3 ) set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz) -set(eigen_HASH SHA256=532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf) +set(eigen_HASH SHA256=b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b) set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen) set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install) diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index e5e2e88dbbf..59d3ea145d8 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -198,6 +198,30 @@ py_test( ], ) +py_test( + name = "experiment_test", + size = "small", + srcs = ["python/learn/tests/experiment_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":learn", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + ], +) + +py_test( + name = "learn_runner_test", + size = "small", + srcs = ["python/learn/tests/learn_runner_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":learn", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + ], +) + py_test( name = "tensor_signature_test", srcs = 
["python/learn/estimators/tensor_signature_test.py"], @@ -439,6 +463,19 @@ py_test( ], ) +py_test( + name = "stability_test", + size = "small", + srcs = ["python/learn/tests/stability_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":learn", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework", + "//tensorflow/python:framework_test_lib", + ], +) + py_binary( name = "inspect_checkpoint", srcs = [ diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py index f94553eec50..375d90960d7 100644 --- a/tensorflow/contrib/learn/python/learn/__init__.py +++ b/tensorflow/contrib/learn/python/learn/__init__.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np +# pylint: disable=wildcard-import from tensorflow.contrib.learn.python.learn import datasets from tensorflow.contrib.learn.python.learn import estimators from tensorflow.contrib.learn.python.learn import graph_actions @@ -30,9 +31,9 @@ from tensorflow.contrib.learn.python.learn import monitors from tensorflow.contrib.learn.python.learn import ops from tensorflow.contrib.learn.python.learn import preprocessing from tensorflow.contrib.learn.python.learn import utils -# pylint: disable=wildcard-import from tensorflow.contrib.learn.python.learn.dataframe import * from tensorflow.contrib.learn.python.learn.estimators import * +from tensorflow.contrib.learn.python.learn.experiment import Experiment from tensorflow.contrib.learn.python.learn.graph_actions import evaluate from tensorflow.contrib.learn.python.learn.graph_actions import infer from tensorflow.contrib.learn.python.learn.graph_actions import NanLossDuringTrainingError diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py index 9b2bbd7562a..5079d6a6296 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py @@ -50,11 +50,11 
@@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier): def input_fn_eval: # returns x, Y pass - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. @@ -145,11 +145,11 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor): def input_fn_eval: # returns x, Y pass - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py index 06e5e9d9df4..c7b33d527ac 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py @@ -397,12 +397,12 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator): ... def input_fn_eval: # returns x, y ... 
- estimator.train(input_fn_train) - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.fit(input_fn=input_fn_train) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py index d405e56bb05..407ca38f662 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py @@ -42,7 +42,14 @@ def _prepare_iris_data_for_logistic_regression(): return iris -def _iris_input_fn(): +def _iris_input_multiclass_fn(): + iris = tf.contrib.learn.datasets.load_iris() + return { + 'feature': tf.constant(iris.data, dtype=tf.float32) + }, tf.constant(iris.target, shape=[150, 1], dtype=tf.int32) + + +def _iris_input_logistic_fn(): iris = _prepare_iris_data_for_logistic_regression() return { 'feature': tf.constant(iris.data, dtype=tf.float32) @@ -64,8 +71,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase): dnn_feature_columns=cont_features, dnn_hidden_units=[3, 3]) - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=_iris_input_logistic_fn, steps=100) + scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testLogisticRegression_TensorData(self): @@ -127,8 +134,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase): dnn_feature_columns=cont_features, dnn_hidden_units=[3, 3]) - classifier.fit(input_fn=_iris_input_fn, steps=100) - 
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=_iris_input_multiclass_fn, steps=100) + scores = classifier.evaluate(input_fn=_iris_input_multiclass_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testWeightColumn(self): @@ -210,8 +217,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase): dnn_hidden_units=[3, 3], dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=0.1)) - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=_iris_input_logistic_fn, steps=100) + scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testCustomOptimizerByString(self): @@ -230,8 +237,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase): dnn_hidden_units=[3, 3], dnn_optimizer='Adagrad') - classifier.fit(input_fn=_iris_input_fn, steps=100) - scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100) + classifier.fit(input_fn=_iris_input_logistic_fn, steps=100) + scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100) self.assertGreater(scores['accuracy'], 0.9) def testPredict(self): diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index e6c2a30134b..20531ab691b 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -111,8 +111,8 @@ class BaseEstimator(sklearn.BaseEstimator): self._model_dir = model_dir if self._model_dir is None: self._model_dir = tempfile.mkdtemp() - logging.info('Using temporary folder as model directory: %s', - self._model_dir) + logging.warning('Using temporary folder as model directory: %s', + self._model_dir) # Create a run configuration if config is None: @@ -135,9 +135,8 @@ class BaseEstimator(sklearn.BaseEstimator): self._graph = 
None - def fit( - self, x=None, y=None, input_fn=None, steps=None, batch_size=None, - monitors=None): + def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, + monitors=None): """Trains a model given training data `x` predictions and `y` targets. Args: @@ -421,21 +420,20 @@ class BaseEstimator(sklearn.BaseEstimator): monitors=None, log_every_steps=100, fail_on_nan_loss=True): - # TODO(wicke): This is a hack and needs to go. - if self._config.execution_mode not in ('all', 'train'): - return + # TODO(wicke): Remove this once Model and associated code are gone. + if hasattr(self._config, 'execution_mode'): + if self._config.execution_mode not in ('all', 'train'): + return - if not self._model_dir: - raise ValueError('Estimator\'s model_dir should be non-empty.') - - # Stagger startup of worker sessions based on task id. - sleep_secs = min(self._config.training_worker_max_startup_secs, - self._config.task * - self._config.training_worker_session_startup_stagger_secs) - if sleep_secs: - logging.info('Waiting %d secs before starting task %d.', sleep_secs, - self._config.task) - time.sleep(sleep_secs) + # Stagger startup of worker sessions based on task id. 
+ sleep_secs = min( + self._config.training_worker_max_startup_secs, + self._config.task * + self._config.training_worker_session_startup_stagger_secs) + if sleep_secs: + logging.info('Waiting %d secs before starting task %d.', sleep_secs, + self._config.task) + time.sleep(sleep_secs) # Device allocation device_fn = device_fn or self._device_fn @@ -454,7 +452,7 @@ class BaseEstimator(sklearn.BaseEstimator): monitors += monitors_lib.get_default_monitors( loss_op=loss_op, summary_op=logging_ops.get_summary_op(), - save_summary_steps=100, + save_summary_steps=self._config.save_summary_steps, summary_writer=graph_actions.get_summary_writer(self._model_dir)) is_chief = self._config.task == 0 @@ -478,8 +476,9 @@ class BaseEstimator(sklearn.BaseEstimator): log_every_steps=log_every_steps, supervisor_is_chief=is_chief, supervisor_master=self._config.master, + supervisor_save_model_secs=self._config.save_checkpoints_secs, feed_fn=feed_fn, - max_steps=steps, + steps=steps, fail_on_nan_loss=fail_on_nan_loss, monitors=monitors) @@ -513,8 +512,9 @@ class BaseEstimator(sklearn.BaseEstimator): feed_fn=None, metrics=None, name=''): - # TODO(wicke): This is a hack and needs to go. - if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'): + # TODO(wicke): Remove this once Model and associated code are gone. + if (hasattr(self._config, 'execution_mode') and + self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset')): return # Check that model has been trained. diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index 515134be932..4884b1290e5 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -46,10 +46,10 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier): ... 
estimator.fit(input_fn=input_fn_train) estimator.evaluate(input_fn=input_fn_eval) - estimator.predict(x) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. @@ -126,10 +126,10 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor): ... estimator.fit(input_fn=input_fn_train) estimator.evaluate(input_fn=input_fn_eval) - estimator.predict(x) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a KeyError: if `weight_column_name` is not None: key=weight_column_name, value=a `Tensor` diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py index 19d37d26b08..bfcf0d3e1f5 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py @@ -24,79 +24,54 @@ from tensorflow.python import GPUOptions class RunConfig(object): - """This class specifies the specific configurations for the run. + """This class specifies the specific configurations for the run.""" - Parameters: - execution_mode: Runners use this flag to execute different tasks, like - training vs evaluation. 'all' (the default) executes both training and - eval. - master: TensorFlow master. Empty string (the default) for local. - task: Task id of the replica running the training (default: 0). - num_ps_replicas: Number of parameter server tasks to use (default: 0). - training_worker_session_startup_stagger_secs: Seconds to sleep between the - startup of each worker task session (default: 5). 
- training_worker_max_startup_secs: Max seconds to wait before starting any - worker (default: 60). - eval_delay_secs: Number of seconds between the beginning of each eval run. - If one run takes more than this amount of time, the next run will start - immediately once that run completes (default 60). - eval_steps: Number of steps to run in each eval (default: 100). - num_cores: Number of cores to be used (default: 4). - verbose: Controls the verbosity, possible values: - 0: the algorithm and debug information is muted. - 1: trainer prints the progress. - 2: log device placement is printed. - gpu_memory_fraction: Fraction of GPU memory used by the process on - each GPU uniformly on the same machine. - tf_random_seed: Random seed for TensorFlow initializers. - Setting this value allows consistency between reruns. - keep_checkpoint_max: The maximum number of recent checkpoint files to keep. - As new files are created, older files are deleted. - If None or 0, all checkpoint files are kept. - Defaults to 5 (that is, the 5 most recent checkpoint files are kept.) - keep_checkpoint_every_n_hours: Number of hours between each checkpoint - to be saved. The default value of 10,000 hours effectively disables - the feature. - - Attributes: - tf_master: Tensorflow master. - tf_config: Tensorflow Session Config proto. - tf_random_seed: Tensorflow random seed. - keep_checkpoint_max: Maximum number of checkpoints to keep. - keep_checkpoint_every_n_hours: Number of hours between each checkpoint. - """ - - # TODO(wicke): Cull unused options. 
+ # TODO(wicke): Move options out once functionality is covered by monitors def __init__(self, - execution_mode='all', master='', task=0, num_ps_replicas=0, - training_worker_session_startup_stagger_secs=5, - training_worker_max_startup_secs=60, - eval_delay_secs=60, - eval_steps=100, num_cores=4, - verbose=1, + log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, + save_summary_steps=100, + save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000): - self.execution_mode = execution_mode + """Constructor. + + Args: + master: TensorFlow master. Empty string (the default) for local. + task: Task id of the replica running the training (default: 0). + num_ps_replicas: Number of parameter server tasks to use (default: 0). + num_cores: Number of cores to be used (default: 4). + log_device_placement: Log the op placement to devices (default: False). + gpu_memory_fraction: Fraction of GPU memory used by the process on + each GPU uniformly on the same machine. + tf_random_seed: Random seed for TensorFlow initializers. + Setting this value allows consistency between reruns. + save_summary_steps: Save summaries every this many steps. + save_checkpoints_secs: Save checkpoints every this many seconds. + keep_checkpoint_max: The maximum number of recent checkpoint files to + keep. As new files are created, older files are deleted. If None or 0, + all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent + checkpoint files are kept.) + keep_checkpoint_every_n_hours: Number of hours between each checkpoint + to be saved. The default value of 10,000 hours effectively disables + the feature. 
+ """ self.master = master self.task = task self.num_ps_replicas = num_ps_replicas - self.training_worker_session_startup_stagger_secs = ( - training_worker_session_startup_stagger_secs) - self.training_worker_max_startup_secs = training_worker_max_startup_secs - self.eval_delay_secs = eval_delay_secs - self.eval_steps = eval_steps gpu_options = GPUOptions( per_process_gpu_memory_fraction=gpu_memory_fraction) - self.tf_config = ConfigProto(log_device_placement=(verbose > 1), + self.tf_config = ConfigProto(log_device_placement=log_device_placement, inter_op_parallelism_threads=num_cores, intra_op_parallelism_threads=num_cores, gpu_options=gpu_options) self.tf_random_seed = tf_random_seed + self.save_summary_steps = save_summary_steps + self.save_checkpoints_secs = save_checkpoints_secs self.keep_checkpoint_max = keep_checkpoint_max self.keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py new file mode 100644 index 00000000000..045dd730550 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -0,0 +1,134 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Experiment class collecting information needed for a single training run.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +from tensorflow.python.platform import tf_logging as logging + + +class Experiment(object): + """Experiment is a class containing all information needed to train a model. + """ + + def __init__(self, estimator, + train_input_fn, + eval_input_fn, + eval_metrics=None): + """Constructor for Experiment. + + Args: + estimator: `Estimator` object. + train_input_fn: function, returns features and targets for training. + eval_input_fn: function, returns features and targets for evaluation. + eval_metrics: `dict` of string, metric function. If `None`, default set + is used. + """ + super(Experiment, self).__init__() + self._estimator = estimator + self._train_input_fn = train_input_fn + self._eval_input_fn = eval_input_fn + self._eval_metrics = eval_metrics + + def train(self, steps=None, monitors=None, delay_secs=0): + """Fit the estimator using the training data. + + Train the estimator for `steps` steps, after waiting for `delay_secs` + seconds. If `steps` is `None`, train forever. + + Args: + steps: Perform this many steps of training. `None`, the default, means + train forever. + monitors: A list of monitors to pass to the `Estimator`'s `fit` function. + delay_secs: Start training after this many seconds. + + Returns: + The trained estimator. + """ + + if delay_secs: + logging.info("Waiting %d secs before starting training.", delay_secs) + time.sleep(delay_secs) + + return self._estimator.fit(input_fn=self._train_input_fn, + steps=steps, monitors=monitors) + + def evaluate(self, steps=None, delay_secs=0): + """Evaluate on the evaluation data. + + Runs evaluation on the evaluation data and returns the result. If `steps` + is given, only run for this many steps. Start the evaluation after + `delay_secs` seconds. 
+ + Args: + steps: Run this many steps of evaluation. + delay_secs: Start evaluating after waiting for this many seconds. + + Returns: + The result of the `evaluate` call to the `Estimator`. + """ + + if delay_secs: + logging.info("Waiting %d secs before starting eval.", delay_secs) + time.sleep(delay_secs) + + return self._estimator.evaluate(input_fn=self._eval_input_fn, + steps=steps, + metrics=self._eval_metrics) + + def _continuous_eval(self, input_fn, steps=1000, delay_secs=0, + throttle_delay_secs=60): + """Run continuous eval on the eval data. + + Run `steps` steps of evaluation on the evaluation data set. This function + starts evaluating after `delay_secs` seconds and then runs no more than one + evaluation per `throttle_delay_secs`. It never returns. + + Args: + input_fn: The input to use for this eval. + steps: Number of steps per evaluation run. + delay_secs: Start evaluating after this many seconds. + throttle_delay_secs: Do not re-evaluate unless the last evaluation was + started at least this many seconds ago. 
+ """ + if delay_secs: + logging.info("Waiting %f secs before starting eval.", delay_secs) + time.sleep(delay_secs) + + while True: + start = time.time() + self._estimator.evaluate(input_fn=input_fn, + steps=steps, + metrics=self._eval_metrics) + duration = time.time() - start + if duration < throttle_delay_secs: + difference = throttle_delay_secs - duration + logging.info("Waiting %f secs before starting next eval run.", + difference) + time.sleep(difference) + + def continuous_eval(self, steps=1000, delay_secs=0, throttle_delay_secs=60): + self._continuous_eval(self._eval_input_fn, steps=steps, + delay_secs=delay_secs, + throttle_delay_secs=throttle_delay_secs) + + def continuous_eval_on_train_data(self, steps=1000, delay_secs=0, + throttle_delay_secs=60): + self._continuous_eval(self._train_input_fn, steps=steps, + delay_secs=delay_secs, + throttle_delay_secs=throttle_delay_secs) diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py index ef57d7ce360..d96f99efa29 100644 --- a/tensorflow/contrib/learn/python/learn/graph_actions.py +++ b/tensorflow/contrib/learn/python/learn/graph_actions.py @@ -30,8 +30,9 @@ from six import reraise from tensorflow.contrib.framework.python.ops import ops as contrib_ops from tensorflow.contrib.framework.python.ops import variables as contrib_variables -from tensorflow.contrib.layers.python.layers import summaries from tensorflow.contrib.learn.python.learn import monitors as monitors_lib +from tensorflow.contrib.learn.python.learn.utils import checkpoints +from tensorflow.core.framework import summary_pb2 from tensorflow.python.client import session as tf_session from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -131,7 +132,7 @@ def train(graph, supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, - max_steps=None, + steps=None, fail_on_nan_loss=True, monitors=None): """Train a model. 
@@ -173,7 +174,7 @@ def train(graph, `supervisor_save_summaries_steps` seconds when training. feed_fn: A function that is called every iteration to produce a `feed_dict` passed to `session.run` calls. Optional. - max_steps: Train until `global_step_tensor` evaluates to this value. + steps: Trains for this many steps (e.g. current global step + `steps`). fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op` evaluates to `NaN`. If false, continue training as if nothing happened. monitors: List of `BaseMonitor` subclass instances. Used for callbacks @@ -192,28 +193,36 @@ def train(graph, if not output_dir: raise ValueError('Output directory should be non-empty.') - global_step_tensor = contrib_variables.assert_or_get_global_step( - graph, global_step_tensor) - if global_step_tensor is None: - raise ValueError('No "global_step" was provided or found in the graph.') + with graph.as_default(): + global_step_tensor = contrib_variables.assert_or_get_global_step( + graph, global_step_tensor) + if global_step_tensor is None: + raise ValueError('No "global_step" was provided or found in the graph.') - summary_writer = (get_summary_writer(output_dir) - if supervisor_is_chief else None) + # Get current step. + try: + start_step = checkpoints.load_variable( + output_dir, global_step_tensor.name) + except (errors.NotFoundError, ValueError): + start_step = 0 - # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors. - if not supervisor_is_chief: - # monitors should run only on the chief. - monitors = [] - elif not monitors: - monitors = monitors_lib.get_default_monitors( - loss_op=loss_op, - summary_op=logging_ops.get_summary_op(), - save_summary_steps=supervisor_save_summaries_steps, - summary_writer=summary_writer) + summary_writer = (get_summary_writer(output_dir) + if supervisor_is_chief else None) - # Start monitors, can create graph parts. 
- for monitor in monitors: - monitor.begin(max_steps=max_steps) + # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors. + if not supervisor_is_chief: + # monitors should run only on the chief. + monitors = [] + elif not monitors: + monitors = monitors_lib.get_default_monitors( + loss_op=loss_op, + summary_op=logging_ops.get_summary_op(), + save_summary_steps=supervisor_save_summaries_steps, + summary_writer=summary_writer) + + # Start monitors, can create graph parts. + for monitor in monitors: + monitor.begin(max_steps=start_step + steps) supervisor = tf_supervisor.Supervisor( graph, @@ -235,6 +244,7 @@ def train(graph, get_current_step = lambda: session.run(global_step_tensor) start_step = get_current_step() + max_steps = start_step + steps last_step = start_step last_log_step = start_step loss_value = None @@ -375,6 +385,28 @@ def _start_queue_runners(session, coord): return threads +def _eval_results_to_str(eval_results): + return ', '.join('%s = %s' % (k, v) for k, v in eval_results.items()) + + +def _write_summary_results(output_dir, eval_results, current_global_step): + """Writes eval results into summary file in given dir.""" + logging.info('Saving evaluation summary for %d step: %s' % ( + current_global_step, _eval_results_to_str(eval_results))) + summary_writer = get_summary_writer(output_dir) + summary = summary_pb2.Summary() + for key in eval_results: + if eval_results[key] is None: + continue + value = summary.value.add() + value.tag = key + if (isinstance(eval_results[key], np.float32) or + isinstance(eval_results[key], float)): + value.simple_value = float(eval_results[key]) + summary_writer.add_summary(summary, current_global_step) + summary_writer.close() + + # TODO(ptucker): Add unit test. def evaluate(graph, output_dir, @@ -424,32 +456,26 @@ def evaluate(graph, eval steps were run. global_step: The global step this evaluation corresponds to. 
""" - global_step_tensor = contrib_variables.assert_or_get_global_step( - graph, global_step_tensor) + with graph.as_default(): + global_step_tensor = contrib_variables.assert_or_get_global_step( + graph, global_step_tensor) - for key, value in eval_dict.items(): - if not summaries.is_summary_tag_unique(key): - continue - if isinstance(value, ops.Tensor): - summaries.summarize_tensor(value, tag=key) + # Create or get summary op, global_step and saver. + saver = _get_saver() + local_init_op = _get_local_init_op() + ready_op = _get_ready_op() - # Create or get summary op, global_step and saver. - summary_op = logging_ops.get_summary_op() - saver = _get_saver() - local_init_op = _get_local_init_op() - ready_op = _get_ready_op() + session_manager = session_manager_lib.SessionManager( + local_init_op=local_init_op, + ready_op=ready_op) + session, initialized = session_manager.recover_session( + master=supervisor_master, + saver=saver, + checkpoint_dir=checkpoint_path) - session_manager = session_manager_lib.SessionManager( - local_init_op=local_init_op, - ready_op=ready_op) - session, initialized = session_manager.recover_session( - master=supervisor_master, - saver=saver, - checkpoint_dir=checkpoint_path) - - # Start queue runners. - coord = coordinator.Coordinator() - threads = _start_queue_runners(session, coord) + # Start queue runners. 
+ coord = coordinator.Coordinator() + threads = _start_queue_runners(session, coord) with session: if not initialized: @@ -488,8 +514,7 @@ def evaluate(graph, duration = time.time() - start_time logging.info('Results after %d steps (%.3f sec/batch): %s.', step, float(duration), - ', '.join('%s = %s' % (k, v) - for k, v in eval_results.items())) + _eval_results_to_str(eval_results)) finally: if eval_results is None or step != eval_step: eval_results = session.run(eval_dict, feed_dict=feed_dict) @@ -498,20 +523,6 @@ def evaluate(graph, coord.request_stop() coord.join(threads, stop_grace_period_secs=120) - # Make our own summary writer and write a summary to the eval dir. - # Only is feed_fn is not provided. - # TODO(ipolosukhin): Convert evaluation to use streaming_metrics, - # then we can save for non feed_fn as well. - if summary_op is not None and feed_fn is None: - summary_writer = None - try: - summary_writer = get_summary_writer(output_dir) - summary_str = session.run(summary_op) - if summary_str: - summary_writer.add_summary(summary_str, current_global_step) - finally: - if summary_writer: - summary_writer.close() # catch OutOfRangeError which is thrown when queue is out of data (and for # other reasons as well). except errors.OutOfRangeError as e: @@ -526,6 +537,9 @@ def evaluate(graph, else: logging.warn('Input iterator is exhausted: %s.', e) + # Save summaries for this evaluation. + _write_summary_results(output_dir, eval_results, current_global_step) + return eval_results, current_global_step diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py new file mode 100644 index 00000000000..97c30d57466 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/learn_runner.py @@ -0,0 +1,75 @@ +# pylint: disable=g-bad-file-header +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Runs an Experiment.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.learn.python.learn.experiment import Experiment +from tensorflow.python.platform import flags +from tensorflow.python.platform import tf_logging as logging + + +FLAGS = flags.FLAGS + + +flags.DEFINE_string('schedule', '', 'Schedule to run for this experiment. ' + 'A schedule identifies a method on the Experiment ' + 'instance returned by the function passed to the ' + 'run() call') +flags.DEFINE_string('output_dir', '', 'Base output directory. Made ' + 'available to the experiment builder function passed ' + 'to run(). All files written by the Experiment are ' + 'expected to be written into this directory.') + + +def run(experiment_fn): + """Make and run an experiment.""" + + if not FLAGS.output_dir: + raise RuntimeError('Must specify an output directory (use --output_dir).') + if not FLAGS.schedule: + raise RuntimeError('Must specify a schedule (use --schedule).') + + if not callable(experiment_fn): + raise TypeError('Experiment builder "%s" is not callable.' 
% + experiment_fn) + + # Call the builder + experiment = experiment_fn(output_dir=FLAGS.output_dir) + if not isinstance(experiment, Experiment): + raise TypeError('Experiment builder did not return an Experiment ' + 'instance, got %s instead.' % type(experiment)) + + # Execute the schedule + taskname = FLAGS.schedule + if not hasattr(experiment, taskname): + logging.error('Schedule references non-existent task %s', taskname) + valid_tasks = [x for x in experiment.__dict__ + if callable(getattr(experiment, x))] + logging.error('Allowed values for this experiment are: %s', valid_tasks) + raise ValueError('Schedule references non-existent task %s', taskname) + + task = getattr(experiment, taskname) + if not callable(task): + logging.error('Schedule references non-callable member %s', taskname) + valid_tasks = [x for x in experiment.__dict__ + if callable(getattr(experiment, x))] + logging.error('Allowed values for this experiment are: %s', valid_tasks) + raise TypeError('Schedule references non-callable member %s', taskname) + + return task() diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py index 066843faeff..f2ce5b0ceb2 100644 --- a/tensorflow/contrib/learn/python/learn/monitors.py +++ b/tensorflow/contrib/learn/python/learn/monitors.py @@ -19,7 +19,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np +import six + +from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import saver from tensorflow.python.training import summary_io @@ -135,17 +140,27 @@ class PrintTensor(EveryN): """ def __init__(self, tensor_names, every_n=100, first_n=1): + """Initializes PrintTensor monitor. + + Args: + tensor_names: `dict` of tag to tensor names or + `iterable` of tensor names (strings). + every_n: Print every N steps. + first_n: Print first N steps. 
+ """ super(PrintTensor, self).__init__(every_n, first_n) + if not isinstance(tensor_names, dict): + tensor_names = {item: item for item in tensor_names} self._tensor_names = tensor_names def every_n_step_begin(self, unused_step): - return self._tensor_names + return list(self._tensor_names.values()) def every_n_step_end(self, step, outputs): stats = [] - for name in self._tensor_names: - if name in outputs: - stats.append("%s = %s" % (name, str(outputs[name]))) + for tag, tensor_name in six.iteritems(self._tensor_names): + if tensor_name in outputs: + stats.append("%s = %s" % (tag, str(outputs[tensor_name]))) logging.info("Step %d: %s" % (step, ", ".join(stats))) @@ -179,14 +194,45 @@ class SummarySaver(EveryN): class ValidationMonitor(EveryN): - """Runs evaluation every n steps. - - Can do early stopping on validation loss if `early_stopping_rounds` provided. + """Runs evaluation of the Estimator every n steps. + Can do early stopping on validation metrics if + `early_stopping_rounds` provided. """ - def __init__(self, x=None, y=None, input_fn=None, - every_n_steps=100, early_stopping_rounds=None): + def __init__(self, x=None, y=None, input_fn=None, batch_size=None, + every_n_steps=100, metrics=None, early_stopping_rounds=None, + early_stopping_metric="loss", + early_stopping_metric_minimize=True, name=None): + """Initializes ValidationMonitor. + + Args: + x: matrix or tensor of shape [n_samples, n_features...]. Can be + iterator that returns arrays of features. The training input + samples for fitting the model. If set, `input_fn` must be `None`. + y: vector or matrix [n_samples] or [n_samples, n_outputs]. Can be + iterator that returns array of targets. The training target values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. + input_fn: Input function. If set, `x`, `y`, and `batch_size` must be + `None`. + batch_size: minibatch size to use on the input, defaults to first + dimension of `x`. 
Must be `None` if `input_fn` is provided. + every_n_steps: Runs this monitor every N steps. + metrics: Dict of metric ops to run. If None, the default metric functions + are used; if {}, no metrics are used. + early_stopping_rounds: If validation metric didn't go down for this many + steps, then stop training. + early_stopping_metric: `str`, name of the metric to early stop. + early_stopping_metric_minimize: `bool`, True if minimize, False + if maximize. For example, minimize `loss` or `mean_squared_error` and + maximize `accuracy` or `f1`. + name: `str`, appended to output sub-folder. If None uses `eval` + sub-folder, else, `eval-%name%` is used to save sum. + + Raises: + ValueError: If both x and input_fn are provided. + """ super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps, first_n_steps=-1) if x is None and input_fn is None: @@ -194,25 +240,64 @@ class ValidationMonitor(EveryN): self.x = x self.y = y self.input_fn = input_fn - self.min_loss_step = 0 - self.min_loss = None + self.batch_size = batch_size + self.metrics = metrics self.early_stopping_rounds = early_stopping_rounds + self.early_stopping_metric = early_stopping_metric + self.early_stopping_metric_minimize = early_stopping_metric_minimize + self.name = name + self._best_value_step = None + self._best_value = None + self._early_stopped = False + self._latest_path = None + self._latest_path_step = None + + @property + def early_stopped(self): + return self._early_stopped + + @property + def best_step(self): + return self._best_value_step + + @property + def best_value(self): + return self._best_value def every_n_step_end(self, step, unused_outputs): + # Check that we are not running evaluation on the same checkpoint. 
+ latest_path = saver.latest_checkpoint(self._estimator.model_dir) + if latest_path == self._latest_path: + logging.info("Skipping evaluation due to same checkpoint %s for step %d " + "as for step %d.", latest_path, step, self._latest_path_step) + return False + self._latest_path = latest_path + self._latest_path_step = step + + # Run evaluation and log it. outputs = self._estimator.evaluate( - x=self.x, y=self.y, input_fn=self.input_fn) + x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size, + metrics=self.metrics, name=self.name) stats = [] for name in outputs: stats.append("%s = %s" % (name, str(outputs[name]))) logging.info("Validation (step %d): %s" % (step, ", ".join(stats))) + + # Early stopping logic. if self.early_stopping_rounds is not None: - if self.min_loss is None or outputs["loss"] < self.min_loss: - self.min_loss = outputs["loss"] - self.min_loss_step = step - stop_now = (step - self.min_loss_step >= self.early_stopping_rounds) + if (self._best_value is None or + (self.early_stopping_metric_minimize and + outputs[self.early_stopping_metric] < self._best_value) or + (not self.early_stopping_metric_minimize and + outputs[self.early_stopping_metric] > self._best_value)): + self._best_value = outputs[self.early_stopping_metric] + self._best_value_step = step + stop_now = (step - self._best_value_step >= self.early_stopping_rounds) if stop_now: - logging.info("Stopping. Best step: {} with loss {}." - .format(self.min_loss_step, self.min_loss)) + logging.info("Stopping. Best step: {} with {} = {}." + .format(self._best_value_step, + self.early_stopping_metric, self._best_value)) + self._early_stopped = True return True return False @@ -220,7 +305,7 @@ class ValidationMonitor(EveryN): class CaptureVariable(EveryN): """Capture a variable value into a `list`. - It's useful for unit testing. + This monitor is useful for unit testing. 
""" def __init__(self, var_name, every_n=100, first_n=1): @@ -239,9 +324,85 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100, output_dir=None, summary_writer=None): monitors = [] if loss_op is not None: - monitors.append(PrintTensor([loss_op.name])) + monitors.append(PrintTensor(tensor_names={"loss": loss_op.name})) if summary_op is not None: monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps, output_dir=output_dir, summary_writer=summary_writer)) return monitors + + +class GraphDump(BaseMonitor): + """Dumps almost all tensors in the graph at every step. + + Note, this is very expensive, prefer `PrintTensor` or `CaptureVariable` if + you are not debugging. + """ + + IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder", + "RandomUniform", "Cast", "RestoreSlice"] + + def __init__(self, ignore_ops=None): + """Initializes GraphDump monitor. + + Args: + ignore_ops: `list` of string names of `Operation`s to ignore. + If `None` GraphDump.IGNORE_OPS list is used. + """ + self.ignore_ops = ignore_ops or GraphDump.IGNORE_OPS + self._data = [] + + def begin(self, max_steps): + self.tensors = [] + graph = ops.get_default_graph() + graph_def = graph.as_graph_def() + for node in graph_def.node: + if node.op in self.ignore_ops: + continue + try: + self.tensors.append(graph.get_tensor_by_name(node.name + ":0")) + except KeyError: + pass + + def step_begin(self, step): + return self.tensors + + def step_end(self, step, outputs): + self._data.append(outputs) + + @property + def data(self): + return self._data + + def compare(self, other_dump, step, atol=1e-06): + """Compares two `GraphDump` monitors and returns differences. + + Args: + other_dump: Another `GraphDump` monitor. + step: `int`, step to compare on. + atol: `float`, absolute tolerance in comparison of floating arrays. + + Returns: + Returns tuple: + matched: `list` of keys that matched. + non_matched: `dict` of keys to difference. 
+ """ + non_matched = {} + matched = [] + for key in self.data[step]: + if not isinstance(key, str) and not isinstance(key, unicode): + continue + value1, value2 = self.data[step][key], other_dump.data[step][key] + if isinstance(value1, str): + continue + if isinstance(value1, np.ndarray): + if not np.allclose(value1, value2, atol=atol): + non_matched[key] = value1 - value2 + else: + matched.append(key) + else: + if value1 != value2: + non_matched[key] = (value1, value2) + else: + matched.append(key) + return matched, non_matched diff --git a/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py b/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py index 818ea38e15c..afaf4ecbf51 100644 --- a/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py @@ -28,6 +28,12 @@ from tensorflow.contrib.learn.python.learn.estimators._sklearn import accuracy_s from tensorflow.contrib.learn.python.learn.estimators._sklearn import train_test_split +def _get_summary_events(folder): + if not tf.gfile.Exists(folder): + raise ValueError('Folder %s doesn\'t exist.' 
% folder) + return tf.contrib.testing.latest_summaries(folder) + + class EarlyStoppingTest(tf.test.TestCase): """Early stopping tests.""" @@ -35,36 +41,54 @@ class EarlyStoppingTest(tf.test.TestCase): random.seed(42) iris = datasets.load_iris() - x_train, x_test, y_train, y_test = train_test_split(iris.data, - iris.target, - test_size=0.2, - random_state=42) + x_train, x_test, y_train, y_test = train_test_split( + iris.data, iris.target, test_size=0.2, random_state=42) - x_train, x_val, y_train, y_val = train_test_split(x_train, - y_train, - test_size=0.2) - val_monitor = learn.monitors.ValidationMonitor(x_val, - y_val, - early_stopping_rounds=100) + x_train, x_val, y_train, y_val = train_test_split( + x_train, y_train, test_size=0.2, random_state=42) + val_monitor = learn.monitors.ValidationMonitor( + x_val, y_val, every_n_steps=50, early_stopping_rounds=100, + early_stopping_metric='accuracy', early_stopping_metric_minimize=False) # classifier without early stopping - overfitting - classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], - n_classes=3, - steps=1000) + classifier1 = learn.TensorFlowDNNClassifier( + hidden_units=[10, 20, 10], n_classes=3, steps=1000) classifier1.fit(x_train, y_train) - accuracy_score(y_test, classifier1.predict(x_test)) + _ = accuracy_score(y_test, classifier1.predict(x_test)) + + # Full 1000 steps, 11 summaries and no evaluation summary. + # 11 summaries = first + every 100 out of 1000 steps. 
+ self.assertEqual(11, len(_get_summary_events(classifier1.model_dir))) + with self.assertRaises(ValueError): + _get_summary_events(classifier1.model_dir + '/eval') # classifier with early stopping - improved accuracy on testing set - classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], - n_classes=3, - steps=1000) + classifier2 = learn.TensorFlowDNNClassifier( + hidden_units=[10, 20, 10], n_classes=3, steps=2000, + config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1)) classifier2.fit(x_train, y_train, monitors=[val_monitor]) - accuracy_score(y_test, classifier2.predict(x_test)) + _ = accuracy_score(y_val, classifier2.predict(x_val)) + _ = accuracy_score(y_test, classifier2.predict(x_test)) + + # Note, this test is unstable, so not checking for equality. + # See stability_test for examples of stability issues. + if val_monitor.early_stopped: + self.assertLess(val_monitor.best_step, 2000) + # Note, due to validation monitor stopping after the best score occur, + # the accuracy at current checkpoint is less. + # TODO(ipolosukhin): Time machine for restoring old checkpoints? + # flaky, still not always best_value better then score2 value. + # self.assertGreater(val_monitor.best_value, score2_val) + + # Early stopped, unstable so checking only < then max. + self.assertLess(len(_get_summary_events(classifier2.model_dir)), 21) + self.assertLess(len(_get_summary_events( + classifier2.model_dir + '/eval')), 4) # TODO(ipolosukhin): Restore this? 
# self.assertGreater(score2, score1, "No improvement using early stopping.") -if __name__ == "__main__": +if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/experiment_test.py b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py new file mode 100644 index 00000000000..1da4b2451e5 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py @@ -0,0 +1,119 @@ +# pylint: disable=g-bad-file-header +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for TaskRunner and Experiment class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import tensorflow as tf + + +class TestEstimator(object): + + def __init__(self): + self.eval_count = 0 + self.fit_count = 0 + + def evaluate(self, **kwargs): + tf.logging.info('evaluate called with args: %s' % kwargs) + self.eval_count += 1 + if self.eval_count > 5: + tf.logging.info('Ran 6 evals. 
Done.') + raise StopIteration() + return [(key, kwargs[key]) for key in sorted(kwargs.keys())] + + def fit(self, **kwargs): + tf.logging.info('fit called with args: %s' % kwargs) + self.fit_count += 1 + return [(key, kwargs[key]) for key in sorted(kwargs.keys())] + + +class ExperimentTest(tf.test.TestCase): + + def test_train(self): + est = TestEstimator() + ex = tf.contrib.learn.Experiment(est, + train_input_fn='train_input', + eval_input_fn='eval_input', + eval_metrics='eval_metrics') + ex.train(delay_secs=0) + self.assertEquals(1, est.fit_count) + self.assertEquals(0, est.eval_count) + + def test_train_delay(self): + est = TestEstimator() + ex = tf.contrib.learn.Experiment(est, + train_input_fn='train_input', + eval_input_fn='eval_input') + for delay in [0, 1, 3]: + start = time.time() + ex.train(delay_secs=delay) + duration = time.time() - start + tf.logging.info('train duration (expected %f): %f', delay, duration) + self.assertTrue(duration > delay - 0.5 and duration < delay + 0.5) + + def test_evaluate(self): + est = TestEstimator() + ex = tf.contrib.learn.Experiment(est, + train_input_fn='train_input', + eval_input_fn='eval_input', + eval_metrics='eval_metrics') + ex.evaluate(steps='steps', delay_secs=0) + self.assertEquals(1, est.eval_count) + self.assertEquals(0, est.fit_count) + + def test_evaluate_delay(self): + est = TestEstimator() + ex = tf.contrib.learn.Experiment(est, + train_input_fn='train_input', + eval_input_fn='eval_input') + for delay in [0, 1, 3]: + start = time.time() + ex.evaluate(delay_secs=delay) + duration = time.time() - start + tf.logging.info('eval duration (expected %f): %f', delay, duration) + self.assertTrue(duration > delay - 0.5 and duration < delay + 0.5) + + def test_continuous_eval(self): + est = TestEstimator() + ex = tf.contrib.learn.Experiment(est, + train_input_fn='train_input', + eval_input_fn='eval_input', + eval_metrics='eval_metrics') + self.assertRaises(StopIteration, ex.continuous_eval, + delay_secs=0, 
throttle_delay_secs=0) + self.assertEquals(6, est.eval_count) + self.assertEquals(0, est.fit_count) + + def test_continuous_eval_throttle_delay(self): + for delay in [0, 1, 2]: + est = TestEstimator() + ex = tf.contrib.learn.Experiment(est, + train_input_fn='train_input', + eval_input_fn='eval_input', + eval_metrics='eval_metrics') + start = time.time() + self.assertRaises(StopIteration, ex.continuous_eval, + delay_secs=0, throttle_delay_secs=delay) + duration = time.time() - start + expected = 5 * delay + tf.logging.info('eval duration (expected %f): %f', expected, duration) + self.assertTrue(duration > expected - 0.5 and duration < expected + 0.5) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py new file mode 100644 index 00000000000..ef030562fff --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py @@ -0,0 +1,107 @@ +# pylint: disable=g-bad-file-header +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""learn_main tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.contrib.learn.python.learn import learn_runner + + +FLAGS = learn_runner.FLAGS + + +class TestExperiment(tf.contrib.learn.Experiment): + + def __init__(self, default=None): + self.default = default + + def simple_task(self): + return "simple_task, default=%s." % self.default + + +# pylint: disable=unused-argument +def build_experiment(output_dir): + tf.logging.info("In default build_experiment.") + return TestExperiment() + + +def build_non_experiment(output_dir): + return "Ceci n'est pas un Experiment." +# pylint: enable=unused-argument + + +class MainTest(tf.test.TestCase): + + def setUp(self): + # Make sure the flags exist. It's unclear why this is necessary. + if not hasattr(FLAGS, "output_dir"): + learn_runner.flags.DEFINE_string("output_dir", "/tmp", "Fake") + if not hasattr(FLAGS, "schedule"): + learn_runner.flags.DEFINE_string("schedule", "simple_task", "Fake") + + def test_run(self): + FLAGS.output_dir = "/tmp" + FLAGS.schedule = "simple_task" + self.assertEqual("simple_task, default=None.", + learn_runner.run(build_experiment)) + + def test_fail_no_output_dir(self): + FLAGS.output_dir = "" + FLAGS.schedule = "simple_test" + self.assertRaisesRegexp(RuntimeError, + "Must specify an output directory", + learn_runner.run, build_experiment) + + def test_fail_no_schedule(self): + FLAGS.output_dir = "/tmp" + FLAGS.schedule = "" + self.assertRaisesRegexp(RuntimeError, "Must specify a schedule", + learn_runner.run, build_experiment) + + def test_fail_non_callable(self): + FLAGS.output_dir = "/tmp" + FLAGS.schedule = "simple_test" + self.assertRaisesRegexp(TypeError, + "Experiment builder .* is not callable", + learn_runner.run, "not callable") + + def test_fail_not_experiment(self): + FLAGS.output_dir = 
"/tmp" + FLAGS.schedule = "simple_test" + self.assertRaisesRegexp( + TypeError, "Experiment builder did not return an Experiment", + learn_runner.run, build_non_experiment) + + def test_fail_non_existent_task(self): + FLAGS.output_dir = "/tmp" + FLAGS.schedule = "mirage" + self.assertRaisesRegexp( + ValueError, "Schedule references non-existent task", + learn_runner.run, build_experiment) + + def test_fail_non_callable_task(self): + FLAGS.output_dir = "/tmp" + FLAGS.schedule = "default" + self.assertRaisesRegexp( + TypeError, "Schedule references non-callable member", + learn_runner.run, build_experiment) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/stability_test.py b/tensorflow/contrib/learn/python/learn/tests/stability_test.py new file mode 100644 index 00000000000..373656cebd4 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/tests/stability_test.py @@ -0,0 +1,88 @@ +# pylint: disable=g-bad-file-header +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Non-linear estimator tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# import random + +import tensorflow as tf + + +class StabilityTest(tf.test.TestCase): + """Tests that estiamtors are reproducible.""" + + def testRandomStability(self): + my_seed, minval, maxval = 42, -0.3333, 0.3333 + with tf.Graph().as_default() as g: + with self.test_session(graph=g) as session: + tf.set_random_seed(my_seed) + x = tf.random_uniform([10, 10], minval=minval, maxval=maxval) + val1 = session.run(x) + with tf.Graph().as_default() as g: + with self.test_session(graph=g) as session: + tf.set_random_seed(my_seed) + x = tf.random_uniform([10, 10], minval=minval, maxval=maxval) + val2 = session.run(x) + self.assertAllClose(val1, val2) + + def testLinearRegression(self): + # TODO(ipolosukhin): This doesn't pass at all, but should... + pass +# random.seed(42) +# boston = tf.contrib.learn.datasets.load_boston() +# regressor = tf.contrib.learn.LinearRegressor() +# regressor.fit(x=boston.data, y=boston.target, steps=1) +# regressor2 = tf.contrib.learn.LinearRegressor() +# regressor2.fit(x=boston.data, y=boston.target, steps=1) +# self.assertAllClose(regressor.weights_, regressor2.weights_) +# self.assertAllClose(regressor.bias_, regressor2.bias_) +# self.assertAllClose(regressor.predict(boston.data), +# regressor2.predict(boston.data), atol=1e-05) + + def testDNNRegression(self): + # TODO(ipolosukhin): This doesn't pass at all, but should... + # Either bugs or just general instability. 
+ pass +# random.seed(42) +# boston = tf.contrib.learn.datasets.load_boston() +# regressor = tf.contrib.learn.DNNRegressor( +# hidden_units=[10], +# optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001)) +# graph_dump = tf.contrib.learn.monitors.GraphDump() +# regressor.fit(x=boston.data, y=boston.target, steps=1, +# monitors=[graph_dump], batch_size=1) +# regressor2 = tf.contrib.learn.DNNRegressor( +# hidden_units=[10], +# optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001)) +# graph_dump2 = tf.contrib.learn.monitors.GraphDump() +# regressor2.fit(x=boston.data, y=boston.target, steps=1, +# monitors=[graph_dump2], batch_size=1) +# _, non_match = graph_dump.compare(graph_dump2, 0, atol=1e-02) +# self.assertEmpty(non_match.keys()) +# for weight1, weight2 in zip(regressor.weights_, regressor2.weights_): +# self.assertAllClose(weight1, weight2) +# for bias1, bias2 in zip(regressor.bias_, regressor2.bias_): +# self.assertAllClose(bias1, bias2) +# self.assertAllClose(regressor.predict(boston.data), +# regressor2.predict(boston.data), atol=1e-05) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc new file mode 100644 index 00000000000..23ce7daeff1 --- /dev/null +++ b/tensorflow/core/graph/quantize_training.cc @@ -0,0 +1,229 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/graph/quantize_training.h" + +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/memory_types.h" +#include "tensorflow/core/framework/log_memory.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/subgraph.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +namespace { +// Node types to rewrite. Insert quantize_and_dequantize op for their inputs. +const std::unordered_set nodes_to_rewrite{ + "MatMul", "Conv2D"}; + +// Contains necessary parameters to convert an edge. +struct EdgeToConvert { + // Edge is not owned here. + const Edge* edge; + int32 num_bits; + bool signed_input; + bool range_given; + float input_min; + float input_max; + + EdgeToConvert(const Edge* e, int32 bits, bool sign, bool range, float min, + float max) { + edge = e; + num_bits = bits; + signed_input = sign; + range_given = range; + input_min = min; + input_max = max; + } +}; + +// Decide if a node is in backward pass by checking if its name is led by +// "gradients". +// TODO(jmchen): Make this check more robust as it is not guaranteed that the +// forward node will not be named with a leading "gradients". +inline bool IsGradientNode(const Graph* graph, const Node* node) { + static const string tag = "gradients"; + return (node->name().compare(0, tag.size(), tag) == 0); +} + +// Find the type of the input to set the parameters for the +// quantize_and_dequantize op. +// Returns true if the root tensor op type is known, false otherwise. 
+bool FindType(const Graph* graph, const Node* node, bool* signed_input, + bool* range_given, float* input_min, float* input_max) { + const string src_op = node->type_string(); + if (src_op == "Const" || src_op == "Variable") { + *signed_input = true; + *range_given = false; + } else if (src_op == "Relu") { + // Range is not given for Relu. + *signed_input = false; + *range_given = false; + } else if (src_op == "Relu6") { + *signed_input = false; + *range_given = true; + *input_min = 0; + *input_max = 6; + } else if (src_op == "Sigmoid") { + *signed_input = false; + *range_given = true; + *input_min = 0; + *input_max = 1; + } else if (src_op == "Tanh") { + *signed_input = true; + *range_given = true; + *input_min = -1; + *input_max = 1; + } else if (src_op == "Reshape") { + // Reshape has 2 inputs and the first one is the tensor. + for (const Edge* edge : node->in_edges()) { + if (edge->src_output() != Graph::kControlSlot && edge->dst_input() == 0) { + FindType(graph, edge->src(), signed_input, range_given, input_min, + input_max); + } + } + } else if (src_op == "Identity" || src_op == "MaxPool" || + src_op == "AvgPool" || src_op == "MaxPool3D" || + src_op == "AvgPool3D") { + // All these Ops only have 1 data input. + for (const Edge* edge : node->in_edges()) { + if (edge->src_output() != Graph::kControlSlot) { + FindType(graph, edge->src(), signed_input, range_given, input_min, + input_max); + } + } + } else { + // Unknown type, could be the model input examples. + // TODO: Set the params for input with user's hint. + *signed_input = true; + *range_given = false; + return false; + } + + return true; +} + +// Insert conversion op, connect it to the graph and remove the old edge. +Status ProcessTargetEdges(Graph* graph, + const std::vector& target_edges) { + // Remember previous convert ops to avoid duplicated conversion on the same + // input. 
+ std::unordered_map name_index; + for (const EdgeToConvert edge : target_edges) { + Node* convert_node; + string name = + strings::StrCat(edge.edge->src()->name(), "/_QuantizeAndDequantize"); + + auto iter = name_index.find(name); + if (iter == name_index.end()) { + TF_RETURN_IF_ERROR(NodeBuilder(name, "_QuantizeAndDequantize") + .Input(edge.edge->src()) + .Attr("signed_input", edge.signed_input) + .Attr("num_bits", edge.num_bits) + .Attr("range_given", edge.range_given) + .Attr("input_min", edge.input_min) + .Attr("input_max", edge.input_max) + .Finalize(graph, &convert_node)); + + name_index[name] = convert_node; + } else { + convert_node = iter->second; + } + + graph->AddEdge(convert_node, 0, edge.edge->dst(), edge.edge->dst_input()); + graph->RemoveEdge(edge.edge); + } + + return Status::OK(); +} + +} // namespace + +Status DoQuantizeTraining(int32 num_bits, Graph* graph) { + if (graph == nullptr) { + return errors::InvalidArgument("Cannot accept empty graph pointer."); + } + + if (num_bits < 1 || num_bits > 63) { + return errors::OutOfRange("num_bits should be in range [1, 63] but is: ", + num_bits); + } + int potential_input = 0; + std::vector target_edges; + for (Node* node : graph->nodes()) { + if (nodes_to_rewrite.find(node->type_string()) != nodes_to_rewrite.end() && + !IsGradientNode(graph, node)) { + // Find out which types are the inputs and convert them accordingly. + // 1. Const/Variable OP: This is quantized as signed tensors with no given + // range. + // 2. Activation OP: Set the range accordingly for different types of + // activations. Currently we handle {Relu, Relu6, Sigmoid, Tanh} + // 3. Identity OP: The quantization parameters depend on its input. + // 4. Pooling OPs: various pooling ops. Also depends on its input. + // 5. Reshape OP: Also depends on the first input to this op. + // 6. Not-Listed-Above OP: If there is only 1 such op, consider it as the + // model input. 
However, if there are >1 unknown ops, then returns an + // error for now to avoid unexpected bahavior. + // Note: The list above might not be a complete list. Please let us + // know if you see the error so we can handle your case. + for (const Edge* edge : node->in_edges()) { + if (edge->src_output() == Graph::kControlSlot) { + // Skip the control dependency input. + continue; + } else { + bool signed_input = false; + bool range_given = false; + float input_min = 0; + float input_max = 0; + bool known_op = FindType(graph, edge->src(), &signed_input, + &range_given, &input_min, &input_max); + if (!known_op) { + // Unknown op is considered as input. + // Only support one input for now. + // TODO: Make this configurable if this is the desirable way to find + // input. + if (potential_input > 0) { + return errors::Unimplemented( + "Find a second unknown op: ", edge->src()->name(), + " with type: ", edge->src()->type_string(), + "; Unknown ops are considered as model input for now and " + "only 1 input is supported currently."); + } + potential_input++; + } + + target_edges.emplace_back(EdgeToConvert( + edge, num_bits, signed_input, range_given, input_min, input_max)); + } + } + } + } + + TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, target_edges)); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/graph/quantize_training.h b/tensorflow/core/graph/quantize_training.h new file mode 100644 index 00000000000..694c491620a --- /dev/null +++ b/tensorflow/core/graph/quantize_training.h @@ -0,0 +1,37 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_ +#define TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// Rewrites graph for quantized training. +// Rewrites the forward pass to include the precision loss with quantization so +// the model can learn to deal with such loss and achieve better accuracy when +// it is quantized later for inference. +// Note that the num_bits should be in [1, 63] and 'g' must not be null. +// +// On success, returns OK. +// +// On failure, returns the error status. Possible errors include: +// - num_bits out of range. +// - g is null. +// - More than one unknown op encountered. +Status DoQuantizeTraining(int32 num_bits, Graph* g); +} // namespace tensorflow + +#endif // TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_ diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc new file mode 100644 index 00000000000..d6663e0a508 --- /dev/null +++ b/tensorflow/core/graph/quantize_training_test.cc @@ -0,0 +1,161 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "tensorflow/core/graph/quantize_training.h" + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +namespace { + +class QuantizeTrainingTest : public ::testing::Test { + protected: + QuantizeTrainingTest() { Reset(); } + void Reset() { g_.reset(new Graph(OpRegistry::Global())); } + + template + Node* Constant(gtl::ArraySlice values, TensorShape shape) { + return test::graph::Constant(g_.get(), test::AsTensor(values, shape)); + } + + std::unique_ptr g_; +}; + +TEST_F(QuantizeTrainingTest, NormalGraph) { + // Construct the following graph + /* + m1 m2 + / \ / \ + Relu Identity c + | | + a b + */ + Reset(); + Graph* g = g_.get(); + Node* a = Constant({1.0, 2.0, 3.0, 4.0}, {2, 2}); + Node* b = Constant({1.0, 2.0, 3.0, 4.0}, {2, 2}); + Node* c = Constant({0.0, 1.0, 
1.0, 0.0}, {2, 2}); + g->AddControlEdge(g->source_node(), a); + g->AddControlEdge(g->source_node(), b); + g->AddControlEdge(g->source_node(), c); + Node* relu = test::graph::Relu(g, a); + Node* identity = test::graph::Identity(g, b); + Node* m1 = test::graph::Matmul(g, relu, identity, false, false); + Node* m2 = test::graph::Matmul(g, identity, c, false, false); + g->AddControlEdge(m1, g->sink_node()); + g->AddControlEdge(m2, g->sink_node()); + + // The graph after the rewriting should be: + // "Q" is the quantize_and_dequantize op. + // Note the Q in the middle is shared by both m1 and m2. + /* + m1 m2 + / \ / \ + Q Q Q + | | | + Relu Identity c + | | + a b + */ + int num_bits = 8; + // 4 edges to modify + TF_ASSERT_OK(DoQuantizeTraining(num_bits, g)); + + // There should be 12 nodes in total including the source and sink nodes. + EXPECT_EQ(12, g->num_nodes()); + // Nodes m1 and m2's inputs should be the quantize_and_dequantize op. + std::vector target_nodes{m1, m2}; + for (Node* n : target_nodes) { + for (Node* in : n->in_nodes()) { + EXPECT_EQ("_QuantizeAndDequantize", in->type_string()); + } + } + + // relu, identity, c should now connect to the quantize_and_dequantize nodes. + std::vector target_inputs{relu, identity, c}; + for (Node* n : target_inputs) { + for (Node* out : n->out_nodes()) { + EXPECT_EQ("_QuantizeAndDequantize", out->type_string()); + } + } + + // Quantize_and_dequantize node for identity should have signed_input==true. + NodeDef identity_Q = identity->out_nodes().begin()->def(); + ASSERT_EQ("true", + SummarizeAttrValue(identity_Q.attr().find("signed_input")->second)); + // Quantize_and_dequantize node for relu should have signed_input==false. + NodeDef relu_Q = relu->out_nodes().begin()->def(); + ASSERT_EQ("false", + SummarizeAttrValue(relu_Q.attr().find("signed_input")->second)); +} + +TEST_F(QuantizeTrainingTest, WithBackwardNodes) { + // Construct the same graph plus another backward Matmul. 
+ Reset(); + Graph* g = g_.get(); + Node* a = Constant({1.0, 2.0, 3.0, 4.0}, {2, 2}); + Node* b = Constant({1.0, 2.0, 3.0, 4.0}, {2, 2}); + Node* c = Constant({0.0, 1.0, 1.0, 0.0}, {2, 2}); + g->AddControlEdge(g->source_node(), a); + g->AddControlEdge(g->source_node(), b); + g->AddControlEdge(g->source_node(), c); + Node* relu = test::graph::Relu(g, a); + Node* identity = test::graph::Identity(g, b); + Node* m1 = test::graph::Matmul(g, relu, identity, false, false); + Node* m2 = test::graph::Matmul(g, identity, c, false, false); + g->AddControlEdge(m1, g->sink_node()); + g->AddControlEdge(m2, g->sink_node()); + + // Add a Matmul node with name starting with "gradients". + Node* backward_m; + TF_ASSERT_OK(NodeBuilder(g->NewName("gradients/n"), "MatMul") + .Input(m1) + .Input(m2) + .Attr("transpose_a", true) + .Attr("transpose_b", false) + .Finalize(g, &backward_m)); + g->AddControlEdge(backward_m, g->sink_node()); + + int num_bits = 8; + // Still 4 changes since the inputs of backward node will not be converted. + TF_ASSERT_OK(DoQuantizeTraining(num_bits, g)); + + // Nodes m1 and m2's inputs should now be the quantize_and_dequantize op. 
+ EXPECT_EQ(13, g->num_nodes()); + EXPECT_EQ(2, m2->num_inputs()); +} + +#undef SIMPLE_GRAPH + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index 0d0a84db799..ec878437dc8 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -384,6 +384,15 @@ Node* GetSessionTensor(Graph* g, Node* in) { return ret; } +Node* Relu(Graph* g, Node* in) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Relu") + .Input(in, 0) + .Attr("T", DT_FLOAT) + .Finalize(g, &ret)); + return ret; +} + void ToGraphDef(Graph* g, GraphDef* gdef) { g->ToGraphDef(gdef); } } // end namespace graph diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h index 511f6b4310c..bc4863563f9 100644 --- a/tensorflow/core/graph/testlib.h +++ b/tensorflow/core/graph/testlib.h @@ -169,6 +169,9 @@ Node* GetSessionTensor(Graph* g, Node* in); // given in "tensors". Node* Concat(Graph* g, Node* concat_dim, gtl::ArraySlice tensors); +// Add a Relu node in "g". +Node* Relu(Graph* g, Node* in); + } // end namespace graph } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc index fc7f6d1a5a4..d666546602e 100644 --- a/tensorflow/core/kernels/avgpooling_op.cc +++ b/tensorflow/core/kernels/avgpooling_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" @@ -99,10 +100,12 @@ class AvgPoolingOp : public UnaryOp { TensorFormat data_format_; }; -REGISTER_KERNEL_BUILDER(Name("AvgPool") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - AvgPoolingOp); +REGISTER_KERNEL_BUILDER( + Name("AvgPool").Device(DEVICE_CPU).TypeConstraint("T"), + AvgPoolingOp); +REGISTER_KERNEL_BUILDER( + Name("AvgPool").Device(DEVICE_CPU).TypeConstraint("T"), + AvgPoolingOp); #if GOOGLE_CUDA template @@ -181,14 +184,17 @@ namespace functor { const Eigen::PaddingType& padding); \ extern template struct SpatialAvgPooling; +DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); #undef DECLARE_GPU_SPEC } // namespace functor -REGISTER_KERNEL_BUILDER(Name("AvgPool") - .Device(DEVICE_GPU) - .TypeConstraint("T"), - AvgPoolingOp); +REGISTER_KERNEL_BUILDER( + Name("AvgPool").Device(DEVICE_GPU).TypeConstraint("T"), + AvgPoolingOp); +REGISTER_KERNEL_BUILDER( + Name("AvgPool").Device(DEVICE_GPU).TypeConstraint("T"), + AvgPoolingOp); #endif // GOOGLE_CUDA // The operation to compute AvgPool gradients. 
@@ -300,7 +306,7 @@ class AvgPoolingGradOp : public OpKernel { GetBroadcastSize(c, in_cols, window_cols, col_stride, pad_cols, &cindex, &csize)); - T divide_coeff = 1.0 / (rsize * csize); + T divide_coeff(1.0 / (rsize * csize)); int64 output_index = (b * out_backprop_rows + r) * out_backprop_cols + c; for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) { @@ -337,16 +343,16 @@ class AvgPoolingGradOp : public OpKernel { TensorFormat data_format_; }; -REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") - .Device(DEVICE_CPU) - .TypeConstraint("T") - .HostMemory("orig_input_shape"), - AvgPoolingGradOp); -REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") - .Device(DEVICE_CPU) - .TypeConstraint("T") - .HostMemory("orig_input_shape"), - AvgPoolingGradOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .HostMemory("orig_input_shape"), \ + AvgPoolingGradOp); + +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +TF_CALL_half(REGISTER_CPU_KERNEL); #if GOOGLE_CUDA @@ -416,6 +422,12 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") .HostMemory("orig_input_shape") .Label("cudnn"), AvgPoolingGradOp); +REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .HostMemory("orig_input_shape") + .Label("cudnn"), + AvgPoolingGradOp); // A custom GPU kernel based AvgPoolingGrad implementation. It includes the // padding as the candidates for the pooling operation. 
@@ -532,6 +544,11 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") .TypeConstraint("T") .HostMemory("orig_input_shape"), AvgPoolingGradOpCustomGPUKernel); +REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .HostMemory("orig_input_shape"), + AvgPoolingGradOpCustomGPUKernel); #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc index 9e894b1734d..a190b2168a7 100644 --- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc @@ -33,6 +33,7 @@ typedef Eigen::GpuDevice GPUDevice; #define DEFINE_GPU_KERNELS(T) \ template struct functor::SpatialAvgPooling; +DEFINE_GPU_KERNELS(Eigen::half) DEFINE_GPU_KERNELS(float) #undef DEFINE_GPU_KERNELS @@ -57,7 +58,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads, const int phend = min(h / stride_h + 1, pooled_height); const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); - dtype gradient = 0; + dtype gradient(0); const dtype* const top_diff_slice = top_diff + n * pooled_height * pooled_width * channels + c; for (int ph = phstart; ph < phend; ++ph) { @@ -104,6 +105,12 @@ template bool RunAvePoolBackwardNHWC( const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_t, const int pad_l, float* const bottom_diff, const GPUDevice& d); +template bool RunAvePoolBackwardNHWC( + const Eigen::half* const top_diff, const int num, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_t, const int pad_l, + Eigen::half* const bottom_diff, const GPUDevice& d); } // end namespace tensorflow diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc index 
a5f526780f2..f4aa7596435 100644 --- a/tensorflow/core/kernels/batch_norm_op.cc +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -159,9 +159,9 @@ class BatchNormGradOp : public OpKernel { .TypeConstraint("T"), \ BatchNormOp); -REGISTER_KERNEL(Eigen::half); -REGISTER_KERNEL(float); -REGISTER_KERNEL(double); +TF_CALL_half(REGISTER_KERNEL); +TF_CALL_float(REGISTER_KERNEL); +TF_CALL_double(REGISTER_KERNEL); #undef REGISTER_KERNEL #if GOOGLE_CUDA @@ -179,8 +179,8 @@ namespace functor { #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T); -DECLARE_GPU_SPECS(Eigen::half); -DECLARE_GPU_SPECS(float); +TF_CALL_half(DECLARE_GPU_SPECS); +TF_CALL_float(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPEC } // namespace functor @@ -191,8 +191,8 @@ DECLARE_GPU_SPECS(float); .TypeConstraint("T"), \ BatchNormOp); -REGISTER_GPU_KERNEL(Eigen::half); -REGISTER_GPU_KERNEL(float); +TF_CALL_half(REGISTER_GPU_KERNEL); +TF_CALL_float(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL #endif // GOOGLE_CUDA @@ -203,9 +203,9 @@ REGISTER_GPU_KERNEL(float); .TypeConstraint("T"), \ BatchNormGradOp); -REGISTER_KERNEL(Eigen::half); -REGISTER_KERNEL(float); -REGISTER_KERNEL(double); +TF_CALL_half(REGISTER_KERNEL); +TF_CALL_float(REGISTER_KERNEL); +TF_CALL_double(REGISTER_KERNEL); #undef REGISTER_KERNEL #if GOOGLE_CUDA @@ -226,8 +226,8 @@ namespace functor { #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T); -DECLARE_GPU_SPECS(Eigen::half); -DECLARE_GPU_SPECS(float); +TF_CALL_half(DECLARE_GPU_SPECS); +TF_CALL_float(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPEC } // namespace functor @@ -238,8 +238,8 @@ DECLARE_GPU_SPECS(float); .TypeConstraint("T"), \ BatchNormGradOp); -REGISTER_GPU_KERNEL(Eigen::half); -REGISTER_GPU_KERNEL(float); +TF_CALL_half(REGISTER_GPU_KERNEL); +TF_CALL_float(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc index bc322ed139f..1d8874b4dfe 100644 --- 
a/tensorflow/core/kernels/check_numerics_op.cc +++ b/tensorflow/core/kernels/check_numerics_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" @@ -182,18 +183,14 @@ class CheckNumericsOp : public OpKernel { } // namespace -REGISTER_KERNEL_BUILDER(Name("CheckNumerics") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - CheckNumericsOp); -REGISTER_KERNEL_BUILDER(Name("CheckNumerics") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - CheckNumericsOp); -REGISTER_KERNEL_BUILDER(Name("CheckNumerics") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - CheckNumericsOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint("T"), \ + CheckNumericsOp); +TF_CALL_half(REGISTER_CPU_KERNEL); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); + #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER(Name("CheckNumerics") .Device(DEVICE_GPU) diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc index f9b7ed6ace1..014a3d78a94 100644 --- a/tensorflow/core/kernels/conv_grad_ops.cc +++ b/tensorflow/core/kernels/conv_grad_ops.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" @@ -622,35 +623,24 @@ class Conv2DCustomBackpropInputOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp); }; -REGISTER_KERNEL_BUILDER( - Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint("T"), - Conv2DCustomBackpropInputOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - Conv2DCustomBackpropInputOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint("T"), \ + Conv2DCustomBackpropInputOp); \ + REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint("T"), \ + Conv2DCustomBackpropInputOp); \ + REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \ + .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ + .TypeConstraint("T"), \ + Conv2DFastBackpropInputOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") - .Device(DEVICE_CPU) - .Label("custom") - .TypeConstraint("T"), - Conv2DCustomBackpropInputOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") - .Device(DEVICE_CPU) - .Label("custom") - .TypeConstraint("T"), - Conv2DCustomBackpropInputOp); - -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") - .Device(DEVICE_CPU) - .Label("eigen_tensor") - .TypeConstraint("T"), - Conv2DFastBackpropInputOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") - .Device(DEVICE_CPU) - .Label("eigen_tensor") - .TypeConstraint("T"), - Conv2DFastBackpropInputOp); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS template class Conv2DFastBackpropFilterOp : public OpKernel { @@ -867,35 +857,24 
@@ class Conv2DCustomBackpropFilterOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp); }; -REGISTER_KERNEL_BUILDER( - Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint("T"), - Conv2DCustomBackpropFilterOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - Conv2DCustomBackpropFilterOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint("T"), \ + Conv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint("T"), \ + Conv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ + .TypeConstraint("T"), \ + Conv2DFastBackpropFilterOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") - .Device(DEVICE_CPU) - .Label("custom") - .TypeConstraint("T"), - Conv2DCustomBackpropFilterOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") - .Device(DEVICE_CPU) - .Label("custom") - .TypeConstraint("T"), - Conv2DCustomBackpropFilterOp); - -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") - .Device(DEVICE_CPU) - .Label("eigen_tensor") - .TypeConstraint("T"), - Conv2DFastBackpropFilterOp); -REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") - .Device(DEVICE_CPU) - .Label("eigen_tensor") - .TypeConstraint("T"), - Conv2DFastBackpropFilterOp); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS // GPU definitions of both ops. #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index d75bc026cd3..af6048a98bf 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" @@ -194,14 +195,13 @@ class Conv3DBackpropInputOp : public OpKernel { Padding padding_; }; -REGISTER_KERNEL_BUILDER( - Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint("T"), - Conv3DBackpropInputOp); -#ifndef IS_MOBILE_PLATFORM -REGISTER_KERNEL_BUILDER( - Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint("T"), - Conv3DBackpropInputOp); -#endif +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint("T"), \ + Conv3DBackpropInputOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL // Backprop for filter. template @@ -303,14 +303,13 @@ class Conv3DBackpropFilterOp : public OpKernel { Padding padding_; }; -REGISTER_KERNEL_BUILDER( - Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint("T"), - Conv3DBackpropFilterOp); -#ifndef IS_MOBILE_PLATFORM -REGISTER_KERNEL_BUILDER( - Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint("T"), - Conv3DBackpropFilterOp); -#endif +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint("T"), \ + Conv3DBackpropFilterOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL // GPU definitions of both ops. #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 3a8ecacf93b..c64c6cd35c1 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" @@ -245,12 +246,13 @@ class Conv2DOp : public BinaryOp { TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp); }; -REGISTER_KERNEL_BUILDER( - Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), - Conv2DOp); -REGISTER_KERNEL_BUILDER( - Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), - Conv2DOp); +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ + Conv2DOp); + +TF_CALL_half(REGISTER_CPU); +TF_CALL_float(REGISTER_CPU); #if GOOGLE_CUDA int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb, diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index 35dd92e3159..697b3f62679 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" @@ -120,15 +121,13 @@ class Conv3DOp : public BinaryOp { Padding padding_; }; -REGISTER_KERNEL_BUILDER( - Name("Conv3D").Device(DEVICE_CPU).TypeConstraint("T"), - Conv3DOp); - -#ifndef IS_MOBILE_PLATFORM -REGISTER_KERNEL_BUILDER( - Name("Conv3D").Device(DEVICE_CPU).TypeConstraint("T"), - Conv3DOp); -#endif +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv3D").Device(DEVICE_CPU).TypeConstraint("T"), \ + Conv3DOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index ffc6eeb809f..161c88d8145 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -566,16 +566,14 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp); }; -REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DepthwiseConv2dNativeBackpropInputOp); - -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNativeBackpropInput") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DepthwiseConv2dNativeBackpropInputOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DepthwiseConv2dNativeBackpropInputOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") @@ -951,17 
+949,15 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp); }; -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNativeBackpropFilter") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DepthwiseConv2dNativeBackpropFilterOp); - -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNativeBackpropFilter") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DepthwiseConv2dNativeBackpropFilterOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("DepthwiseConv2dNativeBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DepthwiseConv2dNativeBackpropFilterOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index c96365f4f02..4bee59aecd6 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -376,14 +376,13 @@ class DepthwiseConv2dNativeOp : public BinaryOp { TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp); }; -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint("T"), - DepthwiseConv2dNativeOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint("T"), \ + DepthwiseConv2dNativeOp); -REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DepthwiseConv2dNativeOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc index 5fb2c9e471e..a825c5bb10f 100644 --- a/tensorflow/core/kernels/draw_bounding_box_op.cc +++ b/tensorflow/core/kernels/draw_bounding_box_op.cc @@ -143,13 
+143,11 @@ class DrawBoundingBoxesOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER( - Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint("T"), - DrawBoundingBoxesOp); - -REGISTER_KERNEL_BUILDER(Name("DrawBoundingBoxes") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DrawBoundingBoxesOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint("T"), \ + DrawBoundingBoxesOp); +TF_CALL_half(REGISTER_CPU_KERNEL); +TF_CALL_float(REGISTER_CPU_KERNEL); } // namespace tensorflow diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h index 349cbf9d0e8..aa3b2748935 100644 --- a/tensorflow/core/kernels/eigen_pooling.h +++ b/tensorflow/core/kernels/eigen_pooling.h @@ -309,7 +309,7 @@ struct AvgPoolMeanReducer { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) { typedef typename packet_traits::type Packet; - packetCount_ = pset1(0.0); + packetCount_ = pset1(T(0.0)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index 6d956f4e3ed..ac1a5fea4d2 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/fill_functor.h" #if GOOGLE_CUDA @@ -202,17 +203,19 @@ struct MatMulFunctor { .Label("cublas"), \ MatMulOp) -REGISTER_CPU(float); -REGISTER_CPU(double); -REGISTER_CPU(int32); -REGISTER_CPU(Eigen::half); -REGISTER_CPU(complex64); -REGISTER_CPU(complex128); +TF_CALL_float(REGISTER_CPU); +TF_CALL_double(REGISTER_CPU); +TF_CALL_half(REGISTER_CPU); + +TF_CALL_int32(REGISTER_CPU); +TF_CALL_complex64(REGISTER_CPU); +TF_CALL_complex128(REGISTER_CPU); + #if GOOGLE_CUDA -REGISTER_GPU(float); -REGISTER_GPU(double); +TF_CALL_float(REGISTER_GPU); +TF_CALL_double(REGISTER_GPU); #if CUDA_VERSION >= 7050 -REGISTER_GPU(Eigen::half); +TF_CALL_half(REGISTER_GPU); #endif #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index 5e3f2196992..f883acf3d6a 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -160,7 +160,7 @@ static void SpatialMaxPoolWithArgMaxHelper( const int in_end = limit * in_size; EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1, in_end - in_start); - in_shard.setConstant(0); + in_shard.setConstant(T(0)); // Backpropagate. const int out_size = out_height * out_width * depth; @@ -187,8 +187,12 @@ static void SpatialMaxPoolWithArgMaxHelper( params.tensor_in_batch, shard_cost, shard); } -REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU), - MaxPoolingOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPool").Device(DEVICE_CPU).TypeConstraint("T"), + MaxPoolingOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPool").Device(DEVICE_CPU).TypeConstraint("T"), + MaxPoolingOp); #if GOOGLE_CUDA // Forward declarations for the functor specializations for GPU. @@ -212,6 +216,7 @@ DECLARE_GPU_SPEC(float); // kernel_label_map. 
REGISTER_KERNEL_BUILDER(Name("MaxPool") .Device(DEVICE_GPU) + .TypeConstraint("T") .Label("eigen_tensor"), MaxPoolingOp); #endif // GOOGLE_CUDA @@ -297,11 +302,16 @@ class MaxPoolingGradOp : public OpKernel { TensorFormat data_format_; }; -REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU), - MaxPoolingGradOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint("T"), + MaxPoolingGradOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint("T"), + MaxPoolingGradOp); #ifdef GOOGLE_CUDA +template static void MaxPoolingBackwardCustomKernel( OpKernelContext* context, const std::vector& size, const std::vector& stride, Padding padding, const Tensor* tensor_in, @@ -318,12 +328,12 @@ static void MaxPoolingBackwardCustomKernel( } MaxPoolBackwardNoMask( - tensor_in->flat().data(), params.tensor_in_batch, + tensor_in->flat().data(), params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols, params.depth, params.out_height, params.out_width, params.window_rows, params.window_cols, params.row_stride, params.col_stride, params.pad_rows, - params.pad_cols, out_backprop.flat().data(), - output->flat().data(), context->eigen_device()); + params.pad_cols, out_backprop.flat().data(), + output->flat().data(), context->eigen_device()); } template @@ -378,8 +388,8 @@ class MaxPoolingGradOp : public OpKernel { } else { CHECK(data_format_ == FORMAT_NHWC) << "Non-Cudnn MaxPoolGrad only supports NHWC format"; - MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_, - &tensor_in, out_backprop, output_shape); + MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_, + &tensor_in, out_backprop, output_shape); } } @@ -391,8 +401,12 @@ class MaxPoolingGradOp : public OpKernel { bool use_dnn_; }; -REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU), - MaxPoolingGradOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint("T"), + MaxPoolingGradOp); 
+REGISTER_KERNEL_BUILDER( + Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint("T"), + MaxPoolingGradOp); #endif // GOOGLE_CUDA @@ -625,8 +639,12 @@ struct LaunchMaxPoolingNoMask { } }; -REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU), - MaxPoolingNoMaskOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPool").Device(DEVICE_GPU).TypeConstraint("T"), + MaxPoolingNoMaskOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPool").Device(DEVICE_GPU).TypeConstraint("T"), + MaxPoolingNoMaskOp); template struct LaunchMaxPoolingWithArgmax { @@ -649,8 +667,14 @@ struct LaunchMaxPoolingWithArgmax { REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") .Device(DEVICE_GPU) - .TypeConstraint("Targmax"), + .TypeConstraint("Targmax") + .TypeConstraint("T"), MaxPoolingWithArgmaxOp); +REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint("Targmax") + .TypeConstraint("T"), + MaxPoolingWithArgmaxOp); template struct LaunchMaxPoolingGradWithArgmax { @@ -675,10 +699,18 @@ struct LaunchMaxPoolingGradWithArgmax { } }; -REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") - .Device(DEVICE_GPU) - .TypeConstraint("Targmax"), - MaxPoolingGradWithArgmaxOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPoolGradWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("Targmax"), + MaxPoolingGradWithArgmaxOp); +REGISTER_KERNEL_BUILDER( + Name("MaxPoolGradWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("Targmax"), + MaxPoolingGradWithArgmaxOp); #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc index 1bdca42f4e7..91b50b1e111 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -110,7 +110,7 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data, int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - dtype maxval = 
-FLT_MAX; + dtype maxval = Eigen::NumTraits::lowest(); int maxidx = -1; const dtype* bottom_data_n = bottom_data + n * height * width * channels; for (int h = hstart; h < hend; ++h) { @@ -149,7 +149,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC( int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - dtype maxval = -FLT_MAX; + dtype maxval = Eigen::NumTraits::lowest(); int maxidx = -1; const dtype* bottom_data_n = bottom_data + n * height * width * channels; for (int h = hstart; h < hend; ++h) { @@ -165,8 +165,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC( // Atomically accumulate the bottom diff. The index could still be // uninitialized, if all the bottom_data are NaN. if (maxidx != -1) { - atomicAdd(bottom_diff + n * height * width * channels + maxidx, - top_diff[index]); + CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx, + top_diff[index]); } } } @@ -185,8 +185,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC( // bottom_offset: the pre-computed per-image offset of the maxpool input. // This is equal to H*W*C. // bottom_diff: the gradient with respect to the input. -// This function relies on atomicAdd to avoid race conditions. Also, before the -// kernel is run, you will need to make sure that bottom_diff is filled with +// This function relies on CudaAtomicAdd to avoid race conditions. Also, before +// the kernel is run, you will need to make sure that bottom_diff is filled with // zero first. 
template __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff, @@ -194,8 +194,8 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff, const int bottom_offset, dtype* bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int image_id = (index / top_offset); - atomicAdd(bottom_diff + image_id * bottom_offset + mask[index], - top_diff[index]); + CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index], + top_diff[index]); } } @@ -219,6 +219,23 @@ bool MaxPoolForwardWithOptionalArgmax( return d.ok(); } +bool MaxPoolForwardWithOptionalArgmax( + const Eigen::half* bottom_data, const int batch, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_t, const int pad_l, + Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d) { + const int kThreadsPerBlock = 1024; + const int output_size = batch * channels * pooled_height * pooled_width; + + MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>( + output_size, bottom_data, height, width, channels, pooled_height, + pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l, + top_data, mask); + return d.ok(); +} + bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch, const int height, const int width, const int channels, const int pooled_height, @@ -243,6 +260,30 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch, return d.ok(); } +bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, const int pad_l, + const Eigen::half* top_diff, Eigen::half* bottom_diff, + const Eigen::GpuDevice& 
d) { + const int kThreadsPerBlock = 1024; + const int bottom_size = batch * channels * height * width; + const int top_size = batch * channels * pooled_height * pooled_width; + + SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff); + + MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) / + kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>( + top_size, bottom_data, height, width, channels, pooled_height, + pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l, + top_diff, bottom_diff); + return d.ok(); +} + bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, const float* top_diff, const int64* mask, const int top_offset, const int bottom_offset, @@ -256,12 +297,27 @@ bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, return d.ok(); } +bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, + const Eigen::half* top_diff, const int64* mask, + const int top_offset, const int bottom_offset, + Eigen::half* bottom_diff, + const Eigen::GpuDevice& d) { + const int kThreadsPerBlock = 1024; + SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); + MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>( + output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); + return d.ok(); +} + typedef Eigen::GpuDevice GPUDevice; #define DEFINE_GPU_KERNELS(T) \ template struct functor::SpatialMaxPooling; DEFINE_GPU_KERNELS(float) +DEFINE_GPU_KERNELS(Eigen::half) #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h index 05e865f81c0..d1c73a372e9 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.h +++ b/tensorflow/core/kernels/maxpooling_op_gpu.h @@ -37,11 +37,24 @@ bool 
MaxPoolForwardWithOptionalArgmax( const int stride_h, const int stride_w, const int pad_t, const int pad_l, float* top_data, int64* mask, const Eigen::GpuDevice& d); +bool MaxPoolForwardWithOptionalArgmax( + const Eigen::half* bottom_data, const int batch, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_t, const int pad_l, + Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d); + bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, const float* top_diff, const int64* mask, const int top_offset, const int bottom_offset, float* bottom_diff, const Eigen::GpuDevice& d); +bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, + const Eigen::half* top_diff, const int64* mask, + const int top_offset, const int bottom_offset, + Eigen::half* bottom_diff, + const Eigen::GpuDevice& d); + bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch, const int height, const int width, const int channels, const int pooled_height, @@ -51,6 +64,15 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch, const float* top_diff, float* bottom_diff, const Eigen::GpuDevice& d); +bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, const int pad_l, + const Eigen::half* top_diff, Eigen::half* bottom_diff, + const Eigen::GpuDevice& d); + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index c2d2cf3b65e..2f8b4515d09 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -104,10 
+104,8 @@ class PackOp : public OpKernel { PackOp) TF_CALL_ALL_TYPES(REGISTER_PACK); -REGISTER_PACK(quint8); -REGISTER_PACK(qint8); -REGISTER_PACK(qint32); -REGISTER_PACK(bfloat16); +TF_CALL_QUANTIZED_TYPES(REGISTER_PACK); +TF_CALL_bfloat16(REGISTER_PACK); #undef REGISTER_PACK diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index 3867cc824f8..f5d7771af7f 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -124,6 +124,7 @@ namespace functor { extern template struct TransformDepth; DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(Eigen::half); #undef DECLARE_GPU_SPEC } // namespace functor @@ -368,7 +369,9 @@ void DnnPoolingGradOp::Compute( } } +template class DnnPoolingOp; template class DnnPoolingOp; +template class DnnPoolingGradOp; template class DnnPoolingGradOp; #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h index 138d1cb2ca6..593c90b0097 100644 --- a/tensorflow/core/kernels/pooling_ops_common.h +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -311,7 +311,7 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output, } } } - DCHECK_GT(out_count.minCoeff(), 0); + DCHECK_GT(out_count.minCoeff(), T(0)); out_mat.array().rowwise() /= out_count.transpose().array(); } diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc index 2c6e799a2db..98887ce9c3e 100644 --- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" @@ -107,15 +108,14 @@ struct QuantizeAndDequantizeOneScaleFunctor { }; } // namespace functor -REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - QuantizeAndDequantizeOp); - -REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - QuantizeAndDequantizeOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + QuantizeAndDequantizeOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index b42ef1d31d5..ed559142db9 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -451,11 +451,11 @@ class MultinomialOp : public OpKernel { .TypeConstraint("Tout"), \ RandomUniformIntOp); -REGISTER(Eigen::half); -REGISTER(float); -REGISTER(double); -REGISTER_INT(int32); -REGISTER_INT(int64); +TF_CALL_half(REGISTER); +TF_CALL_float(REGISTER); +TF_CALL_double(REGISTER); +TF_CALL_int32(REGISTER_INT); +TF_CALL_int64(REGISTER_INT); #undef REGISTER #undef REGISTER_INT @@ -505,11 +505,11 @@ REGISTER_INT(int64); .TypeConstraint("Tout"), \ RandomUniformIntOp); -REGISTER(Eigen::half); -REGISTER(float); -REGISTER(double); -REGISTER_INT(int32); -REGISTER_INT(int64); +TF_CALL_half(REGISTER); +TF_CALL_float(REGISTER); +TF_CALL_double(REGISTER); +TF_CALL_int32(REGISTER_INT); +TF_CALL_int64(REGISTER_INT); #undef REGISTER #undef REGISTER_INT diff --git 
a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc index 9539f93644a..26af8c9c2c6 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -25,8 +25,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); // NOTE: We should have mean(complex64,int32), too. But that needs to // change Eigen::internal::MeanReducer to cast int to complex. // We don't see immediate need of mean(complex64,int32) anyway. -REGISTER_CPU_KERNELS(complex64); -REGISTER_CPU_KERNELS(complex128); +TF_CALL_complex64(REGISTER_CPU_KERNELS); +TF_CALL_complex128(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index 9df91d13811..606c6c8a9ce 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -159,15 +159,12 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL); #undef REGISTER_KERNEL -REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ResizeBilinearOpGrad); -REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ResizeBilinearOpGrad); -REGISTER_KERNEL_BUILDER( - Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint("T"), - ResizeBilinearOpGrad); +#define REGISTER_CPU_GRAD_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ResizeBilinearOpGrad); +TF_CALL_half(REGISTER_CPU_GRAD_KERNEL); +TF_CALL_float(REGISTER_CPU_GRAD_KERNEL); +TF_CALL_double(REGISTER_CPU_GRAD_KERNEL); + } // namespace tensorflow diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index 9d4c3a2a556..18fb4805156 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -97,13 +97,13 @@ class ReverseOp : public OpKernel { 
.HostMemory("dims"), \ ReverseOp) -REGISTER_KERNEL(uint8); -REGISTER_KERNEL(int8); -REGISTER_KERNEL(int32); -REGISTER_KERNEL(bool); -REGISTER_KERNEL(Eigen::half); -REGISTER_KERNEL(float); -REGISTER_KERNEL(double); +TF_CALL_uint8(REGISTER_KERNEL); +TF_CALL_int8(REGISTER_KERNEL); +TF_CALL_int32(REGISTER_KERNEL); +TF_CALL_bool(REGISTER_KERNEL); +TF_CALL_half(REGISTER_KERNEL); +TF_CALL_float(REGISTER_KERNEL); +TF_CALL_double(REGISTER_KERNEL); #undef REGISTER_KERNEL #if GOOGLE_CUDA @@ -129,13 +129,13 @@ namespace functor { DECLARE_GPU_SPEC_DIM(T, 7) \ DECLARE_GPU_SPEC_DIM(T, 8) -DECLARE_GPU_SPEC(uint8); -DECLARE_GPU_SPEC(int8); -DECLARE_GPU_SPEC(int32); -DECLARE_GPU_SPEC(bool); -DECLARE_GPU_SPEC(Eigen::half); -DECLARE_GPU_SPEC(float); -DECLARE_GPU_SPEC(double); +TF_CALL_uint8(DECLARE_GPU_SPEC); +TF_CALL_int8(DECLARE_GPU_SPEC); +TF_CALL_int32(DECLARE_GPU_SPEC); +TF_CALL_bool(DECLARE_GPU_SPEC); +TF_CALL_half(DECLARE_GPU_SPEC); +TF_CALL_float(DECLARE_GPU_SPEC); +TF_CALL_double(DECLARE_GPU_SPEC); #undef DECLARE_GPU_SPEC #undef DECLARE_GPU_SPEC_DIM } // namespace functor @@ -147,11 +147,11 @@ DECLARE_GPU_SPEC(double); .TypeConstraint("T") \ .HostMemory("dims"), \ ReverseOp) -REGISTER_GPU_KERNEL(uint8); -REGISTER_GPU_KERNEL(int8); -REGISTER_GPU_KERNEL(Eigen::half); -REGISTER_GPU_KERNEL(float); -REGISTER_GPU_KERNEL(double); +TF_CALL_uint8(REGISTER_GPU_KERNEL); +TF_CALL_int8(REGISTER_GPU_KERNEL); +TF_CALL_half(REGISTER_GPU_KERNEL); +TF_CALL_float(REGISTER_GPU_KERNEL); +TF_CALL_double(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index 67a8a90c2fb..0acde9c498b 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -118,21 +118,16 @@ class LinSpaceOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("LinSpace") - .Device(DEVICE_CPU) - .TypeConstraint("T") - .HostMemory("start") - .HostMemory("stop") - 
.HostMemory("num") - .HostMemory("output"), - LinSpaceOp); -REGISTER_KERNEL_BUILDER(Name("LinSpace") - .Device(DEVICE_CPU) - .TypeConstraint("T") - .HostMemory("start") - .HostMemory("stop") - .HostMemory("num") - .HostMemory("output"), - LinSpaceOp); +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("LinSpace") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .HostMemory("start") \ + .HostMemory("stop") \ + .HostMemory("num") \ + .HostMemory("output"), \ + LinSpaceOp); +TF_CALL_float(REGISTER_CPU_KERNEL); +TF_CALL_double(REGISTER_CPU_KERNEL); } // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index 82376862ca6..8ec8409e21d 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/kernels/softmax_op.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -40,27 +41,22 @@ struct SoftmaxFunctor { }; } // namespace functor -REGISTER_KERNEL_BUILDER( - Name("Softmax").Device(DEVICE_CPU).TypeConstraint("T"), - SoftmaxOp); -REGISTER_KERNEL_BUILDER(Name("Softmax") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - SoftmaxOp); -REGISTER_KERNEL_BUILDER(Name("Softmax") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - SoftmaxOp); -REGISTER_KERNEL_BUILDER( - Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint("T"), - SoftmaxOp); -REGISTER_KERNEL_BUILDER( - Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint("T"), - SoftmaxOp); -REGISTER_KERNEL_BUILDER(Name("LogSoftmax") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - SoftmaxOp); +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Softmax").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftmaxOp); 
+TF_CALL_half(REGISTER_CPU); +TF_CALL_float(REGISTER_CPU); +TF_CALL_double(REGISTER_CPU); + +#undef REGISTER_CPU +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftmaxOp); +TF_CALL_half(REGISTER_CPU); +TF_CALL_float(REGISTER_CPU); +TF_CALL_double(REGISTER_CPU); #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h index d10cbad1d20..613c6a15c5b 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.h +++ b/tensorflow/core/kernels/sparse_matmul_op.h @@ -157,10 +157,15 @@ EIGEN_STRONG_INLINE Packet8f pinterleave4x64(const Packet8f& from) { return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from), _MM_SHUFFLE(3, 1, 2, 0))); #else - __int64_t tmp1 = _mm256_extract_epi64(_mm256_castps_si256(from), 1); - __int64_t tmp2 = _mm256_extract_epi64(_mm256_castps_si256(from), 2); - __m256i tmp3 = _mm256_insert_epi64(_mm256_castps_si256(from), tmp1, 2); - return _mm256_castsi256_ps(_mm256_insert_epi64(tmp3, tmp2, 1)); + auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2); + auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3); + auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4); + auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5); + auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4); + tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5); + tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2); + tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3); + return _mm256_castsi256_ps(tmp5); #endif } // Return a Packet with 4 floats loaded from 4 bfloat16 values diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc index cb885808000..45cad2e23b1 100644 --- a/tensorflow/core/kernels/sparse_matmul_op_test.cc +++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc @@ -238,25 +238,25 @@ class SparseMatmulOpTest : public 
::testing::Test { TEST_F(SparseMatmulOpTest, BroadcastPacketTest) { for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0]; - internal::pstore(data2, internal::pbroadcast_first( - internal::pload(data1))); + internal::pstoreu(data2, internal::pbroadcast_first( + internal::ploadu(data1))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); if (PacketSize > 1) { for (int i = 0; i < PacketSize; ++i) ref[i] = data1[1]; - internal::pstore(data2, internal::pbroadcast_second( - internal::pload(data1))); + internal::pstoreu(data2, internal::pbroadcast_second( + internal::ploadu(data1))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); if (PacketSize > 2) { for (int i = 0; i < PacketSize; ++i) ref[i] = data1[2]; - internal::pstore(data2, internal::pbroadcast_third( - internal::pload(data1))); + internal::pstoreu(data2, internal::pbroadcast_third( + internal::ploadu(data1))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); if (PacketSize > 3) { for (int i = 0; i < PacketSize; ++i) ref[i] = data1[3]; - internal::pstore(data2, internal::pbroadcast_fourth( - internal::pload(data1))); + internal::pstoreu(data2, internal::pbroadcast_fourth( + internal::ploadu(data1))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); } } @@ -276,8 +276,8 @@ TEST_F(SparseMatmulOpTest, InterleavePacketTest) { for (int i = 0; i < PacketSize; ++i) ref[i] = data1[i]; } - internal::pstore( - data2, internal::pinterleave4x64(internal::pload(data1))); + internal::pstoreu(data2, internal::pinterleave4x64( + internal::ploadu(data1))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); } @@ -294,8 +294,8 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) { ref[i] = data3[i]; } } - internal::pstore(data2, internal::pexpand_bf16_l( - internal::pload(data3_bfloat16))); + internal::pstoreu(data2, internal::pexpand_bf16_l( + internal::ploadu(data3_bfloat16))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); if (PacketSize == 8) { // AVX @@ -311,18 +311,18 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) { } } - 
internal::pstore(data2, internal::pexpand_bf16_u( - internal::pload(data3_bfloat16))); + internal::pstoreu(data2, internal::pexpand_bf16_u( + internal::ploadu(data3_bfloat16))); ASSERT_TRUE(areApprox(ref, data2, PacketSize)); } TEST_F(SparseMatmulOpTest, Bfloat16LoadTest) { if (PacketSize >= 4) { for (int i = 0; i < 4; ++i) ref[i] = data3[i]; - internal::pstore(data2, internal::pload4bf16(data3_bfloat16)); + internal::pstoreu(data2, internal::pload4bf16(data3_bfloat16)); ASSERT_TRUE(areApprox(ref, data2, 4)); - internal::pstore(data2, internal::pload2bf16(data3_bfloat16)); + internal::pstoreu(data2, internal::pload2bf16(data3_bfloat16)); ASSERT_TRUE(areApprox(ref, data2, 2)); } } diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index 52d2e637d17..5990bfbcf3c 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ b/tensorflow/core/kernels/tile_ops.cc @@ -26,6 +26,7 @@ limitations under the License. #include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/type_index.h" #include "tensorflow/core/lib/core/errors.h" @@ -92,18 +93,22 @@ class TileOp : public OpKernel { HANDLE_DIM(T, 4) \ HANDLE_DIM(T, 5) - HANDLE_TYPE(DT_BOOL); - HANDLE_TYPE(DT_FLOAT); - HANDLE_TYPE(DT_DOUBLE); - HANDLE_TYPE(DT_UINT8); - HANDLE_TYPE(DT_INT32); - HANDLE_TYPE(DT_INT16); - HANDLE_TYPE(DT_INT64); - HANDLE_TYPE(DT_HALF); - HANDLE_TYPE(DT_COMPLEX64); - HANDLE_TYPE(DT_COMPLEX128); - HANDLE_TYPE(DT_STRING); // when DEVICE=CPUDevice. +#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum::value) + // Invoke macro using TF_CALL_* so type-filtering for platform applies. 
+ TF_CALL_bool(HANDLE_TYPE_NAME); + TF_CALL_float(HANDLE_TYPE_NAME); + TF_CALL_double(HANDLE_TYPE_NAME); + TF_CALL_uint8(HANDLE_TYPE_NAME); + TF_CALL_int32(HANDLE_TYPE_NAME); + TF_CALL_int16(HANDLE_TYPE_NAME); + TF_CALL_int64(HANDLE_TYPE_NAME); + TF_CALL_half(HANDLE_TYPE_NAME); + TF_CALL_string(HANDLE_TYPE_NAME); // when DEVICE=CPUDevice. + TF_CALL_complex64(HANDLE_TYPE_NAME); + TF_CALL_complex128(HANDLE_TYPE_NAME); + +#undef HANDLE_TYPE_NAME #undef HANDLE_TYPE #undef HANDLE_DIM @@ -165,17 +170,20 @@ inline void TileOp::HandleCase( HANDLE_CASE(device, dtype, 4); \ HANDLE_CASE(device, dtype, 5); -HANDLE_CASE_DIM(CPUDevice, DT_BOOL); -HANDLE_CASE_DIM(CPUDevice, DT_FLOAT); -HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE); -HANDLE_CASE_DIM(CPUDevice, DT_UINT8); -HANDLE_CASE_DIM(CPUDevice, DT_INT32); -HANDLE_CASE_DIM(CPUDevice, DT_INT16); -HANDLE_CASE_DIM(CPUDevice, DT_INT64); -HANDLE_CASE_DIM(CPUDevice, DT_HALF); -HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64); -HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128); -HANDLE_CASE_DIM(CPUDevice, DT_STRING); +#define HANDLE_TYPE_NAME_CPU(T) \ + HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum::value); + +TF_CALL_bool(HANDLE_TYPE_NAME_CPU); +TF_CALL_float(HANDLE_TYPE_NAME_CPU); +TF_CALL_double(HANDLE_TYPE_NAME_CPU); +TF_CALL_uint8(HANDLE_TYPE_NAME_CPU); +TF_CALL_int32(HANDLE_TYPE_NAME_CPU); +TF_CALL_int16(HANDLE_TYPE_NAME_CPU); +TF_CALL_int64(HANDLE_TYPE_NAME_CPU); +TF_CALL_half(HANDLE_TYPE_NAME_CPU); +TF_CALL_complex64(HANDLE_TYPE_NAME_CPU); +TF_CALL_complex128(HANDLE_TYPE_NAME_CPU); +TF_CALL_string(HANDLE_TYPE_NAME_CPU); #if GOOGLE_CUDA HANDLE_CASE_DIM(GPUDevice, DT_FLOAT); @@ -186,6 +194,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_INT64); HANDLE_CASE_DIM(GPUDevice, DT_HALF); #endif // GOOGLE_CUDA +#undef HANDLE_TYPE_NAME_CPU #undef HANDLE_CASE_DIM #undef HANDLE_CASE @@ -249,13 +258,16 @@ class TileGradientOp : public OpKernel { HANDLE_DIM(T, 4) \ HANDLE_DIM(T, 5) - HANDLE_TYPE(DT_FLOAT); - HANDLE_TYPE(DT_DOUBLE); - HANDLE_TYPE(DT_INT32); - HANDLE_TYPE(DT_INT16); 
- HANDLE_TYPE(DT_INT64); - HANDLE_TYPE(DT_HALF); +#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum::value) + TF_CALL_float(HANDLE_TYPE_NAME); + TF_CALL_double(HANDLE_TYPE_NAME); + TF_CALL_int32(HANDLE_TYPE_NAME); + TF_CALL_int16(HANDLE_TYPE_NAME); + TF_CALL_int64(HANDLE_TYPE_NAME); + TF_CALL_half(HANDLE_TYPE_NAME); + +#undef HANDLE_TYPE_NAME #undef HANDLE_TYPE #undef HANDLE_DIM @@ -390,14 +402,17 @@ inline void TileGradientOp::HandleCase( HANDLE_CASE(device, dtype, 4); \ HANDLE_CASE(device, dtype, 5); -HANDLE_CASE_DIM(CPUDevice, DT_FLOAT); -HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE); -HANDLE_CASE_DIM(CPUDevice, DT_INT16); -HANDLE_CASE_DIM(CPUDevice, DT_INT32); -HANDLE_CASE_DIM(CPUDevice, DT_INT64); -HANDLE_CASE_DIM(CPUDevice, DT_HALF); -HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64); -HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128); +#define HANDLE_TYPE_NAME_CPU(T) \ + HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum::value); + +TF_CALL_float(HANDLE_TYPE_NAME_CPU); +TF_CALL_double(HANDLE_TYPE_NAME_CPU); +TF_CALL_int16(HANDLE_TYPE_NAME_CPU); +TF_CALL_int32(HANDLE_TYPE_NAME_CPU); +TF_CALL_int64(HANDLE_TYPE_NAME_CPU); +TF_CALL_half(HANDLE_TYPE_NAME_CPU); +TF_CALL_complex64(HANDLE_TYPE_NAME_CPU); +TF_CALL_complex128(HANDLE_TYPE_NAME_CPU); #if GOOGLE_CUDA HANDLE_CASE_DIM(GPUDevice, DT_FLOAT); @@ -409,6 +424,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_HALF); #endif // GOOGLE_CUDA +#undef HANDLE_TYPE_NAME_CPU #undef HANDLE_CASE_DIM #undef HANDLE_CASE diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index f00e7ef8742..b16c9c860a9 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -16,17 +16,28 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "tensorflow/core/kernels/training_ops.h" +#include +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -namespace functor { +namespace { +template +inline T sgn(const T x) { + T zero(0); + T one(1); + return (x == zero ? zero : (x < zero ? -one : one)); +} +} +namespace functor { template struct ApplyGradientDescent { void operator()(const CPUDevice& d, typename TTypes::Flat var, @@ -56,6 +67,34 @@ struct ApplyAdadelta { } }; +template +struct ApplyProximalGradientDescent { + void operator()(const CPUDevice& d, typename TTypes::Flat var, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad) { + // Note that here is Fobos update, for details please refer: + // http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf + // TODO(xbing): merge the logic for ProximalGradientDescent and + // ProximalAdagrad. + auto prox_var = var; + // compute v = w - lr * grad. 
+ prox_var.device(d) -= grad * lr(); + if (l1() > 0) { + var.device(d) = prox_var.abs() - var.constant(lr() * l1()); + // compute sign(v) * max(|v| - lr * l1, 0) + var.device(d) = prox_var.sign() * var.cwiseMax(T(0.0)); + } else { + var.device(d) = prox_var; + } + if (l2() > 0) { + // compute v / (1.0 + l2 * lr) + var.device(d) = var / (var.constant(1.0) + var.constant(l2() * lr())); + } + } +}; + template struct ApplyAdagrad { void operator()(const CPUDevice& d, typename TTypes::Flat var, @@ -67,6 +106,35 @@ struct ApplyAdagrad { } }; +template +struct ApplyProximalAdagrad { + void operator()(const CPUDevice& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad) { + // Fobos update per paper with Adagrad learning rate. + accum.device(d) += grad.square(); + // Adagrad learning rate. + auto learning_rate = accum.constant(lr()) * accum.rsqrt(); + auto prox_var = var; + // compute v = w - lr * grad. 
+ prox_var.device(d) -= grad * learning_rate; + if (l1() > 0) { + var.device(d) = prox_var.abs() - learning_rate * prox_var.constant(l1()); + // compute sign(v) * max(|v| - lr * l1, 0) + var.device(d) = prox_var.sign() * var.cwiseMax(T(0.0)); + } else { + var.device(d) = prox_var; + } + if (l2() > 0) { + var.device(d) = + var / (var.constant(1.0) + var.constant(l2()) * learning_rate); + } + } +}; + template struct ApplyFtrl { void operator()(const CPUDevice& d, typename TTypes::Flat var, @@ -221,10 +289,11 @@ class ApplyGradientDescentOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyGradientDescentOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. @@ -246,6 +315,7 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template @@ -347,10 +417,11 @@ typedef Eigen::GpuDevice GPUDevice; REGISTER_KERNEL_BUILDER( \ Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdadeltaOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. @@ -374,6 +445,7 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. 
@@ -483,7 +555,6 @@ class SparseApplyAdadeltaOp : public OpKernel { accum_update_ = accum_update_ * accum_update_.constant(rho_scalar) + update.square() * update.constant(static_cast(1) - rho_scalar); - auto v = var_flat.template chip<0>(index); v -= update * update.constant(lr_scalar); } @@ -505,14 +576,213 @@ class SparseApplyAdadeltaOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdadeltaOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +// Note, this op works on cpu only. +template +class ApplyProximalGradientDescentOp : public OpKernel { + public: + explicit ApplyProximalGradientDescentOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0}); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + const Tensor& alpha = ctx->input(1); + OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()), + errors::InvalidArgument("alpha is not a scalar: ", + alpha.shape().DebugString())); + const Tensor& l1 = ctx->input(2); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l1.shape()), + errors::InvalidArgument("l1 regularization strength is not a scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(3); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l2.shape()), + errors::InvalidArgument("l2 regularization strength is not a scalar: ", + l2.shape().DebugString())); + + const Tensor& delta = ctx->input(4); + OP_REQUIRES( + ctx, var.shape().IsSameSize(delta.shape()), + 
errors::InvalidArgument("var and delta do not have the same shape", + var.shape().DebugString(), " ", + delta.shape().DebugString())); + + const Device& device = ctx->template eigen_device(); + functor::ApplyProximalGradientDescent()( + device, var.flat(), alpha.scalar(), l1.scalar(), + l2.scalar(), delta.flat()); + + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyProximalGradientDescentOp); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); +#undef REGISTER_KERNELS + +// Note, this op works on cpu only. +template +class SparseApplyProximalGradientDescentOp : public OpKernel { + public: + explicit SparseApplyProximalGradientDescentOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), + errors::InvalidArgument("var must be at least 1 dimensional")); + + const Tensor& lr = ctx->input(1); + OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& l1 = ctx->input(2); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l1.shape()), + errors::InvalidArgument("l1 regularization strength is not a scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(3); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l2.shape()), + errors::InvalidArgument("l2 regularization strength is not a scalar: ", + l2.shape().DebugString())); + + const Tensor& grad = ctx->input(4); + const Tensor& indices = ctx->input(5); + OP_REQUIRES(ctx, 
TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + for (int d = 1; d < var.dims(); d++) { + OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d))); + inner_dim *= grad.dim_size(d); + } + const Tindex N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (inner_dim > 1) { + const Tindex first_dim_size = var.dim_size(0); + auto indices_vec = indices.vec(); + auto var_flat = var.flat_outer_dims(); + auto grad_flat = grad.flat_outer_dims(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + + // TODO(xbing): extract the common logic for the Fobos update. + for (Tindex i = 0; i < N; i++) { + const Tindex index = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + auto g = grad_flat.template chip<0>(i); + auto v = var_flat.template chip<0>(index); + // compute learning_rate for current step. + auto learning_rate = v.constant(lr_scalar); + auto prox_v = v; + // v = w - g * learning_rate. 
+ prox_v -= g * learning_rate; + if (l1_scalar > 0) { + v = prox_v.abs() - learning_rate * prox_v.constant(l1_scalar); + // compute sign(v) * max(|v|, 0) + v = prox_v.sign() * v.cwiseMax(static_cast(0.0)); + } else { + v = prox_v; + } + if (l2_scalar > 0) { + v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate); + } + } + } else { + CHECK_EQ(1, inner_dim); + auto indices_vec = indices.vec(); + auto var_flat = var.flat(); + auto grad_flat = grad.flat(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + const Tindex first_dim_size = var_flat.size(); + + for (Tindex i = 0; i < N; i++) { + const Tindex index = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + const T& g = grad_flat(i); + auto learning_rate = lr_scalar; + auto prox_v = var_flat(index); + prox_v -= learning_rate * g; + if (l1_scalar > 0) { + var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar; + var_flat(index) = + sgn(prox_v) * std::max(var_flat(index), static_cast(0.0)); + } else { + var_flat(index) = prox_v; + } + if (l2_scalar > 0) { + var_flat(index) /= (1.0 + l2_scalar * learning_rate); + } + } + } + } + + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyProximalGradientDescentOp); -REGISTER_KERNELS(Eigen::half, int32); -REGISTER_KERNELS(Eigen::half, int64); REGISTER_KERNELS(float, int32); REGISTER_KERNELS(float, int64); REGISTER_KERNELS(double, int32); REGISTER_KERNELS(double, int64); - #undef REGISTER_KERNELS template @@ -568,10 +838,11 @@ typedef Eigen::GpuDevice GPUDevice; REGISTER_KERNEL_BUILDER( \ 
Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdagradOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. @@ -593,15 +864,80 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +template +class ApplyProximalAdagradOp : public OpKernel { + public: + explicit ApplyProximalAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, accum.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + OP_REQUIRES( + ctx, var.shape().IsSameSize(accum.shape()), + errors::InvalidArgument("var and accum do not have the same shape", + var.shape().DebugString(), " ", + accum.shape().DebugString())); + const Tensor& lr = ctx->input(2); + OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& l1 = ctx->input(3); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l1.shape()), + errors::InvalidArgument("l1 regularization strength is not a scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(4); + OP_REQUIRES( + ctx, 
TensorShapeUtils::IsScalar(l2.shape()), + errors::InvalidArgument("l2 regularization strength is not a scalar: ", + l2.shape().DebugString())); + + const Tensor& grad = ctx->input(5); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and grad do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + + const Device& device = ctx->template eigen_device(); + functor::ApplyProximalAdagrad()( + device, var.flat(), accum.flat(), lr.scalar(), l1.scalar(), + l2.scalar(), grad.flat()); + + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyProximalAdagradOp); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); #undef REGISTER_KERNELS namespace { -template -inline T sgn(const T x) { - T zero(0); - T one(1); - return (x == zero ? zero : (x < zero ? -one : one)); -} template inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1, @@ -730,9 +1066,167 @@ class SparseApplyAdagradOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyAdagradOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +// Note, this op works on cpu only. 
+template +class SparseApplyProximalAdagradOp : public OpKernel { + public: + explicit SparseApplyProximalAdagradOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, accum.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + OP_REQUIRES( + ctx, var.shape().IsSameSize(accum.shape()), + errors::InvalidArgument("var and accum do not have the same shape", + var.shape().DebugString(), " ", + accum.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), + errors::InvalidArgument("var must be at least 1 dimensional")); + + const Tensor& lr = ctx->input(2); + OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& l1 = ctx->input(3); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l1.shape()), + errors::InvalidArgument("l1 regularization strength is not a scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(4); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(l2.shape()), + errors::InvalidArgument("l2 regularization strength is not a scalar: ", + l2.shape().DebugString())); + + const Tensor& grad = ctx->input(5); + const Tensor& indices = ctx->input(6); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + for (int d = 1; d < var.dims(); d++) { + OP_REQUIRES(ctx, 
var.dim_size(d) == grad.dim_size(d), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d))); + inner_dim *= grad.dim_size(d); + } + const Tindex N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (inner_dim > 1) { + const Tindex first_dim_size = var.dim_size(0); + auto indices_vec = indices.vec(); + auto var_flat = var.flat_outer_dims(); + auto accum_flat = accum.flat_outer_dims(); + auto grad_flat = grad.flat_outer_dims(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + + for (Tindex i = 0; i < N; i++) { + const Tindex index = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + auto a = accum_flat.template chip<0>(index); + auto g = grad_flat.template chip<0>(i); + auto v = var_flat.template chip<0>(index); + a += g.square(); + // compute learning_rate for current step. + auto learning_rate = a.constant(lr_scalar) * a.rsqrt(); + auto prox_v = v; + // v = w - g * learning_rate. 
+ prox_v -= g * learning_rate; + if (l1_scalar > 0) { + v = prox_v.abs() - learning_rate * prox_v.constant(l1_scalar); + // compute sign(v) * max(|v|, 0) + v = prox_v.sign() * v.cwiseMax(static_cast(0.0)); + } else { + v = prox_v; + } + if (l2_scalar > 0) { + v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate); + } + } + } else { + CHECK_EQ(1, inner_dim); + auto indices_vec = indices.vec(); + auto var_flat = var.flat(); + auto accum_flat = accum.flat(); + auto grad_flat = grad.flat(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + const Tindex first_dim_size = accum_flat.size(); + + for (Tindex i = 0; i < N; i++) { + const Tindex index = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + T& a = accum_flat(index); + const T& g = grad_flat(i); + a += g * g; + auto learning_rate = lr_scalar / std::sqrt(a); + auto prox_v = var_flat(index); + prox_v -= learning_rate * g; + if (l1_scalar > 0) { + var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar; + var_flat(index) = + sgn(prox_v) * std::max(var_flat(index), static_cast(0.0)); + } else { + var_flat(index) = prox_v; + } + if (l2_scalar > 0) { + var_flat(index) /= (1.0 + l2_scalar * learning_rate); + } + } + } + } + + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyProximalAdagradOp); -REGISTER_KERNELS(Eigen::half, int32); -REGISTER_KERNELS(Eigen::half, int64); REGISTER_KERNELS(float, int32); REGISTER_KERNELS(float, int64); REGISTER_KERNELS(double, int32); @@ -822,10 +1316,13 @@ typedef Eigen::GpuDevice GPUDevice; REGISTER_KERNEL_BUILDER( \ 
Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyFtrlOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. @@ -1002,13 +1499,15 @@ class SparseApplyFtrlOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); -REGISTER_KERNELS(Eigen::half, int32); -REGISTER_KERNELS(Eigen::half, int64); -REGISTER_KERNELS(float, int32); -REGISTER_KERNELS(float, int64); -REGISTER_KERNELS(double, int32); -REGISTER_KERNELS(double, int64); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template @@ -1070,10 +1569,11 @@ typedef Eigen::GpuDevice GPUDevice; REGISTER_KERNEL_BUILDER( \ Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyMomentumOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. @@ -1096,6 +1596,7 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS // Note, this op works on cpu only. 
@@ -1129,7 +1630,7 @@ class SparseApplyMomentumOp : public OpKernel { const Tensor& lr = ctx->input(2); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), - errors::InvalidArgument("lr is not a scalar: ", + errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); const Tensor& grad = ctx->input(3); const Tensor& indices = ctx->input(4); @@ -1188,13 +1689,15 @@ class SparseApplyMomentumOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ SparseApplyMomentumOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); -REGISTER_KERNELS(Eigen::half, int32); -REGISTER_KERNELS(Eigen::half, int64); -REGISTER_KERNELS(float, int32); -REGISTER_KERNELS(float, int64); -REGISTER_KERNELS(double, int32); -REGISTER_KERNELS(double, int64); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template @@ -1237,7 +1740,7 @@ class ApplyAdamOp : public OpKernel { errors::InvalidArgument("beta2_power is not a scalar: ", beta2_power.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), - errors::InvalidArgument("lr is not a scalar: ", + errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), errors::InvalidArgument("beta1 is not a scalar: ", @@ -1285,10 +1788,11 @@ typedef Eigen::GpuDevice GPUDevice; REGISTER_KERNEL_BUILDER( \ Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdamOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. 
@@ -1316,6 +1820,7 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template @@ -1352,7 +1857,7 @@ class ApplyRMSPropOp : public OpKernel { const Tensor& grad = ctx->input(7); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), - errors::InvalidArgument("lr is not a scalar: ", + errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), errors::InvalidArgument("rho is not a scalar: ", @@ -1400,10 +1905,11 @@ typedef Eigen::GpuDevice GPUDevice; REGISTER_KERNEL_BUILDER( \ Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyRMSPropOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -REGISTER_KERNELS(CPU, Eigen::half); -REGISTER_KERNELS(CPU, float); -REGISTER_KERNELS(CPU, double); +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. 
@@ -1428,6 +1934,7 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h index 7b4291cccc6..b9946cd9228 100644 --- a/tensorflow/core/kernels/training_ops.h +++ b/tensorflow/core/kernels/training_ops.h @@ -44,6 +44,24 @@ struct ApplyAdadelta { typename TTypes::ConstFlat grad); }; +template +struct FobosElasticNet { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyProximalGradientDescent { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + template struct ApplyAdagrad { void operator()(const Device& d, typename TTypes::Flat var, @@ -52,6 +70,16 @@ struct ApplyAdagrad { typename TTypes::ConstFlat grad); }; +template +struct ApplyProximalAdagrad { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + template struct ApplyFtrl { void operator()(const Device& d, typename TTypes::Flat var, diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc index de83c25f06e..639bad5f04f 100644 --- a/tensorflow/core/kernels/xent_op.cc +++ b/tensorflow/core/kernels/xent_op.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/xent_op.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -86,18 +87,14 @@ struct XentFunctor { }; } // namespace functor -REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - SoftmaxXentWithLogitsOp); -REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - SoftmaxXentWithLogitsOp); -REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - SoftmaxXentWithLogitsOp); +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + SoftmaxXentWithLogitsOp); +TF_CALL_half(REGISTER_CPU); +TF_CALL_float(REGISTER_CPU); +TF_CALL_double(REGISTER_CPU); #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index ed60c227a5f..c20e8c36001 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -1825,6 +1825,127 @@ op { } } } +op { + name: "ApplyProximalAdagrad" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "accum" + type_attr: "T" + is_ref: true + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + 
type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ApplyProximalGradientDescent" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "delta" + type_attr: "T" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} op { name: "ApplyRMSProp" input_arg { @@ -3011,6 +3132,63 @@ op { } } } +op { + name: "AvgPool" + input_arg { + name: "value" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + type: DT_DOUBLE + } + } + } +} op { name: "AvgPool3D" input_arg { @@ -3232,6 +3410,67 @@ op { } } } +op { + name: "AvgPoolGrad" + input_arg { + name: "orig_input_shape" + type: DT_INT32 + } + input_arg { + name: "grad" + type_attr: "T" + 
} + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + type: DT_DOUBLE + } + } + } +} op { name: "BatchCholesky" input_arg { @@ -11801,6 +12040,124 @@ op { } } } +op { + name: "MaxPool" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } +} +op { + name: "MaxPool" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "data_format" + type: "string" + default_value { 
+ s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } +} op { name: "MaxPool3D" input_arg { @@ -12014,6 +12371,73 @@ op { } } } +op { + name: "MaxPoolGrad" + input_arg { + name: "orig_input" + type_attr: "T" + } + input_arg { + name: "orig_output" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } +} op { name: "MaxPoolGradWithArgmax" input_arg { @@ -12065,6 +12489,70 @@ op { } } } +op { + name: "MaxPoolGradWithArgmax" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "argmax" + type_attr: "Targmax" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "Targmax" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } +} op { name: "MaxPoolWithArgmax" input_arg { @@ -12115,6 +12603,69 @@ op { } } } +op { + name: "MaxPoolWithArgmax" + input_arg { + name: "input" + 
type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + output_arg { + name: "argmax" + type_attr: "Targmax" + } + attr { + name: "ksize" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 4 + } + attr { + name: "Targmax" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } +} op { name: "Maximum" input_arg { @@ -20499,6 +21050,155 @@ op { } } } +op { + name: "SparseApplyProximalAdagrad" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "accum" + type_attr: "T" + is_ref: true + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "SparseApplyProximalGradientDescent" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "l1" + 
type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} op { name: "SparseConcat" input_arg { diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc index c1a42e74beb..e3b876b2401 100644 --- a/tensorflow/core/ops/nn_grad.cc +++ b/tensorflow/core/ops/nn_grad.cc @@ -154,22 +154,25 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) { // clang-format off *g = FDH::Define( // Arg defs - {"input: float", "grad: float"}, + {"input: T", "grad: T"}, // Ret val defs - {"output: float"}, + {"output: T"}, // Attr defs - {"ksize: list(int) >= 4", + {"T: {float, half} = DT_FLOAT", + "ksize: list(int) >= 4", "strides: list(int) >= 4", GetPaddingAttrString()}, // Nodes { // Invoke MaxPool again to recompute the outputs (removed by CSE?). 
{{"maxpool"}, "MaxPool", {"input"}, - /*Attrs=*/{{"ksize", "$ksize"}, + /*Attrs=*/{{"T", "$T"}, + {"ksize", "$ksize"}, {"strides", "$strides"}, {"padding", "$padding"}}}, {{"output"}, "MaxPoolGrad", {"input", "maxpool", "grad"}, - /*Attrs=*/{{"ksize", "$ksize"}, + /*Attrs=*/{{"T", "$T"}, + {"ksize", "$ksize"}, {"strides", "$strides"}, {"padding", "$padding"}}} }); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index fee145be538..b53945a4a0b 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -28,7 +28,7 @@ REGISTER_OP("AvgPool") .Attr("strides: list(int) >= 4") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) - .Attr("T: {float, double}") + .Attr("T: {float, half, double}") .Doc(R"doc( Performs average pooling on the input. @@ -55,7 +55,7 @@ REGISTER_OP("AvgPoolGrad") .Attr("strides: list(int) >= 4") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) - .Attr("T: {float, double}") + .Attr("T: {float, half, double}") .Doc(R"doc( Computes gradients of the average pooling function. @@ -642,12 +642,13 @@ output: The gradients for LRN. // -------------------------------------------------------------------------- REGISTER_OP("MaxPool") + .Attr("T: {float, half} = DT_FLOAT") .Attr("ksize: list(int) >= 4") .Attr("strides: list(int) >= 4") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) - .Input("input: float") - .Output("output: float") + .Input("input: T") + .Output("output: T") .Doc(R"doc( Performs max pooling on the input. 
@@ -669,10 +670,11 @@ REGISTER_OP("MaxPoolGrad") .Attr("strides: list(int) >= 4") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) - .Input("orig_input: float") - .Input("orig_output: float") - .Input("grad: float") - .Output("output: float") + .Input("orig_input: T") + .Input("orig_output: T") + .Input("grad: T") + .Output("output: T") + .Attr("T: {float, half} = DT_FLOAT") .Doc(R"doc( Computes gradients of the maxpooling function. @@ -696,9 +698,10 @@ REGISTER_OP("MaxPoolWithArgmax") .Attr("strides: list(int) >= 4") .Attr("Targmax: {int32, int64} = DT_INT64") .Attr(GetPaddingAttrString()) - .Input("input: float") - .Output("output: float") + .Input("input: T") + .Output("output: T") .Output("argmax: Targmax") + .Attr("T: {float, half} = DT_FLOAT") .Doc(R"doc( Performs max pooling on the input and outputs both max values and indices. @@ -720,10 +723,11 @@ REGISTER_OP("MaxPoolGradWithArgmax") .Attr("strides: list(int) >= 4") .Attr(GetPaddingAttrString()) .Attr("Targmax: {int32, int64}") - .Input("input: float") - .Input("grad: float") + .Input("input: T") + .Input("grad: T") .Input("argmax: Targmax") - .Output("output: float") + .Output("output: T") + .Attr("T: {float, half} = DT_FLOAT") .Doc(R"doc( Computes gradients of the maxpooling function. diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 5fb34e79d1a..a8d445c3c4c 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -591,12 +591,12 @@ op { } input_arg { name: "l1" - description: "Scaling factor. Must be a scalar." + description: "L1 regularization. Must be a scalar." type_attr: "T" } input_arg { name: "l2" - description: "Scaling factor. Must be a scalar." + description: "L2 regularization. Must be a scalar." type_attr: "T" } input_arg { @@ -767,6 +767,146 @@ op { summary: "Update \'*var\' according to the momentum scheme."
description: "accum = accum * momentum + grad\nvar -= lr * accum" } +op { + name: "ApplyProximalAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type_attr: "T" + is_ref: true + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type_attr: "T" + is_ref: true + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + output_arg { + name: "out" + description: "Same as \"var\"." + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate." + description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}" +} +op { + name: "ApplyProximalGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type_attr: "T" + is_ref: true + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." 
+ type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "delta" + description: "The change." + type_attr: "T" + } + output_arg { + name: "out" + description: "Same as \"var\"." + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate." + description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}" +} op { name: "ApplyRMSProp" input_arg { @@ -1251,6 +1391,7 @@ op { allowed_values { list { type: DT_FLOAT + type: DT_HALF type: DT_DOUBLE } } @@ -1447,6 +1588,7 @@ op { allowed_values { list { type: DT_FLOAT + type: DT_HALF type: DT_DOUBLE } } @@ -6614,12 +6756,25 @@ op { input_arg { name: "input" description: "4-D input to pool over." - type: DT_FLOAT + type_attr: "T" } output_arg { name: "output" description: "The max pooled output tensor." - type: DT_FLOAT + type_attr: "T" + } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } } attr { name: "ksize" @@ -6798,22 +6953,22 @@ op { input_arg { name: "orig_input" description: "The original input tensor." - type: DT_FLOAT + type_attr: "T" } input_arg { name: "orig_output" description: "The original output tensor." - type: DT_FLOAT + type_attr: "T" } input_arg { name: "grad" description: "4-D. Gradients w.r.t. 
the output of `max_pool`." - type: DT_FLOAT + type_attr: "T" } output_arg { name: "output" description: "Gradients w.r.t. the input to `max_pool`." - type: DT_FLOAT + type_attr: "T" } attr { name: "ksize" @@ -6854,6 +7009,19 @@ op { } } } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } summary: "Computes gradients of the maxpooling function." } op { @@ -6861,12 +7029,12 @@ op { input_arg { name: "input" description: "The original input." - type: DT_FLOAT + type_attr: "T" } input_arg { name: "grad" description: "4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the\noutput of `max_pool`." - type: DT_FLOAT + type_attr: "T" } input_arg { name: "argmax" @@ -6876,7 +7044,7 @@ op { output_arg { name: "output" description: "Gradients w.r.t. the input of `max_pool`." - type: DT_FLOAT + type_attr: "T" } attr { name: "ksize" @@ -6913,6 +7081,19 @@ op { } } } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } summary: "Computes gradients of the maxpooling function." } op { @@ -6920,12 +7101,12 @@ op { input_arg { name: "input" description: "4-D with shape `[batch, height, width, channels]`. Input to pool over." - type: DT_FLOAT + type_attr: "T" } output_arg { name: "output" description: "The max pooled output tensor." - type: DT_FLOAT + type_attr: "T" } output_arg { name: "argmax" @@ -6970,6 +7151,19 @@ op { } } } + attr { + name: "T" + type: "type" + default_value { + type: DT_FLOAT + } + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + } + } + } summary: "Performs max pooling on the input and outputs both max values and indices." description: "The indices in `argmax` are flattened, so that a maximum value at position\n`[b, y, x, c]` becomes flattened index\n`((b * height + y) * width + x) * channels + c`." 
} @@ -11380,12 +11574,12 @@ op { } input_arg { name: "l1" - description: "Scaling factor. Must be a scalar." + description: "L1 regularization. Must be a scalar." type_attr: "T" } input_arg { name: "l2" - description: "Scaling factor. Must be a scalar." + description: "L2 regularization. Must be a scalar." type_attr: "T" } input_arg { @@ -11525,6 +11719,176 @@ op { summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme." description: "That is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum" } +op { + name: "SparseApplyProximalAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type_attr: "T" + is_ref: true + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type_attr: "T" + is_ref: true + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + output_arg { + name: "out" + description: "Same as \"var\"." 
+ type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm." + description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}" +} +op { + name: "SparseApplyProximalGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type_attr: "T" + is_ref: true + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + output_arg { + name: "out" + description: "Same as \"var\"." 
+ type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate." + description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}" +} op { name: "SparseConcat" input_arg { diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc index e6a805a8777..5eb011684b4 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -35,6 +35,59 @@ use_locking: If `True`, the subtraction will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ApplyProximalGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +delta: The change. +out: Same as "var". 
+use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyProximalGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + REGISTER_OP("ApplyAdadelta") .Input("var: Ref(T)") .Input("accum: Ref(T)") @@ -117,6 +170,33 @@ use_locking: If `True`, updating of the var and accum tensors will be protected contention. )doc"); +REGISTER_OP("ApplyProximalAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. 
+out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + REGISTER_OP("SparseApplyAdagrad") .Input("var: Ref(T)") .Input("accum: Ref(T)") @@ -145,6 +225,39 @@ use_locking: If `True`, updating of the var and accum tensors will be protected contention. )doc"); +REGISTER_OP("SparseApplyProximalAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + REGISTER_OP("ApplyFtrl") .Input("var: Ref(T)") .Input("accum: Ref(T)") @@ -171,8 +284,8 @@ accum: Should be from a Variable(). linear: Should be from a Variable(). grad: The gradient. lr: Scaling factor. Must be a scalar. -l1: Scaling factor. Must be a scalar. -l2: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. lr_power: Scaling factor. Must be a scalar. out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected @@ -210,8 +323,8 @@ linear: Should be from a Variable(). grad: The gradient. indices: A vector of indices into the first dimension of var and accum. lr: Scaling factor. Must be a scalar. -l1: Scaling factor. Must be a scalar. -l2: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. lr_power: Scaling factor. Must be a scalar. out: Same as "var". use_locking: If `True`, updating of the var and accum tensors will be protected diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h index 9a82bdc3e18..c5dea66c65c 100644 --- a/tensorflow/core/util/stat_summarizer.h +++ b/tensorflow/core/util/stat_summarizer.h @@ -69,9 +69,9 @@ class Stat { : static_cast(sum_) / count_; } - ValueType rms() const { return sqrt(squared_sum_ / count_); } - - ValueType std_deviation() const { return all_same() ? 0 : rms() - avg(); } + ValueType std_deviation() const { + return all_same() ? 
0 : sqrt(squared_sum_ / count_ - avg() * avg()); + } void OutputToStream(std::ostream* stream) const { if (empty()) { diff --git a/tensorflow/examples/skflow/iris_val_based_early_stopping.py b/tensorflow/examples/skflow/iris_val_based_early_stopping.py index e6e0b7d76d9..72e0595544f 100644 --- a/tensorflow/examples/skflow/iris_val_based_early_stopping.py +++ b/tensorflow/examples/skflow/iris_val_based_early_stopping.py @@ -18,35 +18,38 @@ from __future__ import print_function from sklearn import datasets from sklearn import metrics from sklearn.cross_validation import train_test_split +import tensorflow as tf from tensorflow.contrib import learn -iris = datasets.load_iris() -X_train, X_test, y_train, y_test = train_test_split(iris.data, - iris.target, - test_size=0.2, - random_state=42) +def main(unused_argv): + iris = datasets.load_iris() + x_train, x_test, y_train, y_test = train_test_split( + iris.data, iris.target, test_size=0.2, random_state=42) -X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, - test_size=0.2, - random_state=42) -val_monitor = learn.monitors.ValidationMonitor(X_val, y_val, - early_stopping_rounds=200) + x_train, x_val, y_train, y_val = train_test_split( + x_train, y_train, test_size=0.2, random_state=42) + val_monitor = learn.monitors.ValidationMonitor( + x_val, y_val, early_stopping_rounds=200) -# classifier with early stopping on training data -classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], - n_classes=3, - model_dir='/tmp/iris_model/') -classifier1.fit(X_train, y_train, steps=2000) -score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test)) + # classifier with early stopping on training data + classifier1 = learn.TensorFlowDNNClassifier( + hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/') + classifier1.fit(x=x_train, y=y_train, steps=2000) + score1 = metrics.accuracy_score(y_test, classifier1.predict(x_test)) -# classifier with early stopping on validation data 
-classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], - n_classes=3, - model_dir='/tmp/iris_model_val/') -classifier2.fit(X_train, y_train, val_monitor, steps=2000) -score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test)) + # classifier with early stopping on validation data, save frequently for + # monitor to pick up new checkpoints. + classifier2 = learn.TensorFlowDNNClassifier( + hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/', + config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1)) + classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor]) + score2 = metrics.accuracy_score(y_test, classifier2.predict(x_test)) -# In many applications, the score is improved by using early stopping -print(score2 > score1) + # In many applications, the score is improved by using early stopping + print(score2 > score1) + + +if __name__ == '__main__': + tf.app.run() diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py index 91bc69c6b21..5c1b2f1eed9 100644 --- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py +++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py @@ -164,7 +164,7 @@ def train(): feed_dict=feed_dict(True), options=run_options, run_metadata=run_metadata) - train_writer.add_run_metadata(run_metadata, 'step%d' % i) + train_writer.add_run_metadata(run_metadata, 'step%03d' % i) train_writer.add_summary(summary, i) print('Adding run metadata for', i) else: # Record a summary diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md index b80026bd3a2..74bdd822e42 100644 --- a/tensorflow/g3doc/api_docs/python/array_ops.md +++ b/tensorflow/g3doc/api_docs/python/array_ops.md @@ -216,7 +216,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`. 
For example: -```prettyprint +```python # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] shape(t) ==> [2, 2, 3] ``` @@ -224,7 +224,7 @@ shape(t) ==> [2, 2, 3] ##### Args: -* `input`: A `Tensor`. +* `input`: A `Tensor` or `SparseTensor`. * `name`: A name for the operation (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md index 7e8228938a0..b764ca43534 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.learn.md +++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md @@ -796,11 +796,11 @@ A classifier for TensorFlow DNN models. def input_fn_eval: # returns x, Y pass - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. @@ -1140,11 +1140,11 @@ A regressor for TensorFlow DNN models. def input_fn_eval: # returns x, Y pass - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. @@ -2245,10 +2245,10 @@ Linear classifier model. ... 
estimator.fit(input_fn=input_fn_train) estimator.evaluate(input_fn=input_fn_eval) - estimator.predict(x) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. @@ -2579,10 +2579,10 @@ Linear regressor model. ... estimator.fit(input_fn=input_fn_train) estimator.evaluate(input_fn=input_fn_eval) - estimator.predict(x) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a KeyError: if `weight_column_name` is not None: key=weight_column_name, value=a `Tensor` @@ -4238,50 +4238,33 @@ Perform various training, evaluation, and inference actions on a graph. ### `class tf.contrib.learn.RunConfig` {#RunConfig} This class specifies the specific configurations for the run. - -Parameters: - execution_mode: Runners use this flag to execute different tasks, like - training vs evaluation. 'all' (the default) executes both training and - eval. - master: TensorFlow master. Empty string (the default) for local. - task: Task id of the replica running the training (default: 0). - num_ps_replicas: Number of parameter server tasks to use (default: 0). - training_worker_session_startup_stagger_secs: Seconds to sleep between the - startup of each worker task session (default: 5). - training_worker_max_startup_secs: Max seconds to wait before starting any - worker (default: 60). - eval_delay_secs: Number of seconds between the beginning of each eval run. - If one run takes more than this amount of time, the next run will start - immediately once that run completes (default 60). - eval_steps: Number of steps to run in each eval (default: 100). - num_cores: Number of cores to be used (default: 4). 
- verbose: Controls the verbosity, possible values: - 0: the algorithm and debug information is muted. - 1: trainer prints the progress. - 2: log device placement is printed. - gpu_memory_fraction: Fraction of GPU memory used by the process on - each GPU uniformly on the same machine. - tf_random_seed: Random seed for TensorFlow initializers. - Setting this value allows consistency between reruns. - keep_checkpoint_max: The maximum number of recent checkpoint files to keep. - As new files are created, older files are deleted. - If None or 0, all checkpoint files are kept. - Defaults to 5 (that is, the 5 most recent checkpoint files are kept.) - keep_checkpoint_every_n_hours: Number of hours between each checkpoint - to be saved. The default value of 10,000 hours effectively disables - the feature. - -Attributes: - tf_master: Tensorflow master. - tf_config: Tensorflow Session Config proto. - tf_random_seed: Tensorflow random seed. - keep_checkpoint_max: Maximum number of checkpoints to keep. - keep_checkpoint_every_n_hours: Number of hours between each checkpoint. - - - -#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__} +#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__} + +Constructor. + +##### Args: +* `master`: TensorFlow master. Empty string (the default) for local. +* `task`: Task id of the replica running the training (default: 0). 
+* `num_ps_replicas`: Number of parameter server tasks to use (default: 0). +* `num_cores`: Number of cores to be used (default: 4). +* `log_device_placement`: Log the op placement to devices (default: False). +* `gpu_memory_fraction`: Fraction of GPU memory used by the process on + each GPU uniformly on the same machine. +* `tf_random_seed`: Random seed for TensorFlow initializers. + Setting this value allows consistency between reruns. +* `save_summary_steps`: Save summaries every this many steps. +* `save_checkpoints_secs`: Save checkpoints every this many seconds. +* `keep_checkpoint_max`: The maximum number of recent checkpoint files to + keep. As new files are created, older files are deleted. If None or 0, + all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent + checkpoint files are kept.) +* `keep_checkpoint_every_n_hours`: Number of hours between each checkpoint + to be saved. The default value of 10,000 hours effectively disables + the feature. @@ -4394,7 +4377,7 @@ Run `output_dict` tensors `n` times, with the same `feed_dict` each run. - - - -### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train} +### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train} Train a model. @@ -4437,7 +4420,7 @@ program is terminated with exit code 1. `supervisor_save_summaries_steps` seconds when training. 
* `feed_fn`: A function that is called every iteration to produce a `feed_dict` passed to `session.run` calls. Optional. -* `max_steps`: Train until `global_step_tensor` evaluates to this value. +* `steps`: Trains for this many steps (e.g. current global step + `steps`). * `fail_on_nan_loss`: If true, raise `NanLossDuringTrainingError` if `loss_op` evaluates to `NaN`. If false, continue training as if nothing happened. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md index 51796a694cd..9079336f9ce 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md @@ -18,10 +18,10 @@ Linear regressor model. ... estimator.fit(input_fn=input_fn_train) estimator.evaluate(input_fn=input_fn_eval) - estimator.predict(x) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a KeyError: if `weight_column_name` is not None: key=weight_column_name, value=a `Tensor` diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md index 5bc561962b3..9d5dc8bd7bc 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md @@ -18,10 +18,10 @@ Linear classifier model. ... 
estimator.fit(input_fn=input_fn_train) estimator.evaluate(input_fn=input_fn_eval) - estimator.predict(x) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md index 65057636ce7..33ec7f0d532 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md @@ -1,4 +1,4 @@ -### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train} +### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train} Train a model. @@ -41,7 +41,7 @@ program is terminated with exit code 1. `supervisor_save_summaries_steps` seconds when training. * `feed_fn`: A function that is called every iteration to produce a `feed_dict` passed to `session.run` calls. Optional. -* `max_steps`: Train until `global_step_tensor` evaluates to this value. +* `steps`: Trains for this many steps (e.g. current global step + `steps`). 
* `fail_on_nan_loss`: If true, raise `NanLossDuringTrainingError` if `loss_op` evaluates to `NaN`. If false, continue training as if nothing happened. * `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md index 4262f41a3d3..4cbbcf4ab15 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md @@ -6,7 +6,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`. For example: -```prettyprint +```python # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] shape(t) ==> [2, 2, 3] ``` @@ -14,7 +14,7 @@ shape(t) ==> [2, 2, 3] ##### Args: -* `input`: A `Tensor`. +* `input`: A `Tensor` or `SparseTensor`. * `name`: A name for the operation (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md index 645304ee74c..c68a339de35 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md @@ -21,11 +21,11 @@ A classifier for TensorFlow DNN models. def input_fn_eval: # returns x, Y pass - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md index ffdf8703c09..35a71be5f8c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md @@ -1,47 +1,30 @@ This class specifies the specific configurations for the run. +- - - -Parameters: - execution_mode: Runners use this flag to execute different tasks, like - training vs evaluation. 'all' (the default) executes both training and - eval. - master: TensorFlow master. Empty string (the default) for local. - task: Task id of the replica running the training (default: 0). - num_ps_replicas: Number of parameter server tasks to use (default: 0). - training_worker_session_startup_stagger_secs: Seconds to sleep between the - startup of each worker task session (default: 5). - training_worker_max_startup_secs: Max seconds to wait before starting any - worker (default: 60). - eval_delay_secs: Number of seconds between the beginning of each eval run. - If one run takes more than this amount of time, the next run will start - immediately once that run completes (default 60). - eval_steps: Number of steps to run in each eval (default: 100). - num_cores: Number of cores to be used (default: 4). - verbose: Controls the verbosity, possible values: - 0: the algorithm and debug information is muted. - 1: trainer prints the progress. - 2: log device placement is printed. 
- gpu_memory_fraction: Fraction of GPU memory used by the process on +#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__} + +Constructor. + +##### Args: + + +* `master`: TensorFlow master. Empty string (the default) for local. +* `task`: Task id of the replica running the training (default: 0). +* `num_ps_replicas`: Number of parameter server tasks to use (default: 0). +* `num_cores`: Number of cores to be used (default: 4). +* `log_device_placement`: Log the op placement to devices (default: False). +* `gpu_memory_fraction`: Fraction of GPU memory used by the process on each GPU uniformly on the same machine. - tf_random_seed: Random seed for TensorFlow initializers. +* `tf_random_seed`: Random seed for TensorFlow initializers. Setting this value allows consistency between reruns. - keep_checkpoint_max: The maximum number of recent checkpoint files to keep. - As new files are created, older files are deleted. - If None or 0, all checkpoint files are kept. - Defaults to 5 (that is, the 5 most recent checkpoint files are kept.) - keep_checkpoint_every_n_hours: Number of hours between each checkpoint +* `save_summary_steps`: Save summaries every this many steps. +* `save_checkpoints_secs`: Save checkpoints every this many seconds. +* `keep_checkpoint_max`: The maximum number of recent checkpoint files to + keep. As new files are created, older files are deleted. If None or 0, + all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent + checkpoint files are kept.) +* `keep_checkpoint_every_n_hours`: Number of hours between each checkpoint to be saved. The default value of 10,000 hours effectively disables the feature. -Attributes: - tf_master: Tensorflow master. - tf_config: Tensorflow Session Config proto. 
- tf_random_seed: Tensorflow random seed. - keep_checkpoint_max: Maximum number of checkpoints to keep. - keep_checkpoint_every_n_hours: Number of hours between each checkpoint. -- - - - -#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__} - - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md index 6d53d84c5b7..93fe03b2d78 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md @@ -18,21 +18,28 @@ a probability distribution for each entry, see on `logits` internally for efficiency. Do not call this op with the output of `softmax`, as it will produce incorrect results. -`logits` must have the shape `[batch_size, num_classes]` -and dtype `float32` or `float64`. - -`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`. +A common use case is to have logits of shape `[batch_size, num_classes]` and +labels of shape `[batch_size]`. But higher dimensions are supported. ##### Args: -* `logits`: Unscaled log probabilities. -* `labels`: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other - values will result in a loss of 0, but incorrect gradient computations. +* `logits`: Unscaled log probabilities of rank `r` and shape + `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`. 
+* `labels`: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or + `int64`. Each entry in `labels` must be an index in `[0, num_classes)`. + Other values will result in a loss of 0, but incorrect gradient + computations. * `name`: A name for the operation (optional). ##### Returns: - A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the - softmax cross entropy loss. + A `Tensor` of the same shape as `labels` and of the same type as `logits` + with the softmax cross entropy loss. + +##### Raises: + + +* `ValueError`: If logits are scalars (need to have rank >= 1) or if the rank + of the labels is not equal to the rank of the logits minus one. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md index 581ba4e57e0..f31650eb29b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md @@ -21,11 +21,11 @@ A regressor for TensorFlow DNN models. def input_fn_eval: # returns x, Y pass - estimator.evaluate(input_fn_eval) - estimator.predict(x) + estimator.evaluate(input_fn=input_fn_eval) + estimator.predict(x=x) ``` - Input of `fit`, `train`, and `evaluate` should have following features, + Input of `fit` and `evaluate` should have following features, otherwise there will be a `KeyError`: if `weight_column_name` is not `None`, a feature with `key=weight_column_name` whose value is a `Tensor`. 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md index 0bf84c16d06..5424efd7a76 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md @@ -9,7 +9,7 @@ The indices in `argmax` are flattened, so that a maximum value at position ##### Args: -* `input`: A `Tensor` of type `float32`. +* `input`: A `Tensor`. Must be one of the following types: `float32`, `half`. 4-D with shape `[batch, height, width, channels]`. Input to pool over. * `ksize`: A list of `ints` that has length `>= 4`. The size of the window for each dimension of the input tensor. @@ -25,6 +25,6 @@ The indices in `argmax` are flattened, so that a maximum value at position A tuple of `Tensor` objects (output, argmax). -* `output`: A `Tensor` of type `float32`. The max pooled output tensor. +* `output`: A `Tensor`. Has the same type as `input`. The max pooled output tensor. * `argmax`: A `Tensor` of type `Targmax`. 4-D. The flattened indices of the max values chosen for each output. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md index d2fa38733b2..4dcd98e6897 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md @@ -3,8 +3,8 @@ Masks elements of `IndexedSlices`. Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that -contains a subset of the slices of `a`. Only the slices at indices specified -in `mask_indices` are returned. +contains a subset of the slices of `a`. Only the slices at indices not +specified in `mask_indices` are returned. 
This is useful when you need to extract a subset of slices in an `IndexedSlices` object. @@ -18,7 +18,7 @@ a.indices => [12, 26, 37, 45] tf.shape(a.values) => [4, 10] # `b` will be the subset of `a` slices at its second and third indices, so -# we want to mask of its first and last indices (which are at absolute +# we want to mask its first and last indices (which are at absolute # indices 12, 45) b = tf.sparse_mask(a, [12, 45]) diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md index a6dfbc297db..3c961b8215b 100644 --- a/tensorflow/g3doc/api_docs/python/nn.md +++ b/tensorflow/g3doc/api_docs/python/nn.md @@ -690,7 +690,7 @@ The indices in `argmax` are flattened, so that a maximum value at position ##### Args: -* `input`: A `Tensor` of type `float32`. +* `input`: A `Tensor`. Must be one of the following types: `float32`, `half`. 4-D with shape `[batch, height, width, channels]`. Input to pool over. * `ksize`: A list of `ints` that has length `>= 4`. The size of the window for each dimension of the input tensor. @@ -706,7 +706,7 @@ The indices in `argmax` are flattened, so that a maximum value at position A tuple of `Tensor` objects (output, argmax). -* `output`: A `Tensor` of type `float32`. The max pooled output tensor. +* `output`: A `Tensor`. Has the same type as `input`. The max pooled output tensor. * `argmax`: A `Tensor` of type `Targmax`. 4-D. The flattened indices of the max values chosen for each output. @@ -1244,23 +1244,30 @@ a probability distribution for each entry, see on `logits` internally for efficiency. Do not call this op with the output of `softmax`, as it will produce incorrect results. -`logits` must have the shape `[batch_size, num_classes]` -and dtype `float32` or `float64`. - -`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`. +A common use case is to have logits of shape `[batch_size, num_classes]` and +labels of shape `[batch_size]`. But higher dimensions are supported. 
##### Args: -* `logits`: Unscaled log probabilities. -* `labels`: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other - values will result in a loss of 0, but incorrect gradient computations. +* `logits`: Unscaled log probabilities of rank `r` and shape + `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`. +* `labels`: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or + `int64`. Each entry in `labels` must be an index in `[0, num_classes)`. + Other values will result in a loss of 0, but incorrect gradient + computations. * `name`: A name for the operation (optional). ##### Returns: - A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the - softmax cross entropy loss. + A `Tensor` of the same shape as `labels` and of the same type as `logits` + with the softmax cross entropy loss. + +##### Raises: + + +* `ValueError`: If logits are scalars (need to have rank >= 1) or if the rank + of the labels is not equal to the rank of the logits minus one. - - - diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md index 4f5c0a7af58..68cd7d33cef 100644 --- a/tensorflow/g3doc/api_docs/python/state_ops.md +++ b/tensorflow/g3doc/api_docs/python/state_ops.md @@ -1895,8 +1895,8 @@ Requires `updates.shape = indices.shape + ref.shape[1:]`. Masks elements of `IndexedSlices`. Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that -contains a subset of the slices of `a`. Only the slices at indices specified -in `mask_indices` are returned. +contains a subset of the slices of `a`. Only the slices at indices not +specified in `mask_indices` are returned. This is useful when you need to extract a subset of slices in an `IndexedSlices` object. 
@@ -1910,7 +1910,7 @@ a.indices => [12, 26, 37, 45] tf.shape(a.values) => [4, 10] # `b` will be the subset of `a` slices at its second and third indices, so -# we want to mask of its first and last indices (which are at absolute +# we want to mask its first and last indices (which are at absolute # indices 12, 45) b = tf.sparse_mask(a, [12, 45]) diff --git a/tensorflow/g3doc/how_tos/reading_data/index.md b/tensorflow/g3doc/how_tos/reading_data/index.md index 554cb854db7..b7ae72c9164 100644 --- a/tensorflow/g3doc/how_tos/reading_data/index.md +++ b/tensorflow/g3doc/how_tos/reading_data/index.md @@ -10,7 +10,7 @@ There are three main methods of getting data into a TensorFlow program: [TOC] -## Feeding +## Feeding TensorFlow's feed mechanism lets you inject data into any Tensor in a computation graph. A python computation can thus feed data directly into the @@ -377,11 +377,11 @@ Again, the example queue will have some elements queued, so training will continue until those are exhausted. If the example queue is a [`RandomShuffleQueue`](../../api_docs/python/io_ops.md#RandomShuffleQueue), say because you are using `shuffle_batch` or `shuffle_batch_join`, it normally will -avoid ever going having fewer than its `min_after_dequeue` attr elements -buffered. However, once the queue is closed that restriction will be lifted and -the queue will eventually empty. At that point the actual training threads, -when they try and dequeue from example queue, will start getting `OutOfRange` -errors and exiting. Once all the training threads are done, +avoid ever having fewer than its `min_after_dequeue` attr elements buffered. +However, once the queue is closed that restriction will be lifted and the queue +will eventually empty. At that point the actual training threads, when they +try and dequeue from example queue, will start getting `OutOfRange` errors and +exiting. 
Once all the training threads are done, [`tf.train.Coordinator.join`](../../api_docs/python/train.md#Coordinator.join) will return and you can exit cleanly. diff --git a/tensorflow/g3doc/how_tos/threading_and_queues/index.md b/tensorflow/g3doc/how_tos/threading_and_queues/index.md index c6124f92f14..46444a02dbe 100644 --- a/tensorflow/g3doc/how_tos/threading_and_queues/index.md +++ b/tensorflow/g3doc/how_tos/threading_and_queues/index.md @@ -146,7 +146,7 @@ for step in xrange(1000000): # When done, ask the threads to stop. coord.request_stop() # And wait for them to actually do it. -coord.join(threads) +coord.join(enqueue_threads) ``` ## Handling Exceptions diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md index b5afc186597..5ed26a5e040 100644 --- a/tensorflow/g3doc/tutorials/recurrent/index.md +++ b/tensorflow/g3doc/tutorials/recurrent/index.md @@ -178,9 +178,9 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte [bazel](https://github.com/bazelbuild/bazel)). Next: -``` +```bash cd tensorflow/models/rnn/ptb -python ptb_word_lm --data_path=/tmp/simple-examples/data/ --model small +python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small ``` There are 3 supported model configurations in the tutorial code: "small", diff --git a/tensorflow/models/image/mnist/convolutional.py b/tensorflow/models/image/mnist/convolutional.py index 95e5347c62c..1893e681210 100644 --- a/tensorflow/models/image/mnist/convolutional.py +++ b/tensorflow/models/image/mnist/convolutional.py @@ -48,9 +48,19 @@ EVAL_FREQUENCY = 100 # Number of steps between evaluations. 
tf.app.flags.DEFINE_boolean("self_test", False, "True if running a self test.") +tf.app.flags.DEFINE_boolean('use_fp16', False, + "Use half floats instead of full floats if True.") FLAGS = tf.app.flags.FLAGS +def data_type(): + """Return the type of the activations, weights, and placeholder variables.""" + if FLAGS.use_fp16: + return tf.float16 + else: + return tf.float32 + + def maybe_download(filename): """Download the data from Yann's website, unless it's already here.""" if not tf.gfile.Exists(WORK_DIRECTORY): @@ -142,11 +152,11 @@ def main(argv=None): # pylint: disable=unused-argument # These placeholder nodes will be fed a batch of training data at each # training step using the {feed_dict} argument to the Run() call below. train_data_node = tf.placeholder( - tf.float32, + data_type(), shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) eval_data = tf.placeholder( - tf.float32, + data_type(), shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) # The variables below hold all the trainable weights. They are passed an @@ -155,24 +165,24 @@ def main(argv=None): # pylint: disable=unused-argument conv1_weights = tf.Variable( tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. stddev=0.1, - seed=SEED)) - conv1_biases = tf.Variable(tf.zeros([32])) - conv2_weights = tf.Variable( - tf.truncated_normal([5, 5, 32, 64], - stddev=0.1, - seed=SEED)) - conv2_biases = tf.Variable(tf.constant(0.1, shape=[64])) + seed=SEED, dtype=data_type())) + conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) + conv2_weights = tf.Variable(tf.truncated_normal( + [5, 5, 32, 64], stddev=0.1, + seed=SEED, dtype=data_type())) + conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) fc1_weights = tf.Variable( # fully connected, depth 512. 
- tf.truncated_normal( - [IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], - stddev=0.1, - seed=SEED)) - fc1_biases = tf.Variable(tf.constant(0.1, shape=[512])) - fc2_weights = tf.Variable( - tf.truncated_normal([512, NUM_LABELS], + tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], stddev=0.1, - seed=SEED)) - fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS])) + seed=SEED, + dtype=data_type())) + fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) + fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], + stddev=0.1, + seed=SEED, + dtype=data_type())) + fc2_biases = tf.Variable(tf.constant( + 0.1, shape=[NUM_LABELS], dtype=data_type())) # We will replicate the model structure for the training subgraph, as well # as the evaluation subgraphs, while sharing the trainable parameters. @@ -230,7 +240,7 @@ def main(argv=None): # pylint: disable=unused-argument # Optimizer: set up a variable that's incremented once per batch and # controls the learning rate decay. - batch = tf.Variable(0) + batch = tf.Variable(0, dtype=data_type()) # Decay once per epoch, using an exponential schedule starting at 0.01. learning_rate = tf.train.exponential_decay( 0.01, # Base learning rate. diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index 333bfa17f95..011078036d0 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -99,6 +99,42 @@ def GetShrunkInceptionMaxPoolShapes(shrink=30): class PoolingTest(tf.test.TestCase): + def _VerifyOneType(self, pool_func, input_sizes, ksize, strides, padding, + data_format, data_type, expected, use_gpu): + """Verifies the output values of the pooling function. + + Args: + pool_func: Function to be called, co.MaxPool, co.AvgPool, + or the Lua version. + input_sizes: Input tensor dimensions. 
+ ksize: The kernel size dimensions + strides: The stride dimensions + padding: Padding type. + data_format: The data format we use to run the pooling operation. + data_type: The data type to use to run the pooling operation. + expected: An array containing the expected operation outputs. + use_gpu: Whether we are running on GPU. + """ + total_size = 1 + for s in input_sizes: + total_size *= s + # Initializes the input tensor with array containing incrementing + # numbers from 1. + x = [f * 1.0 for f in range(1, total_size + 1)] + with self.test_session(use_gpu=use_gpu) as sess: + t = tf.constant(x, shape=input_sizes, dtype=data_type) + if data_format == "NCHW": + t = NHWCToNCHW(t) + ksize = NHWCToNCHW(ksize) + strides = NHWCToNCHW(strides) + t = pool_func(t, ksize=ksize, strides=strides, padding=padding, + data_format=data_format) + if data_format == "NCHW": + t = NCHWToNHWC(t) + actual = t.eval() + self.assertAllCloseAccordingToType(expected, actual.flatten()) + self.assertShapeEqual(actual, t) + def _VerifyOneTest(self, pool_func, input_sizes, ksize, strides, padding, data_format, expected, use_gpu): """Verifies the output values of the pooling function. @@ -114,25 +150,12 @@ class PoolingTest(tf.test.TestCase): expected: An array containing the expected operation outputs. use_gpu: Whether we are running on GPU. """ - total_size = 1 - for s in input_sizes: - total_size *= s - # Initializes the input tensor with array containing incrementing - # numbers from 1. 
- x = [f * 1.0 for f in range(1, total_size + 1)] - with self.test_session(use_gpu=use_gpu) as sess: - t = tf.constant(x, shape=input_sizes) - if data_format == "NCHW": - t = NHWCToNCHW(t) - ksize = NHWCToNCHW(ksize) - strides = NHWCToNCHW(strides) - t = pool_func(t, ksize=ksize, strides=strides, padding=padding, - data_format=data_format) - if data_format == "NCHW": - t = NCHWToNHWC(t) - actual = t.eval() - self.assertAllClose(expected, actual.flatten()) - self.assertShapeEqual(actual, t) + self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding, + data_format, tf.float32, expected, use_gpu) + + if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv(): + self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding, + data_format, tf.float16, expected, use_gpu) def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding, expected, use_gpu): @@ -372,32 +395,40 @@ class PoolingTest(tf.test.TestCase): def testKernelSmallerThanStrideValid(self): for use_gpu in [True, False]: - self._VerifyValues(tf.nn.max_pool, input_sizes=[1, 7, 7, 1], - ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1], - padding="VALID", - expected=[9, 12, 30, 33], - use_gpu=use_gpu) + self._VerifyValues(tf.nn.max_pool, + input_sizes=[1, 7, 7, 1], + ksize=[1, 2, 2, 1], + strides=[1, 3, 3, 1], + padding="VALID", + expected=[9, 12, 30, 33], + use_gpu=use_gpu) - self._VerifyValues(tf.nn.avg_pool, input_sizes=[1, 7, 7, 1], - ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1], - padding="VALID", - expected=[5, 8, 26, 29], - use_gpu=use_gpu) + self._VerifyValues(tf.nn.avg_pool, + input_sizes=[1, 7, 7, 1], + ksize=[1, 2, 2, 1], + strides=[1, 3, 3, 1], + padding="VALID", + expected=[5, 8, 26, 29], + use_gpu=use_gpu) def testKernelSmallerThanStrideSame(self): for use_gpu in [True, False]: - for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]: - self._VerifyValues(pool_func, input_sizes=[1, 3, 3, 1], - ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1], - padding="SAME", - expected=[1, 3, 7, 9], - 
use_gpu=use_gpu) + for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]: + self._VerifyValues(pool_func, + input_sizes=[1, 3, 3, 1], + ksize=[1, 1, 1, 1], + strides=[1, 2, 2, 1], + padding="SAME", + expected=[1, 3, 7, 9], + use_gpu=use_gpu) - self._VerifyValues(pool_func, input_sizes=[1, 4, 4, 1], - ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1], - padding="SAME", - expected=[1, 3, 9, 11], - use_gpu=use_gpu) + self._VerifyValues(pool_func, + input_sizes=[1, 4, 4, 1], + ksize=[1, 1, 1, 1], + strides=[1, 2, 2, 1], + padding="SAME", + expected=[1, 3, 9, 11], + use_gpu=use_gpu) def _testDepthwiseMaxPoolInvalidConfig(self, in_size, ksize, strides, error_msg, use_gpu=False): @@ -425,43 +456,49 @@ class PoolingTest(tf.test.TestCase): # The following are tests that verify that the CPU and GPU implementations # produce the same resuts. def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding): - tensor_input = np.random.rand(*input_shape).astype(np.float32) - with self.test_session(use_gpu=True): - t = tf.constant(tensor_input, shape=input_shape) - out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding) - gpu_val = out_op.eval() - with self.test_session(use_gpu=False): - t = tf.constant(tensor_input, shape=input_shape) - out_op = tf.nn.max_pool(t, ksize, strides, padding) - cpu_val = out_op.eval() - self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5) + for dtype in np.float32, np.float16: + tensor_input = np.random.rand(*input_shape).astype(dtype) + with self.test_session(use_gpu=True): + t = tf.constant(tensor_input, shape=input_shape) + out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding) + gpu_val = out_op.eval() + with self.test_session(use_gpu=False): + t = tf.constant(tensor_input, shape=input_shape) + out_op = tf.nn.max_pool(t, ksize, strides, padding) + cpu_val = out_op.eval() + self.assertAllCloseAccordingToType(cpu_val, gpu_val) def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides, padding): - # Generate 
numbers in a narrow range, so that there are many duplicates - # in the input. - tensor_input = np.random.random_integers(0, 3, - input_shape).astype(np.float32) - tensor_output = np.random.rand(*output_shape).astype(np.float32) - with self.test_session(use_gpu=True): - t = tf.constant(tensor_input, shape=input_shape) - _, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding) - argmax = argmax_op.eval() - grad_in = tf.constant(tensor_output, shape=output_shape) - out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax, - ksize, strides, padding) - gpu_val = out_op.eval() - self.assertShapeEqual(gpu_val, out_op) - with self.test_session(use_gpu=False): - t = tf.constant(tensor_input, shape=input_shape) - out_op = tf.nn.max_pool(t, ksize, strides, padding) - orig_out = out_op.eval() - grad_in = tf.constant(tensor_output, shape=output_shape) - out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize, - strides, padding) - cpu_val = out_op.eval() - self.assertShapeEqual(cpu_val, out_op) - self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5) + for dtype in np.float32, np.float16: + # Generate numbers in a narrow range, so that there are many duplicates + # in the input. 
+ tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype) + tensor_output = np.random.rand(*output_shape).astype(dtype) + with self.test_session(use_gpu=True): + t = tf.constant(tensor_input, shape=input_shape) + _, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding) + argmax = argmax_op.eval() + grad_in = tf.constant(tensor_output, shape=output_shape) + out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax, + ksize, strides, padding) + gpu_val = out_op.eval() + self.assertShapeEqual(gpu_val, out_op) + with self.test_session(use_gpu=False): + t = tf.constant(tensor_input, shape=input_shape) + out_op = tf.nn.max_pool(t, ksize, strides, padding) + orig_out = out_op.eval() + grad_in = tf.constant(tensor_output, shape=output_shape) + out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize, strides, + padding) + cpu_val = out_op.eval() + self.assertShapeEqual(cpu_val, out_op) + if dtype == np.float16: + # The CPU version accumulates its gradient on fp16, so it's less + # accurate than the GPU version that does the accumulation on fp32 + self.assertAllClose(cpu_val, gpu_val, rtol=0.01, atol=0.01) + else: + self.assertAllClose(cpu_val, gpu_val) def testMaxPoolingWithArgmax(self): # MaxPoolWithArgMax is implemented only on GPU. 
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py index a8050cb08db..eb6bdff8b5a 100644 --- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py @@ -30,6 +30,9 @@ from tensorflow.python.ops import sparse_ops class SparseXentTest(tf.test.TestCase): def _npXent(self, features, labels): + is_higher_dim = len(features.shape) > 2 + features = np.reshape(features, [-1, features.shape[-1]]) + labels = np.reshape(labels, [-1]) batch_dim = 0 class_dim = 1 batch_size = features.shape[batch_dim] @@ -40,14 +43,15 @@ class SparseXentTest(tf.test.TestCase): labels_mat[np.arange(batch_size), labels] = 1.0 bp = (probs - labels_mat) l = -np.sum(labels_mat * np.log(probs + 1.0e-20), axis=1) - return l, bp + return l, bp, is_higher_dim def _testXent(self, np_features, np_labels, use_gpu=False): - np_loss, np_backprop = self._npXent(np_features, np_labels) + np_loss, np_backprop, is_higher_dim = self._npXent(np_features, np_labels) with self.test_session(use_gpu=use_gpu) as sess: loss = tf.nn.sparse_softmax_cross_entropy_with_logits( np_features, np_labels) - backprop = loss.op.outputs[1] + backprop = (loss.op.inputs[0].op.outputs[1] if is_higher_dim + else loss.op.outputs[1]) tf_loss, tf_backprop = sess.run([loss, backprop]) self.assertAllCloseAccordingToType(np_loss, tf_loss) self.assertAllCloseAccordingToType(np_backprop, tf_backprop) @@ -71,14 +75,6 @@ class SparseXentTest(tf.test.TestCase): self._testSingleClass(use_gpu=True) self._testSingleClass(use_gpu=False) - def testRankTooLarge(self): - np_features = np.array( - [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(np.float32) - np_labels = np.array([1, 2]) - self.assertRaisesRegexp( - ValueError, "must have rank 2", - tf.nn.sparse_softmax_cross_entropy_with_logits, np_features, np_labels) - def testNpXent(self): # We create 2 batches of logits for testing. 
# batch 0 is the boring uniform distribution: 1, 1, 1, 1, with target 3. @@ -104,7 +100,7 @@ class SparseXentTest(tf.test.TestCase): # With a hard 1, the backprop is [0.032 - 1.0 = -0.968, 0.087, 0.237, 0.644] # The loss for this batch is [1.0 * -log(0.25), 1.0 * -log(0.032)] # = [1.3862, 3.4420] - np_loss, np_backprop = self._npXent(np.array(features), np.array(labels)) + np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels)) self.assertAllClose(np.array([[0.25, 0.25, 0.25, -0.75], [-0.968, 0.087, 0.237, 0.6439]]), np_backprop, @@ -114,15 +110,21 @@ class SparseXentTest(tf.test.TestCase): def testShapeMismatch(self): with self.test_session(): - with self.assertRaises(ValueError): + with self.assertRaisesRegexp(ValueError, ".*Rank mismatch:*"): tf.nn.sparse_softmax_cross_entropy_with_logits( - [[0., 1.], [2., 3.]], [[0, 2]]) + [[0., 1.], [2., 3.], [2., 3.]], [[0, 2]]) - def testNotMatrix(self): + def testScalar(self): with self.test_session(): - with self.assertRaises(ValueError): + with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"): tf.nn.sparse_softmax_cross_entropy_with_logits( - [0., 1., 2., 3.], [0, 2]) + tf.constant(1.0), tf.constant(0)) + + def testVector(self): + with self.test_session(): + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + tf.constant([1.0]), tf.constant(0)) + self.assertAllClose(0.0, loss.eval()) def testFloat(self): for label_dtype in np.int32, np.int64: @@ -155,6 +157,31 @@ class SparseXentTest(tf.test.TestCase): print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) + def _testHighDim(self, use_gpu, features, labels): + np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels)) + # manually reshape loss + np_loss = np.reshape(np_loss, np.array(labels).shape) + with self.test_session(use_gpu=use_gpu) as sess: + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + features, labels) + backprop = loss.op.inputs[0].op.outputs[1] + tf_loss, tf_backprop = 
sess.run([loss, backprop]) + self.assertAllCloseAccordingToType(np_loss, tf_loss) + self.assertAllCloseAccordingToType(np_backprop, tf_backprop) + + def testHighDim(self): + features = [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]] + labels = [[3], [0]] + self._testHighDim(True, features, labels) + self._testHighDim(False, features, labels) + + def testHighDim2(self): + features = [[[1., 1., 1., 1.], [2., 2., 2., 2.]], + [[1., 2., 3., 4.], [5., 6., 7., 8.]]] + labels = [[3, 2], [0, 3]] + self._testHighDim(True, features, labels) + self._testHighDim(False, features, labels) + def _sparse_vs_dense_xent_benchmark_dense(labels, logits): labels = tf.identity(labels) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c71ec9f065e..fd442c6eb88 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -126,6 +126,7 @@ def shape(input, name=None): else: return gen_array_ops.shape(input, name=name) + def rank(input, name=None): """Returns the rank of a tensor. @@ -612,8 +613,8 @@ def sparse_mask(a, mask_indices, name=None): """Masks elements of `IndexedSlices`. Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that - contains a subset of the slices of `a`. Only the slices at indices specified - in `mask_indices` are returned. + contains a subset of the slices of `a`. Only the slices at indices not + specified in `mask_indices` are returned. This is useful when you need to extract a subset of slices in an `IndexedSlices` object. 
@@ -627,7 +628,7 @@ def sparse_mask(a, mask_indices, name=None): tf.shape(a.values) => [4, 10] # `b` will be the subset of `a` slices at its second and third indices, so - # we want to mask of its first and last indices (which are at absolute + # we want to mask its first and last indices (which are at absolute # indices 12, 45) b = tf.sparse_mask(a, [12, 45]) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 8fb81a813ad..baaa6391e95 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -440,30 +440,65 @@ def sparse_softmax_cross_entropy_with_logits(logits, labels, name=None): on `logits` internally for efficiency. Do not call this op with the output of `softmax`, as it will produce incorrect results. - `logits` must have the shape `[batch_size, num_classes]` - and dtype `float32` or `float64`. - - `labels` must have the shape `[batch_size]` and dtype `int32` or `int64`. + A common use case is to have logits of shape `[batch_size, num_classes]` and + labels of shape `[batch_size]`. But higher dimensions are supported. Args: - logits: Unscaled log probabilities. - labels: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other - values will result in a loss of 0, but incorrect gradient computations. + logits: Unscaled log probabilities of rank `r` and shape + `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`. + labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or + `int64`. Each entry in `labels` must be an index in `[0, num_classes)`. + Other values will result in a loss of 0, but incorrect gradient + computations. name: A name for the operation (optional). Returns: - A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the - softmax cross entropy loss. + A `Tensor` of the same shape as `labels` and of the same type as `logits` + with the softmax cross entropy loss. 
+ + Raises: + ValueError: If logits are scalars (need to have rank >= 1) or if the rank + of the labels is not equal to the rank of the logits minus one. """ # TODO(pcmurray) Raise an error when the label is not an index in # [0, num_classes). Note: This could break users who call this with bad # labels, but disregard the bad results. - # The second output tensor contains the gradients. We use it in - # _CrossEntropyGrad() in nn_grad but not here. - cost, unused_backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits( - logits, labels, name=name) - return cost + # Reshape logits and labels to rank 2. + with ops.op_scope([labels, logits], name, + "SparseSoftmaxCrossEntropyWithLogits"): + labels = ops.convert_to_tensor(labels) + logits = ops.convert_to_tensor(logits) + + # Store label shape for result later. + labels_static_shape = labels.get_shape() + labels_shape = array_ops.shape(labels) + if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0: + raise ValueError("Logits cannot be scalars - received shape %s.", + logits.get_shape()) + if logits.get_shape().ndims is not None and ( + labels_static_shape.ndims is not None and + labels_static_shape.ndims != logits.get_shape().ndims - 1): + raise ValueError("Rank mismatch: Labels rank (received %s) should equal " + "logits rank (received %s) - 1.", + labels_static_shape.ndims, logits.get_shape().ndims) + # Check if no reshapes are required. + if logits.get_shape().ndims == 2: + cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits( + logits, labels, name=name) + return cost + # Reshape logits to 2 dim, labels to 1 dim. + num_classes = array_ops.gather(array_ops.shape(logits), + array_ops.rank(logits) - 1) + logits = array_ops.reshape(logits, [-1, num_classes]) + labels = array_ops.reshape(labels, [-1]) + # The second output tensor contains the gradients. We use it in + # _CrossEntropyGrad() in nn_grad but not here.
+ cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits( + logits, labels, name=name) + cost = array_ops.reshape(cost, labels_shape) + cost.set_shape(labels_static_shape) + return cost @ops.RegisterShape("SparseSoftmaxCrossEntropyWithLogits") diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py new file mode 100644 index 00000000000..d1bfe707124 --- /dev/null +++ b/tensorflow/python/training/proximal_adagrad.py @@ -0,0 +1,101 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""ProximalAdagrad for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import constant_op +from tensorflow.python.training import optimizer +from tensorflow.python.training import training_ops + + +class ProximalAdagradOptimizer(optimizer.Optimizer): + # pylint: disable=line-too-long + """Optimizer that implements the Proximal Adagrad algorithm. + + See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf). 
+ + @@__init__ + """ + + def __init__(self, learning_rate, initial_accumulator_value=0.1, + l1_regularization_strength=0.0, l2_regularization_strength=0.0, + use_locking=False, name="ProximalAdagrad"): + """Construct a new ProximalAdagrad optimizer. + + Args: + learning_rate: A `Tensor` or a floating point value. The learning rate. + initial_accumulator_value: A floating point value. + Starting value for the accumulators, must be positive. + l1_regularization_strength: A float value, must be greater than or + equal to zero. + l2_regularization_strength: A float value, must be greater than or + equal to zero. + use_locking: If `True` use locks for update operations. + name: Optional name prefix for the operations created when applying + gradients. Defaults to "ProximalAdagrad". + + Raises: + ValueError: If the `initial_accumulator_value` is invalid. + """ + if initial_accumulator_value <= 0.0: + raise ValueError("initial_accumulator_value must be positive: %s" % + initial_accumulator_value) + super(ProximalAdagradOptimizer, self).__init__(use_locking, name) + self._learning_rate = learning_rate + self._initial_accumulator_value = initial_accumulator_value + self._l1_regularization_strength = l1_regularization_strength + self._l2_regularization_strength = l2_regularization_strength + # Created in Initialize.
+ self._l1_regularization_strength_tensor = None + self._l2_regularization_strength_tensor = None + self._learning_rate_tensor = None + + def _create_slots(self, var_list): + for v in var_list: + with ops.colocate_with(v): + val = constant_op.constant(self._initial_accumulator_value, + shape=v.get_shape()) + self._get_or_make_slot(v, val, "accumulator", self._name) + + def _prepare(self): + self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate, + name="learning_rate") + self._l1_regularization_strength_tensor = ops.convert_to_tensor( + self._l1_regularization_strength, + name="l1_regularization_strength") + self._l2_regularization_strength_tensor = ops.convert_to_tensor( + self._l2_regularization_strength, + name="l2_regularization_strength") + + def _apply_dense(self, grad, var): + acc = self.get_slot(var, "accumulator") + return training_ops.apply_proximal_adagrad( + var, acc, self._learning_rate_tensor, + self._l1_regularization_strength_tensor, + self._l2_regularization_strength_tensor, + grad, use_locking=self._use_locking) + + def _apply_sparse(self, grad, var): + acc = self.get_slot(var, "accumulator") + return training_ops.sparse_apply_proximal_adagrad( + var, acc, self._learning_rate_tensor, + self._l1_regularization_strength_tensor, + self._l2_regularization_strength_tensor, + grad.values, grad.indices, + use_locking=self._use_locking) diff --git a/tensorflow/python/training/proximal_adagrad_test.py b/tensorflow/python/training/proximal_adagrad_test.py new file mode 100644 index 00000000000..30e6245ef24 --- /dev/null +++ b/tensorflow/python/training/proximal_adagrad_test.py @@ -0,0 +1,205 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functional tests for Proximal Adagrad operations.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class ProximalAdagradOptimizerTest(tf.test.TestCase): + + def testProximalAdagradwithoutRegularization(self): + with self.test_session() as sess: + var0 = tf.Variable([0.0, 0.0]) + var1 = tf.Variable([0.0, 0.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + opt = tf.train.ProximalAdagradOptimizer(3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([0.0, 0.0], v0_val) + self.assertAllClose([0.0, 0.0], v1_val) + + # Run 3 steps Proximal Adagrad. 
+ for _ in range(3): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([-2.60260963, -4.29698515]), + v0_val) + self.assertAllClose(np.array([-0.28432083, -0.56694895]), + v1_val) + + def testProximalAdagradwithoutRegularization2(self): + with self.test_session() as sess: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([4.0, 3.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + opt = tf.train.ProximalAdagradOptimizer(3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([4.0, 3.0], v1_val) + + # Run 3 steps Proximal Adagrad. + for _ in range(3): + update.run() + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([-1.60261, -2.296985]), + v0_val) + self.assertAllClose(np.array([3.715679, 2.433051]), + v1_val) + + def testProximalAdagradWithL1(self): + with self.test_session() as sess: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([4.0, 3.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + opt = tf.train.ProximalAdagradOptimizer(3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([4.0, 3.0], v1_val) + + # Run 10 steps Proximal Adagrad + for _ in range(10): + update.run() + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([0.662907, 0.767398]), + v0_val) + self.assertAllClose(np.array([2.959304, 1.029232]), + v1_val) + + def testProximalAdagradWithL1_L2(self): + with self.test_session() as 
sess: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([4.0, 3.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + opt = tf.train.ProximalAdagradOptimizer(3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([4.0, 3.0], v1_val) + + # Run 10 steps Proximal Adagrad. + for _ in range(10): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([0.043069, 0.080461]), + v0_val) + self.assertAllClose(np.array([0.004069, 0.008578]), + v1_val) + + def applyOptimizer(self, opt, steps=5, is_sparse=False): + if is_sparse: + var0 = tf.Variable([[1.0], [2.0]]) + var1 = tf.Variable([[3.0], [4.0]]) + grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]), + tf.constant([0]), + tf.constant([2, 1])) + grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]), + tf.constant([1]), + tf.constant([2, 1])) + else: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + sess = tf.get_default_session() + v0_val, v1_val = sess.run([var0, var1]) + if is_sparse: + self.assertAllClose([[1.0], [2.0]], v0_val) + self.assertAllClose([[3.0], [4.0]], v1_val) + else: + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([3.0, 4.0], v1_val) + + # Run ProximalAdagrad for a few steps + for _ in range(steps): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + return v0_val, v1_val + + def testEquivAdagradwithoutRegularization(self): + with self.test_session(): + val0, val1 = self.applyOptimizer( + tf.train.ProximalAdagradOptimizer(3.0, + 
initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0)) + + with self.test_session(): + val2, val3 = self.applyOptimizer( + tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1)) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + def testEquivSparseAdagradwithoutRegularization(self): + with self.test_session(): + val0, val1 = self.applyOptimizer( + tf.train.ProximalAdagradOptimizer(3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0), + is_sparse=True) + + with self.test_session(): + val2, val3 = self.applyOptimizer( + tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1), + is_sparse=True) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py new file mode 100644 index 00000000000..299c6fa1c7b --- /dev/null +++ b/tensorflow/python/training/proximal_gradient_descent.py @@ -0,0 +1,81 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""ProximalGradientDescent for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +# pylint: disable=unused-import +from tensorflow.python.ops import math_ops +# pylint: enable=unused-import +from tensorflow.python.training import optimizer +from tensorflow.python.training import training_ops + + +class ProximalGradientDescentOptimizer(optimizer.Optimizer): + # pylint: disable=line-too-long + """Optimizer that implements the proximal gradient descent algorithm. + + See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf). + + @@__init__ + """ + + def __init__(self, learning_rate, l1_regularization_strength=0.0, + l2_regularization_strength=0.0, use_locking=False, + name="ProximalGradientDescent"): + """Construct a new proximal gradient descent optimizer. + + Args: + learning_rate: A Tensor or a floating point value. The learning + rate to use. + l1_regularization_strength: A float value, must be greater than or + equal to zero. + l2_regularization_strength: A float value, must be greater than or + equal to zero. + use_locking: If True use locks for update operations. + name: Optional name prefix for the operations created when applying + gradients. Defaults to "ProximalGradientDescent".
+ """ + super(ProximalGradientDescentOptimizer, self).__init__(use_locking, name) + self._learning_rate = learning_rate + self._l1_regularization_strength = l1_regularization_strength + self._l2_regularization_strength = l2_regularization_strength + self._l1_regularization_strength_tensor = None + self._l2_regularization_strength_tensor = None + + def _apply_dense(self, grad, var): + return training_ops.apply_proximal_gradient_descent( + var, + self._learning_rate_tensor, + self._l1_regularization_strength_tensor, + self._l2_regularization_strength_tensor, + grad, + use_locking=self._use_locking).op + + def _apply_sparse(self, grad, var): + delta = ops.IndexedSlices(grad.values * self._learning_rate_tensor, + grad.indices, grad.dense_shape) + return var.scatter_sub(delta, use_locking=self._use_locking) + + def _prepare(self): + self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate, + name="learning_rate") + self._l1_regularization_strength_tensor = ops.convert_to_tensor( + self._l1_regularization_strength, name="l1_regularization_strength") + self._l2_regularization_strength_tensor = ops.convert_to_tensor( + self._l2_regularization_strength, name="l2_regularization_strength") diff --git a/tensorflow/python/training/proximal_gradient_descent_test.py b/tensorflow/python/training/proximal_gradient_descent_test.py new file mode 100644 index 00000000000..4dd02526873 --- /dev/null +++ b/tensorflow/python/training/proximal_gradient_descent_test.py @@ -0,0 +1,178 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functional tests for Proximal Gradient Descent operations.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class ProximalGradientDescentOptimizerTest(tf.test.TestCase): + + def testProximalGradientDescentwithoutRegularization(self): + with self.test_session() as sess: + var0 = tf.Variable([0.0, 0.0]) + var1 = tf.Variable([0.0, 0.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + opt = tf.train.ProximalGradientDescentOptimizer( + 3.0, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([0.0, 0.0], v0_val) + self.assertAllClose([0.0, 0.0], v1_val) + + # Run 3 steps Proximal Gradient Descent. 
+ for _ in range(3): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([-0.9, -1.8]), + v0_val) + self.assertAllClose(np.array([-0.09, -0.18]), + v1_val) + + def testProximalGradientDescentwithoutRegularization2(self): + with self.test_session() as sess: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([4.0, 3.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + opt = tf.train.ProximalGradientDescentOptimizer( + 3.0, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([4.0, 3.0], v1_val) + + # Run 3 steps Proximal Gradient Descent + for _ in range(3): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([0.1, 0.2]), + v0_val) + self.assertAllClose(np.array([3.91, 2.82]), + v1_val) + + def testProximalGradientDescentWithL1_L2(self): + with self.test_session() as sess: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([4.0, 3.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + opt = tf.train.ProximalGradientDescentOptimizer( + 3.0, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([4.0, 3.0], v1_val) + + # Run 10 steps Proximal Gradient Descent + for _ in range(10): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + self.assertAllClose(np.array([0.037125, 0.074625]), + v0_val) + self.assertAllClose(np.array([0.003375, 0.007125]), + v1_val) + + def applyOptimizer(self, opt, steps=5, is_sparse=False): + if is_sparse: + var0 = tf.Variable([[1.0], [2.0]]) + var1 = 
tf.Variable([[3.0], [4.0]]) + grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]), + tf.constant([0]), + tf.constant([2, 1])) + grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]), + tf.constant([1]), + tf.constant([2, 1])) + else: + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + grads0 = tf.constant([0.1, 0.2]) + grads1 = tf.constant([0.01, 0.02]) + + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + sess = tf.get_default_session() + v0_val, v1_val = sess.run([var0, var1]) + if is_sparse: + self.assertAllClose([[1.0], [2.0]], v0_val) + self.assertAllClose([[3.0], [4.0]], v1_val) + else: + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([3.0, 4.0], v1_val) + + # Run the optimizer under test for a few steps + for _ in range(steps): + update.run() + + v0_val, v1_val = sess.run([var0, var1]) + return v0_val, v1_val + + def testEquivSparseGradientDescentwithoutRegularizaion(self): + with self.test_session(): + val0, val1 = self.applyOptimizer( + tf.train.ProximalGradientDescentOptimizer( + 3.0, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0), + is_sparse=True) + + with self.test_session(): + val2, val3 = self.applyOptimizer( + tf.train.GradientDescentOptimizer(3.0), is_sparse=True) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + def testEquivGradientDescentwithoutRegularizaion(self): + with self.test_session(): + val0, val1 = self.applyOptimizer( + tf.train.ProximalGradientDescentOptimizer( + 3.0, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0)) + + with self.test_session(): + val2, val3 = self.applyOptimizer( + tf.train.GradientDescentOptimizer(3.0)) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py index fecc0e0c00f..84ffd6c1dad 100644 ---
a/tensorflow/python/training/training.py +++ b/tensorflow/python/training/training.py @@ -151,6 +151,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.training.adadelta import AdadeltaOptimizer from tensorflow.python.training.adagrad import AdagradOptimizer +from tensorflow.python.training.proximal_adagrad import ProximalAdagradOptimizer from tensorflow.python.training.adam import AdamOptimizer from tensorflow.python.training.ftrl import FtrlOptimizer from tensorflow.python.training.momentum import MomentumOptimizer @@ -158,6 +159,7 @@ from tensorflow.python.training.moving_averages import ExponentialMovingAverage from tensorflow.python.training.optimizer import Optimizer from tensorflow.python.training.rmsprop import RMSPropOptimizer from tensorflow.python.training.gradient_descent import GradientDescentOptimizer +from tensorflow.python.training.proximal_gradient_descent import ProximalGradientDescentOptimizer from tensorflow.python.training.sync_replicas_optimizer import SyncReplicasOptimizer # Utility classes for training. 
diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py index 46955e43c56..86197523387 100644 --- a/tensorflow/python/training/training_ops.py +++ b/tensorflow/python/training/training_ops.py @@ -69,6 +69,17 @@ def _ApplyAdagradShape(op): grad_shape = op.inputs[3].get_shape().merge_with(accum_shape) return [grad_shape] +@ops.RegisterShape("ApplyProximalAdagrad") +def _ApplyProximalAdagradShape(op): + """Shape function for the ApplyProximalAdagrad op.""" + var_shape = op.inputs[0].get_shape() + accum_shape = op.inputs[1].get_shape().merge_with(var_shape) + _AssertInputIsScalar(op, 2) # lr + _AssertInputIsScalar(op, 3) # l1 + _AssertInputIsScalar(op, 4) # l2 + grad_shape = op.inputs[5].get_shape().merge_with(accum_shape) + return [grad_shape] + @ops.RegisterShape("ApplyFtrl") def _ApplyFtrlShape(op): @@ -133,6 +144,32 @@ def _ApplyGradientDescentShape(op): delta_shape = op.inputs[2].get_shape().merge_with(var_shape) return [delta_shape] + +@ops.RegisterShape("ApplyProximalGradientDescent") +def _ApplyProximalGradientDescentShape(op): + """Shape function for the ApplyProximalGradientDescent op.""" + var_shape = op.inputs[0].get_shape() + _AssertInputIsScalar(op, 1) # alpha + _AssertInputIsScalar(op, 2) # l1 + _AssertInputIsScalar(op, 3) # l2 + delta_shape = op.inputs[4].get_shape().merge_with(var_shape) + return [delta_shape] + + +@ops.RegisterShape("SparseApplyProximalGradientDescent") +def _SparseApplyProximalGradientDescentShape(op): + """Shape function for the SparseApplyProximalGradientDescent op.""" + var_shape = op.inputs[0].get_shape() + _AssertInputIsScalar(op, 1) # lr + _AssertInputIsScalar(op, 2) # l1 + _AssertInputIsScalar(op, 3) # l2 + grad_shape = op.inputs[4].get_shape().merge_with( + tensor_shape.TensorShape([None]).concatenate(var_shape[1:])) + unused_indices_shape = op.inputs[5].get_shape().merge_with( + tensor_shape.vector(grad_shape[0])) + return [var_shape] + + @ops.RegisterShape("SparseApplyAdadelta") def
_SparseApplyAdadeltaShape(op): """Shape function for the SparseApplyAdadelta op.""" @@ -148,6 +185,7 @@ def _SparseApplyAdadeltaShape(op): tensor_shape.vector(grad_shape[0])) return [accum_update_shape] + @ops.RegisterShape("SparseApplyAdagrad") def _SparseApplyAdagradShape(op): """Shape function for the SparseApplyAdagrad op.""" @@ -161,6 +199,21 @@ def _SparseApplyAdagradShape(op): return [accum_shape] +@ops.RegisterShape("SparseApplyProximalAdagrad") +def _SparseApplyProximalAdagradShape(op): + """Shape function for the SparseApplyProximalAdagrad op.""" + var_shape = op.inputs[0].get_shape() + accum_shape = op.inputs[1].get_shape().merge_with(var_shape) + _AssertInputIsScalar(op, 2) # lr + _AssertInputIsScalar(op, 3) # l1 + _AssertInputIsScalar(op, 4) # l2 + grad_shape = op.inputs[5].get_shape().merge_with( + tensor_shape.TensorShape([None]).concatenate(accum_shape[1:])) + unused_indices_shape = op.inputs[6].get_shape().merge_with( + tensor_shape.vector(grad_shape[0])) + return [accum_shape] + + @ops.RegisterShape("SparseApplyFtrl") def _SparseApplyFtrlShape(op): """Shape function for the SparseApplyFtrl op.""" diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 23a8066e796..9d860e59a29 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1876,6 +1876,40 @@ bool CudnnSupport::DoPoolForward( return true; } +bool CudnnSupport::DoPoolForward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // 
Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingForward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(), + output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue forward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + bool CudnnSupport::DoPoolBackward( Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, const dnn::BatchDescriptor& input_dimensions, @@ -1914,6 +1948,43 @@ bool CudnnSupport::DoPoolBackward( return true; } +bool CudnnSupport::DoPoolBackward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory& output_data, + const DeviceMemory& input_diff_data, + DeviceMemory* output_diff_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
+ float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingBackward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + dest_desc.handle(), output_data.opaque(), dest_desc.handle(), + input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta, + src_desc.handle(), output_diff_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue backward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + bool CudnnSupport::DoNormalize( Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, const DeviceMemory& input_data, DeviceMemory* output_data) { diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index 523a0c6c5d3..434ab730a78 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -201,6 +201,13 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::BatchDescriptor& output_dimensions, DeviceMemory* output_data) override; + bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory* output_data) override; + bool DoPoolBackward(Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, const dnn::BatchDescriptor& input_dimensions, @@ -210,6 +217,15 @@ class CudnnSupport : public dnn::DnnSupport { const DeviceMemory& input_diff_data, DeviceMemory* output_diff_data) override; + bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const 
dnn::BatchDescriptor& output_dimensions, + const DeviceMemory& output_data, + const DeviceMemory& input_diff_data, + DeviceMemory* output_diff_data) override; + bool DoNormalize(Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, const DeviceMemory& input_data, diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index fbb44dc7390..0ae482a73c4 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -1011,6 +1011,13 @@ class DnnSupport { const dnn::BatchDescriptor& output_dimensions, DeviceMemory* output_data) = 0; + virtual bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory* output_data) = 0; + // Performs differentiation of the pooling operation. virtual bool DoPoolBackward(Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, @@ -1021,6 +1028,15 @@ class DnnSupport { const DeviceMemory& input_diff_data, DeviceMemory* output_diff_data) = 0; + virtual bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory& output_data, + const DeviceMemory& input_diff_data, + DeviceMemory* output_diff_data) = 0; + // Applies local response normalization to the values from // input_data and writes the result to output_data. 
See comments on // NormalizeDescriptor for a description of local response diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 446a3c9a7d1..be823d9500f 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -909,6 +909,30 @@ Stream &Stream::ThenPoolForward( return *this; } +Stream &Stream::ThenPoolForward( + const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory *output_data) { + VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions), + PARAM(input_data), PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions, + input_data, output_dimensions, + output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + Stream &Stream::ThenPoolBackward( const dnn::PoolingDescriptor &pooling_dimensions, const dnn::BatchDescriptor &input_dimensions, @@ -936,6 +960,33 @@ Stream &Stream::ThenPoolBackward( return *this; } +Stream &Stream::ThenPoolBackward( + const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory &input_data, + const dnn::BatchDescriptor &output_dimensions, + const DeviceMemory &output_data, + const DeviceMemory &input_diff_data, + DeviceMemory *output_diff_data) { + VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions), + PARAM(input_data), PARAM(output_dimensions), PARAM(output_data), + PARAM(input_diff_data), PARAM(output_diff_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoPoolBackward(this, pooling_dimensions, input_dimensions, + input_data, output_dimensions, output_data, + 
input_diff_data, output_diff_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + Stream &Stream::ThenNormalize( const dnn::NormalizeDescriptor &normalize_descriptor, const DeviceMemory &input_data, DeviceMemory *output_data) { diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index aac945c9e02..c131250de1e 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -421,6 +421,12 @@ class Stream { const dnn::BatchDescriptor &output_dimensions, DeviceMemory *output_data); + Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory *output_data); + Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions, const dnn::BatchDescriptor &input_dimensions, const DeviceMemory &input_data, @@ -429,6 +435,14 @@ class Stream { const DeviceMemory &input_diff_data, DeviceMemory *output_diff_data); + Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory &input_data, + const dnn::BatchDescriptor &output_dimensions, + const DeviceMemory &output_data, + const DeviceMemory &input_diff_data, + DeviceMemory *output_diff_data); + Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor, const DeviceMemory &input_data, DeviceMemory *output_data); diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 07f83651e02..d9cfb85fc36 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -6,8 +6,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", - url = "https://bitbucket.org/eigen/eigen/get/d02e6a705c30.tar.gz", - sha256 = 
"532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf", + url = "https://bitbucket.org/eigen/eigen/get/0c0b79ecd74c.tar.gz", + sha256 = "b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b", build_file = path_prefix + "eigen.BUILD", ) diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky index 56059bcc61c..7415ae4d0d5 100644 --- a/third_party/eigen3/Eigen/Cholesky +++ b/third_party/eigen3/Eigen/Cholesky @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/Cholesky" +#include "eigen-eigen-0c0b79ecd74c/Eigen/Cholesky" diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core index c1d4a2e0f8c..787e1c076ea 100644 --- a/third_party/eigen3/Eigen/Core +++ b/third_party/eigen3/Eigen/Core @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/Core" +#include "eigen-eigen-0c0b79ecd74c/Eigen/Core" diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues index 0a0731ba19b..b6e1b81eb5b 100644 --- a/third_party/eigen3/Eigen/Eigenvalues +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/Eigenvalues" +#include "eigen-eigen-0c0b79ecd74c/Eigen/Eigenvalues" diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU index d6b39b8d235..a0782af0405 100644 --- a/third_party/eigen3/Eigen/LU +++ b/third_party/eigen3/Eigen/LU @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/LU" +#include "eigen-eigen-0c0b79ecd74c/Eigen/LU" diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR index a5406e93bc6..0a9bee2898f 100644 --- a/third_party/eigen3/Eigen/QR +++ b/third_party/eigen3/Eigen/QR @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/Eigen/QR" +#include "eigen-eigen-0c0b79ecd74c/Eigen/QR" diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor index 4f730236b78..5228bcda62e 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor +++ 
b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor @@ -1 +1 @@ -#include "eigen-eigen-d02e6a705c30/unsupported/Eigen/CXX11/Tensor" +#include "eigen-eigen-0c0b79ecd74c/unsupported/Eigen/CXX11/Tensor" diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template index d4dddb5211f..02856822c95 100644 --- a/tools/bazel.rc.template +++ b/tools/bazel.rc.template @@ -2,6 +2,7 @@ build:cuda --crosstool_top=//third_party/gpus/crosstool build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true build --force_python=py$PYTHON_MAJOR_VERSION +build --host_force_python=py$PYTHON_MAJOR_VERSION build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY build --define=use_fast_cpp_protos=true build --define=allow_oversize_protos=true